1/*
2 * Armv8 Neon optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
5 *                          All Rights Reserved.
6 * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
7 * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
8 * Author:  Ragesh Radhakrishnan <ragesh.r@linaro.org>
9 * Copyright (C) 2014-2016, 2020, D. R. Commander.  All Rights Reserved.
10 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
11 * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
12 *
13 * This software is provided 'as-is', without any express or implied
14 * warranty.  In no event will the authors be held liable for any damages
15 * arising from the use of this software.
16 *
17 * Permission is granted to anyone to use this software for any purpose,
18 * including commercial applications, and to alter it and redistribute it
19 * freely, subject to the following restrictions:
20 *
21 * 1. The origin of this software must not be misrepresented; you must not
22 *    claim that you wrote the original software. If you use this software
23 *    in a product, an acknowledgment in the product documentation would be
24 *    appreciated but is not required.
25 * 2. Altered source versions must be plainly marked as such, and must not be
26 *    misrepresented as being the original software.
27 * 3. This notice may not be removed or altered from any source distribution.
28 */
29
30#if defined(__linux__) && defined(__ELF__)
31.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
32#endif
33
34#if defined(__APPLE__)
35.section __DATA, __const
36#elif defined(_WIN32)
37.section .rdata
38#else
39.section .rodata, "a", %progbits
40#endif
41
42/* Constants for jsimd_idct_islow_neon() */
43
44#define F_0_298   2446  /* FIX(0.298631336) */
45#define F_0_390   3196  /* FIX(0.390180644) */
46#define F_0_541   4433  /* FIX(0.541196100) */
47#define F_0_765   6270  /* FIX(0.765366865) */
48#define F_0_899   7373  /* FIX(0.899976223) */
49#define F_1_175   9633  /* FIX(1.175875602) */
50#define F_1_501  12299  /* FIX(1.501321110) */
51#define F_1_847  15137  /* FIX(1.847759065) */
52#define F_1_961  16069  /* FIX(1.961570560) */
53#define F_2_053  16819  /* FIX(2.053119869) */
54#define F_2_562  20995  /* FIX(2.562915447) */
55#define F_3_072  25172  /* FIX(3.072711026) */
56
57.balign 16
58Ljsimd_idct_islow_neon_consts:
59  .short F_0_298
60  .short -F_0_390
61  .short F_0_541
62  .short F_0_765
63  .short - F_0_899
64  .short F_1_175
65  .short F_1_501
66  .short - F_1_847
67  .short - F_1_961
68  .short F_2_053
69  .short - F_2_562
70  .short F_3_072
71  .short 0          /* padding */
72  .short 0
73  .short 0
74  .short 0
75
76#undef F_0_298
77#undef F_0_390
78#undef F_0_541
79#undef F_0_765
80#undef F_0_899
81#undef F_1_175
82#undef F_1_501
83#undef F_1_847
84#undef F_1_961
85#undef F_2_053
86#undef F_2_562
87#undef F_3_072
88
89/* Constants for jsimd_idct_ifast_neon() */
90
91.balign 16
92Ljsimd_idct_ifast_neon_consts:
93  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
94  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
95  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
96  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
97
98/* Constants for jsimd_idct_4x4_neon() and jsimd_idct_2x2_neon() */
99
100#define CONST_BITS  13
101
102#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
103#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
104#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
105#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
106#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
107#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
108#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
109#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
110#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
111#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
112#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
113#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
114#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
115#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */
116
117.balign 16
118Ljsimd_idct_4x4_neon_consts:
119  .short FIX_1_847759065        /* v0.h[0] */
120  .short -FIX_0_765366865       /* v0.h[1] */
121  .short -FIX_0_211164243       /* v0.h[2] */
122  .short FIX_1_451774981        /* v0.h[3] */
123  .short -FIX_2_172734803       /* d1[0] */
124  .short FIX_1_061594337        /* d1[1] */
125  .short -FIX_0_509795579       /* d1[2] */
126  .short -FIX_0_601344887       /* d1[3] */
127  .short FIX_0_899976223        /* v2.h[0] */
128  .short FIX_2_562915447        /* v2.h[1] */
129  .short 1 << (CONST_BITS + 1)  /* v2.h[2] */
130  .short 0                      /* v2.h[3] */
131
132.balign 8
133Ljsimd_idct_2x2_neon_consts:
134  .short -FIX_0_720959822  /* v14[0] */
135  .short FIX_0_850430095   /* v14[1] */
136  .short -FIX_1_272758580  /* v14[2] */
137  .short FIX_3_624509785   /* v14[3] */
138
139/* Constants for jsimd_ycc_*_neon() */
140
141.balign 16
142Ljsimd_ycc_rgb_neon_consts:
143  .short 0,      0,     0,      0
144  .short 22971, -11277, -23401, 29033
145  .short -128,  -128,   -128,   -128
146  .short -128,  -128,   -128,   -128
147
148/* Constants for jsimd_*_ycc_neon() */
149
150.balign 16
151Ljsimd_rgb_ycc_neon_consts:
152  .short 19595, 38470, 7471, 11059
153  .short 21709, 32768, 27439, 5329
154  .short 32767, 128, 32767, 128
155  .short 32767, 128, 32767, 128
156
157/* Constants for jsimd_fdct_islow_neon() */
158
159#define F_0_298   2446  /* FIX(0.298631336) */
160#define F_0_390   3196  /* FIX(0.390180644) */
161#define F_0_541   4433  /* FIX(0.541196100) */
162#define F_0_765   6270  /* FIX(0.765366865) */
163#define F_0_899   7373  /* FIX(0.899976223) */
164#define F_1_175   9633  /* FIX(1.175875602) */
165#define F_1_501  12299  /* FIX(1.501321110) */
166#define F_1_847  15137  /* FIX(1.847759065) */
167#define F_1_961  16069  /* FIX(1.961570560) */
168#define F_2_053  16819  /* FIX(2.053119869) */
169#define F_2_562  20995  /* FIX(2.562915447) */
170#define F_3_072  25172  /* FIX(3.072711026) */
171
172.balign 16
173Ljsimd_fdct_islow_neon_consts:
174  .short F_0_298
175  .short -F_0_390
176  .short F_0_541
177  .short F_0_765
178  .short - F_0_899
179  .short F_1_175
180  .short F_1_501
181  .short - F_1_847
182  .short - F_1_961
183  .short F_2_053
184  .short - F_2_562
185  .short F_3_072
186  .short 0          /* padding */
187  .short 0
188  .short 0
189  .short 0
190
191#undef F_0_298
192#undef F_0_390
193#undef F_0_541
194#undef F_0_765
195#undef F_0_899
196#undef F_1_175
197#undef F_1_501
198#undef F_1_847
199#undef F_1_961
200#undef F_2_053
201#undef F_2_562
202#undef F_3_072
203
204/* Constants for jsimd_fdct_ifast_neon() */
205
206.balign 16
207Ljsimd_fdct_ifast_neon_consts:
208  .short (98 * 128)               /* XFIX_0_382683433 */
209  .short (139 * 128)              /* XFIX_0_541196100 */
210  .short (181 * 128)              /* XFIX_0_707106781 */
211  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
212
213/* Constants for jsimd_h2*_downsample_neon() */
214
215.balign 16
216Ljsimd_h2_downsample_neon_consts:
217  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
218        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
219  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
220        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
221  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
222        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
223  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
224        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
225  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
226        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
227  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
228        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
229  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
230        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
231  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
232        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
233  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
234        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
235  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
236        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
237  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
238        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
239  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
240        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
241  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
242        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
243  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
244        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
245  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
246        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
247  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
248        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
249
250/* Constants for jsimd_huff_encode_one_block_neon() */
251
252.balign 16
253Ljsimd_huff_encode_one_block_neon_consts:
254    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
255          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
256    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
257            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
258    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
259            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
260    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
261           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
262    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
263            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
264    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
265            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
266    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
267            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
268    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
269            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
270    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
271            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
272    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
273           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
274    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
275             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
276    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
277           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
278    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
279           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
280
281.text
282
283
284#define RESPECT_STRICT_ALIGNMENT  1
285
286
287/*****************************************************************************/
288
289/* Supplementary macro for setting function attributes */
290.macro asm_function fname
291#ifdef __APPLE__
292    .private_extern _\fname
293    .globl _\fname
294_\fname:
295#else
296    .global \fname
297#ifdef __ELF__
298    .hidden \fname
299    .type \fname, %function
300#endif
301\fname:
302#endif
303.endm
304
305/* Get symbol location */
306.macro get_symbol_loc reg, symbol
307#ifdef __APPLE__
308    adrp            \reg, \symbol@PAGE
309    add             \reg, \reg, \symbol@PAGEOFF
310#else
311    adrp            \reg, \symbol
312    add             \reg, \reg, :lo12:\symbol
313#endif
314.endm
315
316/* Transpose elements of single 128 bit registers */
317.macro transpose_single x0, x1, xi, xilen, literal
318    ins             \xi\xilen[0], \x0\xilen[0]
319    ins             \x1\xilen[0], \x0\xilen[1]
320    trn1            \x0\literal, \x0\literal, \x1\literal
321    trn2            \x1\literal, \xi\literal, \x1\literal
322.endm
323
324/* Transpose elements of 2 different registers */
325.macro transpose x0, x1, xi, xilen, literal
326    mov             \xi\xilen, \x0\xilen
327    trn1            \x0\literal, \x0\literal, \x1\literal
328    trn2            \x1\literal, \xi\literal, \x1\literal
329.endm
330
331/* Transpose a block of 4x4 coefficients in four 64-bit registers */
332.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
333    mov             \xi\xilen, \x0\xilen
334    trn1            \x0\x0len, \x0\x0len, \x2\x2len
335    trn2            \x2\x2len, \xi\x0len, \x2\x2len
336    mov             \xi\xilen, \x1\xilen
337    trn1            \x1\x1len, \x1\x1len, \x3\x3len
338    trn2            \x3\x3len, \xi\x1len, \x3\x3len
339.endm
340
341.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
342    mov             \xi\xilen, \x0\xilen
343    trn1            \x0\x0len, \x0\x0len, \x1\x1len
344    trn2            \x1\x2len, \xi\x0len, \x1\x2len
345    mov             \xi\xilen, \x2\xilen
346    trn1            \x2\x2len, \x2\x2len, \x3\x3len
347    trn2            \x3\x2len, \xi\x1len, \x3\x3len
348.endm
349
350.macro transpose_4x4 x0, x1, x2, x3, x5
351    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
352    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
353.endm
354
355.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
356    trn1            \t0\().8h, \l0\().8h, \l1\().8h
357    trn1            \t1\().8h, \l2\().8h, \l3\().8h
358    trn1            \t2\().8h, \l4\().8h, \l5\().8h
359    trn1            \t3\().8h, \l6\().8h, \l7\().8h
360    trn2            \l1\().8h, \l0\().8h, \l1\().8h
361    trn2            \l3\().8h, \l2\().8h, \l3\().8h
362    trn2            \l5\().8h, \l4\().8h, \l5\().8h
363    trn2            \l7\().8h, \l6\().8h, \l7\().8h
364
365    trn1            \l4\().4s, \t2\().4s, \t3\().4s
366    trn2            \t3\().4s, \t2\().4s, \t3\().4s
367    trn1            \t2\().4s, \t0\().4s, \t1\().4s
368    trn2            \l2\().4s, \t0\().4s, \t1\().4s
369    trn1            \t0\().4s, \l1\().4s, \l3\().4s
370    trn2            \l3\().4s, \l1\().4s, \l3\().4s
371    trn2            \t1\().4s, \l5\().4s, \l7\().4s
372    trn1            \l5\().4s, \l5\().4s, \l7\().4s
373
374    trn2            \l6\().2d, \l2\().2d, \t3\().2d
375    trn1            \l0\().2d, \t2\().2d, \l4\().2d
376    trn1            \l1\().2d, \t0\().2d, \l5\().2d
377    trn2            \l7\().2d, \l3\().2d, \t1\().2d
378    trn1            \l2\().2d, \l2\().2d, \t3\().2d
379    trn2            \l4\().2d, \t2\().2d, \l4\().2d
380    trn1            \l3\().2d, \l3\().2d, \t1\().2d
381    trn2            \l5\().2d, \t0\().2d, \l5\().2d
382.endm
383
384
385#define CENTERJSAMPLE  128
386
387/*****************************************************************************/
388
389/*
390 * Perform dequantization and inverse DCT on one block of coefficients.
391 *
392 * GLOBAL(void)
393 * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
394 *                       JSAMPARRAY output_buf, JDIMENSION output_col)
395 */
396
397#define CONST_BITS  13
398#define PASS1_BITS  2
399
400#define XFIX_P_0_298  v0.h[0]
401#define XFIX_N_0_390  v0.h[1]
402#define XFIX_P_0_541  v0.h[2]
403#define XFIX_P_0_765  v0.h[3]
404#define XFIX_N_0_899  v0.h[4]
405#define XFIX_P_1_175  v0.h[5]
406#define XFIX_P_1_501  v0.h[6]
407#define XFIX_N_1_847  v0.h[7]
408#define XFIX_N_1_961  v1.h[0]
409#define XFIX_P_2_053  v1.h[1]
410#define XFIX_N_2_562  v1.h[2]
411#define XFIX_P_3_072  v1.h[3]
412
413asm_function jsimd_idct_islow_neon
414    DCT_TABLE       .req x0
415    COEF_BLOCK      .req x1
416    OUTPUT_BUF      .req x2
417    OUTPUT_COL      .req x3
418    TMP1            .req x0
419    TMP2            .req x1
420    TMP3            .req x9
421    TMP4            .req x10
422    TMP5            .req x11
423    TMP6            .req x12
424    TMP7            .req x13
425    TMP8            .req x14
426
427    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
428       guarantee that the upper (unused) 32 bits of x3 are valid.  This
429       instruction ensures that those bits are set to zero. */
430    uxtw x3, w3
431
432    sub             sp, sp, #64
433    get_symbol_loc  x15, Ljsimd_idct_islow_neon_consts
434    mov             x10, sp
435    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
436    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
437    ld1             {v0.8h, v1.8h}, [x15]
438    ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
439    ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
440    ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
441    ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
442
443    cmeq            v16.8h, v3.8h, #0
444    cmeq            v26.8h, v4.8h, #0
445    cmeq            v27.8h, v5.8h, #0
446    cmeq            v28.8h, v6.8h, #0
447    cmeq            v29.8h, v7.8h, #0
448    cmeq            v30.8h, v8.8h, #0
449    cmeq            v31.8h, v9.8h, #0
450
451    and             v10.16b, v16.16b, v26.16b
452    and             v11.16b, v27.16b, v28.16b
453    and             v12.16b, v29.16b, v30.16b
454    and             v13.16b, v31.16b, v10.16b
455    and             v14.16b, v11.16b, v12.16b
456    mul             v2.8h, v2.8h, v18.8h
457    and             v15.16b, v13.16b, v14.16b
458    shl             v10.8h, v2.8h, #(PASS1_BITS)
459    sqxtn           v16.8b, v15.8h
460    mov             TMP1, v16.d[0]
461    mvn             TMP2, TMP1
462
463    cbnz            TMP2, 2f
464    /* case all AC coeffs are zeros */
465    dup             v2.2d, v10.d[0]
466    dup             v6.2d, v10.d[1]
467    mov             v3.16b, v2.16b
468    mov             v7.16b, v6.16b
469    mov             v4.16b, v2.16b
470    mov             v8.16b, v6.16b
471    mov             v5.16b, v2.16b
472    mov             v9.16b, v6.16b
4731:
474    /* for this transpose, we should organise data like this:
475     * 00, 01, 02, 03, 40, 41, 42, 43
476     * 10, 11, 12, 13, 50, 51, 52, 53
477     * 20, 21, 22, 23, 60, 61, 62, 63
478     * 30, 31, 32, 33, 70, 71, 72, 73
479     * 04, 05, 06, 07, 44, 45, 46, 47
480     * 14, 15, 16, 17, 54, 55, 56, 57
481     * 24, 25, 26, 27, 64, 65, 66, 67
482     * 34, 35, 36, 37, 74, 75, 76, 77
483     */
484    trn1            v28.8h, v2.8h, v3.8h
485    trn1            v29.8h, v4.8h, v5.8h
486    trn1            v30.8h, v6.8h, v7.8h
487    trn1            v31.8h, v8.8h, v9.8h
488    trn2            v16.8h, v2.8h, v3.8h
489    trn2            v17.8h, v4.8h, v5.8h
490    trn2            v18.8h, v6.8h, v7.8h
491    trn2            v19.8h, v8.8h, v9.8h
492    trn1            v2.4s, v28.4s, v29.4s
493    trn1            v6.4s, v30.4s, v31.4s
494    trn1            v3.4s, v16.4s, v17.4s
495    trn1            v7.4s, v18.4s, v19.4s
496    trn2            v4.4s, v28.4s, v29.4s
497    trn2            v8.4s, v30.4s, v31.4s
498    trn2            v5.4s, v16.4s, v17.4s
499    trn2            v9.4s, v18.4s, v19.4s
500    /* Even part: reverse the even part of the forward DCT. */
501    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
502    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
503    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
504    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
505    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
506    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
507    mov             v21.16b, v19.16b               /* tmp3 = z1 */
508    mov             v20.16b, v18.16b               /* tmp3 = z1 */
509    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
510    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
511    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
512    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
513    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
514    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
515    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
516    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
517    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
518    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
519    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
520    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
521    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
522    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
523    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
524
525    /* Odd part per figure 8; the matrix is unitary and hence its
526     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
527     */
528
529    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
530    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
531    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
532    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
533    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
534
535    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
536    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
537    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
538    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
539    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
540    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
541    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
542    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
543    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
544
545    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
546    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
547    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
548    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
549    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
550    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
551    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
552    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
553    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
554
555    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
556    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
557    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
558    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
559
560    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
561    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
562    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
563    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
564    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
565    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
566    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
567    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
568
569    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
570    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
571    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
572    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
573    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
574    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
575    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
576    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
577
578    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
579
580    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
581    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
582    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
583    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
584    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
585    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
586    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
587    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
588    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
589    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
590    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
591    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
592    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
593    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
594    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
595    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
596
597    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
598    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
599    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
600    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
601    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
602    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
603    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
604    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
605    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
606    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
607    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
608    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
609    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
610    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
611    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
612    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
613    movi            v0.16b, #(CENTERJSAMPLE)
614    /* Prepare pointers (dual-issue with Neon instructions) */
615      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
616    sqrshrn         v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
617      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
618    sqrshrn         v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
619      add             TMP1, TMP1, OUTPUT_COL
620    sqrshrn         v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
621      add             TMP2, TMP2, OUTPUT_COL
622    sqrshrn         v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
623      add             TMP3, TMP3, OUTPUT_COL
624    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
625      add             TMP4, TMP4, OUTPUT_COL
626    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
627      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
628    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
629      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
630    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
631      add             TMP5, TMP5, OUTPUT_COL
632    add             v16.16b, v28.16b, v0.16b
633      add             TMP6, TMP6, OUTPUT_COL
634    add             v18.16b, v29.16b, v0.16b
635      add             TMP7, TMP7, OUTPUT_COL
636    add             v20.16b, v30.16b, v0.16b
637      add             TMP8, TMP8, OUTPUT_COL
638    add             v22.16b, v31.16b, v0.16b
639
640    /* Transpose the final 8-bit samples */
641    trn1            v28.16b, v16.16b, v18.16b
642    trn1            v30.16b, v20.16b, v22.16b
643    trn2            v29.16b, v16.16b, v18.16b
644    trn2            v31.16b, v20.16b, v22.16b
645
646    trn1            v16.8h, v28.8h, v30.8h
647    trn2            v18.8h, v28.8h, v30.8h
648    trn1            v20.8h, v29.8h, v31.8h
649    trn2            v22.8h, v29.8h, v31.8h
650
651    uzp1            v28.4s, v16.4s, v18.4s
652    uzp2            v30.4s, v16.4s, v18.4s
653    uzp1            v29.4s, v20.4s, v22.4s
654    uzp2            v31.4s, v20.4s, v22.4s
655
656    /* Store results to the output buffer */
657    st1             {v28.d}[0], [TMP1]
658    st1             {v29.d}[0], [TMP2]
659    st1             {v28.d}[1], [TMP3]
660    st1             {v29.d}[1], [TMP4]
661    st1             {v30.d}[0], [TMP5]
662    st1             {v31.d}[0], [TMP6]
663    st1             {v30.d}[1], [TMP7]
664    st1             {v31.d}[1], [TMP8]
665    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
666    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
667    blr             x30
668
669.balign 16
6702:
671    mul             v3.8h, v3.8h, v19.8h
672    mul             v4.8h, v4.8h, v20.8h
673    mul             v5.8h, v5.8h, v21.8h
674    add             TMP4, xzr, TMP2, LSL #32
675    mul             v6.8h, v6.8h, v22.8h
676    mul             v7.8h, v7.8h, v23.8h
677    adds            TMP3, xzr, TMP2, LSR #32
678    mul             v8.8h, v8.8h, v24.8h
679    mul             v9.8h, v9.8h, v25.8h
680    b.ne            3f
681    /* Right AC coef is zero */
682    dup             v15.2d, v10.d[1]
683    /* Even part: reverse the even part of the forward DCT. */
684    add             v18.4h, v4.4h, v8.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
685    add             v22.4h, v2.4h, v6.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
686    sub             v26.4h, v2.4h, v6.4h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
687    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
688    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
689    mov             v20.16b, v18.16b               /* tmp3 = z1 */
690    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
691    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
692    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
693    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
694    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
695    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
696    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
697
698    /* Odd part per figure 8; the matrix is unitary and hence its
699     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
700     */
701
702    add             v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
703    add             v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
704    add             v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
705    add             v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
706    add             v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */
707
708    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
709    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
710    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
711    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
712    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
713    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
714    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
715    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
716    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
717
718    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
719    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
720
721    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
722    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
723    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
724    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
725
726    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
727    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
728    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
729    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
730
731    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
732
733    add             v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
734    sub             v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
735    add             v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
736    sub             v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
737    add             v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
738    sub             v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
739    add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
740    sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */
741
742    rshrn           v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
743    rshrn           v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
744    rshrn           v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
745    rshrn           v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
746    rshrn2          v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
747    rshrn2          v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
748    rshrn2          v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
749    rshrn2          v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
750    mov             v6.16b, v15.16b
751    mov             v7.16b, v15.16b
752    mov             v8.16b, v15.16b
753    mov             v9.16b, v15.16b
754    b               1b
755
756.balign 16
7573:
758    cbnz            TMP4, 4f
759    /* Left AC coef is zero */
760    dup             v14.2d, v10.d[0]
761    /* Even part: reverse the even part of the forward DCT. */
762    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
763    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
764    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
765    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
766    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
767    mov             v21.16b, v19.16b               /* tmp3 = z1 */
768    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
769    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
770    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
771    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
772    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
773    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
774    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
775
776    /* Odd part per figure 8; the matrix is unitary and hence its
777     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
778     */
779
780    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
781    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
782    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
783    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
784    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
785
786    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
787    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
788    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
789    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
790    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
791    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
792    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
793    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
794    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
795
796    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
797    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
798    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
799    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
800
801    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
802    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
803    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
804    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
805
806    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
807    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
808    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
809    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
810
811    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
812
813    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
814    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
815    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
816    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
817    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
818    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
819    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
820    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
821
822    mov             v2.16b, v14.16b
823    mov             v3.16b, v14.16b
824    mov             v4.16b, v14.16b
825    mov             v5.16b, v14.16b
826    rshrn           v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
827    rshrn           v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
828    rshrn           v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
829    rshrn           v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
830    rshrn2          v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
831    rshrn2          v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
832    rshrn2          v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
833    rshrn2          v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
834    b               1b
835
836.balign 16
8374:
838    /* "No" AC coef is zero */
839    /* Even part: reverse the even part of the forward DCT. */
840    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
841    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
842    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
843    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
844    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
845    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
846    mov             v21.16b, v19.16b               /* tmp3 = z1 */
847    mov             v20.16b, v18.16b               /* tmp3 = z1 */
848    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
849    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
850    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
851    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
852    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
853    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
854    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
855    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
856    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
857    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
858    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
859    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
860    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
861    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
862    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
863
864    /* Odd part per figure 8; the matrix is unitary and hence its
865     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
866     */
867
868    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
869    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
870    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
871    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
872    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
873
874    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
875    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
876    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
877    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
878    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
879    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
880    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
881    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
882    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
883
884    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
885    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
886    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
887    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
888    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
889    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
890    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
891    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
892    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
893
894    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
895    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
896    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
897    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
898
899    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
900    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
901    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
902    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
903    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
904    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
905    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
906    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
907
908    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
909    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
910    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
911    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
912    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
913    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
914    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
915    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
916
917    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
918
919    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
920    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
921    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
922    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
923    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
924    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
925    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
926    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
927    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
928    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
929    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
930    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
931    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
932    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
933    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
934    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
935
936    rshrn           v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
937    rshrn           v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
938    rshrn           v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
939    rshrn           v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
940    rshrn           v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
941    rshrn           v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
942    rshrn           v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
943    rshrn           v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
944    rshrn2          v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
945    rshrn2          v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
946    rshrn2          v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
947    rshrn2          v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
948    rshrn2          v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
949    rshrn2          v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
950    rshrn2          v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
951    rshrn2          v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
952    b               1b
953
954    .unreq          DCT_TABLE
955    .unreq          COEF_BLOCK
956    .unreq          OUTPUT_BUF
957    .unreq          OUTPUT_COL
958    .unreq          TMP1
959    .unreq          TMP2
960    .unreq          TMP3
961    .unreq          TMP4
962    .unreq          TMP5
963    .unreq          TMP6
964    .unreq          TMP7
965    .unreq          TMP8
966
967#undef CENTERJSAMPLE
968#undef CONST_BITS
969#undef PASS1_BITS
970#undef XFIX_P_0_298
971#undef XFIX_N_0_390
972#undef XFIX_P_0_541
973#undef XFIX_P_0_765
974#undef XFIX_N_0_899
975#undef XFIX_P_1_175
976#undef XFIX_P_1_501
977#undef XFIX_N_1_847
978#undef XFIX_N_1_961
979#undef XFIX_P_2_053
980#undef XFIX_N_2_562
981#undef XFIX_P_3_072
982
983
984/*****************************************************************************/
985
986/*
987 * jsimd_idct_ifast_neon
988 *
989 * This function contains a fast, not so accurate integer implementation of
990 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
991 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
992 * function from jidctfst.c
993 *
994 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
995 * But in Arm Neon case some extra additions are required because VQDMULH
996 * instruction can't handle the constants larger than 1. So the expressions
997 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
998 * which introduces an extra addition. Overall, there are 6 extra additions
999 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
1000 */
1001
1002#define XFIX_1_082392200  v0.h[0]
1003#define XFIX_1_414213562  v0.h[1]
1004#define XFIX_1_847759065  v0.h[2]
1005#define XFIX_2_613125930  v0.h[3]
1006
1007asm_function jsimd_idct_ifast_neon
1008
1009    DCT_TABLE       .req x0
1010    COEF_BLOCK      .req x1
1011    OUTPUT_BUF      .req x2
1012    OUTPUT_COL      .req x3
1013    TMP1            .req x0
1014    TMP2            .req x1
1015    TMP3            .req x9
1016    TMP4            .req x10
1017    TMP5            .req x11
1018    TMP6            .req x12
1019    TMP7            .req x13
1020    TMP8            .req x14
1021
1022    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
1023       guarantee that the upper (unused) 32 bits of x3 are valid.  This
1024       instruction ensures that those bits are set to zero. */
1025    uxtw x3, w3
1026
1027    /* Load and dequantize coefficients into Neon registers
1028     * with the following allocation:
1029     *       0 1 2 3 | 4 5 6 7
1030     *      ---------+--------
1031     *   0 | d16     | d17     ( v16.8h )
1032     *   1 | d18     | d19     ( v17.8h )
1033     *   2 | d20     | d21     ( v18.8h )
1034     *   3 | d22     | d23     ( v19.8h )
1035     *   4 | d24     | d25     ( v20.8h )
1036     *   5 | d26     | d27     ( v21.8h )
1037     *   6 | d28     | d29     ( v22.8h )
1038     *   7 | d30     | d31     ( v23.8h )
1039     */
1040    /* Save Neon registers used in fast IDCT */
1041    get_symbol_loc  TMP5, Ljsimd_idct_ifast_neon_consts
1042    ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
1043    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
1044    ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
1045    mul             v16.8h, v16.8h, v0.8h
1046    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
1047    mul             v17.8h, v17.8h, v1.8h
1048    ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
1049    mul             v18.8h, v18.8h, v2.8h
1050    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
1051    mul             v19.8h, v19.8h, v3.8h
1052    ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
1053    mul             v20.8h, v20.8h, v0.8h
1054    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
1055    mul             v22.8h, v22.8h, v2.8h
1056    mul             v21.8h, v21.8h, v1.8h
1057    ld1             {v0.4h}, [TMP5]        /* load constants */
1058    mul             v23.8h, v23.8h, v3.8h
1059
1060    /* 1-D IDCT, pass 1 */
1061    sub             v2.8h, v18.8h, v22.8h
1062    add             v22.8h, v18.8h, v22.8h
1063    sub             v1.8h, v19.8h, v21.8h
1064    add             v21.8h, v19.8h, v21.8h
1065    sub             v5.8h, v17.8h, v23.8h
1066    add             v23.8h, v17.8h, v23.8h
1067    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
1068    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
1069    add             v3.8h, v1.8h, v1.8h
1070    sub             v1.8h, v5.8h, v1.8h
1071    add             v18.8h, v2.8h, v4.8h
1072    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
1073    sub             v2.8h, v23.8h, v21.8h
1074    add             v3.8h, v3.8h, v6.8h
1075    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
1076    add             v1.8h, v1.8h, v4.8h
1077    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
1078    sub             v18.8h, v18.8h, v22.8h
1079    add             v2.8h, v2.8h, v6.8h
1080    sub             v6.8h, v16.8h, v20.8h
1081    add             v20.8h, v16.8h, v20.8h
1082    add             v17.8h, v5.8h, v4.8h
1083    add             v5.8h, v6.8h, v18.8h
1084    sub             v18.8h, v6.8h, v18.8h
1085    add             v6.8h, v23.8h, v21.8h
1086    add             v16.8h, v20.8h, v22.8h
1087    sub             v3.8h, v6.8h, v3.8h
1088    sub             v20.8h, v20.8h, v22.8h
1089    sub             v3.8h, v3.8h, v1.8h
1090    sub             v1.8h, v17.8h, v1.8h
1091    add             v2.8h, v3.8h, v2.8h
1092    sub             v23.8h, v16.8h, v6.8h
1093    add             v1.8h, v1.8h, v2.8h
1094    add             v16.8h, v16.8h, v6.8h
1095    add             v22.8h, v5.8h, v3.8h
1096    sub             v17.8h, v5.8h, v3.8h
1097    sub             v21.8h, v18.8h, v2.8h
1098    add             v18.8h, v18.8h, v2.8h
1099    sub             v19.8h, v20.8h, v1.8h
1100    add             v20.8h, v20.8h, v1.8h
1101    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
1102    /* 1-D IDCT, pass 2 */
1103    sub             v2.8h, v18.8h, v22.8h
1104    add             v22.8h, v18.8h, v22.8h
1105    sub             v1.8h, v19.8h, v21.8h
1106    add             v21.8h, v19.8h, v21.8h
1107    sub             v5.8h, v17.8h, v23.8h
1108    add             v23.8h, v17.8h, v23.8h
1109    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
1110    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
1111    add             v3.8h, v1.8h, v1.8h
1112    sub             v1.8h, v5.8h, v1.8h
1113    add             v18.8h, v2.8h, v4.8h
1114    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
1115    sub             v2.8h, v23.8h, v21.8h
1116    add             v3.8h, v3.8h, v6.8h
1117    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
1118    add             v1.8h, v1.8h, v4.8h
1119    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
1120    sub             v18.8h, v18.8h, v22.8h
1121    add             v2.8h, v2.8h, v6.8h
1122    sub             v6.8h, v16.8h, v20.8h
1123    add             v20.8h, v16.8h, v20.8h
1124    add             v17.8h, v5.8h, v4.8h
1125    add             v5.8h, v6.8h, v18.8h
1126    sub             v18.8h, v6.8h, v18.8h
1127    add             v6.8h, v23.8h, v21.8h
1128    add             v16.8h, v20.8h, v22.8h
1129    sub             v3.8h, v6.8h, v3.8h
1130    sub             v20.8h, v20.8h, v22.8h
1131    sub             v3.8h, v3.8h, v1.8h
1132    sub             v1.8h, v17.8h, v1.8h
1133    add             v2.8h, v3.8h, v2.8h
1134    sub             v23.8h, v16.8h, v6.8h
1135    add             v1.8h, v1.8h, v2.8h
1136    add             v16.8h, v16.8h, v6.8h
1137    add             v22.8h, v5.8h, v3.8h
1138    sub             v17.8h, v5.8h, v3.8h
1139    sub             v21.8h, v18.8h, v2.8h
1140    add             v18.8h, v18.8h, v2.8h
1141    sub             v19.8h, v20.8h, v1.8h
1142    add             v20.8h, v20.8h, v1.8h
1143    /* Descale to 8-bit and range limit */
1144    movi            v0.16b, #0x80
1145      /* Prepare pointers (dual-issue with Neon instructions) */
1146      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
1147    sqshrn          v28.8b, v16.8h, #5
1148      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
1149    sqshrn          v29.8b, v17.8h, #5
1150      add             TMP1, TMP1, OUTPUT_COL
1151    sqshrn          v30.8b, v18.8h, #5
1152      add             TMP2, TMP2, OUTPUT_COL
1153    sqshrn          v31.8b, v19.8h, #5
1154      add             TMP3, TMP3, OUTPUT_COL
1155    sqshrn2         v28.16b, v20.8h, #5
1156      add             TMP4, TMP4, OUTPUT_COL
1157    sqshrn2         v29.16b, v21.8h, #5
1158      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
1159    sqshrn2         v30.16b, v22.8h, #5
1160      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
1161    sqshrn2         v31.16b, v23.8h, #5
1162      add             TMP5, TMP5, OUTPUT_COL
1163    add             v16.16b, v28.16b, v0.16b
1164      add             TMP6, TMP6, OUTPUT_COL
1165    add             v18.16b, v29.16b, v0.16b
1166      add             TMP7, TMP7, OUTPUT_COL
1167    add             v20.16b, v30.16b, v0.16b
1168      add             TMP8, TMP8, OUTPUT_COL
1169    add             v22.16b, v31.16b, v0.16b
1170
1171    /* Transpose the final 8-bit samples */
1172    trn1            v28.16b, v16.16b, v18.16b
1173    trn1            v30.16b, v20.16b, v22.16b
1174    trn2            v29.16b, v16.16b, v18.16b
1175    trn2            v31.16b, v20.16b, v22.16b
1176
1177    trn1            v16.8h, v28.8h, v30.8h
1178    trn2            v18.8h, v28.8h, v30.8h
1179    trn1            v20.8h, v29.8h, v31.8h
1180    trn2            v22.8h, v29.8h, v31.8h
1181
1182    uzp1            v28.4s, v16.4s, v18.4s
1183    uzp2            v30.4s, v16.4s, v18.4s
1184    uzp1            v29.4s, v20.4s, v22.4s
1185    uzp2            v31.4s, v20.4s, v22.4s
1186
1187    /* Store results to the output buffer */
1188    st1             {v28.d}[0], [TMP1]
1189    st1             {v29.d}[0], [TMP2]
1190    st1             {v28.d}[1], [TMP3]
1191    st1             {v29.d}[1], [TMP4]
1192    st1             {v30.d}[0], [TMP5]
1193    st1             {v31.d}[0], [TMP6]
1194    st1             {v30.d}[1], [TMP7]
1195    st1             {v31.d}[1], [TMP8]
1196    blr             x30
1197
1198    .unreq          DCT_TABLE
1199    .unreq          COEF_BLOCK
1200    .unreq          OUTPUT_BUF
1201    .unreq          OUTPUT_COL
1202    .unreq          TMP1
1203    .unreq          TMP2
1204    .unreq          TMP3
1205    .unreq          TMP4
1206    .unreq          TMP5
1207    .unreq          TMP6
1208    .unreq          TMP7
1209    .unreq          TMP8
1210
1211
1212/*****************************************************************************/
1213
1214/*
1215 * jsimd_idct_4x4_neon
1216 *
1217 * This function contains inverse-DCT code for getting reduced-size
1218 * 4x4 pixels output from an 8x8 DCT block. It uses the same  calculations
1219 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
1220 * function from jpeg-6b (jidctred.c).
1221 *
1222 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
1223 *       requires much less arithmetic operations and hence should be faster.
1224 *       The primary purpose of this particular Neon optimized function is
1225 *       bit exact compatibility with jpeg-6b.
1226 *
1227 * TODO: a bit better instructions scheduling can be achieved by expanding
1228 *       idct_helper/transpose_4x4 macros and reordering instructions,
1229 *       but readability will suffer somewhat.
1230 */
1231
1232.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
1233    smull           v28.4s, \x4, v2.h[2]
1234    smlal           v28.4s, \x8, v0.h[0]
1235    smlal           v28.4s, \x14, v0.h[1]
1236
1237    smull           v26.4s, \x16, v1.h[2]
1238    smlal           v26.4s, \x12, v1.h[3]
1239    smlal           v26.4s, \x10, v2.h[0]
1240    smlal           v26.4s, \x6, v2.h[1]
1241
1242    smull           v30.4s, \x4, v2.h[2]
1243    smlsl           v30.4s, \x8, v0.h[0]
1244    smlsl           v30.4s, \x14, v0.h[1]
1245
1246    smull           v24.4s, \x16, v0.h[2]
1247    smlal           v24.4s, \x12, v0.h[3]
1248    smlal           v24.4s, \x10, v1.h[0]
1249    smlal           v24.4s, \x6, v1.h[1]
1250
1251    add             v20.4s, v28.4s, v26.4s
1252    sub             v28.4s, v28.4s, v26.4s
1253
1254  .if \shift > 16
1255    srshr           v20.4s, v20.4s, #\shift
1256    srshr           v28.4s, v28.4s, #\shift
1257    xtn             \y26, v20.4s
1258    xtn             \y29, v28.4s
1259  .else
1260    rshrn           \y26, v20.4s, #\shift
1261    rshrn           \y29, v28.4s, #\shift
1262  .endif
1263
1264    add             v20.4s, v30.4s, v24.4s
1265    sub             v30.4s, v30.4s, v24.4s
1266
1267  .if \shift > 16
1268    srshr           v20.4s, v20.4s, #\shift
1269    srshr           v30.4s, v30.4s, #\shift
1270    xtn             \y27, v20.4s
1271    xtn             \y28, v30.4s
1272  .else
1273    rshrn           \y27, v20.4s, #\shift
1274    rshrn           \y28, v30.4s, #\shift
1275  .endif
1276.endm
1277
1278asm_function jsimd_idct_4x4_neon
1279
1280    DCT_TABLE       .req x0
1281    COEF_BLOCK      .req x1
1282    OUTPUT_BUF      .req x2
1283    OUTPUT_COL      .req x3
1284    TMP1            .req x0
1285    TMP2            .req x1
1286    TMP3            .req x2
1287    TMP4            .req x15
1288
1289    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
1290       guarantee that the upper (unused) 32 bits of x3 are valid.  This
1291       instruction ensures that those bits are set to zero. */
1292    uxtw x3, w3
1293
1294    /* Save all used Neon registers */
1295    sub             sp, sp, 64
1296    mov             x9, sp
1297    /* Load constants (v3.4h is just used for padding) */
1298    get_symbol_loc  TMP4, Ljsimd_idct_4x4_neon_consts
1299    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
1300    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
1301    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
1302
1303    /* Load all COEF_BLOCK into Neon registers with the following allocation:
1304     *       0 1 2 3 | 4 5 6 7
1305     *      ---------+--------
1306     *   0 | v4.4h   | v5.4h
1307     *   1 | v6.4h   | v7.4h
1308     *   2 | v8.4h   | v9.4h
1309     *   3 | v10.4h  | v11.4h
1310     *   4 | -       | -
1311     *   5 | v12.4h  | v13.4h
1312     *   6 | v14.4h  | v15.4h
1313     *   7 | v16.4h  | v17.4h
1314     */
1315    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
1316    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
1317    add             COEF_BLOCK, COEF_BLOCK, #16
1318    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
1319    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
1320    /* dequantize */
1321    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
1322    mul             v4.4h, v4.4h, v18.4h
1323    mul             v5.4h, v5.4h, v19.4h
1324    ins             v4.d[1], v5.d[0]              /* 128 bit q4 */
1325    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
1326    mul             v6.4h, v6.4h, v20.4h
1327    mul             v7.4h, v7.4h, v21.4h
1328    ins             v6.d[1], v7.d[0]              /* 128 bit q6 */
1329    mul             v8.4h, v8.4h, v22.4h
1330    mul             v9.4h, v9.4h, v23.4h
1331    ins             v8.d[1], v9.d[0]              /* 128 bit q8 */
1332    add             DCT_TABLE, DCT_TABLE, #16
1333    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
1334    mul             v10.4h, v10.4h, v24.4h
1335    mul             v11.4h, v11.4h, v25.4h
1336    ins             v10.d[1], v11.d[0]            /* 128 bit q10 */
1337    mul             v12.4h, v12.4h, v26.4h
1338    mul             v13.4h, v13.4h, v27.4h
1339    ins             v12.d[1], v13.d[0]            /* 128 bit q12 */
1340    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
1341    mul             v14.4h, v14.4h, v28.4h
1342    mul             v15.4h, v15.4h, v29.4h
1343    ins             v14.d[1], v15.d[0]            /* 128 bit q14 */
1344    mul             v16.4h, v16.4h, v30.4h
1345    mul             v17.4h, v17.4h, v31.4h
1346    ins             v16.d[1], v17.d[0]            /* 128 bit q16 */
1347
1348    /* Pass 1 */
1349    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
1350                    v4.4h, v6.4h, v8.4h, v10.4h
1351    transpose_4x4   v4, v6, v8, v10, v3
1352    ins             v10.d[1], v11.d[0]
1353    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
1354                    v5.4h, v7.4h, v9.4h, v11.4h
1355    transpose_4x4   v5, v7, v9, v11, v3
1356    ins             v10.d[1], v11.d[0]
1357
1358    /* Pass 2 */
1359    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
1360                    v26.4h, v27.4h, v28.4h, v29.4h
1361    transpose_4x4   v26, v27, v28, v29, v3
1362
1363    /* Range limit */
1364    movi            v30.8h, #0x80
1365    ins             v26.d[1], v27.d[0]
1366    ins             v28.d[1], v29.d[0]
1367    add             v26.8h, v26.8h, v30.8h
1368    add             v28.8h, v28.8h, v30.8h
1369    sqxtun          v26.8b, v26.8h
1370    sqxtun          v27.8b, v28.8h
1371
1372    /* Store results to the output buffer */
1373    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
1374    ldp             TMP3, TMP4, [OUTPUT_BUF]
1375    add             TMP1, TMP1, OUTPUT_COL
1376    add             TMP2, TMP2, OUTPUT_COL
1377    add             TMP3, TMP3, OUTPUT_COL
1378    add             TMP4, TMP4, OUTPUT_COL
1379
1380#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
1381    /* We can use much less instructions on little endian systems if the
1382     * OS kernel is not configured to trap unaligned memory accesses
1383     */
1384    st1             {v26.s}[0], [TMP1], 4
1385    st1             {v27.s}[0], [TMP3], 4
1386    st1             {v26.s}[1], [TMP2], 4
1387    st1             {v27.s}[1], [TMP4], 4
1388#else
1389    st1             {v26.b}[0], [TMP1], 1
1390    st1             {v27.b}[0], [TMP3], 1
1391    st1             {v26.b}[1], [TMP1], 1
1392    st1             {v27.b}[1], [TMP3], 1
1393    st1             {v26.b}[2], [TMP1], 1
1394    st1             {v27.b}[2], [TMP3], 1
1395    st1             {v26.b}[3], [TMP1], 1
1396    st1             {v27.b}[3], [TMP3], 1
1397
1398    st1             {v26.b}[4], [TMP2], 1
1399    st1             {v27.b}[4], [TMP4], 1
1400    st1             {v26.b}[5], [TMP2], 1
1401    st1             {v27.b}[5], [TMP4], 1
1402    st1             {v26.b}[6], [TMP2], 1
1403    st1             {v27.b}[6], [TMP4], 1
1404    st1             {v26.b}[7], [TMP2], 1
1405    st1             {v27.b}[7], [TMP4], 1
1406#endif
1407
1408    /* vpop            {v8.4h - v15.4h}    (not available) */
1409    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1410    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1411    blr             x30
1412
1413    .unreq          DCT_TABLE
1414    .unreq          COEF_BLOCK
1415    .unreq          OUTPUT_BUF
1416    .unreq          OUTPUT_COL
1417    .unreq          TMP1
1418    .unreq          TMP2
1419    .unreq          TMP3
1420    .unreq          TMP4
1421
1422.purgem idct_helper
1423
1424
1425/*****************************************************************************/
1426
1427/*
1428 * jsimd_idct_2x2_neon
1429 *
1430 * This function contains inverse-DCT code for getting reduced-size
1431 * 2x2 pixels output from an 8x8 DCT block. It uses the same  calculations
1432 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1433 * function from jpeg-6b (jidctred.c).
1434 *
1435 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
1436 *       requires much less arithmetic operations and hence should be faster.
1437 *       The primary purpose of this particular Neon optimized function is
1438 *       bit exact compatibility with jpeg-6b.
1439 */
1440
1441.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
1442    sshll           v15.4s, \x4, #15
1443    smull           v26.4s, \x6, v14.h[3]
1444    smlal           v26.4s, \x10, v14.h[2]
1445    smlal           v26.4s, \x12, v14.h[1]
1446    smlal           v26.4s, \x16, v14.h[0]
1447
1448    add             v20.4s, v15.4s, v26.4s
1449    sub             v15.4s, v15.4s, v26.4s
1450
1451  .if \shift > 16
1452    srshr           v20.4s, v20.4s, #\shift
1453    srshr           v15.4s, v15.4s, #\shift
1454    xtn             \y26, v20.4s
1455    xtn             \y27, v15.4s
1456  .else
1457    rshrn           \y26, v20.4s, #\shift
1458    rshrn           \y27, v15.4s, #\shift
1459  .endif
1460.endm
1461
1462asm_function jsimd_idct_2x2_neon
1463
1464    DCT_TABLE       .req x0
1465    COEF_BLOCK      .req x1
1466    OUTPUT_BUF      .req x2
1467    OUTPUT_COL      .req x3
1468    TMP1            .req x0
1469    TMP2            .req x15
1470
1471    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
1472       guarantee that the upper (unused) 32 bits of x3 are valid.  This
1473       instruction ensures that those bits are set to zero. */
1474    uxtw x3, w3
1475
1476    /* vpush           {v8.4h - v15.4h}    (not available) */
1477    sub             sp, sp, 64
1478    mov             x9, sp
1479
1480    /* Load constants */
1481    get_symbol_loc  TMP2, Ljsimd_idct_2x2_neon_consts
1482    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
1483    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
1484    ld1             {v14.4h}, [TMP2]
1485
1486    /* Load all COEF_BLOCK into Neon registers with the following allocation:
1487     *       0 1 2 3 | 4 5 6 7
1488     *      ---------+--------
1489     *   0 | v4.4h   | v5.4h
1490     *   1 | v6.4h   | v7.4h
1491     *   2 | -       | -
1492     *   3 | v10.4h  | v11.4h
1493     *   4 | -       | -
1494     *   5 | v12.4h  | v13.4h
1495     *   6 | -       | -
1496     *   7 | v16.4h  | v17.4h
1497     */
1498    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
1499    add             COEF_BLOCK, COEF_BLOCK, #16
1500    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
1501    add             COEF_BLOCK, COEF_BLOCK, #16
1502    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
1503    add             COEF_BLOCK, COEF_BLOCK, #16
1504    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
1505    /* Dequantize */
1506    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
1507    mul             v4.4h, v4.4h, v18.4h
1508    mul             v5.4h, v5.4h, v19.4h
1509    ins             v4.d[1], v5.d[0]
1510    mul             v6.4h, v6.4h, v20.4h
1511    mul             v7.4h, v7.4h, v21.4h
1512    ins             v6.d[1], v7.d[0]
1513    add             DCT_TABLE, DCT_TABLE, #16
1514    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
1515    mul             v10.4h, v10.4h, v24.4h
1516    mul             v11.4h, v11.4h, v25.4h
1517    ins             v10.d[1], v11.d[0]
1518    add             DCT_TABLE, DCT_TABLE, #16
1519    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
1520    mul             v12.4h, v12.4h, v26.4h
1521    mul             v13.4h, v13.4h, v27.4h
1522    ins             v12.d[1], v13.d[0]
1523    add             DCT_TABLE, DCT_TABLE, #16
1524    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
1525    mul             v16.4h, v16.4h, v30.4h
1526    mul             v17.4h, v17.4h, v31.4h
1527    ins             v16.d[1], v17.d[0]
1528
1529    /* Pass 1 */
1530#if 0
1531    idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
1532    transpose_4x4   v4.4h, v6.4h, v8.4h, v10.4h
1533    idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
1534    transpose_4x4   v5.4h, v7.4h, v9.4h, v11.4h
1535#else
1536    smull           v26.4s, v6.4h, v14.h[3]
1537    smlal           v26.4s, v10.4h, v14.h[2]
1538    smlal           v26.4s, v12.4h, v14.h[1]
1539    smlal           v26.4s, v16.4h, v14.h[0]
1540    smull           v24.4s, v7.4h, v14.h[3]
1541    smlal           v24.4s, v11.4h, v14.h[2]
1542    smlal           v24.4s, v13.4h, v14.h[1]
1543    smlal           v24.4s, v17.4h, v14.h[0]
1544    sshll           v15.4s, v4.4h, #15
1545    sshll           v30.4s, v5.4h, #15
1546    add             v20.4s, v15.4s, v26.4s
1547    sub             v15.4s, v15.4s, v26.4s
1548    rshrn           v4.4h, v20.4s, #13
1549    rshrn           v6.4h, v15.4s, #13
1550    add             v20.4s, v30.4s, v24.4s
1551    sub             v15.4s, v30.4s, v24.4s
1552    rshrn           v5.4h, v20.4s, #13
1553    rshrn           v7.4h, v15.4s, #13
1554    ins             v4.d[1], v5.d[0]
1555    ins             v6.d[1], v7.d[0]
1556    transpose       v4, v6, v3, .16b, .8h
1557    transpose       v6, v10, v3, .16b, .4s
1558    ins             v11.d[0], v10.d[1]
1559    ins             v7.d[0], v6.d[1]
1560#endif
1561
1562    /* Pass 2 */
1563    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
1564
1565    /* Range limit */
1566    movi            v30.8h, #0x80
1567    ins             v26.d[1], v27.d[0]
1568    add             v26.8h, v26.8h, v30.8h
1569    sqxtun          v30.8b, v26.8h
1570    ins             v26.d[0], v30.d[0]
1571    sqxtun          v27.8b, v26.8h
1572
1573    /* Store results to the output buffer */
1574    ldp             TMP1, TMP2, [OUTPUT_BUF]
1575    add             TMP1, TMP1, OUTPUT_COL
1576    add             TMP2, TMP2, OUTPUT_COL
1577
1578    st1             {v26.b}[0], [TMP1], 1
1579    st1             {v27.b}[4], [TMP1], 1
1580    st1             {v26.b}[1], [TMP2], 1
1581    st1             {v27.b}[5], [TMP2], 1
1582
1583    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1584    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1585    blr             x30
1586
1587    .unreq          DCT_TABLE
1588    .unreq          COEF_BLOCK
1589    .unreq          OUTPUT_BUF
1590    .unreq          OUTPUT_COL
1591    .unreq          TMP1
1592    .unreq          TMP2
1593
1594.purgem idct_helper
1595
1596
1597/*****************************************************************************/
1598
1599/*
1600 * jsimd_ycc_extrgb_convert_neon
1601 * jsimd_ycc_extbgr_convert_neon
1602 * jsimd_ycc_extrgbx_convert_neon
1603 * jsimd_ycc_extbgrx_convert_neon
1604 * jsimd_ycc_extxbgr_convert_neon
1605 * jsimd_ycc_extxrgb_convert_neon
1606 *
1607 * Colorspace conversion YCbCr -> RGB
1608 */
1609
1610.macro do_load size
1611  .if \size == 8
1612    ld1             {v4.8b}, [U], 8
1613    ld1             {v5.8b}, [V], 8
1614    ld1             {v0.8b}, [Y], 8
1615    prfm            pldl1keep, [U, #64]
1616    prfm            pldl1keep, [V, #64]
1617    prfm            pldl1keep, [Y, #64]
1618  .elseif \size == 4
1619    ld1             {v4.b}[0], [U], 1
1620    ld1             {v4.b}[1], [U], 1
1621    ld1             {v4.b}[2], [U], 1
1622    ld1             {v4.b}[3], [U], 1
1623    ld1             {v5.b}[0], [V], 1
1624    ld1             {v5.b}[1], [V], 1
1625    ld1             {v5.b}[2], [V], 1
1626    ld1             {v5.b}[3], [V], 1
1627    ld1             {v0.b}[0], [Y], 1
1628    ld1             {v0.b}[1], [Y], 1
1629    ld1             {v0.b}[2], [Y], 1
1630    ld1             {v0.b}[3], [Y], 1
1631  .elseif \size == 2
1632    ld1             {v4.b}[4], [U], 1
1633    ld1             {v4.b}[5], [U], 1
1634    ld1             {v5.b}[4], [V], 1
1635    ld1             {v5.b}[5], [V], 1
1636    ld1             {v0.b}[4], [Y], 1
1637    ld1             {v0.b}[5], [Y], 1
1638  .elseif \size == 1
1639    ld1             {v4.b}[6], [U], 1
1640    ld1             {v5.b}[6], [V], 1
1641    ld1             {v0.b}[6], [Y], 1
1642  .else
1643    .error unsupported macroblock size
1644  .endif
1645.endm
1646
1647.macro do_store bpp, size, fast_st3
1648  .if \bpp == 24
1649    .if \size == 8
1650      .if \fast_st3 == 1
1651        st3         {v10.8b, v11.8b, v12.8b}, [RGB], 24
1652      .else
1653        st1         {v10.b}[0], [RGB], #1
1654        st1         {v11.b}[0], [RGB], #1
1655        st1         {v12.b}[0], [RGB], #1
1656
1657        st1         {v10.b}[1], [RGB], #1
1658        st1         {v11.b}[1], [RGB], #1
1659        st1         {v12.b}[1], [RGB], #1
1660
1661        st1         {v10.b}[2], [RGB], #1
1662        st1         {v11.b}[2], [RGB], #1
1663        st1         {v12.b}[2], [RGB], #1
1664
1665        st1         {v10.b}[3], [RGB], #1
1666        st1         {v11.b}[3], [RGB], #1
1667        st1         {v12.b}[3], [RGB], #1
1668
1669        st1         {v10.b}[4], [RGB], #1
1670        st1         {v11.b}[4], [RGB], #1
1671        st1         {v12.b}[4], [RGB], #1
1672
1673        st1         {v10.b}[5], [RGB], #1
1674        st1         {v11.b}[5], [RGB], #1
1675        st1         {v12.b}[5], [RGB], #1
1676
1677        st1         {v10.b}[6], [RGB], #1
1678        st1         {v11.b}[6], [RGB], #1
1679        st1         {v12.b}[6], [RGB], #1
1680
1681        st1         {v10.b}[7], [RGB], #1
1682        st1         {v11.b}[7], [RGB], #1
1683        st1         {v12.b}[7], [RGB], #1
1684      .endif
1685    .elseif \size == 4
1686      st3           {v10.b, v11.b, v12.b}[0], [RGB], 3
1687      st3           {v10.b, v11.b, v12.b}[1], [RGB], 3
1688      st3           {v10.b, v11.b, v12.b}[2], [RGB], 3
1689      st3           {v10.b, v11.b, v12.b}[3], [RGB], 3
1690    .elseif \size == 2
1691      st3           {v10.b, v11.b, v12.b}[4], [RGB], 3
1692      st3           {v10.b, v11.b, v12.b}[5], [RGB], 3
1693    .elseif \size == 1
1694      st3           {v10.b, v11.b, v12.b}[6], [RGB], 3
1695    .else
1696     .error unsupported macroblock size
1697    .endif
1698  .elseif \bpp == 32
1699    .if \size == 8
1700      st4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
1701    .elseif \size == 4
1702      st4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
1703      st4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
1704      st4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
1705      st4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
1706    .elseif \size == 2
1707      st4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
1708      st4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
1709    .elseif \size == 1
1710      st4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
1711    .else
1712      .error unsupported macroblock size
1713    .endif
1714  .elseif \bpp == 16
1715    .if \size == 8
1716      st1           {v25.8h}, [RGB], 16
1717    .elseif \size == 4
1718      st1           {v25.4h}, [RGB], 8
1719    .elseif \size == 2
1720      st1           {v25.h}[4], [RGB], 2
1721      st1           {v25.h}[5], [RGB], 2
1722    .elseif \size == 1
1723      st1           {v25.h}[6], [RGB], 2
1724    .else
1725      .error unsupported macroblock size
1726    .endif
1727  .else
1728    .error unsupported bpp
1729  .endif
1730.endm
1731
1732.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
1733                                           g_offs, gsize, b_offs, bsize, \
1734                                           defsize, fast_st3
1735
1736/*
1737 * 2-stage pipelined YCbCr->RGB conversion
1738 */
1739
1740.macro do_yuv_to_rgb_stage1
1741    uaddw           v6.8h, v2.8h, v4.8b     /* q3 = u - 128 */
1742    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
1743    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
1744    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
1745    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
1746    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
1747    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
1748    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
1749    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
1750    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
1751.endm
1752
1753.macro do_yuv_to_rgb_stage2
1754    rshrn           v20.4h, v20.4s, #15
1755    rshrn2          v20.8h, v22.4s, #15
1756    rshrn           v24.4h, v24.4s, #14
1757    rshrn2          v24.8h, v26.4s, #14
1758    rshrn           v28.4h, v28.4s, #14
1759    rshrn2          v28.8h, v30.4s, #14
1760    uaddw           v20.8h, v20.8h, v0.8b
1761    uaddw           v24.8h, v24.8h, v0.8b
1762    uaddw           v28.8h, v28.8h, v0.8b
1763  .if \bpp != 16
1764    sqxtun          v1\g_offs\defsize, v20.8h
1765    sqxtun          v1\r_offs\defsize, v24.8h
1766    sqxtun          v1\b_offs\defsize, v28.8h
1767  .else
1768    sqshlu          v21.8h, v20.8h, #8
1769    sqshlu          v25.8h, v24.8h, #8
1770    sqshlu          v29.8h, v28.8h, #8
1771    sri             v25.8h, v21.8h, #5
1772    sri             v25.8h, v29.8h, #11
1773  .endif
1774.endm
1775
1776.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
1777    rshrn           v20.4h, v20.4s, #15
1778    rshrn           v24.4h, v24.4s, #14
1779    rshrn           v28.4h, v28.4s, #14
1780    ld1             {v4.8b}, [U], 8
1781    rshrn2          v20.8h, v22.4s, #15
1782    rshrn2          v24.8h, v26.4s, #14
1783    rshrn2          v28.8h, v30.4s, #14
1784    ld1             {v5.8b}, [V], 8
1785    uaddw           v20.8h, v20.8h, v0.8b
1786    uaddw           v24.8h, v24.8h, v0.8b
1787    uaddw           v28.8h, v28.8h, v0.8b
1788  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
1789    sqxtun          v1\g_offs\defsize, v20.8h
1790    ld1             {v0.8b}, [Y], 8
1791    sqxtun          v1\r_offs\defsize, v24.8h
1792    prfm            pldl1keep, [U, #64]
1793    prfm            pldl1keep, [V, #64]
1794    prfm            pldl1keep, [Y, #64]
1795    sqxtun          v1\b_offs\defsize, v28.8h
1796    uaddw           v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
1797    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
1798    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
1799    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
1800    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
1801    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
1802    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
1803    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
1804  .else  /**************************** rgb565 ********************************/
1805    sqshlu          v21.8h, v20.8h, #8
1806    sqshlu          v25.8h, v24.8h, #8
1807    sqshlu          v29.8h, v28.8h, #8
1808    uaddw           v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
1809    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
1810    ld1             {v0.8b}, [Y], 8
1811    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
1812    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
1813    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
1814    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
1815    sri             v25.8h, v21.8h, #5
1816    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
1817    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
1818    prfm            pldl1keep, [U, #64]
1819    prfm            pldl1keep, [V, #64]
1820    prfm            pldl1keep, [Y, #64]
1821    sri             v25.8h, v29.8h, #11
1822  .endif
1823    do_store        \bpp, 8, \fast_st3
1824    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
1825    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
1826.endm
1827
1828.macro do_yuv_to_rgb
1829    do_yuv_to_rgb_stage1
1830    do_yuv_to_rgb_stage2
1831.endm
1832
1833.if \fast_st3 == 1
1834asm_function jsimd_ycc_\colorid\()_convert_neon
1835.else
1836asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
1837.endif
1838    OUTPUT_WIDTH    .req w0
1839    INPUT_BUF       .req x1
1840    INPUT_ROW       .req w2
1841    OUTPUT_BUF      .req x3
1842    NUM_ROWS        .req w4
1843
1844    INPUT_BUF0      .req x5
1845    INPUT_BUF1      .req x6
1846    INPUT_BUF2      .req x1
1847
1848    RGB             .req x7
1849    Y               .req x9
1850    U               .req x10
1851    V               .req x11
1852    N               .req w15
1853
1854    sub             sp, sp, 64
1855    mov             x9, sp
1856
1857    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
1858    get_symbol_loc  x15, Ljsimd_ycc_rgb_neon_consts
1859
1860    /* Save Neon registers */
1861    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
1862    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
1863    ld1             {v0.4h, v1.4h}, [x15], 16
1864    ld1             {v2.8h}, [x15]
1865
1866    ldr             INPUT_BUF0, [INPUT_BUF]
1867    ldr             INPUT_BUF1, [INPUT_BUF, #8]
1868    ldr             INPUT_BUF2, [INPUT_BUF, #16]
1869    .unreq          INPUT_BUF
1870
1871    /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
1872    movi            v10.16b, #255
1873    movi            v13.16b, #255
1874
1875    /* Outer loop over scanlines */
1876    cmp             NUM_ROWS, #1
1877    b.lt            9f
18780:
1879    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
1880    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
1881    mov             N, OUTPUT_WIDTH
1882    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
1883    add             INPUT_ROW, INPUT_ROW, #1
1884    ldr             RGB, [OUTPUT_BUF], #8
1885
1886    /* Inner loop over pixels */
1887    subs            N, N, #8
1888    b.lt            3f
1889    do_load         8
1890    do_yuv_to_rgb_stage1
1891    subs            N, N, #8
1892    b.lt            2f
18931:
1894    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
1895    subs            N, N, #8
1896    b.ge            1b
18972:
1898    do_yuv_to_rgb_stage2
1899    do_store        \bpp, 8, \fast_st3
1900    tst             N, #7
1901    b.eq            8f
19023:
1903    tst             N, #4
1904    b.eq            3f
1905    do_load         4
19063:
1907    tst             N, #2
1908    b.eq            4f
1909    do_load         2
19104:
1911    tst             N, #1
1912    b.eq            5f
1913    do_load         1
19145:
1915    do_yuv_to_rgb
1916    tst             N, #4
1917    b.eq            6f
1918    do_store        \bpp, 4, \fast_st3
19196:
1920    tst             N, #2
1921    b.eq            7f
1922    do_store        \bpp, 2, \fast_st3
19237:
1924    tst             N, #1
1925    b.eq            8f
1926    do_store        \bpp, 1, \fast_st3
19278:
1928    subs            NUM_ROWS, NUM_ROWS, #1
1929    b.gt            0b
19309:
1931    /* Restore all registers and return */
1932    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
1933    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
1934    br              x30
1935    .unreq          OUTPUT_WIDTH
1936    .unreq          INPUT_ROW
1937    .unreq          OUTPUT_BUF
1938    .unreq          NUM_ROWS
1939    .unreq          INPUT_BUF0
1940    .unreq          INPUT_BUF1
1941    .unreq          INPUT_BUF2
1942    .unreq          RGB
1943    .unreq          Y
1944    .unreq          U
1945    .unreq          V
1946    .unreq          N
1947
1948.purgem do_yuv_to_rgb
1949.purgem do_yuv_to_rgb_stage1
1950.purgem do_yuv_to_rgb_stage2
1951.purgem do_yuv_to_rgb_stage2_store_load_stage1
1952
1953.endm
1954
1955/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
1956generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
1957generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
1958generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
1959generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
1960generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
1961generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
1962generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1
1963
1964generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
1965generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0
1966
1967.purgem do_load
1968.purgem do_store
1969
1970
1971/*****************************************************************************/
1972
1973/*
1974 * jsimd_extrgb_ycc_convert_neon
1975 * jsimd_extbgr_ycc_convert_neon
1976 * jsimd_extrgbx_ycc_convert_neon
1977 * jsimd_extbgrx_ycc_convert_neon
1978 * jsimd_extxbgr_ycc_convert_neon
1979 * jsimd_extxrgb_ycc_convert_neon
1980 *
1981 * Colorspace conversion RGB -> YCbCr
1982 */
1983
1984.macro do_store size
1985  .if \size == 8
1986    st1             {v20.8b}, [Y], #8
1987    st1             {v21.8b}, [U], #8
1988    st1             {v22.8b}, [V], #8
1989  .elseif \size == 4
1990    st1             {v20.b}[0], [Y], #1
1991    st1             {v20.b}[1], [Y], #1
1992    st1             {v20.b}[2], [Y], #1
1993    st1             {v20.b}[3], [Y], #1
1994    st1             {v21.b}[0], [U], #1
1995    st1             {v21.b}[1], [U], #1
1996    st1             {v21.b}[2], [U], #1
1997    st1             {v21.b}[3], [U], #1
1998    st1             {v22.b}[0], [V], #1
1999    st1             {v22.b}[1], [V], #1
2000    st1             {v22.b}[2], [V], #1
2001    st1             {v22.b}[3], [V], #1
2002  .elseif \size == 2
2003    st1             {v20.b}[4], [Y], #1
2004    st1             {v20.b}[5], [Y], #1
2005    st1             {v21.b}[4], [U], #1
2006    st1             {v21.b}[5], [U], #1
2007    st1             {v22.b}[4], [V], #1
2008    st1             {v22.b}[5], [V], #1
2009  .elseif \size == 1
2010    st1             {v20.b}[6], [Y], #1
2011    st1             {v21.b}[6], [U], #1
2012    st1             {v22.b}[6], [V], #1
2013  .else
2014    .error unsupported macroblock size
2015  .endif
2016.endm
2017
2018.macro do_load bpp, size, fast_ld3
2019  .if \bpp == 24
2020    .if \size == 8
2021      .if \fast_ld3 == 1
2022        ld3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
2023      .else
2024        ld1         {v10.b}[0], [RGB], #1
2025        ld1         {v11.b}[0], [RGB], #1
2026        ld1         {v12.b}[0], [RGB], #1
2027
2028        ld1         {v10.b}[1], [RGB], #1
2029        ld1         {v11.b}[1], [RGB], #1
2030        ld1         {v12.b}[1], [RGB], #1
2031
2032        ld1         {v10.b}[2], [RGB], #1
2033        ld1         {v11.b}[2], [RGB], #1
2034        ld1         {v12.b}[2], [RGB], #1
2035
2036        ld1         {v10.b}[3], [RGB], #1
2037        ld1         {v11.b}[3], [RGB], #1
2038        ld1         {v12.b}[3], [RGB], #1
2039
2040        ld1         {v10.b}[4], [RGB], #1
2041        ld1         {v11.b}[4], [RGB], #1
2042        ld1         {v12.b}[4], [RGB], #1
2043
2044        ld1         {v10.b}[5], [RGB], #1
2045        ld1         {v11.b}[5], [RGB], #1
2046        ld1         {v12.b}[5], [RGB], #1
2047
2048        ld1         {v10.b}[6], [RGB], #1
2049        ld1         {v11.b}[6], [RGB], #1
2050        ld1         {v12.b}[6], [RGB], #1
2051
2052        ld1         {v10.b}[7], [RGB], #1
2053        ld1         {v11.b}[7], [RGB], #1
2054        ld1         {v12.b}[7], [RGB], #1
2055      .endif
2056      prfm          pldl1keep, [RGB, #128]
2057    .elseif \size == 4
2058      ld3           {v10.b, v11.b, v12.b}[0], [RGB], #3
2059      ld3           {v10.b, v11.b, v12.b}[1], [RGB], #3
2060      ld3           {v10.b, v11.b, v12.b}[2], [RGB], #3
2061      ld3           {v10.b, v11.b, v12.b}[3], [RGB], #3
2062    .elseif \size == 2
2063      ld3           {v10.b, v11.b, v12.b}[4], [RGB], #3
2064      ld3           {v10.b, v11.b, v12.b}[5], [RGB], #3
2065    .elseif \size == 1
2066      ld3           {v10.b, v11.b, v12.b}[6], [RGB], #3
2067    .else
2068      .error unsupported macroblock size
2069    .endif
2070  .elseif \bpp == 32
2071    .if \size == 8
2072      ld4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
2073      prfm          pldl1keep, [RGB, #128]
2074    .elseif \size == 4
2075      ld4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
2076      ld4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
2077      ld4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
2078      ld4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
2079    .elseif \size == 2
2080      ld4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
2081      ld4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
2082    .elseif \size == 1
2083      ld4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
2084    .else
2085      .error unsupported macroblock size
2086    .endif
2087  .else
2088    .error unsupported bpp
2089  .endif
2090.endm
2091
2092.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
2093                                           b_offs, fast_ld3
2094
2095/*
2096 * 2-stage pipelined RGB->YCbCr conversion
2097 */
2098
2099.macro do_rgb_to_yuv_stage1
2100    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
2101    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
2102    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
2103    rev64           v18.4s, v1.4s
2104    rev64           v26.4s, v1.4s
2105    rev64           v28.4s, v1.4s
2106    rev64           v30.4s, v1.4s
2107    umull           v14.4s, v4.4h, v0.h[0]
2108    umull2          v16.4s, v4.8h, v0.h[0]
2109    umlsl           v18.4s, v4.4h, v0.h[3]
2110    umlsl2          v26.4s, v4.8h, v0.h[3]
2111    umlal           v28.4s, v4.4h, v0.h[5]
2112    umlal2          v30.4s, v4.8h, v0.h[5]
2113    umlal           v14.4s, v6.4h, v0.h[1]
2114    umlal2          v16.4s, v6.8h, v0.h[1]
2115    umlsl           v18.4s, v6.4h, v0.h[4]
2116    umlsl2          v26.4s, v6.8h, v0.h[4]
2117    umlsl           v28.4s, v6.4h, v0.h[6]
2118    umlsl2          v30.4s, v6.8h, v0.h[6]
2119    umlal           v14.4s, v8.4h, v0.h[2]
2120    umlal2          v16.4s, v8.8h, v0.h[2]
2121    umlal           v18.4s, v8.4h, v0.h[5]
2122    umlal2          v26.4s, v8.8h, v0.h[5]
2123    umlsl           v28.4s, v8.4h, v0.h[7]
2124    umlsl2          v30.4s, v8.8h, v0.h[7]
2125.endm
2126
2127.macro do_rgb_to_yuv_stage2
2128    rshrn           v20.4h, v14.4s, #16
2129    shrn            v22.4h, v18.4s, #16
2130    shrn            v24.4h, v28.4s, #16
2131    rshrn2          v20.8h, v16.4s, #16
2132    shrn2           v22.8h, v26.4s, #16
2133    shrn2           v24.8h, v30.4s, #16
2134    xtn             v20.8b, v20.8h       /* v20 = y */
2135    xtn             v21.8b, v22.8h       /* v21 = u */
2136    xtn             v22.8b, v24.8h       /* v22 = v */
2137.endm
2138
2139.macro do_rgb_to_yuv
2140    do_rgb_to_yuv_stage1
2141    do_rgb_to_yuv_stage2
2142.endm
2143
2144/* TODO: expand macros and interleave instructions if some in-order
2145 *       AArch64 processor actually can dual-issue LOAD/STORE with ALU */
2146.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
2147    do_rgb_to_yuv_stage2
2148    do_load         \bpp, 8, \fast_ld3
2149    st1             {v20.8b}, [Y], #8
2150    st1             {v21.8b}, [U], #8
2151    st1             {v22.8b}, [V], #8
2152    do_rgb_to_yuv_stage1
2153.endm
2154
2155.if \fast_ld3 == 1
2156asm_function jsimd_\colorid\()_ycc_convert_neon
2157.else
2158asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
2159.endif
2160    OUTPUT_WIDTH    .req w0
2161    INPUT_BUF       .req x1
2162    OUTPUT_BUF      .req x2
2163    OUTPUT_ROW      .req w3
2164    NUM_ROWS        .req w4
2165
2166    OUTPUT_BUF0     .req x5
2167    OUTPUT_BUF1     .req x6
2168    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */
2169
2170    RGB             .req x7
2171    Y               .req x9
2172    U               .req x10
2173    V               .req x11
2174    N               .req w12
2175
2176    /* Load constants to d0, d1, d2, d3 */
2177    get_symbol_loc  x13, Ljsimd_rgb_ycc_neon_consts
2178    ld1             {v0.8h, v1.8h}, [x13]
2179
2180    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
2181    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
2182    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
2183    .unreq          OUTPUT_BUF
2184
2185    /* Save Neon registers */
2186    sub             sp, sp, #64
2187    mov             x9, sp
2188    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
2189    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
2190
2191    /* Outer loop over scanlines */
2192    cmp             NUM_ROWS, #1
2193    b.lt            9f
21940:
2195    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
2196    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
2197    mov             N, OUTPUT_WIDTH
2198    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
2199    add             OUTPUT_ROW, OUTPUT_ROW, #1
2200    ldr             RGB, [INPUT_BUF], #8
2201
2202    /* Inner loop over pixels */
2203    subs            N, N, #8
2204    b.lt            3f
2205    do_load         \bpp, 8, \fast_ld3
2206    do_rgb_to_yuv_stage1
2207    subs            N, N, #8
2208    b.lt            2f
22091:
2210    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
2211    subs            N, N, #8
2212    b.ge            1b
22132:
2214    do_rgb_to_yuv_stage2
2215    do_store        8
2216    tst             N, #7
2217    b.eq            8f
22183:
2219    tbz             N, #2, 3f
2220    do_load         \bpp, 4, \fast_ld3
22213:
2222    tbz             N, #1, 4f
2223    do_load         \bpp, 2, \fast_ld3
22244:
2225    tbz             N, #0, 5f
2226    do_load         \bpp, 1, \fast_ld3
22275:
2228    do_rgb_to_yuv
2229    tbz             N, #2, 6f
2230    do_store        4
22316:
2232    tbz             N, #1, 7f
2233    do_store        2
22347:
2235    tbz             N, #0, 8f
2236    do_store        1
22378:
2238    subs            NUM_ROWS, NUM_ROWS, #1
2239    b.gt            0b
22409:
2241    /* Restore all registers and return */
2242    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
2243    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
2244    br              x30
2245
2246    .unreq          OUTPUT_WIDTH
2247    .unreq          OUTPUT_ROW
2248    .unreq          INPUT_BUF
2249    .unreq          NUM_ROWS
2250    .unreq          OUTPUT_BUF0
2251    .unreq          OUTPUT_BUF1
2252    .unreq          OUTPUT_BUF2
2253    .unreq          RGB
2254    .unreq          Y
2255    .unreq          U
2256    .unreq          V
2257    .unreq          N
2258
2259.purgem do_rgb_to_yuv
2260.purgem do_rgb_to_yuv_stage1
2261.purgem do_rgb_to_yuv_stage2
2262.purgem do_rgb_to_yuv_stage2_store_load_stage1
2263
2264.endm
2265
2266/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
2267generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
2268generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
2269generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
2270generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
2271generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
2272generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1
2273
2274generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
2275generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0
2276
2277.purgem do_load
2278.purgem do_store
2279
2280
2281/*****************************************************************************/
2282
2283/*
2284 * Load data into workspace, applying unsigned->signed conversion
2285 *
2286 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
2287 *       rid of VST1.16 instructions
2288 */
2289
2290asm_function jsimd_convsamp_neon
2291    SAMPLE_DATA     .req x0
2292    START_COL       .req x1
2293    WORKSPACE       .req x2
2294    TMP1            .req x9
2295    TMP2            .req x10
2296    TMP3            .req x11
2297    TMP4            .req x12
2298    TMP5            .req x13
2299    TMP6            .req x14
2300    TMP7            .req x15
2301    TMP8            .req x4
2302    TMPDUP          .req w3
2303
2304    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
2305       guarantee that the upper (unused) 32 bits of x1 are valid.  This
2306       instruction ensures that those bits are set to zero. */
2307    uxtw x1, w1
2308
2309    mov             TMPDUP, #128
2310    ldp             TMP1, TMP2, [SAMPLE_DATA], 16
2311    ldp             TMP3, TMP4, [SAMPLE_DATA], 16
2312    dup             v0.8b, TMPDUP
2313    add             TMP1, TMP1, START_COL
2314    add             TMP2, TMP2, START_COL
2315    ldp             TMP5, TMP6, [SAMPLE_DATA], 16
2316    add             TMP3, TMP3, START_COL
2317    add             TMP4, TMP4, START_COL
2318    ldp             TMP7, TMP8, [SAMPLE_DATA], 16
2319    add             TMP5, TMP5, START_COL
2320    add             TMP6, TMP6, START_COL
2321    ld1             {v16.8b}, [TMP1]
2322    add             TMP7, TMP7, START_COL
2323    add             TMP8, TMP8, START_COL
2324    ld1             {v17.8b}, [TMP2]
2325    usubl           v16.8h, v16.8b, v0.8b
2326    ld1             {v18.8b}, [TMP3]
2327    usubl           v17.8h, v17.8b, v0.8b
2328    ld1             {v19.8b}, [TMP4]
2329    usubl           v18.8h, v18.8b, v0.8b
2330    ld1             {v20.8b}, [TMP5]
2331    usubl           v19.8h, v19.8b, v0.8b
2332    ld1             {v21.8b}, [TMP6]
2333    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
2334    usubl           v20.8h, v20.8b, v0.8b
2335    ld1             {v22.8b}, [TMP7]
2336    usubl           v21.8h, v21.8b, v0.8b
2337    ld1             {v23.8b}, [TMP8]
2338    usubl           v22.8h, v22.8b, v0.8b
2339    usubl           v23.8h, v23.8b, v0.8b
2340    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
2341
2342    br              x30
2343
2344    .unreq          SAMPLE_DATA
2345    .unreq          START_COL
2346    .unreq          WORKSPACE
2347    .unreq          TMP1
2348    .unreq          TMP2
2349    .unreq          TMP3
2350    .unreq          TMP4
2351    .unreq          TMP5
2352    .unreq          TMP6
2353    .unreq          TMP7
2354    .unreq          TMP8
2355    .unreq          TMPDUP
2356
2357/*****************************************************************************/
2358
2359/*
2360 * jsimd_fdct_islow_neon
2361 *
2362 * This file contains a slower but more accurate integer implementation of the
2363 * forward DCT (Discrete Cosine Transform). The following code is based
2364 * directly on the IJG''s original jfdctint.c; see the jfdctint.c for
2365 * more details.
2366 *
2367 * TODO: can be combined with 'jsimd_convsamp_neon' to get
2368 *       rid of a bunch of VLD1.16 instructions
2369 */
2370
2371#define CONST_BITS  13
2372#define PASS1_BITS  2
2373
2374#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
2375#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
2376
2377#define XFIX_P_0_298  v0.h[0]
2378#define XFIX_N_0_390  v0.h[1]
2379#define XFIX_P_0_541  v0.h[2]
2380#define XFIX_P_0_765  v0.h[3]
2381#define XFIX_N_0_899  v0.h[4]
2382#define XFIX_P_1_175  v0.h[5]
2383#define XFIX_P_1_501  v0.h[6]
2384#define XFIX_N_1_847  v0.h[7]
2385#define XFIX_N_1_961  v1.h[0]
2386#define XFIX_P_2_053  v1.h[1]
2387#define XFIX_N_2_562  v1.h[2]
2388#define XFIX_P_3_072  v1.h[3]
2389
2390asm_function jsimd_fdct_islow_neon
2391
2392    DATA            .req x0
2393    TMP             .req x9
2394
2395    /* Load constants */
2396    get_symbol_loc  TMP, Ljsimd_fdct_islow_neon_consts
2397    ld1             {v0.8h, v1.8h}, [TMP]
2398
2399    /* Save Neon registers */
2400    sub             sp, sp, #64
2401    mov             x10, sp
2402    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
2403    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
2404
2405    /* Load all DATA into Neon registers with the following allocation:
2406     *       0 1 2 3 | 4 5 6 7
2407     *      ---------+--------
2408     *   0 | d16     | d17    | v16.8h
2409     *   1 | d18     | d19    | v17.8h
2410     *   2 | d20     | d21    | v18.8h
2411     *   3 | d22     | d23    | v19.8h
2412     *   4 | d24     | d25    | v20.8h
2413     *   5 | d26     | d27    | v21.8h
2414     *   6 | d28     | d29    | v22.8h
2415     *   7 | d30     | d31    | v23.8h
2416     */
2417
2418    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2419    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2420    sub             DATA, DATA, #64
2421
2422    /* Transpose */
2423    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
2424    /* 1-D FDCT */
2425    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
2426    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
2427    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
2428    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
2429    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
2430    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
2431    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
2432    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
2433
2434    /* even part */
2435
2436    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
2437    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
2438    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
2439    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
2440
2441    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
2442    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
2443
2444    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
2445
2446    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
2447    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
2448
2449    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2450    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2451    mov             v22.16b, v18.16b
2452    mov             v25.16b, v24.16b
2453
2454    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2455    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2456    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2457    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2458
2459    rshrn           v18.4h, v18.4s, #DESCALE_P1
2460    rshrn           v22.4h, v22.4s, #DESCALE_P1
2461    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
2462    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
2463
2464    /* Odd part */
2465
2466    add             v8.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
2467    add             v9.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
2468    add             v10.8h, v28.8h, v30.8h       /* z3 = tmp4 + tmp6; */
2469    add             v11.8h, v29.8h, v31.8h       /* z4 = tmp5 + tmp7; */
2470    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
2471    smull2          v5.4s, v10.8h, XFIX_P_1_175
2472    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
2473    smlal2          v5.4s, v11.8h, XFIX_P_1_175
2474
2475    smull2          v24.4s, v28.8h, XFIX_P_0_298
2476    smull2          v25.4s, v29.8h, XFIX_P_2_053
2477    smull2          v26.4s, v30.8h, XFIX_P_3_072
2478    smull2          v27.4s, v31.8h, XFIX_P_1_501
2479    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
2480    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
2481    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
2482    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
2483
2484    smull2          v12.4s, v8.8h, XFIX_N_0_899
2485    smull2          v13.4s, v9.8h, XFIX_N_2_562
2486    smull2          v14.4s, v10.8h, XFIX_N_1_961
2487    smull2          v15.4s, v11.8h, XFIX_N_0_390
2488    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
2489    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
2490    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
2491    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
2492
2493    add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
2494    add             v14.4s, v14.4s, v5.4s
2495    add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
2496    add             v15.4s, v15.4s, v5.4s
2497
2498    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
2499    add             v24.4s, v24.4s, v12.4s
2500    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
2501    add             v25.4s, v25.4s, v13.4s
2502    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
2503    add             v26.4s, v26.4s, v14.4s
2504    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
2505    add             v27.4s, v27.4s, v15.4s
2506
2507    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
2508    add             v24.4s, v24.4s, v14.4s
2509    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
2510    add             v25.4s, v25.4s, v15.4s
2511    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
2512    add             v26.4s, v26.4s, v13.4s
2513    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
2514    add             v27.4s, v27.4s, v12.4s
2515
2516    rshrn           v23.4h, v28.4s, #DESCALE_P1
2517    rshrn           v21.4h, v29.4s, #DESCALE_P1
2518    rshrn           v19.4h, v30.4s, #DESCALE_P1
2519    rshrn           v17.4h, v31.4s, #DESCALE_P1
2520    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
2521    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
2522    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
2523    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
2524
2525    /* Transpose */
2526    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
2527
2528    /* 1-D FDCT */
2529    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
2530    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
2531    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
2532    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
2533    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
2534    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
2535    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
2536    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
2537
2538    /* even part */
2539    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
2540    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
2541    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
2542    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
2543
2544    add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
2545    sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
2546
2547    add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
2548
2549    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
2550    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
2551
2552    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2553    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
2554    mov             v22.16b, v18.16b
2555    mov             v25.16b, v24.16b
2556
2557    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2558    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
2559    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2560    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
2561
2562    rshrn           v18.4h, v18.4s, #DESCALE_P2
2563    rshrn           v22.4h, v22.4s, #DESCALE_P2
2564    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
2565    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
2566
2567    /* Odd part */
2568    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
2569    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
2570    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
2571    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
2572
2573    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
2574    smull2          v5.4s, v10.8h, XFIX_P_1_175
2575    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
2576    smlal2          v5.4s, v11.8h, XFIX_P_1_175
2577
2578    smull2          v24.4s, v28.8h, XFIX_P_0_298
2579    smull2          v25.4s, v29.8h, XFIX_P_2_053
2580    smull2          v26.4s, v30.8h, XFIX_P_3_072
2581    smull2          v27.4s, v31.8h, XFIX_P_1_501
2582    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
2583    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
2584    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
2585    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
2586
2587    smull2          v12.4s, v8.8h, XFIX_N_0_899
2588    smull2          v13.4s, v9.8h, XFIX_N_2_562
2589    smull2          v14.4s, v10.8h, XFIX_N_1_961
2590    smull2          v15.4s, v11.8h, XFIX_N_0_390
2591    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
2592    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
2593    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
2594    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
2595
2596    add             v10.4s, v10.4s, v4.4s
2597    add             v14.4s, v14.4s, v5.4s
2598    add             v11.4s, v11.4s, v4.4s
2599    add             v15.4s, v15.4s, v5.4s
2600
2601    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
2602    add             v24.4s, v24.4s, v12.4s
2603    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
2604    add             v25.4s, v25.4s, v13.4s
2605    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
2606    add             v26.4s, v26.4s, v14.4s
2607    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
2608    add             v27.4s, v27.4s, v15.4s
2609
2610    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
2611    add             v24.4s, v24.4s, v14.4s
2612    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
2613    add             v25.4s, v25.4s, v15.4s
2614    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
2615    add             v26.4s, v26.4s, v13.4s
2616    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
2617    add             v27.4s, v27.4s, v12.4s
2618
2619    rshrn           v23.4h, v28.4s, #DESCALE_P2
2620    rshrn           v21.4h, v29.4s, #DESCALE_P2
2621    rshrn           v19.4h, v30.4s, #DESCALE_P2
2622    rshrn           v17.4h, v31.4s, #DESCALE_P2
2623    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
2624    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
2625    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
2626    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
2627
2628    /* store results */
2629    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2630    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2631
2632    /* Restore Neon registers */
2633    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
2634    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
2635
2636    br              x30
2637
2638    .unreq          DATA
2639    .unreq          TMP
2640
2641#undef XFIX_P_0_298
2642#undef XFIX_N_0_390
2643#undef XFIX_P_0_541
2644#undef XFIX_P_0_765
2645#undef XFIX_N_0_899
2646#undef XFIX_P_1_175
2647#undef XFIX_P_1_501
2648#undef XFIX_N_1_847
2649#undef XFIX_N_1_961
2650#undef XFIX_P_2_053
2651#undef XFIX_N_2_562
2652#undef XFIX_P_3_072
2653
2654
2655/*****************************************************************************/
2656
2657/*
2658 * jsimd_fdct_ifast_neon
2659 *
2660 * This function contains a fast, not so accurate integer implementation of
2661 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
2662 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
2663 * function from jfdctfst.c
2664 *
2665 * TODO: can be combined with 'jsimd_convsamp_neon' to get
2666 *       rid of a bunch of VLD1.16 instructions
2667 */
2668
2669#undef XFIX_0_541196100
2670#define XFIX_0_382683433  v0.h[0]
2671#define XFIX_0_541196100  v0.h[1]
2672#define XFIX_0_707106781  v0.h[2]
2673#define XFIX_1_306562965  v0.h[3]
2674
2675asm_function jsimd_fdct_ifast_neon
2676
2677    DATA            .req x0
2678    TMP             .req x9
2679
2680    /* Load constants */
2681    get_symbol_loc  TMP, Ljsimd_fdct_ifast_neon_consts
2682    ld1             {v0.4h}, [TMP]
2683
2684    /* Load all DATA into Neon registers with the following allocation:
2685     *       0 1 2 3 | 4 5 6 7
2686     *      ---------+--------
2687     *   0 | d16     | d17    | v0.8h
2688     *   1 | d18     | d19    | q9
2689     *   2 | d20     | d21    | q10
2690     *   3 | d22     | d23    | q11
2691     *   4 | d24     | d25    | q12
2692     *   5 | d26     | d27    | q13
2693     *   6 | d28     | d29    | q14
2694     *   7 | d30     | d31    | q15
2695     */
2696
2697    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2698    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2699    mov             TMP, #2
2700    sub             DATA, DATA, #64
27011:
2702    /* Transpose */
2703    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
2704    subs            TMP, TMP, #1
2705    /* 1-D FDCT */
2706    add             v4.8h, v19.8h, v20.8h
2707    sub             v20.8h, v19.8h, v20.8h
2708    sub             v28.8h, v18.8h, v21.8h
2709    add             v18.8h, v18.8h, v21.8h
2710    sub             v29.8h, v17.8h, v22.8h
2711    add             v17.8h, v17.8h, v22.8h
2712    sub             v21.8h, v16.8h, v23.8h
2713    add             v16.8h, v16.8h, v23.8h
2714    sub             v6.8h, v17.8h, v18.8h
2715    sub             v7.8h, v16.8h, v4.8h
2716    add             v5.8h, v17.8h, v18.8h
2717    add             v6.8h, v6.8h, v7.8h
2718    add             v4.8h, v16.8h, v4.8h
2719    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781
2720    add             v19.8h, v20.8h, v28.8h
2721    add             v16.8h, v4.8h, v5.8h
2722    sub             v20.8h, v4.8h, v5.8h
2723    add             v5.8h, v28.8h, v29.8h
2724    add             v29.8h, v29.8h, v21.8h
2725    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781
2726    sub             v28.8h, v19.8h, v29.8h
2727    add             v18.8h, v7.8h, v6.8h
2728    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433
2729    sub             v22.8h, v7.8h, v6.8h
2730    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100
2731    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965
2732    add             v6.8h, v21.8h, v5.8h
2733    sub             v5.8h, v21.8h, v5.8h
2734    add             v29.8h, v29.8h, v28.8h
2735    add             v19.8h, v19.8h, v28.8h
2736    add             v29.8h, v29.8h, v7.8h
2737    add             v21.8h, v5.8h, v19.8h
2738    sub             v19.8h, v5.8h, v19.8h
2739    add             v17.8h, v6.8h, v29.8h
2740    sub             v23.8h, v6.8h, v29.8h
2741
2742    b.ne            1b
2743
2744    /* store results */
2745    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
2746    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
2747
2748    br              x30
2749
2750    .unreq          DATA
2751    .unreq          TMP
2752#undef XFIX_0_382683433
2753#undef XFIX_0_541196100
2754#undef XFIX_0_707106781
2755#undef XFIX_1_306562965
2756
2757
2758/*****************************************************************************/
2759
2760/*
2761 * GLOBAL(void)
2762 * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
2763 *                     DCTELEM *workspace);
2764 *
2765 */
2766asm_function jsimd_quantize_neon
2767
2768    COEF_BLOCK      .req x0
2769    DIVISORS        .req x1
2770    WORKSPACE       .req x2
2771
2772    RECIPROCAL      .req DIVISORS
2773    CORRECTION      .req x9
2774    SHIFT           .req x10
2775    LOOP_COUNT      .req x11
2776
2777    mov             LOOP_COUNT, #2
2778    add             CORRECTION, DIVISORS, #(64 * 2)
2779    add             SHIFT, DIVISORS, #(64 * 6)
27801:
2781    subs            LOOP_COUNT, LOOP_COUNT, #1
2782    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
2783    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
2784    abs             v20.8h, v0.8h
2785    abs             v21.8h, v1.8h
2786    abs             v22.8h, v2.8h
2787    abs             v23.8h, v3.8h
2788    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
2789    add             v20.8h, v20.8h, v4.8h  /* add correction */
2790    add             v21.8h, v21.8h, v5.8h
2791    add             v22.8h, v22.8h, v6.8h
2792    add             v23.8h, v23.8h, v7.8h
2793    umull           v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
2794    umull2          v16.4s, v20.8h, v28.8h
2795    umull           v5.4s, v21.4h, v29.4h
2796    umull2          v17.4s, v21.8h, v29.8h
2797    umull           v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
2798    umull2          v18.4s, v22.8h, v30.8h
2799    umull           v7.4s, v23.4h, v31.4h
2800    umull2          v19.4s, v23.8h, v31.8h
2801    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
2802    shrn            v4.4h, v4.4s, #16
2803    shrn            v5.4h, v5.4s, #16
2804    shrn            v6.4h, v6.4s, #16
2805    shrn            v7.4h, v7.4s, #16
2806    shrn2           v4.8h, v16.4s, #16
2807    shrn2           v5.8h, v17.4s, #16
2808    shrn2           v6.8h, v18.4s, #16
2809    shrn2           v7.8h, v19.4s, #16
2810    neg             v24.8h, v24.8h
2811    neg             v25.8h, v25.8h
2812    neg             v26.8h, v26.8h
2813    neg             v27.8h, v27.8h
2814    sshr            v0.8h, v0.8h, #15  /* extract sign */
2815    sshr            v1.8h, v1.8h, #15
2816    sshr            v2.8h, v2.8h, #15
2817    sshr            v3.8h, v3.8h, #15
2818    ushl            v4.8h, v4.8h, v24.8h  /* shift */
2819    ushl            v5.8h, v5.8h, v25.8h
2820    ushl            v6.8h, v6.8h, v26.8h
2821    ushl            v7.8h, v7.8h, v27.8h
2822
2823    eor             v4.16b, v4.16b, v0.16b  /* restore sign */
2824    eor             v5.16b, v5.16b, v1.16b
2825    eor             v6.16b, v6.16b, v2.16b
2826    eor             v7.16b, v7.16b, v3.16b
2827    sub             v4.8h, v4.8h, v0.8h
2828    sub             v5.8h, v5.8h, v1.8h
2829    sub             v6.8h, v6.8h, v2.8h
2830    sub             v7.8h, v7.8h, v3.8h
2831    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
2832
2833    b.ne            1b
2834
2835    br              x30  /* return */
2836
2837    .unreq          COEF_BLOCK
2838    .unreq          DIVISORS
2839    .unreq          WORKSPACE
2840    .unreq          RECIPROCAL
2841    .unreq          CORRECTION
2842    .unreq          SHIFT
2843    .unreq          LOOP_COUNT
2844
2845
2846/*****************************************************************************/
2847
2848/*
2849 * Downsample pixel values of a single component.
2850 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
2851 * without smoothing.
2852 *
2853 * GLOBAL(void)
2854 * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
2855 *                            JDIMENSION v_samp_factor,
2856 *                            JDIMENSION width_in_blocks,
2857 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
2858 */
2859
2860asm_function jsimd_h2v1_downsample_neon
2861    IMAGE_WIDTH     .req x0
2862    MAX_V_SAMP      .req x1
2863    V_SAMP          .req x2
2864    BLOCK_WIDTH     .req x3
2865    INPUT_DATA      .req x4
2866    OUTPUT_DATA     .req x5
2867    OUTPTR          .req x9
2868    INPTR           .req x10
2869    TMP1            .req x11
2870    TMP2            .req x12
2871    TMP3            .req x13
2872    TMPDUP          .req w15
2873
2874    mov             TMPDUP, #0x10000
2875    lsl             TMP2, BLOCK_WIDTH, #4
2876    sub             TMP2, TMP2, IMAGE_WIDTH
2877    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
2878    add             TMP3, TMP3, TMP2, lsl #4
2879    dup             v16.4s, TMPDUP
2880    ld1             {v18.16b}, [TMP3]
2881
28821:  /* row loop */
2883    ldr             INPTR, [INPUT_DATA], #8
2884    ldr             OUTPTR, [OUTPUT_DATA], #8
2885    subs            TMP1, BLOCK_WIDTH, #1
2886    b.eq            3f
28872:  /* columns */
2888    ld1             {v0.16b}, [INPTR], #16
2889    mov             v4.16b, v16.16b
2890    subs            TMP1, TMP1, #1
2891    uadalp          v4.8h, v0.16b
2892    shrn            v6.8b, v4.8h, #1
2893    st1             {v6.8b}, [OUTPTR], #8
2894    b.ne            2b
28953:  /* last columns */
2896    ld1             {v0.16b}, [INPTR]
2897    mov             v4.16b, v16.16b
2898    subs            V_SAMP, V_SAMP, #1
2899    /* expand right */
2900    tbl             v2.16b, {v0.16b}, v18.16b
2901    uadalp          v4.8h, v2.16b
2902    shrn            v6.8b, v4.8h, #1
2903    st1             {v6.8b}, [OUTPTR], #8
2904    b.ne            1b
2905
2906    br              x30
2907
2908    .unreq          IMAGE_WIDTH
2909    .unreq          MAX_V_SAMP
2910    .unreq          V_SAMP
2911    .unreq          BLOCK_WIDTH
2912    .unreq          INPUT_DATA
2913    .unreq          OUTPUT_DATA
2914    .unreq          OUTPTR
2915    .unreq          INPTR
2916    .unreq          TMP1
2917    .unreq          TMP2
2918    .unreq          TMP3
2919    .unreq          TMPDUP
2920
2921
2922/*****************************************************************************/
2923
2924/*
2925 * Downsample pixel values of a single component.
2926 * This version handles the common case of 2:1 horizontal and 2:1 vertical,
2927 * without smoothing.
2928 *
2929 * GLOBAL(void)
2930 * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
2931 *                            JDIMENSION v_samp_factor,
2932 *                            JDIMENSION width_in_blocks,
2933 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
2934 */
2935
2936.balign 16
2937asm_function jsimd_h2v2_downsample_neon
2938    IMAGE_WIDTH     .req x0
2939    MAX_V_SAMP      .req x1
2940    V_SAMP          .req x2
2941    BLOCK_WIDTH     .req x3
2942    INPUT_DATA      .req x4
2943    OUTPUT_DATA     .req x5
2944    OUTPTR          .req x9
2945    INPTR0          .req x10
2946    INPTR1          .req x14
2947    TMP1            .req x11
2948    TMP2            .req x12
2949    TMP3            .req x13
2950    TMPDUP          .req w15
2951
2952    mov             TMPDUP, #1
2953    lsl             TMP2, BLOCK_WIDTH, #4
2954    lsl             TMPDUP, TMPDUP, #17
2955    sub             TMP2, TMP2, IMAGE_WIDTH
2956    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
2957    orr             TMPDUP, TMPDUP, #1
2958    add             TMP3, TMP3, TMP2, lsl #4
2959    dup             v16.4s, TMPDUP
2960    ld1             {v18.16b}, [TMP3]
2961
29621:  /* row loop */
2963    ldr             INPTR0, [INPUT_DATA], #8
2964    ldr             OUTPTR, [OUTPUT_DATA], #8
2965    ldr             INPTR1, [INPUT_DATA], #8
2966    subs            TMP1, BLOCK_WIDTH, #1
2967    b.eq            3f
29682:  /* columns */
2969    ld1             {v0.16b}, [INPTR0], #16
2970    ld1             {v1.16b}, [INPTR1], #16
2971    mov             v4.16b, v16.16b
2972    subs            TMP1, TMP1, #1
2973    uadalp          v4.8h, v0.16b
2974    uadalp          v4.8h, v1.16b
2975    shrn            v6.8b, v4.8h, #2
2976    st1             {v6.8b}, [OUTPTR], #8
2977    b.ne            2b
29783:  /* last columns */
2979    ld1             {v0.16b}, [INPTR0], #16
2980    ld1             {v1.16b}, [INPTR1], #16
2981    mov             v4.16b, v16.16b
2982    subs            V_SAMP, V_SAMP, #1
2983    /* expand right */
2984    tbl             v2.16b, {v0.16b}, v18.16b
2985    tbl             v3.16b, {v1.16b}, v18.16b
2986    uadalp          v4.8h, v2.16b
2987    uadalp          v4.8h, v3.16b
2988    shrn            v6.8b, v4.8h, #2
2989    st1             {v6.8b}, [OUTPTR], #8
2990    b.ne            1b
2991
2992    br              x30
2993
2994    .unreq          IMAGE_WIDTH
2995    .unreq          MAX_V_SAMP
2996    .unreq          V_SAMP
2997    .unreq          BLOCK_WIDTH
2998    .unreq          INPUT_DATA
2999    .unreq          OUTPUT_DATA
3000    .unreq          OUTPTR
3001    .unreq          INPTR0
3002    .unreq          INPTR1
3003    .unreq          TMP1
3004    .unreq          TMP2
3005    .unreq          TMP3
3006    .unreq          TMPDUP
3007
3008
3009/*****************************************************************************/
3010
3011/*
3012 * GLOBAL(JOCTET *)
3013 * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
3014 *                             JCOEFPTR block, int last_dc_val,
3015 *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
3016 *
3017 */
3018
3019    BUFFER          .req x1
3020    PUT_BUFFER      .req x6
3021    PUT_BITS        .req x7
3022    PUT_BITSw       .req w7
3023
3024.macro emit_byte
3025    sub             PUT_BITS, PUT_BITS, #0x8
3026    lsr             x19, PUT_BUFFER, PUT_BITS
3027    uxtb            w19, w19
3028    strb            w19, [BUFFER, #1]!
3029    cmp             w19, #0xff
3030    b.ne            14f
3031    strb            wzr, [BUFFER, #1]!
303214:
3033.endm
3034.macro put_bits CODE, SIZE
3035    lsl             PUT_BUFFER, PUT_BUFFER, \SIZE
3036    add             PUT_BITS, PUT_BITS, \SIZE
3037    orr             PUT_BUFFER, PUT_BUFFER, \CODE
3038.endm
3039.macro checkbuf31
3040    cmp             PUT_BITS, #0x20
3041    b.lt            31f
3042    emit_byte
3043    emit_byte
3044    emit_byte
3045    emit_byte
304631:
3047.endm
3048.macro checkbuf47
3049    cmp             PUT_BITS, #0x30
3050    b.lt            47f
3051    emit_byte
3052    emit_byte
3053    emit_byte
3054    emit_byte
3055    emit_byte
3056    emit_byte
305747:
3058.endm
3059
3060.macro generate_jsimd_huff_encode_one_block fast_tbl
3061
3062.if \fast_tbl == 1
3063asm_function jsimd_huff_encode_one_block_neon
3064.else
3065asm_function jsimd_huff_encode_one_block_neon_slowtbl
3066.endif
3067    sub             sp, sp, 272
3068    sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
3069    /* Save Arm registers */
3070    stp             x19, x20, [sp]
3071    get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_consts
3072    ldr             PUT_BUFFER, [x0, #0x10]
3073    ldr             PUT_BITSw, [x0, #0x18]
3074    ldrsh           w12, [x2]               /* load DC coeff in w12 */
3075    /* prepare data */
3076.if \fast_tbl == 1
3077    ld1             {v23.16b}, [x15], #16
3078    ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
3079    ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
3080    ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
3081    ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
3082    ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
3083    sub             w12, w12, w3      /* last_dc_val, not used afterwards */
3084    /* ZigZag 8x8 */
3085    tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
3086    tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
3087    tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
3088    tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
3089    tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
3090    tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
3091    tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
3092    tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
3093    ins             v0.h[0], w12
3094    tbx             v1.16b, {v28.16b}, v16.16b
3095    tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
3096    tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
3097    tbx             v6.16b, {v31.16b}, v19.16b
3098.else
3099      add             x13, x2, #0x22
3100      sub             w12, w12, w3    /* last_dc_val, not used afterwards */
3101    ld1             {v23.16b}, [x15]
3102      add             x14, x2, #0x18
3103      add             x3, x2, #0x36
3104    ins             v0.h[0], w12
3105      add             x9, x2, #0x2
3106    ld1             {v1.h}[0], [x13]
3107      add             x15, x2, #0x30
3108    ld1             {v2.h}[0], [x14]
3109      add             x19, x2, #0x26
3110    ld1             {v3.h}[0], [x3]
3111      add             x20, x2, #0x28
3112    ld1             {v0.h}[1], [x9]
3113      add             x12, x2, #0x10
3114    ld1             {v1.h}[1], [x15]
3115      add             x13, x2, #0x40
3116    ld1             {v2.h}[1], [x19]
3117      add             x14, x2, #0x34
3118    ld1             {v3.h}[1], [x20]
3119      add             x3, x2, #0x1a
3120    ld1             {v0.h}[2], [x12]
3121      add             x9, x2, #0x20
3122    ld1             {v1.h}[2], [x13]
3123      add             x15, x2, #0x32
3124    ld1             {v2.h}[2], [x14]
3125      add             x19, x2, #0x42
3126    ld1             {v3.h}[2], [x3]
3127      add             x20, x2, #0xc
3128    ld1             {v0.h}[3], [x9]
3129      add             x12, x2, #0x12
3130    ld1             {v1.h}[3], [x15]
3131      add             x13, x2, #0x24
3132    ld1             {v2.h}[3], [x19]
3133      add             x14, x2, #0x50
3134    ld1             {v3.h}[3], [x20]
3135      add             x3, x2, #0xe
3136    ld1             {v0.h}[4], [x12]
3137      add             x9, x2, #0x4
3138    ld1             {v1.h}[4], [x13]
3139      add             x15, x2, #0x16
3140    ld1             {v2.h}[4], [x14]
3141      add             x19, x2, #0x60
3142    ld1             {v3.h}[4], [x3]
3143      add             x20, x2, #0x1c
3144    ld1             {v0.h}[5], [x9]
3145      add             x12, x2, #0x6
3146    ld1             {v1.h}[5], [x15]
3147      add             x13, x2, #0x8
3148    ld1             {v2.h}[5], [x19]
3149      add             x14, x2, #0x52
3150    ld1             {v3.h}[5], [x20]
3151      add             x3, x2, #0x2a
3152    ld1             {v0.h}[6], [x12]
3153      add             x9, x2, #0x14
3154    ld1             {v1.h}[6], [x13]
3155      add             x15, x2, #0xa
3156    ld1             {v2.h}[6], [x14]
3157      add             x19, x2, #0x44
3158    ld1             {v3.h}[6], [x3]
3159      add             x20, x2, #0x38
3160    ld1             {v0.h}[7], [x9]
3161      add             x12, x2, #0x46
3162    ld1             {v1.h}[7], [x15]
3163      add             x13, x2, #0x3a
3164    ld1             {v2.h}[7], [x19]
3165      add             x14, x2, #0x74
3166    ld1             {v3.h}[7], [x20]
3167      add             x3, x2, #0x6a
3168    ld1             {v4.h}[0], [x12]
3169      add             x9, x2, #0x54
3170    ld1             {v5.h}[0], [x13]
3171      add             x15, x2, #0x2c
3172    ld1             {v6.h}[0], [x14]
3173      add             x19, x2, #0x76
3174    ld1             {v7.h}[0], [x3]
3175      add             x20, x2, #0x78
3176    ld1             {v4.h}[1], [x9]
3177      add             x12, x2, #0x62
3178    ld1             {v5.h}[1], [x15]
3179      add             x13, x2, #0x1e
3180    ld1             {v6.h}[1], [x19]
3181      add             x14, x2, #0x68
3182    ld1             {v7.h}[1], [x20]
3183      add             x3, x2, #0x7a
3184    ld1             {v4.h}[2], [x12]
3185      add             x9, x2, #0x70
3186    ld1             {v5.h}[2], [x13]
3187      add             x15, x2, #0x2e
3188    ld1             {v6.h}[2], [x14]
3189      add             x19, x2, #0x5a
3190    ld1             {v7.h}[2], [x3]
3191      add             x20, x2, #0x6c
3192    ld1             {v4.h}[3], [x9]
3193      add             x12, x2, #0x72
3194    ld1             {v5.h}[3], [x15]
3195      add             x13, x2, #0x3c
3196    ld1             {v6.h}[3], [x19]
3197      add             x14, x2, #0x4c
3198    ld1             {v7.h}[3], [x20]
3199      add             x3, x2, #0x5e
3200    ld1             {v4.h}[4], [x12]
3201      add             x9, x2, #0x64
3202    ld1             {v5.h}[4], [x13]
3203      add             x15, x2, #0x4a
3204    ld1             {v6.h}[4], [x14]
3205      add             x19, x2, #0x3e
3206    ld1             {v7.h}[4], [x3]
3207      add             x20, x2, #0x6e
3208    ld1             {v4.h}[5], [x9]
3209      add             x12, x2, #0x56
3210    ld1             {v5.h}[5], [x15]
3211      add             x13, x2, #0x58
3212    ld1             {v6.h}[5], [x19]
3213      add             x14, x2, #0x4e
3214    ld1             {v7.h}[5], [x20]
3215      add             x3, x2, #0x7c
3216    ld1             {v4.h}[6], [x12]
3217      add             x9, x2, #0x48
3218    ld1             {v5.h}[6], [x13]
3219      add             x15, x2, #0x66
3220    ld1             {v6.h}[6], [x14]
3221      add             x19, x2, #0x5c
3222    ld1             {v7.h}[6], [x3]
3223      add             x20, x2, #0x7e
3224    ld1             {v4.h}[7], [x9]
3225    ld1             {v5.h}[7], [x15]
3226    ld1             {v6.h}[7], [x19]
3227    ld1             {v7.h}[7], [x20]
3228.endif
3229    cmlt            v24.8h, v0.8h, #0
3230    cmlt            v25.8h, v1.8h, #0
3231    cmlt            v26.8h, v2.8h, #0
3232    cmlt            v27.8h, v3.8h, #0
3233    cmlt            v28.8h, v4.8h, #0
3234    cmlt            v29.8h, v5.8h, #0
3235    cmlt            v30.8h, v6.8h, #0
3236    cmlt            v31.8h, v7.8h, #0
3237    abs             v0.8h, v0.8h
3238    abs             v1.8h, v1.8h
3239    abs             v2.8h, v2.8h
3240    abs             v3.8h, v3.8h
3241    abs             v4.8h, v4.8h
3242    abs             v5.8h, v5.8h
3243    abs             v6.8h, v6.8h
3244    abs             v7.8h, v7.8h
3245    eor             v24.16b, v24.16b, v0.16b
3246    eor             v25.16b, v25.16b, v1.16b
3247    eor             v26.16b, v26.16b, v2.16b
3248    eor             v27.16b, v27.16b, v3.16b
3249    eor             v28.16b, v28.16b, v4.16b
3250    eor             v29.16b, v29.16b, v5.16b
3251    eor             v30.16b, v30.16b, v6.16b
3252    eor             v31.16b, v31.16b, v7.16b
3253    cmeq            v16.8h, v0.8h, #0
3254    cmeq            v17.8h, v1.8h, #0
3255    cmeq            v18.8h, v2.8h, #0
3256    cmeq            v19.8h, v3.8h, #0
3257    cmeq            v20.8h, v4.8h, #0
3258    cmeq            v21.8h, v5.8h, #0
3259    cmeq            v22.8h, v6.8h, #0
3260    xtn             v16.8b, v16.8h
3261    xtn             v18.8b, v18.8h
3262    xtn             v20.8b, v20.8h
3263    xtn             v22.8b, v22.8h
3264      umov            w14, v0.h[0]
3265    xtn2            v16.16b, v17.8h
3266      umov            w13, v24.h[0]
3267    xtn2            v18.16b, v19.8h
3268      clz             w14, w14
3269    xtn2            v20.16b, v21.8h
3270      lsl             w13, w13, w14
3271    cmeq            v17.8h, v7.8h, #0
3272      sub             w12, w14, #32
3273    xtn2            v22.16b, v17.8h
3274      lsr             w13, w13, w14
3275    and             v16.16b, v16.16b, v23.16b
3276      neg             w12, w12
3277    and             v18.16b, v18.16b, v23.16b
3278      add             x3, x4, #0x400           /* r1 = dctbl->ehufsi */
3279    and             v20.16b, v20.16b, v23.16b
3280      add             x15, sp, #0x90           /* x15 = t2 */
3281    and             v22.16b, v22.16b, v23.16b
3282      ldr             w10, [x4, x12, lsl #2]
3283    addp            v16.16b, v16.16b, v18.16b
3284      ldrb            w11, [x3, x12]
3285    addp            v20.16b, v20.16b, v22.16b
3286      checkbuf47
3287    addp            v16.16b, v16.16b, v20.16b
3288      put_bits        x10, x11
3289    addp            v16.16b, v16.16b, v18.16b
3290      checkbuf47
3291    umov            x9, v16.D[0]
3292      put_bits        x13, x12
3293    cnt             v17.8b, v16.8b
3294      mvn             x9, x9
3295    addv            B18, v17.8b
3296      add             x4, x5, #0x400   /* x4 = actbl->ehufsi */
3297    umov            w12, v18.b[0]
3298      lsr             x9, x9, #0x1     /* clear AC coeff */
3299    ldr             w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
3300    rbit            x9, x9             /* x9 = index0 */
3301    ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
3302    cmp             w12, #(64-8)
3303    add             x11, sp, #16
3304    b.lt            4f
3305    cbz             x9, 6f
3306    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
3307    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
3308    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
3309    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
33101:
3311    clz             x2, x9
3312    add             x15, x15, x2, lsl #1
3313    lsl             x9, x9, x2
3314    ldrh            w20, [x15, #-126]
33152:
3316    cmp             x2, #0x10
3317    b.lt            3f
3318    sub             x2, x2, #0x10
3319    checkbuf47
3320    put_bits        x13, x14
3321    b               2b
33223:
3323    clz             w20, w20
3324    ldrh            w3, [x15, #2]!
3325    sub             w11, w20, #32
3326    lsl             w3, w3, w20
3327    neg             w11, w11
3328    lsr             w3, w3, w20
3329    add             x2, x11, x2, lsl #4
3330    lsl             x9, x9, #0x1
3331    ldr             w12, [x5, x2, lsl #2]
3332    ldrb            w10, [x4, x2]
3333    checkbuf31
3334    put_bits        x12, x10
3335    put_bits        x3, x11
3336    cbnz            x9, 1b
3337    b               6f
33384:
3339    movi            v21.8h, #0x0010
3340    clz             v0.8h, v0.8h
3341    clz             v1.8h, v1.8h
3342    clz             v2.8h, v2.8h
3343    clz             v3.8h, v3.8h
3344    clz             v4.8h, v4.8h
3345    clz             v5.8h, v5.8h
3346    clz             v6.8h, v6.8h
3347    clz             v7.8h, v7.8h
3348    ushl            v24.8h, v24.8h, v0.8h
3349    ushl            v25.8h, v25.8h, v1.8h
3350    ushl            v26.8h, v26.8h, v2.8h
3351    ushl            v27.8h, v27.8h, v3.8h
3352    ushl            v28.8h, v28.8h, v4.8h
3353    ushl            v29.8h, v29.8h, v5.8h
3354    ushl            v30.8h, v30.8h, v6.8h
3355    ushl            v31.8h, v31.8h, v7.8h
3356    neg             v0.8h, v0.8h
3357    neg             v1.8h, v1.8h
3358    neg             v2.8h, v2.8h
3359    neg             v3.8h, v3.8h
3360    neg             v4.8h, v4.8h
3361    neg             v5.8h, v5.8h
3362    neg             v6.8h, v6.8h
3363    neg             v7.8h, v7.8h
3364    ushl            v24.8h, v24.8h, v0.8h
3365    ushl            v25.8h, v25.8h, v1.8h
3366    ushl            v26.8h, v26.8h, v2.8h
3367    ushl            v27.8h, v27.8h, v3.8h
3368    ushl            v28.8h, v28.8h, v4.8h
3369    ushl            v29.8h, v29.8h, v5.8h
3370    ushl            v30.8h, v30.8h, v6.8h
3371    ushl            v31.8h, v31.8h, v7.8h
3372    add             v0.8h, v21.8h, v0.8h
3373    add             v1.8h, v21.8h, v1.8h
3374    add             v2.8h, v21.8h, v2.8h
3375    add             v3.8h, v21.8h, v3.8h
3376    add             v4.8h, v21.8h, v4.8h
3377    add             v5.8h, v21.8h, v5.8h
3378    add             v6.8h, v21.8h, v6.8h
3379    add             v7.8h, v21.8h, v7.8h
3380    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
3381    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
3382    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
3383    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
33841:
3385    clz             x2, x9
3386    add             x15, x15, x2, lsl #1
3387    lsl             x9, x9, x2
3388    ldrh            w11, [x15, #-126]
33892:
3390    cmp             x2, #0x10
3391    b.lt            3f
3392    sub             x2, x2, #0x10
3393    checkbuf47
3394    put_bits        x13, x14
3395    b               2b
33963:
3397    ldrh            w3, [x15, #2]!
3398    add             x2, x11, x2, lsl #4
3399    lsl             x9, x9, #0x1
3400    ldr             w12, [x5, x2, lsl #2]
3401    ldrb            w10, [x4, x2]
3402    checkbuf31
3403    put_bits        x12, x10
3404    put_bits        x3, x11
3405    cbnz            x9, 1b
34066:
3407    add             x13, sp, #0x10e
3408    cmp             x15, x13
3409    b.hs            1f
3410    ldr             w12, [x5]
3411    ldrb            w14, [x4]
3412    checkbuf47
3413    put_bits        x12, x14
34141:
3415    str             PUT_BUFFER, [x0, #0x10]
3416    str             PUT_BITSw, [x0, #0x18]
3417    ldp             x19, x20, [sp], 16
3418    add             x0, BUFFER, #0x1
3419    add             sp, sp, 256
3420    br              x30
3421
3422.endm
3423
3424generate_jsimd_huff_encode_one_block 1
3425generate_jsimd_huff_encode_one_block 0
3426
3427    .unreq          BUFFER
3428    .unreq          PUT_BUFFER
3429    .unreq          PUT_BITS
3430    .unreq          PUT_BITSw
3431
3432.purgem emit_byte
3433.purgem put_bits
3434.purgem checkbuf31
3435.purgem checkbuf47
3436