1/*
2 * ARMv7 NEON optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
5 * All rights reserved.
6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7 * Copyright (C) 2014 Siarhei Siamashka.  All Rights Reserved.
8 * Copyright (C) 2014 Linaro Limited.  All Rights Reserved.
9 * Copyright (C) 2015 D. R. Commander.  All Rights Reserved.
10 * Copyright (C) 2015-2016 Matthieu Darbois.  All Rights Reserved.
11 *
12 * This software is provided 'as-is', without any express or implied
13 * warranty.  In no event will the authors be held liable for any damages
14 * arising from the use of this software.
15 *
16 * Permission is granted to anyone to use this software for any purpose,
17 * including commercial applications, and to alter it and redistribute it
18 * freely, subject to the following restrictions:
19 *
20 * 1. The origin of this software must not be misrepresented; you must not
21 *    claim that you wrote the original software. If you use this software
22 *    in a product, an acknowledgment in the product documentation would be
23 *    appreciated but is not required.
24 * 2. Altered source versions must be plainly marked as such, and must not be
25 *    misrepresented as being the original software.
26 * 3. This notice may not be removed or altered from any source distribution.
27 */
28
29#if defined(__linux__) && defined(__ELF__)
30.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
31#endif
32
33.text
34.fpu neon
35.arch armv7a
36.object_arch armv4
37.arm
38.syntax unified
39
40
41#define RESPECT_STRICT_ALIGNMENT 1
42
43
44/*****************************************************************************/
45
46/* Supplementary macro for setting function attributes */
47.macro asm_function fname
48#ifdef __APPLE__
49    .globl _\fname
50_\fname:
51#else
52    .global \fname
53#ifdef __ELF__
54    .hidden \fname
55    .type \fname, %function
56#endif
57\fname:
58#endif
59.endm
60
61/* Transpose a block of 4x4 coefficients in four 64-bit registers */
62.macro transpose_4x4 x0, x1, x2, x3
63    vtrn.16         \x0, \x1
64    vtrn.16         \x2, \x3
65    vtrn.32         \x0, \x2
66    vtrn.32         \x1, \x3
67.endm
68
69
70#define CENTERJSAMPLE 128
71
72/*****************************************************************************/
73
74/*
75 * Perform dequantization and inverse DCT on one block of coefficients.
76 *
77 * GLOBAL(void)
78 * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
79 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
80 */
81
82#define FIX_0_298631336 (2446)
83#define FIX_0_390180644 (3196)
84#define FIX_0_541196100 (4433)
85#define FIX_0_765366865 (6270)
86#define FIX_0_899976223 (7373)
87#define FIX_1_175875602 (9633)
88#define FIX_1_501321110 (12299)
89#define FIX_1_847759065 (15137)
90#define FIX_1_961570560 (16069)
91#define FIX_2_053119869 (16819)
92#define FIX_2_562915447 (20995)
93#define FIX_3_072711026 (25172)
94
95#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
96#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
97#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
98#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
99#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
100#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
101#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
102#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
103
104/*
105 * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
106 * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
107 */
108#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
109{                                                                             \
110    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
111    JLONG   q1, q2, q3, q4, q5, q6, q7;                                       \
112    JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
113                                                                              \
114    /* 1-D iDCT input data */                                                 \
115    row0 = xrow0;                                                             \
116    row1 = xrow1;                                                             \
117    row2 = xrow2;                                                             \
118    row3 = xrow3;                                                             \
119    row4 = xrow4;                                                             \
120    row5 = xrow5;                                                             \
121    row6 = xrow6;                                                             \
122    row7 = xrow7;                                                             \
123                                                                              \
124    q5 = row7 + row3;                                                         \
125    q4 = row5 + row1;                                                         \
126    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
127         MULTIPLY(q4, FIX_1_175875602);                                       \
128    q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
129         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
130    q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
131         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
132    q4 = q6;                                                                  \
133    q3 = ((JLONG) row0 - (JLONG) row4) << 13;                                 \
134    q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
135          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
136    /* now we can use q1 (reloadable constants have been used up) */          \
137    q1 = q3 + q2;                                                             \
138    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
139          MULTIPLY(row1, -FIX_0_899976223);                                   \
140    q5 = q7;                                                                  \
141    q1 = q1 + q6;                                                             \
142    q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
143          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
144                                                                              \
145    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
146    tmp11_plus_tmp2 = q1;                                                     \
147    row1 = 0;                                                                 \
148                                                                              \
149    q1 = q1 - q6;                                                             \
150    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
151          MULTIPLY(row3, -FIX_2_562915447);                                   \
152    q1 = q1 - q6;                                                             \
153    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
154         MULTIPLY(row6, FIX_0_541196100);                                     \
155    q3 = q3 - q2;                                                             \
156                                                                              \
157    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
158    tmp11_minus_tmp2 = q1;                                                    \
159                                                                              \
160    q1 = ((JLONG) row0 + (JLONG) row4) << 13;                                 \
161    q2 = q1 + q6;                                                             \
162    q1 = q1 - q6;                                                             \
163                                                                              \
164    /* pick up the results */                                                 \
165    tmp0  = q4;                                                               \
166    tmp1  = q5;                                                               \
167    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
168    tmp3  = q7;                                                               \
169    tmp10 = q2;                                                               \
170    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
171    tmp12 = q3;                                                               \
172    tmp13 = q1;                                                               \
173}
174
175#define XFIX_0_899976223                   d0[0]
176#define XFIX_0_541196100                   d0[1]
177#define XFIX_2_562915447                   d0[2]
178#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
179#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
180#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
181#define XFIX_0_541196100_PLUS_0_765366865  d1[2]
182#define XFIX_1_175875602                   d1[3]
183#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
184#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
185#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
186#define XFIX_1_175875602_MINUS_1_961570560 d2[3]
187
188.balign 16
189jsimd_idct_islow_neon_consts:
190  .short FIX_0_899976223                    /* d0[0] */
191  .short FIX_0_541196100                    /* d0[1] */
192  .short FIX_2_562915447                    /* d0[2] */
193  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
194  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
195  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
196  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
197  .short FIX_1_175875602                    /* d1[3] */
198  /* reloadable constants */
199  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
200  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
201  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
202  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
203
204asm_function jsimd_idct_islow_neon
205
206    DCT_TABLE       .req r0
207    COEF_BLOCK      .req r1
208    OUTPUT_BUF      .req r2
209    OUTPUT_COL      .req r3
210    TMP1            .req r0
211    TMP2            .req r1
212    TMP3            .req r2
213    TMP4            .req ip
214
215    ROW0L           .req d16
216    ROW0R           .req d17
217    ROW1L           .req d18
218    ROW1R           .req d19
219    ROW2L           .req d20
220    ROW2R           .req d21
221    ROW3L           .req d22
222    ROW3R           .req d23
223    ROW4L           .req d24
224    ROW4R           .req d25
225    ROW5L           .req d26
226    ROW5R           .req d27
227    ROW6L           .req d28
228    ROW6R           .req d29
229    ROW7L           .req d30
230    ROW7R           .req d31
231
232    /* Load and dequantize coefficients into NEON registers
233     * with the following allocation:
234     *       0 1 2 3 | 4 5 6 7
235     *      ---------+--------
236     *   0 | d16     | d17     ( q8  )
237     *   1 | d18     | d19     ( q9  )
238     *   2 | d20     | d21     ( q10 )
239     *   3 | d22     | d23     ( q11 )
240     *   4 | d24     | d25     ( q12 )
241     *   5 | d26     | d27     ( q13 )
242     *   6 | d28     | d29     ( q14 )
243     *   7 | d30     | d31     ( q15 )
244     */
245    adr             ip, jsimd_idct_islow_neon_consts
246    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
247    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
248    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
249    vmul.s16        q8, q8, q0
250    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
251    vmul.s16        q9, q9, q1
252    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
253    vmul.s16        q10, q10, q2
254    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
255    vmul.s16        q11, q11, q3
256    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
257    vmul.s16        q12, q12, q0
258    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
259    vmul.s16        q14, q14, q2
260    vmul.s16        q13, q13, q1
261    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
262    add             ip, ip, #16
263    vmul.s16        q15, q15, q3
264    vpush           {d8-d15}                      /* save NEON registers */
265    /* 1-D IDCT, pass 1, left 4x8 half */
266    vadd.s16        d4, ROW7L, ROW3L
267    vadd.s16        d5, ROW5L, ROW1L
268    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
269    vmlal.s16       q6, d5, XFIX_1_175875602
270    vmull.s16       q7, d4, XFIX_1_175875602
271      /* Check for the zero coefficients in the right 4x8 half */
272      push            {r4, r5}
273    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
274    vsubl.s16       q3, ROW0L, ROW4L
275      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
276    vmull.s16       q2, ROW2L, XFIX_0_541196100
277    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
278      orr             r0, r4, r5
279    vmov            q4, q6
280    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
281      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
282    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
283    vshl.s32        q3, q3, #13
284      orr             r0, r0, r4
285    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
286      orr             r0, r0, r5
287    vadd.s32        q1, q3, q2
288      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
289    vmov            q5, q7
290    vadd.s32        q1, q1, q6
291      orr             r0, r0, r4
292    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
293      orr             r0, r0, r5
294    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
295    vrshrn.s32      ROW1L, q1, #11
296      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
297    vsub.s32        q1, q1, q6
298    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
299      orr             r0, r0, r4
300    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
301      orr             r0, r0, r5
302    vsub.s32        q1, q1, q6
303    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
304      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
305    vmlal.s16       q6, ROW6L, XFIX_0_541196100
306    vsub.s32        q3, q3, q2
307      orr             r0, r0, r4
308    vrshrn.s32      ROW6L, q1, #11
309      orr             r0, r0, r5
310    vadd.s32        q1, q3, q5
311      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
312    vsub.s32        q3, q3, q5
313    vaddl.s16       q5, ROW0L, ROW4L
314      orr             r0, r0, r4
315    vrshrn.s32      ROW2L, q1, #11
316      orr             r0, r0, r5
317    vrshrn.s32      ROW5L, q3, #11
318      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
319    vshl.s32        q5, q5, #13
320    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
321      orr             r0, r0, r4
322    vadd.s32        q2, q5, q6
323      orrs            r0, r0, r5
324    vsub.s32        q1, q5, q6
325    vadd.s32        q6, q2, q7
326      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
327    vsub.s32        q2, q2, q7
328    vadd.s32        q5, q1, q4
329      orr             r0, r4, r5
330    vsub.s32        q3, q1, q4
331      pop             {r4, r5}
332    vrshrn.s32      ROW7L, q2, #11
333    vrshrn.s32      ROW3L, q5, #11
334    vrshrn.s32      ROW0L, q6, #11
335    vrshrn.s32      ROW4L, q3, #11
336
337      beq             3f  /* Go to do some special handling for the sparse
338                             right 4x8 half */
339
340    /* 1-D IDCT, pass 1, right 4x8 half */
341    vld1.s16        {d2}, [ip, :64]  /* reload constants */
342    vadd.s16        d10, ROW7R, ROW3R
343    vadd.s16        d8, ROW5R, ROW1R
344      /* Transpose left 4x8 half */
345      vtrn.16         ROW6L, ROW7L
346    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
347    vmlal.s16       q6, d8, XFIX_1_175875602
348      vtrn.16         ROW2L, ROW3L
349    vmull.s16       q7, d10, XFIX_1_175875602
350    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
351      vtrn.16         ROW0L, ROW1L
352    vsubl.s16       q3, ROW0R, ROW4R
353    vmull.s16       q2, ROW2R, XFIX_0_541196100
354    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
355      vtrn.16         ROW4L, ROW5L
356    vmov            q4, q6
357    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
358    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
359      vtrn.32         ROW1L, ROW3L
360    vshl.s32        q3, q3, #13
361    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
362      vtrn.32         ROW4L, ROW6L
363    vadd.s32        q1, q3, q2
364    vmov            q5, q7
365    vadd.s32        q1, q1, q6
366      vtrn.32         ROW0L, ROW2L
367    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
368    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
369    vrshrn.s32      ROW1R, q1, #11
370      vtrn.32         ROW5L, ROW7L
371    vsub.s32        q1, q1, q6
372    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
373    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
374    vsub.s32        q1, q1, q6
375    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
376    vmlal.s16       q6, ROW6R, XFIX_0_541196100
377    vsub.s32        q3, q3, q2
378    vrshrn.s32      ROW6R, q1, #11
379    vadd.s32        q1, q3, q5
380    vsub.s32        q3, q3, q5
381    vaddl.s16       q5, ROW0R, ROW4R
382    vrshrn.s32      ROW2R, q1, #11
383    vrshrn.s32      ROW5R, q3, #11
384    vshl.s32        q5, q5, #13
385    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
386    vadd.s32        q2, q5, q6
387    vsub.s32        q1, q5, q6
388    vadd.s32        q6, q2, q7
389    vsub.s32        q2, q2, q7
390    vadd.s32        q5, q1, q4
391    vsub.s32        q3, q1, q4
392    vrshrn.s32      ROW7R, q2, #11
393    vrshrn.s32      ROW3R, q5, #11
394    vrshrn.s32      ROW0R, q6, #11
395    vrshrn.s32      ROW4R, q3, #11
396    /* Transpose right 4x8 half */
397    vtrn.16         ROW6R, ROW7R
398    vtrn.16         ROW2R, ROW3R
399    vtrn.16         ROW0R, ROW1R
400    vtrn.16         ROW4R, ROW5R
401    vtrn.32         ROW1R, ROW3R
402    vtrn.32         ROW4R, ROW6R
403    vtrn.32         ROW0R, ROW2R
404    vtrn.32         ROW5R, ROW7R
405
4061:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
407    vld1.s16        {d2}, [ip, :64]               /* reload constants */
408    vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
409    vmlal.s16       q6, ROW1L, XFIX_1_175875602
410    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
411    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
412    vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
413    vmlal.s16       q7, ROW3L, XFIX_1_175875602
414    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
415    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
416    vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
417    vmull.s16       q2, ROW2L, XFIX_0_541196100
418    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
419    vmov            q4, q6
420    vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
421    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
422    vshl.s32        q3, q3, #13
423    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
424    vadd.s32        q1, q3, q2
425    vmov            q5, q7
426    vadd.s32        q1, q1, q6
427    vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
428    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
429    vshrn.s32       ROW1L, q1, #16
430    vsub.s32        q1, q1, q6
431    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
432    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
433    vsub.s32        q1, q1, q6
434    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
435    vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
436    vsub.s32        q3, q3, q2
437    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
438    vadd.s32        q1, q3, q5
439    vsub.s32        q3, q3, q5
440    vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
441    vshrn.s32       ROW2L, q1, #16
442    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
443    vshl.s32        q5, q5, #13
444    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
445    vadd.s32        q2, q5, q6
446    vsub.s32        q1, q5, q6
447    vadd.s32        q6, q2, q7
448    vsub.s32        q2, q2, q7
449    vadd.s32        q5, q1, q4
450    vsub.s32        q3, q1, q4
451    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
452    vshrn.s32       ROW3L, q5, #16
453    vshrn.s32       ROW0L, q6, #16
454    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
455    /* 1-D IDCT, pass 2, right 4x8 half */
456    vld1.s16        {d2}, [ip, :64]               /* reload constants */
457    vmull.s16       q6, ROW5R, XFIX_1_175875602
458    vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
459    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
460    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
461    vmull.s16       q7, ROW7R, XFIX_1_175875602
462    vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
463    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
464    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
465    vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
466    vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
467    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
468    vmov            q4, q6
469    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
470    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
471    vshl.s32        q3, q3, #13
472    vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
473    vadd.s32        q1, q3, q2
474    vmov            q5, q7
475    vadd.s32        q1, q1, q6
476    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
477    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
478    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
479    vsub.s32        q1, q1, q6
480    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
481    vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
482    vsub.s32        q1, q1, q6
483    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
484    vmlal.s16       q6, ROW6R, XFIX_0_541196100
485    vsub.s32        q3, q3, q2
486    vshrn.s32       ROW6R, q1, #16
487    vadd.s32        q1, q3, q5
488    vsub.s32        q3, q3, q5
489    vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
490    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
491    vshrn.s32       ROW5R, q3, #16
492    vshl.s32        q5, q5, #13
493    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
494    vadd.s32        q2, q5, q6
495    vsub.s32        q1, q5, q6
496    vadd.s32        q6, q2, q7
497    vsub.s32        q2, q2, q7
498    vadd.s32        q5, q1, q4
499    vsub.s32        q3, q1, q4
500    vshrn.s32       ROW7R, q2, #16
501    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
502    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
503    vshrn.s32       ROW4R, q3, #16
504
5052:  /* Descale to 8-bit and range limit */
506    vqrshrn.s16     d16, q8, #2
507    vqrshrn.s16     d17, q9, #2
508    vqrshrn.s16     d18, q10, #2
509    vqrshrn.s16     d19, q11, #2
510    vpop            {d8-d15}                      /* restore NEON registers */
511    vqrshrn.s16     d20, q12, #2
512      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
513      vtrn.16         q8, q9
514    vqrshrn.s16     d21, q13, #2
515    vqrshrn.s16     d22, q14, #2
516      vmov.u8         q0, #(CENTERJSAMPLE)
517    vqrshrn.s16     d23, q15, #2
518      vtrn.8          d16, d17
519      vtrn.8          d18, d19
520      vadd.u8         q8, q8, q0
521      vadd.u8         q9, q9, q0
522      vtrn.16         q10, q11
523        /* Store results to the output buffer */
524        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
525        add             TMP1, TMP1, OUTPUT_COL
526        add             TMP2, TMP2, OUTPUT_COL
527        vst1.8          {d16}, [TMP1]
528      vtrn.8          d20, d21
529        vst1.8          {d17}, [TMP2]
530        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
531        add             TMP1, TMP1, OUTPUT_COL
532        add             TMP2, TMP2, OUTPUT_COL
533        vst1.8          {d18}, [TMP1]
534      vadd.u8         q10, q10, q0
535        vst1.8          {d19}, [TMP2]
536        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
537        add             TMP1, TMP1, OUTPUT_COL
538        add             TMP2, TMP2, OUTPUT_COL
539        add             TMP3, TMP3, OUTPUT_COL
540        add             TMP4, TMP4, OUTPUT_COL
541      vtrn.8          d22, d23
542        vst1.8          {d20}, [TMP1]
543      vadd.u8         q11, q11, q0
544        vst1.8          {d21}, [TMP2]
545        vst1.8          {d22}, [TMP3]
546        vst1.8          {d23}, [TMP4]
547    bx              lr
548
5493:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
550
551    /* Transpose left 4x8 half */
552    vtrn.16         ROW6L, ROW7L
553    vtrn.16         ROW2L, ROW3L
554    vtrn.16         ROW0L, ROW1L
555    vtrn.16         ROW4L, ROW5L
556    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
557    vtrn.32         ROW1L, ROW3L
558    vtrn.32         ROW4L, ROW6L
559    vtrn.32         ROW0L, ROW2L
560    vtrn.32         ROW5L, ROW7L
561
562    cmp             r0, #0
563    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
564                           pass */
565
566    /* Only row 0 is non-zero for the right 4x8 half  */
567    vdup.s16        ROW1R, ROW0R[1]
568    vdup.s16        ROW2R, ROW0R[2]
569    vdup.s16        ROW3R, ROW0R[3]
570    vdup.s16        ROW4R, ROW0R[0]
571    vdup.s16        ROW5R, ROW0R[1]
572    vdup.s16        ROW6R, ROW0R[2]
573    vdup.s16        ROW7R, ROW0R[3]
574    vdup.s16        ROW0R, ROW0R[0]
575    b               1b  /* Go to 'normal' second pass */
576
5774:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
578    vld1.s16        {d2}, [ip, :64]               /* reload constants */
579    vmull.s16       q6, ROW1L, XFIX_1_175875602
580    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
581    vmull.s16       q7, ROW3L, XFIX_1_175875602
582    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
583    vmull.s16       q2, ROW2L, XFIX_0_541196100
584    vshll.s16       q3, ROW0L, #13
585    vmov            q4, q6
586    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
587    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
588    vadd.s32        q1, q3, q2
589    vmov            q5, q7
590    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
591    vadd.s32        q1, q1, q6
592    vadd.s32        q6, q6, q6
593    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
594    vshrn.s32       ROW1L, q1, #16
595    vsub.s32        q1, q1, q6
596    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
597    vsub.s32        q3, q3, q2
598    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
599    vadd.s32        q1, q3, q5
600    vsub.s32        q3, q3, q5
601    vshll.s16       q5, ROW0L, #13
602    vshrn.s32       ROW2L, q1, #16
603    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
604    vadd.s32        q2, q5, q6
605    vsub.s32        q1, q5, q6
606    vadd.s32        q6, q2, q7
607    vsub.s32        q2, q2, q7
608    vadd.s32        q5, q1, q4
609    vsub.s32        q3, q1, q4
610    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
611    vshrn.s32       ROW3L, q5, #16
612    vshrn.s32       ROW0L, q6, #16
613    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
614    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
615    vld1.s16        {d2}, [ip, :64]               /* reload constants */
616    vmull.s16       q6, ROW5L, XFIX_1_175875602
617    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
618    vmull.s16       q7, ROW7L, XFIX_1_175875602
619    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
620    vmull.s16       q2, ROW6L, XFIX_0_541196100
621    vshll.s16       q3, ROW4L, #13
622    vmov            q4, q6
623    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
624    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
625    vadd.s32        q1, q3, q2
626    vmov            q5, q7
627    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
628    vadd.s32        q1, q1, q6
629    vadd.s32        q6, q6, q6
630    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
631    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
632    vsub.s32        q1, q1, q6
633    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
634    vsub.s32        q3, q3, q2
635    vshrn.s32       ROW6R, q1, #16
636    vadd.s32        q1, q3, q5
637    vsub.s32        q3, q3, q5
638    vshll.s16       q5, ROW4L, #13
639    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
640    vshrn.s32       ROW5R, q3, #16
641    vadd.s32        q2, q5, q6
642    vsub.s32        q1, q5, q6
643    vadd.s32        q6, q2, q7
644    vsub.s32        q2, q2, q7
645    vadd.s32        q5, q1, q4
646    vsub.s32        q3, q1, q4
647    vshrn.s32       ROW7R, q2, #16
648    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
649    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
650    vshrn.s32       ROW4R, q3, #16
651    b               2b                            /* Go to epilogue */
652
653    .unreq          DCT_TABLE
654    .unreq          COEF_BLOCK
655    .unreq          OUTPUT_BUF
656    .unreq          OUTPUT_COL
657    .unreq          TMP1
658    .unreq          TMP2
659    .unreq          TMP3
660    .unreq          TMP4
661
662    .unreq          ROW0L
663    .unreq          ROW0R
664    .unreq          ROW1L
665    .unreq          ROW1R
666    .unreq          ROW2L
667    .unreq          ROW2R
668    .unreq          ROW3L
669    .unreq          ROW3R
670    .unreq          ROW4L
671    .unreq          ROW4R
672    .unreq          ROW5L
673    .unreq          ROW5R
674    .unreq          ROW6L
675    .unreq          ROW6R
676    .unreq          ROW7L
677    .unreq          ROW7R
678
679
680/*****************************************************************************/
681
682/*
683 * jsimd_idct_ifast_neon
684 *
685 * This function contains a fast, not so accurate integer implementation of
686 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
687 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
688 * function from jidctfst.c
689 *
690 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
691 * But in ARM NEON case some extra additions are required because VQDMULH
692 * instruction can't handle the constants larger than 1. So the expressions
693 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
694 * which introduces an extra addition. Overall, there are 6 extra additions
695 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
696 */
697
698#define XFIX_1_082392200 d0[0]
699#define XFIX_1_414213562 d0[1]
700#define XFIX_1_847759065 d0[2]
701#define XFIX_2_613125930 d0[3]
702
703.balign 16
704jsimd_idct_ifast_neon_consts:
705  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
706  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
707  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
708  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
709
710asm_function jsimd_idct_ifast_neon
711
712    DCT_TABLE       .req r0
713    COEF_BLOCK      .req r1
714    OUTPUT_BUF      .req r2
715    OUTPUT_COL      .req r3
716    TMP1            .req r0
717    TMP2            .req r1
718    TMP3            .req r2
719    TMP4            .req ip
720
721    /* Load and dequantize coefficients into NEON registers
722     * with the following allocation:
723     *       0 1 2 3 | 4 5 6 7
724     *      ---------+--------
725     *   0 | d16     | d17     ( q8  )
726     *   1 | d18     | d19     ( q9  )
727     *   2 | d20     | d21     ( q10 )
728     *   3 | d22     | d23     ( q11 )
729     *   4 | d24     | d25     ( q12 )
730     *   5 | d26     | d27     ( q13 )
731     *   6 | d28     | d29     ( q14 )
732     *   7 | d30     | d31     ( q15 )
733     */
734    adr             ip, jsimd_idct_ifast_neon_consts
735    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
736    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
737    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
738    vmul.s16        q8, q8, q0
739    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
740    vmul.s16        q9, q9, q1
741    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
742    vmul.s16        q10, q10, q2
743    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
744    vmul.s16        q11, q11, q3
745    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
746    vmul.s16        q12, q12, q0
747    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
748    vmul.s16        q14, q14, q2
749    vmul.s16        q13, q13, q1
750    vld1.16         {d0}, [ip, :64]  /* load constants */
751    vmul.s16        q15, q15, q3
752    vpush           {d8-d13}         /* save NEON registers */
753    /* 1-D IDCT, pass 1 */
754    vsub.s16        q2, q10, q14
755    vadd.s16        q14, q10, q14
756    vsub.s16        q1, q11, q13
757    vadd.s16        q13, q11, q13
758    vsub.s16        q5, q9, q15
759    vadd.s16        q15, q9, q15
760    vqdmulh.s16     q4, q2, XFIX_1_414213562
761    vqdmulh.s16     q6, q1, XFIX_2_613125930
762    vadd.s16        q3, q1, q1
763    vsub.s16        q1, q5, q1
764    vadd.s16        q10, q2, q4
765    vqdmulh.s16     q4, q1, XFIX_1_847759065
766    vsub.s16        q2, q15, q13
767    vadd.s16        q3, q3, q6
768    vqdmulh.s16     q6, q2, XFIX_1_414213562
769    vadd.s16        q1, q1, q4
770    vqdmulh.s16     q4, q5, XFIX_1_082392200
771    vsub.s16        q10, q10, q14
772    vadd.s16        q2, q2, q6
773    vsub.s16        q6, q8, q12
774    vadd.s16        q12, q8, q12
775    vadd.s16        q9, q5, q4
776    vadd.s16        q5, q6, q10
777    vsub.s16        q10, q6, q10
778    vadd.s16        q6, q15, q13
779    vadd.s16        q8, q12, q14
780    vsub.s16        q3, q6, q3
781    vsub.s16        q12, q12, q14
782    vsub.s16        q3, q3, q1
783    vsub.s16        q1, q9, q1
784    vadd.s16        q2, q3, q2
785    vsub.s16        q15, q8, q6
786    vadd.s16        q1, q1, q2
787    vadd.s16        q8, q8, q6
788    vadd.s16        q14, q5, q3
789    vsub.s16        q9, q5, q3
790    vsub.s16        q13, q10, q2
791    vadd.s16        q10, q10, q2
792      /* Transpose */
793      vtrn.16         q8, q9
794    vsub.s16        q11, q12, q1
795      vtrn.16         q14, q15
796    vadd.s16        q12, q12, q1
797      vtrn.16         q10, q11
798      vtrn.16         q12, q13
799      vtrn.32         q9, q11
800      vtrn.32         q12, q14
801      vtrn.32         q8, q10
802      vtrn.32         q13, q15
803      vswp            d28, d21
804      vswp            d26, d19
805    /* 1-D IDCT, pass 2 */
806    vsub.s16        q2, q10, q14
807      vswp            d30, d23
808    vadd.s16        q14, q10, q14
809      vswp            d24, d17
810    vsub.s16        q1, q11, q13
811    vadd.s16        q13, q11, q13
812    vsub.s16        q5, q9, q15
813    vadd.s16        q15, q9, q15
814    vqdmulh.s16     q4, q2, XFIX_1_414213562
815    vqdmulh.s16     q6, q1, XFIX_2_613125930
816    vadd.s16        q3, q1, q1
817    vsub.s16        q1, q5, q1
818    vadd.s16        q10, q2, q4
819    vqdmulh.s16     q4, q1, XFIX_1_847759065
820    vsub.s16        q2, q15, q13
821    vadd.s16        q3, q3, q6
822    vqdmulh.s16     q6, q2, XFIX_1_414213562
823    vadd.s16        q1, q1, q4
824    vqdmulh.s16     q4, q5, XFIX_1_082392200
825    vsub.s16        q10, q10, q14
826    vadd.s16        q2, q2, q6
827    vsub.s16        q6, q8, q12
828    vadd.s16        q12, q8, q12
829    vadd.s16        q9, q5, q4
830    vadd.s16        q5, q6, q10
831    vsub.s16        q10, q6, q10
832    vadd.s16        q6, q15, q13
833    vadd.s16        q8, q12, q14
834    vsub.s16        q3, q6, q3
835    vsub.s16        q12, q12, q14
836    vsub.s16        q3, q3, q1
837    vsub.s16        q1, q9, q1
838    vadd.s16        q2, q3, q2
839    vsub.s16        q15, q8, q6
840    vadd.s16        q1, q1, q2
841    vadd.s16        q8, q8, q6
842    vadd.s16        q14, q5, q3
843    vsub.s16        q9, q5, q3
844    vsub.s16        q13, q10, q2
845    vpop            {d8-d13}      /* restore NEON registers */
846    vadd.s16        q10, q10, q2
847    vsub.s16        q11, q12, q1
848    vadd.s16        q12, q12, q1
849    /* Descale to 8-bit and range limit */
850    vmov.u8         q0, #0x80
851    vqshrn.s16      d16, q8, #5
852    vqshrn.s16      d17, q9, #5
853    vqshrn.s16      d18, q10, #5
854    vqshrn.s16      d19, q11, #5
855    vqshrn.s16      d20, q12, #5
856    vqshrn.s16      d21, q13, #5
857    vqshrn.s16      d22, q14, #5
858    vqshrn.s16      d23, q15, #5
859    vadd.u8         q8, q8, q0
860    vadd.u8         q9, q9, q0
861    vadd.u8         q10, q10, q0
862    vadd.u8         q11, q11, q0
863    /* Transpose the final 8-bit samples */
864    vtrn.16         q8, q9
865    vtrn.16         q10, q11
866    vtrn.32         q8, q10
867    vtrn.32         q9, q11
868    vtrn.8          d16, d17
869    vtrn.8          d18, d19
870      /* Store results to the output buffer */
871      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
872      add             TMP1, TMP1, OUTPUT_COL
873      add             TMP2, TMP2, OUTPUT_COL
874      vst1.8          {d16}, [TMP1]
875      vst1.8          {d17}, [TMP2]
876      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
877      add             TMP1, TMP1, OUTPUT_COL
878      add             TMP2, TMP2, OUTPUT_COL
879      vst1.8          {d18}, [TMP1]
880    vtrn.8          d20, d21
881      vst1.8          {d19}, [TMP2]
882      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
883      add             TMP1, TMP1, OUTPUT_COL
884      add             TMP2, TMP2, OUTPUT_COL
885      add             TMP3, TMP3, OUTPUT_COL
886      add             TMP4, TMP4, OUTPUT_COL
887      vst1.8          {d20}, [TMP1]
888    vtrn.8          d22, d23
889      vst1.8          {d21}, [TMP2]
890      vst1.8          {d22}, [TMP3]
891      vst1.8          {d23}, [TMP4]
892    bx              lr
893
894    .unreq          DCT_TABLE
895    .unreq          COEF_BLOCK
896    .unreq          OUTPUT_BUF
897    .unreq          OUTPUT_COL
898    .unreq          TMP1
899    .unreq          TMP2
900    .unreq          TMP3
901    .unreq          TMP4
902
903
904/*****************************************************************************/
905
906/*
907 * jsimd_idct_4x4_neon
908 *
909 * This function contains inverse-DCT code for getting reduced-size
910 * 4x4 pixels output from an 8x8 DCT block. It uses the same  calculations
911 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
912 * function from jpeg-6b (jidctred.c).
913 *
914 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
915 *       requires much less arithmetic operations and hence should be faster.
916 *       The primary purpose of this particular NEON optimized function is
917 *       bit exact compatibility with jpeg-6b.
918 *
919 * TODO: a bit better instructions scheduling can be achieved by expanding
920 *       idct_helper/transpose_4x4 macros and reordering instructions,
921 *       but readability will suffer somewhat.
922 */
923
924#define CONST_BITS  13
925
926#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
927#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
928#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
929#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
930#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
931#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
932#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
933#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
934#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
935#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
936#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
937#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
938#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
939#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */
940
941.balign 16
942jsimd_idct_4x4_neon_consts:
943  .short FIX_1_847759065      /* d0[0] */
944  .short -FIX_0_765366865     /* d0[1] */
945  .short -FIX_0_211164243     /* d0[2] */
946  .short FIX_1_451774981      /* d0[3] */
947  .short -FIX_2_172734803     /* d1[0] */
948  .short FIX_1_061594337      /* d1[1] */
949  .short -FIX_0_509795579     /* d1[2] */
950  .short -FIX_0_601344887     /* d1[3] */
951  .short FIX_0_899976223      /* d2[0] */
952  .short FIX_2_562915447      /* d2[1] */
953  .short 1 << (CONST_BITS+1)  /* d2[2] */
954  .short 0                    /* d2[3] */
955
956.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
957    vmull.s16       q14, \x4, d2[2]
958    vmlal.s16       q14, \x8, d0[0]
959    vmlal.s16       q14, \x14, d0[1]
960
961    vmull.s16       q13, \x16, d1[2]
962    vmlal.s16       q13, \x12, d1[3]
963    vmlal.s16       q13, \x10, d2[0]
964    vmlal.s16       q13, \x6, d2[1]
965
966    vmull.s16       q15, \x4, d2[2]
967    vmlsl.s16       q15, \x8, d0[0]
968    vmlsl.s16       q15, \x14, d0[1]
969
970    vmull.s16       q12, \x16, d0[2]
971    vmlal.s16       q12, \x12, d0[3]
972    vmlal.s16       q12, \x10, d1[0]
973    vmlal.s16       q12, \x6, d1[1]
974
975    vadd.s32        q10, q14, q13
976    vsub.s32        q14, q14, q13
977
978  .if \shift > 16
979    vrshr.s32       q10, q10, #\shift
980    vrshr.s32       q14, q14, #\shift
981    vmovn.s32       \y26, q10
982    vmovn.s32       \y29, q14
983  .else
984    vrshrn.s32      \y26, q10, #\shift
985    vrshrn.s32      \y29, q14, #\shift
986  .endif
987
988    vadd.s32        q10, q15, q12
989    vsub.s32        q15, q15, q12
990
991  .if \shift > 16
992    vrshr.s32       q10, q10, #\shift
993    vrshr.s32       q15, q15, #\shift
994    vmovn.s32       \y27, q10
995    vmovn.s32       \y28, q15
996  .else
997    vrshrn.s32      \y27, q10, #\shift
998    vrshrn.s32      \y28, q15, #\shift
999  .endif
1000.endm
1001
1002asm_function jsimd_idct_4x4_neon
1003
1004    DCT_TABLE       .req r0
1005    COEF_BLOCK      .req r1
1006    OUTPUT_BUF      .req r2
1007    OUTPUT_COL      .req r3
1008    TMP1            .req r0
1009    TMP2            .req r1
1010    TMP3            .req r2
1011    TMP4            .req ip
1012
1013    vpush           {d8-d15}
1014
1015    /* Load constants (d3 is just used for padding) */
1016    adr             TMP4, jsimd_idct_4x4_neon_consts
1017    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]
1018
1019    /* Load all COEF_BLOCK into NEON registers with the following allocation:
1020     *       0 1 2 3 | 4 5 6 7
1021     *      ---------+--------
1022     *   0 | d4      | d5
1023     *   1 | d6      | d7
1024     *   2 | d8      | d9
1025     *   3 | d10     | d11
1026     *   4 | -       | -
1027     *   5 | d12     | d13
1028     *   6 | d14     | d15
1029     *   7 | d16     | d17
1030     */
1031    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
1032    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
1033    add COEF_BLOCK, COEF_BLOCK, #16
1034    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
1035    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
1036    /* dequantize */
1037    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
1038    vmul.s16        q2, q2, q9
1039    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
1040    vmul.s16        q3, q3, q10
1041    vmul.s16        q4, q4, q11
1042    add             DCT_TABLE, DCT_TABLE, #16
1043    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
1044    vmul.s16        q5, q5, q12
1045    vmul.s16        q6, q6, q13
1046    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
1047    vmul.s16        q7, q7, q14
1048    vmul.s16        q8, q8, q15
1049
1050    /* Pass 1 */
1051    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
1052    transpose_4x4   d4, d6, d8, d10
1053    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
1054    transpose_4x4   d5, d7, d9, d11
1055
1056    /* Pass 2 */
1057    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
1058    transpose_4x4   d26, d27, d28, d29
1059
1060    /* Range limit */
1061    vmov.u16        q15, #0x80
1062    vadd.s16        q13, q13, q15
1063    vadd.s16        q14, q14, q15
1064    vqmovun.s16     d26, q13
1065    vqmovun.s16     d27, q14
1066
1067    /* Store results to the output buffer */
1068    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
1069    add             TMP1, TMP1, OUTPUT_COL
1070    add             TMP2, TMP2, OUTPUT_COL
1071    add             TMP3, TMP3, OUTPUT_COL
1072    add             TMP4, TMP4, OUTPUT_COL
1073
1074#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
1075    /* We can use much less instructions on little endian systems if the
1076     * OS kernel is not configured to trap unaligned memory accesses
1077     */
1078    vst1.32         {d26[0]}, [TMP1]!
1079    vst1.32         {d27[0]}, [TMP3]!
1080    vst1.32         {d26[1]}, [TMP2]!
1081    vst1.32         {d27[1]}, [TMP4]!
1082#else
1083    vst1.8          {d26[0]}, [TMP1]!
1084    vst1.8          {d27[0]}, [TMP3]!
1085    vst1.8          {d26[1]}, [TMP1]!
1086    vst1.8          {d27[1]}, [TMP3]!
1087    vst1.8          {d26[2]}, [TMP1]!
1088    vst1.8          {d27[2]}, [TMP3]!
1089    vst1.8          {d26[3]}, [TMP1]!
1090    vst1.8          {d27[3]}, [TMP3]!
1091
1092    vst1.8          {d26[4]}, [TMP2]!
1093    vst1.8          {d27[4]}, [TMP4]!
1094    vst1.8          {d26[5]}, [TMP2]!
1095    vst1.8          {d27[5]}, [TMP4]!
1096    vst1.8          {d26[6]}, [TMP2]!
1097    vst1.8          {d27[6]}, [TMP4]!
1098    vst1.8          {d26[7]}, [TMP2]!
1099    vst1.8          {d27[7]}, [TMP4]!
1100#endif
1101
1102    vpop            {d8-d15}
1103    bx              lr
1104
1105    .unreq          DCT_TABLE
1106    .unreq          COEF_BLOCK
1107    .unreq          OUTPUT_BUF
1108    .unreq          OUTPUT_COL
1109    .unreq          TMP1
1110    .unreq          TMP2
1111    .unreq          TMP3
1112    .unreq          TMP4
1113
1114.purgem idct_helper
1115
1116
1117/*****************************************************************************/
1118
1119/*
1120 * jsimd_idct_2x2_neon
1121 *
1122 * This function contains inverse-DCT code for getting reduced-size
1123 * 2x2 pixels output from an 8x8 DCT block. It uses the same  calculations
1124 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1125 * function from jpeg-6b (jidctred.c).
1126 *
1127 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
1128 *       requires much less arithmetic operations and hence should be faster.
1129 *       The primary purpose of this particular NEON optimized function is
1130 *       bit exact compatibility with jpeg-6b.
1131 */
1132
1133.balign 8
1134jsimd_idct_2x2_neon_consts:
1135  .short -FIX_0_720959822  /* d0[0] */
1136  .short FIX_0_850430095   /* d0[1] */
1137  .short -FIX_1_272758580  /* d0[2] */
1138  .short FIX_3_624509785   /* d0[3] */
1139
1140.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
1141    vshll.s16       q14, \x4, #15
1142    vmull.s16       q13, \x6, d0[3]
1143    vmlal.s16       q13, \x10, d0[2]
1144    vmlal.s16       q13, \x12, d0[1]
1145    vmlal.s16       q13, \x16, d0[0]
1146
1147    vadd.s32        q10, q14, q13
1148    vsub.s32        q14, q14, q13
1149
1150  .if \shift > 16
1151    vrshr.s32       q10, q10, #\shift
1152    vrshr.s32       q14, q14, #\shift
1153    vmovn.s32       \y26, q10
1154    vmovn.s32       \y27, q14
1155  .else
1156    vrshrn.s32      \y26, q10, #\shift
1157    vrshrn.s32      \y27, q14, #\shift
1158  .endif
1159.endm
1160
1161asm_function jsimd_idct_2x2_neon
1162
1163    DCT_TABLE       .req r0
1164    COEF_BLOCK      .req r1
1165    OUTPUT_BUF      .req r2
1166    OUTPUT_COL      .req r3
1167    TMP1            .req r0
1168    TMP2            .req ip
1169
1170    vpush           {d8-d15}
1171
1172    /* Load constants */
1173    adr             TMP2, jsimd_idct_2x2_neon_consts
1174    vld1.16         {d0}, [TMP2, :64]
1175
1176    /* Load all COEF_BLOCK into NEON registers with the following allocation:
1177     *       0 1 2 3 | 4 5 6 7
1178     *      ---------+--------
1179     *   0 | d4      | d5
1180     *   1 | d6      | d7
1181     *   2 | -       | -
1182     *   3 | d10     | d11
1183     *   4 | -       | -
1184     *   5 | d12     | d13
1185     *   6 | -       | -
1186     *   7 | d16     | d17
1187     */
1188    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
1189    add             COEF_BLOCK, COEF_BLOCK, #16
1190    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
1191    add             COEF_BLOCK, COEF_BLOCK, #16
1192    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
1193    add             COEF_BLOCK, COEF_BLOCK, #16
1194    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
1195    /* Dequantize */
1196    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
1197    vmul.s16        q2, q2, q9
1198    vmul.s16        q3, q3, q10
1199    add             DCT_TABLE, DCT_TABLE, #16
1200    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
1201    vmul.s16        q5, q5, q12
1202    add             DCT_TABLE, DCT_TABLE, #16
1203    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
1204    vmul.s16        q6, q6, q13
1205    add             DCT_TABLE, DCT_TABLE, #16
1206    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
1207    vmul.s16        q8, q8, q15
1208
1209    /* Pass 1 */
1210#if 0
1211    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
1212    transpose_4x4   d4, d6, d8, d10
1213    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
1214    transpose_4x4   d5, d7, d9, d11
1215#else
1216    vmull.s16       q13, d6, d0[3]
1217    vmlal.s16       q13, d10, d0[2]
1218    vmlal.s16       q13, d12, d0[1]
1219    vmlal.s16       q13, d16, d0[0]
1220    vmull.s16       q12, d7, d0[3]
1221    vmlal.s16       q12, d11, d0[2]
1222    vmlal.s16       q12, d13, d0[1]
1223    vmlal.s16       q12, d17, d0[0]
1224    vshll.s16       q14, d4, #15
1225    vshll.s16       q15, d5, #15
1226    vadd.s32        q10, q14, q13
1227    vsub.s32        q14, q14, q13
1228    vrshrn.s32      d4, q10, #13
1229    vrshrn.s32      d6, q14, #13
1230    vadd.s32        q10, q15, q12
1231    vsub.s32        q14, q15, q12
1232    vrshrn.s32      d5, q10, #13
1233    vrshrn.s32      d7, q14, #13
1234    vtrn.16         q2, q3
1235    vtrn.32         q3, q5
1236#endif
1237
1238    /* Pass 2 */
1239    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27
1240
1241    /* Range limit */
1242    vmov.u16        q15, #0x80
1243    vadd.s16        q13, q13, q15
1244    vqmovun.s16     d26, q13
1245    vqmovun.s16     d27, q13
1246
1247    /* Store results to the output buffer */
1248    ldmia           OUTPUT_BUF, {TMP1, TMP2}
1249    add             TMP1, TMP1, OUTPUT_COL
1250    add             TMP2, TMP2, OUTPUT_COL
1251
1252    vst1.8          {d26[0]}, [TMP1]!
1253    vst1.8          {d27[4]}, [TMP1]!
1254    vst1.8          {d26[1]}, [TMP2]!
1255    vst1.8          {d27[5]}, [TMP2]!
1256
1257    vpop            {d8-d15}
1258    bx              lr
1259
1260    .unreq          DCT_TABLE
1261    .unreq          COEF_BLOCK
1262    .unreq          OUTPUT_BUF
1263    .unreq          OUTPUT_COL
1264    .unreq          TMP1
1265    .unreq          TMP2
1266
1267.purgem idct_helper
1268
1269
1270/*****************************************************************************/
1271
1272/*
1273 * jsimd_ycc_extrgb_convert_neon
1274 * jsimd_ycc_extbgr_convert_neon
1275 * jsimd_ycc_extrgbx_convert_neon
1276 * jsimd_ycc_extbgrx_convert_neon
1277 * jsimd_ycc_extxbgr_convert_neon
1278 * jsimd_ycc_extxrgb_convert_neon
1279 *
1280 * Colorspace conversion YCbCr -> RGB
1281 */
1282
1283
1284.macro do_load size
1285  .if \size == 8
1286    vld1.8          {d4}, [U, :64]!
1287    vld1.8          {d5}, [V, :64]!
1288    vld1.8          {d0}, [Y, :64]!
1289    pld             [U, #64]
1290    pld             [V, #64]
1291    pld             [Y, #64]
1292  .elseif \size == 4
1293    vld1.8          {d4[0]}, [U]!
1294    vld1.8          {d4[1]}, [U]!
1295    vld1.8          {d4[2]}, [U]!
1296    vld1.8          {d4[3]}, [U]!
1297    vld1.8          {d5[0]}, [V]!
1298    vld1.8          {d5[1]}, [V]!
1299    vld1.8          {d5[2]}, [V]!
1300    vld1.8          {d5[3]}, [V]!
1301    vld1.8          {d0[0]}, [Y]!
1302    vld1.8          {d0[1]}, [Y]!
1303    vld1.8          {d0[2]}, [Y]!
1304    vld1.8          {d0[3]}, [Y]!
1305  .elseif \size == 2
1306    vld1.8          {d4[4]}, [U]!
1307    vld1.8          {d4[5]}, [U]!
1308    vld1.8          {d5[4]}, [V]!
1309    vld1.8          {d5[5]}, [V]!
1310    vld1.8          {d0[4]}, [Y]!
1311    vld1.8          {d0[5]}, [Y]!
1312  .elseif \size == 1
1313    vld1.8          {d4[6]}, [U]!
1314    vld1.8          {d5[6]}, [V]!
1315    vld1.8          {d0[6]}, [Y]!
1316  .else
1317    .error unsupported macroblock size
1318  .endif
1319.endm
1320
1321.macro do_store bpp, size
1322  .if \bpp == 24
1323    .if \size == 8
1324      vst3.8        {d10, d11, d12}, [RGB]!
1325    .elseif \size == 4
1326      vst3.8        {d10[0], d11[0], d12[0]}, [RGB]!
1327      vst3.8        {d10[1], d11[1], d12[1]}, [RGB]!
1328      vst3.8        {d10[2], d11[2], d12[2]}, [RGB]!
1329      vst3.8        {d10[3], d11[3], d12[3]}, [RGB]!
1330    .elseif \size == 2
1331      vst3.8        {d10[4], d11[4], d12[4]}, [RGB]!
1332      vst3.8        {d10[5], d11[5], d12[5]}, [RGB]!
1333    .elseif \size == 1
1334      vst3.8        {d10[6], d11[6], d12[6]}, [RGB]!
1335    .else
1336      .error unsupported macroblock size
1337    .endif
1338  .elseif \bpp == 32
1339    .if \size == 8
1340      vst4.8        {d10, d11, d12, d13}, [RGB]!
1341    .elseif \size == 4
1342      vst4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
1343      vst4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
1344      vst4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
1345      vst4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
1346    .elseif \size == 2
1347      vst4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
1348      vst4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
1349    .elseif \size == 1
1350      vst4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
1351    .else
1352      .error unsupported macroblock size
1353    .endif
1354  .elseif \bpp == 16
1355    .if \size == 8
1356      vst1.16       {q15}, [RGB]!
1357    .elseif \size == 4
1358      vst1.16       {d30}, [RGB]!
1359    .elseif \size == 2
1360      vst1.16       {d31[0]}, [RGB]!
1361      vst1.16       {d31[1]}, [RGB]!
1362    .elseif \size == 1
1363      vst1.16       {d31[2]}, [RGB]!
1364    .else
1365      .error unsupported macroblock size
1366    .endif
1367  .else
1368    .error unsupported bpp
1369  .endif
1370.endm
1371
1372.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1373
1374/*
1375 * 2-stage pipelined YCbCr->RGB conversion
1376 */
1377
1378.macro do_yuv_to_rgb_stage1
1379    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
1380    vaddw.u8        q4, q1, d5      /* q2 = v - 128 */
1381    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
1382    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
1383    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
1384    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
1385    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
1386    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
1387    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
1388    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
1389.endm
1390
1391.macro do_yuv_to_rgb_stage2
1392    vrshrn.s32      d20, q10, #15
1393    vrshrn.s32      d21, q11, #15
1394    vrshrn.s32      d24, q12, #14
1395    vrshrn.s32      d25, q13, #14
1396    vrshrn.s32      d28, q14, #14
1397    vrshrn.s32      d29, q15, #14
1398    vaddw.u8        q11, q10, d0
1399    vaddw.u8        q12, q12, d0
1400    vaddw.u8        q14, q14, d0
1401  .if \bpp != 16
1402    vqmovun.s16     d1\g_offs, q11
1403    vqmovun.s16     d1\r_offs, q12
1404    vqmovun.s16     d1\b_offs, q14
1405  .else  /* rgb565 */
1406    vqshlu.s16      q13, q11, #8
1407    vqshlu.s16      q15, q12, #8
1408    vqshlu.s16      q14, q14, #8
1409    vsri.u16        q15, q13, #5
1410    vsri.u16        q15, q14, #11
1411  .endif
1412.endm
1413
1414.macro do_yuv_to_rgb_stage2_store_load_stage1
1415                                       /* "do_yuv_to_rgb_stage2" and "store" */
1416                                       vrshrn.s32      d20, q10, #15
1417    /* "load" and "do_yuv_to_rgb_stage1" */
1418    pld             [U, #64]
1419                                       vrshrn.s32      d21, q11, #15
1420    pld             [V, #64]
1421                                       vrshrn.s32      d24, q12, #14
1422                                       vrshrn.s32      d25, q13, #14
1423    vld1.8          {d4}, [U, :64]!
1424                                       vrshrn.s32      d28, q14, #14
1425    vld1.8          {d5}, [V, :64]!
1426                                       vrshrn.s32      d29, q15, #14
1427    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
1428    vaddw.u8        q4, q1, d5      /* q2 = v - 128 */
1429                                       vaddw.u8        q11, q10, d0
1430    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
1431    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
1432                                       vaddw.u8        q12, q12, d0
1433                                       vaddw.u8        q14, q14, d0
1434  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
1435                                       vqmovun.s16     d1\g_offs, q11
1436    pld             [Y, #64]
1437                                       vqmovun.s16     d1\r_offs, q12
1438    vld1.8          {d0}, [Y, :64]!
1439                                       vqmovun.s16     d1\b_offs, q14
1440    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
1441    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
1442                                       do_store        \bpp, 8
1443    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
1444    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
1445    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
1446    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
1447  .else  /**************************** rgb565 ********************************/
1448                                       vqshlu.s16      q13, q11, #8
1449    pld             [Y, #64]
1450                                       vqshlu.s16      q15, q12, #8
1451                                       vqshlu.s16      q14, q14, #8
1452    vld1.8          {d0}, [Y, :64]!
1453    vmull.s16       q11, d7, d1[1]
1454    vmlal.s16       q11, d9, d1[2]
1455                                       vsri.u16        q15, q13, #5
1456    vmull.s16       q12, d8, d1[0]
1457                                       vsri.u16        q15, q14, #11
1458    vmull.s16       q13, d9, d1[0]
1459    vmull.s16       q14, d6, d1[3]
1460                                       do_store        \bpp, 8
1461    vmull.s16       q15, d7, d1[3]
1462  .endif
1463.endm
1464
1465.macro do_yuv_to_rgb
1466    do_yuv_to_rgb_stage1
1467    do_yuv_to_rgb_stage2
1468.endm
1469
1470/* Apple gas crashes on adrl, work around that by using adr.
1471 * But this requires a copy of these constants for each function.
1472 */
1473
1474.balign 16
1475jsimd_ycc_\colorid\()_neon_consts:
1476  .short 0,      0,     0,      0
1477  .short 22971, -11277, -23401, 29033
1478  .short -128,  -128,   -128,   -128
1479  .short -128,  -128,   -128,   -128
1480
1481asm_function jsimd_ycc_\colorid\()_convert_neon
1482    OUTPUT_WIDTH    .req r0
1483    INPUT_BUF       .req r1
1484    INPUT_ROW       .req r2
1485    OUTPUT_BUF      .req r3
1486    NUM_ROWS        .req r4
1487
1488    INPUT_BUF0      .req r5
1489    INPUT_BUF1      .req r6
1490    INPUT_BUF2      .req INPUT_BUF
1491
1492    RGB             .req r7
1493    Y               .req r8
1494    U               .req r9
1495    V               .req r10
1496    N               .req ip
1497
1498    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
1499    adr             ip, jsimd_ycc_\colorid\()_neon_consts
1500    vld1.16         {d0, d1, d2, d3}, [ip, :128]
1501
1502    /* Save ARM registers and handle input arguments */
1503    push            {r4, r5, r6, r7, r8, r9, r10, lr}
1504    ldr             NUM_ROWS, [sp, #(4 * 8)]
1505    ldr             INPUT_BUF0, [INPUT_BUF]
1506    ldr             INPUT_BUF1, [INPUT_BUF, #4]
1507    ldr             INPUT_BUF2, [INPUT_BUF, #8]
1508    .unreq          INPUT_BUF
1509
1510    /* Save NEON registers */
1511    vpush           {d8-d15}
1512
1513    /* Initially set d10, d11, d12, d13 to 0xFF */
1514    vmov.u8         q5, #255
1515    vmov.u8         q6, #255
1516
1517    /* Outer loop over scanlines */
1518    cmp             NUM_ROWS, #1
1519    blt             9f
15200:
1521    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
1522    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
1523    mov             N, OUTPUT_WIDTH
1524    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
1525    add             INPUT_ROW, INPUT_ROW, #1
1526    ldr             RGB, [OUTPUT_BUF], #4
1527
1528    /* Inner loop over pixels */
1529    subs            N, N, #8
1530    blt             3f
1531    do_load         8
1532    do_yuv_to_rgb_stage1
1533    subs            N, N, #8
1534    blt             2f
15351:
1536    do_yuv_to_rgb_stage2_store_load_stage1
1537    subs            N, N, #8
1538    bge             1b
15392:
1540    do_yuv_to_rgb_stage2
1541    do_store        \bpp, 8
1542    tst             N, #7
1543    beq             8f
15443:
1545    tst             N, #4
1546    beq             3f
1547    do_load         4
15483:
1549    tst             N, #2
1550    beq             4f
1551    do_load         2
15524:
1553    tst             N, #1
1554    beq             5f
1555    do_load         1
15565:
1557    do_yuv_to_rgb
1558    tst             N, #4
1559    beq             6f
1560    do_store        \bpp, 4
15616:
1562    tst             N, #2
1563    beq             7f
1564    do_store        \bpp, 2
15657:
1566    tst             N, #1
1567    beq             8f
1568    do_store        \bpp, 1
15698:
1570    subs            NUM_ROWS, NUM_ROWS, #1
1571    bgt             0b
15729:
1573    /* Restore all registers and return */
1574    vpop            {d8-d15}
1575    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
1576
1577    .unreq          OUTPUT_WIDTH
1578    .unreq          INPUT_ROW
1579    .unreq          OUTPUT_BUF
1580    .unreq          NUM_ROWS
1581    .unreq          INPUT_BUF0
1582    .unreq          INPUT_BUF1
1583    .unreq          INPUT_BUF2
1584    .unreq          RGB
1585    .unreq          Y
1586    .unreq          U
1587    .unreq          V
1588    .unreq          N
1589
1590.purgem do_yuv_to_rgb
1591.purgem do_yuv_to_rgb_stage1
1592.purgem do_yuv_to_rgb_stage2
1593.purgem do_yuv_to_rgb_stage2_store_load_stage1
1594
1595.endm
1596
1597/*--------------------------------- id ----- bpp R  G  B */
1598generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
1599generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
1600generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
1601generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
1602generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
1603generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
1604generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 0, 0
1605
1606.purgem do_load
1607.purgem do_store
1608
1609
1610/*****************************************************************************/
1611
1612/*
1613 * jsimd_extrgb_ycc_convert_neon
1614 * jsimd_extbgr_ycc_convert_neon
1615 * jsimd_extrgbx_ycc_convert_neon
1616 * jsimd_extbgrx_ycc_convert_neon
1617 * jsimd_extxbgr_ycc_convert_neon
1618 * jsimd_extxrgb_ycc_convert_neon
1619 *
1620 * Colorspace conversion RGB -> YCbCr
1621 */
1622
1623.macro do_store size
1624  .if \size == 8
1625    vst1.8          {d20}, [Y]!
1626    vst1.8          {d21}, [U]!
1627    vst1.8          {d22}, [V]!
1628  .elseif \size == 4
1629    vst1.8          {d20[0]}, [Y]!
1630    vst1.8          {d20[1]}, [Y]!
1631    vst1.8          {d20[2]}, [Y]!
1632    vst1.8          {d20[3]}, [Y]!
1633    vst1.8          {d21[0]}, [U]!
1634    vst1.8          {d21[1]}, [U]!
1635    vst1.8          {d21[2]}, [U]!
1636    vst1.8          {d21[3]}, [U]!
1637    vst1.8          {d22[0]}, [V]!
1638    vst1.8          {d22[1]}, [V]!
1639    vst1.8          {d22[2]}, [V]!
1640    vst1.8          {d22[3]}, [V]!
1641  .elseif \size == 2
1642    vst1.8          {d20[4]}, [Y]!
1643    vst1.8          {d20[5]}, [Y]!
1644    vst1.8          {d21[4]}, [U]!
1645    vst1.8          {d21[5]}, [U]!
1646    vst1.8          {d22[4]}, [V]!
1647    vst1.8          {d22[5]}, [V]!
1648  .elseif \size == 1
1649    vst1.8          {d20[6]}, [Y]!
1650    vst1.8          {d21[6]}, [U]!
1651    vst1.8          {d22[6]}, [V]!
1652  .else
1653    .error unsupported macroblock size
1654  .endif
1655.endm
1656
1657.macro do_load bpp, size
1658  .if \bpp == 24
1659    .if \size == 8
1660      vld3.8        {d10, d11, d12}, [RGB]!
1661      pld           [RGB, #128]
1662    .elseif \size == 4
1663      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
1664      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
1665      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
1666      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
1667    .elseif \size == 2
1668      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
1669      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
1670    .elseif \size == 1
1671      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
1672    .else
1673      .error unsupported macroblock size
1674    .endif
1675  .elseif \bpp == 32
1676    .if \size == 8
1677      vld4.8        {d10, d11, d12, d13}, [RGB]!
1678      pld           [RGB, #128]
1679    .elseif \size == 4
1680      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
1681      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
1682      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
1683      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
1684    .elseif \size == 2
1685      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
1686      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
1687    .elseif \size == 1
1688      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
1689    .else
1690      .error unsupported macroblock size
1691    .endif
1692  .else
1693    .error unsupported bpp
1694  .endif
1695.endm
1696
1697.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1698
1699/*
1700 * 2-stage pipelined RGB->YCbCr conversion
1701 */
1702
1703.macro do_rgb_to_yuv_stage1
1704    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
1705    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
1706    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
1707    vmull.u16       q7, d4, d0[0]
1708    vmlal.u16       q7, d6, d0[1]
1709    vmlal.u16       q7, d8, d0[2]
1710    vmull.u16       q8, d5, d0[0]
1711    vmlal.u16       q8, d7, d0[1]
1712    vmlal.u16       q8, d9, d0[2]
1713    vrev64.32       q9, q1
1714    vrev64.32       q13, q1
1715    vmlsl.u16       q9, d4, d0[3]
1716    vmlsl.u16       q9, d6, d1[0]
1717    vmlal.u16       q9, d8, d1[1]
1718    vmlsl.u16       q13, d5, d0[3]
1719    vmlsl.u16       q13, d7, d1[0]
1720    vmlal.u16       q13, d9, d1[1]
1721    vrev64.32       q14, q1
1722    vrev64.32       q15, q1
1723    vmlal.u16       q14, d4, d1[1]
1724    vmlsl.u16       q14, d6, d1[2]
1725    vmlsl.u16       q14, d8, d1[3]
1726    vmlal.u16       q15, d5, d1[1]
1727    vmlsl.u16       q15, d7, d1[2]
1728    vmlsl.u16       q15, d9, d1[3]
1729.endm
1730
1731.macro do_rgb_to_yuv_stage2
1732    vrshrn.u32      d20, q7, #16
1733    vrshrn.u32      d21, q8, #16
1734    vshrn.u32       d22, q9, #16
1735    vshrn.u32       d23, q13, #16
1736    vshrn.u32       d24, q14, #16
1737    vshrn.u32       d25, q15, #16
1738    vmovn.u16       d20, q10       /* d20 = y */
1739    vmovn.u16       d21, q11       /* d21 = u */
1740    vmovn.u16       d22, q12       /* d22 = v */
1741.endm
1742
1743.macro do_rgb_to_yuv
1744    do_rgb_to_yuv_stage1
1745    do_rgb_to_yuv_stage2
1746.endm
1747
1748.macro do_rgb_to_yuv_stage2_store_load_stage1
1749      vrshrn.u32      d20, q7, #16
1750      vrshrn.u32      d21, q8, #16
1751      vshrn.u32       d22, q9, #16
1752    vrev64.32       q9, q1
1753      vshrn.u32       d23, q13, #16
1754    vrev64.32       q13, q1
1755      vshrn.u32       d24, q14, #16
1756      vshrn.u32       d25, q15, #16
1757    do_load         \bpp, 8
1758      vmovn.u16       d20, q10     /* d20 = y */
1759    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
1760      vmovn.u16       d21, q11     /* d21 = u */
1761    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
1762      vmovn.u16       d22, q12     /* d22 = v */
1763    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
1764    vmull.u16       q7, d4, d0[0]
1765    vmlal.u16       q7, d6, d0[1]
1766    vmlal.u16       q7, d8, d0[2]
1767      vst1.8          {d20}, [Y]!
1768    vmull.u16       q8, d5, d0[0]
1769    vmlal.u16       q8, d7, d0[1]
1770    vmlal.u16       q8, d9, d0[2]
1771    vmlsl.u16       q9, d4, d0[3]
1772    vmlsl.u16       q9, d6, d1[0]
1773    vmlal.u16       q9, d8, d1[1]
1774      vst1.8          {d21}, [U]!
1775    vmlsl.u16       q13, d5, d0[3]
1776    vmlsl.u16       q13, d7, d1[0]
1777    vmlal.u16       q13, d9, d1[1]
1778    vrev64.32       q14, q1
1779    vrev64.32       q15, q1
1780    vmlal.u16       q14, d4, d1[1]
1781    vmlsl.u16       q14, d6, d1[2]
1782    vmlsl.u16       q14, d8, d1[3]
1783      vst1.8          {d22}, [V]!
1784    vmlal.u16       q15, d5, d1[1]
1785    vmlsl.u16       q15, d7, d1[2]
1786    vmlsl.u16       q15, d9, d1[3]
1787.endm
1788
1789.balign 16
1790jsimd_\colorid\()_ycc_neon_consts:
1791  .short 19595, 38470, 7471,  11059
1792  .short 21709, 32768, 27439, 5329
1793  .short 32767, 128,   32767, 128
1794  .short 32767, 128,   32767, 128
1795
1796asm_function jsimd_\colorid\()_ycc_convert_neon
1797    OUTPUT_WIDTH    .req r0
1798    INPUT_BUF       .req r1
1799    OUTPUT_BUF      .req r2
1800    OUTPUT_ROW      .req r3
1801    NUM_ROWS        .req r4
1802
1803    OUTPUT_BUF0     .req r5
1804    OUTPUT_BUF1     .req r6
1805    OUTPUT_BUF2     .req OUTPUT_BUF
1806
1807    RGB             .req r7
1808    Y               .req r8
1809    U               .req r9
1810    V               .req r10
1811    N               .req ip
1812
1813    /* Load constants to d0, d1, d2, d3 */
1814    adr             ip, jsimd_\colorid\()_ycc_neon_consts
1815    vld1.16         {d0, d1, d2, d3}, [ip, :128]
1816
1817    /* Save ARM registers and handle input arguments */
1818    push            {r4, r5, r6, r7, r8, r9, r10, lr}
1819    ldr             NUM_ROWS, [sp, #(4 * 8)]
1820    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
1821    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
1822    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
1823    .unreq          OUTPUT_BUF
1824
1825    /* Save NEON registers */
1826    vpush           {d8-d15}
1827
1828    /* Outer loop over scanlines */
1829    cmp             NUM_ROWS, #1
1830    blt             9f
18310:
1832    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
1833    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
1834    mov             N, OUTPUT_WIDTH
1835    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
1836    add             OUTPUT_ROW, OUTPUT_ROW, #1
1837    ldr             RGB, [INPUT_BUF], #4
1838
1839    /* Inner loop over pixels */
1840    subs            N, N, #8
1841    blt             3f
1842    do_load         \bpp, 8
1843    do_rgb_to_yuv_stage1
1844    subs            N, N, #8
1845    blt             2f
18461:
1847    do_rgb_to_yuv_stage2_store_load_stage1
1848    subs            N, N, #8
1849    bge             1b
18502:
1851    do_rgb_to_yuv_stage2
1852    do_store        8
1853    tst             N, #7
1854    beq             8f
18553:
1856    tst             N, #4
1857    beq             3f
1858    do_load         \bpp, 4
18593:
1860    tst             N, #2
1861    beq             4f
1862    do_load         \bpp, 2
18634:
1864    tst             N, #1
1865    beq             5f
1866    do_load         \bpp, 1
18675:
1868    do_rgb_to_yuv
1869    tst             N, #4
1870    beq             6f
1871    do_store        4
18726:
1873    tst             N, #2
1874    beq             7f
1875    do_store        2
18767:
1877    tst             N, #1
1878    beq             8f
1879    do_store        1
18808:
1881    subs            NUM_ROWS, NUM_ROWS, #1
1882    bgt             0b
18839:
1884    /* Restore all registers and return */
1885    vpop            {d8-d15}
1886    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
1887
1888    .unreq          OUTPUT_WIDTH
1889    .unreq          OUTPUT_ROW
1890    .unreq          INPUT_BUF
1891    .unreq          NUM_ROWS
1892    .unreq          OUTPUT_BUF0
1893    .unreq          OUTPUT_BUF1
1894    .unreq          OUTPUT_BUF2
1895    .unreq          RGB
1896    .unreq          Y
1897    .unreq          U
1898    .unreq          V
1899    .unreq          N
1900
1901.purgem do_rgb_to_yuv
1902.purgem do_rgb_to_yuv_stage1
1903.purgem do_rgb_to_yuv_stage2
1904.purgem do_rgb_to_yuv_stage2_store_load_stage1
1905
1906.endm
1907
1908/*--------------------------------- id ----- bpp R  G  B */
1909generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
1910generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
1911generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
1912generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
1913generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
1914generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
1915
1916.purgem do_load
1917.purgem do_store
1918
1919
1920/*****************************************************************************/
1921
1922/*
1923 * Load data into workspace, applying unsigned->signed conversion
1924 *
1925 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
1926 *       rid of VST1.16 instructions
1927 */
1928
1929asm_function jsimd_convsamp_neon
1930    SAMPLE_DATA     .req r0
1931    START_COL       .req r1
1932    WORKSPACE       .req r2
1933    TMP1            .req r3
1934    TMP2            .req r4
1935    TMP3            .req r5
1936    TMP4            .req ip
1937
1938    push            {r4, r5}
1939    vmov.u8         d0, #128
1940
1941    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
1942    add             TMP1, TMP1, START_COL
1943    add             TMP2, TMP2, START_COL
1944    add             TMP3, TMP3, START_COL
1945    add             TMP4, TMP4, START_COL
1946    vld1.8          {d16}, [TMP1]
1947    vsubl.u8        q8, d16, d0
1948    vld1.8          {d18}, [TMP2]
1949    vsubl.u8        q9, d18, d0
1950    vld1.8          {d20}, [TMP3]
1951    vsubl.u8        q10, d20, d0
1952    vld1.8          {d22}, [TMP4]
1953    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
1954    vsubl.u8        q11, d22, d0
1955    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
1956    add             TMP1, TMP1, START_COL
1957    add             TMP2, TMP2, START_COL
1958    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
1959    add             TMP3, TMP3, START_COL
1960    add             TMP4, TMP4, START_COL
1961    vld1.8          {d24}, [TMP1]
1962    vsubl.u8        q12, d24, d0
1963    vld1.8          {d26}, [TMP2]
1964    vsubl.u8        q13, d26, d0
1965    vld1.8          {d28}, [TMP3]
1966    vsubl.u8        q14, d28, d0
1967    vld1.8          {d30}, [TMP4]
1968    vsubl.u8        q15, d30, d0
1969    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
1970    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
1971    pop             {r4, r5}
1972    bx              lr
1973
1974    .unreq          SAMPLE_DATA
1975    .unreq          START_COL
1976    .unreq          WORKSPACE
1977    .unreq          TMP1
1978    .unreq          TMP2
1979    .unreq          TMP3
1980    .unreq          TMP4
1981
1982
1983/*****************************************************************************/
1984
1985/*
1986 * jsimd_fdct_ifast_neon
1987 *
1988 * This function contains a fast, not so accurate integer implementation of
1989 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
1990 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
1991 * function from jfdctfst.c
1992 *
1993 * TODO: can be combined with 'jsimd_convsamp_neon' to get
1994 *       rid of a bunch of VLD1.16 instructions
1995 */
1996
1997#define XFIX_0_382683433 d0[0]
1998#define XFIX_0_541196100 d0[1]
1999#define XFIX_0_707106781 d0[2]
2000#define XFIX_1_306562965 d0[3]
2001
2002.balign 16
2003jsimd_fdct_ifast_neon_consts:
2004  .short (98 * 128)               /* XFIX_0_382683433 */
2005  .short (139 * 128)              /* XFIX_0_541196100 */
2006  .short (181 * 128)              /* XFIX_0_707106781 */
2007  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
2008
2009asm_function jsimd_fdct_ifast_neon
2010
2011    DATA            .req r0
2012    TMP             .req ip
2013
2014    vpush           {d8-d15}
2015
2016    /* Load constants */
2017    adr             TMP, jsimd_fdct_ifast_neon_consts
2018    vld1.16         {d0}, [TMP, :64]
2019
2020    /* Load all DATA into NEON registers with the following allocation:
2021     *       0 1 2 3 | 4 5 6 7
2022     *      ---------+--------
2023     *   0 | d16     | d17    | q8
2024     *   1 | d18     | d19    | q9
2025     *   2 | d20     | d21    | q10
2026     *   3 | d22     | d23    | q11
2027     *   4 | d24     | d25    | q12
2028     *   5 | d26     | d27    | q13
2029     *   6 | d28     | d29    | q14
2030     *   7 | d30     | d31    | q15
2031     */
2032
2033    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
2034    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
2035    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
2036    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
2037    sub             DATA, DATA, #(128 - 32)
2038
2039    mov             TMP, #2
20401:
2041    /* Transpose */
2042    vtrn.16         q12, q13
2043    vtrn.16         q10, q11
2044    vtrn.16         q8, q9
2045    vtrn.16         q14, q15
2046    vtrn.32         q9, q11
2047    vtrn.32         q13, q15
2048    vtrn.32         q8, q10
2049    vtrn.32         q12, q14
2050    vswp            d30, d23
2051    vswp            d24, d17
2052    vswp            d26, d19
2053      /* 1-D FDCT */
2054      vadd.s16        q2, q11, q12
2055    vswp            d28, d21
2056      vsub.s16        q12, q11, q12
2057      vsub.s16        q6, q10, q13
2058      vadd.s16        q10, q10, q13
2059      vsub.s16        q7, q9, q14
2060      vadd.s16        q9, q9, q14
2061      vsub.s16        q1, q8, q15
2062      vadd.s16        q8, q8, q15
2063      vsub.s16        q4, q9, q10
2064      vsub.s16        q5, q8, q2
2065      vadd.s16        q3, q9, q10
2066      vadd.s16        q4, q4, q5
2067      vadd.s16        q2, q8, q2
2068      vqdmulh.s16     q4, q4, XFIX_0_707106781
2069      vadd.s16        q11, q12, q6
2070      vadd.s16        q8, q2, q3
2071      vsub.s16        q12, q2, q3
2072      vadd.s16        q3, q6, q7
2073      vadd.s16        q7, q7, q1
2074      vqdmulh.s16     q3, q3, XFIX_0_707106781
2075      vsub.s16        q6, q11, q7
2076      vadd.s16        q10, q5, q4
2077      vqdmulh.s16     q6, q6, XFIX_0_382683433
2078      vsub.s16        q14, q5, q4
2079      vqdmulh.s16     q11, q11, XFIX_0_541196100
2080      vqdmulh.s16     q5, q7, XFIX_1_306562965
2081      vadd.s16        q4, q1, q3
2082      vsub.s16        q3, q1, q3
2083      vadd.s16        q7, q7, q6
2084      vadd.s16        q11, q11, q6
2085      vadd.s16        q7, q7, q5
2086      vadd.s16        q13, q3, q11
2087      vsub.s16        q11, q3, q11
2088      vadd.s16        q9, q4, q7
2089      vsub.s16        q15, q4, q7
2090    subs            TMP, TMP, #1
2091    bne             1b
2092
2093    /* store results */
2094    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
2095    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
2096    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
2097    vst1.16         {d28, d29, d30, d31}, [DATA, :128]
2098
2099    vpop            {d8-d15}
2100    bx              lr
2101
2102    .unreq          DATA
2103    .unreq          TMP
2104
2105
2106/*****************************************************************************/
2107
2108/*
2109 * GLOBAL(void)
2110 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
2111 *                      DCTELEM *workspace);
2112 *
2113 * Note: the code uses 2 stage pipelining in order to improve instructions
2114 *       scheduling and eliminate stalls (this provides ~15% better
2115 *       performance for this function on both ARM Cortex-A8 and
2116 *       ARM Cortex-A9 when compared to the non-pipelined variant).
2117 *       The instructions which belong to the second stage use different
2118 *       indentation for better readiability.
2119 */
2120asm_function jsimd_quantize_neon
2121
2122    COEF_BLOCK      .req r0
2123    DIVISORS        .req r1
2124    WORKSPACE       .req r2
2125
2126    RECIPROCAL      .req DIVISORS
2127    CORRECTION      .req r3
2128    SHIFT           .req ip
2129    LOOP_COUNT      .req r4
2130
2131    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
2132    vabs.s16        q12, q0
2133    add             CORRECTION, DIVISORS, #(64 * 2)
2134    add             SHIFT, DIVISORS, #(64 * 6)
2135    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
2136    vabs.s16        q13, q1
2137    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
2138    vadd.u16        q12, q12, q10  /* add correction */
2139    vadd.u16        q13, q13, q11
2140    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
2141    vmull.u16       q11, d25, d17
2142    vmull.u16       q8, d26, d18
2143    vmull.u16       q9, d27, d19
2144    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
2145    vshrn.u32       d20, q10, #16
2146    vshrn.u32       d21, q11, #16
2147    vshrn.u32       d22, q8, #16
2148    vshrn.u32       d23, q9, #16
2149    vneg.s16        q12, q12
2150    vneg.s16        q13, q13
2151    vshr.s16        q2, q0, #15    /* extract sign */
2152    vshr.s16        q3, q1, #15
2153    vshl.u16        q14, q10, q12  /* shift */
2154    vshl.u16        q15, q11, q13
2155
2156    push            {r4, r5}
2157    mov             LOOP_COUNT, #3
21581:
2159    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
2160      veor.u16        q14, q14, q2  /* restore sign */
2161    vabs.s16        q12, q0
2162    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
2163    vabs.s16        q13, q1
2164      veor.u16        q15, q15, q3
2165    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
2166    vadd.u16        q12, q12, q10  /* add correction */
2167    vadd.u16        q13, q13, q11
2168    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
2169    vmull.u16       q11, d25, d17
2170    vmull.u16       q8, d26, d18
2171    vmull.u16       q9, d27, d19
2172      vsub.u16        q14, q14, q2
2173    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
2174      vsub.u16        q15, q15, q3
2175    vshrn.u32       d20, q10, #16
2176    vshrn.u32       d21, q11, #16
2177      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
2178    vshrn.u32       d22, q8, #16
2179    vshrn.u32       d23, q9, #16
2180    vneg.s16        q12, q12
2181    vneg.s16        q13, q13
2182    vshr.s16        q2, q0, #15    /* extract sign */
2183    vshr.s16        q3, q1, #15
2184    vshl.u16        q14, q10, q12  /* shift */
2185    vshl.u16        q15, q11, q13
2186    subs            LOOP_COUNT, LOOP_COUNT, #1
2187    bne             1b
2188    pop             {r4, r5}
2189
2190      veor.u16        q14, q14, q2  /* restore sign */
2191      veor.u16        q15, q15, q3
2192      vsub.u16        q14, q14, q2
2193      vsub.u16        q15, q15, q3
2194      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
2195
2196    bx              lr  /* return */
2197
2198    .unreq          COEF_BLOCK
2199    .unreq          DIVISORS
2200    .unreq          WORKSPACE
2201    .unreq          RECIPROCAL
2202    .unreq          CORRECTION
2203    .unreq          SHIFT
2204    .unreq          LOOP_COUNT
2205
2206
2207/*****************************************************************************/
2208
2209/*
2210 * GLOBAL(void)
2211 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
2212 *                                 JDIMENSION downsampled_width,
2213 *                                 JSAMPARRAY input_data,
2214 *                                 JSAMPARRAY *output_data_ptr);
2215 *
2216 * Note: the use of unaligned writes is the main remaining bottleneck in
2217 *       this code, which can be potentially solved to get up to tens
2218 *       of percents performance improvement on Cortex-A8/Cortex-A9.
2219 */
2220
2221/*
2222 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
2223 * pixels are loaded to q0. The previous 16 source pixels are in q1. The
2224 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
2225 * Register d28 is used for multiplication by 3. Register q15 is used
2226 * for adding +1 bias.
2227 */
2228.macro upsample16 OUTPTR, INPTR
2229    vld1.8          {q0}, [\INPTR]!
2230    vmovl.u8        q8, d0
2231    vext.8          q2, q1, q0, #15
2232    vmovl.u8        q9, d1
2233    vaddw.u8        q10, q15, d4
2234    vaddw.u8        q11, q15, d5
2235    vmlal.u8        q8, d4, d28
2236    vmlal.u8        q9, d5, d28
2237    vmlal.u8        q10, d0, d28
2238    vmlal.u8        q11, d1, d28
2239    vmov            q1, q0        /* backup source pixels to q1 */
2240    vrshrn.u16      d6, q8, #2
2241    vrshrn.u16      d7, q9, #2
2242    vshrn.u16       d8, q10, #2
2243    vshrn.u16       d9, q11, #2
2244    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
2245.endm
2246
2247/*
2248 * Upsample 32 source pixels to 64 destination pixels. Compared to 'usample16'
2249 * macro, the roles of q0 and q1 registers are reversed for even and odd
2250 * groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed.
2251 * Also this unrolling allows to reorder loads and stores to compensate
2252 * multiplication latency and reduce stalls.
2253 */
2254.macro upsample32 OUTPTR, INPTR
2255    /* even 16 pixels group */
2256    vld1.8          {q0}, [\INPTR]!
2257    vmovl.u8        q8, d0
2258    vext.8          q2, q1, q0, #15
2259    vmovl.u8        q9, d1
2260    vaddw.u8        q10, q15, d4
2261    vaddw.u8        q11, q15, d5
2262    vmlal.u8        q8, d4, d28
2263    vmlal.u8        q9, d5, d28
2264    vmlal.u8        q10, d0, d28
2265    vmlal.u8        q11, d1, d28
2266      /* odd 16 pixels group */
2267      vld1.8          {q1}, [\INPTR]!
2268    vrshrn.u16      d6, q8, #2
2269    vrshrn.u16      d7, q9, #2
2270    vshrn.u16       d8, q10, #2
2271    vshrn.u16       d9, q11, #2
2272      vmovl.u8        q8, d2
2273      vext.8          q2, q0, q1, #15
2274      vmovl.u8        q9, d3
2275      vaddw.u8        q10, q15, d4
2276      vaddw.u8        q11, q15, d5
2277      vmlal.u8        q8, d4, d28
2278      vmlal.u8        q9, d5, d28
2279      vmlal.u8        q10, d2, d28
2280      vmlal.u8        q11, d3, d28
2281    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
2282      vrshrn.u16      d6, q8, #2
2283      vrshrn.u16      d7, q9, #2
2284      vshrn.u16       d8, q10, #2
2285      vshrn.u16       d9, q11, #2
2286      vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
2287.endm
2288
2289/*
2290 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
2291 */
2292.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
2293    /* special case for the first and last pixels */
2294    sub             \WIDTH, \WIDTH, #1
2295    add             \OUTPTR, \OUTPTR, #1
2296    ldrb            \TMP1, [\INPTR, \WIDTH]
2297    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
2298    ldrb            \TMP1, [\INPTR], #1
2299    strb            \TMP1, [\OUTPTR, #-1]
2300    vmov.8          d3[7], \TMP1
2301
2302    subs            \WIDTH, \WIDTH, #32
2303    blt             5f
23040:  /* process 32 pixels per iteration */
2305    upsample32      \OUTPTR, \INPTR
2306    subs            \WIDTH, \WIDTH, #32
2307    bge             0b
23085:
2309    adds            \WIDTH, \WIDTH, #16
2310    blt             1f
23110:  /* process 16 pixels if needed */
2312    upsample16      \OUTPTR, \INPTR
2313    subs            \WIDTH, \WIDTH, #16
23141:
2315    adds            \WIDTH, \WIDTH, #16
2316    beq             9f
2317
2318    /* load the remaining 1-15 pixels */
2319    add             \INPTR, \INPTR, \WIDTH
2320    tst             \WIDTH, #1
2321    beq             2f
2322    sub             \INPTR, \INPTR, #1
2323    vld1.8          {d0[0]}, [\INPTR]
23242:
2325    tst             \WIDTH, #2
2326    beq             2f
2327    vext.8          d0, d0, d0, #6
2328    sub             \INPTR, \INPTR, #1
2329    vld1.8          {d0[1]}, [\INPTR]
2330    sub             \INPTR, \INPTR, #1
2331    vld1.8          {d0[0]}, [\INPTR]
23322:
2333    tst             \WIDTH, #4
2334    beq             2f
2335    vrev64.32       d0, d0
2336    sub             \INPTR, \INPTR, #1
2337    vld1.8          {d0[3]}, [\INPTR]
2338    sub             \INPTR, \INPTR, #1
2339    vld1.8          {d0[2]}, [\INPTR]
2340    sub             \INPTR, \INPTR, #1
2341    vld1.8          {d0[1]}, [\INPTR]
2342    sub             \INPTR, \INPTR, #1
2343    vld1.8          {d0[0]}, [\INPTR]
23442:
2345    tst             \WIDTH, #8
2346    beq             2f
2347    vmov            d1, d0
2348    sub             \INPTR, \INPTR, #8
2349    vld1.8          {d0}, [\INPTR]
23502:  /* upsample the remaining pixels */
2351    vmovl.u8        q8, d0
2352    vext.8          q2, q1, q0, #15
2353    vmovl.u8        q9, d1
2354    vaddw.u8        q10, q15, d4
2355    vaddw.u8        q11, q15, d5
2356    vmlal.u8        q8, d4, d28
2357    vmlal.u8        q9, d5, d28
2358    vmlal.u8        q10, d0, d28
2359    vmlal.u8        q11, d1, d28
2360    vrshrn.u16      d10, q8, #2
2361    vrshrn.u16      d12, q9, #2
2362    vshrn.u16       d11, q10, #2
2363    vshrn.u16       d13, q11, #2
2364    vzip.8          d10, d11
2365    vzip.8          d12, d13
2366    /* store the remaining pixels */
2367    tst             \WIDTH, #8
2368    beq             2f
2369    vst1.8          {d10, d11}, [\OUTPTR]!
2370    vmov            q5, q6
23712:
2372    tst             \WIDTH, #4
2373    beq             2f
2374    vst1.8          {d10}, [\OUTPTR]!
2375    vmov            d10, d11
23762:
2377    tst             \WIDTH, #2
2378    beq             2f
2379    vst1.8          {d10[0]}, [\OUTPTR]!
2380    vst1.8          {d10[1]}, [\OUTPTR]!
2381    vst1.8          {d10[2]}, [\OUTPTR]!
2382    vst1.8          {d10[3]}, [\OUTPTR]!
2383    vext.8          d10, d10, d10, #4
23842:
2385    tst             \WIDTH, #1
2386    beq             2f
2387    vst1.8          {d10[0]}, [\OUTPTR]!
2388    vst1.8          {d10[1]}, [\OUTPTR]!
23892:
23909:
2391.endm
2392
2393asm_function jsimd_h2v1_fancy_upsample_neon
2394
2395    MAX_V_SAMP_FACTOR .req r0
2396    DOWNSAMPLED_WIDTH .req r1
2397    INPUT_DATA        .req r2
2398    OUTPUT_DATA_PTR   .req r3
2399    OUTPUT_DATA       .req OUTPUT_DATA_PTR
2400
2401    OUTPTR            .req r4
2402    INPTR             .req r5
2403    WIDTH             .req ip
2404    TMP               .req lr
2405
2406    push            {r4, r5, r6, lr}
2407    vpush           {d8-d15}
2408
2409    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
2410    cmp             MAX_V_SAMP_FACTOR, #0
2411    ble             99f
2412
2413    /* initialize constants */
2414    vmov.u8         d28, #3
2415    vmov.u16        q15, #1
241611:
2417    ldr             INPTR, [INPUT_DATA], #4
2418    ldr             OUTPTR, [OUTPUT_DATA], #4
2419    mov             WIDTH, DOWNSAMPLED_WIDTH
2420    upsample_row    OUTPTR, INPTR, WIDTH, TMP
2421    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
2422    bgt             11b
2423
242499:
2425    vpop            {d8-d15}
2426    pop             {r4, r5, r6, pc}
2427
2428    .unreq          MAX_V_SAMP_FACTOR
2429    .unreq          DOWNSAMPLED_WIDTH
2430    .unreq          INPUT_DATA
2431    .unreq          OUTPUT_DATA_PTR
2432    .unreq          OUTPUT_DATA
2433
2434    .unreq          OUTPTR
2435    .unreq          INPTR
2436    .unreq          WIDTH
2437    .unreq          TMP
2438
2439.purgem upsample16
2440.purgem upsample32
2441.purgem upsample_row
2442
2443
2444/*****************************************************************************/
2445
2446/*
2447 * GLOBAL(JOCTET*)
2448 * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
2449 *                              JCOEFPTR block, int last_dc_val,
2450 *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
2451 *
2452 */
2453
2454.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
2455    sub             \PUT_BITS, \PUT_BITS, #0x8
2456    lsr             \TMP, \PUT_BUFFER, \PUT_BITS
2457    uxtb            \TMP, \TMP
2458    strb            \TMP, [\BUFFER, #1]!
2459    cmp             \TMP, #0xff
2460    /*it eq*/
2461    strbeq          \ZERO, [\BUFFER, #1]!
2462.endm
2463
2464.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
2465    /*lsl             \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
2466    add             \PUT_BITS, \SIZE
2467    /*orr             \PUT_BUFFER, \PUT_BUFFER, \CODE*/
2468    orr             \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
2469.endm
2470
2471.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
2472  cmp               \PUT_BITS, #0x10
2473  blt               15f
2474    eor               \ZERO, \ZERO, \ZERO
2475    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
2476    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
247715:
2478.endm
2479
2480.balign 16
2481jsimd_huff_encode_one_block_neon_consts:
2482  .byte 0x01
2483  .byte 0x02
2484  .byte 0x04
2485  .byte 0x08
2486  .byte 0x10
2487  .byte 0x20
2488  .byte 0x40
2489  .byte 0x80
2490
2491asm_function jsimd_huff_encode_one_block_neon
2492    push            {r4, r5, r6, r7, r8, r9, r10, r11, lr}
2493    add             r7, sp, #0x1c
2494    sub             r4, sp, #0x40
2495    bfc             r4, #0, #5
2496    mov             sp, r4           /* align sp on 32 bytes */
2497    vst1.64         {d8, d9, d10, d11}, [r4, :128]!
2498    vst1.64         {d12, d13, d14, d15}, [r4, :128]
2499    sub             sp, #0x140       /* reserve 320 bytes */
2500    str             r0, [sp, #0x18]  /* working state > sp + Ox18 */
2501    add             r4, sp, #0x20    /* r4 = t1 */
2502    ldr             lr, [r7, #0x8]   /* lr = dctbl */
2503    sub             r10, r1, #0x1    /* r10=buffer-- */
2504    ldrsh           r1, [r2]
2505    mov             r9, #0x10
2506    mov             r8, #0x1
2507    adr             r5, jsimd_huff_encode_one_block_neon_consts
2508    /* prepare data */
2509    vld1.8          {d26}, [r5, :64]
2510    veor            q8, q8, q8
2511    veor            q9, q9, q9
2512    vdup.16         q14, r9
2513    vdup.16         q15, r8
2514    veor            q10, q10, q10
2515    veor            q11, q11, q11
2516    sub             r1, r1, r3
2517    add             r9, r2, #0x22
2518    add             r8, r2, #0x18
2519    add             r3, r2, #0x36
2520    vmov.16         d0[0], r1
2521    vld1.16         {d2[0]}, [r9, :16]
2522    vld1.16         {d4[0]}, [r8, :16]
2523    vld1.16         {d6[0]}, [r3, :16]
2524    add             r1, r2, #0x2
2525    add             r9, r2, #0x30
2526    add             r8, r2, #0x26
2527    add             r3, r2, #0x28
2528    vld1.16         {d0[1]}, [r1, :16]
2529    vld1.16         {d2[1]}, [r9, :16]
2530    vld1.16         {d4[1]}, [r8, :16]
2531    vld1.16         {d6[1]}, [r3, :16]
2532    add             r1, r2, #0x10
2533    add             r9, r2, #0x40
2534    add             r8, r2, #0x34
2535    add             r3, r2, #0x1a
2536    vld1.16         {d0[2]}, [r1, :16]
2537    vld1.16         {d2[2]}, [r9, :16]
2538    vld1.16         {d4[2]}, [r8, :16]
2539    vld1.16         {d6[2]}, [r3, :16]
2540    add             r1, r2, #0x20
2541    add             r9, r2, #0x32
2542    add             r8, r2, #0x42
2543    add             r3, r2, #0xc
2544    vld1.16         {d0[3]}, [r1, :16]
2545    vld1.16         {d2[3]}, [r9, :16]
2546    vld1.16         {d4[3]}, [r8, :16]
2547    vld1.16         {d6[3]}, [r3, :16]
2548    add             r1, r2, #0x12
2549    add             r9, r2, #0x24
2550    add             r8, r2, #0x50
2551    add             r3, r2, #0xe
2552    vld1.16         {d1[0]}, [r1, :16]
2553    vld1.16         {d3[0]}, [r9, :16]
2554    vld1.16         {d5[0]}, [r8, :16]
2555    vld1.16         {d7[0]}, [r3, :16]
2556    add             r1, r2, #0x4
2557    add             r9, r2, #0x16
2558    add             r8, r2, #0x60
2559    add             r3, r2, #0x1c
2560    vld1.16         {d1[1]}, [r1, :16]
2561    vld1.16         {d3[1]}, [r9, :16]
2562    vld1.16         {d5[1]}, [r8, :16]
2563    vld1.16         {d7[1]}, [r3, :16]
2564    add             r1, r2, #0x6
2565    add             r9, r2, #0x8
2566    add             r8, r2, #0x52
2567    add             r3, r2, #0x2a
2568    vld1.16         {d1[2]}, [r1, :16]
2569    vld1.16         {d3[2]}, [r9, :16]
2570    vld1.16         {d5[2]}, [r8, :16]
2571    vld1.16         {d7[2]}, [r3, :16]
2572    add             r1, r2, #0x14
2573    add             r9, r2, #0xa
2574    add             r8, r2, #0x44
2575    add             r3, r2, #0x38
2576    vld1.16         {d1[3]}, [r1, :16]
2577    vld1.16         {d3[3]}, [r9, :16]
2578    vld1.16         {d5[3]}, [r8, :16]
2579    vld1.16         {d7[3]}, [r3, :16]
2580    vcgt.s16        q8, q8, q0
2581    vcgt.s16        q9, q9, q1
2582    vcgt.s16        q10, q10, q2
2583    vcgt.s16        q11, q11, q3
2584    vabs.s16        q0, q0
2585    vabs.s16        q1, q1
2586    vabs.s16        q2, q2
2587    vabs.s16        q3, q3
2588    veor            q8, q8, q0
2589    veor            q9, q9, q1
2590    veor            q10, q10, q2
2591    veor            q11, q11, q3
2592    add             r9, r4, #0x20
2593    add             r8, r4, #0x80
2594    add             r3, r4, #0xa0
2595    vclz.i16        q0, q0
2596    vclz.i16        q1, q1
2597    vclz.i16        q2, q2
2598    vclz.i16        q3, q3
2599    vsub.i16        q0, q14, q0
2600    vsub.i16        q1, q14, q1
2601    vsub.i16        q2, q14, q2
2602    vsub.i16        q3, q14, q3
2603    vst1.16         {d0, d1, d2, d3}, [r4, :256]
2604    vst1.16         {d4, d5, d6, d7}, [r9, :256]
2605    vshl.s16        q0, q15, q0
2606    vshl.s16        q1, q15, q1
2607    vshl.s16        q2, q15, q2
2608    vshl.s16        q3, q15, q3
2609    vsub.i16        q0, q0, q15
2610    vsub.i16        q1, q1, q15
2611    vsub.i16        q2, q2, q15
2612    vsub.i16        q3, q3, q15
2613    vand            q8, q8, q0
2614    vand            q9, q9, q1
2615    vand            q10, q10, q2
2616    vand            q11, q11, q3
2617    vst1.16         {d16, d17, d18, d19}, [r8, :256]
2618    vst1.16         {d20, d21, d22, d23}, [r3, :256]
2619    add             r1, r2, #0x46
2620    add             r9, r2, #0x3a
2621    add             r8, r2, #0x74
2622    add             r3, r2, #0x6a
2623    vld1.16         {d8[0]}, [r1, :16]
2624    vld1.16         {d10[0]}, [r9, :16]
2625    vld1.16         {d12[0]}, [r8, :16]
2626    vld1.16         {d14[0]}, [r3, :16]
2627    veor            q8, q8, q8
2628    veor            q9, q9, q9
2629    veor            q10, q10, q10
2630    veor            q11, q11, q11
2631    add             r1, r2, #0x54
2632    add             r9, r2, #0x2c
2633    add             r8, r2, #0x76
2634    add             r3, r2, #0x78
2635    vld1.16         {d8[1]}, [r1, :16]
2636    vld1.16         {d10[1]}, [r9, :16]
2637    vld1.16         {d12[1]}, [r8, :16]
2638    vld1.16         {d14[1]}, [r3, :16]
2639    add             r1, r2, #0x62
2640    add             r9, r2, #0x1e
2641    add             r8, r2, #0x68
2642    add             r3, r2, #0x7a
2643    vld1.16         {d8[2]}, [r1, :16]
2644    vld1.16         {d10[2]}, [r9, :16]
2645    vld1.16         {d12[2]}, [r8, :16]
2646    vld1.16         {d14[2]}, [r3, :16]
2647    add             r1, r2, #0x70
2648    add             r9, r2, #0x2e
2649    add             r8, r2, #0x5a
2650    add             r3, r2, #0x6c
2651    vld1.16         {d8[3]}, [r1, :16]
2652    vld1.16         {d10[3]}, [r9, :16]
2653    vld1.16         {d12[3]}, [r8, :16]
2654    vld1.16         {d14[3]}, [r3, :16]
2655    add             r1, r2, #0x72
2656    add             r9, r2, #0x3c
2657    add             r8, r2, #0x4c
2658    add             r3, r2, #0x5e
2659    vld1.16         {d9[0]}, [r1, :16]
2660    vld1.16         {d11[0]}, [r9, :16]
2661    vld1.16         {d13[0]}, [r8, :16]
2662    vld1.16         {d15[0]}, [r3, :16]
2663    add             r1, r2, #0x64
2664    add             r9, r2, #0x4a
2665    add             r8, r2, #0x3e
2666    add             r3, r2, #0x6e
2667    vld1.16         {d9[1]}, [r1, :16]
2668    vld1.16         {d11[1]}, [r9, :16]
2669    vld1.16         {d13[1]}, [r8, :16]
2670    vld1.16         {d15[1]}, [r3, :16]
2671    add             r1, r2, #0x56
2672    add             r9, r2, #0x58
2673    add             r8, r2, #0x4e
2674    add             r3, r2, #0x7c
2675    vld1.16         {d9[2]}, [r1, :16]
2676    vld1.16         {d11[2]}, [r9, :16]
2677    vld1.16         {d13[2]}, [r8, :16]
2678    vld1.16         {d15[2]}, [r3, :16]
2679    add             r1, r2, #0x48
2680    add             r9, r2, #0x66
2681    add             r8, r2, #0x5c
2682    add             r3, r2, #0x7e
2683    vld1.16         {d9[3]}, [r1, :16]
2684    vld1.16         {d11[3]}, [r9, :16]
2685    vld1.16         {d13[3]}, [r8, :16]
2686    vld1.16         {d15[3]}, [r3, :16]
2687    vcgt.s16        q8, q8, q4
2688    vcgt.s16        q9, q9, q5
2689    vcgt.s16        q10, q10, q6
2690    vcgt.s16        q11, q11, q7
2691    vabs.s16        q4, q4
2692    vabs.s16        q5, q5
2693    vabs.s16        q6, q6
2694    vabs.s16        q7, q7
2695    veor            q8, q8, q4
2696    veor            q9, q9, q5
2697    veor            q10, q10, q6
2698    veor            q11, q11, q7
2699    add             r1, r4, #0x40
2700    add             r9, r4, #0x60
2701    add             r8, r4, #0xc0
2702    add             r3, r4, #0xe0
2703    vclz.i16        q4, q4
2704    vclz.i16        q5, q5
2705    vclz.i16        q6, q6
2706    vclz.i16        q7, q7
2707    vsub.i16        q4, q14, q4
2708    vsub.i16        q5, q14, q5
2709    vsub.i16        q6, q14, q6
2710    vsub.i16        q7, q14, q7
2711    vst1.16         {d8, d9, d10, d11}, [r1, :256]
2712    vst1.16         {d12, d13, d14, d15}, [r9, :256]
2713    vshl.s16        q4, q15, q4
2714    vshl.s16        q5, q15, q5
2715    vshl.s16        q6, q15, q6
2716    vshl.s16        q7, q15, q7
2717    vsub.i16        q4, q4, q15
2718    vsub.i16        q5, q5, q15
2719    vsub.i16        q6, q6, q15
2720    vsub.i16        q7, q7, q15
2721    vand            q8, q8, q4
2722    vand            q9, q9, q5
2723    vand            q10, q10, q6
2724    vand            q11, q11, q7
2725    vst1.16         {d16, d17, d18, d19}, [r8, :256]
2726    vst1.16         {d20, d21, d22, d23}, [r3, :256]
2727    ldr             r12, [r7, #0xc]       /* r12 = actbl */
2728    add             r1, lr, #0x400        /* r1 = dctbl->ehufsi */
2729    mov             r9, r12               /* r9 = actbl */
2730    add             r6, r4, #0x80         /* r6 = t2 */
2731    ldr             r11, [r0, #0x8]       /* r11 = put_buffer */
2732    ldr             r4, [r0, #0xc]        /* r4  = put_bits */
2733    ldrh            r2, [r6, #-128]       /* r2  = nbits */
2734    ldrh            r3, [r6]              /* r3  = temp2 & (((JLONG) 1)<<nbits) - 1; */
2735    ldr             r0, [lr, r2, lsl #2]
2736    ldrb            r5, [r1, r2]
2737    put_bits        r11, r4, r0, r5
2738    checkbuf15      r10, r11, r4, r5, r0
2739    put_bits        r11, r4, r3, r2
2740    checkbuf15      r10, r11, r4, r5, r0
2741    mov             lr, r6                /* lr = t2 */
2742    add             r5, r9, #0x400        /* r5 = actbl->ehufsi */
2743    ldrsb           r6, [r5, #0xf0]       /* r6 = actbl->ehufsi[0xf0] */
2744    veor            q8, q8, q8
2745    vceq.i16        q0, q0, q8
2746    vceq.i16        q1, q1, q8
2747    vceq.i16        q2, q2, q8
2748    vceq.i16        q3, q3, q8
2749    vceq.i16        q4, q4, q8
2750    vceq.i16        q5, q5, q8
2751    vceq.i16        q6, q6, q8
2752    vceq.i16        q7, q7, q8
2753    vmovn.i16       d0, q0
2754    vmovn.i16       d2, q1
2755    vmovn.i16       d4, q2
2756    vmovn.i16       d6, q3
2757    vmovn.i16       d8, q4
2758    vmovn.i16       d10, q5
2759    vmovn.i16       d12, q6
2760    vmovn.i16       d14, q7
2761    vand            d0, d0, d26
2762    vand            d2, d2, d26
2763    vand            d4, d4, d26
2764    vand            d6, d6, d26
2765    vand            d8, d8, d26
2766    vand            d10, d10, d26
2767    vand            d12, d12, d26
2768    vand            d14, d14, d26
2769    vpadd.i8        d0, d0, d2
2770    vpadd.i8        d4, d4, d6
2771    vpadd.i8        d8, d8, d10
2772    vpadd.i8        d12, d12, d14
2773    vpadd.i8        d0, d0, d4
2774    vpadd.i8        d8, d8, d12
2775    vpadd.i8        d0, d0, d8
2776    vmov.32         r1, d0[1]
2777    vmov.32         r8, d0[0]
2778    mvn             r1, r1
2779    mvn             r8, r8
2780    lsrs            r1, r1, #0x1
2781    rrx             r8, r8            /* shift in last r1 bit while shifting out DC bit */
2782    rbit            r1, r1            /* r1 = index1 */
2783    rbit            r8, r8            /* r8 = index0 */
2784    ldr             r0, [r9, #0x3c0]  /* r0 = actbl->ehufco[0xf0] */
2785    str             r1, [sp, #0x14]   /* index1 > sp + 0x14 */
2786    cmp             r8, #0x0
2787    beq             6f
27881:
2789    clz             r2, r8
2790    add             lr, lr, r2, lsl #1
2791    lsl             r8, r8, r2
2792    ldrh            r1, [lr, #-126]
27932:
2794    cmp             r2, #0x10
2795    blt             3f
2796    sub             r2, r2, #0x10
2797    put_bits        r11, r4, r0, r6
2798    cmp             r4, #0x10
2799    blt             2b
2800    eor             r3, r3, r3
2801    emit_byte       r10, r11, r4, r3, r12
2802    emit_byte       r10, r11, r4, r3, r12
2803    b               2b
28043:
2805    add             r2, r1, r2, lsl #4
2806    ldrh            r3, [lr, #2]!
2807    ldr             r12, [r9, r2, lsl #2]
2808    ldrb            r2, [r5, r2]
2809    put_bits        r11, r4, r12, r2
2810    checkbuf15      r10, r11, r4, r2, r12
2811    put_bits        r11, r4, r3, r1
2812    checkbuf15      r10, r11, r4, r2, r12
2813    lsls            r8, r8, #0x1
2814    bne             1b
28156:
2816    add             r12, sp, #0x20   /* r12 = t1 */
2817    ldr             r8, [sp, #0x14]  /* r8 = index1 */
2818    adds            r12, #0xc0       /* r12 = t2 + (DCTSIZE2/2) */
2819    cmp             r8, #0x0
2820    beq             6f
2821    clz             r2, r8
2822    sub             r12, r12, lr
2823    lsl             r8, r8, r2
2824    add             r2, r2, r12, lsr #1
2825    add             lr, lr, r2, lsl #1
2826    b               7f
28271:
2828    clz             r2, r8
2829    add             lr, lr, r2, lsl #1
2830    lsl             r8, r8, r2
28317:
2832    ldrh            r1, [lr, #-126]
28332:
2834    cmp             r2, #0x10
2835    blt             3f
2836    sub             r2, r2, #0x10
2837    put_bits        r11, r4, r0, r6
2838    cmp             r4, #0x10
2839    blt             2b
2840    eor             r3, r3, r3
2841    emit_byte       r10, r11, r4, r3, r12
2842    emit_byte       r10, r11, r4, r3, r12
2843    b               2b
28443:
2845    add             r2, r1, r2, lsl #4
2846    ldrh            r3, [lr, #2]!
2847    ldr             r12, [r9, r2, lsl #2]
2848    ldrb            r2, [r5, r2]
2849    put_bits        r11, r4, r12, r2
2850    checkbuf15      r10, r11, r4, r2, r12
2851    put_bits        r11, r4, r3, r1
2852    checkbuf15      r10, r11, r4, r2, r12
2853    lsls            r8, r8, #0x1
2854    bne             1b
28556:
2856    add             r0, sp, #0x20
2857    add             r0, #0xfe
2858    cmp             lr, r0
2859    bhs             1f
2860    ldr             r1, [r9]
2861    ldrb            r0, [r5]
2862    put_bits        r11, r4, r1, r0
2863    checkbuf15      r10, r11, r4, r0, r1
28641:
2865    ldr             r12, [sp, #0x18]
2866    str             r11, [r12, #0x8]
2867    str             r4, [r12, #0xc]
2868    add             r0, r10, #0x1
2869    add             r4, sp, #0x140
2870    vld1.64         {d8, d9, d10, d11}, [r4, :128]!
2871    vld1.64         {d12, d13, d14, d15}, [r4, :128]
2872    sub             r4, r7, #0x1c
2873    mov             sp, r4
2874    pop             {r4, r5, r6, r7, r8, r9, r10, r11, pc}
2875
2876.purgem emit_byte
2877.purgem put_bits
2878.purgem checkbuf15
2879