1/*
2 * ARMv8 NEON optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
5 *                          All Rights Reserved.
6 * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
7 * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
8 * Author:  Ragesh Radhakrishnan <ragesh.r@linaro.org>
9 * Copyright (C) 2014-2016, D. R. Commander.  All Rights Reserved.
10 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
11 * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
12 *
13 * This software is provided 'as-is', without any express or implied
14 * warranty.  In no event will the authors be held liable for any damages
15 * arising from the use of this software.
16 *
17 * Permission is granted to anyone to use this software for any purpose,
18 * including commercial applications, and to alter it and redistribute it
19 * freely, subject to the following restrictions:
20 *
21 * 1. The origin of this software must not be misrepresented; you must not
22 *    claim that you wrote the original software. If you use this software
23 *    in a product, an acknowledgment in the product documentation would be
24 *    appreciated but is not required.
25 * 2. Altered source versions must be plainly marked as such, and must not be
26 *    misrepresented as being the original software.
27 * 3. This notice may not be removed or altered from any source distribution.
28 */
29
/*
 * On Linux/ELF targets, emit an empty .note.GNU-stack section so that the
 * object is marked as not requiring an executable stack.
 */
#if defined(__ELF__) && defined(__linux__)
    .section .note.GNU-stack, "", %progbits
#endif
33
/* Select the read-only data section using the target's own section syntax. */
#if defined(__APPLE__)
.section __DATA, __const
#elif defined(_WIN32)
.section .rdata
#else
.section .rodata, "a", %progbits
#endif

/*
 * Constants for jsimd_huff_encode_one_block_neon()
 *
 * Thirteen 16-byte vectors, in the order in which the fast-table variant
 * loads them:
 *   vector 0       per-byte bit weights 0x01..0x80 (loaded into v23 by both
 *                  variants)
 *   vectors 1-8    TBL byte indices that build v0..v7
 *   vectors 9-12   TBX byte indices that patch v1, v2, v5 and v6
 *
 * A note of the form "Lm => Ln" names the 16-byte rows of the coefficient
 * block that make up the table registers of the corresponding lookup.  The
 * index 255 is out of range for every table used here: TBL writes zero for
 * it and TBX leaves the destination byte untouched.
 */
.balign 16
Ljsimd_huff_encode_one_block_neon_consts:
    /* bit weights */
    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
    /* v0 indices, L0 => L3 : 4 lines OK */
    .byte    0,   1,   2,   3,  16,  17,  32,  33
    .byte   18,  19,   4,   5,   6,   7,  20,  21
    /* v1 indices, L0 => L3 : 4 lines OK */
    .byte   34,  35,  48,  49, 255, 255,  50,  51
    .byte   36,  37,  22,  23,   8,   9,  10,  11
    /* v2 indices, L1 => L4 : 4 lines OK */
    .byte    8,   9,  22,  23,  36,  37,  50,  51
    .byte  255, 255, 255, 255, 255, 255,  52,  53
    /* v3 indices, L0 => L3 : 4 lines OK */
    .byte   54,  55,  40,  41,  26,  27,  12,  13
    .byte   14,  15,  28,  29,  42,  43,  56,  57
    /* v4 indices, L4 => L7 : 4 lines OK */
    .byte    6,   7,  20,  21,  34,  35,  48,  49
    .byte   50,  51,  36,  37,  22,  23,   8,   9
    /* v5 indices, L1 => L4 : 4 lines OK */
    .byte   42,  43,  28,  29,  14,  15,  30,  31
    .byte   44,  45,  58,  59, 255, 255, 255, 255
    /* v6 indices, L3 => L6 : 4 lines OK */
    .byte  255, 255, 255, 255,  56,  57,  42,  43
    .byte   28,  29,  14,  15,  30,  31,  44,  45
    /* v7 indices, L5 => L7 : 3 lines OK */
    .byte   26,  27,  40,  41,  42,  43,  28,  29
    .byte   14,  15,  30,  31,  44,  45,  46,  47
    /* v1 patch (TBX), L4 : 1 lines OK */
    .byte  255, 255, 255, 255,   0,   1, 255, 255
    .byte  255, 255, 255, 255, 255, 255, 255, 255
    /* v2 patch (TBX), L5 => L6 : 2 lines OK */
    .byte  255, 255, 255, 255, 255, 255, 255, 255
    .byte    0,   1,  16,  17,   2,   3, 255, 255
    /* v5 patch (TBX), L5 => L6 : 2 lines OK */
    .byte  255, 255, 255, 255, 255, 255, 255, 255
    .byte  255, 255, 255, 255,   8,   9,  22,  23
    /* v6 patch (TBX), L7 : 1 line OK */
    .byte    4,   5,   6,   7, 255, 255, 255, 255
    .byte  255, 255, 255, 255, 255, 255, 255, 255
72
/* Everything from here on is assembled into the code section. */
.text


/*
 * Preprocessor switch, defined to 1 here.
 * NOTE(review): nothing in the visible part of this file tests it -- confirm
 * whether any consumer remains before changing or removing it.
 */
#define RESPECT_STRICT_ALIGNMENT  1
77
78
79/*****************************************************************************/
80
/*
 * asm_function fname
 *
 * Opens a function: exports the symbol with hidden/private visibility where
 * the object format supports it, then defines the entry label.
 *   Apple targets : label is _fname, marked .private_extern and .globl
 *   ELF targets   : label is fname, marked .global, .hidden, %function
 *   anything else : label is fname, marked .global only
 */
.macro asm_function fname
#if defined(__APPLE__)
    .private_extern _\fname
    .globl _\fname
_\fname:
#elif defined(__ELF__)
    .global \fname
    .hidden \fname
    .type \fname, %function
\fname:
#else
    .global \fname
\fname:
#endif
.endm
96
/*
 * get_symbol_loc reg, symbol
 *
 * Loads the address of \symbol into \reg with a PC-relative ADRP (4 KiB page)
 * followed by an ADD of the low 12 bits.  Only the relocation spelling
 * differs between Apple assemblers and the others.
 */
.macro get_symbol_loc reg, symbol
#ifndef __APPLE__
    adrp            \reg, \symbol
    add             \reg, \reg, :lo12:\symbol
#else
    adrp            \reg, \symbol@PAGE
    add             \reg, \reg, \symbol@PAGEOFF
#endif
.endm
107
108
/*
 * NOTE(review): presumably the mid-point of the 8-bit sample range; nothing
 * in the visible part of this file references it -- confirm before removing.
 */
#define CENTERJSAMPLE  128
110
111/*****************************************************************************/
112
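/*
 * GLOBAL(JOCTET *)
 * jsimd_huff_encode_one_block_neon(working_state *state, JOCTET *buffer,
 *                                  JCOEFPTR block, int last_dc_val,
 *                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
 *
 * jsimd_huff_encode_one_block_neon_slowtbl() takes the same arguments; both
 * entry points are generated from the macro below.
 */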
120
121    BUFFER          .req x1
122    PUT_BUFFER      .req x6
123    PUT_BITS        .req x7
124    PUT_BITSw       .req w7
125
/*
 * emit_byte
 *
 * Removes the oldest 8 pending bits from PUT_BUFFER and appends them to the
 * output as one byte (BUFFER is pre-incremented).  When that byte is 0xFF a
 * zero byte is appended right after it.
 * Clobbers: x19, condition flags.
 */
.macro emit_byte
    sub             PUT_BITS, PUT_BITS, #8
    lsr             x19, PUT_BUFFER, PUT_BITS   /* oldest pending byte -> LSB */
    and             w19, w19, #0xff
    cmp             w19, #0xff                  /* STRB leaves flags intact   */
    strb            w19, [BUFFER, #1]!
    b.ne            14f
    strb            wzr, [BUFFER, #1]!          /* stuff 0x00 after 0xFF      */
14:
.endm
/*
 * put_bits CODE, SIZE
 *
 * Appends the low \SIZE bits held in register \CODE to PUT_BUFFER and adds
 * \SIZE to PUT_BITS.  \CODE must not have bits set above \SIZE, since it is
 * ORed in unmasked.  Neither operand may alias PUT_BUFFER or PUT_BITS.
 * Does not touch the condition flags.
 */
.macro put_bits CODE, SIZE
    lsl             PUT_BUFFER, PUT_BUFFER, \SIZE   /* make room at the LSB */
    orr             PUT_BUFFER, PUT_BUFFER, \CODE
    add             PUT_BITS, PUT_BITS, \SIZE
.endm
/*
 * checkbuf31
 *
 * If at least 32 bits are pending, writes four bytes out with emit_byte so
 * that fewer than 32 bits remain afterwards when entered with fewer than 64.
 * Clobbers whatever emit_byte clobbers (x19, flags).
 */
.macro checkbuf31
    cmp             PUT_BITS, #32
    b.lt            31f
    .rept 4
    emit_byte
    .endr
31:
.endm
/*
 * checkbuf47
 *
 * If at least 48 bits are pending, writes six bytes out with emit_byte.
 * Clobbers whatever emit_byte clobbers (x19, flags).
 */
.macro checkbuf47
    cmp             PUT_BITS, #48
    b.lt            47f
    .rept 6
    emit_byte
    .endr
47:
.endm
161
/*
 * generate_jsimd_huff_encode_one_block fast_tbl
 *
 * Emits one complete Huffman encoder for a single 8x8 coefficient block.
 *   fast_tbl == 1 : jsimd_huff_encode_one_block_neon, which reorders the
 *                   coefficients ("ZigZag 8x8") with TBL/TBX and the index
 *                   vectors in Ljsimd_huff_encode_one_block_neon_consts.
 *   otherwise     : jsimd_huff_encode_one_block_neon_slowtbl, which gathers
 *                   the same 64 halfwords with one single-lane LD1 each.
 *
 * Registers on entry, as used by the body:
 *   x0 = state        64-bit put buffer at [x0, #0x10],
 *                     32-bit pending-bit count at [x0, #0x18]
 *   x1 = buffer       output pointer (alias BUFFER)
 *   x2 = block        64 signed 16-bit coefficients
 *   w3 = last_dc_val
 *   x4 = dctbl        32-bit codes at x4 + 4*n, byte sizes at x4 + 0x400 + n
 *   x5 = actbl        same layout as dctbl
 * Result: x0 = address just past the last byte written; the put buffer and
 * bit count are stored back into *state.
 *
 * Stack frame (272 bytes):
 *   [sp, #0x00]  saved x19/x20 (x19 is used by emit_byte, both by the
 *                slow-table gather)
 *   [sp, #0x10]  64 halfwords, first work array
 *   [sp, #0x90]  64 halfwords, second work array ("t2")
 */
.macro generate_jsimd_huff_encode_one_block fast_tbl

.balign 16
.if \fast_tbl == 1
asm_function jsimd_huff_encode_one_block_neon
.else
asm_function jsimd_huff_encode_one_block_neon_slowtbl
.endif
    sub             sp, sp, 272
    sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
    /* Save ARM registers */
    stp             x19, x20, [sp]
    get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_consts
    ldr             PUT_BUFFER, [x0, #0x10]
    ldr             PUT_BITSw, [x0, #0x18]
    ldrsh           w12, [x2]               /* load DC coeff in w12 */
    /* prepare data */
.if \fast_tbl == 1
    ld1             {v23.16b}, [x15], #16
    ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
    ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
    ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
    ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
    ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
    sub             w12, w12, w3      /* last_dc_val, not used afterwards */
    /* ZigZag 8x8 */
    tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
    tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
    tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
    tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
    tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
    tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
    tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
    tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
    ins             v0.h[0], w12      /* lane 0 = DC difference */
    /* TBX fills the lanes whose source row was outside the TBL tables */
    tbx             v1.16b, {v28.16b}, v16.16b
    tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
    tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
    tbx             v6.16b, {v31.16b}, v19.16b
.else
    /*
     * Gather the 64 coefficients one lane at a time.  Each address is
     * computed a few instructions ahead of the load that consumes it, and
     * every destination vector alternates between a fixed pair of address
     * registers:
     *   v0/v4: x12 (even lanes), x9  (odd lanes)
     *   v1/v5: x13 (even lanes), x15 (odd lanes)
     *   v2/v6: x14 (even lanes), x19 (odd lanes)
     *   v3/v7: x3  (even lanes), x20 (odd lanes)
     * (v0 lane 0 is the DC difference and is inserted from w12.)
     */
      add             x13, x2, #0x22
      sub             w12, w12, w3    /* last_dc_val, not used afterwards */
    ld1             {v23.16b}, [x15]
      add             x14, x2, #0x18
      add             x3, x2, #0x36
    ins             v0.h[0], w12
      add             x9, x2, #0x2
    ld1             {v1.h}[0], [x13]
      add             x15, x2, #0x30
    ld1             {v2.h}[0], [x14]
      add             x19, x2, #0x26
    ld1             {v3.h}[0], [x3]
      add             x20, x2, #0x28
    ld1             {v0.h}[1], [x9]
      add             x12, x2, #0x10
    ld1             {v1.h}[1], [x15]
      add             x13, x2, #0x40
    ld1             {v2.h}[1], [x19]
      add             x14, x2, #0x34
    ld1             {v3.h}[1], [x20]
      add             x3, x2, #0x1a
    ld1             {v0.h}[2], [x12]
      add             x9, x2, #0x20
    ld1             {v1.h}[2], [x13]
      add             x15, x2, #0x32
    ld1             {v2.h}[2], [x14]
      add             x19, x2, #0x42
    ld1             {v3.h}[2], [x3]
      add             x20, x2, #0xc
    ld1             {v0.h}[3], [x9]
      add             x12, x2, #0x12
    ld1             {v1.h}[3], [x15]
      add             x13, x2, #0x24
    ld1             {v2.h}[3], [x19]
      add             x14, x2, #0x50
    ld1             {v3.h}[3], [x20]
      add             x3, x2, #0xe
    ld1             {v0.h}[4], [x12]
      add             x9, x2, #0x4
    ld1             {v1.h}[4], [x13]
      add             x15, x2, #0x16
    ld1             {v2.h}[4], [x14]
      add             x19, x2, #0x60
    ld1             {v3.h}[4], [x3]
      add             x20, x2, #0x1c
    ld1             {v0.h}[5], [x9]
      add             x12, x2, #0x6
    ld1             {v1.h}[5], [x15]
      add             x13, x2, #0x8
    ld1             {v2.h}[5], [x19]
      add             x14, x2, #0x52
    ld1             {v3.h}[5], [x20]
      add             x3, x2, #0x2a
    ld1             {v0.h}[6], [x12]
      add             x9, x2, #0x14
    ld1             {v1.h}[6], [x13]
      add             x15, x2, #0xa
    ld1             {v2.h}[6], [x14]
      add             x19, x2, #0x44
    ld1             {v3.h}[6], [x3]
      add             x20, x2, #0x38
    ld1             {v0.h}[7], [x9]
      add             x12, x2, #0x46
    ld1             {v1.h}[7], [x15]
      add             x13, x2, #0x3a
    ld1             {v2.h}[7], [x19]
      add             x14, x2, #0x74
    ld1             {v3.h}[7], [x20]
      add             x3, x2, #0x6a
    ld1             {v4.h}[0], [x12]
      add             x9, x2, #0x54
    ld1             {v5.h}[0], [x13]
      add             x15, x2, #0x2c
    ld1             {v6.h}[0], [x14]
      add             x19, x2, #0x76
    ld1             {v7.h}[0], [x3]
      add             x20, x2, #0x78
    ld1             {v4.h}[1], [x9]
      add             x12, x2, #0x62
    ld1             {v5.h}[1], [x15]
      add             x13, x2, #0x1e
    ld1             {v6.h}[1], [x19]
      add             x14, x2, #0x68
    ld1             {v7.h}[1], [x20]
      add             x3, x2, #0x7a
    ld1             {v4.h}[2], [x12]
      add             x9, x2, #0x70
    ld1             {v5.h}[2], [x13]
      add             x15, x2, #0x2e
    ld1             {v6.h}[2], [x14]
      add             x19, x2, #0x5a
    ld1             {v7.h}[2], [x3]
      add             x20, x2, #0x6c
    ld1             {v4.h}[3], [x9]
      add             x12, x2, #0x72
    ld1             {v5.h}[3], [x15]
      add             x13, x2, #0x3c
    ld1             {v6.h}[3], [x19]
      add             x14, x2, #0x4c
    ld1             {v7.h}[3], [x20]
      add             x3, x2, #0x5e
    ld1             {v4.h}[4], [x12]
      add             x9, x2, #0x64
    ld1             {v5.h}[4], [x13]
      add             x15, x2, #0x4a
    ld1             {v6.h}[4], [x14]
      add             x19, x2, #0x3e
    ld1             {v7.h}[4], [x3]
      add             x20, x2, #0x6e
    ld1             {v4.h}[5], [x9]
      add             x12, x2, #0x56
    ld1             {v5.h}[5], [x15]        /* x15 = block + 0x4a, set above */
      add             x13, x2, #0x58
    ld1             {v6.h}[5], [x19]
      add             x14, x2, #0x4e
    ld1             {v7.h}[5], [x20]
      add             x3, x2, #0x7c
    ld1             {v4.h}[6], [x12]
      add             x9, x2, #0x48
    ld1             {v5.h}[6], [x13]
      add             x15, x2, #0x66
    ld1             {v6.h}[6], [x14]
      add             x19, x2, #0x5c
    ld1             {v7.h}[6], [x3]
      add             x20, x2, #0x7e
    ld1             {v4.h}[7], [x9]
    ld1             {v5.h}[7], [x15]
    ld1             {v6.h}[7], [x19]
    ld1             {v7.h}[7], [x20]
.endif
    /*
     * v0-v7   := absolute values
     * v24-v31 := absolute value XOR sign mask (bitwise complement of the
     *            magnitude for negative inputs, the value itself otherwise)
     */
    cmlt            v24.8h, v0.8h, #0
    cmlt            v25.8h, v1.8h, #0
    cmlt            v26.8h, v2.8h, #0
    cmlt            v27.8h, v3.8h, #0
    cmlt            v28.8h, v4.8h, #0
    cmlt            v29.8h, v5.8h, #0
    cmlt            v30.8h, v6.8h, #0
    cmlt            v31.8h, v7.8h, #0
    abs             v0.8h, v0.8h
    abs             v1.8h, v1.8h
    abs             v2.8h, v2.8h
    abs             v3.8h, v3.8h
    abs             v4.8h, v4.8h
    abs             v5.8h, v5.8h
    abs             v6.8h, v6.8h
    abs             v7.8h, v7.8h
    eor             v24.16b, v24.16b, v0.16b
    eor             v25.16b, v25.16b, v1.16b
    eor             v26.16b, v26.16b, v2.16b
    eor             v27.16b, v27.16b, v3.16b
    eor             v28.16b, v28.16b, v4.16b
    eor             v29.16b, v29.16b, v5.16b
    eor             v30.16b, v30.16b, v6.16b
    eor             v31.16b, v31.16b, v7.16b
    /*
     * Build a 64-bit map with bit i set when coefficient i is zero: compare
     * with zero, narrow to one byte per coefficient, keep one weight bit per
     * byte (v23) and fold with pairwise adds.  The indented instructions
     * interleaved below encode the DC difference with dctbl.
     */
    cmeq            v16.8h, v0.8h, #0
    cmeq            v17.8h, v1.8h, #0
    cmeq            v18.8h, v2.8h, #0
    cmeq            v19.8h, v3.8h, #0
    cmeq            v20.8h, v4.8h, #0
    cmeq            v21.8h, v5.8h, #0
    cmeq            v22.8h, v6.8h, #0
    xtn             v16.8b, v16.8h
    xtn             v18.8b, v18.8h
    xtn             v20.8b, v20.8h
    xtn             v22.8b, v22.8h
      umov            w14, v0.h[0]             /* |DC difference| */
    xtn2            v16.16b, v17.8h
      umov            w13, v24.h[0]            /* DC bits before masking */
    xtn2            v18.16b, v19.8h
      clz             w14, w14
    xtn2            v20.16b, v21.8h
      lsl             w13, w13, w14            /* shift up then down ...  */
    cmeq            v17.8h, v7.8h, #0
      sub             w12, w14, #32
    xtn2            v22.16b, v17.8h
      lsr             w13, w13, w14            /* ... keeps the low nbits */
    and             v16.16b, v16.16b, v23.16b
      neg             w12, w12                 /* w12 = nbits = 32 - clz  */
    and             v18.16b, v18.16b, v23.16b
      add             x3, x4, #0x400           /* x3 = dctbl->ehufsi */
    and             v20.16b, v20.16b, v23.16b
      add             x15, sp, #0x90           /* x15 = t2 */
    and             v22.16b, v22.16b, v23.16b
      ldr             w10, [x4, x12, lsl #2]   /* DC code for nbits */
    addp            v16.16b, v16.16b, v18.16b
      ldrb            w11, [x3, x12]           /* DC code size */
    addp            v20.16b, v20.16b, v22.16b
      checkbuf47
    addp            v16.16b, v16.16b, v20.16b
      put_bits        x10, x11
    addp            v16.16b, v16.16b, v18.16b
      checkbuf47
    umov            x9, v16.D[0]               /* bit i set: coef i == 0 */
      put_bits        x13, x12
    cnt             v17.8b, v16.8b
      mvn             x9, x9                   /* bit i set: coef i != 0 */
    addv            B18, v17.8b
      add             x4, x5, #0x400   /* x4 = actbl->ehufsi */
    umov            w12, v18.b[0]              /* w12 = number of zero coefs */
      lsr             x9, x9, #0x1     /* drop the DC bit */
    ldr             w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
    rbit            x9, x9             /* x9 = index0 */
    ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
    /*
     * After RBIT the first AC coefficient sits in bit 63, so CLZ of x9 is the
     * length of the zero run before the next non-zero coefficient.
     * Fewer than 56 zeros: take the path at 4: that computes all bit counts
     * with NEON.  Otherwise fall through and do it per coefficient.
     */
    cmp             w12, #(64-8)
    add             x11, sp, #16
    b.lt            4f
    cbz             x9, 6f                     /* no non-zero AC coefficient */
    /* first array = absolute values, t2 = sign-adjusted values */
    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:
    clz             x2, x9                     /* x2 = zero-run length */
    add             x15, x15, x2, lsl #1       /* skip the run in t2 */
    lsl             x9, x9, x2
    ldrh            w20, [x15, #-126]          /* |coef| from the first array */
2:
    /* runs of 16 or more: emit the actbl[0xf0] code once per 16 zeros */
    cmp             x2, #0x10
    b.lt            3f
    sub             x2, x2, #0x10
    checkbuf47
    put_bits        x13, x14
    b               2b
3:
    clz             w20, w20
    ldrh            w3, [x15, #2]!             /* matching t2 entry */
    sub             w11, w20, #32
    lsl             w3, w3, w20
    neg             w11, w11                   /* w11 = nbits */
    lsr             w3, w3, w20                /* keep the low nbits */
    add             x2, x11, x2, lsl #4        /* symbol = (run << 4) + nbits */
    lsl             x9, x9, #0x1               /* consume this coefficient */
    ldr             w12, [x5, x2, lsl #2]
    ldrb            w10, [x4, x2]
    checkbuf31
    put_bits        x12, x10
    put_bits        x3, x11
    cbnz            x9, 1b
    b               6f
4:
    /*
     * Dense block: per lane, nbits = 16 - clz(|coef|), and the sign-adjusted
     * value is shifted left then right by clz to keep only its low nbits.
     */
    movi            v21.8h, #0x0010
    clz             v0.8h, v0.8h
    clz             v1.8h, v1.8h
    clz             v2.8h, v2.8h
    clz             v3.8h, v3.8h
    clz             v4.8h, v4.8h
    clz             v5.8h, v5.8h
    clz             v6.8h, v6.8h
    clz             v7.8h, v7.8h
    ushl            v24.8h, v24.8h, v0.8h
    ushl            v25.8h, v25.8h, v1.8h
    ushl            v26.8h, v26.8h, v2.8h
    ushl            v27.8h, v27.8h, v3.8h
    ushl            v28.8h, v28.8h, v4.8h
    ushl            v29.8h, v29.8h, v5.8h
    ushl            v30.8h, v30.8h, v6.8h
    ushl            v31.8h, v31.8h, v7.8h
    neg             v0.8h, v0.8h
    neg             v1.8h, v1.8h
    neg             v2.8h, v2.8h
    neg             v3.8h, v3.8h
    neg             v4.8h, v4.8h
    neg             v5.8h, v5.8h
    neg             v6.8h, v6.8h
    neg             v7.8h, v7.8h
    ushl            v24.8h, v24.8h, v0.8h
    ushl            v25.8h, v25.8h, v1.8h
    ushl            v26.8h, v26.8h, v2.8h
    ushl            v27.8h, v27.8h, v3.8h
    ushl            v28.8h, v28.8h, v4.8h
    ushl            v29.8h, v29.8h, v5.8h
    ushl            v30.8h, v30.8h, v6.8h
    ushl            v31.8h, v31.8h, v7.8h
    add             v0.8h, v21.8h, v0.8h
    add             v1.8h, v21.8h, v1.8h
    add             v2.8h, v21.8h, v2.8h
    add             v3.8h, v21.8h, v3.8h
    add             v4.8h, v21.8h, v4.8h
    add             v5.8h, v21.8h, v5.8h
    add             v6.8h, v21.8h, v6.8h
    add             v7.8h, v21.8h, v7.8h
    /* first array = nbits, t2 = masked bits */
    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:
    clz             x2, x9                     /* x2 = zero-run length */
    add             x15, x15, x2, lsl #1
    lsl             x9, x9, x2
    ldrh            w11, [x15, #-126]          /* nbits from the first array */
2:
    cmp             x2, #0x10
    b.lt            3f
    sub             x2, x2, #0x10
    checkbuf47
    put_bits        x13, x14
    b               2b
3:
    ldrh            w3, [x15, #2]!             /* already-masked bits */
    add             x2, x11, x2, lsl #4        /* symbol = (run << 4) + nbits */
    lsl             x9, x9, #0x1
    ldr             w12, [x5, x2, lsl #2]
    ldrb            w10, [x4, x2]
    checkbuf31
    put_bits        x12, x10
    put_bits        x3, x11
    cbnz            x9, 1b
6:
    /*
     * x15 reaches sp + 0x10e (the last t2 entry) only when coefficient 63
     * was encoded; otherwise trailing zeros remain and the actbl entry 0
     * code is emitted.
     */
    add             x13, sp, #0x10e
    cmp             x15, x13
    b.hs            1f
    ldr             w12, [x5]
    ldrb            w14, [x4]
    checkbuf47
    put_bits        x12, x14
1:
    str             PUT_BUFFER, [x0, #0x10]
    str             PUT_BITSw, [x0, #0x18]
    ldp             x19, x20, [sp], 16
    add             x0, BUFFER, #0x1           /* undo the entry pre-decrement */
    add             sp, sp, 256
    ret                                        /* return through x30 */

.endm
526
/* Instantiate the TBL/TBX variant first, then the lane-by-lane variant. */
generate_jsimd_huff_encode_one_block 1
generate_jsimd_huff_encode_one_block 0

    /* Release the register aliases used by the encoder. */
    .unreq          PUT_BITSw
    .unreq          PUT_BITS
    .unreq          PUT_BUFFER
    .unreq          BUFFER

/* Remove the bit-writer helper macros now that both variants are emitted. */
.purgem checkbuf47
.purgem checkbuf31
.purgem put_bits
.purgem emit_byte
539