1/* 2 * ARMv8 NEON optimizations for libjpeg-turbo 3 * 4 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies). 5 * All Rights Reserved. 6 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com> 7 * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved. 8 * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org> 9 * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved. 10 * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved. 11 * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved. 12 * 13 * This software is provided 'as-is', without any express or implied 14 * warranty. In no event will the authors be held liable for any damages 15 * arising from the use of this software. 16 * 17 * Permission is granted to anyone to use this software for any purpose, 18 * including commercial applications, and to alter it and redistribute it 19 * freely, subject to the following restrictions: 20 * 21 * 1. The origin of this software must not be misrepresented; you must not 22 * claim that you wrote the original software. If you use this software 23 * in a product, an acknowledgment in the product documentation would be 24 * appreciated but is not required. 25 * 2. Altered source versions must be plainly marked as such, and must not be 26 * misrepresented as being the original software. 27 * 3. This notice may not be removed or altered from any source distribution. 
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

/* Read-only data section; the directive spelling differs per object format. */
#if defined(__APPLE__)
.section __DATA, __const
#elif defined(_WIN32)
.section .rdata
#else
.section .rodata, "a", %progbits
#endif

/* Constants for jsimd_huff_encode_one_block_neon() */

.balign 16
Ljsimd_huff_encode_one_block_neon_consts:
    /* Per-lane bit masks (0x01..0x80 twice); loaded into v23 and ANDed with
     * the cmeq results below to build the zero-coefficient bitmap. */
    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
    /* The remaining rows are TBL/TBX byte-index tables used to reorder the
     * 8x8 coefficient block into zig-zag order ("ZigZag 8x8" below).
     * 255 marks lanes a given TBL cannot reach (out-of-range index yields 0);
     * those lanes are filled in by a follow-up TBX with another table. */
    .byte 0, 1, 2, 3, 16, 17, 32, 33, \
          18, 19, 4, 5, 6, 7, 20, 21    /* L0 => L3 : 4 lines OK */
    .byte 34, 35, 48, 49, 255, 255, 50, 51, \
          36, 37, 22, 23, 8, 9, 10, 11  /* L0 => L3 : 4 lines OK */
    .byte 8, 9, 22, 23, 36, 37, 50, 51, \
          255, 255, 255, 255, 255, 255, 52, 53  /* L1 => L4 : 4 lines OK */
    .byte 54, 55, 40, 41, 26, 27, 12, 13, \
          14, 15, 28, 29, 42, 43, 56, 57  /* L0 => L3 : 4 lines OK */
    .byte 6, 7, 20, 21, 34, 35, 48, 49, \
          50, 51, 36, 37, 22, 23, 8, 9  /* L4 => L7 : 4 lines OK */
    .byte 42, 43, 28, 29, 14, 15, 30, 31, \
          44, 45, 58, 59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
    .byte 255, 255, 255, 255, 56, 57, 42, 43, \
          28, 29, 14, 15, 30, 31, 44, 45  /* L3 => L6 : 4 lines OK */
    .byte 26, 27, 40, 41, 42, 43, 28, 29, \
          14, 15, 30, 31, 44, 45, 46, 47  /* L5 => L7 : 3 lines OK */
    .byte 255, 255, 255, 255, 0, 1, 255, 255, \
          255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
    .byte 255, 255, 255, 255, 255, 255, 255, 255, \
          0, 1, 16, 17, 2, 3, 255, 255  /* L5 => L6 : 2 lines OK */
    .byte 255, 255, 255, 255, 255, 255, 255, 255, \
          255, 255, 255, 255, 8, 9, 22, 23  /* L5 => L6 : 2 lines OK */
    .byte 4, 5, 6, 7, 255, 255, 255, 255, \
          255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */

.text


#define RESPECT_STRICT_ALIGNMENT 1


/*****************************************************************************/

/* Supplementary macro for setting function
attributes */
/* Emit a function label with the right visibility/type attributes for the
 * target object format (Mach-O needs a leading underscore; ELF gets
 * .hidden/.type annotations). */
.macro asm_function fname
#ifdef __APPLE__
    .private_extern _\fname
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Get symbol location */
/* Materialize the PC-relative address of \symbol in \reg.
 * Mach-O uses @PAGE/@PAGEOFF relocations; ELF uses adrp + :lo12:. */
.macro get_symbol_loc reg, symbol
#ifdef __APPLE__
    adrp \reg, \symbol@PAGE
    add \reg, \reg, \symbol@PAGEOFF
#else
    adrp \reg, \symbol
    add \reg, \reg, :lo12:\symbol
#endif
.endm


#define CENTERJSAMPLE 128

/*****************************************************************************/

/*
 * GLOBAL(JOCTET *)
 * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
 *                             JCOEFPTR block, int last_dc_val,
 *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
 *
 */

    /* Register aliases used throughout the Huffman bit-emitter below. */
    BUFFER     .req x1                  /* output byte pointer (pre-increment) */
    PUT_BUFFER .req x6                  /* 64-bit bit accumulator */
    PUT_BITS   .req x7                  /* number of valid bits in PUT_BUFFER */
    PUT_BITSw  .req w7

/* Flush one byte from the top of the bit accumulator to BUFFER.
 * A 0xFF byte is followed by a zero stuff byte (JPEG marker escaping).
 * Clobbers x19.  (Macro begins here; the closing label/.endm follow.) */
.macro emit_byte
    sub PUT_BITS, PUT_BITS, #0x8
    lsr x19, PUT_BUFFER, PUT_BITS
    uxtb w19, w19
    strb w19, [BUFFER, #1]!
    cmp w19, #0xff
    b.ne 14f
    strb wzr, [BUFFER, #1]!
14:
.endm

/* Append CODE (SIZE bits) to the bit accumulator.
 * Caller must have ensured there is room (see checkbuf31/checkbuf47). */
.macro put_bits CODE, SIZE
    lsl PUT_BUFFER, PUT_BUFFER, \SIZE
    add PUT_BITS, PUT_BITS, \SIZE
    orr PUT_BUFFER, PUT_BUFFER, \CODE
.endm

/* Flush 4 bytes if the accumulator holds >= 32 bits (room for a <=31-bit put). */
.macro checkbuf31
    cmp PUT_BITS, #0x20
    b.lt 31f
    emit_byte
    emit_byte
    emit_byte
    emit_byte
31:
.endm

/* Flush 6 bytes if the accumulator holds >= 48 bits (room for a <=47-bit put). */
.macro checkbuf47
    cmp PUT_BITS, #0x30
    b.lt 47f
    emit_byte
    emit_byte
    emit_byte
    emit_byte
    emit_byte
    emit_byte
47:
.endm

/* Generate one Huffman-encode function body.
 * fast_tbl == 1: load coefficients with wide ld1 and zig-zag them via TBL/TBX.
 * fast_tbl == 0: scalar lane-by-lane loads at precomputed zig-zag offsets
 * (for cores where multi-table TBL is slow). */
.macro generate_jsimd_huff_encode_one_block fast_tbl

.balign 16
.if \fast_tbl == 1
asm_function jsimd_huff_encode_one_block_neon
.else
asm_function jsimd_huff_encode_one_block_neon_slowtbl
.endif
    sub sp, sp, 272                     /* 16 (saved regs) + 256 (t1/t2 arrays) */
    sub BUFFER, BUFFER, #0x1            /* BUFFER=buffer-- */
    /* Save ARM registers */
    stp x19, x20, [sp]
    get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts
    ldr PUT_BUFFER, [x0, #0x10]         /* state bit accumulator */
    ldr PUT_BITSw, [x0, #0x18]          /* state bit count */
    ldrsh w12, [x2]                     /* load DC coeff in w12 */
    /* prepare data */
.if \fast_tbl == 1
    ld1 {v23.16b}, [x15], #16
    ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
    ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
    ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
    ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
    ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
    sub w12, w12, w3                    /* last_dc_val, not used afterwards */
    /* ZigZag 8x8 */
    tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
    tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
    tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
    tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
    tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
    tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
    tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
    tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
    ins v0.h[0], w12                    /* lane 0 = DC diff */
    /* TBX patches the lanes the TBLs above left as zero (index 255). */
    tbx v1.16b, {v28.16b}, v16.16b
    tbx v2.16b, {v29.16b, v30.16b}, v17.16b
    tbx v5.16b, {v29.16b, v30.16b}, v18.16b
/* (continuation of generate_jsimd_huff_encode_one_block) */
    tbx v6.16b, {v31.16b}, v19.16b
.else
    /* Slow-table variant: gather the 64 coefficients lane by lane.
     * Each immediate is 2 * (zig-zag source index); loads are interleaved
     * with the address adds to hide latency.  Do not reorder. */
    add x13, x2, #0x22
    sub w12, w12, w3                    /* last_dc_val, not used afterwards */
    ld1 {v23.16b}, [x15]                /* bit-mask constants only */
    add x14, x2, #0x18
    add x3, x2, #0x36
    ins v0.h[0], w12                    /* lane 0 = DC diff */
    add x9, x2, #0x2
    ld1 {v1.h}[0], [x13]
    add x15, x2, #0x30
    ld1 {v2.h}[0], [x14]
    add x19, x2, #0x26
    ld1 {v3.h}[0], [x3]
    add x20, x2, #0x28
    ld1 {v0.h}[1], [x9]
    add x12, x2, #0x10
    ld1 {v1.h}[1], [x15]
    add x13, x2, #0x40
    ld1 {v2.h}[1], [x19]
    add x14, x2, #0x34
    ld1 {v3.h}[1], [x20]
    add x3, x2, #0x1a
    ld1 {v0.h}[2], [x12]
    add x9, x2, #0x20
    ld1 {v1.h}[2], [x13]
    add x15, x2, #0x32
    ld1 {v2.h}[2], [x14]
    add x19, x2, #0x42
    ld1 {v3.h}[2], [x3]
    add x20, x2, #0xc
    ld1 {v0.h}[3], [x9]
    add x12, x2, #0x12
    ld1 {v1.h}[3], [x15]
    add x13, x2, #0x24
    ld1 {v2.h}[3], [x19]
    add x14, x2, #0x50
    ld1 {v3.h}[3], [x20]
    add x3, x2, #0xe
    ld1 {v0.h}[4], [x12]
    add x9, x2, #0x4
    ld1 {v1.h}[4], [x13]
    add x15, x2, #0x16
    ld1 {v2.h}[4], [x14]
    add x19, x2, #0x60
    ld1 {v3.h}[4], [x3]
    add x20, x2, #0x1c
    ld1 {v0.h}[5], [x9]
    add x12, x2, #0x6
    ld1 {v1.h}[5], [x15]
    add x13, x2, #0x8
    ld1 {v2.h}[5], [x19]
    add x14, x2, #0x52
    ld1 {v3.h}[5], [x20]
    add x3, x2, #0x2a
    ld1 {v0.h}[6], [x12]
    add x9, x2, #0x14
    ld1 {v1.h}[6], [x13]
    add x15, x2, #0xa
    ld1 {v2.h}[6], [x14]
    add x19, x2, #0x44
    ld1 {v3.h}[6], [x3]
    add x20, x2, #0x38
    ld1 {v0.h}[7], [x9]
    add x12, x2, #0x46
    ld1 {v1.h}[7], [x15]
    add x13, x2, #0x3a
    ld1 {v2.h}[7], [x19]
    add x14, x2, #0x74
    ld1 {v3.h}[7], [x20]
    add x3, x2, #0x6a
    ld1 {v4.h}[0], [x12]
    add x9, x2, #0x54
    ld1 {v5.h}[0], [x13]
    add x15, x2, #0x2c
    ld1 {v6.h}[0], [x14]
    add x19, x2, #0x76
    ld1 {v7.h}[0], [x3]
    add x20, x2, #0x78
    ld1 {v4.h}[1], [x9]
    add x12, x2, #0x62
    ld1 {v5.h}[1], [x15]
    add x13, x2, #0x1e
    ld1 {v6.h}[1], [x19]
    add x14, x2, #0x68
    ld1 {v7.h}[1], [x20]
    add x3, x2, #0x7a
    ld1 {v4.h}[2], [x12]
    add x9, x2, #0x70
    ld1 {v5.h}[2], [x13]
    add x15, x2, #0x2e
    ld1 {v6.h}[2], [x14]
    add x19, x2, #0x5a
    ld1 {v7.h}[2], [x3]
    add x20, x2, #0x6c
    ld1 {v4.h}[3], [x9]
    add x12, x2, #0x72
    ld1 {v5.h}[3], [x15]
    add x13, x2, #0x3c
    ld1 {v6.h}[3], [x19]
    add x14, x2, #0x4c
    ld1 {v7.h}[3], [x20]
    add x3, x2, #0x5e
    ld1 {v4.h}[4], [x12]
    add x9, x2, #0x64
    ld1 {v5.h}[4], [x13]
    add x15, x2, #0x4a
    ld1 {v6.h}[4], [x14]
    add x19, x2, #0x3e
    ld1 {v7.h}[4], [x3]
    add x20, x2, #0x6e
    ld1 {v4.h}[5], [x9]
    add x12, x2, #0x56
    ld1 {v5.h}[5], [x15]
    add x13, x2, #0x58
    ld1 {v6.h}[5], [x19]
    add x14, x2, #0x4e
    ld1 {v7.h}[5], [x20]
    add x3, x2, #0x7c
    ld1 {v4.h}[6], [x12]
    add x9, x2, #0x48
    ld1 {v5.h}[6], [x13]
    add x15, x2, #0x66
    ld1 {v6.h}[6], [x14]
    add x19, x2, #0x5c
    ld1 {v7.h}[6], [x3]
    add x20, x2, #0x7e
    ld1 {v4.h}[7], [x9]
    ld1 {v5.h}[7], [x15]
    ld1 {v6.h}[7], [x19]
    ld1 {v7.h}[7], [x20]
.endif
    /* v24-v31 = all-ones where coefficient < 0 (sign masks). */
    cmlt v24.8h, v0.8h, #0
    cmlt v25.8h, v1.8h, #0
    cmlt v26.8h, v2.8h, #0
    cmlt v27.8h, v3.8h, #0
    cmlt v28.8h, v4.8h, #0
    cmlt v29.8h, v5.8h, #0
    cmlt v30.8h, v6.8h, #0
    cmlt v31.8h, v7.8h, #0
    /* v0-v7 = |coefficient| */
    abs v0.8h, v0.8h
    abs v1.8h, v1.8h
    abs v2.8h, v2.8h
    abs v3.8h, v3.8h
    abs v4.8h, v4.8h
    abs v5.8h, v5.8h
    abs v6.8h, v6.8h
    abs v7.8h, v7.8h
    /* v24-v31 = |coeff| XOR sign mask (|coeff| for >=0, ~|coeff| for <0). */
    eor v24.16b, v24.16b, v0.16b
    eor v25.16b, v25.16b, v1.16b
    eor v26.16b, v26.16b, v2.16b
    eor v27.16b, v27.16b, v3.16b
    eor v28.16b, v28.16b, v4.16b
    eor v29.16b, v29.16b, v5.16b
    eor v30.16b, v30.16b, v6.16b
    eor v31.16b, v31.16b, v7.16b
    /* v16-v21 = all-ones where coefficient == 0 (continues below with v22). */
    cmeq v16.8h, v0.8h, #0
    cmeq v17.8h, v1.8h, #0
    cmeq v18.8h, v2.8h, #0
    cmeq v19.8h, v3.8h, #0
    cmeq v20.8h, v4.8h, #0
    cmeq v21.8h, v5.8h, #0
/* (continuation: finish zero map, emit DC, start AC loop) */
    cmeq v22.8h, v6.8h, #0
    /* Narrow the 16-bit zero masks to bytes... */
    xtn v16.8b, v16.8h
    xtn v18.8b, v18.8h
    xtn v20.8b, v20.8h
    xtn v22.8b, v22.8h
    umov w14, v0.h[0]                   /* w14 = |DC diff| */
    xtn2 v16.16b, v17.8h
    umov w13, v24.h[0]                  /* w13 = DC diff, sign-encoded */
    xtn2 v18.16b, v19.8h
    clz w14, w14
    xtn2 v20.16b, v21.8h
    lsl w13, w13, w14                   /* strip bits above the value's width */
    cmeq v17.8h, v7.8h, #0
    sub w12, w14, #32                   /* w12 = -(nbits of DC diff) */
    xtn2 v22.16b, v17.8h
    lsr w13, w13, w14
    /* ...then AND with per-lane bit masks and pairwise-add down to one
     * 64-bit bitmap of zero coefficients. */
    and v16.16b, v16.16b, v23.16b
    neg w12, w12                        /* w12 = nbits */
    and v18.16b, v18.16b, v23.16b
    add x3, x4, #0x400                  /* r1 = dctbl->ehufsi */
    and v20.16b, v20.16b, v23.16b
    add x15, sp, #0x90                  /* x15 = t2 */
    and v22.16b, v22.16b, v23.16b
    ldr w10, [x4, x12, lsl #2]          /* DC Huffman code for nbits */
    addp v16.16b, v16.16b, v18.16b
    ldrb w11, [x3, x12]                 /* DC Huffman code size */
    addp v20.16b, v20.16b, v22.16b
    checkbuf47
    addp v16.16b, v16.16b, v20.16b
    put_bits x10, x11                   /* emit DC Huffman code */
    addp v16.16b, v16.16b, v18.16b
    checkbuf47
    umov x9, v16.D[0]
    put_bits x13, x12                   /* emit DC diff bits */
    cnt v17.8b, v16.8b
    mvn x9, x9                          /* bitmap of NONZERO coefficients */
    addv B18, v17.8b                    /* popcount = number of zero coeffs */
    add x4, x5, #0x400                  /* x4 = actbl->ehufsi */
    umov w12, v18.b[0]
    lsr x9, x9, #0x1                    /* clear AC coeff */
    ldr w13, [x5, #0x3c0]               /* x13 = actbl->ehufco[0xf0] */
    rbit x9, x9                         /* x9 = index0 */
    ldrb w14, [x4, #0xf0]               /* x14 = actbl->ehufsi[0xf0] */
    cmp w12, #(64-8)                    /* many zeros => sparse fast path */
    add x11, sp, #16
    b.lt 4f
    cbz x9, 6f                          /* no nonzero AC at all => EOB */
    /* Spill coefficients (t1) and sign-encoded values (t2) to the stack. */
    st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:  /* per-nonzero-coefficient loop: x2 = run length of zeros */
    clz x2, x9
    add x15, x15, x2, lsl #1
    lsl x9, x9, x2
    ldrh w20, [x15, #-126]              /* |coeff| from t1 */
2:  /* emit a 0xF0 (ZRL) code for each run of 16 zeros */
    cmp x2, #0x10
    b.lt 3f
    sub x2, x2, #0x10
    checkbuf47
    put_bits x13, x14
    b 2b
3:  /* compute nbits from |coeff| on the fly */
    clz w20, w20
    ldrh w3, [x15, #2]!                 /* sign-encoded value from t2 */
/* (continuation: finish fast-path AC emission; then dense path at 4:) */
    sub w11, w20, #32                   /* w11 = -(nbits) */
    lsl w3, w3, w20                     /* keep only the low nbits of value */
    neg w11, w11                        /* w11 = nbits */
    lsr w3, w3, w20
    add x2, x11, x2, lsl #4             /* x2 = (run << 4) | nbits = RS symbol */
    lsl x9, x9, #0x1                    /* consume this bitmap bit */
    ldr w12, [x5, x2, lsl #2]           /* actbl->ehufco[RS] */
    ldrb w10, [x4, x2]                  /* actbl->ehufsi[RS] */
    checkbuf31
    put_bits x12, x10                   /* emit AC Huffman code */
    put_bits x3, x11                    /* emit coefficient bits */
    cbnz x9, 1b
    b 6f
4:  /* Dense path: few zeros, so precompute nbits and masked values with NEON
     * before entering the emission loop. */
    movi v21.8h, #0x0010
    clz v0.8h, v0.8h
    clz v1.8h, v1.8h
    clz v2.8h, v2.8h
    clz v3.8h, v3.8h
    clz v4.8h, v4.8h
    clz v5.8h, v5.8h
    clz v6.8h, v6.8h
    clz v7.8h, v7.8h
    /* Shift the sign-encoded values up by the leading-zero count... */
    ushl v24.8h, v24.8h, v0.8h
    ushl v25.8h, v25.8h, v1.8h
    ushl v26.8h, v26.8h, v2.8h
    ushl v27.8h, v27.8h, v3.8h
    ushl v28.8h, v28.8h, v4.8h
    ushl v29.8h, v29.8h, v5.8h
    ushl v30.8h, v30.8h, v6.8h
    ushl v31.8h, v31.8h, v7.8h
    neg v0.8h, v0.8h
    neg v1.8h, v1.8h
    neg v2.8h, v2.8h
    neg v3.8h, v3.8h
    neg v4.8h, v4.8h
    neg v5.8h, v5.8h
    neg v6.8h, v6.8h
    neg v7.8h, v7.8h
    /* ...and back down, clearing the bits above each value's width. */
    ushl v24.8h, v24.8h, v0.8h
    ushl v25.8h, v25.8h, v1.8h
    ushl v26.8h, v26.8h, v2.8h
    ushl v27.8h, v27.8h, v3.8h
    ushl v28.8h, v28.8h, v4.8h
    ushl v29.8h, v29.8h, v5.8h
    ushl v30.8h, v30.8h, v6.8h
    ushl v31.8h, v31.8h, v7.8h
    /* v0-v7 = 16 - clz = nbits per coefficient. */
    add v0.8h, v21.8h, v0.8h
    add v1.8h, v21.8h, v1.8h
    add v2.8h, v21.8h, v2.8h
    add v3.8h, v21.8h, v3.8h
    add v4.8h, v21.8h, v4.8h
    add v5.8h, v21.8h, v5.8h
    add v6.8h, v21.8h, v6.8h
    add v7.8h, v21.8h, v7.8h
    /* Spill nbits (t1) and masked values (t2) to the stack. */
    st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:  /* per-nonzero-coefficient loop: x2 = run length of zeros */
    clz x2, x9
    add x15, x15, x2, lsl #1
    lsl x9, x9, x2
    ldrh w11, [x15, #-126]              /* precomputed nbits from t1 */
2:  /* emit a 0xF0 (ZRL) code for each run of 16 zeros */
    cmp x2, #0x10
    b.lt 3f
    sub x2, x2, #0x10
    checkbuf47
    put_bits x13, x14
    b 2b
3:
    ldrh w3, [x15, #2]!                 /* precomputed value bits from t2 */
/* (continuation: finish dense-path AC emission, EOB, epilogue) */
    add x2, x11, x2, lsl #4             /* x2 = (run << 4) | nbits = RS symbol */
    lsl x9, x9, #0x1                    /* consume this bitmap bit */
    ldr w12, [x5, x2, lsl #2]           /* actbl->ehufco[RS] */
    ldrb w10, [x4, x2]                  /* actbl->ehufsi[RS] */
    checkbuf31
    put_bits x12, x10                   /* emit AC Huffman code */
    put_bits x3, x11                    /* emit coefficient bits */
    cbnz x9, 1b
6:  /* Emit EOB unless the last nonzero coefficient was index 63. */
    add x13, sp, #0x10e
    cmp x15, x13
    b.hs 1f
    ldr w12, [x5]                       /* actbl->ehufco[0] = EOB code */
    ldrb w14, [x4]                      /* actbl->ehufsi[0] */
    checkbuf47
    put_bits x12, x14
1:  /* Store the bit-accumulator state back and return the buffer pointer. */
    str PUT_BUFFER, [x0, #0x10]
    str PUT_BITSw, [x0, #0x18]
    ldp x19, x20, [sp], 16
    add x0, BUFFER, #0x1
    add sp, sp, 256                     /* 16 + 256 = the 272 reserved above */
    ret                                 /* was "br x30"; ret is the same branch
                                         * but carries the subroutine-return
                                         * hint, keeping the CPU return-address
                                         * predictor in sync */

.endm

generate_jsimd_huff_encode_one_block 1
generate_jsimd_huff_encode_one_block 0

    .unreq BUFFER
    .unreq PUT_BUFFER
    .unreq PUT_BITS
    .unreq PUT_BITSw

.purgem emit_byte
.purgem put_bits
.purgem checkbuf31
.purgem checkbuf47