/*
 * Armv8 Neon optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
 *                          All Rights Reserved.
 * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
 * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
 * Author:  Ragesh Radhakrishnan <ragesh.r@linaro.org>
 * Copyright (C) 2014-2016, 2020, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
 * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software.  If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

#if defined(__APPLE__)
.section __DATA, __const
#elif defined(_WIN32)
.section .rdata
#else
.section .rodata, "a", %progbits
#endif

/* Constants for jsimd_idct_islow_neon() */

#define F_0_298   2446  /* FIX(0.298631336) */
#define F_0_390   3196  /* FIX(0.390180644) */
#define F_0_541   4433  /* FIX(0.541196100) */
#define F_0_765   6270  /* FIX(0.765366865) */
#define F_0_899   7373  /* FIX(0.899976223) */
#define F_1_175   9633  /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

.balign 16
Ljsimd_idct_islow_neon_consts:
  .short F_0_298
  .short -F_0_390
  .short F_0_541
  .short F_0_765
  .short -F_0_899
  .short F_1_175
  .short F_1_501
  .short -F_1_847
  .short -F_1_961
  .short F_2_053
  .short -F_2_562
  .short F_3_072
  .short 0  /* padding */
  .short 0
  .short 0
  .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072
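
/*
 * Illustrative note (a sketch, not part of the original build): the FIX()
 * values above follow libjpeg's fixed-point convention, i.e. signed Q13
 * constants, round(x * 2^CONST_BITS) with CONST_BITS = 13:
 *
 *   #define CONST_BITS  13
 *   #define FIX(x)  ((int32_t)((x) * (1 << CONST_BITS) + 0.5))
 *
 *   FIX(0.298631336) == 2446    FIX(1.847759065) == 15137
 *   FIX(0.541196100) == 4433    FIX(3.072711026) == 25172
 */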

/* Constants for jsimd_idct_ifast_neon() */

.balign 16
Ljsimd_idct_ifast_neon_consts:
  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */

/* Constants for jsimd_idct_4x4_neon() and jsimd_idct_2x2_neon() */

#define CONST_BITS  13

#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */

.balign 16
Ljsimd_idct_4x4_neon_consts:
  .short FIX_1_847759065        /* v0.h[0] */
  .short -FIX_0_765366865       /* v0.h[1] */
  .short -FIX_0_211164243       /* v0.h[2] */
  .short FIX_1_451774981        /* v0.h[3] */
  .short -FIX_2_172734803       /* v1.h[0] */
  .short FIX_1_061594337        /* v1.h[1] */
  .short -FIX_0_509795579       /* v1.h[2] */
  .short -FIX_0_601344887       /* v1.h[3] */
  .short FIX_0_899976223        /* v2.h[0] */
  .short FIX_2_562915447        /* v2.h[1] */
  .short 1 << (CONST_BITS + 1)  /* v2.h[2] */
  .short 0                      /* v2.h[3] */

.balign 8
Ljsimd_idct_2x2_neon_consts:
  .short -FIX_0_720959822  /* v14.h[0] */
  .short FIX_0_850430095   /* v14.h[1] */
  .short -FIX_1_272758580  /* v14.h[2] */
  .short FIX_3_624509785   /* v14.h[3] */

/* Constants for jsimd_ycc_*_neon() */

.balign 16
Ljsimd_ycc_rgb_neon_consts:
  .short 0,     0,      0,      0
  .short 22971, -11277, -23401, 29033
  .short -128,  -128,   -128,   -128
  .short -128,  -128,   -128,   -128

/* Constants for jsimd_*_ycc_neon() */

.balign 16
Ljsimd_rgb_ycc_neon_consts:
  .short 19595, 38470, 7471,  11059
  .short 21709, 32768, 27439, 5329
  .short 32767, 128,   32767, 128
  .short 32767, 128,   32767, 128

/* Constants for jsimd_fdct_islow_neon() */

#define F_0_298   2446  /* FIX(0.298631336) */
#define F_0_390   3196  /* FIX(0.390180644) */
#define F_0_541   4433  /* FIX(0.541196100) */
#define F_0_765   6270  /* FIX(0.765366865) */
#define F_0_899   7373  /* FIX(0.899976223) */
#define F_1_175   9633  /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

.balign 16
Ljsimd_fdct_islow_neon_consts:
  .short F_0_298
  .short -F_0_390
  .short F_0_541
  .short F_0_765
  .short -F_0_899
  .short F_1_175
  .short F_1_501
  .short -F_1_847
  .short -F_1_961
  .short F_2_053
  .short -F_2_562
  .short F_3_072
  .short 0  /* padding */
  .short 0
  .short 0
  .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072

/* Constants for jsimd_fdct_ifast_neon() */

.balign 16
Ljsimd_fdct_ifast_neon_consts:
  .short (98 * 128)               /* XFIX_0_382683433 */
  .short (139 * 128)              /* XFIX_0_541196100 */
  .short (181 * 128)              /* XFIX_0_707106781 */
  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
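
/*
 * Illustrative sketch (not from the original source): the first row of
 * Ljsimd_rgb_ycc_neon_consts holds the standard libjpeg luma weights in
 * Q16; they sum to exactly 2^16, so the weighted sum cannot drift:
 *
 *   19595 = round(0.29900 * 65536)
 *   38470 = round(0.58700 * 65536)
 *    7471 = round(0.11400 * 65536)   // 19595 + 38470 + 7471 == 65536
 *
 *   static uint8_t rgb_to_y(uint8_t r, uint8_t g, uint8_t b)
 *   {
 *     uint32_t y = 19595 * r + 38470 * g + 7471 * b + 32768;  // +0.5
 *     return (uint8_t)(y >> 16);
 *   }
 *
 * The remaining entries are the chroma weights and bias terms in related
 * fixed-point forms.
 */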

/* Constants for jsimd_h2*_downsample_neon() */

.balign 16
Ljsimd_h2_downsample_neon_consts:
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
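
/*
 * Illustrative note (not from the original source): each 16-byte row above
 * is a permute-index vector (for a tbl-style lookup) handling an input row
 * whose last "diff" columns lie past the image edge; indexes are clamped so
 * that the last valid byte is replicated.  Equivalently, in C:
 *
 *   for (j = 0; j < 16; j++)
 *     idx[diff][j] = (uint8_t)(j < 15 - diff ? j : 15 - diff);
 *
 * e.g. diff 1 yields 0, 1, ..., 14, 14 and diff 15 yields all zeros.
 */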

/* Constants for jsimd_huff_encode_one_block_neon() */

.balign 16
Ljsimd_huff_encode_one_block_neon_consts:
  .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
  .byte 0,   1,   2,   3,   16,  17,  32,  33, \
        18,  19,  4,   5,   6,   7,   20,  21   /* L0 => L3 : 4 lines OK */
  .byte 34,  35,  48,  49,  255, 255, 50,  51, \
        36,  37,  22,  23,  8,   9,   10,  11   /* L0 => L3 : 4 lines OK */
  .byte 8,   9,   22,  23,  36,  37,  50,  51, \
        255, 255, 255, 255, 255, 255, 52,  53   /* L1 => L4 : 4 lines OK */
  .byte 54,  55,  40,  41,  26,  27,  12,  13, \
        14,  15,  28,  29,  42,  43,  56,  57   /* L0 => L3 : 4 lines OK */
  .byte 6,   7,   20,  21,  34,  35,  48,  49, \
        50,  51,  36,  37,  22,  23,  8,   9    /* L4 => L7 : 4 lines OK */
  .byte 42,  43,  28,  29,  14,  15,  30,  31, \
        44,  45,  58,  59,  255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
  .byte 255, 255, 255, 255, 56,  57,  42,  43, \
        28,  29,  14,  15,  30,  31,  44,  45   /* L3 => L6 : 4 lines OK */
  .byte 26,  27,  40,  41,  42,  43,  28,  29, \
        14,  15,  30,  31,  44,  45,  46,  47   /* L5 => L7 : 3 lines OK */
  .byte 255, 255, 255, 255, 0,   1,   255, 255, \
        255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 line OK */
  .byte 255, 255, 255, 255, 255, 255, 255, 255, \
        0,   1,   16,  17,  2,   3,   255, 255  /* L5 => L6 : 2 lines OK */
  .byte 255, 255, 255, 255, 255, 255, 255, 255, \
        255, 255, 255, 255, 8,   9,   22,  23   /* L5 => L6 : 2 lines OK */
  .byte 4,   5,   6,   7,   255, 255, 255, 255, \
        255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */

.text


#define RESPECT_STRICT_ALIGNMENT  1


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .private_extern _\fname
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm

/* Get symbol location */
.macro get_symbol_loc reg, symbol
#ifdef __APPLE__
    adrp \reg, \symbol@PAGE
    add \reg, \reg, \symbol@PAGEOFF
#else
    adrp \reg, \symbol
    add \reg, \reg, :lo12:\symbol
#endif
.endm

/* Transpose elements of a single 128-bit register */
.macro transpose_single x0, x1, xi, xilen, literal
    ins \xi\xilen[0], \x0\xilen[0]
    ins \x1\xilen[0], \x0\xilen[1]
    trn1 \x0\literal, \x0\literal, \x1\literal
    trn2 \x1\literal, \xi\literal, \x1\literal
.endm

/* Transpose elements of 2 different registers */
.macro transpose x0, x1, xi, xilen, literal
    mov \xi\xilen, \x0\xilen
    trn1 \x0\literal, \x0\literal, \x1\literal
    trn2 \x1\literal, \xi\literal, \x1\literal
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov \xi\xilen, \x0\xilen
    trn1 \x0\x0len, \x0\x0len, \x2\x2len
    trn2 \x2\x2len, \xi\x0len, \x2\x2len
    mov \xi\xilen, \x1\xilen
    trn1 \x1\x1len, \x1\x1len, \x3\x3len
    trn2 \x3\x3len, \xi\x1len, \x3\x3len
.endm

.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov \xi\xilen, \x0\xilen
    trn1 \x0\x0len, \x0\x0len, \x1\x1len
    trn2 \x1\x2len, \xi\x0len, \x1\x2len
    mov \xi\xilen, \x2\xilen
    trn1 \x2\x2len, \x2\x2len, \x3\x3len
    trn2 \x3\x2len, \xi\x1len, \x3\x3len
.endm

.macro transpose_4x4 x0, x1, x2, x3, x5
    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
.endm

.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
    trn1 \t0\().8h, \l0\().8h, \l1\().8h
    trn1 \t1\().8h, \l2\().8h, \l3\().8h
    trn1 \t2\().8h, \l4\().8h, \l5\().8h
    trn1 \t3\().8h, \l6\().8h, \l7\().8h
    trn2 \l1\().8h, \l0\().8h, \l1\().8h
    trn2 \l3\().8h, \l2\().8h, \l3\().8h
    trn2 \l5\().8h, \l4\().8h, \l5\().8h
    trn2 \l7\().8h, \l6\().8h, \l7\().8h

    trn1 \l4\().4s, \t2\().4s, \t3\().4s
    trn2 \t3\().4s, \t2\().4s, \t3\().4s
    trn1 \t2\().4s, \t0\().4s, \t1\().4s
    trn2 \l2\().4s, \t0\().4s, \t1\().4s
    trn1 \t0\().4s, \l1\().4s, \l3\().4s
    trn2 \l3\().4s, \l1\().4s, \l3\().4s
    trn2 \t1\().4s, \l5\().4s, \l7\().4s
    trn1 \l5\().4s, \l5\().4s, \l7\().4s

    trn2 \l6\().2d, \l2\().2d, \t3\().2d
    trn1 \l0\().2d, \t2\().2d, \l4\().2d
    trn1 \l1\().2d, \t0\().2d, \l5\().2d
    trn2 \l7\().2d, \l3\().2d, \t1\().2d
    trn1 \l2\().2d, \l2\().2d, \t3\().2d
    trn2 \l4\().2d, \t2\().2d, \l4\().2d
    trn1 \l3\().2d, \l3\().2d, \t1\().2d
    trn2 \l5\().2d, \t0\().2d, \l5\().2d
.endm
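
/*
 * Illustrative scalar model (not from the original source) of the trn1/trn2
 * pairs used by the macros above; they interleave the even- and odd-indexed
 * lanes of two source vectors.  For n lanes:
 *
 *   for (i = 0; i < n; i += 2) {
 *     trn1[i] = a[i];      trn1[i + 1] = b[i];      // even lanes
 *     trn2[i] = a[i + 1];  trn2[i + 1] = b[i + 1];  // odd lanes
 *   }
 *
 * transpose_8x8 applies this at .8h, .4s, and .2d granularity in turn,
 * transposing a row-major 8x8 block of 16-bit elements in 24 instructions.
 */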

#define CENTERJSAMPLE  128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
 *                       JSAMPARRAY output_buf, JDIMENSION output_col)
 */

#define CONST_BITS  13
#define PASS1_BITS  2

#define XFIX_P_0_298  v0.h[0]
#define XFIX_N_0_390  v0.h[1]
#define XFIX_P_0_541  v0.h[2]
#define XFIX_P_0_765  v0.h[3]
#define XFIX_N_0_899  v0.h[4]
#define XFIX_P_1_175  v0.h[5]
#define XFIX_P_1_501  v0.h[6]
#define XFIX_N_1_847  v0.h[7]
#define XFIX_N_1_961  v1.h[0]
#define XFIX_P_2_053  v1.h[1]
#define XFIX_N_2_562  v1.h[2]
#define XFIX_P_3_072  v1.h[3]

asm_function jsimd_idct_islow_neon
    DCT_TABLE   .req x0
    COEF_BLOCK  .req x1
    OUTPUT_BUF  .req x2
    OUTPUT_COL  .req x3
    TMP1        .req x0
    TMP2        .req x1
    TMP3        .req x9
    TMP4        .req x10
    TMP5        .req x11
    TMP6        .req x12
    TMP7        .req x13
    TMP8        .req x14

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    sub sp, sp, #64
    get_symbol_loc x15, Ljsimd_idct_islow_neon_consts
    mov x10, sp
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
    ld1 {v0.8h, v1.8h}, [x15]
    ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
    ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
    ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
    ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64

    cmeq v16.8h, v3.8h, #0
    cmeq v26.8h, v4.8h, #0
    cmeq v27.8h, v5.8h, #0
    cmeq v28.8h, v6.8h, #0
    cmeq v29.8h, v7.8h, #0
    cmeq v30.8h, v8.8h, #0
    cmeq v31.8h, v9.8h, #0

    and v10.16b, v16.16b, v26.16b
    and v11.16b, v27.16b, v28.16b
    and v12.16b, v29.16b, v30.16b
    and v13.16b, v31.16b, v10.16b
    and v14.16b, v11.16b, v12.16b
    mul v2.8h, v2.8h, v18.8h
    and v15.16b, v13.16b, v14.16b
    shl v10.8h, v2.8h, #(PASS1_BITS)
    sqxtn v16.8b, v15.8h
    mov TMP1, v16.d[0]
    mvn TMP2, TMP1

    cbnz TMP2, 2f
    /* case all AC coeffs are zeros */
    dup v2.2d, v10.d[0]
    dup v6.2d, v10.d[1]
    mov v3.16b, v2.16b
    mov v7.16b, v6.16b
    mov v4.16b, v2.16b
    mov v8.16b, v6.16b
    mov v5.16b, v2.16b
    mov v9.16b, v6.16b
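
/*
 * Illustrative sketch (not from the original source; mirrors the
 * corresponding shortcut in jidctint.c): when every AC coefficient is zero,
 * the IDCT of each column is a constant, so pass 1 reduces to broadcasting
 * the dequantized, scaled DC row:
 *
 *   for (col = 0; col < 8; col++) {
 *     int dc = DEQUANTIZE(coef[col], quant[col]) << PASS1_BITS;
 *     for (row = 0; row < 8; row++)
 *       workspace[row * 8 + col] = dc;
 *   }
 */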

1:
    /* for this transpose, we should organise data like this:
     * 00, 01, 02, 03, 40, 41, 42, 43
     * 10, 11, 12, 13, 50, 51, 52, 53
     * 20, 21, 22, 23, 60, 61, 62, 63
     * 30, 31, 32, 33, 70, 71, 72, 73
     * 04, 05, 06, 07, 44, 45, 46, 47
     * 14, 15, 16, 17, 54, 55, 56, 57
     * 24, 25, 26, 27, 64, 65, 66, 67
     * 34, 35, 36, 37, 74, 75, 76, 77
     */
    trn1 v28.8h, v2.8h, v3.8h
    trn1 v29.8h, v4.8h, v5.8h
    trn1 v30.8h, v6.8h, v7.8h
    trn1 v31.8h, v8.8h, v9.8h
    trn2 v16.8h, v2.8h, v3.8h
    trn2 v17.8h, v4.8h, v5.8h
    trn2 v18.8h, v6.8h, v7.8h
    trn2 v19.8h, v8.8h, v9.8h
    trn1 v2.4s, v28.4s, v29.4s
    trn1 v6.4s, v30.4s, v31.4s
    trn1 v3.4s, v16.4s, v17.4s
    trn1 v7.4s, v18.4s, v19.4s
    trn2 v4.4s, v28.4s, v29.4s
    trn2 v8.4s, v30.4s, v31.4s
    trn2 v5.4s, v16.4s, v17.4s
    trn2 v9.4s, v18.4s, v19.4s
    /* Even part: reverse the even part of the forward DCT. */
    add v18.8h, v4.8h, v8.8h  /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add v22.8h, v2.8h, v6.8h  /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2 v19.4s, v18.8h, XFIX_P_0_541  /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub v26.8h, v2.8h, v6.8h  /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull v18.4s, v18.4h, XFIX_P_0_541  /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll2 v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov v21.16b, v19.16b  /* tmp3 = z1 */
    mov v20.16b, v18.16b  /* tmp3 = z1 */
    smlal2 v19.4s, v8.8h, XFIX_N_1_847  /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal v18.4s, v8.4h, XFIX_N_1_847  /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2 v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2 v21.4s, v4.8h, XFIX_P_0_765  /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    smlal v20.4s, v4.4h, XFIX_P_0_765  /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    sshll v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    sshll v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    add v2.4s, v22.4s, v20.4s  /* tmp10l tmp10 = tmp0 + tmp3; */
    sub v6.4s, v22.4s, v20.4s  /* tmp13l tmp13 = tmp0 - tmp3; */
    add v8.4s, v26.4s, v18.4s  /* tmp11l tmp11 = tmp1 + tmp2; */
    sub v4.4s, v26.4s, v18.4s  /* tmp12l tmp12 = tmp1 - tmp2; */
    add v28.4s, v23.4s, v21.4s  /* tmp10h tmp10 = tmp0 + tmp3; */
    sub v31.4s, v23.4s, v21.4s  /* tmp13h tmp13 = tmp0 - tmp3; */
    add v29.4s, v27.4s, v19.4s  /* tmp11h tmp11 = tmp1 + tmp2; */
    sub v30.4s, v27.4s, v19.4s  /* tmp12h tmp12 = tmp1 - tmp2; */
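
/*
 * Illustrative scalar form of the even part above (it restates the inline
 * comments, which follow jidctint.c):
 *
 *   z2 = in[2];  z3 = in[6];
 *   z1   = (z2 + z3) * FIX_0_541196100;
 *   tmp2 = z1 + z3 * -FIX_1_847759065;
 *   tmp3 = z1 + z2 * FIX_0_765366865;
 *   tmp0 = (in[0] + in[4]) << CONST_BITS;
 *   tmp1 = (in[0] - in[4]) << CONST_BITS;
 *   tmp10 = tmp0 + tmp3;  tmp13 = tmp0 - tmp3;
 *   tmp11 = tmp1 + tmp2;  tmp12 = tmp1 - tmp2;
 */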

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7, y5, y3, y1 respectively.
     */

    add v22.8h, v9.8h, v5.8h  /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v24.8h, v7.8h, v3.8h  /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v18.8h, v9.8h, v3.8h  /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v20.8h, v7.8h, v5.8h  /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2 v11.4s, v9.8h, XFIX_P_0_298  /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2 v13.4s, v7.8h, XFIX_P_2_053  /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2 v15.4s, v5.8h, XFIX_P_3_072  /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2 v17.4s, v3.8h, XFIX_P_1_501  /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2 v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2 v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2 v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2 v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2 v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    smull v10.4s, v9.4h, XFIX_P_0_298  /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull v12.4s, v7.4h, XFIX_P_2_053  /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull v14.4s, v5.4h, XFIX_P_3_072  /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull v16.4s, v3.4h, XFIX_P_1_501  /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    add v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    add v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
    add v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
    sub v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
    add v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
    sub v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
    add v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
    sub v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
    add v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */
    sub v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    shrn v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn2 v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn2 v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn2 v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn2 v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn2 v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn2 v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn2 v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn2 v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
    movi v0.16b, #(CENTERJSAMPLE)
    /* Prepare pointers (dual-issue with Neon instructions) */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    sqrshrn v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    ldp TMP3, TMP4, [OUTPUT_BUF], 16
    sqrshrn v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    add TMP1, TMP1, OUTPUT_COL
    sqrshrn v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    add TMP2, TMP2, OUTPUT_COL
    sqrshrn v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    add TMP3, TMP3, OUTPUT_COL
    sqrshrn2 v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    add TMP4, TMP4, OUTPUT_COL
    sqrshrn2 v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    ldp TMP5, TMP6, [OUTPUT_BUF], 16
    sqrshrn2 v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    ldp TMP7, TMP8, [OUTPUT_BUF], 16
    sqrshrn2 v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
    add TMP5, TMP5, OUTPUT_COL
    add v16.16b, v28.16b, v0.16b
    add TMP6, TMP6, OUTPUT_COL
    add v18.16b, v29.16b, v0.16b
    add TMP7, TMP7, OUTPUT_COL
    add v20.16b, v30.16b, v0.16b
    add TMP8, TMP8, OUTPUT_COL
    add v22.16b, v31.16b, v0.16b
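
/*
 * Illustrative sketch (not from the original source) of the two-stage
 * descale above: shrn #16 followed by sqrshrn #2 divides by
 * 2^(CONST_BITS + PASS1_BITS + 3) = 2^18 with rounding and signed
 * saturation, and adding CENTERJSAMPLE recenters the sample:
 *
 *   uint8_t descale_to_sample(int32_t x)
 *   {
 *     int32_t t = (int16_t)(x >> 16);  // shrn #16: keep bits 31..16
 *     int32_t r = (t + 2) >> 2;        // sqrshrn #2: round ...
 *     if (r < -128) r = -128;          // ... and saturate to int8
 *     if (r > 127) r = 127;
 *     return (uint8_t)(r + 128);       // add CENTERJSAMPLE (0x80)
 *   }
 */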

    /* Transpose the final 8-bit samples */
    trn1 v28.16b, v16.16b, v18.16b
    trn1 v30.16b, v20.16b, v22.16b
    trn2 v29.16b, v16.16b, v18.16b
    trn2 v31.16b, v20.16b, v22.16b

    trn1 v16.8h, v28.8h, v30.8h
    trn2 v18.8h, v28.8h, v30.8h
    trn1 v20.8h, v29.8h, v31.8h
    trn2 v22.8h, v29.8h, v31.8h

    uzp1 v28.4s, v16.4s, v18.4s
    uzp2 v30.4s, v16.4s, v18.4s
    uzp1 v29.4s, v20.4s, v22.4s
    uzp2 v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer */
    st1 {v28.d}[0], [TMP1]
    st1 {v29.d}[0], [TMP2]
    st1 {v28.d}[1], [TMP3]
    st1 {v29.d}[1], [TMP4]
    st1 {v30.d}[0], [TMP5]
    st1 {v31.d}[0], [TMP6]
    st1 {v30.d}[1], [TMP7]
    st1 {v31.d}[1], [TMP8]
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
    blr x30
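
/*
 * Conceptual sketch (not from the original source) of the sparseness
 * dispatch used by this function: v16.d[0] holds an 8-byte mask, one byte
 * per coefficient column, 0xFF where AC rows 1..7 of that column are all
 * zero.  TMP2 is the inverted mask, so:
 *
 *   uint64_t nonzero = ~all_zero_mask;
 *   if (nonzero == 0)               // cbnz TMP2, 2f not taken
 *     broadcast_dc();               // all AC coefficients are zero
 *   else if ((nonzero >> 32) == 0)  // b.ne 3f not taken
 *     low_half_pass1();             // right half (columns 4..7) is zero
 *   else if ((nonzero << 32) == 0)  // cbnz TMP4, 4f not taken
 *     high_half_pass1();            // left half (columns 0..3) is zero
 *   else
 *     full_pass1();                 // label 4: below
 *
 * broadcast_dc() etc. are hypothetical names for the four code paths.
 */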
700 */ 701 702 add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ 703 add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ 704 add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */ 705 add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */ 706 add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */ 707 708 smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ 709 smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ 710 smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ 711 smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ 712 smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ 713 smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */ 714 smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */ 715 smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */ 716 smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */ 717 718 add v22.4s, v22.4s, v26.4s /* z3 += z5 */ 719 add v24.4s, v24.4s, v26.4s /* z4 += z5 */ 720 721 add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */ 722 add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */ 723 add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */ 724 add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */ 725 726 add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */ 727 add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */ 728 add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */ 729 add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */ 730 731 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ 732 733 add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */ 734 sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */ 735 add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */ 736 sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */ 737 add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */ 738 sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */ 739 add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */ 740 sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */ 741 742 rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */ 743 rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */ 744 rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */ 745 rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */ 746 rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */ 747 rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */ 748 rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */ 749 rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */ 750 mov v6.16b, v15.16b 751 mov v7.16b, v15.16b 752 mov v8.16b, v15.16b 753 mov v9.16b, v15.16b 754 b 1b 755 756.balign 16 7573: 758 cbnz 

.balign 16
3:
    cbnz TMP4, 4f
    /* Left AC coef is zero */
    dup v14.2d, v10.d[0]
    /* Even part: reverse the even part of the forward DCT. */
    add v18.8h, v4.8h, v8.8h  /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add v22.8h, v2.8h, v6.8h  /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2 v19.4s, v18.8h, XFIX_P_0_541  /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub v26.8h, v2.8h, v6.8h  /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    sshll2 v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov v21.16b, v19.16b  /* tmp3 = z1 */
    smlal2 v19.4s, v8.8h, XFIX_N_1_847  /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2 v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2 v21.4s, v4.8h, XFIX_P_0_765  /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    add v28.4s, v23.4s, v21.4s  /* tmp10h tmp10 = tmp0 + tmp3; */
    sub v31.4s, v23.4s, v21.4s  /* tmp13h tmp13 = tmp0 - tmp3; */
    add v29.4s, v27.4s, v19.4s  /* tmp11h tmp11 = tmp1 + tmp2; */
    sub v30.4s, v27.4s, v19.4s  /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7, y5, y3, y1 respectively.
     */

    add v22.8h, v9.8h, v5.8h  /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v24.8h, v7.8h, v3.8h  /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v18.8h, v9.8h, v3.8h  /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v20.8h, v7.8h, v5.8h  /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2 v11.4s, v9.8h, XFIX_P_0_298  /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2 v13.4s, v7.8h, XFIX_P_2_053  /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2 v15.4s, v5.8h, XFIX_P_3_072  /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2 v17.4s, v3.8h, XFIX_P_1_501  /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2 v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2 v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2 v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2 v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2 v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */

    add v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    mov v2.16b, v14.16b
    mov v3.16b, v14.16b
    mov v4.16b, v14.16b
    mov v5.16b, v14.16b
    rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b 1b

.balign 16
4:
    /* "No" AC coef is zero */
    /* Even part: reverse the even part of the forward DCT. */
    add v18.8h, v4.8h, v8.8h  /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add v22.8h, v2.8h, v6.8h  /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2 v19.4s, v18.8h, XFIX_P_0_541  /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub v26.8h, v2.8h, v6.8h  /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull v18.4s, v18.4h, XFIX_P_0_541  /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll2 v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov v21.16b, v19.16b  /* tmp3 = z1 */
    mov v20.16b, v18.16b  /* tmp3 = z1 */
    smlal2 v19.4s, v8.8h, XFIX_N_1_847  /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal v18.4s, v8.4h, XFIX_N_1_847  /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2 v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2 v21.4s, v4.8h, XFIX_P_0_765  /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    smlal v20.4s, v4.4h, XFIX_P_0_765  /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    sshll v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    sshll v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    add v2.4s, v22.4s, v20.4s  /* tmp10l tmp10 = tmp0 + tmp3; */
    sub v6.4s, v22.4s, v20.4s  /* tmp13l tmp13 = tmp0 - tmp3; */
    add v8.4s, v26.4s, v18.4s  /* tmp11l tmp11 = tmp1 + tmp2; */
    sub v4.4s, v26.4s, v18.4s  /* tmp12l tmp12 = tmp1 - tmp2; */
    add v28.4s, v23.4s, v21.4s  /* tmp10h tmp10 = tmp0 + tmp3; */
    sub v31.4s, v23.4s, v21.4s  /* tmp13h tmp13 = tmp0 - tmp3; */
    add v29.4s, v27.4s, v19.4s  /* tmp11h tmp11 = tmp1 + tmp2; */
    sub v30.4s, v27.4s, v19.4s  /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7, y5, y3, y1 respectively.
     */

    add v22.8h, v9.8h, v5.8h  /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v24.8h, v7.8h, v3.8h  /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v18.8h, v9.8h, v3.8h  /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add v20.8h, v7.8h, v5.8h  /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2 v11.4s, v9.8h, XFIX_P_0_298  /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2 v13.4s, v7.8h, XFIX_P_2_053  /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2 v15.4s, v5.8h, XFIX_P_3_072  /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2 v17.4s, v3.8h, XFIX_P_1_501  /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2 v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2 v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2 v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2 v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2 v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    smull v10.4s, v9.4h, XFIX_P_0_298  /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull v12.4s, v7.4h, XFIX_P_2_053  /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull v14.4s, v5.4h, XFIX_P_3_072  /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull v16.4s, v3.4h, XFIX_P_1_501  /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    add v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    add v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
    add v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
    sub v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
    add v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
    sub v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
    add v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
    sub v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
    add v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */
    sub v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    rshrn v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2 v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2 v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    rshrn2 v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2 v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2 v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2 v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b 1b

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
    .unreq TMP5
    .unreq TMP6
    .unreq TMP7
    .unreq TMP8

#undef CENTERJSAMPLE
#undef CONST_BITS
#undef PASS1_BITS
#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, but less accurate, integer implementation
 * of the inverse DCT (Discrete Cosine Transform).  It uses the same
 * calculations and produces exactly the same output as IJG's original
 * 'jpeg_idct_ifast' function from jidctfst.c.
 *
 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
 * But in the Arm Neon case some extra additions are required because the
 * VQDMULH instruction can't handle constants larger than 1.  So expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition.  Overall, there are 6 extra additions
 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
 */
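
/*
 * Illustrative sketch (not from the original source) of the VQDMULH trick
 * described above: SQDMULH computes (2 * x * k) >> 16, i.e. a Q15
 * fractional multiply, and each XFIX constant below appears to store
 * (c - 1.0) (or (c - 2.0) for 2.613125930) in Q15, with c taken from a Q8
 * approximation such as 277/256 for 1.082392200.  Roughly:
 *
 *   int16_t sqdmulh(int16_t x, int16_t k)
 *   {
 *     return (int16_t)(((int32_t)x * k * 2) >> 16);
 *   }
 *
 *   x * 1.082392200  ->  x + sqdmulh(x, (277 - 256) * 128)
 *   x * 2.613125930  ->  2 * x + sqdmulh(x, (669 - 512) * 128)
 */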

#define XFIX_1_082392200  v0.h[0]
#define XFIX_1_414213562  v0.h[1]
#define XFIX_1_847759065  v0.h[2]
#define XFIX_2_613125930  v0.h[3]

asm_function jsimd_idct_ifast_neon

    DCT_TABLE   .req x0
    COEF_BLOCK  .req x1
    OUTPUT_BUF  .req x2
    OUTPUT_COL  .req x3
    TMP1        .req x0
    TMP2        .req x1
    TMP3        .req x9
    TMP4        .req x10
    TMP5        .req x11
    TMP6        .req x12
    TMP7        .req x13
    TMP8        .req x14

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Load and dequantize coefficients into Neon registers
     * with the following allocation:
     *        0 1 2 3 | 4 5 6 7
     *       ---------+--------
     *   0   | d16    | d17    ( v16.8h )
     *   1   | d18    | d19    ( v17.8h )
     *   2   | d20    | d21    ( v18.8h )
     *   3   | d22    | d23    ( v19.8h )
     *   4   | d24    | d25    ( v20.8h )
     *   5   | d26    | d27    ( v21.8h )
     *   6   | d28    | d29    ( v22.8h )
     *   7   | d30    | d31    ( v23.8h )
     */
    /* Save Neon registers used in fast IDCT */
    get_symbol_loc TMP5, Ljsimd_idct_ifast_neon_consts
    ld1 {v16.8h, v17.8h}, [COEF_BLOCK], 32
    ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
    ld1 {v18.8h, v19.8h}, [COEF_BLOCK], 32
    mul v16.8h, v16.8h, v0.8h
    ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul v17.8h, v17.8h, v1.8h
    ld1 {v20.8h, v21.8h}, [COEF_BLOCK], 32
    mul v18.8h, v18.8h, v2.8h
    ld1 {v0.8h, v1.8h}, [DCT_TABLE], 32
    mul v19.8h, v19.8h, v3.8h
    ld1 {v22.8h, v23.8h}, [COEF_BLOCK], 32
    mul v20.8h, v20.8h, v0.8h
    ld1 {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul v22.8h, v22.8h, v2.8h
    mul v21.8h, v21.8h, v1.8h
    ld1 {v0.4h}, [TMP5]  /* load constants */
    mul v23.8h, v23.8h, v3.8h

    /* 1-D IDCT, pass 1 */
    sub v2.8h, v18.8h, v22.8h
    add v22.8h, v18.8h, v22.8h
    sub v1.8h, v19.8h, v21.8h
    add v21.8h, v19.8h, v21.8h
    sub v5.8h, v17.8h, v23.8h
    add v23.8h, v17.8h, v23.8h
    sqdmulh v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh v6.8h, v1.8h, XFIX_2_613125930
    add v3.8h, v1.8h, v1.8h
    sub v1.8h, v5.8h, v1.8h
    add v18.8h, v2.8h, v4.8h
    sqdmulh v4.8h, v1.8h, XFIX_1_847759065
    sub v2.8h, v23.8h, v21.8h
    add v3.8h, v3.8h, v6.8h
    sqdmulh v6.8h, v2.8h, XFIX_1_414213562
    add v1.8h, v1.8h, v4.8h
    sqdmulh v4.8h, v5.8h, XFIX_1_082392200
    sub v18.8h, v18.8h, v22.8h
    add v2.8h, v2.8h, v6.8h
    sub v6.8h, v16.8h, v20.8h
    add v20.8h, v16.8h, v20.8h
    add v17.8h, v5.8h, v4.8h
    add v5.8h, v6.8h, v18.8h
    sub v18.8h, v6.8h, v18.8h
    add v6.8h, v23.8h, v21.8h
    add v16.8h, v20.8h, v22.8h
    sub v3.8h, v6.8h, v3.8h
    sub v20.8h, v20.8h, v22.8h
    sub v3.8h, v3.8h, v1.8h
    sub v1.8h, v17.8h, v1.8h
    add v2.8h, v3.8h, v2.8h
    sub v23.8h, v16.8h, v6.8h
    add v1.8h, v1.8h, v2.8h
    add v16.8h, v16.8h, v6.8h
    add v22.8h, v5.8h, v3.8h
    sub v17.8h, v5.8h, v3.8h
    sub v21.8h, v18.8h, v2.8h
    add v18.8h, v18.8h, v2.8h
    sub v19.8h, v20.8h, v1.8h
    add v20.8h, v20.8h, v1.8h
    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
    /* 1-D IDCT, pass 2 */
    sub v2.8h, v18.8h, v22.8h
    add v22.8h, v18.8h, v22.8h
    sub v1.8h, v19.8h, v21.8h
    add v21.8h, v19.8h, v21.8h
    sub v5.8h, v17.8h, v23.8h
    add v23.8h, v17.8h, v23.8h
    sqdmulh v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh v6.8h, v1.8h, XFIX_2_613125930
    add v3.8h, v1.8h, v1.8h
    sub v1.8h, v5.8h, v1.8h
    add v18.8h, v2.8h, v4.8h
    sqdmulh v4.8h, v1.8h, XFIX_1_847759065
    sub v2.8h, v23.8h, v21.8h
    add v3.8h, v3.8h, v6.8h
    sqdmulh v6.8h, v2.8h, XFIX_1_414213562
    add v1.8h, v1.8h, v4.8h
    sqdmulh v4.8h, v5.8h, XFIX_1_082392200
    sub v18.8h, v18.8h, v22.8h
    add v2.8h, v2.8h, v6.8h
    sub v6.8h, v16.8h, v20.8h
    add v20.8h, v16.8h, v20.8h
    add v17.8h, v5.8h, v4.8h
    add v5.8h, v6.8h, v18.8h
    sub v18.8h, v6.8h, v18.8h
    add v6.8h, v23.8h, v21.8h
    add v16.8h, v20.8h, v22.8h
    sub v3.8h, v6.8h, v3.8h
    sub v20.8h, v20.8h, v22.8h
    sub v3.8h, v3.8h, v1.8h
    sub v1.8h, v17.8h, v1.8h
    add v2.8h, v3.8h, v2.8h
    sub v23.8h, v16.8h, v6.8h
    add v1.8h, v1.8h, v2.8h
    add v16.8h, v16.8h, v6.8h
    add v22.8h, v5.8h, v3.8h
    sub v17.8h, v5.8h, v3.8h
    sub v21.8h, v18.8h, v2.8h
    add v18.8h, v18.8h, v2.8h
    sub v19.8h, v20.8h, v1.8h
    add v20.8h, v20.8h, v1.8h
    /* Descale to 8-bit and range limit */
    movi v0.16b, #0x80
    /* Prepare pointers (dual-issue with Neon instructions) */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    sqshrn v28.8b, v16.8h, #5
    ldp TMP3, TMP4, [OUTPUT_BUF], 16
    sqshrn v29.8b, v17.8h, #5
    add TMP1, TMP1, OUTPUT_COL
    sqshrn v30.8b, v18.8h, #5
    add TMP2, TMP2, OUTPUT_COL
    sqshrn v31.8b, v19.8h, #5
    add TMP3, TMP3, OUTPUT_COL
    sqshrn2 v28.16b, v20.8h, #5
    add TMP4, TMP4, OUTPUT_COL
    sqshrn2 v29.16b, v21.8h, #5
    ldp TMP5, TMP6, [OUTPUT_BUF], 16
    sqshrn2 v30.16b, v22.8h, #5
    ldp TMP7, TMP8, [OUTPUT_BUF], 16
    sqshrn2 v31.16b, v23.8h, #5
    add TMP5, TMP5, OUTPUT_COL
    add v16.16b, v28.16b, v0.16b
    add TMP6, TMP6, OUTPUT_COL
    add v18.16b, v29.16b, v0.16b
    add TMP7, TMP7, OUTPUT_COL
    add v20.16b, v30.16b, v0.16b
    add TMP8, TMP8, OUTPUT_COL
    add v22.16b, v31.16b, v0.16b

    /* Transpose the final 8-bit samples */
    trn1 v28.16b, v16.16b, v18.16b
    trn1 v30.16b, v20.16b, v22.16b
    trn2 v29.16b, v16.16b, v18.16b
    trn2 v31.16b, v20.16b, v22.16b

    trn1 v16.8h, v28.8h, v30.8h
    trn2 v18.8h, v28.8h, v30.8h
    trn1 v20.8h, v29.8h, v31.8h
    trn2 v22.8h, v29.8h, v31.8h

    uzp1 v28.4s, v16.4s, v18.4s
    uzp2 v30.4s, v16.4s, v18.4s
    uzp1 v29.4s, v20.4s, v22.4s
    uzp2 v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer */
    st1 {v28.d}[0], [TMP1]
    st1 {v29.d}[0], [TMP2]
    st1 {v28.d}[1], [TMP3]
    st1 {v29.d}[1], [TMP4]
    st1 {v30.d}[0], [TMP5]
    st1 {v31.d}[0], [TMP6]
    st1 {v30.d}[1], [TMP7]
    st1 {v31.d}[1], [TMP8]
    blr x30

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
    .unreq TMP5
    .unreq TMP6
    .unreq TMP7
    .unreq TMP8


/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixels output from an 8x8 DCT block.  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular Neon optimized function is
 *       bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling could be achieved by
 *       expanding the idct_helper/transpose_4x4 macros and reordering
 *       instructions, but readability would suffer somewhat.
 */

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    smull v28.4s, \x4, v2.h[2]
    smlal v28.4s, \x8, v0.h[0]
    smlal v28.4s, \x14, v0.h[1]

    smull v26.4s, \x16, v1.h[2]
    smlal v26.4s, \x12, v1.h[3]
    smlal v26.4s, \x10, v2.h[0]
    smlal v26.4s, \x6, v2.h[1]

    smull v30.4s, \x4, v2.h[2]
    smlsl v30.4s, \x8, v0.h[0]
    smlsl v30.4s, \x14, v0.h[1]

    smull v24.4s, \x16, v0.h[2]
    smlal v24.4s, \x12, v0.h[3]
    smlal v24.4s, \x10, v1.h[0]
    smlal v24.4s, \x6, v1.h[1]

    add v20.4s, v28.4s, v26.4s
    sub v28.4s, v28.4s, v26.4s

  .if \shift > 16
    srshr v20.4s, v20.4s, #\shift
    srshr v28.4s, v28.4s, #\shift
    xtn \y26, v20.4s
    xtn \y29, v28.4s
  .else
    rshrn \y26, v20.4s, #\shift
    rshrn \y29, v28.4s, #\shift
  .endif

    add v20.4s, v30.4s, v24.4s
    sub v30.4s, v30.4s, v24.4s

  .if \shift > 16
    srshr v20.4s, v20.4s, #\shift
    srshr v30.4s, v30.4s, #\shift
    xtn \y27, v20.4s
    xtn \y28, v30.4s
  .else
    rshrn \y27, v20.4s, #\shift
    rshrn \y28, v30.4s, #\shift
  .endif
.endm
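
/*
 * Illustrative scalar restatement (not from the original source; mirrors
 * jpeg_idct_4x4 in jidctred.c) of what one idct_helper invocation computes
 * for a column with inputs in0, in1, in2, in3, in5, in6, in7 (row 4 is
 * never used):
 *
 *   tmp0  = in0 << (CONST_BITS + 1);
 *   tmp2  = in2 * FIX_1_847759065 + in6 * -FIX_0_765366865;
 *   tmp10 = tmp0 + tmp2;
 *   tmp12 = tmp0 - tmp2;
 *
 *   tmp0 = in7 * -FIX_0_211164243 + in5 * FIX_1_451774981 +
 *          in3 * -FIX_2_172734803 + in1 * FIX_1_061594337;
 *   tmp2 = in7 * -FIX_0_509795579 + in5 * -FIX_0_601344887 +
 *          in3 * FIX_0_899976223 + in1 * FIX_2_562915447;
 *
 *   out0 = DESCALE(tmp10 + tmp2, shift);  out3 = DESCALE(tmp10 - tmp2, shift);
 *   out1 = DESCALE(tmp12 + tmp0, shift);  out2 = DESCALE(tmp12 - tmp0, shift);
 */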

asm_function jsimd_idct_4x4_neon

    DCT_TABLE   .req x0
    COEF_BLOCK  .req x1
    OUTPUT_BUF  .req x2
    OUTPUT_COL  .req x3
    TMP1        .req x0
    TMP2        .req x1
    TMP3        .req x2
    TMP4        .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Save all used Neon registers */
    sub sp, sp, 64
    mov x9, sp
    /* Load constants (v3.4h is just used for padding) */
    get_symbol_loc TMP4, Ljsimd_idct_4x4_neon_consts
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]

    /* Load all COEF_BLOCK into Neon registers with the following allocation:
     *        0 1 2 3 | 4 5 6 7
     *       ---------+--------
     *   0   | v4.4h  | v5.4h
     *   1   | v6.4h  | v7.4h
     *   2   | v8.4h  | v9.4h
     *   3   | v10.4h | v11.4h
     *   4   | -      | -
     *   5   | v12.4h | v13.4h
     *   6   | v14.4h | v15.4h
     *   7   | v16.4h | v17.4h
     */
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    ld1 {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
    ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* dequantize */
    ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul v4.4h, v4.4h, v18.4h
    mul v5.4h, v5.4h, v19.4h
    ins v4.d[1], v5.d[0]  /* 128 bit q4 */
    ld1 {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
    mul v6.4h, v6.4h, v20.4h
    mul v7.4h, v7.4h, v21.4h
    ins v6.d[1], v7.d[0]  /* 128 bit q6 */
    mul v8.4h, v8.4h, v22.4h
    mul v9.4h, v9.4h, v23.4h
    ins v8.d[1], v9.d[0]  /* 128 bit q8 */
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
    mul v10.4h, v10.4h, v24.4h
    mul v11.4h, v11.4h, v25.4h
    ins v10.d[1], v11.d[0]  /* 128 bit q10 */
    mul v12.4h, v12.4h, v26.4h
    mul v13.4h, v13.4h, v27.4h
    ins v12.d[1], v13.d[0]  /* 128 bit q12 */
    ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul v14.4h, v14.4h, v28.4h
    mul v15.4h, v15.4h, v29.4h
    ins v14.d[1], v15.d[0]  /* 128 bit q14 */
    mul v16.4h, v16.4h, v30.4h
    mul v17.4h, v17.4h, v31.4h
    ins v16.d[1], v17.d[0]  /* 128 bit q16 */

    /* Pass 1 */
    idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
                v4.4h, v6.4h, v8.4h, v10.4h
    transpose_4x4 v4, v6, v8, v10, v3
    ins v10.d[1], v11.d[0]
    idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
                v5.4h, v7.4h, v9.4h, v11.4h
    transpose_4x4 v5, v7, v9, v11, v3
    ins v10.d[1], v11.d[0]

    /* Pass 2 */
    idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
                v26.4h, v27.4h, v28.4h, v29.4h
    transpose_4x4 v26, v27, v28, v29, v3

    /* Range limit */
    movi v30.8h, #0x80
    ins v26.d[1], v27.d[0]
    ins v28.d[1], v29.d[0]
    add v26.8h, v26.8h, v30.8h
    add v28.8h, v28.8h, v30.8h
    sqxtun v26.8b, v26.8h
    sqxtun v27.8b, v28.8h

    /* Store results to the output buffer */
    ldp TMP1, TMP2, [OUTPUT_BUF], 16
    ldp TMP3, TMP4, [OUTPUT_BUF]
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL
    add TMP3, TMP3, OUTPUT_COL
    add TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use far fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    st1 {v26.s}[0], [TMP1], 4
    st1 {v27.s}[0], [TMP3], 4
    st1 {v26.s}[1], [TMP2], 4
    st1 {v27.s}[1], [TMP4], 4
#else
    st1 {v26.b}[0], [TMP1], 1
    st1 {v27.b}[0], [TMP3], 1
    st1 {v26.b}[1], [TMP1], 1
    st1 {v27.b}[1], [TMP3], 1
    st1 {v26.b}[2], [TMP1], 1
    st1 {v27.b}[2], [TMP3], 1
    st1 {v27.b}[3], [TMP3], 1

    st1 {v26.b}[4], [TMP2], 1
    st1 {v27.b}[4], [TMP4], 1
    st1 {v26.b}[5], [TMP2], 1
    st1 {v27.b}[5], [TMP4], 1
    st1 {v26.b}[6], [TMP2], 1
    st1 {v27.b}[6], [TMP4], 1
    st1 {v26.b}[7], [TMP2], 1
    st1 {v27.b}[7], [TMP4], 1
#endif

    /* vpop {v8.4h - v15.4h} (not available) */
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    blr x30

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2-pixel output from an 8x8 DCT block.  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
 * requires far fewer arithmetic operations and should therefore be faster.
 * The primary purpose of this particular Neon-optimized function is
 * bit-exact compatibility with jpeg-6b.
 */
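
/* For reference, the arithmetic that idct_helper below mirrors, written as
 * an illustrative C sketch of the jpeg-6b jidctred.c computation (not the
 * library source; FIX() scales by 2^13 = CONST_BITS):
 *
 *   INT32 tmp10, tmp0;
 *   tmp10 = DEQUANTIZE(in0) << (CONST_BITS + 2);           // z1 << 15
 *   tmp0  = MULTIPLY(DEQUANTIZE(in1),  FIX(3.624509785)) +
 *           MULTIPLY(DEQUANTIZE(in3), -FIX(1.272758580)) +
 *           MULTIPLY(DEQUANTIZE(in5),  FIX(0.850430095)) +
 *           MULTIPLY(DEQUANTIZE(in7), -FIX(0.720959822));
 *   out0 = DESCALE(tmp10 + tmp0, shift);  // shift = 13 in pass 1, 20 in pass 2
 *   out1 = DESCALE(tmp10 - tmp0, shift);
 *
 * DESCALE(x, n) rounds and shifts right by n bits, which is what the
 * rounding narrowing shifts (rshrn/srshr) below implement.
 */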

.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    sshll v15.4s, \x4, #15
    smull v26.4s, \x6, v14.h[3]
    smlal v26.4s, \x10, v14.h[2]
    smlal v26.4s, \x12, v14.h[1]
    smlal v26.4s, \x16, v14.h[0]

    add v20.4s, v15.4s, v26.4s
    sub v15.4s, v15.4s, v26.4s

  .if \shift > 16
    srshr v20.4s, v20.4s, #\shift
    srshr v15.4s, v15.4s, #\shift
    xtn \y26, v20.4s
    xtn \y27, v15.4s
  .else
    rshrn \y26, v20.4s, #\shift
    rshrn \y27, v15.4s, #\shift
  .endif
.endm

asm_function jsimd_idct_2x2_neon

    DCT_TABLE .req x0
    COEF_BLOCK .req x1
    OUTPUT_BUF .req x2
    OUTPUT_COL .req x3
    TMP1 .req x0
    TMP2 .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* vpush {v8.4h - v15.4h} (not available) */
    sub sp, sp, 64
    mov x9, sp

    /* Load constants */
    get_symbol_loc TMP2, Ljsimd_idct_2x2_neon_consts
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1 {v14.4h}, [TMP2]

    /* Load all COEF_BLOCK into Neon registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | v4.4h  | v5.4h
     *   1  | v6.4h  | v7.4h
     *   2  | -      | -
     *   3  | v10.4h | v11.4h
     *   4  | -      | -
     *   5  | v12.4h | v13.4h
     *   6  | -      | -
     *   7  | v16.4h | v17.4h
     */
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* Dequantize */
    ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul v4.4h, v4.4h, v18.4h
    mul v5.4h, v5.4h, v19.4h
    ins v4.d[1], v5.d[0]
    mul v6.4h, v6.4h, v20.4h
    mul v7.4h, v7.4h, v21.4h
    ins v6.d[1], v7.d[0]
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
    mul v10.4h, v10.4h, v24.4h
    mul v11.4h, v11.4h, v25.4h
    ins v10.d[1], v11.d[0]
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
    mul v12.4h, v12.4h, v26.4h
    mul v13.4h, v13.4h, v27.4h
    ins v12.d[1], v13.d[0]
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul v16.4h, v16.4h, v30.4h
    mul v17.4h, v17.4h, v31.4h
    ins v16.d[1], v17.d[0]

    /* Pass 1 */
#if 0
    idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
    transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
    idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
    transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
#else
    smull v26.4s, v6.4h, v14.h[3]
    smlal v26.4s, v10.4h, v14.h[2]
    smlal v26.4s, v12.4h, v14.h[1]
    smlal v26.4s, v16.4h, v14.h[0]
    smull v24.4s, v7.4h, v14.h[3]
    smlal v24.4s, v11.4h, v14.h[2]
    smlal v24.4s, v13.4h, v14.h[1]
    smlal v24.4s, v17.4h, v14.h[0]
    sshll v15.4s, v4.4h, #15
    sshll v30.4s, v5.4h, #15
    add v20.4s, v15.4s, v26.4s
    sub v15.4s, v15.4s, v26.4s
    rshrn v4.4h, v20.4s, #13
    rshrn v6.4h, v15.4s, #13
    add v20.4s, v30.4s, v24.4s
    sub v15.4s, v30.4s, v24.4s
    rshrn v5.4h, v20.4s, #13
    rshrn v7.4h, v15.4s, #13
    ins v4.d[1], v5.d[0]
    ins v6.d[1], v7.d[0]
    transpose v4, v6, v3, .16b, .8h
    transpose v6, v10, v3, .16b, .4s
    ins v11.d[0], v10.d[1]
    ins v7.d[0], v6.d[1]
#endif

    /* Pass 2 */
    idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h

    /* Range limit */
    movi v30.8h, #0x80
    ins v26.d[1], v27.d[0]
    add v26.8h, v26.8h, v30.8h
    sqxtun v30.8b, v26.8h
    ins v26.d[0], v30.d[0]
    sqxtun v27.8b, v26.8h

    /* Store results to the output buffer */
    ldp TMP1, TMP2, [OUTPUT_BUF]
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL

    st1 {v26.b}[0], [TMP1], 1
    st1 {v27.b}[4], [TMP1], 1
    st1 {v26.b}[1], [TMP2], 1
    st1 {v27.b}[5], [TMP2], 1

    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    blr x30

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */
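
/* The fixed-point arithmetic performed by the pipeline stages below
 * corresponds to the usual JPEG YCbCr->RGB equations.  An illustrative C
 * sketch (not the library source); the multipliers are the ones stored in
 * Ljsimd_ycc_rgb_neon_consts:
 *
 *   int cb = Cb - 128, cr = Cr - 128;
 *   R = clamp(Y + ((22971 * cr + (1 << 13)) >> 14));                // 1.40200 * cr
 *   G = clamp(Y + ((-11277 * cb - 23401 * cr + (1 << 14)) >> 15));  // -0.34414 * cb
 *                                                                   //  - 0.71414 * cr
 *   B = clamp(Y + ((29033 * cb + (1 << 13)) >> 14));                // 1.77200 * cb
 *
 * The rounding terms are supplied by the rounding narrowing shifts (rshrn),
 * and clamp() to [0, 255] is performed by sqxtun.
 */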

.macro do_load size
  .if \size == 8
    ld1 {v4.8b}, [U], 8
    ld1 {v5.8b}, [V], 8
    ld1 {v0.8b}, [Y], 8
    prfm pldl1keep, [U, #64]
    prfm pldl1keep, [V, #64]
    prfm pldl1keep, [Y, #64]
  .elseif \size == 4
    ld1 {v4.b}[0], [U], 1
    ld1 {v4.b}[1], [U], 1
    ld1 {v4.b}[2], [U], 1
    ld1 {v4.b}[3], [U], 1
    ld1 {v5.b}[0], [V], 1
    ld1 {v5.b}[1], [V], 1
    ld1 {v5.b}[2], [V], 1
    ld1 {v5.b}[3], [V], 1
    ld1 {v0.b}[0], [Y], 1
    ld1 {v0.b}[1], [Y], 1
    ld1 {v0.b}[2], [Y], 1
    ld1 {v0.b}[3], [Y], 1
  .elseif \size == 2
    ld1 {v4.b}[4], [U], 1
    ld1 {v4.b}[5], [U], 1
    ld1 {v5.b}[4], [V], 1
    ld1 {v5.b}[5], [V], 1
    ld1 {v0.b}[4], [Y], 1
    ld1 {v0.b}[5], [Y], 1
  .elseif \size == 1
    ld1 {v4.b}[6], [U], 1
    ld1 {v5.b}[6], [V], 1
    ld1 {v0.b}[6], [Y], 1
  .else
    .error unsupported macroblock size
  .endif
.endm

.macro do_store bpp, size, fast_st3
  .if \bpp == 24
    .if \size == 8
      .if \fast_st3 == 1
        st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24
      .else
        st1 {v10.b}[0], [RGB], #1
        st1 {v11.b}[0], [RGB], #1
        st1 {v12.b}[0], [RGB], #1

        st1 {v10.b}[1], [RGB], #1
        st1 {v11.b}[1], [RGB], #1
        st1 {v12.b}[1], [RGB], #1

        st1 {v10.b}[2], [RGB], #1
        st1 {v11.b}[2], [RGB], #1
        st1 {v12.b}[2], [RGB], #1

        st1 {v10.b}[3], [RGB], #1
        st1 {v11.b}[3], [RGB], #1
        st1 {v12.b}[3], [RGB], #1

        st1 {v10.b}[4], [RGB], #1
        st1 {v11.b}[4], [RGB], #1
        st1 {v12.b}[4], [RGB], #1

        st1 {v10.b}[5], [RGB], #1
        st1 {v11.b}[5], [RGB], #1
        st1 {v12.b}[5], [RGB], #1

        st1 {v10.b}[6], [RGB], #1
        st1 {v11.b}[6], [RGB], #1
        st1 {v12.b}[6], [RGB], #1

        st1 {v10.b}[7], [RGB], #1
        st1 {v11.b}[7], [RGB], #1
        st1 {v12.b}[7], [RGB], #1
      .endif
    .elseif \size == 4
      st3 {v10.b, v11.b, v12.b}[0], [RGB], 3
      st3 {v10.b, v11.b, v12.b}[1], [RGB], 3
      st3 {v10.b, v11.b, v12.b}[2], [RGB], 3
      st3 {v10.b, v11.b, v12.b}[3], [RGB], 3
    .elseif \size == 2
      st3 {v10.b, v11.b, v12.b}[4], [RGB], 3
      st3 {v10.b, v11.b, v12.b}[5], [RGB], 3
    .elseif \size == 1
      st3 {v10.b, v11.b, v12.b}[6], [RGB], 3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
    .elseif \size == 4
      st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
      st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
      st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
      st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
    .elseif \size == 2
      st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
      st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
    .elseif \size == 1
      st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 16
    .if \size == 8
      st1 {v25.8h}, [RGB], 16
    .elseif \size == 4
      st1 {v25.4h}, [RGB], 8
    .elseif \size == 2
      st1 {v25.h}[4], [RGB], 2
      st1 {v25.h}[5], [RGB], 2
    .elseif \size == 1
      st1 {v25.h}[6], [RGB], 2
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
                                           g_offs, gsize, b_offs, bsize, \
                                           defsize, fast_st3

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

.macro do_yuv_to_rgb_stage1
    uaddw v6.8h, v2.8h, v4.8b      /* v6.8h = u - 128 */
    uaddw v8.8h, v2.8h, v5.8b      /* v8.8h = v - 128 */
    smull v20.4s, v6.4h, v1.h[1]   /* multiply by -11277 */
    smlal v20.4s, v8.4h, v1.h[2]   /* multiply by -23401 */
    smull2 v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2 v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull v24.4s, v8.4h, v1.h[0]   /* multiply by 22971 */
    smull2 v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    smull v28.4s, v6.4h, v1.h[3]   /* multiply by 29033 */
    smull2 v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    rshrn v20.4h, v20.4s, #15
    rshrn2 v20.8h, v22.4s, #15
    rshrn v24.4h, v24.4s, #14
    rshrn2 v24.8h, v26.4s, #14
    rshrn v28.4h, v28.4s, #14
    rshrn2 v28.8h, v30.4s, #14
    uaddw v20.8h, v20.8h, v0.8b
    uaddw v24.8h, v24.8h, v0.8b
    uaddw v28.8h, v28.8h, v0.8b
  .if \bpp != 16
    sqxtun v1\g_offs\defsize, v20.8h
    sqxtun v1\r_offs\defsize, v24.8h
    sqxtun v1\b_offs\defsize, v28.8h
  .else
    sqshlu v21.8h, v20.8h, #8
    sqshlu v25.8h, v24.8h, #8
    sqshlu v29.8h, v28.8h, #8
    sri v25.8h, v21.8h, #5
    sri v25.8h, v29.8h, #11
  .endif
.endm

.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
    rshrn v20.4h, v20.4s, #15
    rshrn v24.4h, v24.4s, #14
    rshrn v28.4h, v28.4s, #14
    ld1 {v4.8b}, [U], 8
    rshrn2 v20.8h, v22.4s, #15
    rshrn2 v24.8h, v26.4s, #14
    rshrn2 v28.8h, v30.4s, #14
    ld1 {v5.8b}, [V], 8
    uaddw v20.8h, v20.8h, v0.8b
    uaddw v24.8h, v24.8h, v0.8b
    uaddw v28.8h, v28.8h, v0.8b
  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
    sqxtun v1\g_offs\defsize, v20.8h
    ld1 {v0.8b}, [Y], 8
    sqxtun v1\r_offs\defsize, v24.8h
    prfm pldl1keep, [U, #64]
    prfm pldl1keep, [V, #64]
    prfm pldl1keep, [Y, #64]
    sqxtun v1\b_offs\defsize, v28.8h
    uaddw v6.8h, v2.8h, v4.8b      /* v6.8h = u - 128 */
    uaddw v8.8h, v2.8h, v5.8b      /* v8.8h = v - 128 */
    smull v20.4s, v6.4h, v1.h[1]   /* multiply by -11277 */
    smlal v20.4s, v8.4h, v1.h[2]   /* multiply by -23401 */
    smull2 v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2 v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull v24.4s, v8.4h, v1.h[0]   /* multiply by 22971 */
    smull2 v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
  .else  /**************************** rgb565 ********************************/
    sqshlu v21.8h, v20.8h, #8
    sqshlu v25.8h, v24.8h, #8
    sqshlu v29.8h, v28.8h, #8
    uaddw v6.8h, v2.8h, v4.8b      /* v6.8h = u - 128 */
    uaddw v8.8h, v2.8h, v5.8b      /* v8.8h = v - 128 */
    ld1 {v0.8b}, [Y], 8
    smull v20.4s, v6.4h, v1.h[1]   /* multiply by -11277 */
    smlal v20.4s, v8.4h, v1.h[2]   /* multiply by -23401 */
    smull2 v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2 v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    sri v25.8h, v21.8h, #5
    smull v24.4s, v8.4h, v1.h[0]   /* multiply by 22971 */
    smull2 v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    prfm pldl1keep, [U, #64]
    prfm pldl1keep, [V, #64]
    prfm pldl1keep, [Y, #64]
    sri v25.8h, v29.8h, #11
  .endif
    do_store \bpp, 8, \fast_st3
    smull v28.4s, v6.4h, v1.h[3]   /* multiply by 29033 */
    smull2 v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

.if \fast_st3 == 1
asm_function jsimd_ycc_\colorid\()_convert_neon
.else
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif
    OUTPUT_WIDTH .req w0
    INPUT_BUF .req x1
    INPUT_ROW .req w2
    OUTPUT_BUF .req x3
    NUM_ROWS .req w4

    INPUT_BUF0 .req x5
    INPUT_BUF1 .req x6
    INPUT_BUF2 .req x1

    RGB .req x7
    Y .req x9
    U .req x10
    V .req x11
    N .req w15

    sub sp, sp, 64
    mov x9, sp

    /* Load constants to v1, v2 (v0.4h is just used for padding) */
    get_symbol_loc x15, Ljsimd_ycc_rgb_neon_consts

    /* Save Neon registers */
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1 {v0.4h, v1.4h}, [x15], 16
    ld1 {v2.8h}, [x15]

    ldr INPUT_BUF0, [INPUT_BUF]
    ldr INPUT_BUF1, [INPUT_BUF, #8]
    ldr INPUT_BUF2, [INPUT_BUF, #16]
    .unreq INPUT_BUF

    /* Initially set v10 and v13 to 0xFF */
    movi v10.16b, #255
    movi v13.16b, #255

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    b.lt 9f
0:
    ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
    ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
    mov N, OUTPUT_WIDTH
    ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
    add INPUT_ROW, INPUT_ROW, #1
    ldr RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels */
    subs N, N, #8
    b.lt 3f
    do_load 8
    do_yuv_to_rgb_stage1
    subs N, N, #8
    b.lt 2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
    subs N, N, #8
    b.ge 1b
2:
    do_yuv_to_rgb_stage2
    do_store \bpp, 8, \fast_st3
    tst N, #7
    b.eq 8f
3:
    tst N, #4
    b.eq 3f
    do_load 4
3:
    tst N, #2
    b.eq 4f
    do_load 2
4:
    tst N, #1
    b.eq 5f
    do_load 1
5:
    do_yuv_to_rgb
    tst N, #4
    b.eq 6f
    do_store \bpp, 4, \fast_st3
6:
    tst N, #2
    b.eq 7f
    do_store \bpp, 2, \fast_st3
7:
    tst N, #1
    b.eq 8f
    do_store \bpp, 1, \fast_st3
8:
    subs NUM_ROWS, NUM_ROWS, #1
    b.gt 0b
9:
    /* Restore all registers and return */
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br x30
    .unreq OUTPUT_WIDTH
    .unreq INPUT_ROW
    .unreq OUTPUT_BUF
    .unreq NUM_ROWS
    .unreq INPUT_BUF0
    .unreq INPUT_BUF1
    .unreq INPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3 */
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h, 1, .4h, 2, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h, 1, .4h, 0, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h, 2, .4h, 1, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h, 2, .4h, 3, .4h, .8b, 1
generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, .4h, 0, .4h, 0, .4h, .8b, 1

generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, .4h, 1, .4h, 2, .4h, .8b, 0
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, .4h, 1, .4h, 0, .4h, .8b, 0

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */
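
/* An illustrative C sketch of the fixed-point arithmetic implemented below
 * (coefficients scaled by 2^16, as stored in Ljsimd_rgb_ycc_neon_consts;
 * not the library source):
 *
 *   Y  = (19595 * R + 38470 * G + 7471 * B + 32768) >> 16;
 *   Cb = ((128 << 16) + 32767 - 11059 * R - 21709 * G + 32768 * B) >> 16;
 *   Cr = ((128 << 16) + 32767 + 32768 * R - 27439 * G - 5329 * B) >> 16;
 *
 * Folding the +128 chroma offset and the round-to-nearest term into one
 * ((128 << 16) + 32767) accumulator bias (the interleaved 32767/128 rows of
 * the constants table) lets Cb/Cr use a plain truncating shift (shrn),
 * while Y uses a rounding shift (rshrn) instead.
 */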

.macro do_store size
  .if \size == 8
    st1 {v20.8b}, [Y], #8
    st1 {v21.8b}, [U], #8
    st1 {v22.8b}, [V], #8
  .elseif \size == 4
    st1 {v20.b}[0], [Y], #1
    st1 {v20.b}[1], [Y], #1
    st1 {v20.b}[2], [Y], #1
    st1 {v20.b}[3], [Y], #1
    st1 {v21.b}[0], [U], #1
    st1 {v21.b}[1], [U], #1
    st1 {v21.b}[2], [U], #1
    st1 {v21.b}[3], [U], #1
    st1 {v22.b}[0], [V], #1
    st1 {v22.b}[1], [V], #1
    st1 {v22.b}[2], [V], #1
    st1 {v22.b}[3], [V], #1
  .elseif \size == 2
    st1 {v20.b}[4], [Y], #1
    st1 {v20.b}[5], [Y], #1
    st1 {v21.b}[4], [U], #1
    st1 {v21.b}[5], [U], #1
    st1 {v22.b}[4], [V], #1
    st1 {v22.b}[5], [V], #1
  .elseif \size == 1
    st1 {v20.b}[6], [Y], #1
    st1 {v21.b}[6], [U], #1
    st1 {v22.b}[6], [V], #1
  .else
    .error unsupported macroblock size
  .endif
.endm

.macro do_load bpp, size, fast_ld3
  .if \bpp == 24
    .if \size == 8
      .if \fast_ld3 == 1
        ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24
      .else
        ld1 {v10.b}[0], [RGB], #1
        ld1 {v11.b}[0], [RGB], #1
        ld1 {v12.b}[0], [RGB], #1

        ld1 {v10.b}[1], [RGB], #1
        ld1 {v11.b}[1], [RGB], #1
        ld1 {v12.b}[1], [RGB], #1

        ld1 {v10.b}[2], [RGB], #1
        ld1 {v11.b}[2], [RGB], #1
        ld1 {v12.b}[2], [RGB], #1

        ld1 {v10.b}[3], [RGB], #1
        ld1 {v11.b}[3], [RGB], #1
        ld1 {v12.b}[3], [RGB], #1

        ld1 {v10.b}[4], [RGB], #1
        ld1 {v11.b}[4], [RGB], #1
        ld1 {v12.b}[4], [RGB], #1

        ld1 {v10.b}[5], [RGB], #1
        ld1 {v11.b}[5], [RGB], #1
        ld1 {v12.b}[5], [RGB], #1

        ld1 {v10.b}[6], [RGB], #1
        ld1 {v11.b}[6], [RGB], #1
        ld1 {v12.b}[6], [RGB], #1

        ld1 {v10.b}[7], [RGB], #1
        ld1 {v11.b}[7], [RGB], #1
        ld1 {v12.b}[7], [RGB], #1
      .endif
      prfm pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3
      ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3
      ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3
      ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3
    .elseif \size == 2
      ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3
      ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3
    .elseif \size == 1
      ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
      prfm pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
      ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
      ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
      ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
    .elseif \size == 2
      ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
      ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
    .elseif \size == 1
      ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm

.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
                                           b_offs, fast_ld3

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

.macro do_rgb_to_yuv_stage1
    ushll v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
    ushll v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
    ushll v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
    rev64 v18.4s, v1.4s
    rev64 v26.4s, v1.4s
    rev64 v28.4s, v1.4s
    rev64 v30.4s, v1.4s
    umull v14.4s, v4.4h, v0.h[0]
    umull2 v16.4s, v4.8h, v0.h[0]
    umlsl v18.4s, v4.4h, v0.h[3]
    umlsl2 v26.4s, v4.8h, v0.h[3]
    umlal v28.4s, v4.4h, v0.h[5]
    umlal2 v30.4s, v4.8h, v0.h[5]
    umlal v14.4s, v6.4h, v0.h[1]
    umlal2 v16.4s, v6.8h, v0.h[1]
    umlsl v18.4s, v6.4h, v0.h[4]
    umlsl2 v26.4s, v6.8h, v0.h[4]
    umlsl v28.4s, v6.4h, v0.h[6]
    umlsl2 v30.4s, v6.8h, v0.h[6]
    umlal v14.4s, v8.4h, v0.h[2]
    umlal2 v16.4s, v8.8h, v0.h[2]
    umlal v18.4s, v8.4h, v0.h[5]
    umlal2 v26.4s, v8.8h, v0.h[5]
    umlsl v28.4s, v8.4h, v0.h[7]
    umlsl2 v30.4s, v8.8h, v0.h[7]
.endm

.macro do_rgb_to_yuv_stage2
    rshrn v20.4h, v14.4s, #16
    shrn v22.4h, v18.4s, #16
    shrn v24.4h, v28.4s, #16
    rshrn2 v20.8h, v16.4s, #16
    shrn2 v22.8h, v26.4s, #16
    shrn2 v24.8h, v30.4s, #16
    xtn v20.8b, v20.8h  /* v20 = y */
    xtn v21.8b, v22.8h  /* v21 = u */
    xtn v22.8b, v24.8h  /* v22 = v */
.endm

.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* TODO: expand macros and interleave instructions if some in-order
 * AArch64 processor can actually dual-issue LOAD/STORE with ALU */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
    do_rgb_to_yuv_stage2
    do_load \bpp, 8, \fast_ld3
    st1 {v20.8b}, [Y], #8
    st1 {v21.8b}, [U], #8
    st1 {v22.8b}, [V], #8
    do_rgb_to_yuv_stage1
.endm

.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
.endif
    OUTPUT_WIDTH .req w0
    INPUT_BUF .req x1
    OUTPUT_BUF .req x2
    OUTPUT_ROW .req w3
    NUM_ROWS .req w4

    OUTPUT_BUF0 .req x5
    OUTPUT_BUF1 .req x6
    OUTPUT_BUF2 .req x2  /* OUTPUT_BUF */

    RGB .req x7
    Y .req x9
    U .req x10
    V .req x11
    N .req w12

    /* Load constants to v0, v1 */
    get_symbol_loc x13, Ljsimd_rgb_ycc_neon_consts
    ld1 {v0.8h, v1.8h}, [x13]

    ldr OUTPUT_BUF0, [OUTPUT_BUF]
    ldr OUTPUT_BUF1, [OUTPUT_BUF, #8]
    ldr OUTPUT_BUF2, [OUTPUT_BUF, #16]
    .unreq OUTPUT_BUF

    /* Save Neon registers */
    sub sp, sp, #64
    mov x9, sp
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    b.lt 9f
0:
    ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
    ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
    mov N, OUTPUT_WIDTH
    ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
    add OUTPUT_ROW, OUTPUT_ROW, #1
    ldr RGB, [INPUT_BUF], #8

    /* Inner loop over pixels */
    subs N, N, #8
    b.lt 3f
    do_load \bpp, 8, \fast_ld3
    do_rgb_to_yuv_stage1
    subs N, N, #8
    b.lt 2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
    subs N, N, #8
    b.ge 1b
2:
    do_rgb_to_yuv_stage2
    do_store 8
    tst N, #7
    b.eq 8f
3:
    tbz N, #2, 3f
    do_load \bpp, 4, \fast_ld3
3:
    tbz N, #1, 4f
    do_load \bpp, 2, \fast_ld3
4:
    tbz N, #0, 5f
    do_load \bpp, 1, \fast_ld3
5:
    do_rgb_to_yuv
    tbz N, #2, 6f
    do_store 4
6:
    tbz N, #1, 7f
    do_store 2
7:
    tbz N, #0, 8f
    do_store 1
8:
    subs NUM_ROWS, NUM_ROWS, #1
    b.gt 0b
9:
    /* Restore all registers and return */
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br x30

    .unreq OUTPUT_WIDTH
    .unreq OUTPUT_ROW
    .unreq INPUT_BUF
    .unreq NUM_ROWS
    .unreq OUTPUT_BUF0
    .unreq OUTPUT_BUF1
    .unreq OUTPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1

generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2, 0
generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0, 0

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * Load data into the workspace, applying unsigned->signed conversion.
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 * rid of the ST1 instructions that write the workspace back out
 */
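
/* Equivalent scalar code, as an illustrative C sketch (not the library
 * source): every sample is re-centered from [0, 255] to [-128, 127] before
 * the forward DCT.
 *
 *   void convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
 *                 DCTELEM *workspace)
 *   {
 *     for (int row = 0; row < 8; row++)
 *       for (int col = 0; col < 8; col++)
 *         *workspace++ = (DCTELEM)sample_data[row][start_col + col] - 128;
 *   }
 */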

asm_function jsimd_convsamp_neon
    SAMPLE_DATA .req x0
    START_COL .req x1
    WORKSPACE .req x2
    TMP1 .req x9
    TMP2 .req x10
    TMP3 .req x11
    TMP4 .req x12
    TMP5 .req x13
    TMP6 .req x14
    TMP7 .req x15
    TMP8 .req x4
    TMPDUP .req w3

    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x1 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x1, w1

    mov TMPDUP, #128
    ldp TMP1, TMP2, [SAMPLE_DATA], 16
    ldp TMP3, TMP4, [SAMPLE_DATA], 16
    dup v0.8b, TMPDUP
    add TMP1, TMP1, START_COL
    add TMP2, TMP2, START_COL
    ldp TMP5, TMP6, [SAMPLE_DATA], 16
    add TMP3, TMP3, START_COL
    add TMP4, TMP4, START_COL
    ldp TMP7, TMP8, [SAMPLE_DATA], 16
    add TMP5, TMP5, START_COL
    add TMP6, TMP6, START_COL
    ld1 {v16.8b}, [TMP1]
    add TMP7, TMP7, START_COL
    add TMP8, TMP8, START_COL
    ld1 {v17.8b}, [TMP2]
    usubl v16.8h, v16.8b, v0.8b
    ld1 {v18.8b}, [TMP3]
    usubl v17.8h, v17.8b, v0.8b
    ld1 {v19.8b}, [TMP4]
    usubl v18.8h, v18.8b, v0.8b
    ld1 {v20.8b}, [TMP5]
    usubl v19.8h, v19.8b, v0.8b
    ld1 {v21.8b}, [TMP6]
    st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
    usubl v20.8h, v20.8b, v0.8b
    ld1 {v22.8b}, [TMP7]
    usubl v21.8h, v21.8b, v0.8b
    ld1 {v23.8b}, [TMP8]
    usubl v22.8h, v22.8b, v0.8b
    usubl v23.8h, v23.8b, v0.8b
    st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64

    br x30

    .unreq SAMPLE_DATA
    .unreq START_COL
    .unreq WORKSPACE
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
    .unreq TMP5
    .unreq TMP6
    .unreq TMP7
    .unreq TMP8
    .unreq TMPDUP

/*****************************************************************************/

/*
 * jsimd_fdct_islow_neon
 *
 * This function contains a slower but more accurate integer implementation
 * of the forward DCT (Discrete Cosine Transform).  The code is based
 * directly on the IJG's original jfdctint.c; see jfdctint.c for
 * more details.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of LD1 (load) instructions
 */
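
/* The descaling and the even-part rotation below follow jfdctint.c.  An
 * illustrative C sketch (not the library source; FIX() scales by
 * 2^13 = CONST_BITS):
 *
 *   z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
 *   dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13,  FIX_0_765366865),
 *                                 CONST_BITS - PASS1_BITS);
 *   dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, -FIX_1_847759065),
 *                                 CONST_BITS - PASS1_BITS);
 *
 * where DESCALE(x, n) = (x + (1 << (n - 1))) >> n, which is exactly what
 * the rounding narrowing shifts (rshrn/rshrn2) implement; pass 2 descales
 * by CONST_BITS + PASS1_BITS instead.
 */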

#define CONST_BITS 13
#define PASS1_BITS 2

#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
#define DESCALE_P2 (CONST_BITS + PASS1_BITS)

#define XFIX_P_0_298 v0.h[0]
#define XFIX_N_0_390 v0.h[1]
#define XFIX_P_0_541 v0.h[2]
#define XFIX_P_0_765 v0.h[3]
#define XFIX_N_0_899 v0.h[4]
#define XFIX_P_1_175 v0.h[5]
#define XFIX_P_1_501 v0.h[6]
#define XFIX_N_1_847 v0.h[7]
#define XFIX_N_1_961 v1.h[0]
#define XFIX_P_2_053 v1.h[1]
#define XFIX_N_2_562 v1.h[2]
#define XFIX_P_3_072 v1.h[3]

asm_function jsimd_fdct_islow_neon

    DATA .req x0
    TMP .req x9

    /* Load constants */
    get_symbol_loc TMP, Ljsimd_fdct_islow_neon_consts
    ld1 {v0.8h, v1.8h}, [TMP]

    /* Save Neon registers */
    sub sp, sp, #64
    mov x10, sp
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32

    /* Load all DATA into Neon registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    | v16.8h
     *   1  | d18    | d19    | v17.8h
     *   2  | d20    | d21    | v18.8h
     *   3  | d22    | d23    | v19.8h
     *   4  | d24    | d25    | v20.8h
     *   5  | d26    | d27    | v21.8h
     *   6  | d28    | d29    | v22.8h
     *   7  | d30    | d31    | v23.8h
     */

    ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    sub DATA, DATA, #64

    /* Transpose */
    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
    /* 1-D FDCT */
    add v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* Even part */

    add v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add v16.8h, v8.8h, v10.8h   /* tmp10 + tmp11 */
    sub v20.8h, v8.8h, v10.8h   /* tmp10 - tmp11 */

    add v18.8h, v11.8h, v9.8h   /* tmp12 + tmp13 */

    shl v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
    shl v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */

    smull2 v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull v18.4s, v18.4h, XFIX_P_0_541   /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov v22.16b, v18.16b
    mov v25.16b, v24.16b

    smlal v18.4s, v9.4h, XFIX_P_0_765    /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2 v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal v22.4s, v11.4h, XFIX_N_1_847   /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2 v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn v18.4h, v18.4s, #DESCALE_P1
    rshrn v22.4h, v22.4s, #DESCALE_P1
    rshrn2 v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
    rshrn2 v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */

    /* Odd part */

    add v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    add v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    add v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
    add v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
    smull v4.4s, v10.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2 v5.4s, v10.8h, XFIX_P_1_175
    smlal v4.4s, v11.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2 v5.4s, v11.8h, XFIX_P_1_175

    smull2 v24.4s, v28.8h, XFIX_P_0_298
    smull2 v25.4s, v29.8h, XFIX_P_2_053
    smull2 v26.4s, v30.8h, XFIX_P_3_072
    smull2 v27.4s, v31.8h, XFIX_P_1_501
    smull v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2 v12.4s, v8.8h, XFIX_N_0_899
    smull2 v13.4s, v9.8h, XFIX_N_2_562
    smull2 v14.4s, v10.8h, XFIX_N_1_961
    smull2 v15.4s, v11.8h, XFIX_N_0_390
    smull v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    smull v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    smull v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    smull v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */

    add v10.4s, v10.4s, v4.4s  /* z3 += z5 */
    add v14.4s, v14.4s, v5.4s
    add v11.4s, v11.4s, v4.4s  /* z4 += z5 */
    add v15.4s, v15.4s, v5.4s

    add v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add v24.4s, v24.4s, v12.4s
    add v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add v25.4s, v25.4s, v13.4s
    add v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add v26.4s, v26.4s, v14.4s
    add v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add v27.4s, v27.4s, v15.4s

    add v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add v24.4s, v24.4s, v14.4s
    add v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add v25.4s, v25.4s, v15.4s
    add v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add v26.4s, v26.4s, v13.4s
    add v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add v27.4s, v27.4s, v12.4s

    rshrn v23.4h, v28.4s, #DESCALE_P1
    rshrn v21.4h, v29.4s, #DESCALE_P1
    rshrn v19.4h, v30.4s, #DESCALE_P1
    rshrn v17.4h, v31.4s, #DESCALE_P1
    rshrn2 v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2 v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
    rshrn2 v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2 v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */

    /* Transpose */
    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4

    /* 1-D FDCT */
    add v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* Even part */
    add v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add v16.8h, v8.8h, v10.8h   /* tmp10 + tmp11 */
    sub v20.8h, v8.8h, v10.8h   /* tmp10 - tmp11 */

    add v18.8h, v11.8h, v9.8h   /* tmp12 + tmp13 */

    srshr v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
    srshr v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */

    smull2 v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull v18.4s, v18.4h, XFIX_P_0_541   /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov v22.16b, v18.16b
    mov v25.16b, v24.16b

    smlal v18.4s, v9.4h, XFIX_P_0_765    /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2 v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal v22.4s, v11.4h, XFIX_N_1_847   /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2 v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn v18.4h, v18.4s, #DESCALE_P2
    rshrn v22.4h, v22.4s, #DESCALE_P2
    rshrn2 v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
    rshrn2 v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */

    /* Odd part */
    add v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    add v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    add v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
    add v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */

    smull v4.4s, v10.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2 v5.4s, v10.8h, XFIX_P_1_175
    smlal v4.4s, v11.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2 v5.4s, v11.8h, XFIX_P_1_175

    smull2 v24.4s, v28.8h, XFIX_P_0_298
    smull2 v25.4s, v29.8h, XFIX_P_2_053
    smull2 v26.4s, v30.8h, XFIX_P_3_072
    smull2 v27.4s, v31.8h, XFIX_P_1_501
    smull v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2 v12.4s, v8.8h, XFIX_N_0_899
    smull2 v13.4s, v9.8h, XFIX_N_2_562
    smull2 v14.4s, v10.8h, XFIX_N_1_961
    smull2 v15.4s, v11.8h, XFIX_N_0_390
    smull v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    smull v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    smull v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    smull v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */

    add v10.4s, v10.4s, v4.4s
    add v14.4s, v14.4s, v5.4s
    add v11.4s, v11.4s, v4.4s
    add v15.4s, v15.4s, v5.4s

    add v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add v24.4s, v24.4s, v12.4s
    add v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add v25.4s, v25.4s, v13.4s
    add v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add v26.4s, v26.4s, v14.4s
    add v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add v27.4s, v27.4s, v15.4s

    add v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add v24.4s, v24.4s, v14.4s
    add v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add v25.4s, v25.4s, v15.4s
    add v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add v26.4s, v26.4s, v13.4s
    add v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add v27.4s, v27.4s, v12.4s

    rshrn v23.4h, v28.4s, #DESCALE_P2
    rshrn v21.4h, v29.4s, #DESCALE_P2
    rshrn v19.4h, v30.4s, #DESCALE_P2
    rshrn v17.4h, v31.4s, #DESCALE_P2
    rshrn2 v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2 v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
    rshrn2 v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2 v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */

    /* store results */
    st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    /* Restore Neon registers */
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32

    br x30

    .unreq DATA
    .unreq TMP

#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072


/*****************************************************************************/

/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, but less accurate, integer implementation
 * of the forward DCT (Discrete Cosine Transform).  It uses the same
 * calculations and produces exactly the same output as IJG's original
 * 'jpeg_fdct_ifast' function from jfdctfst.c.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of LD1 (load) instructions
 */
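
/* The odd-part butterflies below mirror jfdctfst.c.  An illustrative C
 * sketch (not the library source):
 *
 *   z5 = MULTIPLY(tmp10 - tmp12, FIX_0_382683433);
 *   z2 = MULTIPLY(tmp10, FIX_0_541196100) + z5;
 *   z4 = MULTIPLY(tmp12, FIX_1_306562965) + z5;
 *   z3 = MULTIPLY(tmp11, FIX_0_707106781);
 *
 * Here MULTIPLY() is a 16-bit fractional multiply implemented with sqdmulh,
 * i.e. roughly (a * b) >> 15 with the constants pre-scaled by 2^15; the
 * 1.306562965 constant is stored minus 1.0 (see the consts table above),
 * and the missing 1.0 * x term is restored with a plain add.
 */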

#undef XFIX_0_541196100
#define XFIX_0_382683433 v0.h[0]
#define XFIX_0_541196100 v0.h[1]
#define XFIX_0_707106781 v0.h[2]
#define XFIX_1_306562965 v0.h[3]

asm_function jsimd_fdct_ifast_neon

    DATA .req x0
    TMP .req x9

    /* Load constants */
    get_symbol_loc TMP, Ljsimd_fdct_ifast_neon_consts
    ld1 {v0.4h}, [TMP]

    /* Load all DATA into Neon registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    | v16.8h
     *   1  | d18    | d19    | v17.8h
     *   2  | d20    | d21    | v18.8h
     *   3  | d22    | d23    | v19.8h
     *   4  | d24    | d25    | v20.8h
     *   5  | d26    | d27    | v21.8h
     *   6  | d28    | d29    | v22.8h
     *   7  | d30    | d31    | v23.8h
     */

    ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    mov TMP, #2
    sub DATA, DATA, #64
1:
    /* Transpose */
    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
    subs TMP, TMP, #1
    /* 1-D FDCT */
    add v4.8h, v19.8h, v20.8h
    sub v20.8h, v19.8h, v20.8h
    sub v28.8h, v18.8h, v21.8h
    add v18.8h, v18.8h, v21.8h
    sub v29.8h, v17.8h, v22.8h
    add v17.8h, v17.8h, v22.8h
    sub v21.8h, v16.8h, v23.8h
    add v16.8h, v16.8h, v23.8h
    sub v6.8h, v17.8h, v18.8h
    sub v7.8h, v16.8h, v4.8h
    add v5.8h, v17.8h, v18.8h
    add v6.8h, v6.8h, v7.8h
    add v4.8h, v16.8h, v4.8h
    sqdmulh v6.8h, v6.8h, XFIX_0_707106781
    add v19.8h, v20.8h, v28.8h
    add v16.8h, v4.8h, v5.8h
    sub v20.8h, v4.8h, v5.8h
    add v5.8h, v28.8h, v29.8h
    add v29.8h, v29.8h, v21.8h
    sqdmulh v5.8h, v5.8h, XFIX_0_707106781
    sub v28.8h, v19.8h, v29.8h
    add v18.8h, v7.8h, v6.8h
    sqdmulh v28.8h, v28.8h, XFIX_0_382683433
    sub v22.8h, v7.8h, v6.8h
    sqdmulh v19.8h, v19.8h, XFIX_0_541196100
    sqdmulh v7.8h, v29.8h, XFIX_1_306562965
    add v6.8h, v21.8h, v5.8h
    sub v5.8h, v21.8h, v5.8h
    add v29.8h, v29.8h, v28.8h
    add v19.8h, v19.8h, v28.8h
    add v29.8h, v29.8h, v7.8h
    add v21.8h, v5.8h, v19.8h
    sub v19.8h, v5.8h, v19.8h
    add v17.8h, v6.8h, v29.8h
    sub v23.8h, v6.8h, v29.8h

    b.ne 1b

    /* store results */
    st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    br x30

    .unreq DATA
    .unreq TMP
#undef XFIX_0_382683433
#undef XFIX_0_541196100
#undef XFIX_0_707106781
#undef XFIX_1_306562965


/*****************************************************************************/

/*
 * GLOBAL(void)
 * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
 *                     DCTELEM *workspace);
 */
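
/* Division-free quantization, as an illustrative C sketch (not the library
 * source).  For each coefficient, the divisors table supplies a 16-bit
 * reciprocal, a rounding correction, and a shift count (at byte offsets 0,
 * 128, and 384 -- see RECIPROCAL/CORRECTION/SHIFT below):
 *
 *   DCTELEM quantize_one(DCTELEM x, unsigned recip, unsigned corr, int shift)
 *   {
 *     int sign = x >> 15;                     // all-ones if x < 0, else 0
 *     unsigned mag = abs(x) + corr;           // bias |x| for rounding
 *     mag = ((mag * recip) >> 16) >> shift;   // multiply instead of divide
 *     return (DCTELEM)((mag ^ sign) - sign);  // restore the sign
 *   }
 *
 * (quantize_one is a hypothetical per-coefficient helper; the code below
 * performs the same steps 32 lanes at a time.)
 */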
asm_function jsimd_quantize_neon

    COEF_BLOCK .req x0
    DIVISORS .req x1
    WORKSPACE .req x2

    RECIPROCAL .req DIVISORS
    CORRECTION .req x9
    SHIFT .req x10
    LOOP_COUNT .req x11

    mov LOOP_COUNT, #2
    add CORRECTION, DIVISORS, #(64 * 2)
    add SHIFT, DIVISORS, #(64 * 6)
1:
    subs LOOP_COUNT, LOOP_COUNT, #1
    ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
    ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
    abs v20.8h, v0.8h
    abs v21.8h, v1.8h
    abs v22.8h, v2.8h
    abs v23.8h, v3.8h
    ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
    add v20.8h, v20.8h, v4.8h  /* add correction */
    add v21.8h, v21.8h, v5.8h
    add v22.8h, v22.8h, v6.8h
    add v23.8h, v23.8h, v7.8h
    umull v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
    umull2 v16.4s, v20.8h, v28.8h
    umull v5.4s, v21.4h, v29.4h
    umull2 v17.4s, v21.8h, v29.8h
    umull v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
    umull2 v18.4s, v22.8h, v30.8h
    umull v7.4s, v23.4h, v31.4h
    umull2 v19.4s, v23.8h, v31.8h
    ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
    shrn v4.4h, v4.4s, #16
    shrn v5.4h, v5.4s, #16
    shrn v6.4h, v6.4s, #16
    shrn v7.4h, v7.4s, #16
    shrn2 v4.8h, v16.4s, #16
    shrn2 v5.8h, v17.4s, #16
    shrn2 v6.8h, v18.4s, #16
    shrn2 v7.8h, v19.4s, #16
    neg v24.8h, v24.8h
    neg v25.8h, v25.8h
    neg v26.8h, v26.8h
    neg v27.8h, v27.8h
    sshr v0.8h, v0.8h, #15  /* extract sign */
    sshr v1.8h, v1.8h, #15
    sshr v2.8h, v2.8h, #15
    sshr v3.8h, v3.8h, #15
    ushl v4.8h, v4.8h, v24.8h  /* shift */
    ushl v5.8h, v5.8h, v25.8h
    ushl v6.8h, v6.8h, v26.8h
    ushl v7.8h, v7.8h, v27.8h

    eor v4.16b, v4.16b, v0.16b  /* restore sign */
    eor v5.16b, v5.16b, v1.16b
    eor v6.16b, v6.16b, v2.16b
    eor v7.16b, v7.16b, v3.16b
    sub v4.8h, v4.8h, v0.8h
    sub v5.8h, v5.8h, v1.8h
    sub v6.8h, v6.8h, v2.8h
    sub v7.8h, v7.8h, v3.8h
    st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64

    b.ne 1b

    br x30  /* return */

    .unreq COEF_BLOCK
    .unreq DIVISORS
    .unreq WORKSPACE
    .unreq RECIPROCAL
    .unreq CORRECTION
    .unreq SHIFT
    .unreq LOOP_COUNT


/*****************************************************************************/

/*
 * Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
 * without smoothing.
 *
 * GLOBAL(void)
 * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 *                            JDIMENSION v_samp_factor,
 *                            JDIMENSION width_in_blocks,
 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
 */
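
/* Per output byte, this computes (an illustrative C sketch, not the library
 * source; 'bias' alternates 0,1,0,1,... across output columns and is
 * preloaded into the accumulator as the 0x10000 lane pattern in v16):
 *
 *   out[i] = (in[2 * i] + in[2 * i + 1] + bias) >> 1;
 *
 * The h2v2 variant further below does the same with two input rows, a
 * 1,2,1,2,... bias, and a shift by 2.  When image_width is not a multiple
 * of 16, the tbl lookup replicates the last input column ("expand right")
 * before averaging.
 */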
asm_function jsimd_h2v1_downsample_neon

    IMAGE_WIDTH .req x0
    MAX_V_SAMP .req x1
    V_SAMP .req x2
    BLOCK_WIDTH .req x3
    INPUT_DATA .req x4
    OUTPUT_DATA .req x5
    OUTPTR .req x9
    INPTR .req x10
    TMP1 .req x11
    TMP2 .req x12
    TMP3 .req x13
    TMPDUP .req w15

    mov TMPDUP, #0x10000
    lsl TMP2, BLOCK_WIDTH, #4
    sub TMP2, TMP2, IMAGE_WIDTH
    get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts
    add TMP3, TMP3, TMP2, lsl #4
    dup v16.4s, TMPDUP
    ld1 {v18.16b}, [TMP3]

1:  /* row loop */
    ldr INPTR, [INPUT_DATA], #8
    ldr OUTPTR, [OUTPUT_DATA], #8
    subs TMP1, BLOCK_WIDTH, #1
    b.eq 3f
2:  /* columns */
    ld1 {v0.16b}, [INPTR], #16
    mov v4.16b, v16.16b
    subs TMP1, TMP1, #1
    uadalp v4.8h, v0.16b
    shrn v6.8b, v4.8h, #1
    st1 {v6.8b}, [OUTPTR], #8
    b.ne 2b
3:  /* last columns */
    ld1 {v0.16b}, [INPTR]
    mov v4.16b, v16.16b
    subs V_SAMP, V_SAMP, #1
    /* expand right */
    tbl v2.16b, {v0.16b}, v18.16b
    uadalp v4.8h, v2.16b
    shrn v6.8b, v4.8h, #1
    st1 {v6.8b}, [OUTPTR], #8
    b.ne 1b

    br x30

    .unreq IMAGE_WIDTH
    .unreq MAX_V_SAMP
    .unreq V_SAMP
    .unreq BLOCK_WIDTH
    .unreq INPUT_DATA
    .unreq OUTPUT_DATA
    .unreq OUTPTR
    .unreq INPTR
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMPDUP


/*****************************************************************************/

/*
 * Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 2:1 vertical,
 * without smoothing.
 *
 * GLOBAL(void)
 * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 *                            JDIMENSION v_samp_factor,
 *                            JDIMENSION width_in_blocks,
 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
 */

.balign 16
asm_function jsimd_h2v2_downsample_neon

    IMAGE_WIDTH .req x0
    MAX_V_SAMP .req x1
    V_SAMP .req x2
    BLOCK_WIDTH .req x3
    INPUT_DATA .req x4
    OUTPUT_DATA .req x5
    OUTPTR .req x9
    INPTR0 .req x10
    INPTR1 .req x14
    TMP1 .req x11
    TMP2 .req x12
    TMP3 .req x13
    TMPDUP .req w15

    mov TMPDUP, #1
    lsl TMP2, BLOCK_WIDTH, #4
    lsl TMPDUP, TMPDUP, #17
    sub TMP2, TMP2, IMAGE_WIDTH
    get_symbol_loc TMP3, Ljsimd_h2_downsample_neon_consts
    orr TMPDUP, TMPDUP, #1
    add TMP3, TMP3, TMP2, lsl #4
    dup v16.4s, TMPDUP
    ld1 {v18.16b}, [TMP3]

1:  /* row loop */
    ldr INPTR0, [INPUT_DATA], #8
    ldr OUTPTR, [OUTPUT_DATA], #8
    ldr INPTR1, [INPUT_DATA], #8
    subs TMP1, BLOCK_WIDTH, #1
    b.eq 3f
2:  /* columns */
    ld1 {v0.16b}, [INPTR0], #16
    ld1 {v1.16b}, [INPTR1], #16
    mov v4.16b, v16.16b
    subs TMP1, TMP1, #1
    uadalp v4.8h, v0.16b
    uadalp v4.8h, v1.16b
    shrn v6.8b, v4.8h, #2
    st1 {v6.8b}, [OUTPTR], #8
    b.ne 2b
3:  /* last columns */
    ld1 {v0.16b}, [INPTR0], #16
    ld1 {v1.16b}, [INPTR1], #16
    mov v4.16b, v16.16b
    subs V_SAMP, V_SAMP, #1
    /* expand right */
    tbl v2.16b, {v0.16b}, v18.16b
    tbl v3.16b, {v1.16b}, v18.16b
    uadalp v4.8h, v2.16b
    uadalp v4.8h, v3.16b
    shrn v6.8b, v4.8h, #2
    st1 {v6.8b}, [OUTPTR], #8
    b.ne 1b

    br x30

    .unreq IMAGE_WIDTH
    .unreq MAX_V_SAMP
    .unreq V_SAMP
    .unreq BLOCK_WIDTH
    .unreq INPUT_DATA
    .unreq OUTPUT_DATA
    .unreq OUTPTR
    .unreq INPTR0
    .unreq INPTR1
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMPDUP


/*****************************************************************************/

/*
 * GLOBAL(JOCTET *)
 * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
 *                             JCOEFPTR block, int last_dc_val,
 *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
 */
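
/* How the put_bits/emit_byte macros below maintain the bit buffer, as an
 * illustrative C sketch (not the library source).  Huffman codes are
 * appended at the low end of a 64-bit accumulator, and whole bytes are
 * peeled off the top with JPEG 0xFF byte stuffing:
 *
 *   put_buffer = (put_buffer << size) | code;  put_bits += size; // put_bits
 *
 *   // checkbuf31: if (put_bits >= 32) flush 4 bytes;
 *   // checkbuf47: if (put_bits >= 48) flush 6 bytes.
 *   // Each flushed byte (emit_byte) is:
 *   put_bits -= 8;
 *   uint8_t b = (uint8_t)(put_buffer >> put_bits);
 *   *++buffer = b;
 *   if (b == 0xFF)
 *     *++buffer = 0;  // stuff a zero byte after any 0xFF
 */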

    BUFFER .req x1
    PUT_BUFFER .req x6
    PUT_BITS .req x7
    PUT_BITSw .req w7

.macro emit_byte
    sub PUT_BITS, PUT_BITS, #0x8
    lsr x19, PUT_BUFFER, PUT_BITS
    uxtb w19, w19
    strb w19, [BUFFER, #1]!
    cmp w19, #0xff
    b.ne 14f
    strb wzr, [BUFFER, #1]!
14:
.endm
.macro put_bits CODE, SIZE
    lsl PUT_BUFFER, PUT_BUFFER, \SIZE
    add PUT_BITS, PUT_BITS, \SIZE
    orr PUT_BUFFER, PUT_BUFFER, \CODE
.endm
.macro checkbuf31
    cmp PUT_BITS, #0x20
    b.lt 31f
    emit_byte
    emit_byte
    emit_byte
    emit_byte
31:
.endm
.macro checkbuf47
    cmp PUT_BITS, #0x30
    b.lt 47f
    emit_byte
    emit_byte
    emit_byte
    emit_byte
    emit_byte
    emit_byte
47:
.endm

.macro generate_jsimd_huff_encode_one_block fast_tbl

.if \fast_tbl == 1
asm_function jsimd_huff_encode_one_block_neon
.else
asm_function jsimd_huff_encode_one_block_neon_slowtbl
.endif
    sub sp, sp, 272
    sub BUFFER, BUFFER, #0x1  /* BUFFER=buffer-- */
    /* Save Arm registers */
    stp x19, x20, [sp]
    get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts
    ldr PUT_BUFFER, [x0, #0x10]
    ldr PUT_BITSw, [x0, #0x18]
    ldrsh w12, [x2]  /* load DC coeff in w12 */
    /* prepare data */
.if \fast_tbl == 1
    ld1 {v23.16b}, [x15], #16
    ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
    ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
    ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
    ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
    ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
    sub w12, w12, w3  /* last_dc_val, not used afterwards */
    /* ZigZag 8x8 */
    tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
    tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
    tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
    tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
    tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
    tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
    tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
    tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
    ins v0.h[0], w12
    tbx v1.16b, {v28.16b}, v16.16b
    tbx v2.16b, {v29.16b, v30.16b}, v17.16b
    tbx v5.16b, {v29.16b, v30.16b}, v18.16b
    tbx v6.16b, {v31.16b}, v19.16b
.else
    add x13, x2, #0x22
    sub w12, w12, w3  /* last_dc_val, not used afterwards */
    ld1 {v23.16b}, [x15]
    add x14, x2, #0x18
    add x3, x2, #0x36
    ins v0.h[0], w12
    add x9, x2, #0x2
    ld1 {v1.h}[0], [x13]
    add x15, x2, #0x30
    ld1 {v2.h}[0], [x14]
    add x19, x2, #0x26
    ld1 {v3.h}[0], [x3]
    add x20, x2, #0x28
    ld1 {v0.h}[1], [x9]
    add x12, x2, #0x10
    ld1 {v1.h}[1], [x15]
    add x13, x2, #0x40
    ld1 {v2.h}[1], [x19]
    add x14, x2, #0x34
    ld1 {v3.h}[1], [x20]
    add x3, x2, #0x1a
    ld1 {v0.h}[2], [x12]
    add x9, x2, #0x20
    ld1 {v1.h}[2], [x13]
    add x15, x2, #0x32
    ld1 {v2.h}[2], [x14]
    add x19, x2, #0x42
    ld1 {v3.h}[2], [x3]
    add x20, x2, #0xc
    ld1 {v0.h}[3], [x9]
    add x12, x2, #0x12
    ld1 {v1.h}[3], [x15]
    add x13, x2, #0x24
    ld1 {v2.h}[3], [x19]
    add x14, x2, #0x50
    ld1 {v3.h}[3], [x20]
    add x3, x2, #0xe
    ld1 {v0.h}[4], [x12]
    add x9, x2, #0x4
    ld1 {v1.h}[4], [x13]
    add x15, x2, #0x16
    ld1 {v2.h}[4], [x14]
    add x19, x2, #0x60
    ld1 {v3.h}[4], [x3]
    add x20, x2, #0x1c
    ld1 {v0.h}[5], [x9]
    add x12, x2, #0x6
    ld1 {v1.h}[5], [x15]
    add x13, x2, #0x8
    ld1 {v2.h}[5], [x19]
    add x14, x2, #0x52
    ld1 {v3.h}[5], [x20]
    add x3, x2, #0x2a
    ld1 {v0.h}[6], [x12]
    add x9, x2, #0x14
    ld1 {v1.h}[6], [x13]
    add x15, x2, #0xa
    ld1 {v2.h}[6], [x14]
    add x19, x2, #0x44
    ld1 {v3.h}[6], [x3]
    add x20, x2, #0x38
    ld1 {v0.h}[7], [x9]
    add x12, x2, #0x46
    ld1 {v1.h}[7], [x15]
    add x13, x2, #0x3a
    ld1 {v2.h}[7], [x19]
    add x14, x2, #0x74
    ld1 {v3.h}[7], [x20]
    add x3, x2, #0x6a
    ld1 {v4.h}[0], [x12]
    add x9, x2, #0x54
    ld1 {v5.h}[0], [x13]
    add x15, x2, #0x2c
    ld1 {v6.h}[0], [x14]
    add x19, x2, #0x76
    ld1 {v7.h}[0], [x3]
    add x20, x2, #0x78
    ld1 {v4.h}[1], [x9]
    add x12, x2, #0x62
    ld1 {v5.h}[1], [x15]
    add x13, x2, #0x1e
    ld1 {v6.h}[1], [x19]
    add x14, x2, #0x68
    ld1 {v7.h}[1], [x20]
    add x3, x2, #0x7a
    ld1 {v4.h}[2], [x12]
    add x9, x2, #0x70
    ld1 {v5.h}[2], [x13]
    add x15, x2, #0x2e
    ld1 {v6.h}[2], [x14]
    add x19, x2, #0x5a
    ld1 {v7.h}[2], [x3]
    add x20, x2, #0x6c
    ld1 {v4.h}[3], [x9]
    add x12, x2, #0x72
    ld1 {v5.h}[3], [x15]
    add x13, x2, #0x3c
    ld1 {v6.h}[3], [x19]
    add x14, x2, #0x4c
    ld1 {v7.h}[3], [x20]
    add x3, x2, #0x5e
    ld1 {v4.h}[4], [x12]
    add x9, x2, #0x64
    ld1 {v5.h}[4], [x13]
    add x15, x2, #0x4a
    ld1 {v6.h}[4], [x14]
    add x19, x2, #0x3e
    ld1 {v7.h}[4], [x3]
    add x20, x2, #0x6e
    ld1 {v4.h}[5], [x9]
    add x12, x2, #0x56
    ld1 {v5.h}[5], [x15]
    add x13, x2, #0x58
    ld1 {v6.h}[5], [x19]
    add x14, x2, #0x4e
    ld1 {v7.h}[5], [x20]
    add x3, x2, #0x7c
    ld1 {v4.h}[6], [x12]
    add x9, x2, #0x48
    ld1 {v5.h}[6], [x13]
    add x15, x2, #0x66
    ld1 {v6.h}[6], [x14]
    add x19, x2, #0x5c
    ld1 {v7.h}[6], [x3]
    add x20, x2, #0x7e
    ld1 {v4.h}[7], [x9]
    ld1 {v5.h}[7], [x15]
    ld1 {v6.h}[7], [x19]
    ld1 {v7.h}[7], [x20]
.endif
    cmlt v24.8h, v0.8h, #0
    cmlt v25.8h, v1.8h, #0
    cmlt v26.8h, v2.8h, #0
    cmlt v27.8h, v3.8h, #0
    cmlt v28.8h, v4.8h, #0
    cmlt v29.8h, v5.8h, #0
    cmlt v30.8h, v6.8h, #0
    cmlt v31.8h, v7.8h, #0
    abs v0.8h, v0.8h
    abs v1.8h, v1.8h
    abs v2.8h, v2.8h
    abs v3.8h, v3.8h
    abs v4.8h, v4.8h
    abs v5.8h, v5.8h
    abs v6.8h, v6.8h
    abs v7.8h, v7.8h
    eor v24.16b, v24.16b, v0.16b
    eor v25.16b, v25.16b, v1.16b
    eor v26.16b, v26.16b, v2.16b
    eor v27.16b, v27.16b, v3.16b
    eor v28.16b, v28.16b, v4.16b
    eor v29.16b, v29.16b, v5.16b
    eor v30.16b, v30.16b, v6.16b
    eor v31.16b, v31.16b, v7.16b
    cmeq v16.8h, v0.8h, #0
    cmeq v17.8h, v1.8h, #0
    cmeq v18.8h, v2.8h, #0
    cmeq v19.8h, v3.8h, #0
    cmeq v20.8h, v4.8h, #0
    cmeq v21.8h, v5.8h, #0
    cmeq v22.8h, v6.8h, #0
    xtn v16.8b, v16.8h
    xtn v18.8b, v18.8h
    xtn v20.8b, v20.8h
    xtn v22.8b, v22.8h
    umov w14, v0.h[0]
    xtn2 v16.16b, v17.8h
    umov w13, v24.h[0]
    xtn2 v18.16b, v19.8h
    clz w14, w14
    xtn2 v20.16b, v21.8h
    lsl w13, w13, w14
    cmeq v17.8h, v7.8h, #0
    sub w12, w14, #32
    xtn2 v22.16b, v17.8h
    lsr w13, w13, w14
    and v16.16b, v16.16b, v23.16b
    neg w12, w12
    and v18.16b, v18.16b, v23.16b
    add x3, x4, #0x400  /* x3 = dctbl->ehufsi */
    and v20.16b, v20.16b, v23.16b
    add x15, sp, #0x90  /* x15 = t2 */
    and v22.16b, v22.16b, v23.16b
    ldr w10, [x4, x12, lsl #2]
    addp v16.16b, v16.16b, v18.16b
    ldrb w11, [x3, x12]
    addp v20.16b, v20.16b, v22.16b
    checkbuf47
    addp v16.16b, v16.16b, v20.16b
    put_bits x10, x11
    addp v16.16b, v16.16b, v18.16b
    checkbuf47
    umov x9, v16.d[0]
    put_bits x13, x12
    cnt v17.8b, v16.8b
    mvn x9, x9
    addv b18, v17.8b
    add x4, x5, #0x400  /* x4 = actbl->ehufsi */
    umov w12, v18.b[0]
    lsr x9, x9, #0x1  /* clear AC coeff */
    ldr w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
    rbit x9, x9  /* x9 = index0 */
    ldrb w14, [x4, #0xf0]  /* x14 = actbl->ehufsi[0xf0] */
    cmp w12, #(64-8)
    add x11, sp, #16
    b.lt 4f
    cbz x9, 6f
    st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:
    clz x2, x9
    add x15, x15, x2, lsl #1
    lsl x9, x9, x2
    ldrh w20, [x15, #-126]
2:
    cmp x2, #0x10
    b.lt 3f
    sub x2, x2, #0x10
    checkbuf47
    put_bits x13, x14
    b 2b
3:
    clz w20, w20
    ldrh w3, [x15, #2]!
    sub w11, w20, #32
    lsl w3, w3, w20
    neg w11, w11
    lsr w3, w3, w20
    add x2, x11, x2, lsl #4
    lsl x9, x9, #0x1
    ldr w12, [x5, x2, lsl #2]
    ldrb w10, [x4, x2]
    checkbuf31
    put_bits x12, x10
    put_bits x3, x11
    cbnz x9, 1b
    b 6f
4:
    movi v21.8h, #0x0010
    clz v0.8h, v0.8h
    clz v1.8h, v1.8h
    clz v2.8h, v2.8h
    clz v3.8h, v3.8h
    clz v4.8h, v4.8h
    clz v5.8h, v5.8h
    clz v6.8h, v6.8h
    clz v7.8h, v7.8h
    ushl v24.8h, v24.8h, v0.8h
    ushl v25.8h, v25.8h, v1.8h
    ushl v26.8h, v26.8h, v2.8h
    ushl v27.8h, v27.8h, v3.8h
    ushl v28.8h, v28.8h, v4.8h
    ushl v29.8h, v29.8h, v5.8h
    ushl v30.8h, v30.8h, v6.8h
    ushl v31.8h, v31.8h, v7.8h
    neg v0.8h, v0.8h
    neg v1.8h, v1.8h
    neg v2.8h, v2.8h
    neg v3.8h, v3.8h
    neg v4.8h, v4.8h
    neg v5.8h, v5.8h
    neg v6.8h, v6.8h
    neg v7.8h, v7.8h
    ushl v24.8h, v24.8h, v0.8h
    ushl v25.8h, v25.8h, v1.8h
    ushl v26.8h, v26.8h, v2.8h
    ushl v27.8h, v27.8h, v3.8h
    ushl v28.8h, v28.8h, v4.8h
    ushl v29.8h, v29.8h, v5.8h
    ushl v30.8h, v30.8h, v6.8h
    ushl v31.8h, v31.8h, v7.8h
    add v0.8h, v21.8h, v0.8h
    add v1.8h, v21.8h, v1.8h
    add v2.8h, v21.8h, v2.8h
    add v3.8h, v21.8h, v3.8h
    add v4.8h, v21.8h, v4.8h
    add v5.8h, v21.8h, v5.8h
    add v6.8h, v21.8h, v6.8h
    add v7.8h, v21.8h, v7.8h
    st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:
    clz x2, x9
    add x15, x15, x2, lsl #1
    lsl x9, x9, x2
    ldrh w11, [x15, #-126]
2:
    cmp x2, #0x10
    b.lt 3f
    sub x2, x2, #0x10
    checkbuf47
    put_bits x13, x14
    b 2b
3:
    ldrh w3, [x15, #2]!
    add x2, x11, x2, lsl #4
    lsl x9, x9, #0x1
    ldr w12, [x5, x2, lsl #2]
    ldrb w10, [x4, x2]
    checkbuf31
    put_bits x12, x10
    put_bits x3, x11
    cbnz x9, 1b
6:
    add x13, sp, #0x10e
    cmp x15, x13
    b.hs 1f
    ldr w12, [x5]
    ldrb w14, [x4]
    checkbuf47
    put_bits x12, x14
1:
    str PUT_BUFFER, [x0, #0x10]
    str PUT_BITSw, [x0, #0x18]
    ldp x19, x20, [sp], 16
    add x0, BUFFER, #0x1
    add sp, sp, 256
    br x30

.endm

generate_jsimd_huff_encode_one_block 1
generate_jsimd_huff_encode_one_block 0

    .unreq BUFFER
    .unreq PUT_BUFFER
    .unreq PUT_BITS
    .unreq PUT_BITSw

.purgem emit_byte
.purgem put_bits
.purgem checkbuf31
.purgem checkbuf47
v16.D[0] 3292 put_bits x13, x12 3293 cnt v17.8b, v16.8b 3294 mvn x9, x9 3295 addv B18, v17.8b 3296 add x4, x5, #0x400 /* x4 = actbl->ehufsi */ 3297 umov w12, v18.b[0] 3298 lsr x9, x9, #0x1 /* clear AC coeff */ 3299 ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */ 3300 rbit x9, x9 /* x9 = index0 */ 3301 ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */ 3302 cmp w12, #(64-8) 3303 add x11, sp, #16 3304 b.lt 4f 3305 cbz x9, 6f 3306 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 3307 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 3308 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 3309 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 33101: 3311 clz x2, x9 3312 add x15, x15, x2, lsl #1 3313 lsl x9, x9, x2 3314 ldrh w20, [x15, #-126] 33152: 3316 cmp x2, #0x10 3317 b.lt 3f 3318 sub x2, x2, #0x10 3319 checkbuf47 3320 put_bits x13, x14 3321 b 2b 33223: 3323 clz w20, w20 3324 ldrh w3, [x15, #2]! 3325 sub w11, w20, #32 3326 lsl w3, w3, w20 3327 neg w11, w11 3328 lsr w3, w3, w20 3329 add x2, x11, x2, lsl #4 3330 lsl x9, x9, #0x1 3331 ldr w12, [x5, x2, lsl #2] 3332 ldrb w10, [x4, x2] 3333 checkbuf31 3334 put_bits x12, x10 3335 put_bits x3, x11 3336 cbnz x9, 1b 3337 b 6f 33384: 3339 movi v21.8h, #0x0010 3340 clz v0.8h, v0.8h 3341 clz v1.8h, v1.8h 3342 clz v2.8h, v2.8h 3343 clz v3.8h, v3.8h 3344 clz v4.8h, v4.8h 3345 clz v5.8h, v5.8h 3346 clz v6.8h, v6.8h 3347 clz v7.8h, v7.8h 3348 ushl v24.8h, v24.8h, v0.8h 3349 ushl v25.8h, v25.8h, v1.8h 3350 ushl v26.8h, v26.8h, v2.8h 3351 ushl v27.8h, v27.8h, v3.8h 3352 ushl v28.8h, v28.8h, v4.8h 3353 ushl v29.8h, v29.8h, v5.8h 3354 ushl v30.8h, v30.8h, v6.8h 3355 ushl v31.8h, v31.8h, v7.8h 3356 neg v0.8h, v0.8h 3357 neg v1.8h, v1.8h 3358 neg v2.8h, v2.8h 3359 neg v3.8h, v3.8h 3360 neg v4.8h, v4.8h 3361 neg v5.8h, v5.8h 3362 neg v6.8h, v6.8h 3363 neg v7.8h, v7.8h 3364 ushl v24.8h, v24.8h, v0.8h 3365 ushl v25.8h, v25.8h, v1.8h 3366 ushl v26.8h, v26.8h, v2.8h 3367 ushl v27.8h, v27.8h, v3.8h 3368 ushl v28.8h, v28.8h, v4.8h 3369 ushl v29.8h, v29.8h, v5.8h 3370 ushl v30.8h, v30.8h, v6.8h 3371 ushl v31.8h, v31.8h, v7.8h 3372 add v0.8h, v21.8h, v0.8h 3373 add v1.8h, v21.8h, v1.8h 3374 add v2.8h, v21.8h, v2.8h 3375 add v3.8h, v21.8h, v3.8h 3376 add v4.8h, v21.8h, v4.8h 3377 add v5.8h, v21.8h, v5.8h 3378 add v6.8h, v21.8h, v6.8h 3379 add v7.8h, v21.8h, v7.8h 3380 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64 3381 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64 3382 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64 3383 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64 33841: 3385 clz x2, x9 3386 add x15, x15, x2, lsl #1 3387 lsl x9, x9, x2 3388 ldrh w11, [x15, #-126] 33892: 3390 cmp x2, #0x10 3391 b.lt 3f 3392 sub x2, x2, #0x10 3393 checkbuf47 3394 put_bits x13, x14 3395 b 2b 33963: 3397 ldrh w3, [x15, #2]! 3398 add x2, x11, x2, lsl #4 3399 lsl x9, x9, #0x1 3400 ldr w12, [x5, x2, lsl #2] 3401 ldrb w10, [x4, x2] 3402 checkbuf31 3403 put_bits x12, x10 3404 put_bits x3, x11 3405 cbnz x9, 1b 34066: 3407 add x13, sp, #0x10e 3408 cmp x15, x13 3409 b.hs 1f 3410 ldr w12, [x5] 3411 ldrb w14, [x4] 3412 checkbuf47 3413 put_bits x12, x14 34141: 3415 str PUT_BUFFER, [x0, #0x10] 3416 str PUT_BITSw, [x0, #0x18] 3417 ldp x19, x20, [sp], 16 3418 add x0, BUFFER, #0x1 3419 add sp, sp, 256 3420 br x30 3421 3422.endm 3423 3424generate_jsimd_huff_encode_one_block 1 3425generate_jsimd_huff_encode_one_block 0 3426 3427 .unreq BUFFER 3428 .unreq PUT_BUFFER 3429 .unreq PUT_BITS 3430 .unreq PUT_BITSw 3431 3432.purgem emit_byte 3433.purgem put_bits 3434.purgem checkbuf31 3435.purgem checkbuf47 3436