/* crc-armv8-aarch64-ce.S - ARMv8/CE PMULL accelerated CRC implementation
 * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "asm-common-aarch64.h"

#if defined(__AARCH64EL__) && \
    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)

.cpu generic+simd+crypto

.text


/* Structure of crc32_consts_s */

#define consts_k(idx)    ((idx) * 8)
#define consts_my_p(idx) (consts_k(6) + (idx) * 8)

/* Constants */

.align 6
.Lcrc32_constants:
.Lcrc32_partial_fold_input_mask:
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.Lcrc32_refl_shuf_shift:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
        .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
.Lcrc32_shuf_shift:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.Lcrc32_bswap_shuf:
        .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
        .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff


/*
 * void _gcry_crc32r_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
 *                                  const struct crc32_consts_s *consts);
 */
.align 3
.globl _gcry_crc32r_armv8_ce_bulk
ELF(.type _gcry_crc32r_armv8_ce_bulk,%function;)
_gcry_crc32r_armv8_ce_bulk:
        /* input:
         *    x0: pcrc
         *    x1: inbuf
         *    x2: inlen
         *    x3: consts
         */
        CFI_STARTPROC()

        GET_DATA_POINTER(x7, .Lcrc32_constants)
        add x9, x3, #consts_k(5 - 1)
        cmp x2, #128

        b.lo .Lcrc32r_fold_by_one_setup

        eor v4.16b, v4.16b, v4.16b
        add x4, x3, #consts_k(1 - 1)
        ld1 {v4.s}[0], [x0]            /* load pcrc */
        ld1 {v0.16b-v3.16b}, [x1], #64 /* load 64 bytes of input */
        sub x2, x2, #64
        ld1 {v6.16b}, [x4]
        eor v0.16b, v0.16b, v4.16b

        add x4, x3, #consts_k(3 - 1)
        add x5, x3, #consts_my_p(0)

.Lcrc32r_fold_by_four:

        /* Fold by 4. */
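        /* Editorial note on the step below: this is the standard
         * carry-less-multiply folding technique (cf. Intel's "Fast CRC
         * Computation Using PCLMULQDQ" white paper).  Four independent
         * 128-bit accumulators (v0-v3) each absorb 16 new input bytes
         * per iteration: pmull/pmull2 multiply the low/high 64-bit half
         * of each accumulator by the k1:k2 constant pair in v6 (filled
         * in on the C side via the consts argument), and both products
         * are XORed into the freshly loaded input.  This is valid
         * because CRC is linear over GF(2), so the message can be
         * folded stream-wise and recombined at the end. */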
        ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */
        sub x2, x2, #64
        pmull v20.1q, v0.1d, v6.1d
        pmull v21.1q, v1.1d, v6.1d
        pmull v22.1q, v2.1d, v6.1d
        pmull v23.1q, v3.1d, v6.1d
        cmp x2, #64
        pmull2 v24.1q, v0.2d, v6.2d
        pmull2 v25.1q, v1.2d, v6.2d
        pmull2 v26.1q, v2.2d, v6.2d
        pmull2 v27.1q, v3.2d, v6.2d
        eor v0.16b, v20.16b, v16.16b
        eor v1.16b, v21.16b, v17.16b
        eor v2.16b, v22.16b, v18.16b
        eor v3.16b, v23.16b, v19.16b
        eor v0.16b, v0.16b, v24.16b
        eor v1.16b, v1.16b, v25.16b
        eor v2.16b, v2.16b, v26.16b
        eor v3.16b, v3.16b, v27.16b
        b.hs .Lcrc32r_fold_by_four

        ld1 {v6.16b}, [x4]
        ld1 {v5.16b}, [x5]

        cmp x2, #16

        /* Fold 4 to 1. */

        pmull v16.1q, v0.1d, v6.1d
        pmull2 v4.1q, v0.2d, v6.2d
        eor v0.16b, v16.16b, v1.16b
        eor v0.16b, v0.16b, v4.16b

        pmull v16.1q, v0.1d, v6.1d
        pmull2 v4.1q, v0.2d, v6.2d
        eor v0.16b, v16.16b, v2.16b
        eor v0.16b, v0.16b, v4.16b

        pmull v16.1q, v0.1d, v6.1d
        pmull2 v4.1q, v0.2d, v6.2d
        eor v0.16b, v16.16b, v3.16b
        eor v0.16b, v0.16b, v4.16b

        b.lo .Lcrc32r_fold_by_one_done
        b .Lcrc32r_fold_by_one

.Lcrc32r_fold_by_one_setup:

        eor v1.16b, v1.16b, v1.16b
        add x4, x3, #consts_k(3 - 1)
        add x5, x3, #consts_my_p(0)
        sub x2, x2, #16
        ld1 {v1.s}[0], [x0]     /* load pcrc */
        ld1 {v0.16b}, [x1], #16 /* load 16 bytes of input */
        cmp x2, #16
        ld1 {v6.16b}, [x4]      /* load k3k4 */
        ld1 {v5.16b}, [x5]      /* load my_p */
        eor v0.16b, v0.16b, v1.16b
        b.lo .Lcrc32r_fold_by_one_done

.Lcrc32r_fold_by_one:
        sub x2, x2, #16
        ld1 {v2.16b}, [x1], #16 /* load 16 bytes of input */
        pmull v3.1q, v0.1d, v6.1d
        pmull2 v1.1q, v0.2d, v6.2d
        cmp x2, #16
        eor v0.16b, v3.16b, v2.16b
        eor v0.16b, v0.16b, v1.16b

        b.hs .Lcrc32r_fold_by_one

.Lcrc32r_fold_by_one_done:

        cmp x2, #0
        b.eq .Lcrc32r_final_fold

        /* Partial fold. */

        add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants
        add x5, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 16
        add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants
        sub x8, x2, #16
        add x4, x4, x2
        add x5, x5, x2
        add x6, x6, x2
        add x8, x1, x8

        /* Load last input and add padding zeros. */
        ld1 {v4.16b}, [x4]
        eor x2, x2, x2
        ld1 {v3.16b}, [x5]
        ld1 {v2.16b}, [x6]
        tbl v30.16b, {v0.16b}, v4.16b
        ld1 {v4.16b}, [x8]
        tbl v1.16b, {v0.16b}, v3.16b

        pmull v0.1q, v30.1d, v6.1d
        and v2.16b, v2.16b, v4.16b
        pmull2 v31.1q, v30.2d, v6.2d
        orr v2.16b, v2.16b, v1.16b
        eor v0.16b, v0.16b, v31.16b
        eor v0.16b, v0.16b, v2.16b

.Lcrc32r_final_fold:

        /* Final fold. */
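        /* Editorial note: in outline, the 128-bit remainder in v0 is
         * now shrunk to the 32-bit CRC in three steps: fold 128 -> 96
         * bits using the k3:k4 pair in v6, fold 96 -> 64 bits with the
         * k5 pair in v7 (its address was precomputed in x9 at entry),
         * then apply a Barrett reduction with my_p (v5), which replaces
         * division by the CRC polynomial with two multiplies by a
         * precomputed reciprocal.  The reflected CRC lands in lane 2 of
         * v0, which is what the st1 {v0.s}[2] below stores. */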

        eor v2.16b, v2.16b, v2.16b /* zero reg */
        ld1 {v7.16b}, [x9]

        /* reduce 128-bits to 96-bits */
        ext v6.16b, v6.16b, v6.16b, #8 /* swap high and low parts */
        mov v1.16b, v0.16b
        pmull v0.1q, v0.1d, v6.1d
        ext v6.16b, v5.16b, v5.16b, #8 /* swap high and low parts */
        ext v1.16b, v1.16b, v2.16b, #8 /* high to low, high zeroed */
        eor v3.16b, v0.16b, v1.16b

        /* reduce 96-bits to 64-bits */
        eor v1.16b, v1.16b, v1.16b
        ext v0.16b, v3.16b, v2.16b, #4 /* [00][00][x2][x1] */
        mov v1.s[0], v3.s[0]           /* [00][00][00][x0] */
        eor v3.16b, v3.16b, v3.16b
        pmull v1.1q, v1.1d, v7.1d      /* [00][00][xx][xx] */
        eor v0.16b, v0.16b, v1.16b     /* top 64-bit are zero */

        /* barrett reduction */
        mov v3.s[1], v0.s[0]            /* [00][00][x1][00] */
        ext v0.16b, v2.16b, v0.16b, #12 /* [??][x1][??][00] */
        pmull v1.1q, v3.1d, v5.1d       /* [00][xx][xx][00] */
        pmull v1.1q, v1.1d, v6.1d       /* [00][xx][xx][00] */
        eor v0.16b, v0.16b, v1.16b

        /* store CRC */
        st1 {v0.s}[2], [x0]

        ret
        CFI_ENDPROC()
ELF(.size _gcry_crc32r_armv8_ce_bulk,.-_gcry_crc32r_armv8_ce_bulk;)

/*
 * u32 _gcry_crc32r_armv8_ce_reduction_4 (u32 data, u32 crc,
 *                                        const struct crc32_consts_s *consts);
 */
.align 3
.globl _gcry_crc32r_armv8_ce_reduction_4
ELF(.type _gcry_crc32r_armv8_ce_reduction_4,%function;)
_gcry_crc32r_armv8_ce_reduction_4:
        /* input:
         *    w0: data
         *    w1: crc
         *    x2: crc32 constants
         */
        CFI_STARTPROC()

        eor v0.16b, v0.16b, v0.16b
        add x2, x2, #consts_my_p(0)
        eor v1.16b, v1.16b, v1.16b
        ld1 {v5.16b}, [x2]

        mov v0.s[0], w0
        pmull v0.1q, v0.1d, v5.1d  /* [00][00][xx][xx] */
        mov v1.s[1], w1
        mov v0.s[2], v0.s[0]       /* [00][x0][x1][x0] */
        pmull2 v0.1q, v0.2d, v5.2d /* [00][00][xx][xx] */
        eor v0.16b, v0.16b, v1.16b

        mov w0, v0.s[1]

        ret
        CFI_ENDPROC()
ELF(.size _gcry_crc32r_armv8_ce_reduction_4,.-_gcry_crc32r_armv8_ce_reduction_4;)

/*
 * void _gcry_crc32_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
 *                                 const struct crc32_consts_s *consts);
 */
.align 3
.globl _gcry_crc32_armv8_ce_bulk
ELF(.type _gcry_crc32_armv8_ce_bulk,%function;)
_gcry_crc32_armv8_ce_bulk:
        /* input:
         *    x0: pcrc
         *    x1: inbuf
         *    x2: inlen
         *    x3: consts
         */
        CFI_STARTPROC()

        GET_DATA_POINTER(x7, .Lcrc32_constants)
        add x4, x7, #.Lcrc32_bswap_shuf - .Lcrc32_constants
        cmp x2, #128
        ld1 {v7.16b}, [x4]

        b.lo .Lcrc32_fold_by_one_setup

        eor v4.16b, v4.16b, v4.16b
        add x4, x3, #consts_k(1 - 1)
        ld1 {v4.s}[0], [x0]            /* load pcrc */
        ld1 {v0.16b-v3.16b}, [x1], #64 /* load 64 bytes of input */
        sub x2, x2, #64
        ld1 {v6.16b}, [x4]
        eor v0.16b, v0.16b, v4.16b
        ext v4.16b, v6.16b, v6.16b, #8
        tbl v0.16b, { v0.16b }, v7.16b /* byte swap */
        tbl v1.16b, { v1.16b }, v7.16b /* byte swap */
        tbl v2.16b, { v2.16b }, v7.16b /* byte swap */
        tbl v3.16b, { v3.16b }, v7.16b /* byte swap */

        add x4, x3, #consts_k(3 - 1)
        add x5, x3, #consts_my_p(0)

.Lcrc32_fold_by_four:

        /* Fold by 4. */
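        /* Editorial note: same fold-by-4 as the reflected variant
         * above, adapted to the non-reflected (big-endian bit order)
         * CRC: each input block is byte-reversed through tbl with
         * .Lcrc32_bswap_shuf, and the k1:k2 halves were pre-swapped
         * into v4 (the ext ... #8 above) so that pmull/pmull2 pick up
         * the constant matching the swapped lane order. */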
        ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */
        sub x2, x2, #64
        tbl v16.16b, { v16.16b }, v7.16b /* byte swap */
        tbl v17.16b, { v17.16b }, v7.16b /* byte swap */
        tbl v18.16b, { v18.16b }, v7.16b /* byte swap */
        tbl v19.16b, { v19.16b }, v7.16b /* byte swap */
        cmp x2, #64
        pmull2 v20.1q, v0.2d, v4.2d
        pmull2 v21.1q, v1.2d, v4.2d
        pmull2 v22.1q, v2.2d, v4.2d
        pmull2 v23.1q, v3.2d, v4.2d
        pmull v24.1q, v0.1d, v4.1d
        pmull v25.1q, v1.1d, v4.1d
        pmull v26.1q, v2.1d, v4.1d
        pmull v27.1q, v3.1d, v4.1d
        eor v0.16b, v20.16b, v16.16b
        eor v1.16b, v21.16b, v17.16b
        eor v2.16b, v22.16b, v18.16b
        eor v3.16b, v23.16b, v19.16b
        eor v0.16b, v0.16b, v24.16b
        eor v1.16b, v1.16b, v25.16b
        eor v2.16b, v2.16b, v26.16b
        eor v3.16b, v3.16b, v27.16b
        b.hs .Lcrc32_fold_by_four

        ld1 {v6.16b}, [x4]
        ld1 {v5.16b}, [x5]
        ext v6.16b, v6.16b, v6.16b, #8
        ext v5.16b, v5.16b, v5.16b, #8

        cmp x2, #16

        /* Fold 4 to 1. */

        pmull2 v16.1q, v0.2d, v6.2d
        pmull v4.1q, v0.1d, v6.1d
        eor v0.16b, v16.16b, v1.16b
        eor v0.16b, v0.16b, v4.16b

        pmull2 v16.1q, v0.2d, v6.2d
        pmull v4.1q, v0.1d, v6.1d
        eor v0.16b, v16.16b, v2.16b
        eor v0.16b, v0.16b, v4.16b

        pmull2 v16.1q, v0.2d, v6.2d
        pmull v4.1q, v0.1d, v6.1d
        eor v0.16b, v16.16b, v3.16b
        eor v0.16b, v0.16b, v4.16b

        b.lo .Lcrc32_fold_by_one_done
        b .Lcrc32_fold_by_one

.Lcrc32_fold_by_one_setup:

        eor v1.16b, v1.16b, v1.16b
        add x4, x3, #consts_k(3 - 1)
        add x5, x3, #consts_my_p(0)
        ld1 {v1.s}[0], [x0]     /* load pcrc */
        sub x2, x2, #16
        ld1 {v0.16b}, [x1], #16 /* load 16 bytes of input */
        ld1 {v6.16b}, [x4]      /* load k3k4 */
        ld1 {v5.16b}, [x5]      /* load my_p */
        eor v0.16b, v0.16b, v1.16b
        cmp x2, #16
        ext v6.16b, v6.16b, v6.16b, #8 /* swap high and low parts */
        ext v5.16b, v5.16b, v5.16b, #8 /* swap high and low parts */
        tbl v0.16b, { v0.16b }, v7.16b /* byte swap */
        b.lo .Lcrc32_fold_by_one_done

.Lcrc32_fold_by_one:
        sub x2, x2, #16
        ld1 {v2.16b}, [x1], #16 /* load 16 bytes of input */
        pmull2 v3.1q, v0.2d, v6.2d
        tbl v2.16b, { v2.16b }, v7.16b /* byte swap */
        pmull v1.1q, v0.1d, v6.1d
        cmp x2, #16
        eor v0.16b, v3.16b, v2.16b
        eor v0.16b, v0.16b, v1.16b

        b.hs .Lcrc32_fold_by_one

.Lcrc32_fold_by_one_done:

        cmp x2, #0
        b.eq .Lcrc32_final_fold

        /* Partial fold. */

        add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 32
        add x5, x7, #.Lcrc32_shuf_shift - .Lcrc32_constants + 16
        add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants
        sub x8, x2, #16
        sub x4, x4, x2
        add x5, x5, x2
        add x6, x6, x2
        add x8, x1, x8

        /* Load last input and add padding zeros. */
        ld1 {v4.16b}, [x4]
        eor x2, x2, x2
        ld1 {v3.16b}, [x5]
        ld1 {v2.16b}, [x6]
        tbl v30.16b, {v0.16b}, v4.16b
        ld1 {v4.16b}, [x8]
        tbl v1.16b, {v0.16b}, v3.16b
        and v2.16b, v2.16b, v4.16b

        pmull2 v0.1q, v30.2d, v6.2d
        orr v2.16b, v2.16b, v1.16b
        pmull v1.1q, v30.1d, v6.1d
        tbl v2.16b, {v2.16b}, v7.16b /* byte swap */
        eor v0.16b, v0.16b, v1.16b
        eor v0.16b, v0.16b, v2.16b

.Lcrc32_final_fold:

        /* Final fold. */
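        /* Editorial note: mirror image of the reflected final fold.
         * The remainder is reduced 128 -> 96 -> 64 bits using the
         * k3:k4 pair and the extra folding constant loaded into
         * v6.d[1] below (the "load k4"), then Barrett-reduced with the
         * pre-swapped my_p in v5.  Because the data was byte-swapped on
         * input, rev32 swaps the result back before it is stored. */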

        eor v2.16b, v2.16b, v2.16b /* zero reg */

        /* reduce 128-bits to 96-bits */
        add x4, x3, #consts_k(4)
        ext v3.16b, v6.16b, v6.16b, #8 /* swap high and low parts */
        eor v6.16b, v6.16b, v6.16b
        mov v1.16b, v0.16b
        pmull2 v0.1q, v0.2d, v3.2d
        ld1 {v6.d}[1], [x4]            /* load k4 */
        ext v1.16b, v2.16b, v1.16b, #8 /* low to high, low zeroed */
        eor v3.16b, v0.16b, v1.16b     /* bottom 32-bit are zero */

        /* reduce 96-bits to 64-bits */
        eor v0.16b, v0.16b, v0.16b
        eor v1.16b, v1.16b, v1.16b
        mov v0.s[1], v3.s[1]       /* [00][00][x1][00] */
        mov v1.s[2], v3.s[3]       /* [00][x3][00][00] */
        mov v0.s[2], v3.s[2]       /* [00][x2][x1][00] */
        eor v3.16b, v3.16b, v3.16b
        pmull2 v1.1q, v1.2d, v6.2d /* [00][xx][xx][00] */
        eor v0.16b, v0.16b, v1.16b /* top and bottom 32-bit are zero */

        /* barrett reduction */
        mov v3.s[0], v0.s[1]           /* [00][00][00][x1] */
        pmull2 v0.1q, v0.2d, v5.2d     /* [00][xx][xx][xx] */
        ext v0.16b, v0.16b, v2.16b, #4 /* [00][00][xx][xx] */
        pmull v0.1q, v0.1d, v5.1d
        eor v0.16b, v0.16b, v3.16b

        /* store CRC in input endian */
        rev32 v0.8b, v0.8b /* byte swap */
        st1 {v0.s}[0], [x0]

        ret
        CFI_ENDPROC()
ELF(.size _gcry_crc32_armv8_ce_bulk,.-_gcry_crc32_armv8_ce_bulk;)

/*
 * u32 _gcry_crc32_armv8_ce_reduction_4 (u32 data, u32 crc,
 *                                       const struct crc32_consts_s *consts);
 */
.align 3
.globl _gcry_crc32_armv8_ce_reduction_4
ELF(.type _gcry_crc32_armv8_ce_reduction_4,%function;)
_gcry_crc32_armv8_ce_reduction_4:
        /* input:
         *    w0: data
         *    w1: crc
         *    x2: crc32 constants
         */
        CFI_STARTPROC()

        eor v0.16b, v0.16b, v0.16b
        add x2, x2, #consts_my_p(0)
        eor v1.16b, v1.16b, v1.16b
        ld1 {v5.16b}, [x2]

        mov v0.s[1], w0
        pmull v0.1q, v0.1d, v5.1d  /* [00][xx][xx][00] */
        mov v1.s[0], w1
        pmull2 v0.1q, v0.2d, v5.2d /* [00][00][xx][xx] */
        eor v0.16b, v0.16b, v1.16b

        rev32 v0.8b, v0.8b /* Return in input endian */
        mov w0, v0.s[0]

        ret
        CFI_ENDPROC()
ELF(.size _gcry_crc32_armv8_ce_reduction_4,.-_gcry_crc32_armv8_ce_reduction_4;)

#endif