#include "arm_asm.h"
#include "arm_arch.h"

// ChaCha20 keystream generation / XOR for AArch64 (GNU as syntax, run through
// the C preprocessor).  ChaCha20_ctr32 is the exported entry point; it either
// runs the scalar code itself or branches to ChaCha20_neon.

.text


.hidden	OPENSSL_armcap_P

.align	5
// The two 64-bit words below are the ASCII bytes "expand 32-byte k" when laid
// out little-endian; they form the first four 32-bit words of every block.
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// {1,0,0,0}: added to the first 32-bit lane of the counter row in NEON code.
.Lone:
.long	1,0,0,0
// Distance from this location to OPENSSL_armcap_P.  ChaCha20_ctr32 takes the
// address of this slot with adr, loads the distance and uses the sum as the
// address of the capability word.
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

//----------------------------------------------------------------------------
// ChaCha20_ctr32
//
// Registers on entry, as used by the code below:
//   x0  output pointer
//   x1  input pointer
//   x2  length in bytes (0 returns immediately)
//   x3  pointer to 32 bytes of key
//   x4  pointer to 16 bytes of counter block
// NOTE(review): presumably this matches a C prototype declared in a project
// header; confirm against the callers.
//
// Dispatch: lengths below 192 always take the scalar path (.Lshort).  For
// longer inputs the capability word is tested for ARMV7_NEON and, when set,
// control transfers to ChaCha20_neon.
//
// Scalar path register use:
//   x22,x23          sigma, two 32-bit words per register
//   x24..x27         key, two 32-bit words per register
//   x28,x30          counter block, two 32-bit words per register
//   w5..w17,w19..w21 the 16 working words of the current block (x18 unused)
//   x4               round-loop counter (reused once the counter is loaded)
// Stack: 96-byte frame holding x29,x30,x19..x28, plus 64 bytes of scratch
// below it that is only written on the partial-block tail path.
//----------------------------------------------------------------------------
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	x2,.Labort			// nothing to do for zero length
	adr	x5,.LOPENSSL_armcap_P
	cmp	x2,#192
	b.lo	.Lshort				// short input: skip the NEON check
#ifdef __ILP32__
	ldrsw	x6,[x5]
#else
	ldr	x6,[x5]
#endif
	ldr	w17,[x6,x5]			// capability word
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64			// scratch for a partial final block

	ldp	x22,x23,[x5]			// sigma
	ldp	x24,x25,[x3]			// key, first 16 bytes
	ldp	x26,x27,[x3,#16]		// key, last 16 bytes
	ldp	x28,x30,[x4]			// counter block
#ifdef __ARMEB__
	// big-endian build: swap the 32-bit halves of each 64-bit register
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

	// One pass of this outer loop produces one 64-byte block.
.Loop_outer:
	mov	w5,w22				// split packed words into
	lsr	x6,x22,#32			// one register per 32-bit word
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations of two rounds each
	// The flags set here are consumed by b.lo/b.hi after the round loop;
	// nothing inside the loop writes the flags.
	subs	x2,x2,#64
.Loop:
	sub	x4,x4,#1
	// first round of the pair: word groups (w5,w9,w13,w17) etc.
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// second round of the pair: word groups (w5,w10,w15,w21) etc.
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	// Add the initial block back into the working words.  The odd words
	// use 64-bit adds; any excess above bit 31 is shifted out by the
	// lsl #32 when the words are packed below.
	add	w5,w5,w22
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail				// fewer than 64 bytes were left

	// Full block: merge word pairs into 64-bit registers while the input
	// loads are in flight.
	add	x5,x5,x6,lsl#32
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// input bytes 0..15
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6			// output = keystream ^ input
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]
	add	x28,x28,#1			// advance the block counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	.Loop_outer			// more input remains

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
.Labort:
	ret

	// Partial final block.  .Ltail undoes the subs above so that x2 is the
	// number of remaining bytes; .Less_than_64 is also entered from
	// ChaCha20_neon with x2 already holding that count.
.align	4
.Ltail:
	add	x2,x2,#64
.Less_than_64:
	// Point x1, x0 and x4 just past the data and walk a negative index
	// up to zero.  x0 is biased by -1 because the index is incremented
	// between the loads and the store in the loop.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32			// merge word pairs
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	stp	x5,x7,[sp,#0]			// keystream block to scratch
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	ldrb	w10,[x1,x2]			// input byte
	ldrb	w11,[x4,x2]			// keystream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	stp	xzr,xzr,[sp,#0]			// overwrite the scratch with zeros
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

//----------------------------------------------------------------------------
// ChaCha20_neon
//
// Reached by branch from ChaCha20_ctr32 with x0..x4 unchanged (see that
// routine for their roles).  Lengths of 512 bytes or more are forwarded to
// .L512_or_more_neon after the common 96-byte frame has been set up.
//
// Otherwise each outer iteration handles up to 256 bytes as four blocks:
//   general-purpose registers  block with counter+0 (same layout as scalar)
//   v0..v3                     block with counter+1
//   v4..v7                     block with counter+2
//   v16..v19                   block with counter+3
// Constants held across iterations:
//   v24 sigma, v25/v26 key, v27/v28/v29 counter rows +1/+2/+3,
//   v31 counter increment ({1,0,0,0}, then {4,0,0,0} after the shl)
// v20..v23 are temporaries (rotate inputs, later input data).
// Scalar and vector instructions are interleaved; the order is kept exactly
// as found.
//----------------------------------------------------------------------------
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64			// scratch for a partial final block

	ldp	x22,x23,[x5]			// sigma, scalar copy
	ld1	{v24.4s},[x5],#16		// sigma, vector copy; x5 -> .Lone
	ldp	x24,x25,[x3]			// key, scalar copy
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]		// key, vector copy
	ldp	x28,x30,[x4]			// counter block, scalar copy
	ld1	{v27.4s},[x4]			// counter block, vector copy
	ld1	{v31.4s},[x5]			// {1,0,0,0}
#ifdef __ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	add	v27.4s,v27.4s,v31.4s		// counter + 1
	add	v28.4s,v27.4s,v31.4s		// counter + 2
	add	v29.4s,v28.4s,v31.4s		// counter + 3
	shl	v31.4s,v31.4s,#2		// increment becomes 4

.Loop_outer_neon:
	mov	w5,w22				// set up all four working blocks
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations of two rounds each
	// Flags from this subtraction survive the round loop and drive the
	// b.lo/b.hi decisions after it.
	subs	x2,x2,#256
.Loop_neon:
	sub	x4,x4,#1
	// Vector rotates: rev32 on .8h lanes swaps the 16-bit halves of each
	// word (rotate by 16); the others are ushr #(32-n) followed by
	// sli #n, matching the scalar ror #(32-n).
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	// Rotate the lanes of the 2nd, 3rd and 4th row vectors by 4, 8 and 12
	// bytes so the same lane-wise code next mixes the other word grouping.
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	// second round of the pair
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	// Rotate the row vectors back (12 + 4, 8 + 8 and 4 + 12 bytes are each
	// a full 16-byte turn), restoring the original lane positions.
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,.Loop_neon

	// Add the initial blocks back in; each vector block gets its own
	// counter row (v27, v28, v29).
	add	w5,w5,w22
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	b.lo	.Ltail_neon			// fewer than 256 bytes were left

	// Full 256 bytes: the scalar block is written first, then the three
	// vector blocks in order.
	add	x5,x5,x6,lsl#32			// merge word pairs
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// input for the scalar block
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]
	add	x28,x28,#4			// scalar counter advances by 4
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// vector counters advance by 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// v0..v3 reused for input

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	b.hi	.Loop_outer_neon		// more input remains

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
	ret

	// Fewer than 256 bytes remain.  Whole 64-byte blocks are emitted in
	// order (scalar, v0..v3, v4..v7); the first block that is only partly
	// needed is parked in the stack scratch and finished bytewise.
.Ltail_neon:
	add	x2,x2,#256			// x2 = bytes remaining
	cmp	x2,#64
	b.lo	.Less_than_64			// scalar block alone covers it

	add	x5,x5,x6,lsl#32			// merge word pairs
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// input for the scalar block
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]
	add	x28,x28,#4
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	b.eq	.Ldone_neon			// flags still from the cmp: exactly 64
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	.Ldone_neon			// exactly 128
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	.Ldone_neon			// exactly 192
	sub	x2,x2,#64

	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]	// last block is partial
	b	.Last_neon

.Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
	b	.Last_neon
.Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
	b	.Last_neon

	// Bytewise finish with the keystream block held at [sp]; same negative
	// index scheme as .Less_than_64 in ChaCha20_ctr32.
.align	4
.Last_neon:
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

.Loop_tail_neon:
	ldrb	w10,[x1,x2]			// input byte
	ldrb	w11,[x4,x2]			// keystream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]			// overwrite the scratch with zeros
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
815 add x29,sp,#0 816 817 adr x5,.Lsigma 818 stp x19,x20,[sp,#16] 819 stp x21,x22,[sp,#32] 820 stp x23,x24,[sp,#48] 821 stp x25,x26,[sp,#64] 822 stp x27,x28,[sp,#80] 823 824.L512_or_more_neon: 825 sub sp,sp,#128+64 826 827 ldp x22,x23,[x5] // load sigma 828 ld1 {v24.4s},[x5],#16 829 ldp x24,x25,[x3] // load key 830 ldp x26,x27,[x3,#16] 831 ld1 {v25.4s,v26.4s},[x3] 832 ldp x28,x30,[x4] // load counter 833 ld1 {v27.4s},[x4] 834 ld1 {v31.4s},[x5] 835#ifdef __ARMEB__ 836 rev64 v24.4s,v24.4s 837 ror x24,x24,#32 838 ror x25,x25,#32 839 ror x26,x26,#32 840 ror x27,x27,#32 841 ror x28,x28,#32 842 ror x30,x30,#32 843#endif 844 add v27.4s,v27.4s,v31.4s // += 1 845 stp q24,q25,[sp,#0] // off-load key block, invariant part 846 add v27.4s,v27.4s,v31.4s // not typo 847 str q26,[sp,#32] 848 add v28.4s,v27.4s,v31.4s 849 add v29.4s,v28.4s,v31.4s 850 add v30.4s,v29.4s,v31.4s 851 shl v31.4s,v31.4s,#2 // 1 -> 4 852 853 stp d8,d9,[sp,#128+0] // meet ABI requirements 854 stp d10,d11,[sp,#128+16] 855 stp d12,d13,[sp,#128+32] 856 stp d14,d15,[sp,#128+48] 857 858 sub x2,x2,#512 // not typo 859 860.Loop_outer_512_neon: 861 mov v0.16b,v24.16b 862 mov v4.16b,v24.16b 863 mov v8.16b,v24.16b 864 mov v12.16b,v24.16b 865 mov v16.16b,v24.16b 866 mov v20.16b,v24.16b 867 mov v1.16b,v25.16b 868 mov w5,w22 // unpack key block 869 mov v5.16b,v25.16b 870 lsr x6,x22,#32 871 mov v9.16b,v25.16b 872 mov w7,w23 873 mov v13.16b,v25.16b 874 lsr x8,x23,#32 875 mov v17.16b,v25.16b 876 mov w9,w24 877 mov v21.16b,v25.16b 878 lsr x10,x24,#32 879 mov v3.16b,v27.16b 880 mov w11,w25 881 mov v7.16b,v28.16b 882 lsr x12,x25,#32 883 mov v11.16b,v29.16b 884 mov w13,w26 885 mov v15.16b,v30.16b 886 lsr x14,x26,#32 887 mov v2.16b,v26.16b 888 mov w15,w27 889 mov v6.16b,v26.16b 890 lsr x16,x27,#32 891 add v19.4s,v3.4s,v31.4s // +4 892 mov w17,w28 893 add v23.4s,v7.4s,v31.4s // +4 894 lsr x19,x28,#32 895 mov v10.16b,v26.16b 896 mov w20,w30 897 mov v14.16b,v26.16b 898 lsr x21,x30,#32 899 mov v18.16b,v26.16b 900 stp 
q27,q28,[sp,#48] // off-load key block, variable part 901 mov v22.16b,v26.16b 902 str q29,[sp,#80] 903 904 mov x4,#5 905 subs x2,x2,#512 906.Loop_upper_neon: 907 sub x4,x4,#1 908 add v0.4s,v0.4s,v1.4s 909 add w5,w5,w9 910 add v4.4s,v4.4s,v5.4s 911 add w6,w6,w10 912 add v8.4s,v8.4s,v9.4s 913 add w7,w7,w11 914 add v12.4s,v12.4s,v13.4s 915 add w8,w8,w12 916 add v16.4s,v16.4s,v17.4s 917 eor w17,w17,w5 918 add v20.4s,v20.4s,v21.4s 919 eor w19,w19,w6 920 eor v3.16b,v3.16b,v0.16b 921 eor w20,w20,w7 922 eor v7.16b,v7.16b,v4.16b 923 eor w21,w21,w8 924 eor v11.16b,v11.16b,v8.16b 925 ror w17,w17,#16 926 eor v15.16b,v15.16b,v12.16b 927 ror w19,w19,#16 928 eor v19.16b,v19.16b,v16.16b 929 ror w20,w20,#16 930 eor v23.16b,v23.16b,v20.16b 931 ror w21,w21,#16 932 rev32 v3.8h,v3.8h 933 add w13,w13,w17 934 rev32 v7.8h,v7.8h 935 add w14,w14,w19 936 rev32 v11.8h,v11.8h 937 add w15,w15,w20 938 rev32 v15.8h,v15.8h 939 add w16,w16,w21 940 rev32 v19.8h,v19.8h 941 eor w9,w9,w13 942 rev32 v23.8h,v23.8h 943 eor w10,w10,w14 944 add v2.4s,v2.4s,v3.4s 945 eor w11,w11,w15 946 add v6.4s,v6.4s,v7.4s 947 eor w12,w12,w16 948 add v10.4s,v10.4s,v11.4s 949 ror w9,w9,#20 950 add v14.4s,v14.4s,v15.4s 951 ror w10,w10,#20 952 add v18.4s,v18.4s,v19.4s 953 ror w11,w11,#20 954 add v22.4s,v22.4s,v23.4s 955 ror w12,w12,#20 956 eor v24.16b,v1.16b,v2.16b 957 add w5,w5,w9 958 eor v25.16b,v5.16b,v6.16b 959 add w6,w6,w10 960 eor v26.16b,v9.16b,v10.16b 961 add w7,w7,w11 962 eor v27.16b,v13.16b,v14.16b 963 add w8,w8,w12 964 eor v28.16b,v17.16b,v18.16b 965 eor w17,w17,w5 966 eor v29.16b,v21.16b,v22.16b 967 eor w19,w19,w6 968 ushr v1.4s,v24.4s,#20 969 eor w20,w20,w7 970 ushr v5.4s,v25.4s,#20 971 eor w21,w21,w8 972 ushr v9.4s,v26.4s,#20 973 ror w17,w17,#24 974 ushr v13.4s,v27.4s,#20 975 ror w19,w19,#24 976 ushr v17.4s,v28.4s,#20 977 ror w20,w20,#24 978 ushr v21.4s,v29.4s,#20 979 ror w21,w21,#24 980 sli v1.4s,v24.4s,#12 981 add w13,w13,w17 982 sli v5.4s,v25.4s,#12 983 add w14,w14,w19 984 sli v9.4s,v26.4s,#12 985 add 
w15,w15,w20 986 sli v13.4s,v27.4s,#12 987 add w16,w16,w21 988 sli v17.4s,v28.4s,#12 989 eor w9,w9,w13 990 sli v21.4s,v29.4s,#12 991 eor w10,w10,w14 992 add v0.4s,v0.4s,v1.4s 993 eor w11,w11,w15 994 add v4.4s,v4.4s,v5.4s 995 eor w12,w12,w16 996 add v8.4s,v8.4s,v9.4s 997 ror w9,w9,#25 998 add v12.4s,v12.4s,v13.4s 999 ror w10,w10,#25 1000 add v16.4s,v16.4s,v17.4s 1001 ror w11,w11,#25 1002 add v20.4s,v20.4s,v21.4s 1003 ror w12,w12,#25 1004 eor v24.16b,v3.16b,v0.16b 1005 add w5,w5,w10 1006 eor v25.16b,v7.16b,v4.16b 1007 add w6,w6,w11 1008 eor v26.16b,v11.16b,v8.16b 1009 add w7,w7,w12 1010 eor v27.16b,v15.16b,v12.16b 1011 add w8,w8,w9 1012 eor v28.16b,v19.16b,v16.16b 1013 eor w21,w21,w5 1014 eor v29.16b,v23.16b,v20.16b 1015 eor w17,w17,w6 1016 ushr v3.4s,v24.4s,#24 1017 eor w19,w19,w7 1018 ushr v7.4s,v25.4s,#24 1019 eor w20,w20,w8 1020 ushr v11.4s,v26.4s,#24 1021 ror w21,w21,#16 1022 ushr v15.4s,v27.4s,#24 1023 ror w17,w17,#16 1024 ushr v19.4s,v28.4s,#24 1025 ror w19,w19,#16 1026 ushr v23.4s,v29.4s,#24 1027 ror w20,w20,#16 1028 sli v3.4s,v24.4s,#8 1029 add w15,w15,w21 1030 sli v7.4s,v25.4s,#8 1031 add w16,w16,w17 1032 sli v11.4s,v26.4s,#8 1033 add w13,w13,w19 1034 sli v15.4s,v27.4s,#8 1035 add w14,w14,w20 1036 sli v19.4s,v28.4s,#8 1037 eor w10,w10,w15 1038 sli v23.4s,v29.4s,#8 1039 eor w11,w11,w16 1040 add v2.4s,v2.4s,v3.4s 1041 eor w12,w12,w13 1042 add v6.4s,v6.4s,v7.4s 1043 eor w9,w9,w14 1044 add v10.4s,v10.4s,v11.4s 1045 ror w10,w10,#20 1046 add v14.4s,v14.4s,v15.4s 1047 ror w11,w11,#20 1048 add v18.4s,v18.4s,v19.4s 1049 ror w12,w12,#20 1050 add v22.4s,v22.4s,v23.4s 1051 ror w9,w9,#20 1052 eor v24.16b,v1.16b,v2.16b 1053 add w5,w5,w10 1054 eor v25.16b,v5.16b,v6.16b 1055 add w6,w6,w11 1056 eor v26.16b,v9.16b,v10.16b 1057 add w7,w7,w12 1058 eor v27.16b,v13.16b,v14.16b 1059 add w8,w8,w9 1060 eor v28.16b,v17.16b,v18.16b 1061 eor w21,w21,w5 1062 eor v29.16b,v21.16b,v22.16b 1063 eor w17,w17,w6 1064 ushr v1.4s,v24.4s,#25 1065 eor w19,w19,w7 1066 ushr v5.4s,v25.4s,#25 1067 eor 
w20,w20,w8 1068 ushr v9.4s,v26.4s,#25 1069 ror w21,w21,#24 1070 ushr v13.4s,v27.4s,#25 1071 ror w17,w17,#24 1072 ushr v17.4s,v28.4s,#25 1073 ror w19,w19,#24 1074 ushr v21.4s,v29.4s,#25 1075 ror w20,w20,#24 1076 sli v1.4s,v24.4s,#7 1077 add w15,w15,w21 1078 sli v5.4s,v25.4s,#7 1079 add w16,w16,w17 1080 sli v9.4s,v26.4s,#7 1081 add w13,w13,w19 1082 sli v13.4s,v27.4s,#7 1083 add w14,w14,w20 1084 sli v17.4s,v28.4s,#7 1085 eor w10,w10,w15 1086 sli v21.4s,v29.4s,#7 1087 eor w11,w11,w16 1088 ext v2.16b,v2.16b,v2.16b,#8 1089 eor w12,w12,w13 1090 ext v6.16b,v6.16b,v6.16b,#8 1091 eor w9,w9,w14 1092 ext v10.16b,v10.16b,v10.16b,#8 1093 ror w10,w10,#25 1094 ext v14.16b,v14.16b,v14.16b,#8 1095 ror w11,w11,#25 1096 ext v18.16b,v18.16b,v18.16b,#8 1097 ror w12,w12,#25 1098 ext v22.16b,v22.16b,v22.16b,#8 1099 ror w9,w9,#25 1100 ext v3.16b,v3.16b,v3.16b,#12 1101 ext v7.16b,v7.16b,v7.16b,#12 1102 ext v11.16b,v11.16b,v11.16b,#12 1103 ext v15.16b,v15.16b,v15.16b,#12 1104 ext v19.16b,v19.16b,v19.16b,#12 1105 ext v23.16b,v23.16b,v23.16b,#12 1106 ext v1.16b,v1.16b,v1.16b,#4 1107 ext v5.16b,v5.16b,v5.16b,#4 1108 ext v9.16b,v9.16b,v9.16b,#4 1109 ext v13.16b,v13.16b,v13.16b,#4 1110 ext v17.16b,v17.16b,v17.16b,#4 1111 ext v21.16b,v21.16b,v21.16b,#4 1112 add v0.4s,v0.4s,v1.4s 1113 add w5,w5,w9 1114 add v4.4s,v4.4s,v5.4s 1115 add w6,w6,w10 1116 add v8.4s,v8.4s,v9.4s 1117 add w7,w7,w11 1118 add v12.4s,v12.4s,v13.4s 1119 add w8,w8,w12 1120 add v16.4s,v16.4s,v17.4s 1121 eor w17,w17,w5 1122 add v20.4s,v20.4s,v21.4s 1123 eor w19,w19,w6 1124 eor v3.16b,v3.16b,v0.16b 1125 eor w20,w20,w7 1126 eor v7.16b,v7.16b,v4.16b 1127 eor w21,w21,w8 1128 eor v11.16b,v11.16b,v8.16b 1129 ror w17,w17,#16 1130 eor v15.16b,v15.16b,v12.16b 1131 ror w19,w19,#16 1132 eor v19.16b,v19.16b,v16.16b 1133 ror w20,w20,#16 1134 eor v23.16b,v23.16b,v20.16b 1135 ror w21,w21,#16 1136 rev32 v3.8h,v3.8h 1137 add w13,w13,w17 1138 rev32 v7.8h,v7.8h 1139 add w14,w14,w19 1140 rev32 v11.8h,v11.8h 1141 add w15,w15,w20 1142 rev32 v15.8h,v15.8h 
1143 add w16,w16,w21 1144 rev32 v19.8h,v19.8h 1145 eor w9,w9,w13 1146 rev32 v23.8h,v23.8h 1147 eor w10,w10,w14 1148 add v2.4s,v2.4s,v3.4s 1149 eor w11,w11,w15 1150 add v6.4s,v6.4s,v7.4s 1151 eor w12,w12,w16 1152 add v10.4s,v10.4s,v11.4s 1153 ror w9,w9,#20 1154 add v14.4s,v14.4s,v15.4s 1155 ror w10,w10,#20 1156 add v18.4s,v18.4s,v19.4s 1157 ror w11,w11,#20 1158 add v22.4s,v22.4s,v23.4s 1159 ror w12,w12,#20 1160 eor v24.16b,v1.16b,v2.16b 1161 add w5,w5,w9 1162 eor v25.16b,v5.16b,v6.16b 1163 add w6,w6,w10 1164 eor v26.16b,v9.16b,v10.16b 1165 add w7,w7,w11 1166 eor v27.16b,v13.16b,v14.16b 1167 add w8,w8,w12 1168 eor v28.16b,v17.16b,v18.16b 1169 eor w17,w17,w5 1170 eor v29.16b,v21.16b,v22.16b 1171 eor w19,w19,w6 1172 ushr v1.4s,v24.4s,#20 1173 eor w20,w20,w7 1174 ushr v5.4s,v25.4s,#20 1175 eor w21,w21,w8 1176 ushr v9.4s,v26.4s,#20 1177 ror w17,w17,#24 1178 ushr v13.4s,v27.4s,#20 1179 ror w19,w19,#24 1180 ushr v17.4s,v28.4s,#20 1181 ror w20,w20,#24 1182 ushr v21.4s,v29.4s,#20 1183 ror w21,w21,#24 1184 sli v1.4s,v24.4s,#12 1185 add w13,w13,w17 1186 sli v5.4s,v25.4s,#12 1187 add w14,w14,w19 1188 sli v9.4s,v26.4s,#12 1189 add w15,w15,w20 1190 sli v13.4s,v27.4s,#12 1191 add w16,w16,w21 1192 sli v17.4s,v28.4s,#12 1193 eor w9,w9,w13 1194 sli v21.4s,v29.4s,#12 1195 eor w10,w10,w14 1196 add v0.4s,v0.4s,v1.4s 1197 eor w11,w11,w15 1198 add v4.4s,v4.4s,v5.4s 1199 eor w12,w12,w16 1200 add v8.4s,v8.4s,v9.4s 1201 ror w9,w9,#25 1202 add v12.4s,v12.4s,v13.4s 1203 ror w10,w10,#25 1204 add v16.4s,v16.4s,v17.4s 1205 ror w11,w11,#25 1206 add v20.4s,v20.4s,v21.4s 1207 ror w12,w12,#25 1208 eor v24.16b,v3.16b,v0.16b 1209 add w5,w5,w10 1210 eor v25.16b,v7.16b,v4.16b 1211 add w6,w6,w11 1212 eor v26.16b,v11.16b,v8.16b 1213 add w7,w7,w12 1214 eor v27.16b,v15.16b,v12.16b 1215 add w8,w8,w9 1216 eor v28.16b,v19.16b,v16.16b 1217 eor w21,w21,w5 1218 eor v29.16b,v23.16b,v20.16b 1219 eor w17,w17,w6 1220 ushr v3.4s,v24.4s,#24 1221 eor w19,w19,w7 1222 ushr v7.4s,v25.4s,#24 1223 eor w20,w20,w8 1224 ushr 
v11.4s,v26.4s,#24 1225 ror w21,w21,#16 1226 ushr v15.4s,v27.4s,#24 1227 ror w17,w17,#16 1228 ushr v19.4s,v28.4s,#24 1229 ror w19,w19,#16 1230 ushr v23.4s,v29.4s,#24 1231 ror w20,w20,#16 1232 sli v3.4s,v24.4s,#8 1233 add w15,w15,w21 1234 sli v7.4s,v25.4s,#8 1235 add w16,w16,w17 1236 sli v11.4s,v26.4s,#8 1237 add w13,w13,w19 1238 sli v15.4s,v27.4s,#8 1239 add w14,w14,w20 1240 sli v19.4s,v28.4s,#8 1241 eor w10,w10,w15 1242 sli v23.4s,v29.4s,#8 1243 eor w11,w11,w16 1244 add v2.4s,v2.4s,v3.4s 1245 eor w12,w12,w13 1246 add v6.4s,v6.4s,v7.4s 1247 eor w9,w9,w14 1248 add v10.4s,v10.4s,v11.4s 1249 ror w10,w10,#20 1250 add v14.4s,v14.4s,v15.4s 1251 ror w11,w11,#20 1252 add v18.4s,v18.4s,v19.4s 1253 ror w12,w12,#20 1254 add v22.4s,v22.4s,v23.4s 1255 ror w9,w9,#20 1256 eor v24.16b,v1.16b,v2.16b 1257 add w5,w5,w10 1258 eor v25.16b,v5.16b,v6.16b 1259 add w6,w6,w11 1260 eor v26.16b,v9.16b,v10.16b 1261 add w7,w7,w12 1262 eor v27.16b,v13.16b,v14.16b 1263 add w8,w8,w9 1264 eor v28.16b,v17.16b,v18.16b 1265 eor w21,w21,w5 1266 eor v29.16b,v21.16b,v22.16b 1267 eor w17,w17,w6 1268 ushr v1.4s,v24.4s,#25 1269 eor w19,w19,w7 1270 ushr v5.4s,v25.4s,#25 1271 eor w20,w20,w8 1272 ushr v9.4s,v26.4s,#25 1273 ror w21,w21,#24 1274 ushr v13.4s,v27.4s,#25 1275 ror w17,w17,#24 1276 ushr v17.4s,v28.4s,#25 1277 ror w19,w19,#24 1278 ushr v21.4s,v29.4s,#25 1279 ror w20,w20,#24 1280 sli v1.4s,v24.4s,#7 1281 add w15,w15,w21 1282 sli v5.4s,v25.4s,#7 1283 add w16,w16,w17 1284 sli v9.4s,v26.4s,#7 1285 add w13,w13,w19 1286 sli v13.4s,v27.4s,#7 1287 add w14,w14,w20 1288 sli v17.4s,v28.4s,#7 1289 eor w10,w10,w15 1290 sli v21.4s,v29.4s,#7 1291 eor w11,w11,w16 1292 ext v2.16b,v2.16b,v2.16b,#8 1293 eor w12,w12,w13 1294 ext v6.16b,v6.16b,v6.16b,#8 1295 eor w9,w9,w14 1296 ext v10.16b,v10.16b,v10.16b,#8 1297 ror w10,w10,#25 1298 ext v14.16b,v14.16b,v14.16b,#8 1299 ror w11,w11,#25 1300 ext v18.16b,v18.16b,v18.16b,#8 1301 ror w12,w12,#25 1302 ext v22.16b,v22.16b,v22.16b,#8 1303 ror w9,w9,#25 1304 ext 
v3.16b,v3.16b,v3.16b,#4 1305 ext v7.16b,v7.16b,v7.16b,#4 1306 ext v11.16b,v11.16b,v11.16b,#4 1307 ext v15.16b,v15.16b,v15.16b,#4 1308 ext v19.16b,v19.16b,v19.16b,#4 1309 ext v23.16b,v23.16b,v23.16b,#4 1310 ext v1.16b,v1.16b,v1.16b,#12 1311 ext v5.16b,v5.16b,v5.16b,#12 1312 ext v9.16b,v9.16b,v9.16b,#12 1313 ext v13.16b,v13.16b,v13.16b,#12 1314 ext v17.16b,v17.16b,v17.16b,#12 1315 ext v21.16b,v21.16b,v21.16b,#12 1316 cbnz x4,.Loop_upper_neon 1317 1318 add w5,w5,w22 // accumulate key block 1319 add x6,x6,x22,lsr#32 1320 add w7,w7,w23 1321 add x8,x8,x23,lsr#32 1322 add w9,w9,w24 1323 add x10,x10,x24,lsr#32 1324 add w11,w11,w25 1325 add x12,x12,x25,lsr#32 1326 add w13,w13,w26 1327 add x14,x14,x26,lsr#32 1328 add w15,w15,w27 1329 add x16,x16,x27,lsr#32 1330 add w17,w17,w28 1331 add x19,x19,x28,lsr#32 1332 add w20,w20,w30 1333 add x21,x21,x30,lsr#32 1334 1335 add x5,x5,x6,lsl#32 // pack 1336 add x7,x7,x8,lsl#32 1337 ldp x6,x8,[x1,#0] // load input 1338 add x9,x9,x10,lsl#32 1339 add x11,x11,x12,lsl#32 1340 ldp x10,x12,[x1,#16] 1341 add x13,x13,x14,lsl#32 1342 add x15,x15,x16,lsl#32 1343 ldp x14,x16,[x1,#32] 1344 add x17,x17,x19,lsl#32 1345 add x20,x20,x21,lsl#32 1346 ldp x19,x21,[x1,#48] 1347 add x1,x1,#64 1348#ifdef __ARMEB__ 1349 rev x5,x5 1350 rev x7,x7 1351 rev x9,x9 1352 rev x11,x11 1353 rev x13,x13 1354 rev x15,x15 1355 rev x17,x17 1356 rev x20,x20 1357#endif 1358 eor x5,x5,x6 1359 eor x7,x7,x8 1360 eor x9,x9,x10 1361 eor x11,x11,x12 1362 eor x13,x13,x14 1363 eor x15,x15,x16 1364 eor x17,x17,x19 1365 eor x20,x20,x21 1366 1367 stp x5,x7,[x0,#0] // store output 1368 add x28,x28,#1 // increment counter 1369 mov w5,w22 // unpack key block 1370 lsr x6,x22,#32 1371 stp x9,x11,[x0,#16] 1372 mov w7,w23 1373 lsr x8,x23,#32 1374 stp x13,x15,[x0,#32] 1375 mov w9,w24 1376 lsr x10,x24,#32 1377 stp x17,x20,[x0,#48] 1378 add x0,x0,#64 1379 mov w11,w25 1380 lsr x12,x25,#32 1381 mov w13,w26 1382 lsr x14,x26,#32 1383 mov w15,w27 1384 lsr x16,x27,#32 1385 mov w17,w28 1386 lsr 
x19,x28,#32 1387 mov w20,w30 1388 lsr x21,x30,#32 1389 1390 mov x4,#5 1391.Loop_lower_neon: 1392 sub x4,x4,#1 1393 add v0.4s,v0.4s,v1.4s 1394 add w5,w5,w9 1395 add v4.4s,v4.4s,v5.4s 1396 add w6,w6,w10 1397 add v8.4s,v8.4s,v9.4s 1398 add w7,w7,w11 1399 add v12.4s,v12.4s,v13.4s 1400 add w8,w8,w12 1401 add v16.4s,v16.4s,v17.4s 1402 eor w17,w17,w5 1403 add v20.4s,v20.4s,v21.4s 1404 eor w19,w19,w6 1405 eor v3.16b,v3.16b,v0.16b 1406 eor w20,w20,w7 1407 eor v7.16b,v7.16b,v4.16b 1408 eor w21,w21,w8 1409 eor v11.16b,v11.16b,v8.16b 1410 ror w17,w17,#16 1411 eor v15.16b,v15.16b,v12.16b 1412 ror w19,w19,#16 1413 eor v19.16b,v19.16b,v16.16b 1414 ror w20,w20,#16 1415 eor v23.16b,v23.16b,v20.16b 1416 ror w21,w21,#16 1417 rev32 v3.8h,v3.8h 1418 add w13,w13,w17 1419 rev32 v7.8h,v7.8h 1420 add w14,w14,w19 1421 rev32 v11.8h,v11.8h 1422 add w15,w15,w20 1423 rev32 v15.8h,v15.8h 1424 add w16,w16,w21 1425 rev32 v19.8h,v19.8h 1426 eor w9,w9,w13 1427 rev32 v23.8h,v23.8h 1428 eor w10,w10,w14 1429 add v2.4s,v2.4s,v3.4s 1430 eor w11,w11,w15 1431 add v6.4s,v6.4s,v7.4s 1432 eor w12,w12,w16 1433 add v10.4s,v10.4s,v11.4s 1434 ror w9,w9,#20 1435 add v14.4s,v14.4s,v15.4s 1436 ror w10,w10,#20 1437 add v18.4s,v18.4s,v19.4s 1438 ror w11,w11,#20 1439 add v22.4s,v22.4s,v23.4s 1440 ror w12,w12,#20 1441 eor v24.16b,v1.16b,v2.16b 1442 add w5,w5,w9 1443 eor v25.16b,v5.16b,v6.16b 1444 add w6,w6,w10 1445 eor v26.16b,v9.16b,v10.16b 1446 add w7,w7,w11 1447 eor v27.16b,v13.16b,v14.16b 1448 add w8,w8,w12 1449 eor v28.16b,v17.16b,v18.16b 1450 eor w17,w17,w5 1451 eor v29.16b,v21.16b,v22.16b 1452 eor w19,w19,w6 1453 ushr v1.4s,v24.4s,#20 1454 eor w20,w20,w7 1455 ushr v5.4s,v25.4s,#20 1456 eor w21,w21,w8 1457 ushr v9.4s,v26.4s,#20 1458 ror w17,w17,#24 1459 ushr v13.4s,v27.4s,#20 1460 ror w19,w19,#24 1461 ushr v17.4s,v28.4s,#20 1462 ror w20,w20,#24 1463 ushr v21.4s,v29.4s,#20 1464 ror w21,w21,#24 1465 sli v1.4s,v24.4s,#12 1466 add w13,w13,w17 1467 sli v5.4s,v25.4s,#12 1468 add w14,w14,w19 1469 sli v9.4s,v26.4s,#12 
1470 add w15,w15,w20 1471 sli v13.4s,v27.4s,#12 1472 add w16,w16,w21 1473 sli v17.4s,v28.4s,#12 1474 eor w9,w9,w13 1475 sli v21.4s,v29.4s,#12 1476 eor w10,w10,w14 1477 add v0.4s,v0.4s,v1.4s 1478 eor w11,w11,w15 1479 add v4.4s,v4.4s,v5.4s 1480 eor w12,w12,w16 1481 add v8.4s,v8.4s,v9.4s 1482 ror w9,w9,#25 1483 add v12.4s,v12.4s,v13.4s 1484 ror w10,w10,#25 1485 add v16.4s,v16.4s,v17.4s 1486 ror w11,w11,#25 1487 add v20.4s,v20.4s,v21.4s 1488 ror w12,w12,#25 1489 eor v24.16b,v3.16b,v0.16b 1490 add w5,w5,w10 1491 eor v25.16b,v7.16b,v4.16b 1492 add w6,w6,w11 1493 eor v26.16b,v11.16b,v8.16b 1494 add w7,w7,w12 1495 eor v27.16b,v15.16b,v12.16b 1496 add w8,w8,w9 1497 eor v28.16b,v19.16b,v16.16b 1498 eor w21,w21,w5 1499 eor v29.16b,v23.16b,v20.16b 1500 eor w17,w17,w6 1501 ushr v3.4s,v24.4s,#24 1502 eor w19,w19,w7 1503 ushr v7.4s,v25.4s,#24 1504 eor w20,w20,w8 1505 ushr v11.4s,v26.4s,#24 1506 ror w21,w21,#16 1507 ushr v15.4s,v27.4s,#24 1508 ror w17,w17,#16 1509 ushr v19.4s,v28.4s,#24 1510 ror w19,w19,#16 1511 ushr v23.4s,v29.4s,#24 1512 ror w20,w20,#16 1513 sli v3.4s,v24.4s,#8 1514 add w15,w15,w21 1515 sli v7.4s,v25.4s,#8 1516 add w16,w16,w17 1517 sli v11.4s,v26.4s,#8 1518 add w13,w13,w19 1519 sli v15.4s,v27.4s,#8 1520 add w14,w14,w20 1521 sli v19.4s,v28.4s,#8 1522 eor w10,w10,w15 1523 sli v23.4s,v29.4s,#8 1524 eor w11,w11,w16 1525 add v2.4s,v2.4s,v3.4s 1526 eor w12,w12,w13 1527 add v6.4s,v6.4s,v7.4s 1528 eor w9,w9,w14 1529 add v10.4s,v10.4s,v11.4s 1530 ror w10,w10,#20 1531 add v14.4s,v14.4s,v15.4s 1532 ror w11,w11,#20 1533 add v18.4s,v18.4s,v19.4s 1534 ror w12,w12,#20 1535 add v22.4s,v22.4s,v23.4s 1536 ror w9,w9,#20 1537 eor v24.16b,v1.16b,v2.16b 1538 add w5,w5,w10 1539 eor v25.16b,v5.16b,v6.16b 1540 add w6,w6,w11 1541 eor v26.16b,v9.16b,v10.16b 1542 add w7,w7,w12 1543 eor v27.16b,v13.16b,v14.16b 1544 add w8,w8,w9 1545 eor v28.16b,v17.16b,v18.16b 1546 eor w21,w21,w5 1547 eor v29.16b,v21.16b,v22.16b 1548 eor w17,w17,w6 1549 ushr v1.4s,v24.4s,#25 1550 eor w19,w19,w7 1551 ushr 
v5.4s,v25.4s,#25 1552 eor w20,w20,w8 1553 ushr v9.4s,v26.4s,#25 1554 ror w21,w21,#24 1555 ushr v13.4s,v27.4s,#25 1556 ror w17,w17,#24 1557 ushr v17.4s,v28.4s,#25 1558 ror w19,w19,#24 1559 ushr v21.4s,v29.4s,#25 1560 ror w20,w20,#24 1561 sli v1.4s,v24.4s,#7 1562 add w15,w15,w21 1563 sli v5.4s,v25.4s,#7 1564 add w16,w16,w17 1565 sli v9.4s,v26.4s,#7 1566 add w13,w13,w19 1567 sli v13.4s,v27.4s,#7 1568 add w14,w14,w20 1569 sli v17.4s,v28.4s,#7 1570 eor w10,w10,w15 1571 sli v21.4s,v29.4s,#7 1572 eor w11,w11,w16 1573 ext v2.16b,v2.16b,v2.16b,#8 1574 eor w12,w12,w13 1575 ext v6.16b,v6.16b,v6.16b,#8 1576 eor w9,w9,w14 1577 ext v10.16b,v10.16b,v10.16b,#8 1578 ror w10,w10,#25 1579 ext v14.16b,v14.16b,v14.16b,#8 1580 ror w11,w11,#25 1581 ext v18.16b,v18.16b,v18.16b,#8 1582 ror w12,w12,#25 1583 ext v22.16b,v22.16b,v22.16b,#8 1584 ror w9,w9,#25 1585 ext v3.16b,v3.16b,v3.16b,#12 1586 ext v7.16b,v7.16b,v7.16b,#12 1587 ext v11.16b,v11.16b,v11.16b,#12 1588 ext v15.16b,v15.16b,v15.16b,#12 1589 ext v19.16b,v19.16b,v19.16b,#12 1590 ext v23.16b,v23.16b,v23.16b,#12 1591 ext v1.16b,v1.16b,v1.16b,#4 1592 ext v5.16b,v5.16b,v5.16b,#4 1593 ext v9.16b,v9.16b,v9.16b,#4 1594 ext v13.16b,v13.16b,v13.16b,#4 1595 ext v17.16b,v17.16b,v17.16b,#4 1596 ext v21.16b,v21.16b,v21.16b,#4 1597 add v0.4s,v0.4s,v1.4s 1598 add w5,w5,w9 1599 add v4.4s,v4.4s,v5.4s 1600 add w6,w6,w10 1601 add v8.4s,v8.4s,v9.4s 1602 add w7,w7,w11 1603 add v12.4s,v12.4s,v13.4s 1604 add w8,w8,w12 1605 add v16.4s,v16.4s,v17.4s 1606 eor w17,w17,w5 1607 add v20.4s,v20.4s,v21.4s 1608 eor w19,w19,w6 1609 eor v3.16b,v3.16b,v0.16b 1610 eor w20,w20,w7 1611 eor v7.16b,v7.16b,v4.16b 1612 eor w21,w21,w8 1613 eor v11.16b,v11.16b,v8.16b 1614 ror w17,w17,#16 1615 eor v15.16b,v15.16b,v12.16b 1616 ror w19,w19,#16 1617 eor v19.16b,v19.16b,v16.16b 1618 ror w20,w20,#16 1619 eor v23.16b,v23.16b,v20.16b 1620 ror w21,w21,#16 1621 rev32 v3.8h,v3.8h 1622 add w13,w13,w17 1623 rev32 v7.8h,v7.8h 1624 add w14,w14,w19 1625 rev32 v11.8h,v11.8h 1626 add 
w15,w15,w20 1627 rev32 v15.8h,v15.8h 1628 add w16,w16,w21 1629 rev32 v19.8h,v19.8h 1630 eor w9,w9,w13 1631 rev32 v23.8h,v23.8h 1632 eor w10,w10,w14 1633 add v2.4s,v2.4s,v3.4s 1634 eor w11,w11,w15 1635 add v6.4s,v6.4s,v7.4s 1636 eor w12,w12,w16 1637 add v10.4s,v10.4s,v11.4s 1638 ror w9,w9,#20 1639 add v14.4s,v14.4s,v15.4s 1640 ror w10,w10,#20 1641 add v18.4s,v18.4s,v19.4s 1642 ror w11,w11,#20 1643 add v22.4s,v22.4s,v23.4s 1644 ror w12,w12,#20 1645 eor v24.16b,v1.16b,v2.16b 1646 add w5,w5,w9 1647 eor v25.16b,v5.16b,v6.16b 1648 add w6,w6,w10 1649 eor v26.16b,v9.16b,v10.16b 1650 add w7,w7,w11 1651 eor v27.16b,v13.16b,v14.16b 1652 add w8,w8,w12 1653 eor v28.16b,v17.16b,v18.16b 1654 eor w17,w17,w5 1655 eor v29.16b,v21.16b,v22.16b 1656 eor w19,w19,w6 1657 ushr v1.4s,v24.4s,#20 1658 eor w20,w20,w7 1659 ushr v5.4s,v25.4s,#20 1660 eor w21,w21,w8 1661 ushr v9.4s,v26.4s,#20 1662 ror w17,w17,#24 1663 ushr v13.4s,v27.4s,#20 1664 ror w19,w19,#24 1665 ushr v17.4s,v28.4s,#20 1666 ror w20,w20,#24 1667 ushr v21.4s,v29.4s,#20 1668 ror w21,w21,#24 1669 sli v1.4s,v24.4s,#12 1670 add w13,w13,w17 1671 sli v5.4s,v25.4s,#12 1672 add w14,w14,w19 1673 sli v9.4s,v26.4s,#12 1674 add w15,w15,w20 1675 sli v13.4s,v27.4s,#12 1676 add w16,w16,w21 1677 sli v17.4s,v28.4s,#12 1678 eor w9,w9,w13 1679 sli v21.4s,v29.4s,#12 1680 eor w10,w10,w14 1681 add v0.4s,v0.4s,v1.4s 1682 eor w11,w11,w15 1683 add v4.4s,v4.4s,v5.4s 1684 eor w12,w12,w16 1685 add v8.4s,v8.4s,v9.4s 1686 ror w9,w9,#25 1687 add v12.4s,v12.4s,v13.4s 1688 ror w10,w10,#25 1689 add v16.4s,v16.4s,v17.4s 1690 ror w11,w11,#25 1691 add v20.4s,v20.4s,v21.4s 1692 ror w12,w12,#25 1693 eor v24.16b,v3.16b,v0.16b 1694 add w5,w5,w10 1695 eor v25.16b,v7.16b,v4.16b 1696 add w6,w6,w11 1697 eor v26.16b,v11.16b,v8.16b 1698 add w7,w7,w12 1699 eor v27.16b,v15.16b,v12.16b 1700 add w8,w8,w9 1701 eor v28.16b,v19.16b,v16.16b 1702 eor w21,w21,w5 1703 eor v29.16b,v23.16b,v20.16b 1704 eor w17,w17,w6 1705 ushr v3.4s,v24.4s,#24 1706 eor w19,w19,w7 1707 ushr 
v7.4s,v25.4s,#24 1708 eor w20,w20,w8 1709 ushr v11.4s,v26.4s,#24 1710 ror w21,w21,#16 1711 ushr v15.4s,v27.4s,#24 1712 ror w17,w17,#16 1713 ushr v19.4s,v28.4s,#24 1714 ror w19,w19,#16 1715 ushr v23.4s,v29.4s,#24 1716 ror w20,w20,#16 1717 sli v3.4s,v24.4s,#8 1718 add w15,w15,w21 1719 sli v7.4s,v25.4s,#8 1720 add w16,w16,w17 1721 sli v11.4s,v26.4s,#8 1722 add w13,w13,w19 1723 sli v15.4s,v27.4s,#8 1724 add w14,w14,w20 1725 sli v19.4s,v28.4s,#8 1726 eor w10,w10,w15 1727 sli v23.4s,v29.4s,#8 1728 eor w11,w11,w16 1729 add v2.4s,v2.4s,v3.4s 1730 eor w12,w12,w13 1731 add v6.4s,v6.4s,v7.4s 1732 eor w9,w9,w14 1733 add v10.4s,v10.4s,v11.4s 1734 ror w10,w10,#20 1735 add v14.4s,v14.4s,v15.4s 1736 ror w11,w11,#20 1737 add v18.4s,v18.4s,v19.4s 1738 ror w12,w12,#20 1739 add v22.4s,v22.4s,v23.4s 1740 ror w9,w9,#20 1741 eor v24.16b,v1.16b,v2.16b 1742 add w5,w5,w10 1743 eor v25.16b,v5.16b,v6.16b 1744 add w6,w6,w11 1745 eor v26.16b,v9.16b,v10.16b 1746 add w7,w7,w12 1747 eor v27.16b,v13.16b,v14.16b 1748 add w8,w8,w9 1749 eor v28.16b,v17.16b,v18.16b 1750 eor w21,w21,w5 1751 eor v29.16b,v21.16b,v22.16b 1752 eor w17,w17,w6 1753 ushr v1.4s,v24.4s,#25 1754 eor w19,w19,w7 1755 ushr v5.4s,v25.4s,#25 1756 eor w20,w20,w8 1757 ushr v9.4s,v26.4s,#25 1758 ror w21,w21,#24 1759 ushr v13.4s,v27.4s,#25 1760 ror w17,w17,#24 1761 ushr v17.4s,v28.4s,#25 1762 ror w19,w19,#24 1763 ushr v21.4s,v29.4s,#25 1764 ror w20,w20,#24 1765 sli v1.4s,v24.4s,#7 1766 add w15,w15,w21 1767 sli v5.4s,v25.4s,#7 1768 add w16,w16,w17 1769 sli v9.4s,v26.4s,#7 1770 add w13,w13,w19 1771 sli v13.4s,v27.4s,#7 1772 add w14,w14,w20 1773 sli v17.4s,v28.4s,#7 1774 eor w10,w10,w15 1775 sli v21.4s,v29.4s,#7 1776 eor w11,w11,w16 1777 ext v2.16b,v2.16b,v2.16b,#8 1778 eor w12,w12,w13 1779 ext v6.16b,v6.16b,v6.16b,#8 1780 eor w9,w9,w14 1781 ext v10.16b,v10.16b,v10.16b,#8 1782 ror w10,w10,#25 1783 ext v14.16b,v14.16b,v14.16b,#8 1784 ror w11,w11,#25 1785 ext v18.16b,v18.16b,v18.16b,#8 1786 ror w12,w12,#25 1787 ext v22.16b,v22.16b,v22.16b,#8 
1788 ror w9,w9,#25 1789 ext v3.16b,v3.16b,v3.16b,#4 1790 ext v7.16b,v7.16b,v7.16b,#4 1791 ext v11.16b,v11.16b,v11.16b,#4 1792 ext v15.16b,v15.16b,v15.16b,#4 1793 ext v19.16b,v19.16b,v19.16b,#4 1794 ext v23.16b,v23.16b,v23.16b,#4 1795 ext v1.16b,v1.16b,v1.16b,#12 1796 ext v5.16b,v5.16b,v5.16b,#12 1797 ext v9.16b,v9.16b,v9.16b,#12 1798 ext v13.16b,v13.16b,v13.16b,#12 1799 ext v17.16b,v17.16b,v17.16b,#12 1800 ext v21.16b,v21.16b,v21.16b,#12 1801 cbnz x4,.Loop_lower_neon 1802 1803 add w5,w5,w22 // accumulate key block 1804 ldp q24,q25,[sp,#0] 1805 add x6,x6,x22,lsr#32 1806 ldp q26,q27,[sp,#32] 1807 add w7,w7,w23 1808 ldp q28,q29,[sp,#64] 1809 add x8,x8,x23,lsr#32 1810 add v0.4s,v0.4s,v24.4s 1811 add w9,w9,w24 1812 add v4.4s,v4.4s,v24.4s 1813 add x10,x10,x24,lsr#32 1814 add v8.4s,v8.4s,v24.4s 1815 add w11,w11,w25 1816 add v12.4s,v12.4s,v24.4s 1817 add x12,x12,x25,lsr#32 1818 add v16.4s,v16.4s,v24.4s 1819 add w13,w13,w26 1820 add v20.4s,v20.4s,v24.4s 1821 add x14,x14,x26,lsr#32 1822 add v2.4s,v2.4s,v26.4s 1823 add w15,w15,w27 1824 add v6.4s,v6.4s,v26.4s 1825 add x16,x16,x27,lsr#32 1826 add v10.4s,v10.4s,v26.4s 1827 add w17,w17,w28 1828 add v14.4s,v14.4s,v26.4s 1829 add x19,x19,x28,lsr#32 1830 add v18.4s,v18.4s,v26.4s 1831 add w20,w20,w30 1832 add v22.4s,v22.4s,v26.4s 1833 add x21,x21,x30,lsr#32 1834 add v19.4s,v19.4s,v31.4s // +4 1835 add x5,x5,x6,lsl#32 // pack 1836 add v23.4s,v23.4s,v31.4s // +4 1837 add x7,x7,x8,lsl#32 1838 add v3.4s,v3.4s,v27.4s 1839 ldp x6,x8,[x1,#0] // load input 1840 add v7.4s,v7.4s,v28.4s 1841 add x9,x9,x10,lsl#32 1842 add v11.4s,v11.4s,v29.4s 1843 add x11,x11,x12,lsl#32 1844 add v15.4s,v15.4s,v30.4s 1845 ldp x10,x12,[x1,#16] 1846 add v19.4s,v19.4s,v27.4s 1847 add x13,x13,x14,lsl#32 1848 add v23.4s,v23.4s,v28.4s 1849 add x15,x15,x16,lsl#32 1850 add v1.4s,v1.4s,v25.4s 1851 ldp x14,x16,[x1,#32] 1852 add v5.4s,v5.4s,v25.4s 1853 add x17,x17,x19,lsl#32 1854 add v9.4s,v9.4s,v25.4s 1855 add x20,x20,x21,lsl#32 1856 add v13.4s,v13.4s,v25.4s 1857 ldp 
x19,x21,[x1,#48] 1858 add v17.4s,v17.4s,v25.4s 1859 add x1,x1,#64 1860 add v21.4s,v21.4s,v25.4s 1861 1862#ifdef __ARMEB__ 1863 rev x5,x5 1864 rev x7,x7 1865 rev x9,x9 1866 rev x11,x11 1867 rev x13,x13 1868 rev x15,x15 1869 rev x17,x17 1870 rev x20,x20 1871#endif 1872 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1873 eor x5,x5,x6 1874 eor x7,x7,x8 1875 eor x9,x9,x10 1876 eor x11,x11,x12 1877 eor x13,x13,x14 1878 eor v0.16b,v0.16b,v24.16b 1879 eor x15,x15,x16 1880 eor v1.16b,v1.16b,v25.16b 1881 eor x17,x17,x19 1882 eor v2.16b,v2.16b,v26.16b 1883 eor x20,x20,x21 1884 eor v3.16b,v3.16b,v27.16b 1885 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1886 1887 stp x5,x7,[x0,#0] // store output 1888 add x28,x28,#7 // increment counter 1889 stp x9,x11,[x0,#16] 1890 stp x13,x15,[x0,#32] 1891 stp x17,x20,[x0,#48] 1892 add x0,x0,#64 1893 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1894 1895 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1896 eor v4.16b,v4.16b,v24.16b 1897 eor v5.16b,v5.16b,v25.16b 1898 eor v6.16b,v6.16b,v26.16b 1899 eor v7.16b,v7.16b,v27.16b 1900 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1901 1902 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1903 eor v8.16b,v8.16b,v0.16b 1904 ldp q24,q25,[sp,#0] 1905 eor v9.16b,v9.16b,v1.16b 1906 ldp q26,q27,[sp,#32] 1907 eor v10.16b,v10.16b,v2.16b 1908 eor v11.16b,v11.16b,v3.16b 1909 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1910 1911 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1912 eor v12.16b,v12.16b,v4.16b 1913 eor v13.16b,v13.16b,v5.16b 1914 eor v14.16b,v14.16b,v6.16b 1915 eor v15.16b,v15.16b,v7.16b 1916 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1917 1918 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1919 eor v16.16b,v16.16b,v8.16b 1920 eor v17.16b,v17.16b,v9.16b 1921 eor v18.16b,v18.16b,v10.16b 1922 eor v19.16b,v19.16b,v11.16b 1923 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1924 1925 shl v0.4s,v31.4s,#1 // 4 -> 8 1926 eor v20.16b,v20.16b,v12.16b 1927 eor v21.16b,v21.16b,v13.16b 1928 eor v22.16b,v22.16b,v14.16b 
1929 eor v23.16b,v23.16b,v15.16b 1930 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1931 1932 add v27.4s,v27.4s,v0.4s // += 8 1933 add v28.4s,v28.4s,v0.4s 1934 add v29.4s,v29.4s,v0.4s 1935 add v30.4s,v30.4s,v0.4s 1936 1937 b.hs .Loop_outer_512_neon 1938 1939 adds x2,x2,#512 1940 ushr v0.4s,v31.4s,#2 // 4 -> 1 1941 1942 ldp d8,d9,[sp,#128+0] // meet ABI requirements 1943 ldp d10,d11,[sp,#128+16] 1944 ldp d12,d13,[sp,#128+32] 1945 ldp d14,d15,[sp,#128+48] 1946 1947 stp q24,q31,[sp,#0] // wipe off-load area 1948 stp q24,q31,[sp,#32] 1949 stp q24,q31,[sp,#64] 1950 1951 b.eq .Ldone_512_neon 1952 1953 cmp x2,#192 1954 sub v27.4s,v27.4s,v0.4s // -= 1 1955 sub v28.4s,v28.4s,v0.4s 1956 sub v29.4s,v29.4s,v0.4s 1957 add sp,sp,#128 1958 b.hs .Loop_outer_neon 1959 1960 eor v25.16b,v25.16b,v25.16b 1961 eor v26.16b,v26.16b,v26.16b 1962 eor v27.16b,v27.16b,v27.16b 1963 eor v28.16b,v28.16b,v28.16b 1964 eor v29.16b,v29.16b,v29.16b 1965 eor v30.16b,v30.16b,v30.16b 1966 b .Loop_outer 1967 1968.Ldone_512_neon: 1969 ldp x19,x20,[x29,#16] 1970 add sp,sp,#128+64 1971 ldp x21,x22,[x29,#32] 1972 ldp x23,x24,[x29,#48] 1973 ldp x25,x26,[x29,#64] 1974 ldp x27,x28,[x29,#80] 1975 ldp x29,x30,[sp],#96 1976.inst 0xd50323bf // autiasp 1977 ret 1978.size ChaCha20_512_neon,.-ChaCha20_512_neon 1979