// ChaCha20 for ARMv8 (AArch64). GNU as syntax, passed through the C
// preprocessor. One statement per line; "//" starts a comment.
//
// Symbols in this part of the file:
//   ChaCha20_ctr32     global entry point, scalar implementation
//   ChaCha20_neon      local, mixed scalar + Advanced SIMD implementation
//   ChaCha20_512_neon  local, entered through .L512_or_more_neon
#include "arm_asm.h"
#include "arm_arch.h"

.text


.hidden	OPENSSL_armcap_P

.align	5
// The 16 constant bytes spell "expand 32-byte k" in memory order.
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Block-counter step for the vector code; must directly follow .Lsigma
// because the NEON setup reaches it by post-incrementing past .Lsigma.
.Lone:
.long	1,0,0,0
// PC-relative offset to OPENSSL_armcap_P, so the capability word can be
// located without an absolute relocation in .text.
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
// ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>", NUL terminated.
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

//----------------------------------------------------------------------
// ChaCha20_ctr32
//   In:   x0 = output pointer
//         x1 = input pointer
//         x2 = length in bytes (zero returns immediately)
//         x3 = pointer to the 32-byte key
//         x4 = pointer to the 16-byte counter block
//   Presumably the C prototype is
//     void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
//                         size_t len, const unsigned int key[8],
//                         const unsigned int counter[4]);
//   TODO confirm against the C header.
//
//   Dispatch: lengths of 192 bytes or more go to ChaCha20_neon when the
//   ARMV7_NEON bit is set in OPENSSL_armcap_P; everything else runs the
//   scalar code below, one 64-byte block per outer iteration.
//
//   Frame: 96 bytes holding x29,x30 and x19-x28, plus 64 bytes of scratch
//   at sp that receive the final keystream block when the length is not a
//   multiple of 64. The scratch is zeroed again before returning.
//
//   Register roles in the scalar path:
//     x22,x23        sigma
//     x24-x27        key
//     x28,x30        counter block (x30 is free because lr is saved)
//     w5-w17,w19-w21 the 16 working state words (x18 is never used)
//     x4             round-loop counter, later the scratch base in the tail
//----------------------------------------------------------------------
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	x2,.Labort
	adr	x5,.LOPENSSL_armcap_P
	cmp	x2,#192
	b.lo	.Lshort
#ifdef __ILP32__
	ldrsw	x6,[x5]
#else
	ldr	x6,[x5]
#endif
	ldr	w17,[x6,x5]			// x5 + stored offset = &OPENSSL_armcap_P
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64			// 64-byte keystream scratch for the tail

	ldp	x22,x23,[x5]			// load sigma
	ldp	x24,x25,[x3]			// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]			// load counter
#ifdef __ARMEB__
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

.Loop_outer:
	mov	w5,w22				// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations of two rounds each
	// Flags set here stay live until the b.lo/b.hi after the rounds:
	// nothing in between uses a flag-setting instruction.
	subs	x2,x2,#64
.Loop:
	sub	x4,x4,#1
	// Column round: (5,9,13,17) (6,10,14,19) (7,11,15,20) (8,12,16,21).
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Diagonal round: (5,10,15,21) (6,11,16,17) (7,12,13,19) (8,9,14,20).
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	// The 64-bit adds on the odd words may carry into bit 32; that bit is
	// shifted out by the lsl #32 in the pack step.
	add	w5,w5,w22			// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail				// fewer than 64 bytes were left

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	.Loop_outer			// more input remains

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
.Labort:
	ret

.align	4
.Ltail:
	add	x2,x2,#64			// undo the subs: x2 = bytes left, 1..63
// Also entered from .Ltail_neon with x2 = bytes left and the scalar block
// still unpacked in w5-w21.
.Less_than_64:
	// Bias the pointers so one negative index counting up to zero walks
	// input, scratch and output together. x0 is one lower because the
	// index is incremented before the store.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	stp	x5,x7,[sp,#0]			// keystream block to scratch
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	stp	xzr,xzr,[sp,#0]			// wipe keystream from the stack
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

//----------------------------------------------------------------------
// ChaCha20_neon (local; reached by branch from ChaCha20_ctr32 with the
// same x0-x4 arguments)
//   Lengths of 512 bytes or more continue at .L512_or_more_neon after the
//   shared prologue. Otherwise each outer iteration produces 256 bytes:
//   one block in the general-purpose registers (same register roles as the
//   scalar routine) interleaved with three blocks in v0-v3, v4-v7 and
//   v16-v19. Output order is scalar block, then v0-v3, v4-v7, v16-v19.
//
//   Vector register roles:
//     v24            sigma
//     v25,v26        key
//     v27,v28,v29    counter rows for the three vector blocks
//                    (scalar counter +1, +2, +3)
//     v31            counter step, .Lone shifted left by 2, i.e. 4
//     v20-v23        scratch for rotates and for input data
//   Only v0-v7 and v16-v31 are written here, so no callee-saved vector
//   registers need to be preserved in this routine.
//----------------------------------------------------------------------
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64			// 64-byte keystream scratch for the tail

	ldp	x22,x23,[x5]			// load sigma
	ld1	{v24.4s},[x5],#16		// x5 now points at .Lone
	ldp	x24,x25,[x3]			// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]			// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]
#ifdef __ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2		// 1 -> 4

.Loop_outer_neon:
	mov	w5,w22				// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations of two rounds each
	// Flags set here stay live until the b.lo/b.hi after the rounds:
	// nothing in between uses a flag-setting instruction.
	subs	x2,x2,#256
	// Scalar and vector instructions are interleaved. Vector rotates are
	// rev32 on halfwords for 16, and ushr + sli pairs for 12, 8 and 7.
	// The ext instructions rotate rows between the two half-rounds.
.Loop_neon:
	sub	x4,x4,#1
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,.Loop_neon

	add	w5,w5,w22			// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	b.lo	.Ltail_neon			// fewer than 256 bytes were left

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// v0-v3 reused for input

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	b.hi	.Loop_outer_neon		// more input remains

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
	ret

// Fewer than 256 bytes remain. Whole 64-byte blocks are emitted in the
// order scalar, v0-v3, v4-v7; the first block that is only partly needed
// is spilled to the stack scratch and finished bytewise at .Last_neon.
// Each b.eq below consumes the flags of the preceding cmp x2,#64.
.Ltail_neon:
	add	x2,x2,#256			// undo the subs: x2 = bytes left
	cmp	x2,#64
	b.lo	.Less_than_64			// shared with ChaCha20_ctr32

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	b.eq	.Ldone_neon			// exactly 64 bytes were left
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	.Ldone_neon			// exactly 128 bytes were left
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	.Ldone_neon			// exactly 192 bytes were left
	sub	x2,x2,#64

	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
	b	.Last_neon

.Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
	b	.Last_neon
.Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
	b	.Last_neon

.align	4
// x2 = bytes left (1..63); keystream for them sits in the scratch at sp.
.Last_neon:
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

.Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]			// wipe keystream from the stack
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
// ChaCha20_512_neon: the prologue matches ChaCha20_neon. The label
// .L512_or_more_neon is also entered from ChaCha20_neon with the frame
// already built and x5 = .Lsigma.
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	ldp	x22,x23,[x5]			// load sigma
	ld1	{v24.4s},[x5],#16
	ldp	x24,x25,[x3]			// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]			// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]
#ifdef __ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
842*1dcdf01fSchristos ror x30,x30,#32 843*1dcdf01fSchristos#endif 844*1dcdf01fSchristos add v27.4s,v27.4s,v31.4s // += 1 845*1dcdf01fSchristos stp q24,q25,[sp,#0] // off-load key block, invariant part 846*1dcdf01fSchristos add v27.4s,v27.4s,v31.4s // not typo 847*1dcdf01fSchristos str q26,[sp,#32] 848*1dcdf01fSchristos add v28.4s,v27.4s,v31.4s 849*1dcdf01fSchristos add v29.4s,v28.4s,v31.4s 850*1dcdf01fSchristos add v30.4s,v29.4s,v31.4s 851*1dcdf01fSchristos shl v31.4s,v31.4s,#2 // 1 -> 4 852*1dcdf01fSchristos 853*1dcdf01fSchristos stp d8,d9,[sp,#128+0] // meet ABI requirements 854*1dcdf01fSchristos stp d10,d11,[sp,#128+16] 855*1dcdf01fSchristos stp d12,d13,[sp,#128+32] 856*1dcdf01fSchristos stp d14,d15,[sp,#128+48] 857*1dcdf01fSchristos 858*1dcdf01fSchristos sub x2,x2,#512 // not typo 859*1dcdf01fSchristos 860*1dcdf01fSchristos.Loop_outer_512_neon: 861*1dcdf01fSchristos mov v0.16b,v24.16b 862*1dcdf01fSchristos mov v4.16b,v24.16b 863*1dcdf01fSchristos mov v8.16b,v24.16b 864*1dcdf01fSchristos mov v12.16b,v24.16b 865*1dcdf01fSchristos mov v16.16b,v24.16b 866*1dcdf01fSchristos mov v20.16b,v24.16b 867*1dcdf01fSchristos mov v1.16b,v25.16b 868*1dcdf01fSchristos mov w5,w22 // unpack key block 869*1dcdf01fSchristos mov v5.16b,v25.16b 870*1dcdf01fSchristos lsr x6,x22,#32 871*1dcdf01fSchristos mov v9.16b,v25.16b 872*1dcdf01fSchristos mov w7,w23 873*1dcdf01fSchristos mov v13.16b,v25.16b 874*1dcdf01fSchristos lsr x8,x23,#32 875*1dcdf01fSchristos mov v17.16b,v25.16b 876*1dcdf01fSchristos mov w9,w24 877*1dcdf01fSchristos mov v21.16b,v25.16b 878*1dcdf01fSchristos lsr x10,x24,#32 879*1dcdf01fSchristos mov v3.16b,v27.16b 880*1dcdf01fSchristos mov w11,w25 881*1dcdf01fSchristos mov v7.16b,v28.16b 882*1dcdf01fSchristos lsr x12,x25,#32 883*1dcdf01fSchristos mov v11.16b,v29.16b 884*1dcdf01fSchristos mov w13,w26 885*1dcdf01fSchristos mov v15.16b,v30.16b 886*1dcdf01fSchristos lsr x14,x26,#32 887*1dcdf01fSchristos mov v2.16b,v26.16b 888*1dcdf01fSchristos mov w15,w27 889*1dcdf01fSchristos 
mov v6.16b,v26.16b 890*1dcdf01fSchristos lsr x16,x27,#32 891*1dcdf01fSchristos add v19.4s,v3.4s,v31.4s // +4 892*1dcdf01fSchristos mov w17,w28 893*1dcdf01fSchristos add v23.4s,v7.4s,v31.4s // +4 894*1dcdf01fSchristos lsr x19,x28,#32 895*1dcdf01fSchristos mov v10.16b,v26.16b 896*1dcdf01fSchristos mov w20,w30 897*1dcdf01fSchristos mov v14.16b,v26.16b 898*1dcdf01fSchristos lsr x21,x30,#32 899*1dcdf01fSchristos mov v18.16b,v26.16b 900*1dcdf01fSchristos stp q27,q28,[sp,#48] // off-load key block, variable part 901*1dcdf01fSchristos mov v22.16b,v26.16b 902*1dcdf01fSchristos str q29,[sp,#80] 903*1dcdf01fSchristos 904*1dcdf01fSchristos mov x4,#5 905*1dcdf01fSchristos subs x2,x2,#512 906*1dcdf01fSchristos.Loop_upper_neon: 907*1dcdf01fSchristos sub x4,x4,#1 908*1dcdf01fSchristos add v0.4s,v0.4s,v1.4s 909*1dcdf01fSchristos add w5,w5,w9 910*1dcdf01fSchristos add v4.4s,v4.4s,v5.4s 911*1dcdf01fSchristos add w6,w6,w10 912*1dcdf01fSchristos add v8.4s,v8.4s,v9.4s 913*1dcdf01fSchristos add w7,w7,w11 914*1dcdf01fSchristos add v12.4s,v12.4s,v13.4s 915*1dcdf01fSchristos add w8,w8,w12 916*1dcdf01fSchristos add v16.4s,v16.4s,v17.4s 917*1dcdf01fSchristos eor w17,w17,w5 918*1dcdf01fSchristos add v20.4s,v20.4s,v21.4s 919*1dcdf01fSchristos eor w19,w19,w6 920*1dcdf01fSchristos eor v3.16b,v3.16b,v0.16b 921*1dcdf01fSchristos eor w20,w20,w7 922*1dcdf01fSchristos eor v7.16b,v7.16b,v4.16b 923*1dcdf01fSchristos eor w21,w21,w8 924*1dcdf01fSchristos eor v11.16b,v11.16b,v8.16b 925*1dcdf01fSchristos ror w17,w17,#16 926*1dcdf01fSchristos eor v15.16b,v15.16b,v12.16b 927*1dcdf01fSchristos ror w19,w19,#16 928*1dcdf01fSchristos eor v19.16b,v19.16b,v16.16b 929*1dcdf01fSchristos ror w20,w20,#16 930*1dcdf01fSchristos eor v23.16b,v23.16b,v20.16b 931*1dcdf01fSchristos ror w21,w21,#16 932*1dcdf01fSchristos rev32 v3.8h,v3.8h 933*1dcdf01fSchristos add w13,w13,w17 934*1dcdf01fSchristos rev32 v7.8h,v7.8h 935*1dcdf01fSchristos add w14,w14,w19 936*1dcdf01fSchristos rev32 v11.8h,v11.8h 937*1dcdf01fSchristos add 
w15,w15,w20 938*1dcdf01fSchristos rev32 v15.8h,v15.8h 939*1dcdf01fSchristos add w16,w16,w21 940*1dcdf01fSchristos rev32 v19.8h,v19.8h 941*1dcdf01fSchristos eor w9,w9,w13 942*1dcdf01fSchristos rev32 v23.8h,v23.8h 943*1dcdf01fSchristos eor w10,w10,w14 944*1dcdf01fSchristos add v2.4s,v2.4s,v3.4s 945*1dcdf01fSchristos eor w11,w11,w15 946*1dcdf01fSchristos add v6.4s,v6.4s,v7.4s 947*1dcdf01fSchristos eor w12,w12,w16 948*1dcdf01fSchristos add v10.4s,v10.4s,v11.4s 949*1dcdf01fSchristos ror w9,w9,#20 950*1dcdf01fSchristos add v14.4s,v14.4s,v15.4s 951*1dcdf01fSchristos ror w10,w10,#20 952*1dcdf01fSchristos add v18.4s,v18.4s,v19.4s 953*1dcdf01fSchristos ror w11,w11,#20 954*1dcdf01fSchristos add v22.4s,v22.4s,v23.4s 955*1dcdf01fSchristos ror w12,w12,#20 956*1dcdf01fSchristos eor v24.16b,v1.16b,v2.16b 957*1dcdf01fSchristos add w5,w5,w9 958*1dcdf01fSchristos eor v25.16b,v5.16b,v6.16b 959*1dcdf01fSchristos add w6,w6,w10 960*1dcdf01fSchristos eor v26.16b,v9.16b,v10.16b 961*1dcdf01fSchristos add w7,w7,w11 962*1dcdf01fSchristos eor v27.16b,v13.16b,v14.16b 963*1dcdf01fSchristos add w8,w8,w12 964*1dcdf01fSchristos eor v28.16b,v17.16b,v18.16b 965*1dcdf01fSchristos eor w17,w17,w5 966*1dcdf01fSchristos eor v29.16b,v21.16b,v22.16b 967*1dcdf01fSchristos eor w19,w19,w6 968*1dcdf01fSchristos ushr v1.4s,v24.4s,#20 969*1dcdf01fSchristos eor w20,w20,w7 970*1dcdf01fSchristos ushr v5.4s,v25.4s,#20 971*1dcdf01fSchristos eor w21,w21,w8 972*1dcdf01fSchristos ushr v9.4s,v26.4s,#20 973*1dcdf01fSchristos ror w17,w17,#24 974*1dcdf01fSchristos ushr v13.4s,v27.4s,#20 975*1dcdf01fSchristos ror w19,w19,#24 976*1dcdf01fSchristos ushr v17.4s,v28.4s,#20 977*1dcdf01fSchristos ror w20,w20,#24 978*1dcdf01fSchristos ushr v21.4s,v29.4s,#20 979*1dcdf01fSchristos ror w21,w21,#24 980*1dcdf01fSchristos sli v1.4s,v24.4s,#12 981*1dcdf01fSchristos add w13,w13,w17 982*1dcdf01fSchristos sli v5.4s,v25.4s,#12 983*1dcdf01fSchristos add w14,w14,w19 984*1dcdf01fSchristos sli v9.4s,v26.4s,#12 985*1dcdf01fSchristos add w15,w15,w20 
986*1dcdf01fSchristos sli v13.4s,v27.4s,#12 987*1dcdf01fSchristos add w16,w16,w21 988*1dcdf01fSchristos sli v17.4s,v28.4s,#12 989*1dcdf01fSchristos eor w9,w9,w13 990*1dcdf01fSchristos sli v21.4s,v29.4s,#12 991*1dcdf01fSchristos eor w10,w10,w14 992*1dcdf01fSchristos add v0.4s,v0.4s,v1.4s 993*1dcdf01fSchristos eor w11,w11,w15 994*1dcdf01fSchristos add v4.4s,v4.4s,v5.4s 995*1dcdf01fSchristos eor w12,w12,w16 996*1dcdf01fSchristos add v8.4s,v8.4s,v9.4s 997*1dcdf01fSchristos ror w9,w9,#25 998*1dcdf01fSchristos add v12.4s,v12.4s,v13.4s 999*1dcdf01fSchristos ror w10,w10,#25 1000*1dcdf01fSchristos add v16.4s,v16.4s,v17.4s 1001*1dcdf01fSchristos ror w11,w11,#25 1002*1dcdf01fSchristos add v20.4s,v20.4s,v21.4s 1003*1dcdf01fSchristos ror w12,w12,#25 1004*1dcdf01fSchristos eor v24.16b,v3.16b,v0.16b 1005*1dcdf01fSchristos add w5,w5,w10 1006*1dcdf01fSchristos eor v25.16b,v7.16b,v4.16b 1007*1dcdf01fSchristos add w6,w6,w11 1008*1dcdf01fSchristos eor v26.16b,v11.16b,v8.16b 1009*1dcdf01fSchristos add w7,w7,w12 1010*1dcdf01fSchristos eor v27.16b,v15.16b,v12.16b 1011*1dcdf01fSchristos add w8,w8,w9 1012*1dcdf01fSchristos eor v28.16b,v19.16b,v16.16b 1013*1dcdf01fSchristos eor w21,w21,w5 1014*1dcdf01fSchristos eor v29.16b,v23.16b,v20.16b 1015*1dcdf01fSchristos eor w17,w17,w6 1016*1dcdf01fSchristos ushr v3.4s,v24.4s,#24 1017*1dcdf01fSchristos eor w19,w19,w7 1018*1dcdf01fSchristos ushr v7.4s,v25.4s,#24 1019*1dcdf01fSchristos eor w20,w20,w8 1020*1dcdf01fSchristos ushr v11.4s,v26.4s,#24 1021*1dcdf01fSchristos ror w21,w21,#16 1022*1dcdf01fSchristos ushr v15.4s,v27.4s,#24 1023*1dcdf01fSchristos ror w17,w17,#16 1024*1dcdf01fSchristos ushr v19.4s,v28.4s,#24 1025*1dcdf01fSchristos ror w19,w19,#16 1026*1dcdf01fSchristos ushr v23.4s,v29.4s,#24 1027*1dcdf01fSchristos ror w20,w20,#16 1028*1dcdf01fSchristos sli v3.4s,v24.4s,#8 1029*1dcdf01fSchristos add w15,w15,w21 1030*1dcdf01fSchristos sli v7.4s,v25.4s,#8 1031*1dcdf01fSchristos add w16,w16,w17 1032*1dcdf01fSchristos sli v11.4s,v26.4s,#8 
1033*1dcdf01fSchristos add w13,w13,w19 1034*1dcdf01fSchristos sli v15.4s,v27.4s,#8 1035*1dcdf01fSchristos add w14,w14,w20 1036*1dcdf01fSchristos sli v19.4s,v28.4s,#8 1037*1dcdf01fSchristos eor w10,w10,w15 1038*1dcdf01fSchristos sli v23.4s,v29.4s,#8 1039*1dcdf01fSchristos eor w11,w11,w16 1040*1dcdf01fSchristos add v2.4s,v2.4s,v3.4s 1041*1dcdf01fSchristos eor w12,w12,w13 1042*1dcdf01fSchristos add v6.4s,v6.4s,v7.4s 1043*1dcdf01fSchristos eor w9,w9,w14 1044*1dcdf01fSchristos add v10.4s,v10.4s,v11.4s 1045*1dcdf01fSchristos ror w10,w10,#20 1046*1dcdf01fSchristos add v14.4s,v14.4s,v15.4s 1047*1dcdf01fSchristos ror w11,w11,#20 1048*1dcdf01fSchristos add v18.4s,v18.4s,v19.4s 1049*1dcdf01fSchristos ror w12,w12,#20 1050*1dcdf01fSchristos add v22.4s,v22.4s,v23.4s 1051*1dcdf01fSchristos ror w9,w9,#20 1052*1dcdf01fSchristos eor v24.16b,v1.16b,v2.16b 1053*1dcdf01fSchristos add w5,w5,w10 1054*1dcdf01fSchristos eor v25.16b,v5.16b,v6.16b 1055*1dcdf01fSchristos add w6,w6,w11 1056*1dcdf01fSchristos eor v26.16b,v9.16b,v10.16b 1057*1dcdf01fSchristos add w7,w7,w12 1058*1dcdf01fSchristos eor v27.16b,v13.16b,v14.16b 1059*1dcdf01fSchristos add w8,w8,w9 1060*1dcdf01fSchristos eor v28.16b,v17.16b,v18.16b 1061*1dcdf01fSchristos eor w21,w21,w5 1062*1dcdf01fSchristos eor v29.16b,v21.16b,v22.16b 1063*1dcdf01fSchristos eor w17,w17,w6 1064*1dcdf01fSchristos ushr v1.4s,v24.4s,#25 1065*1dcdf01fSchristos eor w19,w19,w7 1066*1dcdf01fSchristos ushr v5.4s,v25.4s,#25 1067*1dcdf01fSchristos eor w20,w20,w8 1068*1dcdf01fSchristos ushr v9.4s,v26.4s,#25 1069*1dcdf01fSchristos ror w21,w21,#24 1070*1dcdf01fSchristos ushr v13.4s,v27.4s,#25 1071*1dcdf01fSchristos ror w17,w17,#24 1072*1dcdf01fSchristos ushr v17.4s,v28.4s,#25 1073*1dcdf01fSchristos ror w19,w19,#24 1074*1dcdf01fSchristos ushr v21.4s,v29.4s,#25 1075*1dcdf01fSchristos ror w20,w20,#24 1076*1dcdf01fSchristos sli v1.4s,v24.4s,#7 1077*1dcdf01fSchristos add w15,w15,w21 1078*1dcdf01fSchristos sli v5.4s,v25.4s,#7 1079*1dcdf01fSchristos add w16,w16,w17 
1080*1dcdf01fSchristos sli v9.4s,v26.4s,#7 1081*1dcdf01fSchristos add w13,w13,w19 1082*1dcdf01fSchristos sli v13.4s,v27.4s,#7 1083*1dcdf01fSchristos add w14,w14,w20 1084*1dcdf01fSchristos sli v17.4s,v28.4s,#7 1085*1dcdf01fSchristos eor w10,w10,w15 1086*1dcdf01fSchristos sli v21.4s,v29.4s,#7 1087*1dcdf01fSchristos eor w11,w11,w16 1088*1dcdf01fSchristos ext v2.16b,v2.16b,v2.16b,#8 1089*1dcdf01fSchristos eor w12,w12,w13 1090*1dcdf01fSchristos ext v6.16b,v6.16b,v6.16b,#8 1091*1dcdf01fSchristos eor w9,w9,w14 1092*1dcdf01fSchristos ext v10.16b,v10.16b,v10.16b,#8 1093*1dcdf01fSchristos ror w10,w10,#25 1094*1dcdf01fSchristos ext v14.16b,v14.16b,v14.16b,#8 1095*1dcdf01fSchristos ror w11,w11,#25 1096*1dcdf01fSchristos ext v18.16b,v18.16b,v18.16b,#8 1097*1dcdf01fSchristos ror w12,w12,#25 1098*1dcdf01fSchristos ext v22.16b,v22.16b,v22.16b,#8 1099*1dcdf01fSchristos ror w9,w9,#25 1100*1dcdf01fSchristos ext v3.16b,v3.16b,v3.16b,#12 1101*1dcdf01fSchristos ext v7.16b,v7.16b,v7.16b,#12 1102*1dcdf01fSchristos ext v11.16b,v11.16b,v11.16b,#12 1103*1dcdf01fSchristos ext v15.16b,v15.16b,v15.16b,#12 1104*1dcdf01fSchristos ext v19.16b,v19.16b,v19.16b,#12 1105*1dcdf01fSchristos ext v23.16b,v23.16b,v23.16b,#12 1106*1dcdf01fSchristos ext v1.16b,v1.16b,v1.16b,#4 1107*1dcdf01fSchristos ext v5.16b,v5.16b,v5.16b,#4 1108*1dcdf01fSchristos ext v9.16b,v9.16b,v9.16b,#4 1109*1dcdf01fSchristos ext v13.16b,v13.16b,v13.16b,#4 1110*1dcdf01fSchristos ext v17.16b,v17.16b,v17.16b,#4 1111*1dcdf01fSchristos ext v21.16b,v21.16b,v21.16b,#4 1112*1dcdf01fSchristos add v0.4s,v0.4s,v1.4s 1113*1dcdf01fSchristos add w5,w5,w9 1114*1dcdf01fSchristos add v4.4s,v4.4s,v5.4s 1115*1dcdf01fSchristos add w6,w6,w10 1116*1dcdf01fSchristos add v8.4s,v8.4s,v9.4s 1117*1dcdf01fSchristos add w7,w7,w11 1118*1dcdf01fSchristos add v12.4s,v12.4s,v13.4s 1119*1dcdf01fSchristos add w8,w8,w12 1120*1dcdf01fSchristos add v16.4s,v16.4s,v17.4s 1121*1dcdf01fSchristos eor w17,w17,w5 1122*1dcdf01fSchristos add v20.4s,v20.4s,v21.4s 
1123*1dcdf01fSchristos eor w19,w19,w6 1124*1dcdf01fSchristos eor v3.16b,v3.16b,v0.16b 1125*1dcdf01fSchristos eor w20,w20,w7 1126*1dcdf01fSchristos eor v7.16b,v7.16b,v4.16b 1127*1dcdf01fSchristos eor w21,w21,w8 1128*1dcdf01fSchristos eor v11.16b,v11.16b,v8.16b 1129*1dcdf01fSchristos ror w17,w17,#16 1130*1dcdf01fSchristos eor v15.16b,v15.16b,v12.16b 1131*1dcdf01fSchristos ror w19,w19,#16 1132*1dcdf01fSchristos eor v19.16b,v19.16b,v16.16b 1133*1dcdf01fSchristos ror w20,w20,#16 1134*1dcdf01fSchristos eor v23.16b,v23.16b,v20.16b 1135*1dcdf01fSchristos ror w21,w21,#16 1136*1dcdf01fSchristos rev32 v3.8h,v3.8h 1137*1dcdf01fSchristos add w13,w13,w17 1138*1dcdf01fSchristos rev32 v7.8h,v7.8h 1139*1dcdf01fSchristos add w14,w14,w19 1140*1dcdf01fSchristos rev32 v11.8h,v11.8h 1141*1dcdf01fSchristos add w15,w15,w20 1142*1dcdf01fSchristos rev32 v15.8h,v15.8h 1143*1dcdf01fSchristos add w16,w16,w21 1144*1dcdf01fSchristos rev32 v19.8h,v19.8h 1145*1dcdf01fSchristos eor w9,w9,w13 1146*1dcdf01fSchristos rev32 v23.8h,v23.8h 1147*1dcdf01fSchristos eor w10,w10,w14 1148*1dcdf01fSchristos add v2.4s,v2.4s,v3.4s 1149*1dcdf01fSchristos eor w11,w11,w15 1150*1dcdf01fSchristos add v6.4s,v6.4s,v7.4s 1151*1dcdf01fSchristos eor w12,w12,w16 1152*1dcdf01fSchristos add v10.4s,v10.4s,v11.4s 1153*1dcdf01fSchristos ror w9,w9,#20 1154*1dcdf01fSchristos add v14.4s,v14.4s,v15.4s 1155*1dcdf01fSchristos ror w10,w10,#20 1156*1dcdf01fSchristos add v18.4s,v18.4s,v19.4s 1157*1dcdf01fSchristos ror w11,w11,#20 1158*1dcdf01fSchristos add v22.4s,v22.4s,v23.4s 1159*1dcdf01fSchristos ror w12,w12,#20 1160*1dcdf01fSchristos eor v24.16b,v1.16b,v2.16b 1161*1dcdf01fSchristos add w5,w5,w9 1162*1dcdf01fSchristos eor v25.16b,v5.16b,v6.16b 1163*1dcdf01fSchristos add w6,w6,w10 1164*1dcdf01fSchristos eor v26.16b,v9.16b,v10.16b 1165*1dcdf01fSchristos add w7,w7,w11 1166*1dcdf01fSchristos eor v27.16b,v13.16b,v14.16b 1167*1dcdf01fSchristos add w8,w8,w12 1168*1dcdf01fSchristos eor v28.16b,v17.16b,v18.16b 1169*1dcdf01fSchristos eor 
w17,w17,w5 1170*1dcdf01fSchristos eor v29.16b,v21.16b,v22.16b 1171*1dcdf01fSchristos eor w19,w19,w6 1172*1dcdf01fSchristos ushr v1.4s,v24.4s,#20 1173*1dcdf01fSchristos eor w20,w20,w7 1174*1dcdf01fSchristos ushr v5.4s,v25.4s,#20 1175*1dcdf01fSchristos eor w21,w21,w8 1176*1dcdf01fSchristos ushr v9.4s,v26.4s,#20 1177*1dcdf01fSchristos ror w17,w17,#24 1178*1dcdf01fSchristos ushr v13.4s,v27.4s,#20 1179*1dcdf01fSchristos ror w19,w19,#24 1180*1dcdf01fSchristos ushr v17.4s,v28.4s,#20 1181*1dcdf01fSchristos ror w20,w20,#24 1182*1dcdf01fSchristos ushr v21.4s,v29.4s,#20 1183*1dcdf01fSchristos ror w21,w21,#24 1184*1dcdf01fSchristos sli v1.4s,v24.4s,#12 1185*1dcdf01fSchristos add w13,w13,w17 1186*1dcdf01fSchristos sli v5.4s,v25.4s,#12 1187*1dcdf01fSchristos add w14,w14,w19 1188*1dcdf01fSchristos sli v9.4s,v26.4s,#12 1189*1dcdf01fSchristos add w15,w15,w20 1190*1dcdf01fSchristos sli v13.4s,v27.4s,#12 1191*1dcdf01fSchristos add w16,w16,w21 1192*1dcdf01fSchristos sli v17.4s,v28.4s,#12 1193*1dcdf01fSchristos eor w9,w9,w13 1194*1dcdf01fSchristos sli v21.4s,v29.4s,#12 1195*1dcdf01fSchristos eor w10,w10,w14 1196*1dcdf01fSchristos add v0.4s,v0.4s,v1.4s 1197*1dcdf01fSchristos eor w11,w11,w15 1198*1dcdf01fSchristos add v4.4s,v4.4s,v5.4s 1199*1dcdf01fSchristos eor w12,w12,w16 1200*1dcdf01fSchristos add v8.4s,v8.4s,v9.4s 1201*1dcdf01fSchristos ror w9,w9,#25 1202*1dcdf01fSchristos add v12.4s,v12.4s,v13.4s 1203*1dcdf01fSchristos ror w10,w10,#25 1204*1dcdf01fSchristos add v16.4s,v16.4s,v17.4s 1205*1dcdf01fSchristos ror w11,w11,#25 1206*1dcdf01fSchristos add v20.4s,v20.4s,v21.4s 1207*1dcdf01fSchristos ror w12,w12,#25 1208*1dcdf01fSchristos eor v24.16b,v3.16b,v0.16b 1209*1dcdf01fSchristos add w5,w5,w10 1210*1dcdf01fSchristos eor v25.16b,v7.16b,v4.16b 1211*1dcdf01fSchristos add w6,w6,w11 1212*1dcdf01fSchristos eor v26.16b,v11.16b,v8.16b 1213*1dcdf01fSchristos add w7,w7,w12 1214*1dcdf01fSchristos eor v27.16b,v15.16b,v12.16b 1215*1dcdf01fSchristos add w8,w8,w9 1216*1dcdf01fSchristos eor 
v28.16b,v19.16b,v16.16b 1217*1dcdf01fSchristos eor w21,w21,w5 1218*1dcdf01fSchristos eor v29.16b,v23.16b,v20.16b 1219*1dcdf01fSchristos eor w17,w17,w6 1220*1dcdf01fSchristos ushr v3.4s,v24.4s,#24 1221*1dcdf01fSchristos eor w19,w19,w7 1222*1dcdf01fSchristos ushr v7.4s,v25.4s,#24 1223*1dcdf01fSchristos eor w20,w20,w8 1224*1dcdf01fSchristos ushr v11.4s,v26.4s,#24 1225*1dcdf01fSchristos ror w21,w21,#16 1226*1dcdf01fSchristos ushr v15.4s,v27.4s,#24 1227*1dcdf01fSchristos ror w17,w17,#16 1228*1dcdf01fSchristos ushr v19.4s,v28.4s,#24 1229*1dcdf01fSchristos ror w19,w19,#16 1230*1dcdf01fSchristos ushr v23.4s,v29.4s,#24 1231*1dcdf01fSchristos ror w20,w20,#16 1232*1dcdf01fSchristos sli v3.4s,v24.4s,#8 1233*1dcdf01fSchristos add w15,w15,w21 1234*1dcdf01fSchristos sli v7.4s,v25.4s,#8 1235*1dcdf01fSchristos add w16,w16,w17 1236*1dcdf01fSchristos sli v11.4s,v26.4s,#8 1237*1dcdf01fSchristos add w13,w13,w19 1238*1dcdf01fSchristos sli v15.4s,v27.4s,#8 1239*1dcdf01fSchristos add w14,w14,w20 1240*1dcdf01fSchristos sli v19.4s,v28.4s,#8 1241*1dcdf01fSchristos eor w10,w10,w15 1242*1dcdf01fSchristos sli v23.4s,v29.4s,#8 1243*1dcdf01fSchristos eor w11,w11,w16 1244*1dcdf01fSchristos add v2.4s,v2.4s,v3.4s 1245*1dcdf01fSchristos eor w12,w12,w13 1246*1dcdf01fSchristos add v6.4s,v6.4s,v7.4s 1247*1dcdf01fSchristos eor w9,w9,w14 1248*1dcdf01fSchristos add v10.4s,v10.4s,v11.4s 1249*1dcdf01fSchristos ror w10,w10,#20 1250*1dcdf01fSchristos add v14.4s,v14.4s,v15.4s 1251*1dcdf01fSchristos ror w11,w11,#20 1252*1dcdf01fSchristos add v18.4s,v18.4s,v19.4s 1253*1dcdf01fSchristos ror w12,w12,#20 1254*1dcdf01fSchristos add v22.4s,v22.4s,v23.4s 1255*1dcdf01fSchristos ror w9,w9,#20 1256*1dcdf01fSchristos eor v24.16b,v1.16b,v2.16b 1257*1dcdf01fSchristos add w5,w5,w10 1258*1dcdf01fSchristos eor v25.16b,v5.16b,v6.16b 1259*1dcdf01fSchristos add w6,w6,w11 1260*1dcdf01fSchristos eor v26.16b,v9.16b,v10.16b 1261*1dcdf01fSchristos add w7,w7,w12 1262*1dcdf01fSchristos eor v27.16b,v13.16b,v14.16b 1263*1dcdf01fSchristos 
add w8,w8,w9 1264*1dcdf01fSchristos eor v28.16b,v17.16b,v18.16b 1265*1dcdf01fSchristos eor w21,w21,w5 1266*1dcdf01fSchristos eor v29.16b,v21.16b,v22.16b 1267*1dcdf01fSchristos eor w17,w17,w6 1268*1dcdf01fSchristos ushr v1.4s,v24.4s,#25 1269*1dcdf01fSchristos eor w19,w19,w7 1270*1dcdf01fSchristos ushr v5.4s,v25.4s,#25 1271*1dcdf01fSchristos eor w20,w20,w8 1272*1dcdf01fSchristos ushr v9.4s,v26.4s,#25 1273*1dcdf01fSchristos ror w21,w21,#24 1274*1dcdf01fSchristos ushr v13.4s,v27.4s,#25 1275*1dcdf01fSchristos ror w17,w17,#24 1276*1dcdf01fSchristos ushr v17.4s,v28.4s,#25 1277*1dcdf01fSchristos ror w19,w19,#24 1278*1dcdf01fSchristos ushr v21.4s,v29.4s,#25 1279*1dcdf01fSchristos ror w20,w20,#24 1280*1dcdf01fSchristos sli v1.4s,v24.4s,#7 1281*1dcdf01fSchristos add w15,w15,w21 1282*1dcdf01fSchristos sli v5.4s,v25.4s,#7 1283*1dcdf01fSchristos add w16,w16,w17 1284*1dcdf01fSchristos sli v9.4s,v26.4s,#7 1285*1dcdf01fSchristos add w13,w13,w19 1286*1dcdf01fSchristos sli v13.4s,v27.4s,#7 1287*1dcdf01fSchristos add w14,w14,w20 1288*1dcdf01fSchristos sli v17.4s,v28.4s,#7 1289*1dcdf01fSchristos eor w10,w10,w15 1290*1dcdf01fSchristos sli v21.4s,v29.4s,#7 1291*1dcdf01fSchristos eor w11,w11,w16 1292*1dcdf01fSchristos ext v2.16b,v2.16b,v2.16b,#8 1293*1dcdf01fSchristos eor w12,w12,w13 1294*1dcdf01fSchristos ext v6.16b,v6.16b,v6.16b,#8 1295*1dcdf01fSchristos eor w9,w9,w14 1296*1dcdf01fSchristos ext v10.16b,v10.16b,v10.16b,#8 1297*1dcdf01fSchristos ror w10,w10,#25 1298*1dcdf01fSchristos ext v14.16b,v14.16b,v14.16b,#8 1299*1dcdf01fSchristos ror w11,w11,#25 1300*1dcdf01fSchristos ext v18.16b,v18.16b,v18.16b,#8 1301*1dcdf01fSchristos ror w12,w12,#25 1302*1dcdf01fSchristos ext v22.16b,v22.16b,v22.16b,#8 1303*1dcdf01fSchristos ror w9,w9,#25 1304*1dcdf01fSchristos ext v3.16b,v3.16b,v3.16b,#4 1305*1dcdf01fSchristos ext v7.16b,v7.16b,v7.16b,#4 1306*1dcdf01fSchristos ext v11.16b,v11.16b,v11.16b,#4 1307*1dcdf01fSchristos ext v15.16b,v15.16b,v15.16b,#4 1308*1dcdf01fSchristos ext 
v19.16b,v19.16b,v19.16b,#4 1309*1dcdf01fSchristos ext v23.16b,v23.16b,v23.16b,#4 1310*1dcdf01fSchristos ext v1.16b,v1.16b,v1.16b,#12 1311*1dcdf01fSchristos ext v5.16b,v5.16b,v5.16b,#12 1312*1dcdf01fSchristos ext v9.16b,v9.16b,v9.16b,#12 1313*1dcdf01fSchristos ext v13.16b,v13.16b,v13.16b,#12 1314*1dcdf01fSchristos ext v17.16b,v17.16b,v17.16b,#12 1315*1dcdf01fSchristos ext v21.16b,v21.16b,v21.16b,#12 1316*1dcdf01fSchristos cbnz x4,.Loop_upper_neon 1317*1dcdf01fSchristos 1318*1dcdf01fSchristos add w5,w5,w22 // accumulate key block 1319*1dcdf01fSchristos add x6,x6,x22,lsr#32 1320*1dcdf01fSchristos add w7,w7,w23 1321*1dcdf01fSchristos add x8,x8,x23,lsr#32 1322*1dcdf01fSchristos add w9,w9,w24 1323*1dcdf01fSchristos add x10,x10,x24,lsr#32 1324*1dcdf01fSchristos add w11,w11,w25 1325*1dcdf01fSchristos add x12,x12,x25,lsr#32 1326*1dcdf01fSchristos add w13,w13,w26 1327*1dcdf01fSchristos add x14,x14,x26,lsr#32 1328*1dcdf01fSchristos add w15,w15,w27 1329*1dcdf01fSchristos add x16,x16,x27,lsr#32 1330*1dcdf01fSchristos add w17,w17,w28 1331*1dcdf01fSchristos add x19,x19,x28,lsr#32 1332*1dcdf01fSchristos add w20,w20,w30 1333*1dcdf01fSchristos add x21,x21,x30,lsr#32 1334*1dcdf01fSchristos 1335*1dcdf01fSchristos add x5,x5,x6,lsl#32 // pack 1336*1dcdf01fSchristos add x7,x7,x8,lsl#32 1337*1dcdf01fSchristos ldp x6,x8,[x1,#0] // load input 1338*1dcdf01fSchristos add x9,x9,x10,lsl#32 1339*1dcdf01fSchristos add x11,x11,x12,lsl#32 1340*1dcdf01fSchristos ldp x10,x12,[x1,#16] 1341*1dcdf01fSchristos add x13,x13,x14,lsl#32 1342*1dcdf01fSchristos add x15,x15,x16,lsl#32 1343*1dcdf01fSchristos ldp x14,x16,[x1,#32] 1344*1dcdf01fSchristos add x17,x17,x19,lsl#32 1345*1dcdf01fSchristos add x20,x20,x21,lsl#32 1346*1dcdf01fSchristos ldp x19,x21,[x1,#48] 1347*1dcdf01fSchristos add x1,x1,#64 1348*1dcdf01fSchristos#ifdef __ARMEB__ 1349*1dcdf01fSchristos rev x5,x5 1350*1dcdf01fSchristos rev x7,x7 1351*1dcdf01fSchristos rev x9,x9 1352*1dcdf01fSchristos rev x11,x11 1353*1dcdf01fSchristos rev x13,x13 
1354*1dcdf01fSchristos rev x15,x15 1355*1dcdf01fSchristos rev x17,x17 1356*1dcdf01fSchristos rev x20,x20 1357*1dcdf01fSchristos#endif 1358*1dcdf01fSchristos eor x5,x5,x6 1359*1dcdf01fSchristos eor x7,x7,x8 1360*1dcdf01fSchristos eor x9,x9,x10 1361*1dcdf01fSchristos eor x11,x11,x12 1362*1dcdf01fSchristos eor x13,x13,x14 1363*1dcdf01fSchristos eor x15,x15,x16 1364*1dcdf01fSchristos eor x17,x17,x19 1365*1dcdf01fSchristos eor x20,x20,x21 1366*1dcdf01fSchristos 1367*1dcdf01fSchristos stp x5,x7,[x0,#0] // store output 1368*1dcdf01fSchristos add x28,x28,#1 // increment counter 1369*1dcdf01fSchristos mov w5,w22 // unpack key block 1370*1dcdf01fSchristos lsr x6,x22,#32 1371*1dcdf01fSchristos stp x9,x11,[x0,#16] 1372*1dcdf01fSchristos mov w7,w23 1373*1dcdf01fSchristos lsr x8,x23,#32 1374*1dcdf01fSchristos stp x13,x15,[x0,#32] 1375*1dcdf01fSchristos mov w9,w24 1376*1dcdf01fSchristos lsr x10,x24,#32 1377*1dcdf01fSchristos stp x17,x20,[x0,#48] 1378*1dcdf01fSchristos add x0,x0,#64 1379*1dcdf01fSchristos mov w11,w25 1380*1dcdf01fSchristos lsr x12,x25,#32 1381*1dcdf01fSchristos mov w13,w26 1382*1dcdf01fSchristos lsr x14,x26,#32 1383*1dcdf01fSchristos mov w15,w27 1384*1dcdf01fSchristos lsr x16,x27,#32 1385*1dcdf01fSchristos mov w17,w28 1386*1dcdf01fSchristos lsr x19,x28,#32 1387*1dcdf01fSchristos mov w20,w30 1388*1dcdf01fSchristos lsr x21,x30,#32 1389*1dcdf01fSchristos 1390*1dcdf01fSchristos mov x4,#5 1391*1dcdf01fSchristos.Loop_lower_neon: 1392*1dcdf01fSchristos sub x4,x4,#1 1393*1dcdf01fSchristos add v0.4s,v0.4s,v1.4s 1394*1dcdf01fSchristos add w5,w5,w9 1395*1dcdf01fSchristos add v4.4s,v4.4s,v5.4s 1396*1dcdf01fSchristos add w6,w6,w10 1397*1dcdf01fSchristos add v8.4s,v8.4s,v9.4s 1398*1dcdf01fSchristos add w7,w7,w11 1399*1dcdf01fSchristos add v12.4s,v12.4s,v13.4s 1400*1dcdf01fSchristos add w8,w8,w12 1401*1dcdf01fSchristos add v16.4s,v16.4s,v17.4s 1402*1dcdf01fSchristos eor w17,w17,w5 1403*1dcdf01fSchristos add v20.4s,v20.4s,v21.4s 1404*1dcdf01fSchristos eor w19,w19,w6 
1405*1dcdf01fSchristos eor v3.16b,v3.16b,v0.16b 1406*1dcdf01fSchristos eor w20,w20,w7 1407*1dcdf01fSchristos eor v7.16b,v7.16b,v4.16b 1408*1dcdf01fSchristos eor w21,w21,w8 1409*1dcdf01fSchristos eor v11.16b,v11.16b,v8.16b 1410*1dcdf01fSchristos ror w17,w17,#16 1411*1dcdf01fSchristos eor v15.16b,v15.16b,v12.16b 1412*1dcdf01fSchristos ror w19,w19,#16 1413*1dcdf01fSchristos eor v19.16b,v19.16b,v16.16b 1414*1dcdf01fSchristos ror w20,w20,#16 1415*1dcdf01fSchristos eor v23.16b,v23.16b,v20.16b 1416*1dcdf01fSchristos ror w21,w21,#16 1417*1dcdf01fSchristos rev32 v3.8h,v3.8h 1418*1dcdf01fSchristos add w13,w13,w17 1419*1dcdf01fSchristos rev32 v7.8h,v7.8h 1420*1dcdf01fSchristos add w14,w14,w19 1421*1dcdf01fSchristos rev32 v11.8h,v11.8h 1422*1dcdf01fSchristos add w15,w15,w20 1423*1dcdf01fSchristos rev32 v15.8h,v15.8h 1424*1dcdf01fSchristos add w16,w16,w21 1425*1dcdf01fSchristos rev32 v19.8h,v19.8h 1426*1dcdf01fSchristos eor w9,w9,w13 1427*1dcdf01fSchristos rev32 v23.8h,v23.8h 1428*1dcdf01fSchristos eor w10,w10,w14 1429*1dcdf01fSchristos add v2.4s,v2.4s,v3.4s 1430*1dcdf01fSchristos eor w11,w11,w15 1431*1dcdf01fSchristos add v6.4s,v6.4s,v7.4s 1432*1dcdf01fSchristos eor w12,w12,w16 1433*1dcdf01fSchristos add v10.4s,v10.4s,v11.4s 1434*1dcdf01fSchristos ror w9,w9,#20 1435*1dcdf01fSchristos add v14.4s,v14.4s,v15.4s 1436*1dcdf01fSchristos ror w10,w10,#20 1437*1dcdf01fSchristos add v18.4s,v18.4s,v19.4s 1438*1dcdf01fSchristos ror w11,w11,#20 1439*1dcdf01fSchristos add v22.4s,v22.4s,v23.4s 1440*1dcdf01fSchristos ror w12,w12,#20 1441*1dcdf01fSchristos eor v24.16b,v1.16b,v2.16b 1442*1dcdf01fSchristos add w5,w5,w9 1443*1dcdf01fSchristos eor v25.16b,v5.16b,v6.16b 1444*1dcdf01fSchristos add w6,w6,w10 1445*1dcdf01fSchristos eor v26.16b,v9.16b,v10.16b 1446*1dcdf01fSchristos add w7,w7,w11 1447*1dcdf01fSchristos eor v27.16b,v13.16b,v14.16b 1448*1dcdf01fSchristos add w8,w8,w12 1449*1dcdf01fSchristos eor v28.16b,v17.16b,v18.16b 1450*1dcdf01fSchristos eor w17,w17,w5 1451*1dcdf01fSchristos eor 
v29.16b,v21.16b,v22.16b 1452*1dcdf01fSchristos eor w19,w19,w6 1453*1dcdf01fSchristos ushr v1.4s,v24.4s,#20 1454*1dcdf01fSchristos eor w20,w20,w7 1455*1dcdf01fSchristos ushr v5.4s,v25.4s,#20 1456*1dcdf01fSchristos eor w21,w21,w8 1457*1dcdf01fSchristos ushr v9.4s,v26.4s,#20 1458*1dcdf01fSchristos ror w17,w17,#24 1459*1dcdf01fSchristos ushr v13.4s,v27.4s,#20 1460*1dcdf01fSchristos ror w19,w19,#24 1461*1dcdf01fSchristos ushr v17.4s,v28.4s,#20 1462*1dcdf01fSchristos ror w20,w20,#24 1463*1dcdf01fSchristos ushr v21.4s,v29.4s,#20 1464*1dcdf01fSchristos ror w21,w21,#24 1465*1dcdf01fSchristos sli v1.4s,v24.4s,#12 1466*1dcdf01fSchristos add w13,w13,w17 1467*1dcdf01fSchristos sli v5.4s,v25.4s,#12 1468*1dcdf01fSchristos add w14,w14,w19 1469*1dcdf01fSchristos sli v9.4s,v26.4s,#12 1470*1dcdf01fSchristos add w15,w15,w20 1471*1dcdf01fSchristos sli v13.4s,v27.4s,#12 1472*1dcdf01fSchristos add w16,w16,w21 1473*1dcdf01fSchristos sli v17.4s,v28.4s,#12 1474*1dcdf01fSchristos eor w9,w9,w13 1475*1dcdf01fSchristos sli v21.4s,v29.4s,#12 1476*1dcdf01fSchristos eor w10,w10,w14 1477*1dcdf01fSchristos add v0.4s,v0.4s,v1.4s 1478*1dcdf01fSchristos eor w11,w11,w15 1479*1dcdf01fSchristos add v4.4s,v4.4s,v5.4s 1480*1dcdf01fSchristos eor w12,w12,w16 1481*1dcdf01fSchristos add v8.4s,v8.4s,v9.4s 1482*1dcdf01fSchristos ror w9,w9,#25 1483*1dcdf01fSchristos add v12.4s,v12.4s,v13.4s 1484*1dcdf01fSchristos ror w10,w10,#25 1485*1dcdf01fSchristos add v16.4s,v16.4s,v17.4s 1486*1dcdf01fSchristos ror w11,w11,#25 1487*1dcdf01fSchristos add v20.4s,v20.4s,v21.4s 1488*1dcdf01fSchristos ror w12,w12,#25 1489*1dcdf01fSchristos eor v24.16b,v3.16b,v0.16b 1490*1dcdf01fSchristos add w5,w5,w10 1491*1dcdf01fSchristos eor v25.16b,v7.16b,v4.16b 1492*1dcdf01fSchristos add w6,w6,w11 1493*1dcdf01fSchristos eor v26.16b,v11.16b,v8.16b 1494*1dcdf01fSchristos add w7,w7,w12 1495*1dcdf01fSchristos eor v27.16b,v15.16b,v12.16b 1496*1dcdf01fSchristos add w8,w8,w9 1497*1dcdf01fSchristos eor v28.16b,v19.16b,v16.16b 1498*1dcdf01fSchristos 
eor w21,w21,w5 1499*1dcdf01fSchristos eor v29.16b,v23.16b,v20.16b 1500*1dcdf01fSchristos eor w17,w17,w6 1501*1dcdf01fSchristos ushr v3.4s,v24.4s,#24 1502*1dcdf01fSchristos eor w19,w19,w7 1503*1dcdf01fSchristos ushr v7.4s,v25.4s,#24 1504*1dcdf01fSchristos eor w20,w20,w8 1505*1dcdf01fSchristos ushr v11.4s,v26.4s,#24 1506*1dcdf01fSchristos ror w21,w21,#16 1507*1dcdf01fSchristos ushr v15.4s,v27.4s,#24 1508*1dcdf01fSchristos ror w17,w17,#16 1509*1dcdf01fSchristos ushr v19.4s,v28.4s,#24 1510*1dcdf01fSchristos ror w19,w19,#16 1511*1dcdf01fSchristos ushr v23.4s,v29.4s,#24 1512*1dcdf01fSchristos ror w20,w20,#16 1513*1dcdf01fSchristos sli v3.4s,v24.4s,#8 1514*1dcdf01fSchristos add w15,w15,w21 1515*1dcdf01fSchristos sli v7.4s,v25.4s,#8 1516*1dcdf01fSchristos add w16,w16,w17 1517*1dcdf01fSchristos sli v11.4s,v26.4s,#8 1518*1dcdf01fSchristos add w13,w13,w19 1519*1dcdf01fSchristos sli v15.4s,v27.4s,#8 1520*1dcdf01fSchristos add w14,w14,w20 1521*1dcdf01fSchristos sli v19.4s,v28.4s,#8 1522*1dcdf01fSchristos eor w10,w10,w15 1523*1dcdf01fSchristos sli v23.4s,v29.4s,#8 1524*1dcdf01fSchristos eor w11,w11,w16 1525*1dcdf01fSchristos add v2.4s,v2.4s,v3.4s 1526*1dcdf01fSchristos eor w12,w12,w13 1527*1dcdf01fSchristos add v6.4s,v6.4s,v7.4s 1528*1dcdf01fSchristos eor w9,w9,w14 1529*1dcdf01fSchristos add v10.4s,v10.4s,v11.4s 1530*1dcdf01fSchristos ror w10,w10,#20 1531*1dcdf01fSchristos add v14.4s,v14.4s,v15.4s 1532*1dcdf01fSchristos ror w11,w11,#20 1533*1dcdf01fSchristos add v18.4s,v18.4s,v19.4s 1534*1dcdf01fSchristos ror w12,w12,#20 1535*1dcdf01fSchristos add v22.4s,v22.4s,v23.4s 1536*1dcdf01fSchristos ror w9,w9,#20 1537*1dcdf01fSchristos eor v24.16b,v1.16b,v2.16b 1538*1dcdf01fSchristos add w5,w5,w10 1539*1dcdf01fSchristos eor v25.16b,v5.16b,v6.16b 1540*1dcdf01fSchristos add w6,w6,w11 1541*1dcdf01fSchristos eor v26.16b,v9.16b,v10.16b 1542*1dcdf01fSchristos add w7,w7,w12 1543*1dcdf01fSchristos eor v27.16b,v13.16b,v14.16b 1544*1dcdf01fSchristos add w8,w8,w9 1545*1dcdf01fSchristos eor 
v28.16b,v17.16b,v18.16b 1546*1dcdf01fSchristos eor w21,w21,w5 1547*1dcdf01fSchristos eor v29.16b,v21.16b,v22.16b 1548*1dcdf01fSchristos eor w17,w17,w6 1549*1dcdf01fSchristos ushr v1.4s,v24.4s,#25 1550*1dcdf01fSchristos eor w19,w19,w7 1551*1dcdf01fSchristos ushr v5.4s,v25.4s,#25 1552*1dcdf01fSchristos eor w20,w20,w8 1553*1dcdf01fSchristos ushr v9.4s,v26.4s,#25 1554*1dcdf01fSchristos ror w21,w21,#24 1555*1dcdf01fSchristos ushr v13.4s,v27.4s,#25 1556*1dcdf01fSchristos ror w17,w17,#24 1557*1dcdf01fSchristos ushr v17.4s,v28.4s,#25 1558*1dcdf01fSchristos ror w19,w19,#24 1559*1dcdf01fSchristos ushr v21.4s,v29.4s,#25 1560*1dcdf01fSchristos ror w20,w20,#24 1561*1dcdf01fSchristos sli v1.4s,v24.4s,#7 1562*1dcdf01fSchristos add w15,w15,w21 1563*1dcdf01fSchristos sli v5.4s,v25.4s,#7 1564*1dcdf01fSchristos add w16,w16,w17 1565*1dcdf01fSchristos sli v9.4s,v26.4s,#7 1566*1dcdf01fSchristos add w13,w13,w19 1567*1dcdf01fSchristos sli v13.4s,v27.4s,#7 1568*1dcdf01fSchristos add w14,w14,w20 1569*1dcdf01fSchristos sli v17.4s,v28.4s,#7 1570*1dcdf01fSchristos eor w10,w10,w15 1571*1dcdf01fSchristos sli v21.4s,v29.4s,#7 1572*1dcdf01fSchristos eor w11,w11,w16 1573*1dcdf01fSchristos ext v2.16b,v2.16b,v2.16b,#8 1574*1dcdf01fSchristos eor w12,w12,w13 1575*1dcdf01fSchristos ext v6.16b,v6.16b,v6.16b,#8 1576*1dcdf01fSchristos eor w9,w9,w14 1577*1dcdf01fSchristos ext v10.16b,v10.16b,v10.16b,#8 1578*1dcdf01fSchristos ror w10,w10,#25 1579*1dcdf01fSchristos ext v14.16b,v14.16b,v14.16b,#8 1580*1dcdf01fSchristos ror w11,w11,#25 1581*1dcdf01fSchristos ext v18.16b,v18.16b,v18.16b,#8 1582*1dcdf01fSchristos ror w12,w12,#25 1583*1dcdf01fSchristos ext v22.16b,v22.16b,v22.16b,#8 1584*1dcdf01fSchristos ror w9,w9,#25 1585*1dcdf01fSchristos ext v3.16b,v3.16b,v3.16b,#12 1586*1dcdf01fSchristos ext v7.16b,v7.16b,v7.16b,#12 1587*1dcdf01fSchristos ext v11.16b,v11.16b,v11.16b,#12 1588*1dcdf01fSchristos ext v15.16b,v15.16b,v15.16b,#12 1589*1dcdf01fSchristos ext v19.16b,v19.16b,v19.16b,#12 1590*1dcdf01fSchristos ext 
v23.16b,v23.16b,v23.16b,#12 1591*1dcdf01fSchristos ext v1.16b,v1.16b,v1.16b,#4 1592*1dcdf01fSchristos ext v5.16b,v5.16b,v5.16b,#4 1593*1dcdf01fSchristos ext v9.16b,v9.16b,v9.16b,#4 1594*1dcdf01fSchristos ext v13.16b,v13.16b,v13.16b,#4 1595*1dcdf01fSchristos ext v17.16b,v17.16b,v17.16b,#4 1596*1dcdf01fSchristos ext v21.16b,v21.16b,v21.16b,#4 1597*1dcdf01fSchristos add v0.4s,v0.4s,v1.4s 1598*1dcdf01fSchristos add w5,w5,w9 1599*1dcdf01fSchristos add v4.4s,v4.4s,v5.4s 1600*1dcdf01fSchristos add w6,w6,w10 1601*1dcdf01fSchristos add v8.4s,v8.4s,v9.4s 1602*1dcdf01fSchristos add w7,w7,w11 1603*1dcdf01fSchristos add v12.4s,v12.4s,v13.4s 1604*1dcdf01fSchristos add w8,w8,w12 1605*1dcdf01fSchristos add v16.4s,v16.4s,v17.4s 1606*1dcdf01fSchristos eor w17,w17,w5 1607*1dcdf01fSchristos add v20.4s,v20.4s,v21.4s 1608*1dcdf01fSchristos eor w19,w19,w6 1609*1dcdf01fSchristos eor v3.16b,v3.16b,v0.16b 1610*1dcdf01fSchristos eor w20,w20,w7 1611*1dcdf01fSchristos eor v7.16b,v7.16b,v4.16b 1612*1dcdf01fSchristos eor w21,w21,w8 1613*1dcdf01fSchristos eor v11.16b,v11.16b,v8.16b 1614*1dcdf01fSchristos ror w17,w17,#16 1615*1dcdf01fSchristos eor v15.16b,v15.16b,v12.16b 1616*1dcdf01fSchristos ror w19,w19,#16 1617*1dcdf01fSchristos eor v19.16b,v19.16b,v16.16b 1618*1dcdf01fSchristos ror w20,w20,#16 1619*1dcdf01fSchristos eor v23.16b,v23.16b,v20.16b 1620*1dcdf01fSchristos ror w21,w21,#16 1621*1dcdf01fSchristos rev32 v3.8h,v3.8h 1622*1dcdf01fSchristos add w13,w13,w17 1623*1dcdf01fSchristos rev32 v7.8h,v7.8h 1624*1dcdf01fSchristos add w14,w14,w19 1625*1dcdf01fSchristos rev32 v11.8h,v11.8h 1626*1dcdf01fSchristos add w15,w15,w20 1627*1dcdf01fSchristos rev32 v15.8h,v15.8h 1628*1dcdf01fSchristos add w16,w16,w21 1629*1dcdf01fSchristos rev32 v19.8h,v19.8h 1630*1dcdf01fSchristos eor w9,w9,w13 1631*1dcdf01fSchristos rev32 v23.8h,v23.8h 1632*1dcdf01fSchristos eor w10,w10,w14 1633*1dcdf01fSchristos add v2.4s,v2.4s,v3.4s 1634*1dcdf01fSchristos eor w11,w11,w15 1635*1dcdf01fSchristos add v6.4s,v6.4s,v7.4s 
1636*1dcdf01fSchristos eor w12,w12,w16 1637*1dcdf01fSchristos add v10.4s,v10.4s,v11.4s 1638*1dcdf01fSchristos ror w9,w9,#20 1639*1dcdf01fSchristos add v14.4s,v14.4s,v15.4s 1640*1dcdf01fSchristos ror w10,w10,#20 1641*1dcdf01fSchristos add v18.4s,v18.4s,v19.4s 1642*1dcdf01fSchristos ror w11,w11,#20 1643*1dcdf01fSchristos add v22.4s,v22.4s,v23.4s 1644*1dcdf01fSchristos ror w12,w12,#20 1645*1dcdf01fSchristos eor v24.16b,v1.16b,v2.16b 1646*1dcdf01fSchristos add w5,w5,w9 1647*1dcdf01fSchristos eor v25.16b,v5.16b,v6.16b 1648*1dcdf01fSchristos add w6,w6,w10 1649*1dcdf01fSchristos eor v26.16b,v9.16b,v10.16b 1650*1dcdf01fSchristos add w7,w7,w11 1651*1dcdf01fSchristos eor v27.16b,v13.16b,v14.16b 1652*1dcdf01fSchristos add w8,w8,w12 1653*1dcdf01fSchristos eor v28.16b,v17.16b,v18.16b 1654*1dcdf01fSchristos eor w17,w17,w5 1655*1dcdf01fSchristos eor v29.16b,v21.16b,v22.16b 1656*1dcdf01fSchristos eor w19,w19,w6 1657*1dcdf01fSchristos ushr v1.4s,v24.4s,#20 1658*1dcdf01fSchristos eor w20,w20,w7 1659*1dcdf01fSchristos ushr v5.4s,v25.4s,#20 1660*1dcdf01fSchristos eor w21,w21,w8 1661*1dcdf01fSchristos ushr v9.4s,v26.4s,#20 1662*1dcdf01fSchristos ror w17,w17,#24 1663*1dcdf01fSchristos ushr v13.4s,v27.4s,#20 1664*1dcdf01fSchristos ror w19,w19,#24 1665*1dcdf01fSchristos ushr v17.4s,v28.4s,#20 1666*1dcdf01fSchristos ror w20,w20,#24 1667*1dcdf01fSchristos ushr v21.4s,v29.4s,#20 1668*1dcdf01fSchristos ror w21,w21,#24 1669*1dcdf01fSchristos sli v1.4s,v24.4s,#12 1670*1dcdf01fSchristos add w13,w13,w17 1671*1dcdf01fSchristos sli v5.4s,v25.4s,#12 1672*1dcdf01fSchristos add w14,w14,w19 1673*1dcdf01fSchristos sli v9.4s,v26.4s,#12 1674*1dcdf01fSchristos add w15,w15,w20 1675*1dcdf01fSchristos sli v13.4s,v27.4s,#12 1676*1dcdf01fSchristos add w16,w16,w21 1677*1dcdf01fSchristos sli v17.4s,v28.4s,#12 1678*1dcdf01fSchristos eor w9,w9,w13 1679*1dcdf01fSchristos sli v21.4s,v29.4s,#12 1680*1dcdf01fSchristos eor w10,w10,w14 1681*1dcdf01fSchristos add v0.4s,v0.4s,v1.4s 1682*1dcdf01fSchristos eor w11,w11,w15 
1683*1dcdf01fSchristos add v4.4s,v4.4s,v5.4s 1684*1dcdf01fSchristos eor w12,w12,w16 1685*1dcdf01fSchristos add v8.4s,v8.4s,v9.4s 1686*1dcdf01fSchristos ror w9,w9,#25 1687*1dcdf01fSchristos add v12.4s,v12.4s,v13.4s 1688*1dcdf01fSchristos ror w10,w10,#25 1689*1dcdf01fSchristos add v16.4s,v16.4s,v17.4s 1690*1dcdf01fSchristos ror w11,w11,#25 1691*1dcdf01fSchristos add v20.4s,v20.4s,v21.4s 1692*1dcdf01fSchristos ror w12,w12,#25 1693*1dcdf01fSchristos eor v24.16b,v3.16b,v0.16b 1694*1dcdf01fSchristos add w5,w5,w10 1695*1dcdf01fSchristos eor v25.16b,v7.16b,v4.16b 1696*1dcdf01fSchristos add w6,w6,w11 1697*1dcdf01fSchristos eor v26.16b,v11.16b,v8.16b 1698*1dcdf01fSchristos add w7,w7,w12 1699*1dcdf01fSchristos eor v27.16b,v15.16b,v12.16b 1700*1dcdf01fSchristos add w8,w8,w9 1701*1dcdf01fSchristos eor v28.16b,v19.16b,v16.16b 1702*1dcdf01fSchristos eor w21,w21,w5 1703*1dcdf01fSchristos eor v29.16b,v23.16b,v20.16b 1704*1dcdf01fSchristos eor w17,w17,w6 1705*1dcdf01fSchristos ushr v3.4s,v24.4s,#24 1706*1dcdf01fSchristos eor w19,w19,w7 1707*1dcdf01fSchristos ushr v7.4s,v25.4s,#24 1708*1dcdf01fSchristos eor w20,w20,w8 1709*1dcdf01fSchristos ushr v11.4s,v26.4s,#24 1710*1dcdf01fSchristos ror w21,w21,#16 1711*1dcdf01fSchristos ushr v15.4s,v27.4s,#24 1712*1dcdf01fSchristos ror w17,w17,#16 1713*1dcdf01fSchristos ushr v19.4s,v28.4s,#24 1714*1dcdf01fSchristos ror w19,w19,#16 1715*1dcdf01fSchristos ushr v23.4s,v29.4s,#24 1716*1dcdf01fSchristos ror w20,w20,#16 1717*1dcdf01fSchristos sli v3.4s,v24.4s,#8 1718*1dcdf01fSchristos add w15,w15,w21 1719*1dcdf01fSchristos sli v7.4s,v25.4s,#8 1720*1dcdf01fSchristos add w16,w16,w17 1721*1dcdf01fSchristos sli v11.4s,v26.4s,#8 1722*1dcdf01fSchristos add w13,w13,w19 1723*1dcdf01fSchristos sli v15.4s,v27.4s,#8 1724*1dcdf01fSchristos add w14,w14,w20 1725*1dcdf01fSchristos sli v19.4s,v28.4s,#8 1726*1dcdf01fSchristos eor w10,w10,w15 1727*1dcdf01fSchristos sli v23.4s,v29.4s,#8 1728*1dcdf01fSchristos eor w11,w11,w16 1729*1dcdf01fSchristos add v2.4s,v2.4s,v3.4s 
1730*1dcdf01fSchristos eor w12,w12,w13 1731*1dcdf01fSchristos add v6.4s,v6.4s,v7.4s 1732*1dcdf01fSchristos eor w9,w9,w14 1733*1dcdf01fSchristos add v10.4s,v10.4s,v11.4s 1734*1dcdf01fSchristos ror w10,w10,#20 1735*1dcdf01fSchristos add v14.4s,v14.4s,v15.4s 1736*1dcdf01fSchristos ror w11,w11,#20 1737*1dcdf01fSchristos add v18.4s,v18.4s,v19.4s 1738*1dcdf01fSchristos ror w12,w12,#20 1739*1dcdf01fSchristos add v22.4s,v22.4s,v23.4s 1740*1dcdf01fSchristos ror w9,w9,#20 1741*1dcdf01fSchristos eor v24.16b,v1.16b,v2.16b 1742*1dcdf01fSchristos add w5,w5,w10 1743*1dcdf01fSchristos eor v25.16b,v5.16b,v6.16b 1744*1dcdf01fSchristos add w6,w6,w11 1745*1dcdf01fSchristos eor v26.16b,v9.16b,v10.16b 1746*1dcdf01fSchristos add w7,w7,w12 1747*1dcdf01fSchristos eor v27.16b,v13.16b,v14.16b 1748*1dcdf01fSchristos add w8,w8,w9 1749*1dcdf01fSchristos eor v28.16b,v17.16b,v18.16b 1750*1dcdf01fSchristos eor w21,w21,w5 1751*1dcdf01fSchristos eor v29.16b,v21.16b,v22.16b 1752*1dcdf01fSchristos eor w17,w17,w6 1753*1dcdf01fSchristos ushr v1.4s,v24.4s,#25 1754*1dcdf01fSchristos eor w19,w19,w7 1755*1dcdf01fSchristos ushr v5.4s,v25.4s,#25 1756*1dcdf01fSchristos eor w20,w20,w8 1757*1dcdf01fSchristos ushr v9.4s,v26.4s,#25 1758*1dcdf01fSchristos ror w21,w21,#24 1759*1dcdf01fSchristos ushr v13.4s,v27.4s,#25 1760*1dcdf01fSchristos ror w17,w17,#24 1761*1dcdf01fSchristos ushr v17.4s,v28.4s,#25 1762*1dcdf01fSchristos ror w19,w19,#24 1763*1dcdf01fSchristos ushr v21.4s,v29.4s,#25 1764*1dcdf01fSchristos ror w20,w20,#24 1765*1dcdf01fSchristos sli v1.4s,v24.4s,#7 1766*1dcdf01fSchristos add w15,w15,w21 1767*1dcdf01fSchristos sli v5.4s,v25.4s,#7 1768*1dcdf01fSchristos add w16,w16,w17 1769*1dcdf01fSchristos sli v9.4s,v26.4s,#7 1770*1dcdf01fSchristos add w13,w13,w19 1771*1dcdf01fSchristos sli v13.4s,v27.4s,#7 1772*1dcdf01fSchristos add w14,w14,w20 1773*1dcdf01fSchristos sli v17.4s,v28.4s,#7 1774*1dcdf01fSchristos eor w10,w10,w15 1775*1dcdf01fSchristos sli v21.4s,v29.4s,#7 1776*1dcdf01fSchristos eor w11,w11,w16 
1777*1dcdf01fSchristos ext v2.16b,v2.16b,v2.16b,#8 1778*1dcdf01fSchristos eor w12,w12,w13 1779*1dcdf01fSchristos ext v6.16b,v6.16b,v6.16b,#8 1780*1dcdf01fSchristos eor w9,w9,w14 1781*1dcdf01fSchristos ext v10.16b,v10.16b,v10.16b,#8 1782*1dcdf01fSchristos ror w10,w10,#25 1783*1dcdf01fSchristos ext v14.16b,v14.16b,v14.16b,#8 1784*1dcdf01fSchristos ror w11,w11,#25 1785*1dcdf01fSchristos ext v18.16b,v18.16b,v18.16b,#8 1786*1dcdf01fSchristos ror w12,w12,#25 1787*1dcdf01fSchristos ext v22.16b,v22.16b,v22.16b,#8 1788*1dcdf01fSchristos ror w9,w9,#25 1789*1dcdf01fSchristos ext v3.16b,v3.16b,v3.16b,#4 1790*1dcdf01fSchristos ext v7.16b,v7.16b,v7.16b,#4 1791*1dcdf01fSchristos ext v11.16b,v11.16b,v11.16b,#4 1792*1dcdf01fSchristos ext v15.16b,v15.16b,v15.16b,#4 1793*1dcdf01fSchristos ext v19.16b,v19.16b,v19.16b,#4 1794*1dcdf01fSchristos ext v23.16b,v23.16b,v23.16b,#4 1795*1dcdf01fSchristos ext v1.16b,v1.16b,v1.16b,#12 1796*1dcdf01fSchristos ext v5.16b,v5.16b,v5.16b,#12 1797*1dcdf01fSchristos ext v9.16b,v9.16b,v9.16b,#12 1798*1dcdf01fSchristos ext v13.16b,v13.16b,v13.16b,#12 1799*1dcdf01fSchristos ext v17.16b,v17.16b,v17.16b,#12 1800*1dcdf01fSchristos ext v21.16b,v21.16b,v21.16b,#12 1801*1dcdf01fSchristos cbnz x4,.Loop_lower_neon 1802*1dcdf01fSchristos 1803*1dcdf01fSchristos add w5,w5,w22 // accumulate key block 1804*1dcdf01fSchristos ldp q24,q25,[sp,#0] 1805*1dcdf01fSchristos add x6,x6,x22,lsr#32 1806*1dcdf01fSchristos ldp q26,q27,[sp,#32] 1807*1dcdf01fSchristos add w7,w7,w23 1808*1dcdf01fSchristos ldp q28,q29,[sp,#64] 1809*1dcdf01fSchristos add x8,x8,x23,lsr#32 1810*1dcdf01fSchristos add v0.4s,v0.4s,v24.4s 1811*1dcdf01fSchristos add w9,w9,w24 1812*1dcdf01fSchristos add v4.4s,v4.4s,v24.4s 1813*1dcdf01fSchristos add x10,x10,x24,lsr#32 1814*1dcdf01fSchristos add v8.4s,v8.4s,v24.4s 1815*1dcdf01fSchristos add w11,w11,w25 1816*1dcdf01fSchristos add v12.4s,v12.4s,v24.4s 1817*1dcdf01fSchristos add x12,x12,x25,lsr#32 1818*1dcdf01fSchristos add v16.4s,v16.4s,v24.4s 1819*1dcdf01fSchristos 
add w13,w13,w26 1820*1dcdf01fSchristos add v20.4s,v20.4s,v24.4s 1821*1dcdf01fSchristos add x14,x14,x26,lsr#32 1822*1dcdf01fSchristos add v2.4s,v2.4s,v26.4s 1823*1dcdf01fSchristos add w15,w15,w27 1824*1dcdf01fSchristos add v6.4s,v6.4s,v26.4s 1825*1dcdf01fSchristos add x16,x16,x27,lsr#32 1826*1dcdf01fSchristos add v10.4s,v10.4s,v26.4s 1827*1dcdf01fSchristos add w17,w17,w28 1828*1dcdf01fSchristos add v14.4s,v14.4s,v26.4s 1829*1dcdf01fSchristos add x19,x19,x28,lsr#32 1830*1dcdf01fSchristos add v18.4s,v18.4s,v26.4s 1831*1dcdf01fSchristos add w20,w20,w30 1832*1dcdf01fSchristos add v22.4s,v22.4s,v26.4s 1833*1dcdf01fSchristos add x21,x21,x30,lsr#32 1834*1dcdf01fSchristos add v19.4s,v19.4s,v31.4s // +4 1835*1dcdf01fSchristos add x5,x5,x6,lsl#32 // pack 1836*1dcdf01fSchristos add v23.4s,v23.4s,v31.4s // +4 1837*1dcdf01fSchristos add x7,x7,x8,lsl#32 1838*1dcdf01fSchristos add v3.4s,v3.4s,v27.4s 1839*1dcdf01fSchristos ldp x6,x8,[x1,#0] // load input 1840*1dcdf01fSchristos add v7.4s,v7.4s,v28.4s 1841*1dcdf01fSchristos add x9,x9,x10,lsl#32 1842*1dcdf01fSchristos add v11.4s,v11.4s,v29.4s 1843*1dcdf01fSchristos add x11,x11,x12,lsl#32 1844*1dcdf01fSchristos add v15.4s,v15.4s,v30.4s 1845*1dcdf01fSchristos ldp x10,x12,[x1,#16] 1846*1dcdf01fSchristos add v19.4s,v19.4s,v27.4s 1847*1dcdf01fSchristos add x13,x13,x14,lsl#32 1848*1dcdf01fSchristos add v23.4s,v23.4s,v28.4s 1849*1dcdf01fSchristos add x15,x15,x16,lsl#32 1850*1dcdf01fSchristos add v1.4s,v1.4s,v25.4s 1851*1dcdf01fSchristos ldp x14,x16,[x1,#32] 1852*1dcdf01fSchristos add v5.4s,v5.4s,v25.4s 1853*1dcdf01fSchristos add x17,x17,x19,lsl#32 1854*1dcdf01fSchristos add v9.4s,v9.4s,v25.4s 1855*1dcdf01fSchristos add x20,x20,x21,lsl#32 1856*1dcdf01fSchristos add v13.4s,v13.4s,v25.4s 1857*1dcdf01fSchristos ldp x19,x21,[x1,#48] 1858*1dcdf01fSchristos add v17.4s,v17.4s,v25.4s 1859*1dcdf01fSchristos add x1,x1,#64 1860*1dcdf01fSchristos add v21.4s,v21.4s,v25.4s 1861*1dcdf01fSchristos 1862*1dcdf01fSchristos#ifdef __ARMEB__ 1863*1dcdf01fSchristos 
rev x5,x5 1864*1dcdf01fSchristos rev x7,x7 1865*1dcdf01fSchristos rev x9,x9 1866*1dcdf01fSchristos rev x11,x11 1867*1dcdf01fSchristos rev x13,x13 1868*1dcdf01fSchristos rev x15,x15 1869*1dcdf01fSchristos rev x17,x17 1870*1dcdf01fSchristos rev x20,x20 1871*1dcdf01fSchristos#endif 1872*1dcdf01fSchristos ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1873*1dcdf01fSchristos eor x5,x5,x6 1874*1dcdf01fSchristos eor x7,x7,x8 1875*1dcdf01fSchristos eor x9,x9,x10 1876*1dcdf01fSchristos eor x11,x11,x12 1877*1dcdf01fSchristos eor x13,x13,x14 1878*1dcdf01fSchristos eor v0.16b,v0.16b,v24.16b 1879*1dcdf01fSchristos eor x15,x15,x16 1880*1dcdf01fSchristos eor v1.16b,v1.16b,v25.16b 1881*1dcdf01fSchristos eor x17,x17,x19 1882*1dcdf01fSchristos eor v2.16b,v2.16b,v26.16b 1883*1dcdf01fSchristos eor x20,x20,x21 1884*1dcdf01fSchristos eor v3.16b,v3.16b,v27.16b 1885*1dcdf01fSchristos ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1886*1dcdf01fSchristos 1887*1dcdf01fSchristos stp x5,x7,[x0,#0] // store output 1888*1dcdf01fSchristos add x28,x28,#7 // increment counter 1889*1dcdf01fSchristos stp x9,x11,[x0,#16] 1890*1dcdf01fSchristos stp x13,x15,[x0,#32] 1891*1dcdf01fSchristos stp x17,x20,[x0,#48] 1892*1dcdf01fSchristos add x0,x0,#64 1893*1dcdf01fSchristos st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1894*1dcdf01fSchristos 1895*1dcdf01fSchristos ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1896*1dcdf01fSchristos eor v4.16b,v4.16b,v24.16b 1897*1dcdf01fSchristos eor v5.16b,v5.16b,v25.16b 1898*1dcdf01fSchristos eor v6.16b,v6.16b,v26.16b 1899*1dcdf01fSchristos eor v7.16b,v7.16b,v27.16b 1900*1dcdf01fSchristos st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1901*1dcdf01fSchristos 1902*1dcdf01fSchristos ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1903*1dcdf01fSchristos eor v8.16b,v8.16b,v0.16b 1904*1dcdf01fSchristos ldp q24,q25,[sp,#0] 1905*1dcdf01fSchristos eor v9.16b,v9.16b,v1.16b 1906*1dcdf01fSchristos ldp q26,q27,[sp,#32] 1907*1dcdf01fSchristos eor v10.16b,v10.16b,v2.16b 1908*1dcdf01fSchristos eor 
v11.16b,v11.16b,v3.16b 1909*1dcdf01fSchristos st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1910*1dcdf01fSchristos 1911*1dcdf01fSchristos ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1912*1dcdf01fSchristos eor v12.16b,v12.16b,v4.16b 1913*1dcdf01fSchristos eor v13.16b,v13.16b,v5.16b 1914*1dcdf01fSchristos eor v14.16b,v14.16b,v6.16b 1915*1dcdf01fSchristos eor v15.16b,v15.16b,v7.16b 1916*1dcdf01fSchristos st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1917*1dcdf01fSchristos 1918*1dcdf01fSchristos ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1919*1dcdf01fSchristos eor v16.16b,v16.16b,v8.16b 1920*1dcdf01fSchristos eor v17.16b,v17.16b,v9.16b 1921*1dcdf01fSchristos eor v18.16b,v18.16b,v10.16b 1922*1dcdf01fSchristos eor v19.16b,v19.16b,v11.16b 1923*1dcdf01fSchristos st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1924*1dcdf01fSchristos 1925*1dcdf01fSchristos shl v0.4s,v31.4s,#1 // 4 -> 8 1926*1dcdf01fSchristos eor v20.16b,v20.16b,v12.16b 1927*1dcdf01fSchristos eor v21.16b,v21.16b,v13.16b 1928*1dcdf01fSchristos eor v22.16b,v22.16b,v14.16b 1929*1dcdf01fSchristos eor v23.16b,v23.16b,v15.16b 1930*1dcdf01fSchristos st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1931*1dcdf01fSchristos 1932*1dcdf01fSchristos add v27.4s,v27.4s,v0.4s // += 8 1933*1dcdf01fSchristos add v28.4s,v28.4s,v0.4s 1934*1dcdf01fSchristos add v29.4s,v29.4s,v0.4s 1935*1dcdf01fSchristos add v30.4s,v30.4s,v0.4s 1936*1dcdf01fSchristos 1937*1dcdf01fSchristos b.hs .Loop_outer_512_neon 1938*1dcdf01fSchristos 1939*1dcdf01fSchristos adds x2,x2,#512 1940*1dcdf01fSchristos ushr v0.4s,v31.4s,#2 // 4 -> 1 1941*1dcdf01fSchristos 1942*1dcdf01fSchristos ldp d8,d9,[sp,#128+0] // meet ABI requirements 1943*1dcdf01fSchristos ldp d10,d11,[sp,#128+16] 1944*1dcdf01fSchristos ldp d12,d13,[sp,#128+32] 1945*1dcdf01fSchristos ldp d14,d15,[sp,#128+48] 1946*1dcdf01fSchristos 1947*1dcdf01fSchristos stp q24,q31,[sp,#0] // wipe off-load area 1948*1dcdf01fSchristos stp q24,q31,[sp,#32] 1949*1dcdf01fSchristos stp q24,q31,[sp,#64] 
1950*1dcdf01fSchristos 1951*1dcdf01fSchristos b.eq .Ldone_512_neon 1952*1dcdf01fSchristos 1953*1dcdf01fSchristos cmp x2,#192 1954*1dcdf01fSchristos sub v27.4s,v27.4s,v0.4s // -= 1 1955*1dcdf01fSchristos sub v28.4s,v28.4s,v0.4s 1956*1dcdf01fSchristos sub v29.4s,v29.4s,v0.4s 1957*1dcdf01fSchristos add sp,sp,#128 1958*1dcdf01fSchristos b.hs .Loop_outer_neon 1959*1dcdf01fSchristos 1960*1dcdf01fSchristos eor v25.16b,v25.16b,v25.16b 1961*1dcdf01fSchristos eor v26.16b,v26.16b,v26.16b 1962*1dcdf01fSchristos eor v27.16b,v27.16b,v27.16b 1963*1dcdf01fSchristos eor v28.16b,v28.16b,v28.16b 1964*1dcdf01fSchristos eor v29.16b,v29.16b,v29.16b 1965*1dcdf01fSchristos eor v30.16b,v30.16b,v30.16b 1966*1dcdf01fSchristos b .Loop_outer 1967*1dcdf01fSchristos 1968*1dcdf01fSchristos.Ldone_512_neon: 1969*1dcdf01fSchristos ldp x19,x20,[x29,#16] 1970*1dcdf01fSchristos add sp,sp,#128+64 1971*1dcdf01fSchristos ldp x21,x22,[x29,#32] 1972*1dcdf01fSchristos ldp x23,x24,[x29,#48] 1973*1dcdf01fSchristos ldp x25,x26,[x29,#64] 1974*1dcdf01fSchristos ldp x27,x28,[x29,#80] 1975*1dcdf01fSchristos ldp x29,x30,[sp],#96 1976*1dcdf01fSchristos.inst 0xd50323bf // autiasp 1977*1dcdf01fSchristos ret 1978*1dcdf01fSchristos.size ChaCha20_512_neon,.-ChaCha20_512_neon 1979