1bc3d5698SJohn Baldwin/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */ 2bc3d5698SJohn Baldwin#include "arm_arch.h" 3bc3d5698SJohn Baldwin 4bc3d5698SJohn Baldwin.text 5bc3d5698SJohn Baldwin 6bc3d5698SJohn Baldwin// forward "declarations" are required for Apple 7bc3d5698SJohn Baldwin 8c3c73b4fSJung-uk Kim.hidden OPENSSL_armcap_P 9bc3d5698SJohn Baldwin.globl poly1305_init 10c3c73b4fSJung-uk Kim.hidden poly1305_init 11c3c73b4fSJung-uk Kim.globl poly1305_blocks 12c3c73b4fSJung-uk Kim.hidden poly1305_blocks 13c3c73b4fSJung-uk Kim.globl poly1305_emit 14c3c73b4fSJung-uk Kim.hidden poly1305_emit 15c3c73b4fSJung-uk Kim 16bc3d5698SJohn Baldwin.type poly1305_init,%function 17bc3d5698SJohn Baldwin.align 5 18bc3d5698SJohn Baldwinpoly1305_init: 19bd9588bcSAndrew Turner AARCH64_VALID_CALL_TARGET 20bc3d5698SJohn Baldwin cmp x1,xzr 21bc3d5698SJohn Baldwin stp xzr,xzr,[x0] // zero hash value 22bc3d5698SJohn Baldwin stp xzr,xzr,[x0,#16] // [along with is_base2_26] 23bc3d5698SJohn Baldwin 24bc3d5698SJohn Baldwin csel x0,xzr,x0,eq 25bc3d5698SJohn Baldwin b.eq .Lno_key 26bc3d5698SJohn Baldwin 27c0855eaaSJohn Baldwin adrp x17,OPENSSL_armcap_P 28c0855eaaSJohn Baldwin ldr w17,[x17,#:lo12:OPENSSL_armcap_P] 29bc3d5698SJohn Baldwin 30bc3d5698SJohn Baldwin ldp x7,x8,[x1] // load key 31bc3d5698SJohn Baldwin mov x9,#0xfffffffc0fffffff 32bc3d5698SJohn Baldwin movk x9,#0x0fff,lsl#48 33575878a5SEd Maste#ifdef __AARCH64EB__ 34bc3d5698SJohn Baldwin rev x7,x7 // flip bytes 35bc3d5698SJohn Baldwin rev x8,x8 36bc3d5698SJohn Baldwin#endif 37bc3d5698SJohn Baldwin and x7,x7,x9 // &=0ffffffc0fffffff 38bc3d5698SJohn Baldwin and x9,x9,#-4 39bc3d5698SJohn Baldwin and x8,x8,x9 // &=0ffffffc0ffffffc 40bc3d5698SJohn Baldwin stp x7,x8,[x0,#32] // save key value 41bc3d5698SJohn Baldwin 42bc3d5698SJohn Baldwin tst w17,#ARMV7_NEON 43bc3d5698SJohn Baldwin 44c0855eaaSJohn Baldwin adr x12,.Lpoly1305_blocks 45c0855eaaSJohn Baldwin adr x7,.Lpoly1305_blocks_neon 46c0855eaaSJohn Baldwin adr 
x13,.Lpoly1305_emit 47c0855eaaSJohn Baldwin adr x8,.Lpoly1305_emit_neon 48bc3d5698SJohn Baldwin 49bc3d5698SJohn Baldwin csel x12,x12,x7,eq 50bc3d5698SJohn Baldwin csel x13,x13,x8,eq 51bc3d5698SJohn Baldwin 52bc3d5698SJohn Baldwin#ifdef __ILP32__ 53bc3d5698SJohn Baldwin stp w12,w13,[x2] 54bc3d5698SJohn Baldwin#else 55bc3d5698SJohn Baldwin stp x12,x13,[x2] 56bc3d5698SJohn Baldwin#endif 57bc3d5698SJohn Baldwin 58bc3d5698SJohn Baldwin mov x0,#1 59bc3d5698SJohn Baldwin.Lno_key: 60bc3d5698SJohn Baldwin ret 61bc3d5698SJohn Baldwin.size poly1305_init,.-poly1305_init 62bc3d5698SJohn Baldwin 63bc3d5698SJohn Baldwin.type poly1305_blocks,%function 64bc3d5698SJohn Baldwin.align 5 65bc3d5698SJohn Baldwinpoly1305_blocks: 66c0855eaaSJohn Baldwin.Lpoly1305_blocks: 67bd9588bcSAndrew Turner // The symbol .Lpoly1305_blocks is not a .globl symbol 68bd9588bcSAndrew Turner // but a pointer to it is returned by poly1305_init 69bd9588bcSAndrew Turner AARCH64_VALID_CALL_TARGET 70bc3d5698SJohn Baldwin ands x2,x2,#-16 71bc3d5698SJohn Baldwin b.eq .Lno_data 72bc3d5698SJohn Baldwin 73bc3d5698SJohn Baldwin ldp x4,x5,[x0] // load hash value 74bc3d5698SJohn Baldwin ldp x7,x8,[x0,#32] // load key value 75bc3d5698SJohn Baldwin ldr x6,[x0,#16] 76bc3d5698SJohn Baldwin add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) 77bc3d5698SJohn Baldwin b .Loop 78bc3d5698SJohn Baldwin 79bc3d5698SJohn Baldwin.align 5 80bc3d5698SJohn Baldwin.Loop: 81bc3d5698SJohn Baldwin ldp x10,x11,[x1],#16 // load input 82bc3d5698SJohn Baldwin sub x2,x2,#16 83575878a5SEd Maste#ifdef __AARCH64EB__ 84bc3d5698SJohn Baldwin rev x10,x10 85bc3d5698SJohn Baldwin rev x11,x11 86bc3d5698SJohn Baldwin#endif 87bc3d5698SJohn Baldwin adds x4,x4,x10 // accumulate input 88bc3d5698SJohn Baldwin adcs x5,x5,x11 89bc3d5698SJohn Baldwin 90bc3d5698SJohn Baldwin mul x12,x4,x7 // h0*r0 91bc3d5698SJohn Baldwin adc x6,x6,x3 92bc3d5698SJohn Baldwin umulh x13,x4,x7 93bc3d5698SJohn Baldwin 94bc3d5698SJohn Baldwin mul x10,x5,x9 // h1*5*r1 95bc3d5698SJohn Baldwin umulh 
x11,x5,x9 96bc3d5698SJohn Baldwin 97bc3d5698SJohn Baldwin adds x12,x12,x10 98bc3d5698SJohn Baldwin mul x10,x4,x8 // h0*r1 99bc3d5698SJohn Baldwin adc x13,x13,x11 100bc3d5698SJohn Baldwin umulh x14,x4,x8 101bc3d5698SJohn Baldwin 102bc3d5698SJohn Baldwin adds x13,x13,x10 103bc3d5698SJohn Baldwin mul x10,x5,x7 // h1*r0 104bc3d5698SJohn Baldwin adc x14,x14,xzr 105bc3d5698SJohn Baldwin umulh x11,x5,x7 106bc3d5698SJohn Baldwin 107bc3d5698SJohn Baldwin adds x13,x13,x10 108bc3d5698SJohn Baldwin mul x10,x6,x9 // h2*5*r1 109bc3d5698SJohn Baldwin adc x14,x14,x11 110bc3d5698SJohn Baldwin mul x11,x6,x7 // h2*r0 111bc3d5698SJohn Baldwin 112bc3d5698SJohn Baldwin adds x13,x13,x10 113bc3d5698SJohn Baldwin adc x14,x14,x11 114bc3d5698SJohn Baldwin 115bc3d5698SJohn Baldwin and x10,x14,#-4 // final reduction 116bc3d5698SJohn Baldwin and x6,x14,#3 117bc3d5698SJohn Baldwin add x10,x10,x14,lsr#2 118bc3d5698SJohn Baldwin adds x4,x12,x10 119bc3d5698SJohn Baldwin adcs x5,x13,xzr 120bc3d5698SJohn Baldwin adc x6,x6,xzr 121bc3d5698SJohn Baldwin 122bc3d5698SJohn Baldwin cbnz x2,.Loop 123bc3d5698SJohn Baldwin 124bc3d5698SJohn Baldwin stp x4,x5,[x0] // store hash value 125bc3d5698SJohn Baldwin str x6,[x0,#16] 126bc3d5698SJohn Baldwin 127bc3d5698SJohn Baldwin.Lno_data: 128bc3d5698SJohn Baldwin ret 129bc3d5698SJohn Baldwin.size poly1305_blocks,.-poly1305_blocks 130bc3d5698SJohn Baldwin 131bc3d5698SJohn Baldwin.type poly1305_emit,%function 132bc3d5698SJohn Baldwin.align 5 133bc3d5698SJohn Baldwinpoly1305_emit: 134c0855eaaSJohn Baldwin.Lpoly1305_emit: 135bd9588bcSAndrew Turner // The symbol .poly1305_emit is not a .globl symbol 136bd9588bcSAndrew Turner // but a pointer to it is returned by poly1305_init 137bd9588bcSAndrew Turner AARCH64_VALID_CALL_TARGET 138bc3d5698SJohn Baldwin ldp x4,x5,[x0] // load hash base 2^64 139bc3d5698SJohn Baldwin ldr x6,[x0,#16] 140bc3d5698SJohn Baldwin ldp x10,x11,[x2] // load nonce 141bc3d5698SJohn Baldwin 142bc3d5698SJohn Baldwin adds x12,x4,#5 // compare to modulus 
143bc3d5698SJohn Baldwin adcs x13,x5,xzr 144bc3d5698SJohn Baldwin adc x14,x6,xzr 145bc3d5698SJohn Baldwin 146bc3d5698SJohn Baldwin tst x14,#-4 // see if it's carried/borrowed 147bc3d5698SJohn Baldwin 148bc3d5698SJohn Baldwin csel x4,x4,x12,eq 149bc3d5698SJohn Baldwin csel x5,x5,x13,eq 150bc3d5698SJohn Baldwin 151575878a5SEd Maste#ifdef __AARCH64EB__ 152bc3d5698SJohn Baldwin ror x10,x10,#32 // flip nonce words 153bc3d5698SJohn Baldwin ror x11,x11,#32 154bc3d5698SJohn Baldwin#endif 155bc3d5698SJohn Baldwin adds x4,x4,x10 // accumulate nonce 156bc3d5698SJohn Baldwin adc x5,x5,x11 157575878a5SEd Maste#ifdef __AARCH64EB__ 158bc3d5698SJohn Baldwin rev x4,x4 // flip output bytes 159bc3d5698SJohn Baldwin rev x5,x5 160bc3d5698SJohn Baldwin#endif 161bc3d5698SJohn Baldwin stp x4,x5,[x1] // write result 162bc3d5698SJohn Baldwin 163bc3d5698SJohn Baldwin ret 164bc3d5698SJohn Baldwin.size poly1305_emit,.-poly1305_emit 165bc3d5698SJohn Baldwin.type poly1305_mult,%function 166bc3d5698SJohn Baldwin.align 5 167bc3d5698SJohn Baldwinpoly1305_mult: 168bc3d5698SJohn Baldwin mul x12,x4,x7 // h0*r0 169bc3d5698SJohn Baldwin umulh x13,x4,x7 170bc3d5698SJohn Baldwin 171bc3d5698SJohn Baldwin mul x10,x5,x9 // h1*5*r1 172bc3d5698SJohn Baldwin umulh x11,x5,x9 173bc3d5698SJohn Baldwin 174bc3d5698SJohn Baldwin adds x12,x12,x10 175bc3d5698SJohn Baldwin mul x10,x4,x8 // h0*r1 176bc3d5698SJohn Baldwin adc x13,x13,x11 177bc3d5698SJohn Baldwin umulh x14,x4,x8 178bc3d5698SJohn Baldwin 179bc3d5698SJohn Baldwin adds x13,x13,x10 180bc3d5698SJohn Baldwin mul x10,x5,x7 // h1*r0 181bc3d5698SJohn Baldwin adc x14,x14,xzr 182bc3d5698SJohn Baldwin umulh x11,x5,x7 183bc3d5698SJohn Baldwin 184bc3d5698SJohn Baldwin adds x13,x13,x10 185bc3d5698SJohn Baldwin mul x10,x6,x9 // h2*5*r1 186bc3d5698SJohn Baldwin adc x14,x14,x11 187bc3d5698SJohn Baldwin mul x11,x6,x7 // h2*r0 188bc3d5698SJohn Baldwin 189bc3d5698SJohn Baldwin adds x13,x13,x10 190bc3d5698SJohn Baldwin adc x14,x14,x11 191bc3d5698SJohn Baldwin 192bc3d5698SJohn 
Baldwin and x10,x14,#-4 // final reduction 193bc3d5698SJohn Baldwin and x6,x14,#3 194bc3d5698SJohn Baldwin add x10,x10,x14,lsr#2 195bc3d5698SJohn Baldwin adds x4,x12,x10 196bc3d5698SJohn Baldwin adcs x5,x13,xzr 197bc3d5698SJohn Baldwin adc x6,x6,xzr 198bc3d5698SJohn Baldwin 199bc3d5698SJohn Baldwin ret 200bc3d5698SJohn Baldwin.size poly1305_mult,.-poly1305_mult 201bc3d5698SJohn Baldwin 202bc3d5698SJohn Baldwin.type poly1305_splat,%function 203bc3d5698SJohn Baldwin.align 5 204bc3d5698SJohn Baldwinpoly1305_splat: 205bc3d5698SJohn Baldwin and x12,x4,#0x03ffffff // base 2^64 -> base 2^26 206bc3d5698SJohn Baldwin ubfx x13,x4,#26,#26 207bc3d5698SJohn Baldwin extr x14,x5,x4,#52 208bc3d5698SJohn Baldwin and x14,x14,#0x03ffffff 209bc3d5698SJohn Baldwin ubfx x15,x5,#14,#26 210bc3d5698SJohn Baldwin extr x16,x6,x5,#40 211bc3d5698SJohn Baldwin 212bc3d5698SJohn Baldwin str w12,[x0,#16*0] // r0 213bc3d5698SJohn Baldwin add w12,w13,w13,lsl#2 // r1*5 214bc3d5698SJohn Baldwin str w13,[x0,#16*1] // r1 215bc3d5698SJohn Baldwin add w13,w14,w14,lsl#2 // r2*5 216bc3d5698SJohn Baldwin str w12,[x0,#16*2] // s1 217bc3d5698SJohn Baldwin str w14,[x0,#16*3] // r2 218bc3d5698SJohn Baldwin add w14,w15,w15,lsl#2 // r3*5 219bc3d5698SJohn Baldwin str w13,[x0,#16*4] // s2 220bc3d5698SJohn Baldwin str w15,[x0,#16*5] // r3 221bc3d5698SJohn Baldwin add w15,w16,w16,lsl#2 // r4*5 222bc3d5698SJohn Baldwin str w14,[x0,#16*6] // s3 223bc3d5698SJohn Baldwin str w16,[x0,#16*7] // r4 224bc3d5698SJohn Baldwin str w15,[x0,#16*8] // s4 225bc3d5698SJohn Baldwin 226bc3d5698SJohn Baldwin ret 227bc3d5698SJohn Baldwin.size poly1305_splat,.-poly1305_splat 228bc3d5698SJohn Baldwin 229bc3d5698SJohn Baldwin.type poly1305_blocks_neon,%function 230bc3d5698SJohn Baldwin.align 5 231bc3d5698SJohn Baldwinpoly1305_blocks_neon: 232c0855eaaSJohn Baldwin.Lpoly1305_blocks_neon: 233bd9588bcSAndrew Turner // The symbol .Lpoly1305_blocks_neon is not a .globl symbol 234bd9588bcSAndrew Turner // but a pointer to it is returned by 
poly1305_init 235bd9588bcSAndrew Turner AARCH64_VALID_CALL_TARGET 236bc3d5698SJohn Baldwin ldr x17,[x0,#24] 237bc3d5698SJohn Baldwin cmp x2,#128 238bc3d5698SJohn Baldwin b.hs .Lblocks_neon 239c0855eaaSJohn Baldwin cbz x17,.Lpoly1305_blocks 240bc3d5698SJohn Baldwin 241bc3d5698SJohn Baldwin.Lblocks_neon: 242bd9588bcSAndrew Turner AARCH64_SIGN_LINK_REGISTER 243bc3d5698SJohn Baldwin stp x29,x30,[sp,#-80]! 244bc3d5698SJohn Baldwin add x29,sp,#0 245bc3d5698SJohn Baldwin 246bc3d5698SJohn Baldwin ands x2,x2,#-16 247bc3d5698SJohn Baldwin b.eq .Lno_data_neon 248bc3d5698SJohn Baldwin 249bc3d5698SJohn Baldwin cbz x17,.Lbase2_64_neon 250bc3d5698SJohn Baldwin 251bc3d5698SJohn Baldwin ldp w10,w11,[x0] // load hash value base 2^26 252bc3d5698SJohn Baldwin ldp w12,w13,[x0,#8] 253bc3d5698SJohn Baldwin ldr w14,[x0,#16] 254bc3d5698SJohn Baldwin 255bc3d5698SJohn Baldwin tst x2,#31 256bc3d5698SJohn Baldwin b.eq .Leven_neon 257bc3d5698SJohn Baldwin 258bc3d5698SJohn Baldwin ldp x7,x8,[x0,#32] // load key value 259bc3d5698SJohn Baldwin 260bc3d5698SJohn Baldwin add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64 261bc3d5698SJohn Baldwin lsr x5,x12,#12 262bc3d5698SJohn Baldwin adds x4,x4,x12,lsl#52 263bc3d5698SJohn Baldwin add x5,x5,x13,lsl#14 264bc3d5698SJohn Baldwin adc x5,x5,xzr 265bc3d5698SJohn Baldwin lsr x6,x14,#24 266bc3d5698SJohn Baldwin adds x5,x5,x14,lsl#40 267bc3d5698SJohn Baldwin adc x14,x6,xzr // can be partially reduced... 268bc3d5698SJohn Baldwin 269bc3d5698SJohn Baldwin ldp x12,x13,[x1],#16 // load input 270bc3d5698SJohn Baldwin sub x2,x2,#16 271bc3d5698SJohn Baldwin add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) 272bc3d5698SJohn Baldwin 273bc3d5698SJohn Baldwin and x10,x14,#-4 // ... 
so reduce 274bc3d5698SJohn Baldwin and x6,x14,#3 275bc3d5698SJohn Baldwin add x10,x10,x14,lsr#2 276bc3d5698SJohn Baldwin adds x4,x4,x10 277bc3d5698SJohn Baldwin adcs x5,x5,xzr 278bc3d5698SJohn Baldwin adc x6,x6,xzr 279bc3d5698SJohn Baldwin 280575878a5SEd Maste#ifdef __AARCH64EB__ 281bc3d5698SJohn Baldwin rev x12,x12 282bc3d5698SJohn Baldwin rev x13,x13 283bc3d5698SJohn Baldwin#endif 284bc3d5698SJohn Baldwin adds x4,x4,x12 // accumulate input 285bc3d5698SJohn Baldwin adcs x5,x5,x13 286bc3d5698SJohn Baldwin adc x6,x6,x3 287bc3d5698SJohn Baldwin 288bc3d5698SJohn Baldwin bl poly1305_mult 289bc3d5698SJohn Baldwin ldr x30,[sp,#8] 290bc3d5698SJohn Baldwin 291bc3d5698SJohn Baldwin cbz x3,.Lstore_base2_64_neon 292bc3d5698SJohn Baldwin 293bc3d5698SJohn Baldwin and x10,x4,#0x03ffffff // base 2^64 -> base 2^26 294bc3d5698SJohn Baldwin ubfx x11,x4,#26,#26 295bc3d5698SJohn Baldwin extr x12,x5,x4,#52 296bc3d5698SJohn Baldwin and x12,x12,#0x03ffffff 297bc3d5698SJohn Baldwin ubfx x13,x5,#14,#26 298bc3d5698SJohn Baldwin extr x14,x6,x5,#40 299bc3d5698SJohn Baldwin 300bc3d5698SJohn Baldwin cbnz x2,.Leven_neon 301bc3d5698SJohn Baldwin 302bc3d5698SJohn Baldwin stp w10,w11,[x0] // store hash value base 2^26 303bc3d5698SJohn Baldwin stp w12,w13,[x0,#8] 304bc3d5698SJohn Baldwin str w14,[x0,#16] 305bc3d5698SJohn Baldwin b .Lno_data_neon 306bc3d5698SJohn Baldwin 307bc3d5698SJohn Baldwin.align 4 308bc3d5698SJohn Baldwin.Lstore_base2_64_neon: 309bc3d5698SJohn Baldwin stp x4,x5,[x0] // store hash value base 2^64 310bc3d5698SJohn Baldwin stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed 311bc3d5698SJohn Baldwin b .Lno_data_neon 312bc3d5698SJohn Baldwin 313bc3d5698SJohn Baldwin.align 4 314bc3d5698SJohn Baldwin.Lbase2_64_neon: 315bc3d5698SJohn Baldwin ldp x7,x8,[x0,#32] // load key value 316bc3d5698SJohn Baldwin 317bc3d5698SJohn Baldwin ldp x4,x5,[x0] // load hash value base 2^64 318bc3d5698SJohn Baldwin ldr x6,[x0,#16] 319bc3d5698SJohn Baldwin 320bc3d5698SJohn Baldwin tst x2,#31 
321bc3d5698SJohn Baldwin b.eq .Linit_neon 322bc3d5698SJohn Baldwin 323bc3d5698SJohn Baldwin ldp x12,x13,[x1],#16 // load input 324bc3d5698SJohn Baldwin sub x2,x2,#16 325bc3d5698SJohn Baldwin add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) 326575878a5SEd Maste#ifdef __AARCH64EB__ 327bc3d5698SJohn Baldwin rev x12,x12 328bc3d5698SJohn Baldwin rev x13,x13 329bc3d5698SJohn Baldwin#endif 330bc3d5698SJohn Baldwin adds x4,x4,x12 // accumulate input 331bc3d5698SJohn Baldwin adcs x5,x5,x13 332bc3d5698SJohn Baldwin adc x6,x6,x3 333bc3d5698SJohn Baldwin 334bc3d5698SJohn Baldwin bl poly1305_mult 335bc3d5698SJohn Baldwin 336bc3d5698SJohn Baldwin.Linit_neon: 337bc3d5698SJohn Baldwin and x10,x4,#0x03ffffff // base 2^64 -> base 2^26 338bc3d5698SJohn Baldwin ubfx x11,x4,#26,#26 339bc3d5698SJohn Baldwin extr x12,x5,x4,#52 340bc3d5698SJohn Baldwin and x12,x12,#0x03ffffff 341bc3d5698SJohn Baldwin ubfx x13,x5,#14,#26 342bc3d5698SJohn Baldwin extr x14,x6,x5,#40 343bc3d5698SJohn Baldwin 344bc3d5698SJohn Baldwin stp d8,d9,[sp,#16] // meet ABI requirements 345bc3d5698SJohn Baldwin stp d10,d11,[sp,#32] 346bc3d5698SJohn Baldwin stp d12,d13,[sp,#48] 347bc3d5698SJohn Baldwin stp d14,d15,[sp,#64] 348bc3d5698SJohn Baldwin 349bc3d5698SJohn Baldwin fmov d24,x10 350bc3d5698SJohn Baldwin fmov d25,x11 351bc3d5698SJohn Baldwin fmov d26,x12 352bc3d5698SJohn Baldwin fmov d27,x13 353bc3d5698SJohn Baldwin fmov d28,x14 354bc3d5698SJohn Baldwin 355bc3d5698SJohn Baldwin ////////////////////////////////// initialize r^n table 356bc3d5698SJohn Baldwin mov x4,x7 // r^1 357bc3d5698SJohn Baldwin add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) 358bc3d5698SJohn Baldwin mov x5,x8 359bc3d5698SJohn Baldwin mov x6,xzr 360bc3d5698SJohn Baldwin add x0,x0,#48+12 361bc3d5698SJohn Baldwin bl poly1305_splat 362bc3d5698SJohn Baldwin 363bc3d5698SJohn Baldwin bl poly1305_mult // r^2 364bc3d5698SJohn Baldwin sub x0,x0,#4 365bc3d5698SJohn Baldwin bl poly1305_splat 366bc3d5698SJohn Baldwin 367bc3d5698SJohn Baldwin bl poly1305_mult // r^3 
368bc3d5698SJohn Baldwin sub x0,x0,#4 369bc3d5698SJohn Baldwin bl poly1305_splat 370bc3d5698SJohn Baldwin 371bc3d5698SJohn Baldwin bl poly1305_mult // r^4 372bc3d5698SJohn Baldwin sub x0,x0,#4 373bc3d5698SJohn Baldwin bl poly1305_splat 374bc3d5698SJohn Baldwin ldr x30,[sp,#8] 375bc3d5698SJohn Baldwin 376bc3d5698SJohn Baldwin add x16,x1,#32 377bc3d5698SJohn Baldwin adr x17,.Lzeros 378bc3d5698SJohn Baldwin subs x2,x2,#64 379bc3d5698SJohn Baldwin csel x16,x17,x16,lo 380bc3d5698SJohn Baldwin 381bc3d5698SJohn Baldwin mov x4,#1 382c0855eaaSJohn Baldwin stur x4,[x0,#-24] // set is_base2_26 383bc3d5698SJohn Baldwin sub x0,x0,#48 // restore original x0 384bc3d5698SJohn Baldwin b .Ldo_neon 385bc3d5698SJohn Baldwin 386bc3d5698SJohn Baldwin.align 4 387bc3d5698SJohn Baldwin.Leven_neon: 388bc3d5698SJohn Baldwin add x16,x1,#32 389bc3d5698SJohn Baldwin adr x17,.Lzeros 390bc3d5698SJohn Baldwin subs x2,x2,#64 391bc3d5698SJohn Baldwin csel x16,x17,x16,lo 392bc3d5698SJohn Baldwin 393bc3d5698SJohn Baldwin stp d8,d9,[sp,#16] // meet ABI requirements 394bc3d5698SJohn Baldwin stp d10,d11,[sp,#32] 395bc3d5698SJohn Baldwin stp d12,d13,[sp,#48] 396bc3d5698SJohn Baldwin stp d14,d15,[sp,#64] 397bc3d5698SJohn Baldwin 398bc3d5698SJohn Baldwin fmov d24,x10 399bc3d5698SJohn Baldwin fmov d25,x11 400bc3d5698SJohn Baldwin fmov d26,x12 401bc3d5698SJohn Baldwin fmov d27,x13 402bc3d5698SJohn Baldwin fmov d28,x14 403bc3d5698SJohn Baldwin 404bc3d5698SJohn Baldwin.Ldo_neon: 405bc3d5698SJohn Baldwin ldp x8,x12,[x16],#16 // inp[2:3] (or zero) 406bc3d5698SJohn Baldwin ldp x9,x13,[x16],#48 407bc3d5698SJohn Baldwin 408bc3d5698SJohn Baldwin lsl x3,x3,#24 409bc3d5698SJohn Baldwin add x15,x0,#48 410bc3d5698SJohn Baldwin 411575878a5SEd Maste#ifdef __AARCH64EB__ 412bc3d5698SJohn Baldwin rev x8,x8 413bc3d5698SJohn Baldwin rev x12,x12 414bc3d5698SJohn Baldwin rev x9,x9 415bc3d5698SJohn Baldwin rev x13,x13 416bc3d5698SJohn Baldwin#endif 417bc3d5698SJohn Baldwin and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 
418bc3d5698SJohn Baldwin and x5,x9,#0x03ffffff 419bc3d5698SJohn Baldwin ubfx x6,x8,#26,#26 420bc3d5698SJohn Baldwin ubfx x7,x9,#26,#26 421bc3d5698SJohn Baldwin add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 422bc3d5698SJohn Baldwin extr x8,x12,x8,#52 423bc3d5698SJohn Baldwin extr x9,x13,x9,#52 424bc3d5698SJohn Baldwin add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 425bc3d5698SJohn Baldwin fmov d14,x4 426bc3d5698SJohn Baldwin and x8,x8,#0x03ffffff 427bc3d5698SJohn Baldwin and x9,x9,#0x03ffffff 428bc3d5698SJohn Baldwin ubfx x10,x12,#14,#26 429bc3d5698SJohn Baldwin ubfx x11,x13,#14,#26 430bc3d5698SJohn Baldwin add x12,x3,x12,lsr#40 431bc3d5698SJohn Baldwin add x13,x3,x13,lsr#40 432bc3d5698SJohn Baldwin add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 433bc3d5698SJohn Baldwin fmov d15,x6 434bc3d5698SJohn Baldwin add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 435bc3d5698SJohn Baldwin add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 436bc3d5698SJohn Baldwin fmov d16,x8 437bc3d5698SJohn Baldwin fmov d17,x10 438bc3d5698SJohn Baldwin fmov d18,x12 439bc3d5698SJohn Baldwin 440bc3d5698SJohn Baldwin ldp x8,x12,[x1],#16 // inp[0:1] 441bc3d5698SJohn Baldwin ldp x9,x13,[x1],#48 442bc3d5698SJohn Baldwin 443bc3d5698SJohn Baldwin ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64 444bc3d5698SJohn Baldwin ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64 445bc3d5698SJohn Baldwin ld1 {v8.4s},[x15] 446bc3d5698SJohn Baldwin 447575878a5SEd Maste#ifdef __AARCH64EB__ 448bc3d5698SJohn Baldwin rev x8,x8 449bc3d5698SJohn Baldwin rev x12,x12 450bc3d5698SJohn Baldwin rev x9,x9 451bc3d5698SJohn Baldwin rev x13,x13 452bc3d5698SJohn Baldwin#endif 453bc3d5698SJohn Baldwin and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 454bc3d5698SJohn Baldwin and x5,x9,#0x03ffffff 455bc3d5698SJohn Baldwin ubfx x6,x8,#26,#26 456bc3d5698SJohn Baldwin ubfx x7,x9,#26,#26 457bc3d5698SJohn Baldwin add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 458bc3d5698SJohn Baldwin extr x8,x12,x8,#52 459bc3d5698SJohn Baldwin extr x9,x13,x9,#52 460bc3d5698SJohn Baldwin add 
x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 461bc3d5698SJohn Baldwin fmov d9,x4 462bc3d5698SJohn Baldwin and x8,x8,#0x03ffffff 463bc3d5698SJohn Baldwin and x9,x9,#0x03ffffff 464bc3d5698SJohn Baldwin ubfx x10,x12,#14,#26 465bc3d5698SJohn Baldwin ubfx x11,x13,#14,#26 466bc3d5698SJohn Baldwin add x12,x3,x12,lsr#40 467bc3d5698SJohn Baldwin add x13,x3,x13,lsr#40 468bc3d5698SJohn Baldwin add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 469bc3d5698SJohn Baldwin fmov d10,x6 470bc3d5698SJohn Baldwin add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 471bc3d5698SJohn Baldwin add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 472bc3d5698SJohn Baldwin movi v31.2d,#-1 473bc3d5698SJohn Baldwin fmov d11,x8 474bc3d5698SJohn Baldwin fmov d12,x10 475bc3d5698SJohn Baldwin fmov d13,x12 476bc3d5698SJohn Baldwin ushr v31.2d,v31.2d,#38 477bc3d5698SJohn Baldwin 478bc3d5698SJohn Baldwin b.ls .Lskip_loop 479bc3d5698SJohn Baldwin 480bc3d5698SJohn Baldwin.align 4 481bc3d5698SJohn Baldwin.Loop_neon: 482bc3d5698SJohn Baldwin //////////////////////////////////////////////////////////////// 483bc3d5698SJohn Baldwin // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 484bc3d5698SJohn Baldwin // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 485bc3d5698SJohn Baldwin // ___________________/ 486bc3d5698SJohn Baldwin // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 487bc3d5698SJohn Baldwin // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 488bc3d5698SJohn Baldwin // ___________________/ ____________________/ 489bc3d5698SJohn Baldwin // 490bc3d5698SJohn Baldwin // Note that we start with inp[2:3]*r^2. This is because it 491bc3d5698SJohn Baldwin // doesn't depend on reduction in previous iteration. 
492bc3d5698SJohn Baldwin //////////////////////////////////////////////////////////////// 493bc3d5698SJohn Baldwin // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 494bc3d5698SJohn Baldwin // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 495bc3d5698SJohn Baldwin // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 496bc3d5698SJohn Baldwin // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 497bc3d5698SJohn Baldwin // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 498bc3d5698SJohn Baldwin 499bc3d5698SJohn Baldwin subs x2,x2,#64 500bc3d5698SJohn Baldwin umull v23.2d,v14.2s,v7.s[2] 501bc3d5698SJohn Baldwin csel x16,x17,x16,lo 502bc3d5698SJohn Baldwin umull v22.2d,v14.2s,v5.s[2] 503bc3d5698SJohn Baldwin umull v21.2d,v14.2s,v3.s[2] 504bc3d5698SJohn Baldwin ldp x8,x12,[x16],#16 // inp[2:3] (or zero) 505bc3d5698SJohn Baldwin umull v20.2d,v14.2s,v1.s[2] 506bc3d5698SJohn Baldwin ldp x9,x13,[x16],#48 507bc3d5698SJohn Baldwin umull v19.2d,v14.2s,v0.s[2] 508575878a5SEd Maste#ifdef __AARCH64EB__ 509bc3d5698SJohn Baldwin rev x8,x8 510bc3d5698SJohn Baldwin rev x12,x12 511bc3d5698SJohn Baldwin rev x9,x9 512bc3d5698SJohn Baldwin rev x13,x13 513bc3d5698SJohn Baldwin#endif 514bc3d5698SJohn Baldwin 515bc3d5698SJohn Baldwin umlal v23.2d,v15.2s,v5.s[2] 516bc3d5698SJohn Baldwin and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 517bc3d5698SJohn Baldwin umlal v22.2d,v15.2s,v3.s[2] 518bc3d5698SJohn Baldwin and x5,x9,#0x03ffffff 519bc3d5698SJohn Baldwin umlal v21.2d,v15.2s,v1.s[2] 520bc3d5698SJohn Baldwin ubfx x6,x8,#26,#26 521bc3d5698SJohn Baldwin umlal v20.2d,v15.2s,v0.s[2] 522bc3d5698SJohn Baldwin ubfx x7,x9,#26,#26 523bc3d5698SJohn Baldwin umlal v19.2d,v15.2s,v8.s[2] 524bc3d5698SJohn Baldwin add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 525bc3d5698SJohn Baldwin 526bc3d5698SJohn Baldwin umlal v23.2d,v16.2s,v3.s[2] 527bc3d5698SJohn Baldwin extr x8,x12,x8,#52 528bc3d5698SJohn Baldwin umlal v22.2d,v16.2s,v1.s[2] 529bc3d5698SJohn Baldwin extr x9,x13,x9,#52 530bc3d5698SJohn Baldwin umlal 
v21.2d,v16.2s,v0.s[2] 531bc3d5698SJohn Baldwin add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 532bc3d5698SJohn Baldwin umlal v20.2d,v16.2s,v8.s[2] 533bc3d5698SJohn Baldwin fmov d14,x4 534bc3d5698SJohn Baldwin umlal v19.2d,v16.2s,v6.s[2] 535bc3d5698SJohn Baldwin and x8,x8,#0x03ffffff 536bc3d5698SJohn Baldwin 537bc3d5698SJohn Baldwin umlal v23.2d,v17.2s,v1.s[2] 538bc3d5698SJohn Baldwin and x9,x9,#0x03ffffff 539bc3d5698SJohn Baldwin umlal v22.2d,v17.2s,v0.s[2] 540bc3d5698SJohn Baldwin ubfx x10,x12,#14,#26 541bc3d5698SJohn Baldwin umlal v21.2d,v17.2s,v8.s[2] 542bc3d5698SJohn Baldwin ubfx x11,x13,#14,#26 543bc3d5698SJohn Baldwin umlal v20.2d,v17.2s,v6.s[2] 544bc3d5698SJohn Baldwin add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 545bc3d5698SJohn Baldwin umlal v19.2d,v17.2s,v4.s[2] 546bc3d5698SJohn Baldwin fmov d15,x6 547bc3d5698SJohn Baldwin 548bc3d5698SJohn Baldwin add v11.2s,v11.2s,v26.2s 549bc3d5698SJohn Baldwin add x12,x3,x12,lsr#40 550bc3d5698SJohn Baldwin umlal v23.2d,v18.2s,v0.s[2] 551bc3d5698SJohn Baldwin add x13,x3,x13,lsr#40 552bc3d5698SJohn Baldwin umlal v22.2d,v18.2s,v8.s[2] 553bc3d5698SJohn Baldwin add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 554bc3d5698SJohn Baldwin umlal v21.2d,v18.2s,v6.s[2] 555bc3d5698SJohn Baldwin add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 556bc3d5698SJohn Baldwin umlal v20.2d,v18.2s,v4.s[2] 557bc3d5698SJohn Baldwin fmov d16,x8 558bc3d5698SJohn Baldwin umlal v19.2d,v18.2s,v2.s[2] 559bc3d5698SJohn Baldwin fmov d17,x10 560bc3d5698SJohn Baldwin 561bc3d5698SJohn Baldwin //////////////////////////////////////////////////////////////// 562bc3d5698SJohn Baldwin // (hash+inp[0:1])*r^4 and accumulate 563bc3d5698SJohn Baldwin 564bc3d5698SJohn Baldwin add v9.2s,v9.2s,v24.2s 565bc3d5698SJohn Baldwin fmov d18,x12 566bc3d5698SJohn Baldwin umlal v22.2d,v11.2s,v1.s[0] 567bc3d5698SJohn Baldwin ldp x8,x12,[x1],#16 // inp[0:1] 568bc3d5698SJohn Baldwin umlal v19.2d,v11.2s,v6.s[0] 569bc3d5698SJohn Baldwin ldp x9,x13,[x1],#48 570bc3d5698SJohn Baldwin umlal 
v23.2d,v11.2s,v3.s[0] 571bc3d5698SJohn Baldwin umlal v20.2d,v11.2s,v8.s[0] 572bc3d5698SJohn Baldwin umlal v21.2d,v11.2s,v0.s[0] 573575878a5SEd Maste#ifdef __AARCH64EB__ 574bc3d5698SJohn Baldwin rev x8,x8 575bc3d5698SJohn Baldwin rev x12,x12 576bc3d5698SJohn Baldwin rev x9,x9 577bc3d5698SJohn Baldwin rev x13,x13 578bc3d5698SJohn Baldwin#endif 579bc3d5698SJohn Baldwin 580bc3d5698SJohn Baldwin add v10.2s,v10.2s,v25.2s 581bc3d5698SJohn Baldwin umlal v22.2d,v9.2s,v5.s[0] 582bc3d5698SJohn Baldwin umlal v23.2d,v9.2s,v7.s[0] 583bc3d5698SJohn Baldwin and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 584bc3d5698SJohn Baldwin umlal v21.2d,v9.2s,v3.s[0] 585bc3d5698SJohn Baldwin and x5,x9,#0x03ffffff 586bc3d5698SJohn Baldwin umlal v19.2d,v9.2s,v0.s[0] 587bc3d5698SJohn Baldwin ubfx x6,x8,#26,#26 588bc3d5698SJohn Baldwin umlal v20.2d,v9.2s,v1.s[0] 589bc3d5698SJohn Baldwin ubfx x7,x9,#26,#26 590bc3d5698SJohn Baldwin 591bc3d5698SJohn Baldwin add v12.2s,v12.2s,v27.2s 592bc3d5698SJohn Baldwin add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 593bc3d5698SJohn Baldwin umlal v22.2d,v10.2s,v3.s[0] 594bc3d5698SJohn Baldwin extr x8,x12,x8,#52 595bc3d5698SJohn Baldwin umlal v23.2d,v10.2s,v5.s[0] 596bc3d5698SJohn Baldwin extr x9,x13,x9,#52 597bc3d5698SJohn Baldwin umlal v19.2d,v10.2s,v8.s[0] 598bc3d5698SJohn Baldwin add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 599bc3d5698SJohn Baldwin umlal v21.2d,v10.2s,v1.s[0] 600bc3d5698SJohn Baldwin fmov d9,x4 601bc3d5698SJohn Baldwin umlal v20.2d,v10.2s,v0.s[0] 602bc3d5698SJohn Baldwin and x8,x8,#0x03ffffff 603bc3d5698SJohn Baldwin 604bc3d5698SJohn Baldwin add v13.2s,v13.2s,v28.2s 605bc3d5698SJohn Baldwin and x9,x9,#0x03ffffff 606bc3d5698SJohn Baldwin umlal v22.2d,v12.2s,v0.s[0] 607bc3d5698SJohn Baldwin ubfx x10,x12,#14,#26 608bc3d5698SJohn Baldwin umlal v19.2d,v12.2s,v4.s[0] 609bc3d5698SJohn Baldwin ubfx x11,x13,#14,#26 610bc3d5698SJohn Baldwin umlal v23.2d,v12.2s,v1.s[0] 611bc3d5698SJohn Baldwin add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 612bc3d5698SJohn Baldwin umlal 
v20.2d,v12.2s,v6.s[0] 613bc3d5698SJohn Baldwin fmov d10,x6 614bc3d5698SJohn Baldwin umlal v21.2d,v12.2s,v8.s[0] 615bc3d5698SJohn Baldwin add x12,x3,x12,lsr#40 616bc3d5698SJohn Baldwin 617bc3d5698SJohn Baldwin umlal v22.2d,v13.2s,v8.s[0] 618bc3d5698SJohn Baldwin add x13,x3,x13,lsr#40 619bc3d5698SJohn Baldwin umlal v19.2d,v13.2s,v2.s[0] 620bc3d5698SJohn Baldwin add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 621bc3d5698SJohn Baldwin umlal v23.2d,v13.2s,v0.s[0] 622bc3d5698SJohn Baldwin add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 623bc3d5698SJohn Baldwin umlal v20.2d,v13.2s,v4.s[0] 624bc3d5698SJohn Baldwin fmov d11,x8 625bc3d5698SJohn Baldwin umlal v21.2d,v13.2s,v6.s[0] 626bc3d5698SJohn Baldwin fmov d12,x10 627bc3d5698SJohn Baldwin fmov d13,x12 628bc3d5698SJohn Baldwin 629bc3d5698SJohn Baldwin ///////////////////////////////////////////////////////////////// 630bc3d5698SJohn Baldwin // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 631bc3d5698SJohn Baldwin // and P. Schwabe 632bc3d5698SJohn Baldwin // 633bc3d5698SJohn Baldwin // [see discussion in poly1305-armv4 module] 634bc3d5698SJohn Baldwin 635bc3d5698SJohn Baldwin ushr v29.2d,v22.2d,#26 636bc3d5698SJohn Baldwin xtn v27.2s,v22.2d 637bc3d5698SJohn Baldwin ushr v30.2d,v19.2d,#26 638bc3d5698SJohn Baldwin and v19.16b,v19.16b,v31.16b 639bc3d5698SJohn Baldwin add v23.2d,v23.2d,v29.2d // h3 -> h4 640bc3d5698SJohn Baldwin bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff 641bc3d5698SJohn Baldwin add v20.2d,v20.2d,v30.2d // h0 -> h1 642bc3d5698SJohn Baldwin 643bc3d5698SJohn Baldwin ushr v29.2d,v23.2d,#26 644bc3d5698SJohn Baldwin xtn v28.2s,v23.2d 645bc3d5698SJohn Baldwin ushr v30.2d,v20.2d,#26 646bc3d5698SJohn Baldwin xtn v25.2s,v20.2d 647bc3d5698SJohn Baldwin bic v28.2s,#0xfc,lsl#24 648bc3d5698SJohn Baldwin add v21.2d,v21.2d,v30.2d // h1 -> h2 649bc3d5698SJohn Baldwin 650bc3d5698SJohn Baldwin add v19.2d,v19.2d,v29.2d 651bc3d5698SJohn Baldwin shl v29.2d,v29.2d,#2 652bc3d5698SJohn Baldwin shrn v30.2s,v21.2d,#26 
653bc3d5698SJohn Baldwin xtn v26.2s,v21.2d 654bc3d5698SJohn Baldwin add v19.2d,v19.2d,v29.2d // h4 -> h0 655bc3d5698SJohn Baldwin bic v25.2s,#0xfc,lsl#24 656bc3d5698SJohn Baldwin add v27.2s,v27.2s,v30.2s // h2 -> h3 657bc3d5698SJohn Baldwin bic v26.2s,#0xfc,lsl#24 658bc3d5698SJohn Baldwin 659bc3d5698SJohn Baldwin shrn v29.2s,v19.2d,#26 660bc3d5698SJohn Baldwin xtn v24.2s,v19.2d 661bc3d5698SJohn Baldwin ushr v30.2s,v27.2s,#26 662bc3d5698SJohn Baldwin bic v27.2s,#0xfc,lsl#24 663bc3d5698SJohn Baldwin bic v24.2s,#0xfc,lsl#24 664bc3d5698SJohn Baldwin add v25.2s,v25.2s,v29.2s // h0 -> h1 665bc3d5698SJohn Baldwin add v28.2s,v28.2s,v30.2s // h3 -> h4 666bc3d5698SJohn Baldwin 667bc3d5698SJohn Baldwin b.hi .Loop_neon 668bc3d5698SJohn Baldwin 669bc3d5698SJohn Baldwin.Lskip_loop: 670bc3d5698SJohn Baldwin dup v16.2d,v16.d[0] 671bc3d5698SJohn Baldwin add v11.2s,v11.2s,v26.2s 672bc3d5698SJohn Baldwin 673bc3d5698SJohn Baldwin //////////////////////////////////////////////////////////////// 674bc3d5698SJohn Baldwin // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 675bc3d5698SJohn Baldwin 676bc3d5698SJohn Baldwin adds x2,x2,#32 677bc3d5698SJohn Baldwin b.ne .Long_tail 678bc3d5698SJohn Baldwin 679bc3d5698SJohn Baldwin dup v16.2d,v11.d[0] 680bc3d5698SJohn Baldwin add v14.2s,v9.2s,v24.2s 681bc3d5698SJohn Baldwin add v17.2s,v12.2s,v27.2s 682bc3d5698SJohn Baldwin add v15.2s,v10.2s,v25.2s 683bc3d5698SJohn Baldwin add v18.2s,v13.2s,v28.2s 684bc3d5698SJohn Baldwin 685bc3d5698SJohn Baldwin.Long_tail: 686bc3d5698SJohn Baldwin dup v14.2d,v14.d[0] 687bc3d5698SJohn Baldwin umull2 v19.2d,v16.4s,v6.4s 688bc3d5698SJohn Baldwin umull2 v22.2d,v16.4s,v1.4s 689bc3d5698SJohn Baldwin umull2 v23.2d,v16.4s,v3.4s 690bc3d5698SJohn Baldwin umull2 v21.2d,v16.4s,v0.4s 691bc3d5698SJohn Baldwin umull2 v20.2d,v16.4s,v8.4s 692bc3d5698SJohn Baldwin 693bc3d5698SJohn Baldwin dup v15.2d,v15.d[0] 694bc3d5698SJohn Baldwin umlal2 v19.2d,v14.4s,v0.4s 695bc3d5698SJohn Baldwin umlal2 v21.2d,v14.4s,v3.4s 696bc3d5698SJohn 
Baldwin umlal2 v22.2d,v14.4s,v5.4s 697bc3d5698SJohn Baldwin umlal2 v23.2d,v14.4s,v7.4s 698bc3d5698SJohn Baldwin umlal2 v20.2d,v14.4s,v1.4s 699bc3d5698SJohn Baldwin 700bc3d5698SJohn Baldwin dup v17.2d,v17.d[0] 701bc3d5698SJohn Baldwin umlal2 v19.2d,v15.4s,v8.4s 702bc3d5698SJohn Baldwin umlal2 v22.2d,v15.4s,v3.4s 703bc3d5698SJohn Baldwin umlal2 v21.2d,v15.4s,v1.4s 704bc3d5698SJohn Baldwin umlal2 v23.2d,v15.4s,v5.4s 705bc3d5698SJohn Baldwin umlal2 v20.2d,v15.4s,v0.4s 706bc3d5698SJohn Baldwin 707bc3d5698SJohn Baldwin dup v18.2d,v18.d[0] 708bc3d5698SJohn Baldwin umlal2 v22.2d,v17.4s,v0.4s 709bc3d5698SJohn Baldwin umlal2 v23.2d,v17.4s,v1.4s 710bc3d5698SJohn Baldwin umlal2 v19.2d,v17.4s,v4.4s 711bc3d5698SJohn Baldwin umlal2 v20.2d,v17.4s,v6.4s 712bc3d5698SJohn Baldwin umlal2 v21.2d,v17.4s,v8.4s 713bc3d5698SJohn Baldwin 714bc3d5698SJohn Baldwin umlal2 v22.2d,v18.4s,v8.4s 715bc3d5698SJohn Baldwin umlal2 v19.2d,v18.4s,v2.4s 716bc3d5698SJohn Baldwin umlal2 v23.2d,v18.4s,v0.4s 717bc3d5698SJohn Baldwin umlal2 v20.2d,v18.4s,v4.4s 718bc3d5698SJohn Baldwin umlal2 v21.2d,v18.4s,v6.4s 719bc3d5698SJohn Baldwin 720bc3d5698SJohn Baldwin b.eq .Lshort_tail 721bc3d5698SJohn Baldwin 722bc3d5698SJohn Baldwin //////////////////////////////////////////////////////////////// 723bc3d5698SJohn Baldwin // (hash+inp[0:1])*r^4:r^3 and accumulate 724bc3d5698SJohn Baldwin 725bc3d5698SJohn Baldwin add v9.2s,v9.2s,v24.2s 726bc3d5698SJohn Baldwin umlal v22.2d,v11.2s,v1.2s 727bc3d5698SJohn Baldwin umlal v19.2d,v11.2s,v6.2s 728bc3d5698SJohn Baldwin umlal v23.2d,v11.2s,v3.2s 729bc3d5698SJohn Baldwin umlal v20.2d,v11.2s,v8.2s 730bc3d5698SJohn Baldwin umlal v21.2d,v11.2s,v0.2s 731bc3d5698SJohn Baldwin 732bc3d5698SJohn Baldwin add v10.2s,v10.2s,v25.2s 733bc3d5698SJohn Baldwin umlal v22.2d,v9.2s,v5.2s 734bc3d5698SJohn Baldwin umlal v19.2d,v9.2s,v0.2s 735bc3d5698SJohn Baldwin umlal v23.2d,v9.2s,v7.2s 736bc3d5698SJohn Baldwin umlal v20.2d,v9.2s,v1.2s 737bc3d5698SJohn Baldwin umlal v21.2d,v9.2s,v3.2s 
738bc3d5698SJohn Baldwin 739bc3d5698SJohn Baldwin add v12.2s,v12.2s,v27.2s 740bc3d5698SJohn Baldwin umlal v22.2d,v10.2s,v3.2s 741bc3d5698SJohn Baldwin umlal v19.2d,v10.2s,v8.2s 742bc3d5698SJohn Baldwin umlal v23.2d,v10.2s,v5.2s 743bc3d5698SJohn Baldwin umlal v20.2d,v10.2s,v0.2s 744bc3d5698SJohn Baldwin umlal v21.2d,v10.2s,v1.2s 745bc3d5698SJohn Baldwin 746bc3d5698SJohn Baldwin add v13.2s,v13.2s,v28.2s 747bc3d5698SJohn Baldwin umlal v22.2d,v12.2s,v0.2s 748bc3d5698SJohn Baldwin umlal v19.2d,v12.2s,v4.2s 749bc3d5698SJohn Baldwin umlal v23.2d,v12.2s,v1.2s 750bc3d5698SJohn Baldwin umlal v20.2d,v12.2s,v6.2s 751bc3d5698SJohn Baldwin umlal v21.2d,v12.2s,v8.2s 752bc3d5698SJohn Baldwin 753bc3d5698SJohn Baldwin umlal v22.2d,v13.2s,v8.2s 754bc3d5698SJohn Baldwin umlal v19.2d,v13.2s,v2.2s 755bc3d5698SJohn Baldwin umlal v23.2d,v13.2s,v0.2s 756bc3d5698SJohn Baldwin umlal v20.2d,v13.2s,v4.2s 757bc3d5698SJohn Baldwin umlal v21.2d,v13.2s,v6.2s 758bc3d5698SJohn Baldwin 759bc3d5698SJohn Baldwin.Lshort_tail: 760bc3d5698SJohn Baldwin //////////////////////////////////////////////////////////////// 761bc3d5698SJohn Baldwin // horizontal add 762bc3d5698SJohn Baldwin 763bc3d5698SJohn Baldwin addp v22.2d,v22.2d,v22.2d 764bc3d5698SJohn Baldwin ldp d8,d9,[sp,#16] // meet ABI requirements 765bc3d5698SJohn Baldwin addp v19.2d,v19.2d,v19.2d 766bc3d5698SJohn Baldwin ldp d10,d11,[sp,#32] 767bc3d5698SJohn Baldwin addp v23.2d,v23.2d,v23.2d 768bc3d5698SJohn Baldwin ldp d12,d13,[sp,#48] 769bc3d5698SJohn Baldwin addp v20.2d,v20.2d,v20.2d 770bc3d5698SJohn Baldwin ldp d14,d15,[sp,#64] 771bc3d5698SJohn Baldwin addp v21.2d,v21.2d,v21.2d 772bc3d5698SJohn Baldwin 773bc3d5698SJohn Baldwin //////////////////////////////////////////////////////////////// 774bc3d5698SJohn Baldwin // lazy reduction, but without narrowing 775bc3d5698SJohn Baldwin 776bc3d5698SJohn Baldwin ushr v29.2d,v22.2d,#26 777bc3d5698SJohn Baldwin and v22.16b,v22.16b,v31.16b 778bc3d5698SJohn Baldwin ushr v30.2d,v19.2d,#26 779bc3d5698SJohn 
// Tail of poly1305_blocks_neon: last round of lazy carry propagation over
// the five base-2^26 hash limbs h0..h4 (held one limb per 64-bit lane in
// v19..v23; v31 is the 2^26-1 limb mask, v29/v30 carry between limbs),
// then store of the (possibly still partially reduced) hash.
// NOTE(review): the function itself begins earlier in the file; only its
// tail falls in this span.
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2	// h4 carry folds back as *5 = (<<2)+1
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16	// h0..h3, base 2^26
	st1	{v23.s}[0],[x0]				// h4

.Lno_data_neon:
	ldr	x29,[sp],#80		// restore x29, drop 80-byte frame
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

////////////////////////////////////////////////////////////////////////
// void poly1305_emit_neon(void *ctx    /* x0 */,
//                         u8   mac[16] /* x1 */,
//                         const u32 nonce[4] /* x2 */)
//
// Produce the final 16-byte Poly1305 tag.  If the is_base2_26 flag at
// [x0,#24] is clear the hash was never converted to base 2^26, so this
// tail-branches to the scalar poly1305_emit.  Otherwise the five
// base-2^26 limbs at [x0..x0+20) are recombined into a 130-bit value in
// x6:x5:x4, reduced mod p = 2^130-5, the nonce is added mod 2^128, and
// the tag is stored little-endian at [x1].
.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	// The symbol .Lpoly1305_emit_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr	x17,[x0,#24]		// is_base2_26 flag
	cbz	x17,poly1305_emit	// hash still base 2^64: scalar emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2	// bits >= 2^130 fold back as *5
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq		// keep h if h < p, else take h - p
	csel	x5,x5,x13,eq
#ifdef	__AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

// Constant pool and identification string ("Poly1305 for ARMv8,
// CRYPTOGAMS by <appro@openssl.org>").
.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2