// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26
#define K8 V27
#define K9 V28
#define K10 V29
#define K11 V30
#define KLAST V31

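// reduce() folds the 256-bit Karatsuba product spread across ACC0, ACC1 and
// ACCM down to a single 128-bit value in ACC0. The first six instructions
// split the middle limb ACCM and fold its halves into ACC0 and ACC1; the two
// VPMULL-by-POLY rounds then reduce the result modulo the GCM polynomial
// x^128 + x^7 + x^2 + x + 1, whose bit-reflected representation (0xC2 in the
// top byte) is kept in POLY.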
#define reduce() \
	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR	ACC1.B16, ACC0.B16, ACC0.B16     \

// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl R0
#define tMsk R1
#define tPtr R2
#define plen R3
#define dlen R4

	MOVD	$0xC2, R1
	LSL	$56, R1
	MOVD	$1, R0
	VMOV	R1, POLY.D[0]
	VMOV	R0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	tagMask+8(FP), tMsk
	MOVD	T+16(FP), tPtr
	MOVD	pLen+24(FP), plen
	MOVD	dLen+32(FP), dlen

	VLD1	(tPtr), [ACC0.B16]
	VLD1	(tMsk), [B1.B16]

	LSL	$3, plen
	LSL	$3, dlen

	VMOV	dlen, B0.D[0]
	VMOV	plen, B0.D[1]

	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	VREV64	ACC0.B16, ACC0.B16
	VEOR	B1.B16, ACC0.B16, ACC0.B16

	VST1	[ACC0.B16], (tPtr)
	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

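// gcmAesInit encrypts the all-zero block to obtain the hash key H, doubles
// it in GF(2^128), and fills productTable with the powers H^1 through H^8.
// Each power occupies 32 bytes: the power itself followed by its Karatsuba
// pre-XOR (its two halves XORed together). H^1 is stored at offset 14*16 and
// each higher power 32 bytes below it, so H^8 ends up at offset 0 and the
// eight-block loops can walk the table forward.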
// func gcmAesInit(productTable *[256]byte, ks []uint32)
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define pTbl R0
#define KS R1
#define NR R2
#define I R3
	MOVD	productTable+0(FP), pTbl
	MOVD	ks_base+8(FP), KS
	MOVD	ks_len+16(FP), NR

	MOVD	$0xC2, I
	LSL	$56, I
	VMOV	I, POLY.D[0]
	MOVD	$1, I
	VMOV	I, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	// Encrypt block 0 with the AES key to generate the hash key H
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	VEOR	B0.B16, B0.B16, B0.B16
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	TBZ	$4, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	TBZ	$3, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
initEncFinish:
	VLD1	(KS), [T0.B16, T1.B16, T2.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16

	VREV64	B0.B16, B0.B16

	// Multiply by 2 modulo P
	VMOV	B0.D[0], I
	ASR	$63, I
	VMOV	I, T1.D[0]
	VMOV	I, T1.D[1]
	VAND	POLY.B16, T1.B16, T1.B16
	VUSHR	$63, B0.D2, T2.D2
	VEXT	$8, ZERO.B16, T2.B16, T2.B16
	VSHL	$1, B0.D2, B0.D2
	VEOR	T1.B16, B0.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available

	// Karatsuba pre-computation
	VEXT	$8, B0.B16, B0.B16, B1.B16
	VEOR	B0.B16, B1.B16, B1.B16

	ADD	$14*16, pTbl
	VST1	[B0.B16, B1.B16], (pTbl)
	SUB	$2*16, pTbl

	VMOV	B0.B16, B2.B16
	VMOV	B1.B16, B3.B16

	MOVD	$7, I

initLoop:
	// Compute powers of H
	SUBS	$1, I

	VPMULL	B0.D1, B2.D1, T1.Q1
	VPMULL2	B0.D2, B2.D2, T0.Q1
	VPMULL	B1.D1, B3.D1, T2.Q1
	VEOR	T0.B16, T2.B16, T2.B16
	VEOR	T1.B16, T2.B16, T2.B16
	VEXT	$8, ZERO.B16, T2.B16, T3.B16
	VEXT	$8, T2.B16, ZERO.B16, T2.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T3.B16, T1.B16, T1.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T1.B16, T0.B16, B2.B16
	VMOV	B2.B16, B3.B16
	VEXT	$8, B2.B16, B2.B16, B2.B16
	VEOR	B2.B16, B3.B16, B3.B16

	VST1	[B2.B16, B3.B16], (pTbl)
	SUB	$2*16, pTbl

	BNE	initLoop
	RET
#undef I
#undef NR
#undef KS
#undef pTbl

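// gcmAesData folds the additional authenticated data into the running tag.
// It has a fast path for 13-byte AAD (the length TLS uses for record
// headers), a wide loop that hashes eight blocks per iteration using all
// eight precomputed powers of H with a single reduction at the end, and a
// one-block loop plus a byte-by-byte load for any trailing partial block.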
// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl R0
#define aut R1
#define tPtr R2
#define autLen R3
#define H0 R4
#define pTblSave R5

#define mulRound(X) \
	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
	VREV64	X.B16, X.B16               \
	VEXT	$8, X.B16, X.B16, T0.B16   \
	VEOR	X.B16, T0.B16, T0.B16      \
	VPMULL	X.D1, T1.D1, T3.Q1         \
	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2	X.D2, T1.D2, T3.Q1         \
	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL	T0.D1, T2.D1, T3.Q1        \
	VEOR	T3.B16, ACCM.B16, ACCM.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	data_base+8(FP), aut
	MOVD	data_len+16(FP), autLen
	MOVD	T+32(FP), tPtr

	VEOR	ACC0.B16, ACC0.B16, ACC0.B16
	CBZ	autLen, dataBail

	MOVD	$0xC2, H0
	LSL	$56, H0
	VMOV	H0, POLY.D[0]
	MOVD	$1, H0
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	MOVD	pTbl, pTblSave

	CMP	$13, autLen
	BEQ	dataTLS
	CMP	$128, autLen
	BLT	startSinglesLoop
	B	octetsLoop

dataTLS:
	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	VEOR	B0.B16, B0.B16, B0.B16

	MOVD	(aut), H0
	VMOV	H0, B0.D[0]
	MOVW	8(aut), H0
	VMOV	H0, B0.S[2]
	MOVB	12(aut), H0
	VMOV	H0, B0.B[12]

	MOVD	$0, autLen
	B	dataMul

octetsLoop:
	CMP	$128, autLen
	BLT	startSinglesLoop
	SUB	$128, autLen

	VLD1.P	32(aut), [B0.B16, B1.B16]

	VLD1.P	32(pTbl), [T1.B16, T2.B16]
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	mulRound(B1)
	VLD1.P	32(aut), [B2.B16, B3.B16]
	mulRound(B2)
	mulRound(B3)
	VLD1.P	32(aut), [B4.B16, B5.B16]
	mulRound(B4)
	mulRound(B5)
	VLD1.P	32(aut), [B6.B16, B7.B16]
	mulRound(B6)
	mulRound(B7)

	MOVD	pTblSave, pTbl
	reduce()
	B	octetsLoop

startSinglesLoop:

	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:

	CMP	$16, autLen
	BLT	dataEnd
	SUB	$16, autLen

	VLD1.P	16(aut), [B0.B16]
dataMul:
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	B	singlesLoop

dataEnd:

	CBZ	autLen, dataBail
	VEOR	B0.B16, B0.B16, B0.B16
	ADD	autLen, aut

dataLoadLoop:
	MOVB.W	-1(aut), H0
	VEXT	$15, B0.B16, ZERO.B16, B0.B16
	VMOV	H0, B0.B[0]
	SUBS	$1, autLen
	BNE	dataLoadLoop
	B	dataMul

dataBail:
	VST1	[ACC0.B16], (tPtr)
	RET

#undef pTbl
#undef aut
#undef tPtr
#undef autLen
#undef H0
#undef pTblSave

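// gcmAesEnc interleaves CTR-mode AES with GHASH over the produced
// ciphertext. NR is len(ks): 44, 52 or 60 uint32s for AES-128/192/256, so
// TBZ $4 singles out AES-128 (bit 4 clear in 44) and TBZ $3 then separates
// AES-192 (bit 3 clear in 52) from AES-256. Most round keys stay resident in
// K0..K11 and KLAST; AES-256 needs fifteen, so in the eight-block loop two
// of them remain in memory and are reloaded through curK every iteration.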
// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define ks R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define aluK R9
#define NR R10
#define H0 R11
#define H1 R12
#define curK R13
#define pTblSave R14

#define aesrndx8(K) \
	AESE	K.B16, B0.B16  \
	AESMC	B0.B16, B0.B16 \
	AESE	K.B16, B1.B16  \
	AESMC	B1.B16, B1.B16 \
	AESE	K.B16, B2.B16  \
	AESMC	B2.B16, B2.B16 \
	AESE	K.B16, B3.B16  \
	AESMC	B3.B16, B3.B16 \
	AESE	K.B16, B4.B16  \
	AESMC	B4.B16, B4.B16 \
	AESE	K.B16, B5.B16  \
	AESMC	B5.B16, B5.B16 \
	AESE	K.B16, B6.B16  \
	AESMC	B6.B16, B6.B16 \
	AESE	K.B16, B7.B16  \
	AESMC	B7.B16, B7.B16

#define aesrndlastx8(K) \
	AESE	K.B16, B0.B16 \
	AESE	K.B16, B1.B16 \
	AESE	K.B16, B2.B16 \
	AESE	K.B16, B3.B16 \
	AESE	K.B16, B4.B16 \
	AESE	K.B16, B5.B16 \
	AESE	K.B16, B6.B16 \
	AESE	K.B16, B7.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// NR, set from len(ks) above, is 44, 52 or 60 for AES-128/192/256
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4
	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen

	MOVD	ks, H0
	// For AES-128 round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	BLT	startSingles
	// There are at least 8 blocks to encrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK

octetsLoop:
	SUB	$128, srcPtrLen

	VMOV	CTR.B16, B0.B16
	VADD	B0.S4, INC.S4, B1.S4
	VREV32	B0.B16, B0.B16
	VADD	B1.S4, INC.S4, B2.S4
	VREV32	B1.B16, B1.B16
	VADD	B2.S4, INC.S4, B3.S4
	VREV32	B2.B16, B2.B16
	VADD	B3.S4, INC.S4, B4.S4
	VREV32	B3.B16, B3.B16
	VADD	B4.S4, INC.S4, B5.S4
	VREV32	B4.B16, B4.B16
	VADD	B5.S4, INC.S4, B6.S4
	VREV32	B5.B16, B5.B16
	VADD	B6.S4, INC.S4, B7.S4
	VREV32	B6.B16, B6.B16
	VADD	B7.S4, INC.S4, CTR.S4
	VREV32	B7.B16, B7.B16

	aesrndx8(K0)
	aesrndx8(K1)
	aesrndx8(K2)
	aesrndx8(K3)
	aesrndx8(K4)
	aesrndx8(K5)
	aesrndx8(K6)
	aesrndx8(K7)
	TBZ	$4, NR, octetsFinish
	aesrndx8(K10)
	aesrndx8(K11)
	TBZ	$3, NR, octetsFinish
	VLD1.P	32(curK), [T1.B16, T2.B16]
	aesrndx8(T1)
	aesrndx8(T2)
	MOVD	H0, curK
octetsFinish:
	aesrndx8(K8)
	aesrndlastx8(K9)

	VEOR	KLAST.B16, B0.B16, B0.B16
	VEOR	KLAST.B16, B1.B16, B1.B16
	VEOR	KLAST.B16, B2.B16, B2.B16
	VEOR	KLAST.B16, B3.B16, B3.B16
	VEOR	KLAST.B16, B4.B16, B4.B16
	VEOR	KLAST.B16, B5.B16, B5.B16
	VEOR	KLAST.B16, B6.B16, B6.B16
	VEOR	KLAST.B16, B7.B16, B7.B16

	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B0.B16, T1.B16, B0.B16
	VEOR	B1.B16, T2.B16, B1.B16
	VST1.P	[B0.B16, B1.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B2.B16, T1.B16, B2.B16
	VEOR	B3.B16, T2.B16, B3.B16
	VST1.P	[B2.B16, B3.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B4.B16, T1.B16, B4.B16
	VEOR	B5.B16, T2.B16, B5.B16
	VST1.P	[B4.B16, B5.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B6.B16, T1.B16, B6.B16
	VEOR	B7.B16, T2.B16, B7.B16
	VST1.P	[B6.B16, B7.B16], 32(dstPtr)

	VLD1.P	32(pTbl), [T1.B16, T2.B16]
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	mulRound(B1)
	mulRound(B2)
	mulRound(B3)
	mulRound(B4)
	mulRound(B5)
	mulRound(B6)
	mulRound(B7)
	MOVD	pTblSave, pTbl
	reduce()

	CMP	$128, srcPtrLen
	BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16

singlesLoop:
	CMP	$16, srcPtrLen
	BLT	tail
	SUB	$16, srcPtrLen

	VLD1.P	16(srcPtr), [T0.B16]
	VEOR	KLAST.B16, T0.B16, T0.B16

	VREV32	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, singlesLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, singlesLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16
singlesLast:
	VEOR	T0.B16, B0.B16, B0.B16
encReduce:
	VST1.P	[B0.B16], 16(dstPtr)

	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VEOR	T0.B16, T0.B16, T0.B16
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1
	ADD	srcPtrLen, srcPtr

	TBZ	$3, srcPtrLen, ld4
	MOVD.W	-8(srcPtr), H0
	VMOV	H0, T0.D[0]
	VMOV	H1, T3.D[0]
ld4:
	TBZ	$2, srcPtrLen, ld2
	MOVW.W	-4(srcPtr), H0
	VEXT	$12, T0.B16, ZERO.B16, T0.B16
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.S[0]
	VMOV	H1, T3.S[0]
ld2:
	TBZ	$1, srcPtrLen, ld1
	MOVH.W	-2(srcPtr), H0
	VEXT	$14, T0.B16, ZERO.B16, T0.B16
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.H[0]
	VMOV	H1, T3.H[0]
ld1:
	TBZ	$0, srcPtrLen, ld0
	MOVB.W	-1(srcPtr), H0
	VEXT	$15, T0.B16, ZERO.B16, T0.B16
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.B[0]
	VMOV	H1, T3.B[0]
ld0:

	MOVD	ZR, srcPtrLen
	VEOR	KLAST.B16, T0.B16, T0.B16
	VREV32	CTR.B16, B0.B16

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16

tailLast:
	VEOR	T0.B16, B0.B16, B0.B16
	VAND	T3.B16, B0.B16, B0.B16
	B	encReduce

done:
	VST1	[ACC0.B16], (tPtr)
	RET

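// gcmAesDec mirrors gcmAesEnc, reusing its register aliases and macros, but
// GHASH runs over the ciphertext rather than the plaintext: each loaded
// source block is hashed as-is while its XOR with the keystream is written
// out to dstPtr.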
// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesDec(SB),NOSPLIT,$0
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// NR, set from len(ks) above, is 44, 52 or 60 for AES-128/192/256
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	MOVD	ks, H0
	// For AES-128 round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen
	BLT	startSingles
	// There are at least 8 blocks to decrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK

octetsLoop:
	SUB	$128, srcPtrLen

	VMOV	CTR.B16, B0.B16
	VADD	B0.S4, INC.S4, B1.S4
	VREV32	B0.B16, B0.B16
	VADD	B1.S4, INC.S4, B2.S4
	VREV32	B1.B16, B1.B16
	VADD	B2.S4, INC.S4, B3.S4
	VREV32	B2.B16, B2.B16
	VADD	B3.S4, INC.S4, B4.S4
	VREV32	B3.B16, B3.B16
	VADD	B4.S4, INC.S4, B5.S4
	VREV32	B4.B16, B4.B16
	VADD	B5.S4, INC.S4, B6.S4
	VREV32	B5.B16, B5.B16
	VADD	B6.S4, INC.S4, B7.S4
	VREV32	B6.B16, B6.B16
	VADD	B7.S4, INC.S4, CTR.S4
	VREV32	B7.B16, B7.B16

	aesrndx8(K0)
	aesrndx8(K1)
	aesrndx8(K2)
	aesrndx8(K3)
	aesrndx8(K4)
	aesrndx8(K5)
	aesrndx8(K6)
	aesrndx8(K7)
	TBZ	$4, NR, octetsFinish
	aesrndx8(K10)
	aesrndx8(K11)
	TBZ	$3, NR, octetsFinish
	VLD1.P	32(curK), [T1.B16, T2.B16]
	aesrndx8(T1)
	aesrndx8(T2)
	MOVD	H0, curK
octetsFinish:
	aesrndx8(K8)
	aesrndlastx8(K9)

	VEOR	KLAST.B16, B0.B16, T1.B16
	VEOR	KLAST.B16, B1.B16, T2.B16
	VEOR	KLAST.B16, B2.B16, B2.B16
	VEOR	KLAST.B16, B3.B16, B3.B16
	VEOR	KLAST.B16, B4.B16, B4.B16
	VEOR	KLAST.B16, B5.B16, B5.B16
	VEOR	KLAST.B16, B6.B16, B6.B16
	VEOR	KLAST.B16, B7.B16, B7.B16

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B0.B16, T1.B16, T1.B16
	VEOR	B1.B16, T2.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)

	VLD1.P	32(pTbl), [T1.B16, T2.B16]
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B2.B16, B0.B16, T1.B16
	VEOR	B3.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B4.B16, B0.B16, T1.B16
	VEOR	B5.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B6.B16, B0.B16, T1.B16
	VEOR	B7.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	MOVD	pTblSave, pTbl
	reduce()

	CMP	$128, srcPtrLen
	BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16

singlesLoop:
	CMP	$16, srcPtrLen
	BLT	tail
	SUB	$16, srcPtrLen

	VLD1.P	16(srcPtr), [T0.B16]
	VREV64	T0.B16, B5.B16
	VEOR	KLAST.B16, T0.B16, T0.B16

	VREV32	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, singlesLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, singlesLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16
singlesLast:
	VEOR	T0.B16, B0.B16, B0.B16

	VST1.P	[B0.B16], 16(dstPtr)

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()

	B	singlesLoop
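	// Handle the final partial block. The last keystream block is generated
	// as usual, the remaining ciphertext is fetched with a full 16-byte load
	// (see the note at the load below), decrypted and stored in pieces, and
	// the loaded block is masked through T3 before hashing so that bytes
	// beyond the message length never affect the tag.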
tail:
	CBZ	srcPtrLen, done

	VREV32	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16
tailLast:
	VEOR	KLAST.B16, B0.B16, B0.B16

	// Assuming it is safe to load past srcPtr due to the presence of the tag
	VLD1	(srcPtr), [B5.B16]

	VEOR	B5.B16, B0.B16, B0.B16

	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1

	TBZ	$3, srcPtrLen, ld4
	VMOV	B0.D[0], H0
	MOVD.P	H0, 8(dstPtr)
	VMOV	H1, T3.D[0]
	VEXT	$8, ZERO.B16, B0.B16, B0.B16
ld4:
	TBZ	$2, srcPtrLen, ld2
	VMOV	B0.S[0], H0
	MOVW.P	H0, 4(dstPtr)
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.S[0]
	VEXT	$4, ZERO.B16, B0.B16, B0.B16
ld2:
	TBZ	$1, srcPtrLen, ld1
	VMOV	B0.H[0], H0
	MOVH.P	H0, 2(dstPtr)
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.H[0]
	VEXT	$2, ZERO.B16, B0.B16, B0.B16
ld1:
	TBZ	$0, srcPtrLen, ld0
	VMOV	B0.B[0], H0
	MOVB.P	H0, 1(dstPtr)
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.B[0]
ld0:

	VAND	T3.B16, B5.B16, B5.B16
	VREV64	B5.B16, B5.B16

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()
done:
	VST1	[ACC0.B16], (tPtr)

	RET