// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// AES-GCM for amd64 built on the AES-NI and CLMUL-NI instruction sets.
// Optimizations follow:
// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
//     Hardware

#include "textflag.h"

// Block registers: up to eight 16-byte blocks are processed in parallel.
#define B0 X0
#define B1 X1
#define B2 X2
#define B3 X3
#define B4 X4
#define B5 X5
#define B6 X6
#define B7 X7

// Karatsuba accumulators: ACC0 collects the low*low products ($0x00),
// ACC1 the high*high products ($0x11) and ACCM the products of the
// (high^low) halves.
#define ACC0 X8
#define ACC1 X9
#define ACCM X10

// Scratch registers and the two constants kept resident in every routine.
#define T0 X11
#define T1 X12
#define T2 X13
#define POLY X14
#define BSWAP X15

// PSHUFB control that reverses the byte order of a 16-byte register.
DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607

// Constant used by the reduction steps and by the H*2 computation.
DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000

// andMask holds 15 entries of 16 bytes; entry n-1 (byte offset 16*(n-1))
// has its n lowest bytes set to 0xff and the rest cleared. It is indexed
// as -16(base)(len*16) to keep only the valid bytes of a partial block.
DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff

GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
GLOBL andMask<>(SB), (NOPTR+RODATA), $240

// reduceRound(a) is one reduction step: it multiplies the low quadword of a
// by the POLY constant, swaps the two quadwords of a and XORs the product in.
// Two consecutive rounds followed by XORing in the high accumulator complete
// a reduction. Clobbers T0.
#define reduceRound(a) \
	MOVOU POLY, T0;\
	PCLMULQDQ $0x01, a, T0;\
	PSHUFD $78, a, a;\
	PXOR T0, a

// foldMiddle(lo, hi, mid) finishes a Karatsuba multiplication: it XORs lo and
// hi into mid, then XORs the lower half of mid into the upper half of lo and
// the upper half of mid into the lower half of hi. Clobbers T0 and mid.
#define foldMiddle(lo, hi, mid) \
	PXOR lo, mid;\
	PXOR hi, mid;\
	MOVOU mid, T0;\
	PSRLDQ $8, mid;\
	PSLLDQ $8, T0;\
	PXOR mid, hi;\
	PXOR T0, lo

// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
//
// Completes the tag held in T: the two byte lengths are converted to bit
// lengths (pLen in the low quadword, dLen in the high quadword), XORed into
// the running hash, multiplied by the last product-table entry and reduced.
// The result is byte-swapped, XORed with tagMask and stored back to T.
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl DI
#define tMsk SI
#define tPtr DX
#define plen AX
#define dlen CX

	MOVQ productTable+0(FP), pTbl
	MOVQ tagMask+8(FP), tMsk
	MOVQ T+16(FP), tPtr
	MOVQ pLen+24(FP), plen
	MOVQ dLen+32(FP), dlen

	MOVOU (tPtr), ACC0
	MOVOU (tMsk), T2

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Bytes -> bits.
	SHLQ $3, plen
	SHLQ $3, dlen

	// Build the length block and fold the running hash into it.
	MOVQ plen, B0
	PINSRQ $1, dlen, B0

	PXOR ACC0, B0

	// Multiply by the table entry at slots 14/15.
	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	foldMiddle(ACC0, ACC1, ACCM)

	reduceRound(ACC0)
	reduceRound(ACC0)

	PXOR ACC1, ACC0

	// Back to memory byte order, apply the mask, write the tag.
	PSHUFB BSWAP, ACC0
	PXOR T2, ACC0
	MOVOU ACC0, (tPtr)

	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

// func gcmAesInit(productTable *[256]byte, ks []uint32)
//
// Derives the hash key by running the AES rounds over the all-zero block and
// fills productTable. Slots 14/15 receive the doubled key and the XOR of its
// two halves; each of the 7 loop iterations multiplies the previous entry by
// the doubled key and stores the product (and the XOR of its halves) two
// slots lower, finishing at slots 0/1.
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define dst DI
#define KS SI
#define NR DX

	MOVQ productTable+0(FP), dst
	MOVQ ks_base+8(FP), KS
	MOVQ ks_len+16(FP), NR

	// NR = len(ks)/4 - 1, compared against 12 below to pick the last round key.
	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Starting from round key 0 is the same as whitening a zero block.
	MOVOU (16*0)(KS), B0
	MOVOU (16*1)(KS), T0
	AESENC T0, B0
	MOVOU (16*2)(KS), T0
	AESENC T0, B0
	MOVOU (16*3)(KS), T0
	AESENC T0, B0
	MOVOU (16*4)(KS), T0
	AESENC T0, B0
	MOVOU (16*5)(KS), T0
	AESENC T0, B0
	MOVOU (16*6)(KS), T0
	AESENC T0, B0
	MOVOU (16*7)(KS), T0
	AESENC T0, B0
	MOVOU (16*8)(KS), T0
	AESENC T0, B0
	MOVOU (16*9)(KS), T0
	AESENC T0, B0
	MOVOU (16*10)(KS), T0
	// The flags from this compare feed both the JB and the later JE;
	// nothing in between modifies them.
	CMPQ NR, $12
	JB initEncLast
	AESENC T0, B0
	MOVOU (16*11)(KS), T0
	AESENC T0, B0
	MOVOU (16*12)(KS), T0
	JE initEncLast
	AESENC T0, B0
	MOVOU (16*13)(KS), T0
	AESENC T0, B0
	MOVOU (16*14)(KS), T0
initEncLast:
	AESENCLAST T0, B0

	PSHUFB BSWAP, B0
	// Double H: shift the 128-bit value left by one bit (per-dword shifts
	// plus carries moved up one dword) and XOR in POLY when the top bit
	// was set.
	PSHUFD $0xff, B0, T0
	MOVOU B0, T1
	PSRAL $31, T0
	PAND POLY, T0
	PSRLL $31, T1
	PSLLDQ $4, T1
	PSLLL $1, B0
	PXOR T0, B0
	PXOR T1, B0
	// Store the key and the XOR of its halves (Karatsuba operand).
	MOVOU B0, (16*14)(dst)
	PSHUFD $78, B0, B1
	PXOR B0, B1
	MOVOU B1, (16*15)(dst)

	MOVOU B0, B2
	MOVOU B1, B3
	// Seven further powers, each written two slots below the previous one.
	MOVQ $7, AX

initLoop:
	MOVOU B2, T0
	MOVOU B2, T1
	MOVOU B3, T2
	PCLMULQDQ $0x00, B0, T0
	PCLMULQDQ $0x11, B0, T1
	PCLMULQDQ $0x00, B1, T2

	PXOR T0, T2
	PXOR T1, T2
	MOVOU T2, B4
	PSLLDQ $8, B4
	PSRLDQ $8, T2
	PXOR B4, T0
	PXOR T2, T1

	MOVOU POLY, B2
	PCLMULQDQ $0x01, T0, B2
	PSHUFD $78, T0, T0
	PXOR B2, T0
	MOVOU POLY, B2
	PCLMULQDQ $0x01, T0, B2
	PSHUFD $78, T0, T0
	PXOR T0, B2
	PXOR T1, B2

	MOVOU B2, (16*12)(dst)
	PSHUFD $78, B2, B3
	PXOR B2, B3
	MOVOU B3, (16*13)(dst)

	// LEAQ leaves the flags from DECQ intact for the JNE.
	DECQ AX
	LEAQ (-16*2)(dst), dst
	JNE initLoop

	RET
#undef NR
#undef KS
#undef dst

// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
//
// Hashes data starting from a zero accumulator and stores the result in T.
// Input is consumed 128 bytes at a time while possible, then 16 bytes at a
// time; a trailing partial block is loaded byte by byte into a zeroed
// register. A 13-byte input takes a dedicated load path.
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl DI
#define aut SI
#define tPtr CX
#define autLen DX

// mulRoundAAD(blk, i) multiplies blk by table entry i (slots 2i and 2i+1) and
// XORs the three partial products into ACC0/ACC1/ACCM. Clobbers blk, T1, T2.
#define mulRoundAAD(blk, i) \
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	PCLMULQDQ $0x00, blk, T1;\
	PXOR T1, ACC0;\
	PCLMULQDQ $0x11, blk, T2;\
	PXOR T2, ACC1;\
	PSHUFD $78, blk, T1;\
	PXOR T1, blk;\
	MOVOU (16*(i*2+1))(pTbl), T1;\
	PCLMULQDQ $0x00, blk, T1;\
	PXOR T1, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ data_base+8(FP), aut
	MOVQ data_len+16(FP), autLen
	MOVQ T+32(FP), tPtr

	PXOR ACC0, ACC0
	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	TESTQ autLen, autLen
	JEQ dataBail

	// 13 bytes: the original author flags this as the TLS case.
	CMPQ autLen, $13
	JE dataTLS
	CMPQ autLen, $128
	JB startSinglesLoop
	JMP dataOctaLoop

dataTLS:
	MOVOU (16*14)(pTbl), T1
	MOVOU (16*15)(pTbl), T2
	PXOR B0, B0
	// 8 + 4 + 1 bytes, so nothing past the 13th byte is read.
	MOVQ (aut), B0
	PINSRD $2, 8(aut), B0
	PINSRB $12, 12(aut), B0
	XORQ autLen, autLen
	JMP dataMul

dataOctaLoop:
	CMPQ autLen, $128
	JB startSinglesLoop
	SUBQ $128, autLen

	MOVOU (16*0)(aut), B0
	MOVOU (16*1)(aut), B1
	MOVOU (16*2)(aut), B2
	MOVOU (16*3)(aut), B3
	MOVOU (16*4)(aut), B4
	MOVOU (16*5)(aut), B5
	MOVOU (16*6)(aut), B6
	MOVOU (16*7)(aut), B7
	LEAQ (16*8)(aut), aut
	PSHUFB BSWAP, B0
	PSHUFB BSWAP, B1
	PSHUFB BSWAP, B2
	PSHUFB BSWAP, B3
	PSHUFB BSWAP, B4
	PSHUFB BSWAP, B5
	PSHUFB BSWAP, B6
	PSHUFB BSWAP, B7
	// The running hash joins the first block of the group.
	PXOR ACC0, B0

	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1
	PSHUFD $78, B0, T1
	PXOR B0, T1
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T1, ACCM

	mulRoundAAD(B1, 1)
	mulRoundAAD(B2, 2)
	mulRoundAAD(B3, 3)
	mulRoundAAD(B4, 4)
	mulRoundAAD(B5, 5)
	mulRoundAAD(B6, 6)
	mulRoundAAD(B7, 7)

	foldMiddle(ACC0, ACC1, ACCM)
	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0
	JMP dataOctaLoop

startSinglesLoop:
	// Table entry 14/15 stays in T1/T2 for the whole single-block loop.
	MOVOU (16*14)(pTbl), T1
	MOVOU (16*15)(pTbl), T2

dataSinglesLoop:

	CMPQ autLen, $16
	JB dataEnd
	SUBQ $16, autLen

	MOVOU (aut), B0
dataMul:
	// Shared by the full-block, 13-byte and partial-block paths. The latter
	// two arrive with autLen == 0, so the loop exits after this pass.
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T1, ACC0
	MOVOU T2, ACCM
	MOVOU T1, ACC1

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	foldMiddle(ACC0, ACC1, ACCM)

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	LEAQ 16(aut), aut

	JMP dataSinglesLoop

dataEnd:

	TESTQ autLen, autLen
	JEQ dataBail

	// Gather the remaining bytes last-to-first so that the first byte ends
	// up in lane 0 and the unused high lanes stay zero.
	PXOR B0, B0
	LEAQ -1(aut)(autLen*1), aut

dataLoadLoop:

	PSLLDQ $1, B0
	PINSRB $0, (aut), B0

	LEAQ -1(aut), aut
	DECQ autLen
	JNE dataLoadLoop

	JMP dataMul

dataBail:
	MOVOU ACC0, (tPtr)
	RET
#undef pTbl
#undef aut
#undef tPtr
#undef autLen

// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
//
// Encrypts src into dst in counter mode and folds the produced ciphertext
// into the hash held in T. The counter block is incremented before its first
// use.
//
// Stack frame (256 bytes):
//   (0..7)*16(SP)   byte-swapped ciphertext blocks waiting to be hashed
//   (8..15)*16(SP)  counter blocks already XORed with round key 0
//
// The register aliases and the aes*/combinedRound/mulRound macros defined
// here stay defined and are reused by gcmAesDec; only increment is redefined.
TEXT ·gcmAesEnc(SB),0,$256-96
#define pTbl DI
#define ctx DX
#define ctrPtr CX
#define ptx SI
#define ks AX
#define tPtr R8
#define ptxLen R9
#define aluCTR R10
#define aluTMP R11
#define aluK R12
#define NR R13

// increment(i) bumps the scalar counter and rewrites the last dword of
// counter slot i with it, XORed with the matching dword of round key 0.
#define increment(i) \
	ADDL $1, aluCTR;\
	MOVL aluCTR, aluTMP;\
	XORL aluK, aluTMP;\
	BSWAPL aluTMP;\
	MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)

// aesRnd(k) applies one AES round with key register k to all eight blocks.
#define aesRnd(k) \
	AESENC k, B0;\
	AESENC k, B1;\
	AESENC k, B2;\
	AESENC k, B3;\
	AESENC k, B4;\
	AESENC k, B5;\
	AESENC k, B6;\
	AESENC k, B7

// aesRound(i) loads round key i into T0 and applies it to all eight blocks.
#define aesRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7

// aesRndLast(k) applies the final AES round with key register k.
#define aesRndLast(k) \
	AESENCLAST k, B0;\
	AESENCLAST k, B1;\
	AESENCLAST k, B2;\
	AESENCLAST k, B3;\
	AESENCLAST k, B4;\
	AESENCLAST k, B5;\
	AESENCLAST k, B6;\
	AESENCLAST k, B7

// combinedRound(i) interleaves AES round i on the eight counter blocks with
// the hash multiplication of spilled ciphertext block i by table entry i.
#define combinedRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	MOVOU (16*i)(SP), T0;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PSHUFD $78, T0, T1;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T1, T0;\
	PXOR T2, ACC1;\
	MOVOU (16*(i*2+1))(pTbl), T2;\
	PCLMULQDQ $0x00, T2, T0;\
	PXOR T0, ACCM

// mulRound(i) is the hash-only counterpart of combinedRound(i).
#define mulRound(i) \
	MOVOU (16*i)(SP), T0;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T2, ACC1;\
	PSHUFD $78, T0, T1;\
	PXOR T1, T0;\
	MOVOU (16*(i*2+1))(pTbl), T1;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ dst+8(FP), ctx
	MOVQ src_base+32(FP), ptx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	// NR = len(ks)/4 - 1.
	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	MOVOU (ctrPtr), B0
	// Scalar copies of the counter word and of the matching key word, both
	// byte-swapped so the counter can be advanced with ADDL.
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	// T0 = counter block ^ round key 0; slot 0 then receives counter+1.
	PXOR B0, T0
	MOVOU T0, (8*16 + 0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesEncSingles
	SUBQ $128, ptxLen

	// At least eight blocks: prepare the remaining seven counter slots.
	MOVOU T0, (8*16 + 1*16)(SP)
	increment(1)
	MOVOU T0, (8*16 + 2*16)(SP)
	increment(2)
	MOVOU T0, (8*16 + 3*16)(SP)
	increment(3)
	MOVOU T0, (8*16 + 4*16)(SP)
	increment(4)
	MOVOU T0, (8*16 + 5*16)(SP)
	increment(5)
	MOVOU T0, (8*16 + 6*16)(SP)
	increment(6)
	MOVOU T0, (8*16 + 7*16)(SP)
	increment(7)

	MOVOU (8*16 + 0*16)(SP), B0
	MOVOU (8*16 + 1*16)(SP), B1
	MOVOU (8*16 + 2*16)(SP), B2
	MOVOU (8*16 + 3*16)(SP), B3
	MOVOU (8*16 + 4*16)(SP), B4
	MOVOU (8*16 + 5*16)(SP), B5
	MOVOU (8*16 + 6*16)(SP), B6
	MOVOU (8*16 + 7*16)(SP), B7

	// First group: nothing to hash yet, so the AES rounds are interleaved
	// only with preparing the counters of the next group.
	aesRound(1)
	increment(0)
	aesRound(2)
	increment(1)
	aesRound(3)
	increment(2)
	aesRound(4)
	increment(3)
	aesRound(5)
	increment(4)
	aesRound(6)
	increment(5)
	aesRound(7)
	increment(6)
	aesRound(8)
	increment(7)
	aesRound(9)
	MOVOU (16*10)(ks), T0
	// One compare drives both branches: below 12 ends at key 10, equal to 12
	// ends at key 12, otherwise key 14.
	CMPQ NR, $12
	JB encLast1
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE encLast1
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
encLast1:
	aesRndLast(T0)

	MOVOU (16*0)(ptx), T0
	PXOR T0, B0
	MOVOU (16*1)(ptx), T0
	PXOR T0, B1
	MOVOU (16*2)(ptx), T0
	PXOR T0, B2
	MOVOU (16*3)(ptx), T0
	PXOR T0, B3
	MOVOU (16*4)(ptx), T0
	PXOR T0, B4
	MOVOU (16*5)(ptx), T0
	PXOR T0, B5
	MOVOU (16*6)(ptx), T0
	PXOR T0, B6
	MOVOU (16*7)(ptx), T0
	PXOR T0, B7

	// Emit the ciphertext, then byte-swap it for hashing. The running hash
	// is folded into the first block of the group.
	MOVOU B0, (16*0)(ctx)
	PSHUFB BSWAP, B0
	PXOR ACC0, B0
	MOVOU B1, (16*1)(ctx)
	PSHUFB BSWAP, B1
	MOVOU B2, (16*2)(ctx)
	PSHUFB BSWAP, B2
	MOVOU B3, (16*3)(ctx)
	PSHUFB BSWAP, B3
	MOVOU B4, (16*4)(ctx)
	PSHUFB BSWAP, B4
	MOVOU B5, (16*5)(ctx)
	PSHUFB BSWAP, B5
	MOVOU B6, (16*6)(ctx)
	PSHUFB BSWAP, B6
	MOVOU B7, (16*7)(ctx)
	PSHUFB BSWAP, B7

	MOVOU B0, (16*0)(SP)
	MOVOU B1, (16*1)(SP)
	MOVOU B2, (16*2)(SP)
	MOVOU B3, (16*3)(SP)
	MOVOU B4, (16*4)(SP)
	MOVOU B5, (16*5)(SP)
	MOVOU B6, (16*6)(SP)
	MOVOU B7, (16*7)(SP)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

gcmAesEncOctetsLoop:

	CMPQ ptxLen, $128
	JB gcmAesEncOctetsEnd
	SUBQ $128, ptxLen

	MOVOU (8*16 + 0*16)(SP), B0
	MOVOU (8*16 + 1*16)(SP), B1
	MOVOU (8*16 + 2*16)(SP), B2
	MOVOU (8*16 + 3*16)(SP), B3
	MOVOU (8*16 + 4*16)(SP), B4
	MOVOU (8*16 + 5*16)(SP), B5
	MOVOU (8*16 + 6*16)(SP), B6
	MOVOU (8*16 + 7*16)(SP), B7

	// Hash the previous group while encrypting this one.
	MOVOU (16*0)(SP), T0
	PSHUFD $78, T0, T1
	PXOR T0, T1

	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, T1, ACCM
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1

	combinedRound(1)
	increment(0)
	combinedRound(2)
	increment(1)
	combinedRound(3)
	increment(2)
	combinedRound(4)
	increment(3)
	combinedRound(5)
	increment(4)
	combinedRound(6)
	increment(5)
	combinedRound(7)
	increment(6)

	aesRound(8)
	increment(7)

	foldMiddle(ACC0, ACC1, ACCM)

	reduceRound(ACC0)
	aesRound(9)

	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast2
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE encLast2
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
encLast2:
	aesRndLast(T0)

	MOVOU (16*0)(ptx), T0
	PXOR T0, B0
	MOVOU (16*1)(ptx), T0
	PXOR T0, B1
	MOVOU (16*2)(ptx), T0
	PXOR T0, B2
	MOVOU (16*3)(ptx), T0
	PXOR T0, B3
	MOVOU (16*4)(ptx), T0
	PXOR T0, B4
	MOVOU (16*5)(ptx), T0
	PXOR T0, B5
	MOVOU (16*6)(ptx), T0
	PXOR T0, B6
	MOVOU (16*7)(ptx), T0
	PXOR T0, B7

	MOVOU B0, (16*0)(ctx)
	PSHUFB BSWAP, B0
	PXOR ACC0, B0
	MOVOU B1, (16*1)(ctx)
	PSHUFB BSWAP, B1
	MOVOU B2, (16*2)(ctx)
	PSHUFB BSWAP, B2
	MOVOU B3, (16*3)(ctx)
	PSHUFB BSWAP, B3
	MOVOU B4, (16*4)(ctx)
	PSHUFB BSWAP, B4
	MOVOU B5, (16*5)(ctx)
	PSHUFB BSWAP, B5
	MOVOU B6, (16*6)(ctx)
	PSHUFB BSWAP, B6
	MOVOU B7, (16*7)(ctx)
	PSHUFB BSWAP, B7

	MOVOU B0, (16*0)(SP)
	MOVOU B1, (16*1)(SP)
	MOVOU B2, (16*2)(SP)
	MOVOU B3, (16*3)(SP)
	MOVOU B4, (16*4)(SP)
	MOVOU B5, (16*5)(SP)
	MOVOU B6, (16*6)(SP)
	MOVOU B7, (16*7)(SP)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

	JMP gcmAesEncOctetsLoop

gcmAesEncOctetsEnd:

	// Hash the last spilled group; there is no encryption to overlap with.
	MOVOU (16*0)(SP), T0
	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1
	PSHUFD $78, T0, T1
	PXOR T0, T1
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1
	PCLMULQDQ $0x00, T1, ACCM

	mulRound(1)
	mulRound(2)
	mulRound(3)
	mulRound(4)
	mulRound(5)
	mulRound(6)
	mulRound(7)

	foldMiddle(ACC0, ACC1, ACCM)

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	// The scalar counter runs seven ahead of slot 0; rewind it so that
	// increment(0) in the single-block path continues from slot 0.
	SUBQ $7, aluCTR

gcmAesEncSingles:

	// Round keys 1..7 live in B1..B7 for the rest of the function.
	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesEncSinglesLoop:

	CMPQ ptxLen, $16
	JB gcmAesEncTail
	SUBQ $16, ptxLen

	MOVOU (8*16 + 0*16)(SP), B0
	increment(0)

	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast3
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE encLast3
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
encLast3:
	AESENCLAST T0, B0

	MOVOU (ptx), T0
	PXOR T0, B0
	MOVOU B0, (ctx)

	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	foldMiddle(ACC0, ACC1, ACCM)

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	LEAQ (16*1)(ptx), ptx
	LEAQ (16*1)(ctx), ctx

	JMP gcmAesEncSinglesLoop

gcmAesEncTail:
	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	// Keystream for the final partial block.
	MOVOU (8*16 + 0*16)(SP), B0
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast4
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE encLast4
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
encLast4:
	AESENCLAST T0, B0
	MOVOU B0, T0

	// Point at the last input byte; the load loop walks backwards.
	LEAQ -1(ptx)(ptxLen*1), ptx

	// T1 = andMask entry for ptxLen bytes. aluCTR is reused as the table
	// base because the counter is no longer needed.
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP

	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	PXOR B0, B0
ptxLoadLoop:
	PSLLDQ $1, B0
	PINSRB $0, (ptx), B0
	LEAQ -1(ptx), ptx
	DECQ ptxLen
	JNE ptxLoadLoop

	PXOR T0, B0
	PAND T1, B0
	// Full 16-byte store although only a partial block is valid: the
	// original author assumes the destination always has room because the
	// tag follows the ciphertext.
	MOVOU B0, (ctx)

	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	foldMiddle(ACC0, ACC1, ACCM)

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

gcmAesEncDone:
	MOVOU ACC0, (tPtr)
	RET
#undef increment

// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
//
// Decrypts src into dst in counter mode and folds the ciphertext (src) into
// the hash held in T. Ciphertext is hashed straight from src, so the
// 128-byte frame only holds the eight counter blocks at (0..7)*16(SP).
// Relies on the register aliases and aes* macros left defined by gcmAesEnc.
TEXT ·gcmAesDec(SB),0,$128-96

// increment(i): same as in gcmAesEnc, but the counter slots start at 0(SP).
#define increment(i) \
	ADDL $1, aluCTR;\
	MOVL aluCTR, aluTMP;\
	XORL aluK, aluTMP;\
	BSWAPL aluTMP;\
	MOVL aluTMP, (3*4 + i*16)(SP)

// combinedDecRound(i) interleaves AES round i on the eight counter blocks
// with the hash multiplication of ciphertext block i (read from ctx and
// byte-swapped) by table entry i.
#define combinedDecRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	MOVOU (16*i)(ctx), T0;\
	PSHUFB BSWAP, T0;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PSHUFD $78, T0, T1;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T1, T0;\
	PXOR T2, ACC1;\
	MOVOU (16*(i*2+1))(pTbl), T2;\
	PCLMULQDQ $0x00, T2, T0;\
	PXOR T0, ACCM

	// Here ptx is the output (dst) and ctx the input (src).
	MOVQ productTable+0(FP), pTbl
	MOVQ dst+8(FP), ptx
	MOVQ src_base+32(FP), ctx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	// NR = len(ks)/4 - 1.
	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	MOVOU (ctrPtr), B0
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	PXOR B0, T0
	MOVOU T0, (0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesDecSingles

	MOVOU T0, (1*16)(SP)
	increment(1)
	MOVOU T0, (2*16)(SP)
	increment(2)
	MOVOU T0, (3*16)(SP)
	increment(3)
	MOVOU T0, (4*16)(SP)
	increment(4)
	MOVOU T0, (5*16)(SP)
	increment(5)
	MOVOU T0, (6*16)(SP)
	increment(6)
	MOVOU T0, (7*16)(SP)
	increment(7)

gcmAesDecOctetsLoop:

	CMPQ ptxLen, $128
	JB gcmAesDecEndOctets
	SUBQ $128, ptxLen

	MOVOU (0*16)(SP), B0
	MOVOU (1*16)(SP), B1
	MOVOU (2*16)(SP), B2
	MOVOU (3*16)(SP), B3
	MOVOU (4*16)(SP), B4
	MOVOU (5*16)(SP), B5
	MOVOU (6*16)(SP), B6
	MOVOU (7*16)(SP), B7

	// First ciphertext block of the group absorbs the running hash.
	MOVOU (16*0)(ctx), T0
	PSHUFB BSWAP, T0
	PXOR ACC0, T0
	PSHUFD $78, T0, T1
	PXOR T0, T1

	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, T1, ACCM
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1

	combinedDecRound(1)
	increment(0)
	combinedDecRound(2)
	increment(1)
	combinedDecRound(3)
	increment(2)
	combinedDecRound(4)
	increment(3)
	combinedDecRound(5)
	increment(4)
	combinedDecRound(6)
	increment(5)
	combinedDecRound(7)
	increment(6)

	aesRound(8)
	increment(7)

	foldMiddle(ACC0, ACC1, ACCM)

	reduceRound(ACC0)
	aesRound(9)

	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast1
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE decLast1
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
decLast1:
	aesRndLast(T0)

	MOVOU (16*0)(ctx), T0
	PXOR T0, B0
	MOVOU (16*1)(ctx), T0
	PXOR T0, B1
	MOVOU (16*2)(ctx), T0
	PXOR T0, B2
	MOVOU (16*3)(ctx), T0
	PXOR T0, B3
	MOVOU (16*4)(ctx), T0
	PXOR T0, B4
	MOVOU (16*5)(ctx), T0
	PXOR T0, B5
	MOVOU (16*6)(ctx), T0
	PXOR T0, B6
	MOVOU (16*7)(ctx), T0
	PXOR T0, B7

	MOVOU B0, (16*0)(ptx)
	MOVOU B1, (16*1)(ptx)
	MOVOU B2, (16*2)(ptx)
	MOVOU B3, (16*3)(ptx)
	MOVOU B4, (16*4)(ptx)
	MOVOU B5, (16*5)(ptx)
	MOVOU B6, (16*6)(ptx)
	MOVOU B7, (16*7)(ptx)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

	JMP gcmAesDecOctetsLoop

gcmAesDecEndOctets:

	// Rewind the scalar counter to match slot 0 (see gcmAesEnc).
	SUBQ $7, aluCTR

gcmAesDecSingles:

	// Round keys 1..7 live in B1..B7 for the rest of the function.
	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesDecSinglesLoop:

	CMPQ ptxLen, $16
	JB gcmAesDecTail
	SUBQ $16, ptxLen

	// Keep the raw ciphertext in T1 for the final XOR; hash the swapped copy.
	MOVOU (ctx), B0
	MOVOU B0, T1
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	foldMiddle(ACC0, ACC1, ACCM)

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (0*16)(SP), B0
	increment(0)
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast2
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE decLast2
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
decLast2:
	AESENCLAST T0, B0

	PXOR T1, B0
	MOVOU B0, (ptx)

	LEAQ (16*1)(ptx), ptx
	LEAQ (16*1)(ctx), ctx

	JMP gcmAesDecSinglesLoop

gcmAesDecTail:

	TESTQ ptxLen, ptxLen
	JE gcmAesDecDone

	// T1 = andMask entry for ptxLen bytes (aluCTR reused as table base).
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP
	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	// Full 16-byte load although only a partial block is valid: the original
	// author assumes the tag follows the ciphertext, so the read stays in
	// bounds.
	MOVOU (ctx), B0
	PAND T1, B0

	MOVOU B0, T1
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	foldMiddle(ACC0, ACC1, ACCM)

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (0*16)(SP), B0
	increment(0)
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast3
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE decLast3
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
decLast3:
	AESENCLAST T0, B0
	PXOR T1, B0

	// Write exactly ptxLen plaintext bytes, one at a time.
ptxStoreLoop:
	PEXTRB $0, B0, (ptx)
	PSRLDQ $1, B0
	LEAQ 1(ptx), ptx
	DECQ ptxLen

	JNE ptxStoreLoop

gcmAesDecDone:

	MOVOU ACC0, (tPtr)
	RET