// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.

//go:build gc && !purego
// +build gc,!purego

#include "textflag.h"
// General register allocation
#define oup DI
#define inp SI
#define inl BX
#define adp CX  // free to reuse, after we hash the additional data
#define keyp R8 // free to reuse, when we copy the key to stack
#define itr2 R9 // general iterator
#define itr1 CX // general iterator
#define acc0 R10
#define acc1 R11
#define acc2 R12
#define t0 R13
#define t1 R14
#define t2 R15
#define t3 R8
// Register and stack allocation for the SSE code
#define rStore (0*16)(BP)
#define sStore (1*16)(BP)
#define state1Store (2*16)(BP)
#define state2Store (3*16)(BP)
#define tmpStore (4*16)(BP)
#define ctr0Store (5*16)(BP)
#define ctr1Store (6*16)(BP)
#define ctr2Store (7*16)(BP)
#define ctr3Store (8*16)(BP)
#define A0 X0
#define A1 X1
#define A2 X2
#define B0 X3
#define B1 X4
#define B2 X5
#define C0 X6
#define C1 X7
#define C2 X8
#define D0 X9
#define D1 X10
#define D2 X11
#define T0 X12
#define T1 X13
#define T2 X14
#define T3 X15
#define A3 T0
#define B3 T1
#define C3 T2
#define D3 T3
// Register and stack allocation for the AVX2 code
#define rsStoreAVX2 (0*32)(BP)
#define state1StoreAVX2 (1*32)(BP)
#define state2StoreAVX2 (2*32)(BP)
#define ctr0StoreAVX2 (3*32)(BP)
#define ctr1StoreAVX2 (4*32)(BP)
#define ctr2StoreAVX2 (5*32)(BP)
#define ctr3StoreAVX2 (6*32)(BP)
#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
#define AA0 Y0
#define AA1 Y5
#define AA2 Y6
#define AA3 Y7
#define BB0 Y14
#define BB1 Y9
#define BB2 Y10
#define BB3 Y11
#define CC0 Y12
#define CC1 Y13
#define CC2 Y8
#define CC3 Y15
#define DD0 Y4
#define DD1 Y1
#define DD2 Y2
#define DD3 Y3
#define TT0 DD3
#define TT1 AA3
#define TT2 BB3
#define TT3 CC3
// ChaCha20 constants
DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
// <<< 16 with PSHUFB
DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
// <<< 8 with PSHUFB
DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B

DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
DATA ·avx2InitMask<>+0x18(SB)/8, $0x0

DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
// Poly1305 key clamp
DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
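// Note: the clamp mask implements the RFC 8439 requirement on the Poly1305
// "r" value; roughly, in Go (a sketch, not part of this file):
//
//	rLo &= 0x0FFFFFFC0FFFFFFF
//	rHi &= 0x0FFFFFFC0FFFFFFC
//
// The upper 16 bytes of the mask are all-ones so that ANDing a full 32-byte
// r|s key leaves the "s" half untouched.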

DATA ·sseIncMask<>+0x00(SB)/8, $0x1
DATA ·sseIncMask<>+0x08(SB)/8, $0x0
// To load/store the last < 16 bytes in a buffer
DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff

GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
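// Note: andMask is indexed by the number of remaining bytes n (1 <= n <= 15):
// the 16-byte mask at offset 16*(n-1) keeps the low n bytes and zeroes the
// rest, so a full 16-byte load followed by PAND yields the partial block,
// already zero-padded for Poly1305.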
// No PALIGNR in Go ASM yet (but VPALIGNR is present).
#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
#define shiftC0Right shiftC0Left
#define shiftC1Right shiftC1Left
#define shiftC2Right shiftC2Left
#define shiftC3Right shiftC3Left
#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
// Some macros
#define chachaQR(A, B, C, D, T) \
	PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D                            \
	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B  \
	PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D                             \
	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B

#define chachaQR_AVX2(A, B, C, D, T) \
	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D                         \
	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B  \
	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D                          \
	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
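// Both macros implement the standard ChaCha20 quarter round; in Go terms
// (a sketch, not part of this file):
//
//	a += b; d ^= a; d = bits.RotateLeft32(d, 16)
//	c += d; b ^= c; b = bits.RotateLeft32(b, 12)
//	a += b; d ^= a; d = bits.RotateLeft32(d, 8)
//	c += d; b ^= c; b = bits.RotateLeft32(b, 7)
//
// The rotations by 16 and 8 use PSHUFB with the rol16/rol8 tables; the
// rotations by 12 and 7 use a shift pair (PSLLL/PSRLL or VPSLLD/VPSRLD).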

#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2

#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3

#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
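// Note: the Poly1305 accumulator lives in acc0:acc1:acc2 (about 130 bits,
// acc2 holding the top bits) and the clamped r is at rStore. The stages
// above compute acc = (acc * r) mod 2^130-5 with schoolbook 64x64->128
// multiplies; polyMulReduceStage folds the product bits above 2^130 back in
// using 2^130 = 5 (mod 2^130-5): with q = h >> 130, the reduced value is
// (h mod 2^130) + 4*q + q.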
// ----------------------------------------------------------------------------
TEXT polyHashADInternal<>(SB), NOSPLIT, $0
	// adp points to beginning of additional data
	// itr2 holds ad length
	XORQ acc0, acc0
	XORQ acc1, acc1
	XORQ acc2, acc2
	CMPQ itr2, $13
	JNE hashADLoop

openFastTLSAD:
	// Special treatment for the TLS case of 13 bytes
	MOVQ (adp), acc0
	MOVQ 5(adp), acc1
	SHRQ $24, acc1
	MOVQ $1, acc2
	polyMul
	RET

hashADLoop:
	// Hash in 16 byte chunks
	CMPQ itr2, $16
	JB hashADTail
	polyAdd(0(adp))
	LEAQ (1*16)(adp), adp
	SUBQ $16, itr2
	polyMul
	JMP hashADLoop

hashADTail:
	CMPQ itr2, $0
	JE hashADDone

	// Hash last < 16 byte tail
	XORQ t0, t0
	XORQ t1, t1
	XORQ t2, t2
	ADDQ itr2, adp

hashADTailLoop:
	SHLQ $8, t0, t1
	SHLQ $8, t0
	MOVB -1(adp), t2
	XORQ t2, t0
	DECQ adp
	DECQ itr2
	JNE hashADTailLoop

hashADTailFinish:
	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
	polyMul

	// Finished AD
hashADDone:
	RET

// ----------------------------------------------------------------------------
// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
TEXT ·chacha20Poly1305Open(SB), 0, $288-97
	// For aligned stack access
	MOVQ SP, BP
	ADDQ $32, BP
	ANDQ $-32, BP
	MOVQ dst+0(FP), oup
	MOVQ key+24(FP), keyp
	MOVQ src+48(FP), inp
	MOVQ src_len+56(FP), inl
	MOVQ ad+72(FP), adp

	// Check for AVX2 support
	CMPB ·useAVX2(SB), $1
	JE chacha20Poly1305Open_AVX2

	// Special optimization, for very short buffers
	CMPQ inl, $128
	JBE openSSE128 // About 16% faster

	// For long buffers, prepare the poly key first
	MOVOU ·chacha20Constants<>(SB), A0
	MOVOU (1*16)(keyp), B0
	MOVOU (2*16)(keyp), C0
	MOVOU (3*16)(keyp), D0
	MOVO D0, T1

	// Store state on stack for future use
	MOVO B0, state1Store
	MOVO C0, state2Store
	MOVO D0, ctr3Store
	MOVQ $10, itr2

openSSEPreparePolyKey:
	chachaQR(A0, B0, C0, D0, T0)
	shiftB0Left; shiftC0Left; shiftD0Left
	chachaQR(A0, B0, C0, D0, T0)
	shiftB0Right; shiftC0Right; shiftD0Right
	DECQ itr2
	JNE openSSEPreparePolyKey

	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0

	// Clamp and store the key
	PAND ·polyClampMask<>(SB), A0
	MOVO A0, rStore; MOVO B0, sStore

	// Hash AAD
	MOVQ ad_len+80(FP), itr2
	CALL polyHashADInternal<>(SB)

openSSEMainLoop:
	CMPQ inl, $256
	JB openSSEMainLoopDone

	// Load state, increment counter blocks
	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3

	// Store counters
	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store

	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
	MOVQ $4, itr1
	MOVQ inp, itr2

openSSEInternalLoop:
	MOVO C3, tmpStore
	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
	MOVO tmpStore, C3
	MOVO C1, tmpStore
	chachaQR(A3, B3, C3, D3, C1)
	MOVO tmpStore, C1
	polyAdd(0(itr2))
	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
	polyMulStage1
	polyMulStage2
	LEAQ (2*8)(itr2), itr2
	MOVO C3, tmpStore
	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
	MOVO tmpStore, C3
	MOVO C1, tmpStore
	polyMulStage3
	chachaQR(A3, B3, C3, D3, C1)
	MOVO tmpStore, C1
	polyMulReduceStage
	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
	DECQ itr1
	JGE openSSEInternalLoop

	polyAdd(0(itr2))
	polyMul
	LEAQ (2*8)(itr2), itr2

	CMPQ itr1, $-6
	JG openSSEInternalLoop

	// Add in the state
	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3

	// Load - xor - store
	MOVO D3, tmpStore
	MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
	MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
	MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
	MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
	MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
	MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
	MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
	MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
	MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
	MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
	MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
	MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
	MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
	MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
	MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
	MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
	LEAQ 256(inp), inp
	LEAQ 256(oup), oup
	SUBQ $256, inl
	JMP openSSEMainLoop

openSSEMainLoopDone:
	// Handle the various tail sizes efficiently
	TESTQ inl, inl
	JE openSSEFinalize
	CMPQ inl, $64
	JBE openSSETail64
	CMPQ inl, $128
	JBE openSSETail128
	CMPQ inl, $192
	JBE openSSETail192
	JMP openSSETail256

openSSEFinalize:
	// Hash in the PT, AAD lengths
	ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
	polyMul

	// Final reduce
	MOVQ acc0, t0
	MOVQ acc1, t1
	MOVQ acc2, t2
	SUBQ $-5, acc0
	SBBQ $-1, acc1
	SBBQ $3, acc2
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1
	CMOVQCS t2, acc2
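	// (The immediates -5, -1 and 3 are the three 64-bit limbs of 2^130-5,
	// so the SUBQ/SBBQ chain computed acc - (2^130-5); the CMOVs keep the
	// old value when that subtraction borrowed, i.e. when acc < 2^130-5.)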

	// Add in the "s" part of the key
	ADDQ 0+sStore, acc0
	ADCQ 8+sStore, acc1

	// Finally, constant time compare to the tag at the end of the message
	XORQ AX, AX
	MOVQ $1, DX
	XORQ (0*8)(inp), acc0
	XORQ (1*8)(inp), acc1
	ORQ acc1, acc0
	CMOVQEQ DX, AX

	// Return true iff tags are equal
	MOVB AX, ret+96(FP)
	RET

// ----------------------------------------------------------------------------
// Special optimization for buffers smaller than 129 bytes
openSSE128:
	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
	MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
	MOVQ $10, itr2

openSSE128InnerCipherLoop:
	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
	shiftB0Left; shiftB1Left; shiftB2Left
	shiftC0Left; shiftC1Left; shiftC2Left
	shiftD0Left; shiftD1Left; shiftD2Left
	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
	shiftB0Right; shiftB1Right; shiftB2Right
	shiftC0Right; shiftC1Right; shiftC2Right
	shiftD0Right; shiftD1Right; shiftD2Right
	DECQ itr2
	JNE openSSE128InnerCipherLoop

	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
	PADDL T2, C1; PADDL T2, C2
	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2

	// Clamp and store the key
	PAND ·polyClampMask<>(SB), A0
	MOVOU A0, rStore; MOVOU B0, sStore

	// Hash
	MOVQ ad_len+80(FP), itr2
	CALL polyHashADInternal<>(SB)

openSSE128Open:
	CMPQ inl, $16
	JB openSSETail16
	SUBQ $16, inl

	// Load for hashing
	polyAdd(0(inp))

	// Load for decryption
	MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
	LEAQ (1*16)(inp), inp
	LEAQ (1*16)(oup), oup
	polyMul

	// Shift the stream "left"
	MOVO B1, A1
	MOVO C1, B1
	MOVO D1, C1
	MOVO A2, D1
	MOVO B2, A2
	MOVO C2, B2
	MOVO D2, C2
	JMP openSSE128Open

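// openSSETail16 finishes the last 1..15 bytes. Reading a full 16 bytes is
// safe because the ciphertext is always followed by the 16-byte tag; the
// andMask entry then zeroes the tag bytes before the block is hashed and
// decrypted, and the store loop writes one byte at a time.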
openSSETail16:
	TESTQ inl, inl
	JE openSSEFinalize

	// We can safely load the CT from the end, because it is padded with the MAC
	MOVQ inl, itr2
	SHLQ $4, itr2
	LEAQ ·andMask<>(SB), t0
	MOVOU (inp), T0
	ADDQ inl, inp
	PAND -16(t0)(itr2*1), T0
	MOVO T0, 0+tmpStore
	MOVQ T0, t0
	MOVQ 8+tmpStore, t1
	PXOR A1, T0

	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
openSSETail16Store:
	MOVQ T0, t3
	MOVB t3, (oup)
	PSRLDQ $1, T0
	INCQ oup
	DECQ inl
	JNE openSSETail16Store
	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
	polyMul
	JMP openSSEFinalize

// ----------------------------------------------------------------------------
// Special optimization for the last 64 bytes of ciphertext
openSSETail64:
	// Need to decrypt up to 64 bytes - prepare single block
	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
	XORQ itr2, itr2
	MOVQ inl, itr1
	CMPQ itr1, $16
	JB openSSETail64LoopB

openSSETail64LoopA:
	// Perform ChaCha rounds, while hashing the remaining input
	polyAdd(0(inp)(itr2*1))
	polyMul
	SUBQ $16, itr1

openSSETail64LoopB:
	ADDQ $16, itr2
	chachaQR(A0, B0, C0, D0, T0)
	shiftB0Left; shiftC0Left; shiftD0Left
	chachaQR(A0, B0, C0, D0, T0)
	shiftB0Right; shiftC0Right; shiftD0Right

	CMPQ itr1, $16
	JAE openSSETail64LoopA

	CMPQ itr2, $160
	JNE openSSETail64LoopB

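	// itr2 advances by 16 per double round, so itr2 == 160 exactly when the
	// 10 double rounds (20 ChaCha20 rounds) have run; itr1 counted down the
	// remaining bytes to hash along the way.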
	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0

openSSETail64DecLoop:
	CMPQ inl, $16
	JB openSSETail64DecLoopDone
	SUBQ $16, inl
	MOVOU (inp), T0
	PXOR T0, A0
	MOVOU A0, (oup)
	LEAQ 16(inp), inp
	LEAQ 16(oup), oup
	MOVO B0, A0
	MOVO C0, B0
	MOVO D0, C0
	JMP openSSETail64DecLoop

openSSETail64DecLoopDone:
	MOVO A0, A1
	JMP openSSETail16

// ----------------------------------------------------------------------------
// Special optimization for the last 128 bytes of ciphertext
openSSETail128:
	// Need to decrypt up to 128 bytes - prepare two blocks
	MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
	XORQ itr2, itr2
	MOVQ inl, itr1
	ANDQ $-16, itr1

openSSETail128LoopA:
	// Perform ChaCha rounds, while hashing the remaining input
	polyAdd(0(inp)(itr2*1))
	polyMul

openSSETail128LoopB:
	ADDQ $16, itr2
	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
	shiftB0Left; shiftC0Left; shiftD0Left
	shiftB1Left; shiftC1Left; shiftD1Left
	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
	shiftB0Right; shiftC0Right; shiftD0Right
	shiftB1Right; shiftC1Right; shiftD1Right

	CMPQ itr2, itr1
	JB openSSETail128LoopA

	CMPQ itr2, $160
	JNE openSSETail128LoopB

	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
	PADDL state1Store, B0; PADDL state1Store, B1
	PADDL state2Store, C0; PADDL state2Store, C1
	PADDL ctr1Store, D0; PADDL ctr0Store, D1

	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
	PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)

	SUBQ $64, inl
	LEAQ 64(inp), inp
	LEAQ 64(oup), oup
	JMP openSSETail64DecLoop

// ----------------------------------------------------------------------------
// Special optimization for the last 192 bytes of ciphertext
openSSETail192:
	// Need to decrypt up to 192 bytes - prepare three blocks
	MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
	MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store

	MOVQ inl, itr1
	MOVQ $160, itr2
	CMPQ itr1, $160
	CMOVQGT itr2, itr1
	ANDQ $-16, itr1
	XORQ itr2, itr2

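// The hash limit itr1 = min(inl, 160), rounded down to a block boundary: at
// most one 16-byte block can be hashed per double round, and any blocks
// beyond that (the 160..191 byte range) are hashed explicitly after the loop.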
openSSETail192LoopA:
	// Perform ChaCha rounds, while hashing the remaining input
	polyAdd(0(inp)(itr2*1))
	polyMul

openSSETail192LoopB:
	ADDQ $16, itr2
	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
	shiftB0Left; shiftC0Left; shiftD0Left
	shiftB1Left; shiftC1Left; shiftD1Left
	shiftB2Left; shiftC2Left; shiftD2Left

	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
	shiftB0Right; shiftC0Right; shiftD0Right
	shiftB1Right; shiftC1Right; shiftD1Right
	shiftB2Right; shiftC2Right; shiftD2Right

	CMPQ itr2, itr1
	JB openSSETail192LoopA

	CMPQ itr2, $160
	JNE openSSETail192LoopB

	CMPQ inl, $176
	JB openSSETail192Store

	polyAdd(160(inp))
	polyMul

	CMPQ inl, $192
	JB openSSETail192Store

	polyAdd(176(inp))
	polyMul

openSSETail192Store:
	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
	PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2

	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
	PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
	MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)

	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
	PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)

	SUBQ $128, inl
	LEAQ 128(inp), inp
	LEAQ 128(oup), oup
	JMP openSSETail64DecLoop

// ----------------------------------------------------------------------------
// Special optimization for the last 256 bytes of ciphertext
openSSETail256:
	// Need to decrypt up to 256 bytes - prepare four blocks
	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3

	// Store counters
	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
	XORQ itr2, itr2

openSSETail256Loop:
	// This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
	polyAdd(0(inp)(itr2*1))
	MOVO C3, tmpStore
	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
	MOVO tmpStore, C3
	MOVO C1, tmpStore
	chachaQR(A3, B3, C3, D3, C1)
	MOVO tmpStore, C1
	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
	polyMulStage1
	polyMulStage2
	MOVO C3, tmpStore
	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
	MOVO tmpStore, C3
	MOVO C1, tmpStore
	chachaQR(A3, B3, C3, D3, C1)
	MOVO tmpStore, C1
	polyMulStage3
	polyMulReduceStage
	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
	ADDQ $2*8, itr2
	CMPQ itr2, $160
	JB openSSETail256Loop
	MOVQ inl, itr1
	ANDQ $-16, itr1

openSSETail256HashLoop:
	polyAdd(0(inp)(itr2*1))
	polyMul
	ADDQ $2*8, itr2
	CMPQ itr2, itr1
	JB openSSETail256HashLoop

	// Add in the state
	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
	MOVO D3, tmpStore

	// Load - xor - store
	MOVOU (0*16)(inp), D3; PXOR D3, A0
	MOVOU (1*16)(inp), D3; PXOR D3, B0
	MOVOU (2*16)(inp), D3; PXOR D3, C0
	MOVOU (3*16)(inp), D3; PXOR D3, D0
	MOVOU A0, (0*16)(oup)
	MOVOU B0, (1*16)(oup)
	MOVOU C0, (2*16)(oup)
	MOVOU D0, (3*16)(oup)
	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
	PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
	PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
	LEAQ 192(inp), inp
	LEAQ 192(oup), oup
	SUBQ $192, inl
	MOVO A3, A0
	MOVO B3, B0
	MOVO C3, C0
	MOVO tmpStore, D0

	JMP openSSETail64DecLoop

// ----------------------------------------------------------------------------
// ------------------------- AVX2 Code ----------------------------------------
chacha20Poly1305Open_AVX2:
	VZEROUPPER
	VMOVDQU ·chacha20Constants<>(SB), AA0
	BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
	BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
	BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
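	// The BYTE sequences above hand-assemble VBROADCASTI128 (apparently not
	// supported by the Go assembler when this was written), copying the key
	// and counter|nonce rows at 16(keyp), 32(keyp) and 48(keyp) into both
	// 128-bit lanes of BB0, CC0 and DD0, so each YMM register holds one
	// state row for two blocks.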
	VPADDD ·avx2InitMask<>(SB), DD0, DD0

	// Special optimization, for very short buffers
	CMPQ inl, $192
	JBE openAVX2192
	CMPQ inl, $320
	JBE openAVX2320

	// For the general case, prepare the poly key first - as a byproduct we have 64 bytes of cipher stream
	VMOVDQA BB0, state1StoreAVX2
	VMOVDQA CC0, state2StoreAVX2
	VMOVDQA DD0, ctr3StoreAVX2
	MOVQ $10, itr2

openAVX2PreparePolyKey:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
	DECQ itr2
	JNE openAVX2PreparePolyKey

	VPADDD ·chacha20Constants<>(SB), AA0, AA0
	VPADDD state1StoreAVX2, BB0, BB0
	VPADDD state2StoreAVX2, CC0, CC0
	VPADDD ctr3StoreAVX2, DD0, DD0

	VPERM2I128 $0x02, AA0, BB0, TT0

	// Clamp and store poly key
	VPAND ·polyClampMask<>(SB), TT0, TT0
	VMOVDQA TT0, rsStoreAVX2

	// Stream for the first 64 bytes
	VPERM2I128 $0x13, AA0, BB0, AA0
	VPERM2I128 $0x13, CC0, DD0, BB0

	// Hash AD + first 64 bytes
	MOVQ ad_len+80(FP), itr2
	CALL polyHashADInternal<>(SB)
	XORQ itr1, itr1

openAVX2InitialHash64:
	polyAdd(0(inp)(itr1*1))
	polyMulAVX2
	ADDQ $16, itr1
	CMPQ itr1, $64
	JNE openAVX2InitialHash64

	// Decrypt the first 64 bytes
	VPXOR (0*32)(inp), AA0, AA0
	VPXOR (1*32)(inp), BB0, BB0
	VMOVDQU AA0, (0*32)(oup)
	VMOVDQU BB0, (1*32)(oup)
	LEAQ (2*32)(inp), inp
	LEAQ (2*32)(oup), oup
	SUBQ $64, inl

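// Throughout the AVX2 code each YMM register holds one ChaCha20 state row for
// two consecutive blocks, so VPERM2I128 is used to regroup 128-bit lanes into
// byte-contiguous keystream: imm $0x02 combines the low lanes of its two
// sources, $0x13 the high lanes.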
openAVX2MainLoop:
	CMPQ inl, $512
	JB openAVX2MainLoopDone

	// Load state, increment counter blocks, store the incremented counters
	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
	XORQ itr1, itr1

openAVX2InternalLoop:
	// Let's just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
	// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
	polyAdd(0*8(inp)(itr1*1))
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	polyMulStage1_AVX2
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	polyMulStage2_AVX2
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	polyMulStage3_AVX2
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	polyMulReduceStage
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
	polyAdd(2*8(inp)(itr1*1))
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	polyMulStage1_AVX2
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	polyMulStage2_AVX2
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	polyMulStage3_AVX2
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	polyMulReduceStage
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	polyAdd(4*8(inp)(itr1*1))
	LEAQ (6*8)(itr1), itr1
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	polyMulStage1_AVX2
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	polyMulStage2_AVX2
	VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	polyMulStage3_AVX2
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	polyMulReduceStage
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
	CMPQ itr1, $480
	JNE openAVX2InternalLoop

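	// 10 loop iterations, 3 poly blocks (48 bytes) each: 480 bytes hashed.
	// The last 32 bytes of this 512-byte chunk are hashed below, between
	// the xor/store batches.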
	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
	VMOVDQA CC3, tmpStoreAVX2

	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
	polyAdd(480(inp))
	polyMulAVX2
	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
	VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
	VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
	VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
	VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)

	// and here
	polyAdd(496(inp))
	polyMulAVX2
	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
	VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
	VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
	VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
	VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
	LEAQ (32*16)(inp), inp
	LEAQ (32*16)(oup), oup
	SUBQ $(32*16), inl
	JMP openAVX2MainLoop

openAVX2MainLoopDone:
	// Handle the various tail sizes efficiently
	TESTQ inl, inl
	JE openSSEFinalize
	CMPQ inl, $128
	JBE openAVX2Tail128
	CMPQ inl, $256
	JBE openAVX2Tail256
	CMPQ inl, $384
	JBE openAVX2Tail384
	JMP openAVX2Tail512

// ----------------------------------------------------------------------------
// Special optimization for buffers smaller than 193 bytes
openAVX2192:
	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
	VMOVDQA AA0, AA1
	VMOVDQA BB0, BB1
	VMOVDQA CC0, CC1
	VPADDD ·avx2IncMask<>(SB), DD0, DD1
	VMOVDQA AA0, AA2
	VMOVDQA BB0, BB2
	VMOVDQA CC0, CC2
	VMOVDQA DD0, DD2
	VMOVDQA DD1, TT3
	MOVQ $10, itr2

openAVX2192InnerCipherLoop:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
	DECQ itr2
	JNE openAVX2192InnerCipherLoop
	VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
	VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
	VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
	VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
	VPERM2I128 $0x02, AA0, BB0, TT0

	// Clamp and store poly key
	VPAND ·polyClampMask<>(SB), TT0, TT0
	VMOVDQA TT0, rsStoreAVX2

	// Stream for up to 192 bytes
	VPERM2I128 $0x13, AA0, BB0, AA0
	VPERM2I128 $0x13, CC0, DD0, BB0
	VPERM2I128 $0x02, AA1, BB1, CC0
	VPERM2I128 $0x02, CC1, DD1, DD0
	VPERM2I128 $0x13, AA1, BB1, AA1
	VPERM2I128 $0x13, CC1, DD1, BB1

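// After the shuffles above the keystream sits in AA0, BB0, CC0, DD0, AA1, BB1
// in that order, which is exactly the order the register-rotating loop in
// openAVX2ShortOpenLoop consumes it.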
openAVX2ShortOpen:
	// Hash
	MOVQ ad_len+80(FP), itr2
	CALL polyHashADInternal<>(SB)

openAVX2ShortOpenLoop:
	CMPQ inl, $32
	JB openAVX2ShortTail32
	SUBQ $32, inl

	// Load for hashing
	polyAdd(0*8(inp))
	polyMulAVX2
	polyAdd(2*8(inp))
	polyMulAVX2

	// Load for decryption
	VPXOR (inp), AA0, AA0
	VMOVDQU AA0, (oup)
	LEAQ (1*32)(inp), inp
	LEAQ (1*32)(oup), oup

	// Shift stream left
	VMOVDQA BB0, AA0
	VMOVDQA CC0, BB0
	VMOVDQA DD0, CC0
	VMOVDQA AA1, DD0
	VMOVDQA BB1, AA1
	VMOVDQA CC1, BB1
	VMOVDQA DD1, CC1
	VMOVDQA AA2, DD1
	VMOVDQA BB2, AA2
	JMP openAVX2ShortOpenLoop

openAVX2ShortTail32:
	CMPQ inl, $16
	VMOVDQA A0, A1
	JB openAVX2ShortDone

	SUBQ $16, inl

	// Load for hashing
	polyAdd(0*8(inp))
	polyMulAVX2

	// Load for decryption
	VPXOR (inp), A0, T0
	VMOVDQU T0, (oup)
	LEAQ (1*16)(inp), inp
	LEAQ (1*16)(oup), oup
	VPERM2I128 $0x11, AA0, AA0, AA0
	VMOVDQA A0, A1

openAVX2ShortDone:
	VZEROUPPER
	JMP openSSETail16

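// Both short AVX2 paths finish through openSSETail16: VPERM2I128 $0x11 moves
// the unused high lane of AA0 down, and A1 (its low 128 bits) carries the
// remaining keystream into the SSE tail code.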
// ----------------------------------------------------------------------------
// Special optimization for buffers smaller than 321 bytes
openAVX2320:
	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
	MOVQ $10, itr2

openAVX2320InnerCipherLoop:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
	DECQ itr2
	JNE openAVX2320InnerCipherLoop

	VMOVDQA ·chacha20Constants<>(SB), TT0
	VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
	VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
	VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
	VMOVDQA ·avx2IncMask<>(SB), TT0
	VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
	VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
	VPADDD TT3, DD2, DD2

	// Clamp and store poly key
	VPERM2I128 $0x02, AA0, BB0, TT0
	VPAND ·polyClampMask<>(SB), TT0, TT0
	VMOVDQA TT0, rsStoreAVX2

	// Stream for up to 320 bytes
	VPERM2I128 $0x13, AA0, BB0, AA0
	VPERM2I128 $0x13, CC0, DD0, BB0
	VPERM2I128 $0x02, AA1, BB1, CC0
	VPERM2I128 $0x02, CC1, DD1, DD0
	VPERM2I128 $0x13, AA1, BB1, AA1
	VPERM2I128 $0x13, CC1, DD1, BB1
	VPERM2I128 $0x02, AA2, BB2, CC1
	VPERM2I128 $0x02, CC2, DD2, DD1
	VPERM2I128 $0x13, AA2, BB2, AA2
	VPERM2I128 $0x13, CC2, DD2, BB2
	JMP openAVX2ShortOpen

// ----------------------------------------------------------------------------
// Special optimization for the last 128 bytes of ciphertext
openAVX2Tail128:
	// Need to decrypt up to 128 bytes - prepare two blocks
	VMOVDQA ·chacha20Constants<>(SB), AA1
	VMOVDQA state1StoreAVX2, BB1
	VMOVDQA state2StoreAVX2, CC1
	VMOVDQA ctr3StoreAVX2, DD1
	VPADDD ·avx2IncMask<>(SB), DD1, DD1
	VMOVDQA DD1, DD0

	XORQ itr2, itr2
	MOVQ inl, itr1
	ANDQ $-16, itr1
	TESTQ itr1, itr1
	JE openAVX2Tail128LoopB

openAVX2Tail128LoopA:
	// Perform ChaCha rounds, while hashing the remaining input
	polyAdd(0(inp)(itr2*1))
	polyMulAVX2

openAVX2Tail128LoopB:
	ADDQ $16, itr2
	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	VPALIGNR $4, BB1, BB1, BB1
	VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $12, DD1, DD1, DD1
	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	VPALIGNR $12, BB1, BB1, BB1
	VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $4, DD1, DD1, DD1
	CMPQ itr2, itr1
	JB openAVX2Tail128LoopA
	CMPQ itr2, $160
	JNE openAVX2Tail128LoopB

	VPADDD ·chacha20Constants<>(SB), AA1, AA1
	VPADDD state1StoreAVX2, BB1, BB1
	VPADDD state2StoreAVX2, CC1, CC1
	VPADDD DD0, DD1, DD1
	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0

openAVX2TailLoop:
	CMPQ inl, $32
	JB openAVX2Tail
	SUBQ $32, inl

	// Load for decryption
	VPXOR (inp), AA0, AA0
	VMOVDQU AA0, (oup)
	LEAQ (1*32)(inp), inp
	LEAQ (1*32)(oup), oup
	VMOVDQA BB0, AA0
	VMOVDQA CC0, BB0
	VMOVDQA DD0, CC0
	JMP openAVX2TailLoop

openAVX2Tail:
	CMPQ inl, $16
	VMOVDQA A0, A1
	JB openAVX2TailDone
	SUBQ $16, inl

	// Load for decryption
	VPXOR (inp), A0, T0
	VMOVDQU T0, (oup)
	LEAQ (1*16)(inp), inp
	LEAQ (1*16)(oup), oup
	VPERM2I128 $0x11, AA0, AA0, AA0
	VMOVDQA A0, A1

openAVX2TailDone:
	VZEROUPPER
	JMP openSSETail16

// ----------------------------------------------------------------------------
// Special optimization for the last 256 bytes of ciphertext
openAVX2Tail256:
	// Need to decrypt up to 256 bytes - prepare four blocks
	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD ·avx2IncMask<>(SB), DD0, DD0
	VPADDD ·avx2IncMask<>(SB), DD0, DD1
	VMOVDQA DD0, TT1
	VMOVDQA DD1, TT2

	// Compute the number of iterations that will hash data
	MOVQ inl, tmpStoreAVX2
	MOVQ inl, itr1
	SUBQ $128, itr1
	SHRQ $4, itr1
	MOVQ $10, itr2
	CMPQ itr1, $10
	CMOVQGT itr2, itr1
	MOVQ inp, inl
	XORQ itr2, itr2

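// itr1 = (inl - 128) / 16 is the number of double-round iterations that also
// hash a block, clamped to 10 so hashing cannot outrun the available data;
// inl was stashed in tmpStoreAVX2 and is temporarily reused as the hash
// pointer.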
openAVX2Tail256LoopA:
	polyAdd(0(inl))
	polyMulAVX2
	LEAQ 16(inl), inl

	// Perform ChaCha rounds, while hashing the remaining input
openAVX2Tail256LoopB:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
	INCQ itr2
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
	CMPQ itr2, itr1
	JB openAVX2Tail256LoopA

	CMPQ itr2, $10
	JNE openAVX2Tail256LoopB

	MOVQ inl, itr2
	SUBQ inp, inl
	MOVQ inl, itr1
	MOVQ tmpStoreAVX2, inl

	// Hash the remainder of data (if any)
openAVX2Tail256Hash:
	ADDQ $16, itr1
	CMPQ itr1, inl
	JGT openAVX2Tail256HashEnd
	polyAdd(0(itr2))
	polyMulAVX2
	LEAQ 16(itr2), itr2
	JMP openAVX2Tail256Hash

// Store 128 bytes safely, then go to store loop
openAVX2Tail256HashEnd:
	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
	VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
	VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0

	VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
	VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
	LEAQ (4*32)(inp), inp
	LEAQ (4*32)(oup), oup
	SUBQ $4*32, inl

	JMP openAVX2TailLoop

// ----------------------------------------------------------------------------
// Special optimization for the last 384 bytes of ciphertext
openAVX2Tail384:
	// Need to decrypt up to 384 bytes - prepare six blocks
	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD ·avx2IncMask<>(SB), DD0, DD0
	VPADDD ·avx2IncMask<>(SB), DD0, DD1
	VPADDD ·avx2IncMask<>(SB), DD1, DD2
	VMOVDQA DD0, ctr0StoreAVX2
	VMOVDQA DD1, ctr1StoreAVX2
	VMOVDQA DD2, ctr2StoreAVX2

	// Compute the number of iterations that will hash two blocks of data
	MOVQ inl, tmpStoreAVX2
	MOVQ inl, itr1
	SUBQ $256, itr1
	SHRQ $4, itr1
	ADDQ $6, itr1
	MOVQ $10, itr2
	CMPQ itr1, $10
	CMOVQGT itr2, itr1
	MOVQ inp, inl
	XORQ itr2, itr2

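// Same scheme as the 256-byte tail, but every iteration hashes two blocks:
// itr1 = (inl-256)/16 + 6 iterations (clamped to 10) hash during the rounds,
// and openAVX2Tail384Hash below picks up whatever remains.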
	// Perform ChaCha rounds, while hashing the remaining input
openAVX2Tail384LoopB:
	polyAdd(0(inl))
	polyMulAVX2
	LEAQ 16(inl), inl

openAVX2Tail384LoopA:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
	polyAdd(0(inl))
	polyMulAVX2
	LEAQ 16(inl), inl
	INCQ itr2
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2

	CMPQ itr2, itr1
	JB openAVX2Tail384LoopB

	CMPQ itr2, $10
	JNE openAVX2Tail384LoopA

	MOVQ inl, itr2
	SUBQ inp, inl
	MOVQ inl, itr1
	MOVQ tmpStoreAVX2, inl

openAVX2Tail384Hash:
	ADDQ $16, itr1
	CMPQ itr1, inl
	JGT openAVX2Tail384HashEnd
	polyAdd(0(itr2))
	polyMulAVX2
	LEAQ 16(itr2), itr2
	JMP openAVX2Tail384Hash

// Store 256 bytes safely, then go to store loop
openAVX2Tail384HashEnd:
	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
	VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
	VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
	VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
	VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
	VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
	VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
	LEAQ (8*32)(inp), inp
	LEAQ (8*32)(oup), oup
	SUBQ $8*32, inl
	JMP openAVX2TailLoop

// ----------------------------------------------------------------------------
// Special optimization for the last 512 bytes of ciphertext
openAVX2Tail512:
	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
	XORQ itr1, itr1
	MOVQ inp, itr2

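// The first 4 double-round iterations enter through LoopB and hash three
// blocks each; the remaining 6 run LoopA alone and hash two: 24 blocks
// (384 bytes) in total, matching the SUBQ $384 below. Whatever is left up to
// inl is hashed in openAVX2Tail512HashLoop.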
openAVX2Tail512LoopB:
	polyAdd(0(itr2))
	polyMulAVX2
	LEAQ (2*8)(itr2), itr2

openAVX2Tail512LoopA:
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	polyAdd(0*8(itr2))
	polyMulAVX2
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	polyAdd(2*8(itr2))
	polyMulAVX2
	LEAQ (4*8)(itr2), itr2
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
	INCQ itr1
	CMPQ itr1, $4
	JLT openAVX2Tail512LoopB

	CMPQ itr1, $10
	JNE openAVX2Tail512LoopA

	MOVQ inl, itr1
	SUBQ $384, itr1
	ANDQ $-16, itr1

openAVX2Tail512HashLoop:
	TESTQ itr1, itr1
	JE openAVX2Tail512HashEnd
	polyAdd(0(itr2))
	polyMulAVX2
	LEAQ 16(itr2), itr2
	SUBQ $16, itr1
	JMP openAVX2Tail512HashLoop

openAVX2Tail512HashEnd:
	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
	VMOVDQA CC3, tmpStoreAVX2
	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
	VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
	VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
	VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
	VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
	VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
	VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0

	LEAQ (12*32)(inp), inp
	LEAQ (12*32)(oup), oup
	SUBQ $12*32, inl

	JMP openAVX2TailLoop

// ----------------------------------------------------------------------------
// ----------------------------------------------------------------------------
// func chacha20Poly1305Seal(dst, key, src, ad []byte)
TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
	// For aligned stack access
	MOVQ SP, BP
	ADDQ $32, BP
	ANDQ $-32, BP
	MOVQ dst+0(FP), oup
	MOVQ key+24(FP), keyp
	MOVQ src+48(FP), inp
	MOVQ src_len+56(FP), inl
	MOVQ ad+72(FP), adp

	CMPB ·useAVX2(SB), $1
	JE   chacha20Poly1305Seal_AVX2

	// Special optimization for very short buffers
	CMPQ inl, $128
	JBE  sealSSE128 // About 15% faster

	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
	MOVOU ·chacha20Constants<>(SB), A0
	MOVOU (1*16)(keyp), B0
	MOVOU (2*16)(keyp), C0
	MOVOU (3*16)(keyp), D0

	// Store state on stack for future use
	MOVO B0, state1Store
	MOVO C0, state2Store

	// Load state, increment counter blocks
	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3

	// Store counters
	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
	MOVQ $10, itr2

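// Block 0 (A0..D0, counter 0) never reaches the output: after the rounds
// below, its clamped first 16 bytes become the Poly1305 r value and the next
// 16 become s, per RFC 8439. Roughly, in Go (an illustrative sketch; the
// helper name is hypothetical, not part of this file):
//
//	block := chacha20Block(key, nonce, 0) // 64-byte block, counter 0
//	r := block[0:16]                      // ANDed with ·polyClampMask<>
//	s := block[16:32]                     // block[32:64] is discarded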
sealSSEIntroLoop:
	MOVO C3, tmpStore
	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
	MOVO tmpStore, C3
	MOVO C1, tmpStore
	chachaQR(A3, B3, C3, D3, C1)
	MOVO tmpStore, C1
	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left

	MOVO C3, tmpStore
	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
	MOVO tmpStore, C3
	MOVO C1, tmpStore
	chachaQR(A3, B3, C3, D3, C1)
	MOVO tmpStore, C1
	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
	DECQ itr2
	JNE  sealSSEIntroLoop

	// Add in the state
	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
	PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
	PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3

	// Clamp and store the key
	PAND ·polyClampMask<>(SB), A0
	MOVO A0, rStore
	MOVO B0, sStore

	// Hash AAD
	MOVQ ad_len+80(FP), itr2
	CALL polyHashADInternal<>(SB)

	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
	PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
	PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
	MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)

	MOVQ $128, itr1
	SUBQ $128, inl
	LEAQ 128(inp), inp

	MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1

	CMPQ inl, $64
	JBE  sealSSE128SealHash

	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
	PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
	MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)

	ADDQ $64, itr1
	SUBQ $64, inl
	LEAQ 64(inp), inp

	MOVQ $2, itr1
	MOVQ $8, itr2

	CMPQ inl, $64
	JBE  sealSSETail64
	CMPQ inl, $128
	JBE  sealSSETail128
	CMPQ inl, $192
	JBE  sealSSETail192

sealSSEMainLoop:
	// Load state, increment counter blocks
	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3

	// Store counters
	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store

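// The loop below interleaves one 16-byte Poly1305 block with each ChaCha
// double round, with the multiply split into stages (polyMulStage1..3 and
// polyMulReduceStage) so the scalar MULQs overlap the vector ALU work.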
sealSSEInnerLoop:
	MOVO C3, tmpStore
	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
	MOVO tmpStore, C3
	MOVO C1, tmpStore
	chachaQR(A3, B3, C3, D3, C1)
	MOVO tmpStore, C1
	polyAdd(0(oup))
	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
	polyMulStage1
	polyMulStage2
	LEAQ (2*8)(oup), oup
	MOVO C3, tmpStore
	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
	MOVO tmpStore, C3
	MOVO C1, tmpStore
	polyMulStage3
	chachaQR(A3, B3, C3, D3, C1)
	MOVO tmpStore, C1
	polyMulReduceStage
	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
	DECQ itr2
	JGE  sealSSEInnerLoop
	polyAdd(0(oup))
	polyMul
	LEAQ (2*8)(oup), oup
	DECQ itr1
	JG   sealSSEInnerLoop

	// Add in the state
	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
	MOVO D3, tmpStore

	// Load - xor - store
	MOVOU (0*16)(inp), D3; PXOR D3, A0
	MOVOU (1*16)(inp), D3; PXOR D3, B0
	MOVOU (2*16)(inp), D3; PXOR D3, C0
	MOVOU (3*16)(inp), D3; PXOR D3, D0
	MOVOU A0, (0*16)(oup)
	MOVOU B0, (1*16)(oup)
	MOVOU C0, (2*16)(oup)
	MOVOU D0, (3*16)(oup)
	MOVO tmpStore, D3

	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
	PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
	PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
	ADDQ $192, inp
	MOVQ $192, itr1
	SUBQ $192, inl
	MOVO A3, A1
	MOVO B3, B1
	MOVO C3, C1
	MOVO D3, D1
	CMPQ inl, $64
	JBE  sealSSE128SealHash
	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
	PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
	MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
	LEAQ 64(inp), inp
	SUBQ $64, inl
	MOVQ $6, itr1
	MOVQ $4, itr2
	CMPQ inl, $192
	JG   sealSSEMainLoop

	MOVQ  inl, itr1
	TESTQ inl, inl
	JE    sealSSE128SealHash
	MOVQ  $6, itr1
	CMPQ  inl, $64
	JBE   sealSSETail64
	CMPQ  inl, $128
	JBE   sealSSETail128
	JMP   sealSSETail192

// ----------------------------------------------------------------------------
// Special optimization for the last 64 bytes of plaintext
sealSSETail64:
	// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
	MOVO  ·chacha20Constants<>(SB), A1
	MOVO  state1Store, B1
	MOVO  state2Store, C1
	MOVO  ctr3Store, D1
	PADDL ·sseIncMask<>(SB), D1
	MOVO  D1, ctr0Store

sealSSETail64LoopA:
	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

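// itr1 counts the double rounds that enter through LoopA and so hash two
// 16-byte blocks each; itr2 counts the trailing LoopB-only rounds that hash
// one. With (itr1, itr2) = (2, 8) or (6, 4) that is ten double rounds total,
// hashing 192 or 256 bytes as the comment above says.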
sealSSETail64LoopB:
	chachaQR(A1, B1, C1, D1, T1)
	shiftB1Left; shiftC1Left; shiftD1Left
	chachaQR(A1, B1, C1, D1, T1)
	shiftB1Right; shiftC1Right; shiftD1Right
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

	DECQ itr1
	JG   sealSSETail64LoopA

	DECQ  itr2
	JGE   sealSSETail64LoopB
	PADDL ·chacha20Constants<>(SB), A1
	PADDL state1Store, B1
	PADDL state2Store, C1
	PADDL ctr0Store, D1

	JMP sealSSE128Seal

// ----------------------------------------------------------------------------
// Special optimization for the last 128 bytes of plaintext
sealSSETail128:
	// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store

sealSSETail128LoopA:
	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

sealSSETail128LoopB:
	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
	shiftB0Left; shiftC0Left; shiftD0Left
	shiftB1Left; shiftC1Left; shiftD1Left
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup
	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
	shiftB0Right; shiftC0Right; shiftD0Right
	shiftB1Right; shiftC1Right; shiftD1Right

	DECQ itr1
	JG   sealSSETail128LoopA

	DECQ itr2
	JGE  sealSSETail128LoopB

	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
	PADDL state1Store, B0; PADDL state1Store, B1
	PADDL state2Store, C0; PADDL state2Store, C1
	PADDL ctr0Store, D0; PADDL ctr1Store, D1

	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)

	MOVQ $64, itr1
	LEAQ 64(inp), inp
	SUBQ $64, inl

	JMP sealSSE128SealHash

// ----------------------------------------------------------------------------
// Special optimization for the last 192 bytes of plaintext
sealSSETail192:
	// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store

sealSSETail192LoopA:
	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

sealSSETail192LoopB:
	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
	shiftB0Left; shiftC0Left; shiftD0Left
	shiftB1Left; shiftC1Left; shiftD1Left
	shiftB2Left; shiftC2Left; shiftD2Left

	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
	shiftB0Right; shiftC0Right; shiftD0Right
	shiftB1Right; shiftC1Right; shiftD1Right
	shiftB2Right; shiftC2Right; shiftD2Right

	DECQ itr1
	JG   sealSSETail192LoopA

	DECQ itr2
	JGE  sealSSETail192LoopB

	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
	PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2

	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)

	MOVO A2, A1
	MOVO B2, B1
	MOVO C2, C1
	MOVO D2, D1
	MOVQ $128, itr1
	LEAQ 128(inp), inp
	SUBQ $128, inl

	JMP sealSSE128SealHash

// ----------------------------------------------------------------------------
// Special seal optimization for buffers smaller than 129 bytes
sealSSE128:
	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
	MOVQ  $10, itr2

sealSSE128InnerCipherLoop:
	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
	shiftB0Left; shiftB1Left; shiftB2Left
	shiftC0Left; shiftC1Left; shiftC2Left
	shiftD0Left; shiftD1Left; shiftD2Left
	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
	shiftB0Right; shiftB1Right; shiftB2Right
	shiftC0Right; shiftC1Right; shiftC2Right
	shiftD0Right; shiftD1Right; shiftD2Right
	DECQ itr2
	JNE  sealSSE128InnerCipherLoop

	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
	PADDL T2, C1; PADDL T2, C2
	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
	PAND  ·polyClampMask<>(SB), A0
	MOVOU A0, rStore
	MOVOU B0, sStore

	// Hash
	MOVQ ad_len+80(FP), itr2
	CALL polyHashADInternal<>(SB)
	XORQ itr1, itr1

sealSSE128SealHash:
	// itr1 holds the number of bytes encrypted but not yet hashed
	CMPQ itr1, $16
	JB   sealSSE128Seal
	polyAdd(0(oup))
	polyMul

	SUBQ $16, itr1
	ADDQ $16, oup

	JMP sealSSE128SealHash

sealSSE128Seal:
	CMPQ inl, $16
	JB   sealSSETail
	SUBQ $16, inl

	// Load for encryption
	MOVOU (inp), T0
	PXOR  T0, A1
	MOVOU A1, (oup)
	LEAQ  (1*16)(inp), inp
	LEAQ  (1*16)(oup), oup

	// Extract for hashing
	MOVQ   A1, t0
	PSRLDQ $8, A1
	MOVQ   A1, t1
	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
	polyMul

	// Shift the stream "left"
	MOVO B1, A1
	MOVO C1, B1
	MOVO D1, C1
	MOVO A2, D1
	MOVO B2, A2
	MOVO C2, B2
	MOVO D2, C2
	JMP  sealSSE128Seal

sealSSETail:
	TESTQ inl, inl
	JE    sealSSEFinalize

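// The loop below assembles the last inl (< 16) bytes of plaintext into t3:t2
// as a little-endian value, reading one byte at a time backwards from the end
// of the buffer so it never reads past it. An illustrative Go sketch
// (hypothetical names, not part of this file):
//
//	var block [16]byte
//	copy(block[:], plaintext[len(plaintext)-n:]) // n < 16 bytes remain
//	// block is XORed with keystream, n bytes are written out, and the
//	// ciphertext is masked with ·andMask<> before being fed to Poly1305.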
	// We can only load the PT one byte at a time, to avoid reading past the end of the buffer
	MOVQ inl, itr2
	SHLQ $4, itr2
	LEAQ ·andMask<>(SB), t0
	MOVQ inl, itr1
	LEAQ -1(inp)(inl*1), inp
	XORQ t2, t2
	XORQ t3, t3
	XORQ AX, AX

sealSSETailLoadLoop:
	SHLQ $8, t2, t3
	SHLQ $8, t2
	MOVB (inp), AX
	XORQ AX, t2
	LEAQ -1(inp), inp
	DECQ itr1
	JNE  sealSSETailLoadLoop
	MOVQ t2, 0+tmpStore
	MOVQ t3, 8+tmpStore
	PXOR 0+tmpStore, A1
	MOVOU A1, (oup)
	MOVOU -16(t0)(itr2*1), T0
	PAND  T0, A1
	MOVQ  A1, t0
	PSRLDQ $8, A1
	MOVQ  A1, t1
	ADDQ  t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
	polyMul

	ADDQ inl, oup

sealSSEFinalize:
	// Hash in the buffer lengths
	ADDQ ad_len+80(FP), acc0
	ADCQ src_len+56(FP), acc1
	ADCQ $1, acc2
	polyMul

	// Final reduce: subtract p = 2^130 - 5; if the subtraction borrows,
	// CMOVQCS keeps the original accumulator, otherwise acc - p is the result
	MOVQ    acc0, t0
	MOVQ    acc1, t1
	MOVQ    acc2, t2
	SUBQ    $-5, acc0
	SBBQ    $-1, acc1
	SBBQ    $3, acc2
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1
	CMOVQCS t2, acc2

	// Add in the "s" part of the key
	ADDQ 0+sStore, acc0
	ADCQ 8+sStore, acc1

	// Finally store the tag at the end of the message
	MOVQ acc0, (0*8)(oup)
	MOVQ acc1, (1*8)(oup)
	RET

// ----------------------------------------------------------------------------
// ------------------------- AVX2 Code ----------------------------------------
chacha20Poly1305Seal_AVX2:
	VZEROUPPER
	VMOVDQU ·chacha20Constants<>(SB), AA0
	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
	VPADDD  ·avx2InitMask<>(SB), DD0, DD0

	// Special optimizations for very short buffers
	CMPQ inl, $192
	JBE  seal192AVX2 // 33% faster
	CMPQ inl, $320
	JBE  seal320AVX2 // 17% faster

	// For the general path, prepare the poly key first - as a byproduct we have 64 bytes of cipher stream
	VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
	VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
	VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
	VPADDD  ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
	VPADDD  ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
	VPADDD  ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
	VMOVDQA DD3, ctr3StoreAVX2
	MOVQ    $10, itr2

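// Eight ChaCha20 blocks are processed at once in four ymm register groups,
// two blocks per register; each pass below performs one column round and one
// diagonal round, so ten passes give the full twenty rounds.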
sealAVX2IntroLoop:
	VMOVDQA CC3, tmpStoreAVX2
	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
	VMOVDQA tmpStoreAVX2, CC3
	VMOVDQA CC1, tmpStoreAVX2
	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
	VMOVDQA tmpStoreAVX2, CC1

	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3

	VMOVDQA CC3, tmpStoreAVX2
	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
	VMOVDQA tmpStoreAVX2, CC3
	VMOVDQA CC1, tmpStoreAVX2
	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
	VMOVDQA tmpStoreAVX2, CC1

	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
	DECQ     itr2
	JNE      sealAVX2IntroLoop

	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3

	VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
	VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
	VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95

	// Clamp and store poly key
	VPAND   ·polyClampMask<>(SB), DD0, DD0
	VMOVDQA DD0, rsStoreAVX2

	// Hash AD
	MOVQ ad_len+80(FP), itr2
	CALL polyHashADInternal<>(SB)

	// Can store at least 320 bytes
	VPXOR   (0*32)(inp), AA0, AA0
	VPXOR   (1*32)(inp), CC0, CC0
	VMOVDQU AA0, (0*32)(oup)
	VMOVDQU CC0, (1*32)(oup)

	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
	VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
	VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
	VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
	VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)

	MOVQ $320, itr1
	SUBQ $320, inl
	LEAQ 320(inp), inp

	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
	CMPQ inl, $128
	JBE  sealAVX2SealHash

	VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
	VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
	SUBQ $128, inl
	LEAQ 128(inp), inp

	MOVQ $8, itr1
	MOVQ $2, itr2

	CMPQ inl, $128
	JBE  sealAVX2Tail128
	CMPQ inl, $256
	JBE  sealAVX2Tail256
	CMPQ inl, $384
	JBE  sealAVX2Tail384
	CMPQ inl, $512
	JBE  sealAVX2Tail512

	// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2

	VMOVDQA CC3, tmpStoreAVX2
	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
	VMOVDQA tmpStoreAVX2, CC3
	VMOVDQA CC1, tmpStoreAVX2
	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
	VMOVDQA tmpStoreAVX2, CC1

	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3

	VMOVDQA CC3, tmpStoreAVX2
	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
	VMOVDQA tmpStoreAVX2, CC3
	VMOVDQA CC1, tmpStoreAVX2
	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
	VMOVDQA tmpStoreAVX2, CC1

	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3

	SUBQ $16, oup // Adjust the pointer
	MOVQ $9, itr1
	JMP  sealAVX2InternalLoopStart

sealAVX2MainLoop:
	// Load state, increment counter blocks, store the incremented counters
	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
	MOVQ    $10, itr1

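// Each of the ten iterations below hashes 48 bytes of the previous ciphertext
// (three 16-byte Poly1305 blocks), covering 480 of the 512 bytes; the
// remaining 32 bytes are hashed immediately after the loop.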
sealAVX2InternalLoop:
	polyAdd(0*8(oup))
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	polyMulStage1_AVX2
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	polyMulStage2_AVX2
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	polyMulStage3_AVX2
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	polyMulReduceStage

sealAVX2InternalLoopStart:
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
	polyAdd(2*8(oup))
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	polyMulStage1_AVX2
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	polyMulStage2_AVX2
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	polyMulStage3_AVX2
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	polyMulReduceStage
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	polyAdd(4*8(oup))
	LEAQ (6*8)(oup), oup
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	polyMulStage1_AVX2
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	polyMulStage2_AVX2
	VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	polyMulStage3_AVX2
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	polyMulReduceStage
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
	DECQ itr1
	JNE  sealAVX2InternalLoop

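// Rounds are done: add the saved state back in to obtain the 512 bytes of
// keystream for this iteration.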
	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
	VMOVDQA CC3, tmpStoreAVX2

	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
	polyAdd(0*8(oup))
	polyMulAVX2
	LEAQ (4*8)(oup), oup
	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
	VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
	VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
	VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
	VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)

	// and here
	polyAdd(-2*8(oup))
	polyMulAVX2
	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
	VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
	VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
	VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
	VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
	LEAQ (32*16)(inp), inp
	SUBQ $(32*16), inl
	CMPQ inl, $512
	JG   sealAVX2MainLoop

	// Tail can only hash 480 bytes
	polyAdd(0*8(oup))
	polyMulAVX2
	polyAdd(2*8(oup))
	polyMulAVX2
	LEAQ 32(oup), oup

	MOVQ $10, itr1
	MOVQ $0, itr2
	CMPQ inl, $128
	JBE  sealAVX2Tail128
	CMPQ inl, $256
	JBE  sealAVX2Tail256
	CMPQ inl, $384
	JBE  sealAVX2Tail384
	JMP  sealAVX2Tail512

// ----------------------------------------------------------------------------
// Special optimization for buffers smaller than 193 bytes
seal192AVX2:
	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
	VMOVDQA AA0, AA1
	VMOVDQA BB0, BB1
	VMOVDQA CC0, CC1
	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
	VMOVDQA AA0, AA2
	VMOVDQA BB0, BB2
	VMOVDQA CC0, CC2
	VMOVDQA DD0, DD2
	VMOVDQA DD1, TT3
	MOVQ    $10, itr2

sealAVX2192InnerCipherLoop:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
	DECQ     itr2
	JNE      sealAVX2192InnerCipherLoop
	VPADDD   AA2, AA0, AA0; VPADDD AA2, AA1, AA1
	VPADDD   BB2, BB0, BB0; VPADDD BB2, BB1, BB1
	VPADDD   CC2, CC0, CC0; VPADDD CC2, CC1, CC1
	VPADDD   DD2, DD0, DD0; VPADDD TT3, DD1, DD1
	VPERM2I128 $0x02, AA0, BB0, TT0

	// Clamp and store poly key
	VPAND   ·polyClampMask<>(SB), TT0, TT0
	VMOVDQA TT0, rsStoreAVX2

	// Stream for up to 192 bytes
	VPERM2I128 $0x13, AA0, BB0, AA0
	VPERM2I128 $0x13, CC0, DD0, BB0
	VPERM2I128 $0x02, AA1, BB1, CC0
	VPERM2I128 $0x02, CC1, DD1, DD0
	VPERM2I128 $0x13, AA1, BB1, AA1
	VPERM2I128 $0x13, CC1, DD1, BB1

sealAVX2ShortSeal:
	// Hash aad
	MOVQ ad_len+80(FP), itr2
	CALL polyHashADInternal<>(SB)
	XORQ itr1, itr1

sealAVX2SealHash:
	// itr1 holds the number of bytes encrypted but not yet hashed
	CMPQ itr1, $16
	JB   sealAVX2ShortSealLoop
	polyAdd(0(oup))
	polyMul
	SUBQ $16, itr1
	ADDQ $16, oup
	JMP  sealAVX2SealHash

sealAVX2ShortSealLoop:
	CMPQ inl, $32
	JB   sealAVX2ShortTail32
	SUBQ $32, inl

	// Load for encryption
	VPXOR   (inp), AA0, AA0
	VMOVDQU AA0, (oup)
	LEAQ    (1*32)(inp), inp

	// Now we can hash
	polyAdd(0*8(oup))
	polyMulAVX2
	polyAdd(2*8(oup))
	polyMulAVX2
	LEAQ (1*32)(oup), oup

	// Shift stream left
	VMOVDQA BB0, AA0
	VMOVDQA CC0, BB0
	VMOVDQA DD0, CC0
	VMOVDQA AA1, DD0
	VMOVDQA BB1, AA1
	VMOVDQA CC1, BB1
	VMOVDQA DD1, CC1
	VMOVDQA AA2, DD1
	VMOVDQA BB2, AA2
	JMP     sealAVX2ShortSealLoop

sealAVX2ShortTail32:
	CMPQ    inl, $16
	VMOVDQA A0, A1
	JB      sealAVX2ShortDone

	SUBQ $16, inl

	// Load for encryption
	VPXOR   (inp), A0, T0
	VMOVDQU T0, (oup)
	LEAQ    (1*16)(inp), inp

	// Hash
	polyAdd(0*8(oup))
	polyMulAVX2
	LEAQ       (1*16)(oup), oup
	VPERM2I128 $0x11, AA0, AA0, AA0
	VMOVDQA    A0, A1

sealAVX2ShortDone:
	VZEROUPPER
	JMP sealSSETail

// ----------------------------------------------------------------------------
// Special optimization for buffers smaller than 321 bytes
seal320AVX2:
	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
	MOVQ    $10, itr2

sealAVX2320InnerCipherLoop:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
	DECQ     itr2
	JNE      sealAVX2320InnerCipherLoop

	VMOVDQA ·chacha20Constants<>(SB), TT0
	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
	VMOVDQA ·avx2IncMask<>(SB), TT0
	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
	VPADDD  TT3, DD2, DD2

	// Clamp and store poly key
	VPERM2I128 $0x02, AA0, BB0, TT0
	VPAND      ·polyClampMask<>(SB), TT0, TT0
	VMOVDQA    TT0, rsStoreAVX2

	// Stream for up to 320 bytes
	VPERM2I128 $0x13, AA0, BB0, AA0
	VPERM2I128 $0x13, CC0, DD0, BB0
	VPERM2I128 $0x02, AA1, BB1, CC0
	VPERM2I128 $0x02, CC1, DD1, DD0
	VPERM2I128 $0x13, AA1, BB1, AA1
	VPERM2I128 $0x13, CC1, DD1, BB1
	VPERM2I128 $0x02, AA2, BB2, CC1
	VPERM2I128 $0x02, CC2, DD2, DD1
	VPERM2I128 $0x13, AA2, BB2, AA2
	VPERM2I128 $0x13, CC2, DD2, BB2
	JMP        sealAVX2ShortSeal

// ----------------------------------------------------------------------------
// Special optimization for the last 128 bytes of plaintext
sealAVX2Tail128:
	// Need to encrypt up to 128 bytes - prepare two blocks
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
	VMOVDQA ·chacha20Constants<>(SB), AA0
	VMOVDQA state1StoreAVX2, BB0
	VMOVDQA state2StoreAVX2, CC0
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
	VMOVDQA DD0, DD1

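// In all the seal tails, each of the itr1 LoopA iterations falls through into
// LoopB and hashes 48 bytes of older ciphertext, and each of the itr2 trailing
// LoopB iterations hashes 32 bytes; itr1 + itr2 is always the ten ChaCha20
// double rounds, hashing 448 or 480 bytes in total.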
sealAVX2Tail128LoopA:
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

sealAVX2Tail128LoopB:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
	polyAdd(0(oup))
	polyMul
	VPALIGNR $4, BB0, BB0, BB0
	VPALIGNR $8, CC0, CC0, CC0
	VPALIGNR $12, DD0, DD0, DD0
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
	polyAdd(16(oup))
	polyMul
	LEAQ     32(oup), oup
	VPALIGNR $12, BB0, BB0, BB0
	VPALIGNR $8, CC0, CC0, CC0
	VPALIGNR $4, DD0, DD0, DD0
	DECQ     itr1
	JG       sealAVX2Tail128LoopA
	DECQ     itr2
	JGE      sealAVX2Tail128LoopB

	VPADDD ·chacha20Constants<>(SB), AA0, AA1
	VPADDD state1StoreAVX2, BB0, BB1
	VPADDD state2StoreAVX2, CC0, CC1
	VPADDD DD1, DD0, DD1

	VPERM2I128 $0x02, AA1, BB1, AA0
	VPERM2I128 $0x02, CC1, DD1, BB0
	VPERM2I128 $0x13, AA1, BB1, CC0
	VPERM2I128 $0x13, CC1, DD1, DD0
	JMP        sealAVX2ShortSealLoop

// ----------------------------------------------------------------------------
// Special optimization for the last 256 bytes of plaintext
sealAVX2Tail256:
	// Need to encrypt up to 256 bytes - prepare four blocks
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
	VMOVDQA DD0, TT1
	VMOVDQA DD1, TT2

sealAVX2Tail256LoopA:
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

sealAVX2Tail256LoopB:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	polyAdd(0(oup))
	polyMul
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	polyAdd(16(oup))
	polyMul
	LEAQ     32(oup), oup
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
	DECQ     itr1
	JG       sealAVX2Tail256LoopA
	DECQ     itr2
	JGE      sealAVX2Tail256LoopB

	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
	VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
	VPERM2I128 $0x02, AA0, BB0, TT0
	VPERM2I128 $0x02, CC0, DD0, TT1
	VPERM2I128 $0x13, AA0, BB0, TT2
	VPERM2I128 $0x13, CC0, DD0, TT3
	VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
	VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
	MOVQ $128, itr1
	LEAQ 128(inp), inp
	SUBQ $128, inl
	VPERM2I128 $0x02, AA1, BB1, AA0
	VPERM2I128 $0x02, CC1, DD1, BB0
	VPERM2I128 $0x13, AA1, BB1, CC0
	VPERM2I128 $0x13, CC1, DD1, DD0

	JMP sealAVX2SealHash

// ----------------------------------------------------------------------------
// Special optimization for the last 384 bytes of plaintext
sealAVX2Tail384:
	// Need to encrypt up to 384 bytes - prepare six blocks
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3

sealAVX2Tail384LoopA:
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

sealAVX2Tail384LoopB:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
	polyAdd(0(oup))
	polyMul
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
	polyAdd(16(oup))
	polyMul
	LEAQ     32(oup), oup
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
	DECQ     itr1
	JG       sealAVX2Tail384LoopA
	DECQ     itr2
	JGE      sealAVX2Tail384LoopB

	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
	VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
	VPERM2I128 $0x02, AA0, BB0, TT0
	VPERM2I128 $0x02, CC0, DD0, TT1
	VPERM2I128 $0x13, AA0, BB0, TT2
	VPERM2I128 $0x13, CC0, DD0, TT3
	VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
	VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
	VPERM2I128 $0x02, AA1, BB1, TT0
	VPERM2I128 $0x02, CC1, DD1, TT1
	VPERM2I128 $0x13, AA1, BB1, TT2
	VPERM2I128 $0x13, CC1, DD1, TT3
	VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
	VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
	MOVQ $256, itr1
	LEAQ 256(inp), inp
	SUBQ $256, inl
	VPERM2I128 $0x02, AA2, BB2, AA0
	VPERM2I128 $0x02, CC2, DD2, BB0
	VPERM2I128 $0x13, AA2, BB2, CC0
	VPERM2I128 $0x13, CC2, DD2, DD0

	JMP sealAVX2SealHash

// ----------------------------------------------------------------------------
// Special optimization for the last 512 bytes of plaintext
sealAVX2Tail512:
	// Need to encrypt up to 512 bytes - prepare eight blocks
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2

sealAVX2Tail512LoopA:
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

sealAVX2Tail512LoopB:
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	polyAdd(0*8(oup))
	polyMulAVX2
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	polyAdd(2*8(oup))
	polyMulAVX2
	LEAQ (4*8)(oup), oup
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
	VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3

	DECQ itr1
	JG   sealAVX2Tail512LoopA
	DECQ itr2
	JGE  sealAVX2Tail512LoopB

	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
	VMOVDQA CC3, tmpStoreAVX2
	VPERM2I128 $0x02, AA0, BB0, CC3
	VPXOR (0*32)(inp), CC3, CC3
	VMOVDQU CC3, (0*32)(oup)
	VPERM2I128 $0x02, CC0, DD0, CC3
	VPXOR (1*32)(inp), CC3, CC3
	VMOVDQU CC3, (1*32)(oup)
	VPERM2I128 $0x13, AA0, BB0, CC3
	VPXOR (2*32)(inp), CC3, CC3
	VMOVDQU CC3, (2*32)(oup)
	VPERM2I128 $0x13, CC0, DD0, CC3
	VPXOR (3*32)(inp), CC3, CC3
	VMOVDQU CC3, (3*32)(oup)

	VPERM2I128 $0x02, AA1, BB1, AA0
	VPERM2I128 $0x02, CC1, DD1, BB0
	VPERM2I128 $0x13, AA1, BB1, CC0
	VPERM2I128 $0x13, CC1, DD1, DD0
	VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
	VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)

	VPERM2I128 $0x02, AA2, BB2, AA0
	VPERM2I128 $0x02, CC2, DD2, BB0
	VPERM2I128 $0x13, AA2, BB2, CC0
	VPERM2I128 $0x13, CC2, DD2, DD0
	VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
	VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)

	MOVQ $384, itr1
	LEAQ 384(inp), inp
	SUBQ $384, inl
	VPERM2I128 $0x02, AA3, BB3, AA0
	VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
	VPERM2I128 $0x13, AA3, BB3, CC0
	VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0

	JMP sealAVX2SealHash