1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
6
7// +build !gccgo,!purego
8
9#include "textflag.h"
10// General register allocation
11#define oup DI
12#define inp SI
13#define inl BX
14#define adp CX // free to reuse, after we hash the additional data
15#define keyp R8 // free to reuse, when we copy the key to stack
16#define itr2 R9 // general iterator
17#define itr1 CX // general iterator
18#define acc0 R10
19#define acc1 R11
20#define acc2 R12
21#define t0 R13
22#define t1 R14
23#define t2 R15
24#define t3 R8
25// Register and stack allocation for the SSE code
26#define rStore (0*16)(BP)
27#define sStore (1*16)(BP)
28#define state1Store (2*16)(BP)
29#define state2Store (3*16)(BP)
30#define tmpStore (4*16)(BP)
31#define ctr0Store (5*16)(BP)
32#define ctr1Store (6*16)(BP)
33#define ctr2Store (7*16)(BP)
34#define ctr3Store (8*16)(BP)
35#define A0 X0
36#define A1 X1
37#define A2 X2
38#define B0 X3
39#define B1 X4
40#define B2 X5
41#define C0 X6
42#define C1 X7
43#define C2 X8
44#define D0 X9
45#define D1 X10
46#define D2 X11
47#define T0 X12
48#define T1 X13
49#define T2 X14
50#define T3 X15
51#define A3 T0
52#define B3 T1
53#define C3 T2
54#define D3 T3
55// Register and stack allocation for the AVX2 code
56#define rsStoreAVX2 (0*32)(BP)
57#define state1StoreAVX2 (1*32)(BP)
58#define state2StoreAVX2 (2*32)(BP)
59#define ctr0StoreAVX2 (3*32)(BP)
60#define ctr1StoreAVX2 (4*32)(BP)
61#define ctr2StoreAVX2 (5*32)(BP)
62#define ctr3StoreAVX2 (6*32)(BP)
63#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
64#define AA0 Y0
65#define AA1 Y5
66#define AA2 Y6
67#define AA3 Y7
68#define BB0 Y14
69#define BB1 Y9
70#define BB2 Y10
71#define BB3 Y11
72#define CC0 Y12
73#define CC1 Y13
74#define CC2 Y8
75#define CC3 Y15
76#define DD0 Y4
77#define DD1 Y1
78#define DD2 Y2
79#define DD3 Y3
80#define TT0 DD3
81#define TT1 AA3
82#define TT2 BB3
83#define TT3 CC3
84// ChaCha20 constants
85DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
86DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
87DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
88DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
89DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
90DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
91DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
92DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
93// <<< 16 with PSHUFB
94DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
95DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
96DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
97DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
98// <<< 8 with PSHUFB
99DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
100DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
101DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
102DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
103
104DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
105DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
106DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
107DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
108
109DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
110DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
111DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
112DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
113// Poly1305 key clamp
114DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
115DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
116DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
117DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
118
119DATA ·sseIncMask<>+0x00(SB)/8, $0x1
120DATA ·sseIncMask<>+0x08(SB)/8, $0x0
121// To load/store the last < 16 bytes in a buffer
122DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
123DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
124DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
125DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
126DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
127DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
128DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
129DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
130DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
131DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
132DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
133DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
134DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
135DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
136DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
137DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
138DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
139DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
140DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
141DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
142DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
143DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
144DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
145DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
146DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
147DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
148DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
149DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
150DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
151DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
152
153GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
154GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
155GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
156GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
157GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
158GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
159GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
160GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
161// No PALIGNR in Go ASM yet (but VPALIGNR is present).
162#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
163#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
164#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
165#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
166#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
167#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
168#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
169#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
170#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
171#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
172#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
173#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
174#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
175#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
176#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
177#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
178#define shiftC0Right shiftC0Left
179#define shiftC1Right shiftC1Left
180#define shiftC2Right shiftC2Left
181#define shiftC3Right shiftC3Left
182#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
183#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
184#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
185#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
186// Some macros
187#define chachaQR(A, B, C, D, T) \
188	PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D                            \
189	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
190	PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D                             \
191	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
192
193#define chachaQR_AVX2(A, B, C, D, T) \
194	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D                         \
195	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
196	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D                          \
197	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
198
199#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
200#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
201#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
202#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
203#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
204
205#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
206#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
207#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
208
209#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
210#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
211// ----------------------------------------------------------------------------
212TEXT polyHashADInternal<>(SB), NOSPLIT, $0
213	// adp points to beginning of additional data
214	// itr2 holds ad length
215	XORQ acc0, acc0
216	XORQ acc1, acc1
217	XORQ acc2, acc2
218	CMPQ itr2, $13
219	JNE  hashADLoop
220
221openFastTLSAD:
222	// Special treatment for the TLS case of 13 bytes
223	MOVQ (adp), acc0
224	MOVQ 5(adp), acc1
225	SHRQ $24, acc1
226	MOVQ $1, acc2
227	polyMul
228	RET
229
230hashADLoop:
231	// Hash in 16 byte chunks
232	CMPQ itr2, $16
233	JB   hashADTail
234	polyAdd(0(adp))
235	LEAQ (1*16)(adp), adp
236	SUBQ $16, itr2
237	polyMul
238	JMP  hashADLoop
239
240hashADTail:
241	CMPQ itr2, $0
242	JE   hashADDone
243
244	// Hash last < 16 byte tail
245	XORQ t0, t0
246	XORQ t1, t1
247	XORQ t2, t2
248	ADDQ itr2, adp
249
250hashADTailLoop:
251	SHLQ $8, t0, t1
252	SHLQ $8, t0
253	MOVB -1(adp), t2
254	XORQ t2, t0
255	DECQ adp
256	DECQ itr2
257	JNE  hashADTailLoop
258
259hashADTailFinish:
260	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
261	polyMul
262
263	// Finished AD
264hashADDone:
265	RET
266
267// ----------------------------------------------------------------------------
268// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
269TEXT ·chacha20Poly1305Open(SB), 0, $288-97
270	// For aligned stack access
271	MOVQ SP, BP
272	ADDQ $32, BP
273	ANDQ $-32, BP
274	MOVQ dst+0(FP), oup
275	MOVQ key+24(FP), keyp
276	MOVQ src+48(FP), inp
277	MOVQ src_len+56(FP), inl
278	MOVQ ad+72(FP), adp
279
280	// Check for AVX2 support
281	CMPB ·useAVX2(SB), $1
282	JE   chacha20Poly1305Open_AVX2
283
284	// Special optimization, for very short buffers
285	CMPQ inl, $128
286	JBE  openSSE128 // About 16% faster
287
288	// For long buffers, prepare the poly key first
289	MOVOU ·chacha20Constants<>(SB), A0
290	MOVOU (1*16)(keyp), B0
291	MOVOU (2*16)(keyp), C0
292	MOVOU (3*16)(keyp), D0
293	MOVO  D0, T1
294
295	// Store state on stack for future use
296	MOVO B0, state1Store
297	MOVO C0, state2Store
298	MOVO D0, ctr3Store
299	MOVQ $10, itr2
300
301openSSEPreparePolyKey:
302	chachaQR(A0, B0, C0, D0, T0)
303	shiftB0Left;  shiftC0Left; shiftD0Left
304	chachaQR(A0, B0, C0, D0, T0)
305	shiftB0Right; shiftC0Right; shiftD0Right
306	DECQ          itr2
307	JNE           openSSEPreparePolyKey
308
309	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
310	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
311
312	// Clamp and store the key
313	PAND ·polyClampMask<>(SB), A0
314	MOVO A0, rStore; MOVO B0, sStore
315
316	// Hash AAD
317	MOVQ ad_len+80(FP), itr2
318	CALL polyHashADInternal<>(SB)
319
320openSSEMainLoop:
321	CMPQ inl, $256
322	JB   openSSEMainLoopDone
323
324	// Load state, increment counter blocks
325	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
326	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
327	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
328	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
329
330	// Store counters
331	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
332
333	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
334	MOVQ $4, itr1
335	MOVQ inp, itr2
336
337openSSEInternalLoop:
338	MOVO          C3, tmpStore
339	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
340	MOVO          tmpStore, C3
341	MOVO          C1, tmpStore
342	chachaQR(A3, B3, C3, D3, C1)
343	MOVO          tmpStore, C1
344	polyAdd(0(itr2))
345	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
346	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
347	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
348	polyMulStage1
349	polyMulStage2
350	LEAQ          (2*8)(itr2), itr2
351	MOVO          C3, tmpStore
352	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
353	MOVO          tmpStore, C3
354	MOVO          C1, tmpStore
355	polyMulStage3
356	chachaQR(A3, B3, C3, D3, C1)
357	MOVO          tmpStore, C1
358	polyMulReduceStage
359	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
360	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
361	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
362	DECQ          itr1
363	JGE           openSSEInternalLoop
364
365	polyAdd(0(itr2))
366	polyMul
367	LEAQ (2*8)(itr2), itr2
368
369	CMPQ itr1, $-6
370	JG   openSSEInternalLoop
371
372	// Add in the state
373	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
374	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
375	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
376	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
377
378	// Load - xor - store
379	MOVO  D3, tmpStore
380	MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
381	MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
382	MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
383	MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
384	MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
385	MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
386	MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
387	MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
388	MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
389	MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
390	MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
391	MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
392	MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
393	MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
394	MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
395	MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
396	LEAQ  256(inp), inp
397	LEAQ  256(oup), oup
398	SUBQ  $256, inl
399	JMP   openSSEMainLoop
400
401openSSEMainLoopDone:
402	// Handle the various tail sizes efficiently
403	TESTQ inl, inl
404	JE    openSSEFinalize
405	CMPQ  inl, $64
406	JBE   openSSETail64
407	CMPQ  inl, $128
408	JBE   openSSETail128
409	CMPQ  inl, $192
410	JBE   openSSETail192
411	JMP   openSSETail256
412
413openSSEFinalize:
414	// Hash in the PT, AAD lengths
415	ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
416	polyMul
417
418	// Final reduce
419	MOVQ    acc0, t0
420	MOVQ    acc1, t1
421	MOVQ    acc2, t2
422	SUBQ    $-5, acc0
423	SBBQ    $-1, acc1
424	SBBQ    $3, acc2
425	CMOVQCS t0, acc0
426	CMOVQCS t1, acc1
427	CMOVQCS t2, acc2
428
429	// Add in the "s" part of the key
430	ADDQ 0+sStore, acc0
431	ADCQ 8+sStore, acc1
432
433	// Finally, constant time compare to the tag at the end of the message
434	XORQ    AX, AX
435	MOVQ    $1, DX
436	XORQ    (0*8)(inp), acc0
437	XORQ    (1*8)(inp), acc1
438	ORQ     acc1, acc0
439	CMOVQEQ DX, AX
440
441	// Return true iff tags are equal
442	MOVB AX, ret+96(FP)
443	RET
444
445// ----------------------------------------------------------------------------
446// Special optimization for buffers smaller than 129 bytes
447openSSE128:
448	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
449	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
450	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
451	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
452	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
453	MOVQ  $10, itr2
454
455openSSE128InnerCipherLoop:
456	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
457	shiftB0Left;  shiftB1Left; shiftB2Left
458	shiftC0Left;  shiftC1Left; shiftC2Left
459	shiftD0Left;  shiftD1Left; shiftD2Left
460	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
461	shiftB0Right; shiftB1Right; shiftB2Right
462	shiftC0Right; shiftC1Right; shiftC2Right
463	shiftD0Right; shiftD1Right; shiftD2Right
464	DECQ          itr2
465	JNE           openSSE128InnerCipherLoop
466
467	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
468	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
469	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
470	PADDL T2, C1; PADDL T2, C2
471	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
472
473	// Clamp and store the key
474	PAND  ·polyClampMask<>(SB), A0
475	MOVOU A0, rStore; MOVOU B0, sStore
476
477	// Hash
478	MOVQ ad_len+80(FP), itr2
479	CALL polyHashADInternal<>(SB)
480
481openSSE128Open:
482	CMPQ inl, $16
483	JB   openSSETail16
484	SUBQ $16, inl
485
486	// Load for hashing
487	polyAdd(0(inp))
488
489	// Load for decryption
490	MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
491	LEAQ  (1*16)(inp), inp
492	LEAQ  (1*16)(oup), oup
493	polyMul
494
495	// Shift the stream "left"
496	MOVO B1, A1
497	MOVO C1, B1
498	MOVO D1, C1
499	MOVO A2, D1
500	MOVO B2, A2
501	MOVO C2, B2
502	MOVO D2, C2
503	JMP  openSSE128Open
504
505openSSETail16:
506	TESTQ inl, inl
507	JE    openSSEFinalize
508
509	// We can safely load the CT from the end, because it is padded with the MAC
510	MOVQ   inl, itr2
511	SHLQ   $4, itr2
512	LEAQ   ·andMask<>(SB), t0
513	MOVOU  (inp), T0
514	ADDQ   inl, inp
515	PAND   -16(t0)(itr2*1), T0
516	MOVO   T0, 0+tmpStore
517	MOVQ   T0, t0
518	MOVQ   8+tmpStore, t1
519	PXOR   A1, T0
520
521	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
522openSSETail16Store:
523	MOVQ T0, t3
524	MOVB t3, (oup)
525	PSRLDQ $1, T0
526	INCQ   oup
527	DECQ   inl
528	JNE    openSSETail16Store
529	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
530	polyMul
531	JMP    openSSEFinalize
532
533// ----------------------------------------------------------------------------
534// Special optimization for the last 64 bytes of ciphertext
535openSSETail64:
536	// Need to decrypt up to 64 bytes - prepare single block
537	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
538	XORQ itr2, itr2
539	MOVQ inl, itr1
540	CMPQ itr1, $16
541	JB   openSSETail64LoopB
542
543openSSETail64LoopA:
544	// Perform ChaCha rounds, while hashing the remaining input
545	polyAdd(0(inp)(itr2*1))
546	polyMul
547	SUBQ $16, itr1
548
549openSSETail64LoopB:
550	ADDQ          $16, itr2
551	chachaQR(A0, B0, C0, D0, T0)
552	shiftB0Left;  shiftC0Left; shiftD0Left
553	chachaQR(A0, B0, C0, D0, T0)
554	shiftB0Right; shiftC0Right; shiftD0Right
555
556	CMPQ itr1, $16
557	JAE  openSSETail64LoopA
558
559	CMPQ itr2, $160
560	JNE  openSSETail64LoopB
561
562	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
563
564openSSETail64DecLoop:
565	CMPQ  inl, $16
566	JB    openSSETail64DecLoopDone
567	SUBQ  $16, inl
568	MOVOU (inp), T0
569	PXOR  T0, A0
570	MOVOU A0, (oup)
571	LEAQ  16(inp), inp
572	LEAQ  16(oup), oup
573	MOVO  B0, A0
574	MOVO  C0, B0
575	MOVO  D0, C0
576	JMP   openSSETail64DecLoop
577
578openSSETail64DecLoopDone:
579	MOVO A0, A1
580	JMP  openSSETail16
581
582// ----------------------------------------------------------------------------
583// Special optimization for the last 128 bytes of ciphertext
584openSSETail128:
585	// Need to decrypt up to 128 bytes - prepare two blocks
586	MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
587	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
588	XORQ itr2, itr2
589	MOVQ inl, itr1
590	ANDQ $-16, itr1
591
592openSSETail128LoopA:
593	// Perform ChaCha rounds, while hashing the remaining input
594	polyAdd(0(inp)(itr2*1))
595	polyMul
596
597openSSETail128LoopB:
598	ADDQ          $16, itr2
599	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
600	shiftB0Left;  shiftC0Left; shiftD0Left
601	shiftB1Left;  shiftC1Left; shiftD1Left
602	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
603	shiftB0Right; shiftC0Right; shiftD0Right
604	shiftB1Right; shiftC1Right; shiftD1Right
605
606	CMPQ itr2, itr1
607	JB   openSSETail128LoopA
608
609	CMPQ itr2, $160
610	JNE  openSSETail128LoopB
611
612	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
613	PADDL state1Store, B0; PADDL state1Store, B1
614	PADDL state2Store, C0; PADDL state2Store, C1
615	PADDL ctr1Store, D0; PADDL ctr0Store, D1
616
617	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
618	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
619	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
620
621	SUBQ $64, inl
622	LEAQ 64(inp), inp
623	LEAQ 64(oup), oup
624	JMP  openSSETail64DecLoop
625
626// ----------------------------------------------------------------------------
627// Special optimization for the last 192 bytes of ciphertext
628openSSETail192:
629	// Need to decrypt up to 192 bytes - prepare three blocks
630	MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
631	MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
632	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
633
634	MOVQ    inl, itr1
635	MOVQ    $160, itr2
636	CMPQ    itr1, $160
637	CMOVQGT itr2, itr1
638	ANDQ    $-16, itr1
639	XORQ    itr2, itr2
640
641openSSLTail192LoopA:
642	// Perform ChaCha rounds, while hashing the remaining input
643	polyAdd(0(inp)(itr2*1))
644	polyMul
645
646openSSLTail192LoopB:
647	ADDQ         $16, itr2
648	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
649	shiftB0Left; shiftC0Left; shiftD0Left
650	shiftB1Left; shiftC1Left; shiftD1Left
651	shiftB2Left; shiftC2Left; shiftD2Left
652
653	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
654	shiftB0Right; shiftC0Right; shiftD0Right
655	shiftB1Right; shiftC1Right; shiftD1Right
656	shiftB2Right; shiftC2Right; shiftD2Right
657
658	CMPQ itr2, itr1
659	JB   openSSLTail192LoopA
660
661	CMPQ itr2, $160
662	JNE  openSSLTail192LoopB
663
664	CMPQ inl, $176
665	JB   openSSLTail192Store
666
667	polyAdd(160(inp))
668	polyMul
669
670	CMPQ inl, $192
671	JB   openSSLTail192Store
672
673	polyAdd(176(inp))
674	polyMul
675
676openSSLTail192Store:
677	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
678	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
679	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
680	PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
681
682	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
683	PXOR  T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
684	MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
685
686	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
687	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
688	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
689
690	SUBQ $128, inl
691	LEAQ 128(inp), inp
692	LEAQ 128(oup), oup
693	JMP  openSSETail64DecLoop
694
695// ----------------------------------------------------------------------------
696// Special optimization for the last 256 bytes of ciphertext
697openSSETail256:
698	// Need to decrypt up to 256 bytes - prepare four blocks
699	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
700	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
701	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
702	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
703
704	// Store counters
705	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
706	XORQ itr2, itr2
707
708openSSETail256Loop:
709	// This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication
710	polyAdd(0(inp)(itr2*1))
711	MOVO          C3, tmpStore
712	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
713	MOVO          tmpStore, C3
714	MOVO          C1, tmpStore
715	chachaQR(A3, B3, C3, D3, C1)
716	MOVO          tmpStore, C1
717	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
718	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
719	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
720	polyMulStage1
721	polyMulStage2
722	MOVO          C3, tmpStore
723	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
724	MOVO          tmpStore, C3
725	MOVO          C1, tmpStore
726	chachaQR(A3, B3, C3, D3, C1)
727	MOVO          tmpStore, C1
728	polyMulStage3
729	polyMulReduceStage
730	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
731	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
732	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
733	ADDQ          $2*8, itr2
734	CMPQ          itr2, $160
735	JB            openSSETail256Loop
736	MOVQ          inl, itr1
737	ANDQ          $-16, itr1
738
739openSSETail256HashLoop:
740	polyAdd(0(inp)(itr2*1))
741	polyMul
742	ADDQ $2*8, itr2
743	CMPQ itr2, itr1
744	JB   openSSETail256HashLoop
745
746	// Add in the state
747	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
748	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
749	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
750	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
751	MOVO  D3, tmpStore
752
753	// Load - xor - store
754	MOVOU (0*16)(inp), D3; PXOR D3, A0
755	MOVOU (1*16)(inp), D3; PXOR D3, B0
756	MOVOU (2*16)(inp), D3; PXOR D3, C0
757	MOVOU (3*16)(inp), D3; PXOR D3, D0
758	MOVOU A0, (0*16)(oup)
759	MOVOU B0, (1*16)(oup)
760	MOVOU C0, (2*16)(oup)
761	MOVOU D0, (3*16)(oup)
762	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
763	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
764	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
765	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
766	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
767	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
768	LEAQ  192(inp), inp
769	LEAQ  192(oup), oup
770	SUBQ  $192, inl
771	MOVO  A3, A0
772	MOVO  B3, B0
773	MOVO  C3, C0
774	MOVO  tmpStore, D0
775
776	JMP openSSETail64DecLoop
777
778// ----------------------------------------------------------------------------
779// ------------------------- AVX2 Code ----------------------------------------
780chacha20Poly1305Open_AVX2:
781	VZEROUPPER
782	VMOVDQU ·chacha20Constants<>(SB), AA0
783	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
784	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
785	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
786	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
787
788	// Special optimization, for very short buffers
789	CMPQ inl, $192
790	JBE  openAVX2192
791	CMPQ inl, $320
792	JBE  openAVX2320
793
794	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
795	VMOVDQA BB0, state1StoreAVX2
796	VMOVDQA CC0, state2StoreAVX2
797	VMOVDQA DD0, ctr3StoreAVX2
798	MOVQ    $10, itr2
799
800openAVX2PreparePolyKey:
801	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
802	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
803	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
804	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
805	DECQ     itr2
806	JNE      openAVX2PreparePolyKey
807
808	VPADDD ·chacha20Constants<>(SB), AA0, AA0
809	VPADDD state1StoreAVX2, BB0, BB0
810	VPADDD state2StoreAVX2, CC0, CC0
811	VPADDD ctr3StoreAVX2, DD0, DD0
812
813	VPERM2I128 $0x02, AA0, BB0, TT0
814
815	// Clamp and store poly key
816	VPAND   ·polyClampMask<>(SB), TT0, TT0
817	VMOVDQA TT0, rsStoreAVX2
818
819	// Stream for the first 64 bytes
820	VPERM2I128 $0x13, AA0, BB0, AA0
821	VPERM2I128 $0x13, CC0, DD0, BB0
822
823	// Hash AD + first 64 bytes
824	MOVQ ad_len+80(FP), itr2
825	CALL polyHashADInternal<>(SB)
826	XORQ itr1, itr1
827
828openAVX2InitialHash64:
829	polyAdd(0(inp)(itr1*1))
830	polyMulAVX2
831	ADDQ $16, itr1
832	CMPQ itr1, $64
833	JNE  openAVX2InitialHash64
834
835	// Decrypt the first 64 bytes
836	VPXOR   (0*32)(inp), AA0, AA0
837	VPXOR   (1*32)(inp), BB0, BB0
838	VMOVDQU AA0, (0*32)(oup)
839	VMOVDQU BB0, (1*32)(oup)
840	LEAQ    (2*32)(inp), inp
841	LEAQ    (2*32)(oup), oup
842	SUBQ    $64, inl
843
844openAVX2MainLoop:
845	CMPQ inl, $512
846	JB   openAVX2MainLoopDone
847
848	// Load state, increment counter blocks, store the incremented counters
849	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
850	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
851	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
852	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
853	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
854	XORQ    itr1, itr1
855
856openAVX2InternalLoop:
857	// Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
858	// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
859	polyAdd(0*8(inp)(itr1*1))
860	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
861	polyMulStage1_AVX2
862	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
863	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
864	polyMulStage2_AVX2
865	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
866	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
867	polyMulStage3_AVX2
868	VMOVDQA  CC3, tmpStoreAVX2
869	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
870	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
871	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
872	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
873	VMOVDQA  tmpStoreAVX2, CC3
874	polyMulReduceStage
875	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
876	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
877	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
878	polyAdd(2*8(inp)(itr1*1))
879	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
880	polyMulStage1_AVX2
881	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
882	VMOVDQA  CC3, tmpStoreAVX2
883	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
884	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
885	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
886	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
887	VMOVDQA  tmpStoreAVX2, CC3
888	polyMulStage2_AVX2
889	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
890	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
891	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
892	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
893	polyMulStage3_AVX2
894	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
895	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
896	polyMulReduceStage
897	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
898	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
899	polyAdd(4*8(inp)(itr1*1))
900	LEAQ     (6*8)(itr1), itr1
901	VMOVDQA  CC3, tmpStoreAVX2
902	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
903	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
904	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
905	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
906	VMOVDQA  tmpStoreAVX2, CC3
907	polyMulStage1_AVX2
908	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
909	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
910	polyMulStage2_AVX2
911	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
912	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
913	polyMulStage3_AVX2
914	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
915	VMOVDQA  CC3, tmpStoreAVX2
916	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
917	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
918	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
919	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
920	VMOVDQA  tmpStoreAVX2, CC3
921	polyMulReduceStage
922	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
923	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
924	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
925	CMPQ     itr1, $480
926	JNE      openAVX2InternalLoop
927
928	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
929	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
930	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
931	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
932	VMOVDQA CC3, tmpStoreAVX2
933
934	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
935	polyAdd(480(inp))
936	polyMulAVX2
937	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
938	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
939	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
940	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
941	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
942	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
943
944	// and here
945	polyAdd(496(inp))
946	polyMulAVX2
947	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
948	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
949	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
950	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
951	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
952	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
953	LEAQ       (32*16)(inp), inp
954	LEAQ       (32*16)(oup), oup
955	SUBQ       $(32*16), inl
956	JMP        openAVX2MainLoop
957
958openAVX2MainLoopDone:
959	// Handle the various tail sizes efficiently
960	TESTQ inl, inl
961	JE    openSSEFinalize
962	CMPQ  inl, $128
963	JBE   openAVX2Tail128
964	CMPQ  inl, $256
965	JBE   openAVX2Tail256
966	CMPQ  inl, $384
967	JBE   openAVX2Tail384
968	JMP   openAVX2Tail512
969
970// ----------------------------------------------------------------------------
971// Special optimization for buffers smaller than 193 bytes
972openAVX2192:
973	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
974	VMOVDQA AA0, AA1
975	VMOVDQA BB0, BB1
976	VMOVDQA CC0, CC1
977	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
978	VMOVDQA AA0, AA2
979	VMOVDQA BB0, BB2
980	VMOVDQA CC0, CC2
981	VMOVDQA DD0, DD2
982	VMOVDQA DD1, TT3
983	MOVQ    $10, itr2
984
985openAVX2192InnerCipherLoop:
986	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
987	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
988	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
989	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
990	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
991	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
992	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
993	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
994	DECQ       itr2
995	JNE        openAVX2192InnerCipherLoop
996	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
997	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
998	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
999	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
1000	VPERM2I128 $0x02, AA0, BB0, TT0
1001
1002	// Clamp and store poly key
1003	VPAND   ·polyClampMask<>(SB), TT0, TT0
1004	VMOVDQA TT0, rsStoreAVX2
1005
1006	// Stream for up to 192 bytes
1007	VPERM2I128 $0x13, AA0, BB0, AA0
1008	VPERM2I128 $0x13, CC0, DD0, BB0
1009	VPERM2I128 $0x02, AA1, BB1, CC0
1010	VPERM2I128 $0x02, CC1, DD1, DD0
1011	VPERM2I128 $0x13, AA1, BB1, AA1
1012	VPERM2I128 $0x13, CC1, DD1, BB1
1013
1014openAVX2ShortOpen:
1015	// Hash
1016	MOVQ ad_len+80(FP), itr2
1017	CALL polyHashADInternal<>(SB)
1018
1019openAVX2ShortOpenLoop:
1020	CMPQ inl, $32
1021	JB   openAVX2ShortTail32
1022	SUBQ $32, inl
1023
1024	// Load for hashing
1025	polyAdd(0*8(inp))
1026	polyMulAVX2
1027	polyAdd(2*8(inp))
1028	polyMulAVX2
1029
1030	// Load for decryption
1031	VPXOR   (inp), AA0, AA0
1032	VMOVDQU AA0, (oup)
1033	LEAQ    (1*32)(inp), inp
1034	LEAQ    (1*32)(oup), oup
1035
1036	// Shift stream left
1037	VMOVDQA BB0, AA0
1038	VMOVDQA CC0, BB0
1039	VMOVDQA DD0, CC0
1040	VMOVDQA AA1, DD0
1041	VMOVDQA BB1, AA1
1042	VMOVDQA CC1, BB1
1043	VMOVDQA DD1, CC1
1044	VMOVDQA AA2, DD1
1045	VMOVDQA BB2, AA2
1046	JMP     openAVX2ShortOpenLoop
1047
1048openAVX2ShortTail32:
1049	CMPQ    inl, $16
1050	VMOVDQA A0, A1
1051	JB      openAVX2ShortDone
1052
1053	SUBQ $16, inl
1054
1055	// Load for hashing
1056	polyAdd(0*8(inp))
1057	polyMulAVX2
1058
1059	// Load for decryption
1060	VPXOR      (inp), A0, T0
1061	VMOVDQU    T0, (oup)
1062	LEAQ       (1*16)(inp), inp
1063	LEAQ       (1*16)(oup), oup
1064	VPERM2I128 $0x11, AA0, AA0, AA0
1065	VMOVDQA    A0, A1
1066
1067openAVX2ShortDone:
1068	VZEROUPPER
1069	JMP openSSETail16
1070
1071// ----------------------------------------------------------------------------
1072// Special optimization for buffers smaller than 321 bytes
1073openAVX2320:
1074	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
1075	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
1076	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
1077	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
1078	MOVQ    $10, itr2
1079
1080openAVX2320InnerCipherLoop:
1081	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1082	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
1083	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1084	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
1085	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1086	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
1087	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1088	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
1089	DECQ     itr2
1090	JNE      openAVX2320InnerCipherLoop
1091
1092	VMOVDQA ·chacha20Constants<>(SB), TT0
1093	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
1094	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
1095	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
1096	VMOVDQA ·avx2IncMask<>(SB), TT0
1097	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
1098	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
1099	VPADDD  TT3, DD2, DD2
1100
1101	// Clamp and store poly key
1102	VPERM2I128 $0x02, AA0, BB0, TT0
1103	VPAND      ·polyClampMask<>(SB), TT0, TT0
1104	VMOVDQA    TT0, rsStoreAVX2
1105
1106	// Stream for up to 320 bytes
1107	VPERM2I128 $0x13, AA0, BB0, AA0
1108	VPERM2I128 $0x13, CC0, DD0, BB0
1109	VPERM2I128 $0x02, AA1, BB1, CC0
1110	VPERM2I128 $0x02, CC1, DD1, DD0
1111	VPERM2I128 $0x13, AA1, BB1, AA1
1112	VPERM2I128 $0x13, CC1, DD1, BB1
1113	VPERM2I128 $0x02, AA2, BB2, CC1
1114	VPERM2I128 $0x02, CC2, DD2, DD1
1115	VPERM2I128 $0x13, AA2, BB2, AA2
1116	VPERM2I128 $0x13, CC2, DD2, BB2
1117	JMP        openAVX2ShortOpen
1118
1119// ----------------------------------------------------------------------------
1120// Special optimization for the last 128 bytes of ciphertext
1121openAVX2Tail128:
1122	// Need to decrypt up to 128 bytes - prepare two blocks
1123	VMOVDQA ·chacha20Constants<>(SB), AA1
1124	VMOVDQA state1StoreAVX2, BB1
1125	VMOVDQA state2StoreAVX2, CC1
1126	VMOVDQA ctr3StoreAVX2, DD1
1127	VPADDD  ·avx2IncMask<>(SB), DD1, DD1
1128	VMOVDQA DD1, DD0
1129
1130	XORQ  itr2, itr2
1131	MOVQ  inl, itr1
1132	ANDQ  $-16, itr1
1133	TESTQ itr1, itr1
1134	JE    openAVX2Tail128LoopB
1135
1136openAVX2Tail128LoopA:
1137	// Perform ChaCha rounds, while hashing the remaining input
1138	polyAdd(0(inp)(itr2*1))
1139	polyMulAVX2
1140
1141openAVX2Tail128LoopB:
1142	ADDQ     $16, itr2
1143	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1144	VPALIGNR $4, BB1, BB1, BB1
1145	VPALIGNR $8, CC1, CC1, CC1
1146	VPALIGNR $12, DD1, DD1, DD1
1147	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1148	VPALIGNR $12, BB1, BB1, BB1
1149	VPALIGNR $8, CC1, CC1, CC1
1150	VPALIGNR $4, DD1, DD1, DD1
1151	CMPQ     itr2, itr1
1152	JB       openAVX2Tail128LoopA
1153	CMPQ     itr2, $160
1154	JNE      openAVX2Tail128LoopB
1155
1156	VPADDD     ·chacha20Constants<>(SB), AA1, AA1
1157	VPADDD     state1StoreAVX2, BB1, BB1
1158	VPADDD     state2StoreAVX2, CC1, CC1
1159	VPADDD     DD0, DD1, DD1
1160	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1161
1162openAVX2TailLoop:
1163	CMPQ inl, $32
1164	JB   openAVX2Tail
1165	SUBQ $32, inl
1166
1167	// Load for decryption
1168	VPXOR   (inp), AA0, AA0
1169	VMOVDQU AA0, (oup)
1170	LEAQ    (1*32)(inp), inp
1171	LEAQ    (1*32)(oup), oup
1172	VMOVDQA BB0, AA0
1173	VMOVDQA CC0, BB0
1174	VMOVDQA DD0, CC0
1175	JMP     openAVX2TailLoop
1176
1177openAVX2Tail:
1178	CMPQ    inl, $16
1179	VMOVDQA A0, A1
1180	JB      openAVX2TailDone
1181	SUBQ    $16, inl
1182
1183	// Load for decryption
1184	VPXOR      (inp), A0, T0
1185	VMOVDQU    T0, (oup)
1186	LEAQ       (1*16)(inp), inp
1187	LEAQ       (1*16)(oup), oup
1188	VPERM2I128 $0x11, AA0, AA0, AA0
1189	VMOVDQA    A0, A1
1190
1191openAVX2TailDone:
1192	VZEROUPPER
1193	JMP openSSETail16
1194
1195// ----------------------------------------------------------------------------
1196// Special optimization for the last 256 bytes of ciphertext
1197openAVX2Tail256:
1198	// Need to decrypt up to 256 bytes - prepare four blocks
1199	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
1200	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
1201	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
1202	VMOVDQA ctr3StoreAVX2, DD0
1203	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
1204	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
1205	VMOVDQA DD0, TT1
1206	VMOVDQA DD1, TT2
1207
1208	// Compute the number of iterations that will hash data
1209	MOVQ    inl, tmpStoreAVX2
1210	MOVQ    inl, itr1
1211	SUBQ    $128, itr1
1212	SHRQ    $4, itr1
1213	MOVQ    $10, itr2
1214	CMPQ    itr1, $10
1215	CMOVQGT itr2, itr1
1216	MOVQ    inp, inl
1217	XORQ    itr2, itr2
1218
1219openAVX2Tail256LoopA:
1220	polyAdd(0(inl))
1221	polyMulAVX2
1222	LEAQ 16(inl), inl
1223
1224	// Perform ChaCha rounds, while hashing the remaining input
1225openAVX2Tail256LoopB:
1226	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1227	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
1228	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1229	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
1230	INCQ     itr2
1231	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1232	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
1233	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1234	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
1235	CMPQ     itr2, itr1
1236	JB       openAVX2Tail256LoopA
1237
1238	CMPQ itr2, $10
1239	JNE  openAVX2Tail256LoopB
1240
1241	MOVQ inl, itr2
1242	SUBQ inp, inl
1243	MOVQ inl, itr1
1244	MOVQ tmpStoreAVX2, inl
1245
1246	// Hash the remainder of data (if any)
1247openAVX2Tail256Hash:
1248	ADDQ $16, itr1
1249	CMPQ itr1, inl
1250	JGT  openAVX2Tail256HashEnd
1251	polyAdd (0(itr2))
1252	polyMulAVX2
1253	LEAQ 16(itr2), itr2
1254	JMP  openAVX2Tail256Hash
1255
1256// Store 128 bytes safely, then go to store loop
1257openAVX2Tail256HashEnd:
1258	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
1259	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
1260	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
1261	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
1262	VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
1263	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1264
1265	VPXOR   (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
1266	VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
1267	LEAQ    (4*32)(inp), inp
1268	LEAQ    (4*32)(oup), oup
1269	SUBQ    $4*32, inl
1270
1271	JMP openAVX2TailLoop
1272
1273// ----------------------------------------------------------------------------
1274// Special optimization for the last 384 bytes of ciphertext
1275openAVX2Tail384:
1276	// Need to decrypt up to 384 bytes - prepare six blocks
1277	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
1278	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
1279	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
1280	VMOVDQA ctr3StoreAVX2, DD0
1281	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
1282	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
1283	VPADDD  ·avx2IncMask<>(SB), DD1, DD2
1284	VMOVDQA DD0, ctr0StoreAVX2
1285	VMOVDQA DD1, ctr1StoreAVX2
1286	VMOVDQA DD2, ctr2StoreAVX2
1287
1288	// Compute the number of iterations that will hash two blocks of data
1289	MOVQ    inl, tmpStoreAVX2
1290	MOVQ    inl, itr1
1291	SUBQ    $256, itr1
1292	SHRQ    $4, itr1
1293	ADDQ    $6, itr1
1294	MOVQ    $10, itr2
1295	CMPQ    itr1, $10
1296	CMOVQGT itr2, itr1
1297	MOVQ    inp, inl
1298	XORQ    itr2, itr2
1299
1300	// Perform ChaCha rounds, while hashing the remaining input
1301openAVX2Tail384LoopB:
1302	polyAdd(0(inl))
1303	polyMulAVX2
1304	LEAQ 16(inl), inl
1305
1306openAVX2Tail384LoopA:
1307	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1308	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
1309	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1310	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
1311	polyAdd(0(inl))
1312	polyMulAVX2
1313	LEAQ     16(inl), inl
1314	INCQ     itr2
1315	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1316	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
1317	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1318	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
1319
1320	CMPQ itr2, itr1
1321	JB   openAVX2Tail384LoopB
1322
1323	CMPQ itr2, $10
1324	JNE  openAVX2Tail384LoopA
1325
1326	MOVQ inl, itr2
1327	SUBQ inp, inl
1328	MOVQ inl, itr1
1329	MOVQ tmpStoreAVX2, inl
1330
1331openAVX2Tail384Hash:
1332	ADDQ $16, itr1
1333	CMPQ itr1, inl
1334	JGT  openAVX2Tail384HashEnd
1335	polyAdd(0(itr2))
1336	polyMulAVX2
1337	LEAQ 16(itr2), itr2
1338	JMP  openAVX2Tail384Hash
1339
1340// Store 256 bytes safely, then go to store loop
1341openAVX2Tail384HashEnd:
1342	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
1343	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
1344	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
1345	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
1346	VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
1347	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
1348	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
1349	VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
1350	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
1351	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
1352	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
1353	LEAQ       (8*32)(inp), inp
1354	LEAQ       (8*32)(oup), oup
1355	SUBQ       $8*32, inl
1356	JMP        openAVX2TailLoop
1357
1358// ----------------------------------------------------------------------------
1359// Special optimization for the last 512 bytes of ciphertext
1360openAVX2Tail512:
1361	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
1362	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
1363	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
1364	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
1365	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
1366	XORQ    itr1, itr1
1367	MOVQ    inp, itr2
1368
1369openAVX2Tail512LoopB:
1370	polyAdd(0(itr2))
1371	polyMulAVX2
1372	LEAQ (2*8)(itr2), itr2
1373
1374openAVX2Tail512LoopA:
1375	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1376	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1377	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
1378	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1379	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1380	VMOVDQA  CC3, tmpStoreAVX2
1381	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
1382	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
1383	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
1384	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
1385	VMOVDQA  tmpStoreAVX2, CC3
1386	polyAdd(0*8(itr2))
1387	polyMulAVX2
1388	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1389	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1390	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
1391	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1392	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1393	VMOVDQA  CC3, tmpStoreAVX2
1394	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
1395	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
1396	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
1397	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
1398	VMOVDQA  tmpStoreAVX2, CC3
1399	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
1400	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
1401	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
1402	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1403	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1404	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
1405	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1406	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1407	polyAdd(2*8(itr2))
1408	polyMulAVX2
1409	LEAQ     (4*8)(itr2), itr2
1410	VMOVDQA  CC3, tmpStoreAVX2
1411	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
1412	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
1413	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
1414	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
1415	VMOVDQA  tmpStoreAVX2, CC3
1416	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1417	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1418	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
1419	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1420	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1421	VMOVDQA  CC3, tmpStoreAVX2
1422	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
1423	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
1424	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
1425	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
1426	VMOVDQA  tmpStoreAVX2, CC3
1427	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
1428	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
1429	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
1430	INCQ     itr1
1431	CMPQ     itr1, $4
1432	JLT      openAVX2Tail512LoopB
1433
1434	CMPQ itr1, $10
1435	JNE  openAVX2Tail512LoopA
1436
1437	MOVQ inl, itr1
1438	SUBQ $384, itr1
1439	ANDQ $-16, itr1
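	// itr1 = (inl - 384) rounded down to a multiple of 16: the number of remaining whole
	// 16-byte ciphertext blocks that still need hashing before the final XOR and store.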
1440
1441openAVX2Tail512HashLoop:
1442	TESTQ itr1, itr1
1443	JE    openAVX2Tail512HashEnd
1444	polyAdd(0(itr2))
1445	polyMulAVX2
1446	LEAQ  16(itr2), itr2
1447	SUBQ  $16, itr1
1448	JMP   openAVX2Tail512HashLoop
1449
1450openAVX2Tail512HashEnd:
1451	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
1452	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
1453	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
1454	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
1455	VMOVDQA    CC3, tmpStoreAVX2
1456	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
1457	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
1458	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
1459	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1460	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
1461	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
1462	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
1463	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
1464	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
1465	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
1466
1467	LEAQ (12*32)(inp), inp
1468	LEAQ (12*32)(oup), oup
1469	SUBQ $12*32, inl
1470
1471	JMP openAVX2TailLoop
1472
1473// ----------------------------------------------------------------------------
1474// ----------------------------------------------------------------------------
1475// func chacha20Poly1305Seal(dst, key, src, ad []byte)
1476TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
1477	// For aligned stack access
1478	MOVQ SP, BP
1479	ADDQ $32, BP
1480	ANDQ $-32, BP
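	// BP is rounded up to a 32-byte boundary inside the 288-byte frame so that the
	// MOVO/VMOVDQA scratch slots addressed off BP are suitably aligned.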
1481	MOVQ dst+0(FP), oup
1482	MOVQ key+24(FP), keyp
1483	MOVQ src+48(FP), inp
1484	MOVQ src_len+56(FP), inl
1485	MOVQ ad+72(FP), adp
1486
1487	CMPB ·useAVX2(SB), $1
1488	JE   chacha20Poly1305Seal_AVX2
1489
	// Special optimization for very short buffers
1491	CMPQ inl, $128
1492	JBE  sealSSE128 // About 15% faster
1493
	// In the seal case, prepare the poly key and 3 blocks of stream in the first iteration
1495	MOVOU ·chacha20Constants<>(SB), A0
1496	MOVOU (1*16)(keyp), B0
1497	MOVOU (2*16)(keyp), C0
1498	MOVOU (3*16)(keyp), D0
1499
1500	// Store state on stack for future use
1501	MOVO B0, state1Store
1502	MOVO C0, state2Store
1503
1504	// Load state, increment counter blocks
1505	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1506	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1507	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
1508
1509	// Store counters
1510	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
1511	MOVQ $10, itr2
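	// Each chachaQR invocation performs one ChaCha quarter round (RFC 8439):
	//   a += b; d ^= a; d = d <<< 16
	//   c += d; b ^= c; b = b <<< 12
	//   a += b; d ^= a; d = d <<< 8
	//   c += d; b ^= c; b = b <<< 7
	// Each pass of the loop below runs a column round then a diagonal round (via the
	// shift macros), so 10 passes give the cipher's 20 rounds.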
1512
1513sealSSEIntroLoop:
1514	MOVO         C3, tmpStore
1515	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1516	MOVO         tmpStore, C3
1517	MOVO         C1, tmpStore
1518	chachaQR(A3, B3, C3, D3, C1)
1519	MOVO         tmpStore, C1
1520	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
1521	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
1522	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
1523
1524	MOVO          C3, tmpStore
1525	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1526	MOVO          tmpStore, C3
1527	MOVO          C1, tmpStore
1528	chachaQR(A3, B3, C3, D3, C1)
1529	MOVO          tmpStore, C1
1530	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
1531	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
1532	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
1533	DECQ          itr2
1534	JNE           sealSSEIntroLoop
1535
1536	// Add in the state
1537	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
1538	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
1539	PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
1540	PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
1541
1542	// Clamp and store the key
1543	PAND ·polyClampMask<>(SB), A0
1544	MOVO A0, rStore
1545	MOVO B0, sStore
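	// The first 32 bytes of keystream block 0 form the one-time Poly1305 key: A0 is r
	// (clamped with polyClampMask, as Poly1305 requires) and B0 is s. Only A0|B0 of block 0
	// are needed, which is why C0 and D0 skipped the state addition above.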
1546
1547	// Hash AAD
1548	MOVQ ad_len+80(FP), itr2
1549	CALL polyHashADInternal<>(SB)
1550
1551	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1552	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
1553	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
1554	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
1555	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
1556	MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
1557
1558	MOVQ $128, itr1
1559	SUBQ $128, inl
1560	LEAQ 128(inp), inp
1561
1562	MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
1563
1564	CMPQ inl, $64
1565	JBE  sealSSE128SealHash
1566
1567	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1568	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
1569	MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
1570
1571	ADDQ $64, itr1
1572	SUBQ $64, inl
1573	LEAQ 64(inp), inp
1574
1575	MOVQ $2, itr1
1576	MOVQ $8, itr2
1577
1578	CMPQ inl, $64
1579	JBE  sealSSETail64
1580	CMPQ inl, $128
1581	JBE  sealSSETail128
1582	CMPQ inl, $192
1583	JBE  sealSSETail192
1584
1585sealSSEMainLoop:
1586	// Load state, increment counter blocks
1587	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
1588	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1589	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1590	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
1591
1592	// Store counters
1593	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
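	// Each pass of the inner loop below performs one ChaCha double round over all four
	// blocks while folding 16 bytes of previously written ciphertext into the Poly1305
	// state; the itr1/itr2 counters are arranged so that a full main-loop pass does
	// 10 double rounds and hashes 192 bytes.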
1594
1595sealSSEInnerLoop:
1596	MOVO          C3, tmpStore
1597	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1598	MOVO          tmpStore, C3
1599	MOVO          C1, tmpStore
1600	chachaQR(A3, B3, C3, D3, C1)
1601	MOVO          tmpStore, C1
1602	polyAdd(0(oup))
1603	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
1604	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
1605	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
1606	polyMulStage1
1607	polyMulStage2
1608	LEAQ          (2*8)(oup), oup
1609	MOVO          C3, tmpStore
1610	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1611	MOVO          tmpStore, C3
1612	MOVO          C1, tmpStore
1613	polyMulStage3
1614	chachaQR(A3, B3, C3, D3, C1)
1615	MOVO          tmpStore, C1
1616	polyMulReduceStage
1617	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
1618	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
1619	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
1620	DECQ          itr2
1621	JGE           sealSSEInnerLoop
1622	polyAdd(0(oup))
1623	polyMul
1624	LEAQ          (2*8)(oup), oup
1625	DECQ          itr1
1626	JG            sealSSEInnerLoop
1627
1628	// Add in the state
1629	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
1630	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
1631	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
1632	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
1633	MOVO  D3, tmpStore
1634
1635	// Load - xor - store
1636	MOVOU (0*16)(inp), D3; PXOR D3, A0
1637	MOVOU (1*16)(inp), D3; PXOR D3, B0
1638	MOVOU (2*16)(inp), D3; PXOR D3, C0
1639	MOVOU (3*16)(inp), D3; PXOR D3, D0
1640	MOVOU A0, (0*16)(oup)
1641	MOVOU B0, (1*16)(oup)
1642	MOVOU C0, (2*16)(oup)
1643	MOVOU D0, (3*16)(oup)
1644	MOVO  tmpStore, D3
1645
1646	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
1647	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
1648	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
1649	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
1650	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
1651	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
1652	ADDQ  $192, inp
1653	MOVQ  $192, itr1
1654	SUBQ  $192, inl
1655	MOVO  A3, A1
1656	MOVO  B3, B1
1657	MOVO  C3, C1
1658	MOVO  D3, D1
1659	CMPQ  inl, $64
1660	JBE   sealSSE128SealHash
1661	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1662	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
1663	MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
1664	LEAQ  64(inp), inp
1665	SUBQ  $64, inl
1666	MOVQ  $6, itr1
1667	MOVQ  $4, itr2
1668	CMPQ  inl, $192
1669	JG    sealSSEMainLoop
1670
1671	MOVQ  inl, itr1
1672	TESTQ inl, inl
1673	JE    sealSSE128SealHash
1674	MOVQ  $6, itr1
1675	CMPQ  inl, $64
1676	JBE   sealSSETail64
1677	CMPQ  inl, $128
1678	JBE   sealSSETail128
1679	JMP   sealSSETail192
1680
1681// ----------------------------------------------------------------------------
1682// Special optimization for the last 64 bytes of plaintext
1683sealSSETail64:
1684	// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
1685	MOVO  ·chacha20Constants<>(SB), A1
1686	MOVO  state1Store, B1
1687	MOVO  state2Store, C1
1688	MOVO  ctr3Store, D1
1689	PADDL ·sseIncMask<>(SB), D1
1690	MOVO  D1, ctr0Store
1691
1692sealSSETail64LoopA:
1693	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1694	polyAdd(0(oup))
1695	polyMul
1696	LEAQ 16(oup), oup
1697
1698sealSSETail64LoopB:
1699	chachaQR(A1, B1, C1, D1, T1)
1700	shiftB1Left;  shiftC1Left; shiftD1Left
1701	chachaQR(A1, B1, C1, D1, T1)
1702	shiftB1Right; shiftC1Right; shiftD1Right
1703	polyAdd(0(oup))
1704	polyMul
1705	LEAQ          16(oup), oup
1706
1707	DECQ itr1
1708	JG   sealSSETail64LoopA
1709
1710	DECQ  itr2
1711	JGE   sealSSETail64LoopB
1712	PADDL ·chacha20Constants<>(SB), A1
1713	PADDL state1Store, B1
1714	PADDL state2Store, C1
1715	PADDL ctr0Store, D1
1716
1717	JMP sealSSE128Seal
1718
1719// ----------------------------------------------------------------------------
1720// Special optimization for the last 128 bytes of plaintext
1721sealSSETail128:
1722	// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
1723	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
1724	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
1725
1726sealSSETail128LoopA:
1727	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1728	polyAdd(0(oup))
1729	polyMul
1730	LEAQ 16(oup), oup
1731
1732sealSSETail128LoopB:
1733	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
1734	shiftB0Left;  shiftC0Left; shiftD0Left
1735	shiftB1Left;  shiftC1Left; shiftD1Left
1736	polyAdd(0(oup))
1737	polyMul
1738	LEAQ          16(oup), oup
1739	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
1740	shiftB0Right; shiftC0Right; shiftD0Right
1741	shiftB1Right; shiftC1Right; shiftD1Right
1742
1743	DECQ itr1
1744	JG   sealSSETail128LoopA
1745
1746	DECQ itr2
1747	JGE  sealSSETail128LoopB
1748
1749	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
1750	PADDL state1Store, B0; PADDL state1Store, B1
1751	PADDL state2Store, C0; PADDL state2Store, C1
1752	PADDL ctr0Store, D0; PADDL ctr1Store, D1
1753
1754	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
1755	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
1756	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
1757
1758	MOVQ $64, itr1
1759	LEAQ 64(inp), inp
1760	SUBQ $64, inl
1761
1762	JMP sealSSE128SealHash
1763
1764// ----------------------------------------------------------------------------
1765// Special optimization for the last 192 bytes of plaintext
1766sealSSETail192:
1767	// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
1768	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
1769	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
1770	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
1771
1772sealSSETail192LoopA:
1773	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1774	polyAdd(0(oup))
1775	polyMul
1776	LEAQ 16(oup), oup
1777
1778sealSSETail192LoopB:
1779	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1780	shiftB0Left; shiftC0Left; shiftD0Left
1781	shiftB1Left; shiftC1Left; shiftD1Left
1782	shiftB2Left; shiftC2Left; shiftD2Left
1783
1784	polyAdd(0(oup))
1785	polyMul
1786	LEAQ 16(oup), oup
1787
1788	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1789	shiftB0Right; shiftC0Right; shiftD0Right
1790	shiftB1Right; shiftC1Right; shiftD1Right
1791	shiftB2Right; shiftC2Right; shiftD2Right
1792
1793	DECQ itr1
1794	JG   sealSSETail192LoopA
1795
1796	DECQ itr2
1797	JGE  sealSSETail192LoopB
1798
1799	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
1800	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
1801	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
1802	PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
1803
1804	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
1805	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
1806	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
1807	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
1808	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
1809	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
1810
1811	MOVO A2, A1
1812	MOVO B2, B1
1813	MOVO C2, C1
1814	MOVO D2, D1
1815	MOVQ $128, itr1
1816	LEAQ 128(inp), inp
1817	SUBQ $128, inl
1818
1819	JMP sealSSE128SealHash
1820
1821// ----------------------------------------------------------------------------
1822// Special seal optimization for buffers smaller than 129 bytes
1823sealSSE128:
	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
1825	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
1826	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1827	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1828	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
1829	MOVQ  $10, itr2
1830
1831sealSSE128InnerCipherLoop:
1832	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1833	shiftB0Left;  shiftB1Left; shiftB2Left
1834	shiftC0Left;  shiftC1Left; shiftC2Left
1835	shiftD0Left;  shiftD1Left; shiftD2Left
1836	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1837	shiftB0Right; shiftB1Right; shiftB2Right
1838	shiftC0Right; shiftC1Right; shiftC2Right
1839	shiftD0Right; shiftD1Right; shiftD2Right
1840	DECQ          itr2
1841	JNE           sealSSE128InnerCipherLoop
1842
	// A0|B0 hold the Poly1305 32-byte key; C0, D0 can be discarded
1844	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
1845	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
1846	PADDL T2, C1; PADDL T2, C2
1847	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
1848	PAND  ·polyClampMask<>(SB), A0
1849	MOVOU A0, rStore
1850	MOVOU B0, sStore
1851
1852	// Hash
1853	MOVQ ad_len+80(FP), itr2
1854	CALL polyHashADInternal<>(SB)
1855	XORQ itr1, itr1
1856
1857sealSSE128SealHash:
1858	// itr1 holds the number of bytes encrypted but not yet hashed
1859	CMPQ itr1, $16
1860	JB   sealSSE128Seal
1861	polyAdd(0(oup))
1862	polyMul
1863
1864	SUBQ $16, itr1
1865	ADDQ $16, oup
1866
1867	JMP sealSSE128SealHash
1868
1869sealSSE128Seal:
1870	CMPQ inl, $16
1871	JB   sealSSETail
1872	SUBQ $16, inl
1873
	// Load for encryption
1875	MOVOU (inp), T0
1876	PXOR  T0, A1
1877	MOVOU A1, (oup)
1878	LEAQ  (1*16)(inp), inp
1879	LEAQ  (1*16)(oup), oup
1880
1881	// Extract for hashing
1882	MOVQ   A1, t0
1883	PSRLDQ $8, A1
	MOVQ   A1, t1
1885	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
1886	polyMul
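	// The 16 ciphertext bytes just written are folded straight from the register into the
	// Poly1305 accumulator; the ADCQ $1, acc2 adds the 2^128 padding bit required for a
	// full block, and polyMul then multiplies the result by r.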
1887
1888	// Shift the stream "left"
1889	MOVO B1, A1
1890	MOVO C1, B1
1891	MOVO D1, C1
1892	MOVO A2, D1
1893	MOVO B2, A2
1894	MOVO C2, B2
1895	MOVO D2, C2
1896	JMP  sealSSE128Seal
1897
1898sealSSETail:
1899	TESTQ inl, inl
1900	JE    sealSSEFinalize
1901
	// We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
1903	MOVQ inl, itr2
1904	SHLQ $4, itr2
1905	LEAQ ·andMask<>(SB), t0
1906	MOVQ inl, itr1
1907	LEAQ -1(inp)(inl*1), inp
1908	XORQ t2, t2
1909	XORQ t3, t3
1910	XORQ AX, AX
1911
1912sealSSETailLoadLoop:
1913	SHLQ $8, t2, t3
1914	SHLQ $8, t2
1915	MOVB (inp), AX
1916	XORQ AX, t2
1917	LEAQ   -1(inp), inp
1918	DECQ   itr1
1919	JNE    sealSSETailLoadLoop
1920	MOVQ t2, 0+tmpStore
1921	MOVQ t3, 8+tmpStore
1922	PXOR 0+tmpStore, A1
1923	MOVOU  A1, (oup)
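	// t0 points at the andMask table; the entry at 16*(inl-1) keeps only the inl valid
	// ciphertext bytes, so the keystream bytes beyond the message are not fed into Poly1305.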
1924	MOVOU  -16(t0)(itr2*1), T0
1925	PAND   T0, A1
1926	MOVQ   A1, t0
1927	PSRLDQ $8, A1
1928	MOVQ   A1, t1
1929	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
1930	polyMul
1931
1932	ADDQ inl, oup
1933
1934sealSSEFinalize:
1935	// Hash in the buffer lengths
1936	ADDQ ad_len+80(FP), acc0
1937	ADCQ src_len+56(FP), acc1
1938	ADCQ $1, acc2
1939	polyMul
1940
1941	// Final reduce
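	// The SUBQ/SBBQ sequence subtracts the Poly1305 prime 2^130 - 5 (limbs -5, -1, 3);
	// if that borrows, the CMOVQCS instructions restore the original accumulator,
	// i.e. acc = h mod (2^130 - 5). The tag stored below is then (acc + s) mod 2^128.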
1942	MOVQ    acc0, t0
1943	MOVQ    acc1, t1
1944	MOVQ    acc2, t2
1945	SUBQ    $-5, acc0
1946	SBBQ    $-1, acc1
1947	SBBQ    $3, acc2
1948	CMOVQCS t0, acc0
1949	CMOVQCS t1, acc1
1950	CMOVQCS t2, acc2
1951
1952	// Add in the "s" part of the key
1953	ADDQ 0+sStore, acc0
1954	ADCQ 8+sStore, acc1
1955
1956	// Finally store the tag at the end of the message
1957	MOVQ acc0, (0*8)(oup)
1958	MOVQ acc1, (1*8)(oup)
1959	RET
1960
1961// ----------------------------------------------------------------------------
1962// ------------------------- AVX2 Code ----------------------------------------
1963chacha20Poly1305Seal_AVX2:
1964	VZEROUPPER
1965	VMOVDQU ·chacha20Constants<>(SB), AA0
1966	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
1967	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
1968	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
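	// The three BYTE sequences above are hand-assembled VBROADCASTI128 instructions (see the
	// per-line comments): they replicate the two 16-byte key halves and the counter||nonce row
	// at offsets 16, 32 and 48 off the key pointer into both 128-bit lanes of the corresponding
	// state rows, presumably encoded as raw bytes because the assembler lacked the mnemonic
	// when this code was written.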
1969	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
1970
	// Special optimizations for very short buffers
1972	CMPQ inl, $192
1973	JBE  seal192AVX2 // 33% faster
1974	CMPQ inl, $320
1975	JBE  seal320AVX2 // 17% faster
1976
	// For the general case, prepare the Poly1305 key first; as a byproduct we get 64 bytes of cipher stream
1978	VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
1979	VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
1980	VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
1981	VPADDD  ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
1982	VPADDD  ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
1983	VPADDD  ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
1984	VMOVDQA DD3, ctr3StoreAVX2
1985	MOVQ    $10, itr2
1986
1987sealAVX2IntroLoop:
1988	VMOVDQA CC3, tmpStoreAVX2
1989	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
1990	VMOVDQA tmpStoreAVX2, CC3
1991	VMOVDQA CC1, tmpStoreAVX2
1992	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
1993	VMOVDQA tmpStoreAVX2, CC1
1994
1995	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
1996	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
1997	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
1998	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
1999
2000	VMOVDQA CC3, tmpStoreAVX2
2001	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2002	VMOVDQA tmpStoreAVX2, CC3
2003	VMOVDQA CC1, tmpStoreAVX2
2004	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2005	VMOVDQA tmpStoreAVX2, CC1
2006
2007	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
2008	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
2009	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
2010	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
2011	DECQ     itr2
2012	JNE      sealAVX2IntroLoop
2013
2014	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2015	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2016	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2017	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2018
2019	VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
2020	VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
2021	VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
2022
2023	// Clamp and store poly key
2024	VPAND   ·polyClampMask<>(SB), DD0, DD0
2025	VMOVDQA DD0, rsStoreAVX2
2026
2027	// Hash AD
2028	MOVQ ad_len+80(FP), itr2
2029	CALL polyHashADInternal<>(SB)
2030
	// inl is at least 321 here, so it is safe to encrypt and store at least 320 bytes
2032	VPXOR   (0*32)(inp), AA0, AA0
2033	VPXOR   (1*32)(inp), CC0, CC0
2034	VMOVDQU AA0, (0*32)(oup)
2035	VMOVDQU CC0, (1*32)(oup)
2036
2037	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
2038	VPXOR      (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
2039	VMOVDQU    AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
2040	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
2041	VPXOR      (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
2042	VMOVDQU    AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
2043
2044	MOVQ $320, itr1
2045	SUBQ $320, inl
2046	LEAQ 320(inp), inp
2047
2048	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
2049	CMPQ       inl, $128
2050	JBE        sealAVX2SealHash
2051
2052	VPXOR   (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
2053	VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
2054	SUBQ    $128, inl
2055	LEAQ    128(inp), inp
2056
2057	MOVQ $8, itr1
2058	MOVQ $2, itr2
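	// itr1 and itr2 tell the tail routines how many ChaCha double rounds to run and how many
	// of them should also hash 16-byte blocks of the ciphertext written above; the counts
	// differ depending on whether a tail is entered from here or from after the main loop.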
2059
2060	CMPQ inl, $128
2061	JBE  sealAVX2Tail128
2062	CMPQ inl, $256
2063	JBE  sealAVX2Tail256
2064	CMPQ inl, $384
2065	JBE  sealAVX2Tail384
2066	CMPQ inl, $512
2067	JBE  sealAVX2Tail512
2068
	// We have 448 bytes to hash, but the main loop hashes 512 bytes at a time - perform some rounds before entering the main loop
2070	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2071	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2072	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2073	VMOVDQA ctr3StoreAVX2, DD0
2074	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2075	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2076
2077	VMOVDQA CC3, tmpStoreAVX2
2078	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2079	VMOVDQA tmpStoreAVX2, CC3
2080	VMOVDQA CC1, tmpStoreAVX2
2081	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2082	VMOVDQA tmpStoreAVX2, CC1
2083
2084	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
2085	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
2086	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
2087	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
2088
2089	VMOVDQA CC3, tmpStoreAVX2
2090	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2091	VMOVDQA tmpStoreAVX2, CC3
2092	VMOVDQA CC1, tmpStoreAVX2
2093	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2094	VMOVDQA tmpStoreAVX2, CC1
2095
2096	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
2097	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
2098	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
2099	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
2100	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2101	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2102	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2103	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2104	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2105	VMOVDQA  CC3, tmpStoreAVX2
2106	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2107	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2108	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2109	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2110	VMOVDQA  tmpStoreAVX2, CC3
2111
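	// The straight-line code above performed one double round plus the first half of another
	// without hashing anything. Pull oup back 16 bytes and enter the inner loop in the middle
	// (9 iterations left) so that this first pass hashes exactly the 448 bytes of ciphertext
	// produced so far; a normal pass hashes 512 bytes.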
2112	SUBQ $16, oup                  // Adjust the pointer
2113	MOVQ $9, itr1
2114	JMP  sealAVX2InternalLoopStart
2115
2116sealAVX2MainLoop:
2117	// Load state, increment counter blocks, store the incremented counters
2118	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2119	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2120	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2121	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2122	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2123	MOVQ    $10, itr1
2124
2125sealAVX2InternalLoop:
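	// In the quarter rounds below, the 16- and 8-bit rotations are done with VPSHUFB and the
	// rol16/rol8 shuffle tables, while the 12- and 7-bit rotations are built from a
	// shift-left/shift-right/VPXOR pair; CC3 doubles as the scratch register, so its value is
	// parked in tmpStoreAVX2 around each rotation group. The Poly1305 macros are interleaved
	// between the vector instructions to hide the multiply latency.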
2126	polyAdd(0*8(oup))
2127	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2128	polyMulStage1_AVX2
2129	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2130	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2131	polyMulStage2_AVX2
2132	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2133	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2134	polyMulStage3_AVX2
2135	VMOVDQA CC3, tmpStoreAVX2
2136	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2137	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2138	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2139	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2140	VMOVDQA tmpStoreAVX2, CC3
2141	polyMulReduceStage
2142
2143sealAVX2InternalLoopStart:
2144	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2145	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2146	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2147	polyAdd(2*8(oup))
2148	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2149	polyMulStage1_AVX2
2150	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2151	VMOVDQA  CC3, tmpStoreAVX2
2152	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2153	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2154	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2155	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2156	VMOVDQA  tmpStoreAVX2, CC3
2157	polyMulStage2_AVX2
2158	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
2159	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2160	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
2161	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2162	polyMulStage3_AVX2
2163	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2164	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2165	polyMulReduceStage
2166	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2167	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2168	polyAdd(4*8(oup))
2169	LEAQ     (6*8)(oup), oup
2170	VMOVDQA  CC3, tmpStoreAVX2
2171	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2172	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2173	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2174	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2175	VMOVDQA  tmpStoreAVX2, CC3
2176	polyMulStage1_AVX2
2177	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2178	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2179	polyMulStage2_AVX2
2180	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2181	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2182	polyMulStage3_AVX2
2183	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2184	VMOVDQA  CC3, tmpStoreAVX2
2185	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2186	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2187	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2188	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2189	VMOVDQA  tmpStoreAVX2, CC3
2190	polyMulReduceStage
2191	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
2192	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2193	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
2194	DECQ     itr1
2195	JNE      sealAVX2InternalLoop
2196
2197	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2198	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2199	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2200	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2201	VMOVDQA CC3, tmpStoreAVX2
2202
2203	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
2204	polyAdd(0*8(oup))
2205	polyMulAVX2
2206	LEAQ       (4*8)(oup), oup
2207	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
2208	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
2209	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
2210	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
2211	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
2212	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
2213
2214	// and here
2215	polyAdd(-2*8(oup))
2216	polyMulAVX2
2217	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
2218	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
2219	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
2220	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
2221	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
2222	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
2223	LEAQ       (32*16)(inp), inp
2224	SUBQ       $(32*16), inl
2225	CMPQ       inl, $512
2226	JG         sealAVX2MainLoop
2227
	// The tail routines can hash at most 480 bytes - hash the first 32 bytes here
2229	polyAdd(0*8(oup))
2230	polyMulAVX2
2231	polyAdd(2*8(oup))
2232	polyMulAVX2
2233	LEAQ 32(oup), oup
2234
2235	MOVQ $10, itr1
2236	MOVQ $0, itr2
2237	CMPQ inl, $128
2238	JBE  sealAVX2Tail128
2239	CMPQ inl, $256
2240	JBE  sealAVX2Tail256
2241	CMPQ inl, $384
2242	JBE  sealAVX2Tail384
2243	JMP  sealAVX2Tail512
2244
2245// ----------------------------------------------------------------------------
2246// Special optimization for buffers smaller than 193 bytes
2247seal192AVX2:
2248	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
2249	VMOVDQA AA0, AA1
2250	VMOVDQA BB0, BB1
2251	VMOVDQA CC0, CC1
2252	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
2253	VMOVDQA AA0, AA2
2254	VMOVDQA BB0, BB2
2255	VMOVDQA CC0, CC2
2256	VMOVDQA DD0, DD2
2257	VMOVDQA DD1, TT3
2258	MOVQ    $10, itr2
2259
2260sealAVX2192InnerCipherLoop:
2261	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2262	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
2263	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2264	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
2265	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2266	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
2267	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2268	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
2269	DECQ       itr2
2270	JNE        sealAVX2192InnerCipherLoop
2271	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
2272	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
2273	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
2274	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
2275	VPERM2I128 $0x02, AA0, BB0, TT0
2276
2277	// Clamp and store poly key
2278	VPAND   ·polyClampMask<>(SB), TT0, TT0
2279	VMOVDQA TT0, rsStoreAVX2
2280
2281	// Stream for up to 192 bytes
2282	VPERM2I128 $0x13, AA0, BB0, AA0
2283	VPERM2I128 $0x13, CC0, DD0, BB0
2284	VPERM2I128 $0x02, AA1, BB1, CC0
2285	VPERM2I128 $0x02, CC1, DD1, DD0
2286	VPERM2I128 $0x13, AA1, BB1, AA1
2287	VPERM2I128 $0x13, CC1, DD1, BB1
2288
2289sealAVX2ShortSeal:
2290	// Hash aad
2291	MOVQ ad_len+80(FP), itr2
2292	CALL polyHashADInternal<>(SB)
2293	XORQ itr1, itr1
2294
2295sealAVX2SealHash:
2296	// itr1 holds the number of bytes encrypted but not yet hashed
2297	CMPQ itr1, $16
2298	JB   sealAVX2ShortSealLoop
2299	polyAdd(0(oup))
2300	polyMul
2301	SUBQ $16, itr1
2302	ADDQ $16, oup
2303	JMP  sealAVX2SealHash
2304
2305sealAVX2ShortSealLoop:
2306	CMPQ inl, $32
2307	JB   sealAVX2ShortTail32
2308	SUBQ $32, inl
2309
2310	// Load for encryption
2311	VPXOR   (inp), AA0, AA0
2312	VMOVDQU AA0, (oup)
2313	LEAQ    (1*32)(inp), inp
2314
2315	// Now can hash
2316	polyAdd(0*8(oup))
2317	polyMulAVX2
2318	polyAdd(2*8(oup))
2319	polyMulAVX2
2320	LEAQ (1*32)(oup), oup
2321
2322	// Shift stream left
2323	VMOVDQA BB0, AA0
2324	VMOVDQA CC0, BB0
2325	VMOVDQA DD0, CC0
2326	VMOVDQA AA1, DD0
2327	VMOVDQA BB1, AA1
2328	VMOVDQA CC1, BB1
2329	VMOVDQA DD1, CC1
2330	VMOVDQA AA2, DD1
2331	VMOVDQA BB2, AA2
2332	JMP     sealAVX2ShortSealLoop
2333
2334sealAVX2ShortTail32:
2335	CMPQ    inl, $16
2336	VMOVDQA A0, A1
2337	JB      sealAVX2ShortDone
2338
2339	SUBQ $16, inl
2340
2341	// Load for encryption
2342	VPXOR   (inp), A0, T0
2343	VMOVDQU T0, (oup)
2344	LEAQ    (1*16)(inp), inp
2345
2346	// Hash
2347	polyAdd(0*8(oup))
2348	polyMulAVX2
2349	LEAQ       (1*16)(oup), oup
2350	VPERM2I128 $0x11, AA0, AA0, AA0
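	// VPERM2I128 $0x11 copies the high 128-bit lane of AA0 into both lanes, so A0 (the low
	// half of AA0) now holds the next 16 bytes of keystream for a final partial block.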
2351	VMOVDQA    A0, A1
2352
2353sealAVX2ShortDone:
2354	VZEROUPPER
2355	JMP sealSSETail
2356
2357// ----------------------------------------------------------------------------
2358// Special optimization for buffers smaller than 321 bytes
2359seal320AVX2:
2360	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
2361	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
2362	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
2363	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
2364	MOVQ    $10, itr2
2365
2366sealAVX2320InnerCipherLoop:
2367	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2368	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
2369	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2370	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
2371	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2372	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
2373	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2374	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
2375	DECQ     itr2
2376	JNE      sealAVX2320InnerCipherLoop
2377
2378	VMOVDQA ·chacha20Constants<>(SB), TT0
2379	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
2380	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
2381	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
2382	VMOVDQA ·avx2IncMask<>(SB), TT0
2383	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
2384	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
2385	VPADDD  TT3, DD2, DD2
2386
2387	// Clamp and store poly key
2388	VPERM2I128 $0x02, AA0, BB0, TT0
2389	VPAND      ·polyClampMask<>(SB), TT0, TT0
2390	VMOVDQA    TT0, rsStoreAVX2
2391
2392	// Stream for up to 320 bytes
2393	VPERM2I128 $0x13, AA0, BB0, AA0
2394	VPERM2I128 $0x13, CC0, DD0, BB0
2395	VPERM2I128 $0x02, AA1, BB1, CC0
2396	VPERM2I128 $0x02, CC1, DD1, DD0
2397	VPERM2I128 $0x13, AA1, BB1, AA1
2398	VPERM2I128 $0x13, CC1, DD1, BB1
2399	VPERM2I128 $0x02, AA2, BB2, CC1
2400	VPERM2I128 $0x02, CC2, DD2, DD1
2401	VPERM2I128 $0x13, AA2, BB2, AA2
2402	VPERM2I128 $0x13, CC2, DD2, BB2
2403	JMP        sealAVX2ShortSeal
2404
2405// ----------------------------------------------------------------------------
// Special optimization for the last 128 bytes of plaintext
sealAVX2Tail128:
	// Need to encrypt up to 128 bytes - prepare two blocks
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2411	VMOVDQA ·chacha20Constants<>(SB), AA0
2412	VMOVDQA state1StoreAVX2, BB0
2413	VMOVDQA state2StoreAVX2, CC0
2414	VMOVDQA ctr3StoreAVX2, DD0
2415	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
2416	VMOVDQA DD0, DD1
2417
2418sealAVX2Tail128LoopA:
2419	polyAdd(0(oup))
2420	polyMul
2421	LEAQ 16(oup), oup
2422
2423sealAVX2Tail128LoopB:
2424	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
2425	polyAdd(0(oup))
2426	polyMul
2427	VPALIGNR $4, BB0, BB0, BB0
2428	VPALIGNR $8, CC0, CC0, CC0
2429	VPALIGNR $12, DD0, DD0, DD0
2430	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
2431	polyAdd(16(oup))
2432	polyMul
2433	LEAQ     32(oup), oup
2434	VPALIGNR $12, BB0, BB0, BB0
2435	VPALIGNR $8, CC0, CC0, CC0
2436	VPALIGNR $4, DD0, DD0, DD0
2437	DECQ     itr1
2438	JG       sealAVX2Tail128LoopA
2439	DECQ     itr2
2440	JGE      sealAVX2Tail128LoopB
2441
2442	VPADDD ·chacha20Constants<>(SB), AA0, AA1
2443	VPADDD state1StoreAVX2, BB0, BB1
2444	VPADDD state2StoreAVX2, CC0, CC1
2445	VPADDD DD1, DD0, DD1
2446
2447	VPERM2I128 $0x02, AA1, BB1, AA0
2448	VPERM2I128 $0x02, CC1, DD1, BB0
2449	VPERM2I128 $0x13, AA1, BB1, CC0
2450	VPERM2I128 $0x13, CC1, DD1, DD0
2451	JMP        sealAVX2ShortSealLoop
2452
2453// ----------------------------------------------------------------------------
// Special optimization for the last 256 bytes of plaintext
sealAVX2Tail256:
	// Need to encrypt up to 256 bytes - prepare four blocks
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2459	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
2460	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
2461	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
2462	VMOVDQA ctr3StoreAVX2, DD0
2463	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
2464	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
2465	VMOVDQA DD0, TT1
2466	VMOVDQA DD1, TT2
2467
2468sealAVX2Tail256LoopA:
2469	polyAdd(0(oup))
2470	polyMul
2471	LEAQ 16(oup), oup
2472
2473sealAVX2Tail256LoopB:
2474	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2475	polyAdd(0(oup))
2476	polyMul
2477	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
2478	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2479	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
2480	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2481	polyAdd(16(oup))
2482	polyMul
2483	LEAQ     32(oup), oup
2484	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
2485	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2486	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
2487	DECQ     itr1
2488	JG       sealAVX2Tail256LoopA
2489	DECQ     itr2
2490	JGE      sealAVX2Tail256LoopB
2491
2492	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
2493	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
2494	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
2495	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
2496	VPERM2I128 $0x02, AA0, BB0, TT0
2497	VPERM2I128 $0x02, CC0, DD0, TT1
2498	VPERM2I128 $0x13, AA0, BB0, TT2
2499	VPERM2I128 $0x13, CC0, DD0, TT3
2500	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
2501	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
2502	MOVQ       $128, itr1
2503	LEAQ       128(inp), inp
2504	SUBQ       $128, inl
2505	VPERM2I128 $0x02, AA1, BB1, AA0
2506	VPERM2I128 $0x02, CC1, DD1, BB0
2507	VPERM2I128 $0x13, AA1, BB1, CC0
2508	VPERM2I128 $0x13, CC1, DD1, DD0
2509
2510	JMP sealAVX2SealHash
2511
2512// ----------------------------------------------------------------------------
// Special optimization for the last 384 bytes of plaintext
sealAVX2Tail384:
	// Need to encrypt up to 384 bytes - prepare six blocks
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2518	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
2519	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
2520	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
2521	VMOVDQA ctr3StoreAVX2, DD0
2522	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
2523	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
2524
2525sealAVX2Tail384LoopA:
2526	polyAdd(0(oup))
2527	polyMul
2528	LEAQ 16(oup), oup
2529
2530sealAVX2Tail384LoopB:
2531	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2532	polyAdd(0(oup))
2533	polyMul
2534	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
2535	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2536	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
2537	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2538	polyAdd(16(oup))
2539	polyMul
2540	LEAQ     32(oup), oup
2541	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
2542	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2543	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
2544	DECQ     itr1
2545	JG       sealAVX2Tail384LoopA
2546	DECQ     itr2
2547	JGE      sealAVX2Tail384LoopB
2548
2549	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
2550	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
2551	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
2552	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
2553	VPERM2I128 $0x02, AA0, BB0, TT0
2554	VPERM2I128 $0x02, CC0, DD0, TT1
2555	VPERM2I128 $0x13, AA0, BB0, TT2
2556	VPERM2I128 $0x13, CC0, DD0, TT3
2557	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
2558	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
2559	VPERM2I128 $0x02, AA1, BB1, TT0
2560	VPERM2I128 $0x02, CC1, DD1, TT1
2561	VPERM2I128 $0x13, AA1, BB1, TT2
2562	VPERM2I128 $0x13, CC1, DD1, TT3
2563	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
2564	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
2565	MOVQ       $256, itr1
2566	LEAQ       256(inp), inp
2567	SUBQ       $256, inl
2568	VPERM2I128 $0x02, AA2, BB2, AA0
2569	VPERM2I128 $0x02, CC2, DD2, BB0
2570	VPERM2I128 $0x13, AA2, BB2, CC0
2571	VPERM2I128 $0x13, CC2, DD2, DD0
2572
2573	JMP sealAVX2SealHash
2574
2575// ----------------------------------------------------------------------------
// Special optimization for the last 512 bytes of plaintext
sealAVX2Tail512:
	// Need to encrypt up to 512 bytes - prepare eight blocks
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2581	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2582	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2583	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2584	VMOVDQA ctr3StoreAVX2, DD0
2585	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2586	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2587
2588sealAVX2Tail512LoopA:
2589	polyAdd(0(oup))
2590	polyMul
2591	LEAQ 16(oup), oup
2592
2593sealAVX2Tail512LoopB:
2594	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2595	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2596	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2597	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2598	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2599	VMOVDQA  CC3, tmpStoreAVX2
2600	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2601	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2602	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2603	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2604	VMOVDQA  tmpStoreAVX2, CC3
2605	polyAdd(0*8(oup))
2606	polyMulAVX2
2607	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2608	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2609	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2610	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2611	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2612	VMOVDQA  CC3, tmpStoreAVX2
2613	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2614	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2615	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2616	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2617	VMOVDQA  tmpStoreAVX2, CC3
2618	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
2619	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2620	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
2621	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2622	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2623	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2624	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2625	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2626	polyAdd(2*8(oup))
2627	polyMulAVX2
2628	LEAQ     (4*8)(oup), oup
2629	VMOVDQA  CC3, tmpStoreAVX2
2630	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2631	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2632	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2633	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2634	VMOVDQA  tmpStoreAVX2, CC3
2635	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2636	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2637	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2638	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2639	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2640	VMOVDQA  CC3, tmpStoreAVX2
2641	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2642	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2643	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2644	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2645	VMOVDQA  tmpStoreAVX2, CC3
2646	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
2647	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2648	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
2649
2650	DECQ itr1
2651	JG   sealAVX2Tail512LoopA
2652	DECQ itr2
2653	JGE  sealAVX2Tail512LoopB
2654
2655	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2656	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2657	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2658	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2659	VMOVDQA    CC3, tmpStoreAVX2
2660	VPERM2I128 $0x02, AA0, BB0, CC3
2661	VPXOR      (0*32)(inp), CC3, CC3
2662	VMOVDQU    CC3, (0*32)(oup)
2663	VPERM2I128 $0x02, CC0, DD0, CC3
2664	VPXOR      (1*32)(inp), CC3, CC3
2665	VMOVDQU    CC3, (1*32)(oup)
2666	VPERM2I128 $0x13, AA0, BB0, CC3
2667	VPXOR      (2*32)(inp), CC3, CC3
2668	VMOVDQU    CC3, (2*32)(oup)
2669	VPERM2I128 $0x13, CC0, DD0, CC3
2670	VPXOR      (3*32)(inp), CC3, CC3
2671	VMOVDQU    CC3, (3*32)(oup)
2672
2673	VPERM2I128 $0x02, AA1, BB1, AA0
2674	VPERM2I128 $0x02, CC1, DD1, BB0
2675	VPERM2I128 $0x13, AA1, BB1, CC0
2676	VPERM2I128 $0x13, CC1, DD1, DD0
2677	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
2678	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
2679
2680	VPERM2I128 $0x02, AA2, BB2, AA0
2681	VPERM2I128 $0x02, CC2, DD2, BB0
2682	VPERM2I128 $0x13, AA2, BB2, CC0
2683	VPERM2I128 $0x13, CC2, DD2, DD0
2684	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
2685	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
2686
2687	MOVQ       $384, itr1
2688	LEAQ       384(inp), inp
2689	SUBQ       $384, inl
2690	VPERM2I128 $0x02, AA3, BB3, AA0
2691	VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
2692	VPERM2I128 $0x13, AA3, BB3, CC0
2693	VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
2694
2695	JMP sealAVX2SealHash
2696