1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
6
7//go:build gc && !purego
8// +build gc,!purego
9
10#include "textflag.h"
11// General register allocation
12#define oup DI
13#define inp SI
14#define inl BX
15#define adp CX // free to reuse, after we hash the additional data
16#define keyp R8 // free to reuse, when we copy the key to stack
17#define itr2 R9 // general iterator
18#define itr1 CX // general iterator
19#define acc0 R10
20#define acc1 R11
21#define acc2 R12
22#define t0 R13
23#define t1 R14
24#define t2 R15
25#define t3 R8
26// Register and stack allocation for the SSE code
27#define rStore (0*16)(BP)
28#define sStore (1*16)(BP)
29#define state1Store (2*16)(BP)
30#define state2Store (3*16)(BP)
31#define tmpStore (4*16)(BP)
32#define ctr0Store (5*16)(BP)
33#define ctr1Store (6*16)(BP)
34#define ctr2Store (7*16)(BP)
35#define ctr3Store (8*16)(BP)
36#define A0 X0
37#define A1 X1
38#define A2 X2
39#define B0 X3
40#define B1 X4
41#define B2 X5
42#define C0 X6
43#define C1 X7
44#define C2 X8
45#define D0 X9
46#define D1 X10
47#define D2 X11
48#define T0 X12
49#define T1 X13
50#define T2 X14
51#define T3 X15
52#define A3 T0
53#define B3 T1
54#define C3 T2
55#define D3 T3
56// Register and stack allocation for the AVX2 code
57#define rsStoreAVX2 (0*32)(BP)
58#define state1StoreAVX2 (1*32)(BP)
59#define state2StoreAVX2 (2*32)(BP)
60#define ctr0StoreAVX2 (3*32)(BP)
61#define ctr1StoreAVX2 (4*32)(BP)
62#define ctr2StoreAVX2 (5*32)(BP)
63#define ctr3StoreAVX2 (6*32)(BP)
64#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
65#define AA0 Y0
66#define AA1 Y5
67#define AA2 Y6
68#define AA3 Y7
69#define BB0 Y14
70#define BB1 Y9
71#define BB2 Y10
72#define BB3 Y11
73#define CC0 Y12
74#define CC1 Y13
75#define CC2 Y8
76#define CC3 Y15
77#define DD0 Y4
78#define DD1 Y1
79#define DD2 Y2
80#define DD3 Y3
81#define TT0 DD3
82#define TT1 AA3
83#define TT2 BB3
84#define TT3 CC3
85// ChaCha20 constants
86DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
87DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
88DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
89DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
90DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
91DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
92DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
93DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
94// <<< 16 with PSHUFB
95DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
96DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
97DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
98DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
99// <<< 8 with PSHUFB
100DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
101DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
102DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
103DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
104
105DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
106DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
107DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
108DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
109
110DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
111DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
112DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
113DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
114// Poly1305 key clamp
115DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
116DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
117DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
118DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
119
120DATA ·sseIncMask<>+0x00(SB)/8, $0x1
121DATA ·sseIncMask<>+0x08(SB)/8, $0x0
122// To load/store the last < 16 bytes in a buffer
123DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
124DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
125DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
126DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
127DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
128DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
129DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
130DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
131DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
132DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
133DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
134DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
135DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
136DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
137DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
138DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
139DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
140DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
141DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
142DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
143DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
144DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
145DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
146DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
147DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
148DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
149DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
150DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
151DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
152DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
153
154GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
155GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
156GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
157GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
158GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
159GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
160GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
161GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
162// No PALIGNR in Go ASM yet (but VPALIGNR is present).
163#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
164#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
165#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
166#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
167#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
168#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
169#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
170#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
171#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
172#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
173#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
174#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
175#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
176#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
177#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
178#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
179#define shiftC0Right shiftC0Left
180#define shiftC1Right shiftC1Left
181#define shiftC2Right shiftC2Left
182#define shiftC3Right shiftC3Left
183#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
184#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
185#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
186#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
187// Some macros
188#define chachaQR(A, B, C, D, T) \
189	PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D                            \
190	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
191	PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D                             \
192	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
193
194#define chachaQR_AVX2(A, B, C, D, T) \
195	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D                         \
196	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
197	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D                          \
198	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
199
200#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
201#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
202#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
203#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
204#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
205
206#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
207#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
208#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
209
210#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
211#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
212// ----------------------------------------------------------------------------
213TEXT polyHashADInternal<>(SB), NOSPLIT, $0
214	// adp points to beginning of additional data
215	// itr2 holds ad length
216	XORQ acc0, acc0
217	XORQ acc1, acc1
218	XORQ acc2, acc2
219	CMPQ itr2, $13
220	JNE  hashADLoop
221
222openFastTLSAD:
223	// Special treatment for the TLS case of 13 bytes
224	MOVQ (adp), acc0
225	MOVQ 5(adp), acc1
226	SHRQ $24, acc1
227	MOVQ $1, acc2
228	polyMul
229	RET
230
231hashADLoop:
232	// Hash in 16 byte chunks
233	CMPQ itr2, $16
234	JB   hashADTail
235	polyAdd(0(adp))
236	LEAQ (1*16)(adp), adp
237	SUBQ $16, itr2
238	polyMul
239	JMP  hashADLoop
240
241hashADTail:
242	CMPQ itr2, $0
243	JE   hashADDone
244
245	// Hash last < 16 byte tail
246	XORQ t0, t0
247	XORQ t1, t1
248	XORQ t2, t2
249	ADDQ itr2, adp
250
251hashADTailLoop:
252	SHLQ $8, t0, t1
253	SHLQ $8, t0
254	MOVB -1(adp), t2
255	XORQ t2, t0
256	DECQ adp
257	DECQ itr2
258	JNE  hashADTailLoop
259
260hashADTailFinish:
261	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
262	polyMul
263
264	// Finished AD
265hashADDone:
266	RET
267
268// ----------------------------------------------------------------------------
269// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
270TEXT ·chacha20Poly1305Open(SB), 0, $288-97
271	// For aligned stack access
272	MOVQ SP, BP
273	ADDQ $32, BP
274	ANDQ $-32, BP
275	MOVQ dst+0(FP), oup
276	MOVQ key+24(FP), keyp
277	MOVQ src+48(FP), inp
278	MOVQ src_len+56(FP), inl
279	MOVQ ad+72(FP), adp
280
281	// Check for AVX2 support
282	CMPB ·useAVX2(SB), $1
283	JE   chacha20Poly1305Open_AVX2
284
285	// Special optimization, for very short buffers
286	CMPQ inl, $128
287	JBE  openSSE128 // About 16% faster
288
289	// For long buffers, prepare the poly key first
290	MOVOU ·chacha20Constants<>(SB), A0
291	MOVOU (1*16)(keyp), B0
292	MOVOU (2*16)(keyp), C0
293	MOVOU (3*16)(keyp), D0
294	MOVO  D0, T1
295
296	// Store state on stack for future use
297	MOVO B0, state1Store
298	MOVO C0, state2Store
299	MOVO D0, ctr3Store
300	MOVQ $10, itr2
301
302openSSEPreparePolyKey:
303	chachaQR(A0, B0, C0, D0, T0)
304	shiftB0Left;  shiftC0Left; shiftD0Left
305	chachaQR(A0, B0, C0, D0, T0)
306	shiftB0Right; shiftC0Right; shiftD0Right
307	DECQ          itr2
308	JNE           openSSEPreparePolyKey
309
310	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
311	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
312
313	// Clamp and store the key
314	PAND ·polyClampMask<>(SB), A0
315	MOVO A0, rStore; MOVO B0, sStore
316
317	// Hash AAD
318	MOVQ ad_len+80(FP), itr2
319	CALL polyHashADInternal<>(SB)
320
321openSSEMainLoop:
322	CMPQ inl, $256
323	JB   openSSEMainLoopDone
324
325	// Load state, increment counter blocks
326	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
327	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
328	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
329	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
330
331	// Store counters
332	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
333
334	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
335	MOVQ $4, itr1
336	MOVQ inp, itr2
337
338openSSEInternalLoop:
339	MOVO          C3, tmpStore
340	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
341	MOVO          tmpStore, C3
342	MOVO          C1, tmpStore
343	chachaQR(A3, B3, C3, D3, C1)
344	MOVO          tmpStore, C1
345	polyAdd(0(itr2))
346	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
347	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
348	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
349	polyMulStage1
350	polyMulStage2
351	LEAQ          (2*8)(itr2), itr2
352	MOVO          C3, tmpStore
353	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
354	MOVO          tmpStore, C3
355	MOVO          C1, tmpStore
356	polyMulStage3
357	chachaQR(A3, B3, C3, D3, C1)
358	MOVO          tmpStore, C1
359	polyMulReduceStage
360	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
361	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
362	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
363	DECQ          itr1
364	JGE           openSSEInternalLoop
365
366	polyAdd(0(itr2))
367	polyMul
368	LEAQ (2*8)(itr2), itr2
369
370	CMPQ itr1, $-6
371	JG   openSSEInternalLoop
372
373	// Add in the state
374	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
375	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
376	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
377	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
378
379	// Load - xor - store
380	MOVO  D3, tmpStore
381	MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
382	MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
383	MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
384	MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
385	MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
386	MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
387	MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
388	MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
389	MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
390	MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
391	MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
392	MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
393	MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
394	MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
395	MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
396	MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
397	LEAQ  256(inp), inp
398	LEAQ  256(oup), oup
399	SUBQ  $256, inl
400	JMP   openSSEMainLoop
401
402openSSEMainLoopDone:
403	// Handle the various tail sizes efficiently
404	TESTQ inl, inl
405	JE    openSSEFinalize
406	CMPQ  inl, $64
407	JBE   openSSETail64
408	CMPQ  inl, $128
409	JBE   openSSETail128
410	CMPQ  inl, $192
411	JBE   openSSETail192
412	JMP   openSSETail256
413
414openSSEFinalize:
415	// Hash in the PT, AAD lengths
416	ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
417	polyMul
418
419	// Final reduce
420	MOVQ    acc0, t0
421	MOVQ    acc1, t1
422	MOVQ    acc2, t2
423	SUBQ    $-5, acc0
424	SBBQ    $-1, acc1
425	SBBQ    $3, acc2
426	CMOVQCS t0, acc0
427	CMOVQCS t1, acc1
428	CMOVQCS t2, acc2
429
430	// Add in the "s" part of the key
431	ADDQ 0+sStore, acc0
432	ADCQ 8+sStore, acc1
433
434	// Finally, constant time compare to the tag at the end of the message
435	XORQ    AX, AX
436	MOVQ    $1, DX
437	XORQ    (0*8)(inp), acc0
438	XORQ    (1*8)(inp), acc1
439	ORQ     acc1, acc0
440	CMOVQEQ DX, AX
441
442	// Return true iff tags are equal
443	MOVB AX, ret+96(FP)
444	RET
445
446// ----------------------------------------------------------------------------
447// Special optimization for buffers smaller than 129 bytes
448openSSE128:
449	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
450	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
451	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
452	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
453	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
454	MOVQ  $10, itr2
455
456openSSE128InnerCipherLoop:
457	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
458	shiftB0Left;  shiftB1Left; shiftB2Left
459	shiftC0Left;  shiftC1Left; shiftC2Left
460	shiftD0Left;  shiftD1Left; shiftD2Left
461	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
462	shiftB0Right; shiftB1Right; shiftB2Right
463	shiftC0Right; shiftC1Right; shiftC2Right
464	shiftD0Right; shiftD1Right; shiftD2Right
465	DECQ          itr2
466	JNE           openSSE128InnerCipherLoop
467
468	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
469	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
470	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
471	PADDL T2, C1; PADDL T2, C2
472	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
473
474	// Clamp and store the key
475	PAND  ·polyClampMask<>(SB), A0
476	MOVOU A0, rStore; MOVOU B0, sStore
477
478	// Hash
479	MOVQ ad_len+80(FP), itr2
480	CALL polyHashADInternal<>(SB)
481
482openSSE128Open:
483	CMPQ inl, $16
484	JB   openSSETail16
485	SUBQ $16, inl
486
487	// Load for hashing
488	polyAdd(0(inp))
489
490	// Load for decryption
491	MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
492	LEAQ  (1*16)(inp), inp
493	LEAQ  (1*16)(oup), oup
494	polyMul
495
496	// Shift the stream "left"
497	MOVO B1, A1
498	MOVO C1, B1
499	MOVO D1, C1
500	MOVO A2, D1
501	MOVO B2, A2
502	MOVO C2, B2
503	MOVO D2, C2
504	JMP  openSSE128Open
505
506openSSETail16:
507	TESTQ inl, inl
508	JE    openSSEFinalize
509
510	// We can safely load the CT from the end, because it is padded with the MAC
511	MOVQ   inl, itr2
512	SHLQ   $4, itr2
513	LEAQ   ·andMask<>(SB), t0
514	MOVOU  (inp), T0
515	ADDQ   inl, inp
516	PAND   -16(t0)(itr2*1), T0
517	MOVO   T0, 0+tmpStore
518	MOVQ   T0, t0
519	MOVQ   8+tmpStore, t1
520	PXOR   A1, T0
521
522	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
523openSSETail16Store:
524	MOVQ T0, t3
525	MOVB t3, (oup)
526	PSRLDQ $1, T0
527	INCQ   oup
528	DECQ   inl
529	JNE    openSSETail16Store
530	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
531	polyMul
532	JMP    openSSEFinalize
533
534// ----------------------------------------------------------------------------
535// Special optimization for the last 64 bytes of ciphertext
536openSSETail64:
537	// Need to decrypt up to 64 bytes - prepare single block
538	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
539	XORQ itr2, itr2
540	MOVQ inl, itr1
541	CMPQ itr1, $16
542	JB   openSSETail64LoopB
543
544openSSETail64LoopA:
545	// Perform ChaCha rounds, while hashing the remaining input
546	polyAdd(0(inp)(itr2*1))
547	polyMul
548	SUBQ $16, itr1
549
550openSSETail64LoopB:
551	ADDQ          $16, itr2
552	chachaQR(A0, B0, C0, D0, T0)
553	shiftB0Left;  shiftC0Left; shiftD0Left
554	chachaQR(A0, B0, C0, D0, T0)
555	shiftB0Right; shiftC0Right; shiftD0Right
556
557	CMPQ itr1, $16
558	JAE  openSSETail64LoopA
559
560	CMPQ itr2, $160
561	JNE  openSSETail64LoopB
562
563	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
564
565openSSETail64DecLoop:
566	CMPQ  inl, $16
567	JB    openSSETail64DecLoopDone
568	SUBQ  $16, inl
569	MOVOU (inp), T0
570	PXOR  T0, A0
571	MOVOU A0, (oup)
572	LEAQ  16(inp), inp
573	LEAQ  16(oup), oup
574	MOVO  B0, A0
575	MOVO  C0, B0
576	MOVO  D0, C0
577	JMP   openSSETail64DecLoop
578
579openSSETail64DecLoopDone:
580	MOVO A0, A1
581	JMP  openSSETail16
582
583// ----------------------------------------------------------------------------
584// Special optimization for the last 128 bytes of ciphertext
585openSSETail128:
586	// Need to decrypt up to 128 bytes - prepare two blocks
587	MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
588	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
589	XORQ itr2, itr2
590	MOVQ inl, itr1
591	ANDQ $-16, itr1
592
593openSSETail128LoopA:
594	// Perform ChaCha rounds, while hashing the remaining input
595	polyAdd(0(inp)(itr2*1))
596	polyMul
597
598openSSETail128LoopB:
599	ADDQ          $16, itr2
600	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
601	shiftB0Left;  shiftC0Left; shiftD0Left
602	shiftB1Left;  shiftC1Left; shiftD1Left
603	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
604	shiftB0Right; shiftC0Right; shiftD0Right
605	shiftB1Right; shiftC1Right; shiftD1Right
606
607	CMPQ itr2, itr1
608	JB   openSSETail128LoopA
609
610	CMPQ itr2, $160
611	JNE  openSSETail128LoopB
612
613	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
614	PADDL state1Store, B0; PADDL state1Store, B1
615	PADDL state2Store, C0; PADDL state2Store, C1
616	PADDL ctr1Store, D0; PADDL ctr0Store, D1
617
618	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
619	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
620	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
621
622	SUBQ $64, inl
623	LEAQ 64(inp), inp
624	LEAQ 64(oup), oup
625	JMP  openSSETail64DecLoop
626
627// ----------------------------------------------------------------------------
628// Special optimization for the last 192 bytes of ciphertext
629openSSETail192:
630	// Need to decrypt up to 192 bytes - prepare three blocks
631	MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
632	MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
633	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
634
635	MOVQ    inl, itr1
636	MOVQ    $160, itr2
637	CMPQ    itr1, $160
638	CMOVQGT itr2, itr1
639	ANDQ    $-16, itr1
640	XORQ    itr2, itr2
641
642openSSLTail192LoopA:
643	// Perform ChaCha rounds, while hashing the remaining input
644	polyAdd(0(inp)(itr2*1))
645	polyMul
646
647openSSLTail192LoopB:
648	ADDQ         $16, itr2
649	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
650	shiftB0Left; shiftC0Left; shiftD0Left
651	shiftB1Left; shiftC1Left; shiftD1Left
652	shiftB2Left; shiftC2Left; shiftD2Left
653
654	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
655	shiftB0Right; shiftC0Right; shiftD0Right
656	shiftB1Right; shiftC1Right; shiftD1Right
657	shiftB2Right; shiftC2Right; shiftD2Right
658
659	CMPQ itr2, itr1
660	JB   openSSLTail192LoopA
661
662	CMPQ itr2, $160
663	JNE  openSSLTail192LoopB
664
665	CMPQ inl, $176
666	JB   openSSLTail192Store
667
668	polyAdd(160(inp))
669	polyMul
670
671	CMPQ inl, $192
672	JB   openSSLTail192Store
673
674	polyAdd(176(inp))
675	polyMul
676
677openSSLTail192Store:
678	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
679	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
680	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
681	PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
682
683	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
684	PXOR  T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
685	MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
686
687	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
688	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
689	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
690
691	SUBQ $128, inl
692	LEAQ 128(inp), inp
693	LEAQ 128(oup), oup
694	JMP  openSSETail64DecLoop
695
696// ----------------------------------------------------------------------------
697// Special optimization for the last 256 bytes of ciphertext
698openSSETail256:
699	// Need to decrypt up to 256 bytes - prepare four blocks
700	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
701	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
702	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
703	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
704
705	// Store counters
706	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
707	XORQ itr2, itr2
708
709openSSETail256Loop:
710	// This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication
711	polyAdd(0(inp)(itr2*1))
712	MOVO          C3, tmpStore
713	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
714	MOVO          tmpStore, C3
715	MOVO          C1, tmpStore
716	chachaQR(A3, B3, C3, D3, C1)
717	MOVO          tmpStore, C1
718	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
719	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
720	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
721	polyMulStage1
722	polyMulStage2
723	MOVO          C3, tmpStore
724	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
725	MOVO          tmpStore, C3
726	MOVO          C1, tmpStore
727	chachaQR(A3, B3, C3, D3, C1)
728	MOVO          tmpStore, C1
729	polyMulStage3
730	polyMulReduceStage
731	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
732	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
733	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
734	ADDQ          $2*8, itr2
735	CMPQ          itr2, $160
736	JB            openSSETail256Loop
737	MOVQ          inl, itr1
738	ANDQ          $-16, itr1
739
740openSSETail256HashLoop:
741	polyAdd(0(inp)(itr2*1))
742	polyMul
743	ADDQ $2*8, itr2
744	CMPQ itr2, itr1
745	JB   openSSETail256HashLoop
746
747	// Add in the state
748	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
749	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
750	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
751	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
752	MOVO  D3, tmpStore
753
754	// Load - xor - store
755	MOVOU (0*16)(inp), D3; PXOR D3, A0
756	MOVOU (1*16)(inp), D3; PXOR D3, B0
757	MOVOU (2*16)(inp), D3; PXOR D3, C0
758	MOVOU (3*16)(inp), D3; PXOR D3, D0
759	MOVOU A0, (0*16)(oup)
760	MOVOU B0, (1*16)(oup)
761	MOVOU C0, (2*16)(oup)
762	MOVOU D0, (3*16)(oup)
763	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
764	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
765	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
766	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
767	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
768	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
769	LEAQ  192(inp), inp
770	LEAQ  192(oup), oup
771	SUBQ  $192, inl
772	MOVO  A3, A0
773	MOVO  B3, B0
774	MOVO  C3, C0
775	MOVO  tmpStore, D0
776
777	JMP openSSETail64DecLoop
778
779// ----------------------------------------------------------------------------
780// ------------------------- AVX2 Code ----------------------------------------
781chacha20Poly1305Open_AVX2:
782	VZEROUPPER
783	VMOVDQU ·chacha20Constants<>(SB), AA0
784	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
785	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
786	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
787	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
788
789	// Special optimization, for very short buffers
790	CMPQ inl, $192
791	JBE  openAVX2192
792	CMPQ inl, $320
793	JBE  openAVX2320
794
795	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
796	VMOVDQA BB0, state1StoreAVX2
797	VMOVDQA CC0, state2StoreAVX2
798	VMOVDQA DD0, ctr3StoreAVX2
799	MOVQ    $10, itr2
800
801openAVX2PreparePolyKey:
802	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
803	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
804	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
805	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
806	DECQ     itr2
807	JNE      openAVX2PreparePolyKey
808
809	VPADDD ·chacha20Constants<>(SB), AA0, AA0
810	VPADDD state1StoreAVX2, BB0, BB0
811	VPADDD state2StoreAVX2, CC0, CC0
812	VPADDD ctr3StoreAVX2, DD0, DD0
813
814	VPERM2I128 $0x02, AA0, BB0, TT0
815
816	// Clamp and store poly key
817	VPAND   ·polyClampMask<>(SB), TT0, TT0
818	VMOVDQA TT0, rsStoreAVX2
819
820	// Stream for the first 64 bytes
821	VPERM2I128 $0x13, AA0, BB0, AA0
822	VPERM2I128 $0x13, CC0, DD0, BB0
823
824	// Hash AD + first 64 bytes
825	MOVQ ad_len+80(FP), itr2
826	CALL polyHashADInternal<>(SB)
827	XORQ itr1, itr1
828
829openAVX2InitialHash64:
830	polyAdd(0(inp)(itr1*1))
831	polyMulAVX2
832	ADDQ $16, itr1
833	CMPQ itr1, $64
834	JNE  openAVX2InitialHash64
835
836	// Decrypt the first 64 bytes
837	VPXOR   (0*32)(inp), AA0, AA0
838	VPXOR   (1*32)(inp), BB0, BB0
839	VMOVDQU AA0, (0*32)(oup)
840	VMOVDQU BB0, (1*32)(oup)
841	LEAQ    (2*32)(inp), inp
842	LEAQ    (2*32)(oup), oup
843	SUBQ    $64, inl
844
845openAVX2MainLoop:
846	CMPQ inl, $512
847	JB   openAVX2MainLoopDone
848
849	// Load state, increment counter blocks, store the incremented counters
850	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
851	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
852	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
853	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
854	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
855	XORQ    itr1, itr1
856
857openAVX2InternalLoop:
858	// Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
859	// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
860	polyAdd(0*8(inp)(itr1*1))
861	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
862	polyMulStage1_AVX2
863	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
864	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
865	polyMulStage2_AVX2
866	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
867	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
868	polyMulStage3_AVX2
869	VMOVDQA  CC3, tmpStoreAVX2
870	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
871	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
872	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
873	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
874	VMOVDQA  tmpStoreAVX2, CC3
875	polyMulReduceStage
876	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
877	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
878	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
879	polyAdd(2*8(inp)(itr1*1))
880	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
881	polyMulStage1_AVX2
882	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
883	VMOVDQA  CC3, tmpStoreAVX2
884	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
885	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
886	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
887	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
888	VMOVDQA  tmpStoreAVX2, CC3
889	polyMulStage2_AVX2
890	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
891	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
892	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
893	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
894	polyMulStage3_AVX2
895	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
896	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
897	polyMulReduceStage
898	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
899	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
900	polyAdd(4*8(inp)(itr1*1))
901	LEAQ     (6*8)(itr1), itr1
902	VMOVDQA  CC3, tmpStoreAVX2
903	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
904	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
905	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
906	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
907	VMOVDQA  tmpStoreAVX2, CC3
908	polyMulStage1_AVX2
909	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
910	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
911	polyMulStage2_AVX2
912	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
913	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
914	polyMulStage3_AVX2
915	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
916	VMOVDQA  CC3, tmpStoreAVX2
917	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
918	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
919	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
920	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
921	VMOVDQA  tmpStoreAVX2, CC3
922	polyMulReduceStage
923	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
924	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
925	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
926	CMPQ     itr1, $480
927	JNE      openAVX2InternalLoop
928
929	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
930	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
931	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
932	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
933	VMOVDQA CC3, tmpStoreAVX2
934
935	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
936	polyAdd(480(inp))
937	polyMulAVX2
938	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
939	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
940	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
941	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
942	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
943	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
944
945	// and here
946	polyAdd(496(inp))
947	polyMulAVX2
948	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
949	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
950	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
951	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
952	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
953	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
954	LEAQ       (32*16)(inp), inp
955	LEAQ       (32*16)(oup), oup
956	SUBQ       $(32*16), inl
957	JMP        openAVX2MainLoop
958
959openAVX2MainLoopDone:
960	// Handle the various tail sizes efficiently
961	TESTQ inl, inl
962	JE    openSSEFinalize
963	CMPQ  inl, $128
964	JBE   openAVX2Tail128
965	CMPQ  inl, $256
966	JBE   openAVX2Tail256
967	CMPQ  inl, $384
968	JBE   openAVX2Tail384
969	JMP   openAVX2Tail512
970
971// ----------------------------------------------------------------------------
972// Special optimization for buffers smaller than 193 bytes
973openAVX2192:
974	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
975	VMOVDQA AA0, AA1
976	VMOVDQA BB0, BB1
977	VMOVDQA CC0, CC1
978	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
979	VMOVDQA AA0, AA2
980	VMOVDQA BB0, BB2
981	VMOVDQA CC0, CC2
982	VMOVDQA DD0, DD2
983	VMOVDQA DD1, TT3
984	MOVQ    $10, itr2
985
986openAVX2192InnerCipherLoop:
987	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
988	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
989	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
990	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
991	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
992	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
993	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
994	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
995	DECQ       itr2
996	JNE        openAVX2192InnerCipherLoop
997	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
998	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
999	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
1000	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
1001	VPERM2I128 $0x02, AA0, BB0, TT0
1002
1003	// Clamp and store poly key
1004	VPAND   ·polyClampMask<>(SB), TT0, TT0
1005	VMOVDQA TT0, rsStoreAVX2
1006
1007	// Stream for up to 192 bytes
1008	VPERM2I128 $0x13, AA0, BB0, AA0
1009	VPERM2I128 $0x13, CC0, DD0, BB0
1010	VPERM2I128 $0x02, AA1, BB1, CC0
1011	VPERM2I128 $0x02, CC1, DD1, DD0
1012	VPERM2I128 $0x13, AA1, BB1, AA1
1013	VPERM2I128 $0x13, CC1, DD1, BB1
1014
1015openAVX2ShortOpen:
1016	// Hash
1017	MOVQ ad_len+80(FP), itr2
1018	CALL polyHashADInternal<>(SB)
1019
1020openAVX2ShortOpenLoop:
1021	CMPQ inl, $32
1022	JB   openAVX2ShortTail32
1023	SUBQ $32, inl
1024
1025	// Load for hashing
1026	polyAdd(0*8(inp))
1027	polyMulAVX2
1028	polyAdd(2*8(inp))
1029	polyMulAVX2
1030
1031	// Load for decryption
1032	VPXOR   (inp), AA0, AA0
1033	VMOVDQU AA0, (oup)
1034	LEAQ    (1*32)(inp), inp
1035	LEAQ    (1*32)(oup), oup
1036
1037	// Shift stream left
1038	VMOVDQA BB0, AA0
1039	VMOVDQA CC0, BB0
1040	VMOVDQA DD0, CC0
1041	VMOVDQA AA1, DD0
1042	VMOVDQA BB1, AA1
1043	VMOVDQA CC1, BB1
1044	VMOVDQA DD1, CC1
1045	VMOVDQA AA2, DD1
1046	VMOVDQA BB2, AA2
1047	JMP     openAVX2ShortOpenLoop
1048
1049openAVX2ShortTail32:
1050	CMPQ    inl, $16
1051	VMOVDQA A0, A1
1052	JB      openAVX2ShortDone
1053
1054	SUBQ $16, inl
1055
1056	// Load for hashing
1057	polyAdd(0*8(inp))
1058	polyMulAVX2
1059
1060	// Load for decryption
1061	VPXOR      (inp), A0, T0
1062	VMOVDQU    T0, (oup)
1063	LEAQ       (1*16)(inp), inp
1064	LEAQ       (1*16)(oup), oup
1065	VPERM2I128 $0x11, AA0, AA0, AA0
1066	VMOVDQA    A0, A1
1067
1068openAVX2ShortDone:
1069	VZEROUPPER
1070	JMP openSSETail16
1071
1072// ----------------------------------------------------------------------------
1073// Special optimization for buffers smaller than 321 bytes
1074openAVX2320:
1075	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
1076	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
1077	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
1078	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
1079	MOVQ    $10, itr2
1080
1081openAVX2320InnerCipherLoop:
1082	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1083	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
1084	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1085	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
1086	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1087	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
1088	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1089	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
1090	DECQ     itr2
1091	JNE      openAVX2320InnerCipherLoop
1092
1093	VMOVDQA ·chacha20Constants<>(SB), TT0
1094	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
1095	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
1096	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
1097	VMOVDQA ·avx2IncMask<>(SB), TT0
1098	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
1099	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
1100	VPADDD  TT3, DD2, DD2
1101
1102	// Clamp and store poly key
1103	VPERM2I128 $0x02, AA0, BB0, TT0
1104	VPAND      ·polyClampMask<>(SB), TT0, TT0
1105	VMOVDQA    TT0, rsStoreAVX2
1106
1107	// Stream for up to 320 bytes
1108	VPERM2I128 $0x13, AA0, BB0, AA0
1109	VPERM2I128 $0x13, CC0, DD0, BB0
1110	VPERM2I128 $0x02, AA1, BB1, CC0
1111	VPERM2I128 $0x02, CC1, DD1, DD0
1112	VPERM2I128 $0x13, AA1, BB1, AA1
1113	VPERM2I128 $0x13, CC1, DD1, BB1
1114	VPERM2I128 $0x02, AA2, BB2, CC1
1115	VPERM2I128 $0x02, CC2, DD2, DD1
1116	VPERM2I128 $0x13, AA2, BB2, AA2
1117	VPERM2I128 $0x13, CC2, DD2, BB2
1118	JMP        openAVX2ShortOpen
1119
1120// ----------------------------------------------------------------------------
1121// Special optimization for the last 128 bytes of ciphertext
1122openAVX2Tail128:
1123	// Need to decrypt up to 128 bytes - prepare two blocks
1124	VMOVDQA ·chacha20Constants<>(SB), AA1
1125	VMOVDQA state1StoreAVX2, BB1
1126	VMOVDQA state2StoreAVX2, CC1
1127	VMOVDQA ctr3StoreAVX2, DD1
1128	VPADDD  ·avx2IncMask<>(SB), DD1, DD1
1129	VMOVDQA DD1, DD0
1130
1131	XORQ  itr2, itr2
1132	MOVQ  inl, itr1
1133	ANDQ  $-16, itr1
1134	TESTQ itr1, itr1
1135	JE    openAVX2Tail128LoopB
1136
1137openAVX2Tail128LoopA:
1138	// Perform ChaCha rounds, while hashing the remaining input
1139	polyAdd(0(inp)(itr2*1))
1140	polyMulAVX2
1141
1142openAVX2Tail128LoopB:
1143	ADDQ     $16, itr2
1144	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1145	VPALIGNR $4, BB1, BB1, BB1
1146	VPALIGNR $8, CC1, CC1, CC1
1147	VPALIGNR $12, DD1, DD1, DD1
1148	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1149	VPALIGNR $12, BB1, BB1, BB1
1150	VPALIGNR $8, CC1, CC1, CC1
1151	VPALIGNR $4, DD1, DD1, DD1
1152	CMPQ     itr2, itr1
1153	JB       openAVX2Tail128LoopA
1154	CMPQ     itr2, $160
1155	JNE      openAVX2Tail128LoopB
1156
1157	VPADDD     ·chacha20Constants<>(SB), AA1, AA1
1158	VPADDD     state1StoreAVX2, BB1, BB1
1159	VPADDD     state2StoreAVX2, CC1, CC1
1160	VPADDD     DD0, DD1, DD1
1161	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1162
1163openAVX2TailLoop:
1164	CMPQ inl, $32
1165	JB   openAVX2Tail
1166	SUBQ $32, inl
1167
1168	// Load for decryption
1169	VPXOR   (inp), AA0, AA0
1170	VMOVDQU AA0, (oup)
1171	LEAQ    (1*32)(inp), inp
1172	LEAQ    (1*32)(oup), oup
1173	VMOVDQA BB0, AA0
1174	VMOVDQA CC0, BB0
1175	VMOVDQA DD0, CC0
1176	JMP     openAVX2TailLoop
1177
1178openAVX2Tail:
1179	CMPQ    inl, $16
1180	VMOVDQA A0, A1
1181	JB      openAVX2TailDone
1182	SUBQ    $16, inl
1183
1184	// Load for decryption
1185	VPXOR      (inp), A0, T0
1186	VMOVDQU    T0, (oup)
1187	LEAQ       (1*16)(inp), inp
1188	LEAQ       (1*16)(oup), oup
1189	VPERM2I128 $0x11, AA0, AA0, AA0
1190	VMOVDQA    A0, A1
1191
1192openAVX2TailDone:
1193	VZEROUPPER
1194	JMP openSSETail16
1195
1196// ----------------------------------------------------------------------------
1197// Special optimization for the last 256 bytes of ciphertext
1198openAVX2Tail256:
1199	// Need to decrypt up to 256 bytes - prepare four blocks
1200	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
1201	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
1202	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
1203	VMOVDQA ctr3StoreAVX2, DD0
1204	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
1205	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
1206	VMOVDQA DD0, TT1
1207	VMOVDQA DD1, TT2
1208
1209	// Compute the number of iterations that will hash data
1210	MOVQ    inl, tmpStoreAVX2
1211	MOVQ    inl, itr1
1212	SUBQ    $128, itr1
1213	SHRQ    $4, itr1
1214	MOVQ    $10, itr2
1215	CMPQ    itr1, $10
1216	CMOVQGT itr2, itr1
1217	MOVQ    inp, inl
1218	XORQ    itr2, itr2
1219
1220openAVX2Tail256LoopA:
1221	polyAdd(0(inl))
1222	polyMulAVX2
1223	LEAQ 16(inl), inl
1224
1225	// Perform ChaCha rounds, while hashing the remaining input
1226openAVX2Tail256LoopB:
1227	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1228	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
1229	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1230	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
1231	INCQ     itr2
1232	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1233	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
1234	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1235	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
1236	CMPQ     itr2, itr1
1237	JB       openAVX2Tail256LoopA
1238
1239	CMPQ itr2, $10
1240	JNE  openAVX2Tail256LoopB
1241
1242	MOVQ inl, itr2
1243	SUBQ inp, inl
1244	MOVQ inl, itr1
1245	MOVQ tmpStoreAVX2, inl
1246
1247	// Hash the remainder of data (if any)
1248openAVX2Tail256Hash:
1249	ADDQ $16, itr1
1250	CMPQ itr1, inl
1251	JGT  openAVX2Tail256HashEnd
1252	polyAdd (0(itr2))
1253	polyMulAVX2
1254	LEAQ 16(itr2), itr2
1255	JMP  openAVX2Tail256Hash
1256
1257// Store 128 bytes safely, then go to store loop
1258openAVX2Tail256HashEnd:
1259	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
1260	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
1261	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
1262	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
1263	VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
1264	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1265
1266	VPXOR   (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
1267	VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
1268	LEAQ    (4*32)(inp), inp
1269	LEAQ    (4*32)(oup), oup
1270	SUBQ    $4*32, inl
1271
1272	JMP openAVX2TailLoop
1273
1274// ----------------------------------------------------------------------------
1275// Special optimization for the last 384 bytes of ciphertext
1276openAVX2Tail384:
1277	// Need to decrypt up to 384 bytes - prepare six blocks
1278	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
1279	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
1280	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
1281	VMOVDQA ctr3StoreAVX2, DD0
1282	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
1283	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
1284	VPADDD  ·avx2IncMask<>(SB), DD1, DD2
1285	VMOVDQA DD0, ctr0StoreAVX2
1286	VMOVDQA DD1, ctr1StoreAVX2
1287	VMOVDQA DD2, ctr2StoreAVX2
1288
1289	// Compute the number of iterations that will hash two blocks of data
1290	MOVQ    inl, tmpStoreAVX2
1291	MOVQ    inl, itr1
1292	SUBQ    $256, itr1
1293	SHRQ    $4, itr1
1294	ADDQ    $6, itr1
1295	MOVQ    $10, itr2
1296	CMPQ    itr1, $10
1297	CMOVQGT itr2, itr1
1298	MOVQ    inp, inl
1299	XORQ    itr2, itr2
1300
1301	// Perform ChaCha rounds, while hashing the remaining input
1302openAVX2Tail384LoopB:
1303	polyAdd(0(inl))
1304	polyMulAVX2
1305	LEAQ 16(inl), inl
1306
1307openAVX2Tail384LoopA:
1308	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1309	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
1310	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1311	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
1312	polyAdd(0(inl))
1313	polyMulAVX2
1314	LEAQ     16(inl), inl
1315	INCQ     itr2
1316	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1317	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
1318	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1319	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
1320
1321	CMPQ itr2, itr1
1322	JB   openAVX2Tail384LoopB
1323
1324	CMPQ itr2, $10
1325	JNE  openAVX2Tail384LoopA
1326
1327	MOVQ inl, itr2
1328	SUBQ inp, inl
1329	MOVQ inl, itr1
1330	MOVQ tmpStoreAVX2, inl
1331
1332openAVX2Tail384Hash:
1333	ADDQ $16, itr1
1334	CMPQ itr1, inl
1335	JGT  openAVX2Tail384HashEnd
1336	polyAdd(0(itr2))
1337	polyMulAVX2
1338	LEAQ 16(itr2), itr2
1339	JMP  openAVX2Tail384Hash
1340
1341// Store 256 bytes safely, then go to store loop
1342openAVX2Tail384HashEnd:
1343	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
1344	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
1345	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
1346	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
1347	VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
1348	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
1349	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
1350	VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
1351	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
1352	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
1353	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
1354	LEAQ       (8*32)(inp), inp
1355	LEAQ       (8*32)(oup), oup
1356	SUBQ       $8*32, inl
1357	JMP        openAVX2TailLoop
1358
1359// ----------------------------------------------------------------------------
1360// Special optimization for the last 512 bytes of ciphertext
1361openAVX2Tail512:
1362	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
1363	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
1364	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
1365	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
1366	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
1367	XORQ    itr1, itr1
1368	MOVQ    inp, itr2
1369
1370openAVX2Tail512LoopB:
1371	polyAdd(0(itr2))
1372	polyMulAVX2
1373	LEAQ (2*8)(itr2), itr2
1374
1375openAVX2Tail512LoopA:
1376	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1377	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1378	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
1379	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1380	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1381	VMOVDQA  CC3, tmpStoreAVX2
1382	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
1383	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
1384	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
1385	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
1386	VMOVDQA  tmpStoreAVX2, CC3
1387	polyAdd(0*8(itr2))
1388	polyMulAVX2
1389	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1390	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1391	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
1392	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1393	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1394	VMOVDQA  CC3, tmpStoreAVX2
1395	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
1396	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
1397	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
1398	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
1399	VMOVDQA  tmpStoreAVX2, CC3
1400	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
1401	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
1402	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
1403	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1404	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1405	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
1406	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1407	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1408	polyAdd(2*8(itr2))
1409	polyMulAVX2
1410	LEAQ     (4*8)(itr2), itr2
1411	VMOVDQA  CC3, tmpStoreAVX2
1412	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
1413	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
1414	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
1415	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
1416	VMOVDQA  tmpStoreAVX2, CC3
1417	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1418	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1419	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
1420	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1421	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1422	VMOVDQA  CC3, tmpStoreAVX2
1423	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
1424	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
1425	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
1426	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
1427	VMOVDQA  tmpStoreAVX2, CC3
1428	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
1429	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
1430	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
1431	INCQ     itr1
1432	CMPQ     itr1, $4
1433	JLT      openAVX2Tail512LoopB
1434
1435	CMPQ itr1, $10
1436	JNE  openAVX2Tail512LoopA
1437
1438	MOVQ inl, itr1
1439	SUBQ $384, itr1
1440	ANDQ $-16, itr1
1441
1442openAVX2Tail512HashLoop:
1443	TESTQ itr1, itr1
1444	JE    openAVX2Tail512HashEnd
1445	polyAdd(0(itr2))
1446	polyMulAVX2
1447	LEAQ  16(itr2), itr2
1448	SUBQ  $16, itr1
1449	JMP   openAVX2Tail512HashLoop
1450
1451openAVX2Tail512HashEnd:
1452	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
1453	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
1454	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
1455	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
1456	VMOVDQA    CC3, tmpStoreAVX2
1457	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
1458	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
1459	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
1460	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1461	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
1462	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
1463	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
1464	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
1465	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
1466	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
1467
1468	LEAQ (12*32)(inp), inp
1469	LEAQ (12*32)(oup), oup
1470	SUBQ $12*32, inl
1471
1472	JMP openAVX2TailLoop
1473
1474// ----------------------------------------------------------------------------
1475// ----------------------------------------------------------------------------
1476// func chacha20Poly1305Seal(dst, key, src, ad []byte)
1477TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
1478	// For aligned stack access
1479	MOVQ SP, BP
1480	ADDQ $32, BP
1481	ANDQ $-32, BP
1482	MOVQ dst+0(FP), oup
1483	MOVQ key+24(FP), keyp
1484	MOVQ src+48(FP), inp
1485	MOVQ src_len+56(FP), inl
1486	MOVQ ad+72(FP), adp
1487
1488	CMPB ·useAVX2(SB), $1
1489	JE   chacha20Poly1305Seal_AVX2
1490
1491	// Special optimization, for very short buffers
1492	CMPQ inl, $128
1493	JBE  sealSSE128 // About 15% faster
1494
1495	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
1496	MOVOU ·chacha20Constants<>(SB), A0
1497	MOVOU (1*16)(keyp), B0
1498	MOVOU (2*16)(keyp), C0
1499	MOVOU (3*16)(keyp), D0
1500
1501	// Store state on stack for future use
1502	MOVO B0, state1Store
1503	MOVO C0, state2Store
1504
1505	// Load state, increment counter blocks
1506	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1507	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1508	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
1509
1510	// Store counters
1511	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
1512	MOVQ $10, itr2
1513
1514sealSSEIntroLoop:
1515	MOVO         C3, tmpStore
1516	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1517	MOVO         tmpStore, C3
1518	MOVO         C1, tmpStore
1519	chachaQR(A3, B3, C3, D3, C1)
1520	MOVO         tmpStore, C1
1521	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
1522	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
1523	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
1524
1525	MOVO          C3, tmpStore
1526	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1527	MOVO          tmpStore, C3
1528	MOVO          C1, tmpStore
1529	chachaQR(A3, B3, C3, D3, C1)
1530	MOVO          tmpStore, C1
1531	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
1532	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
1533	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
1534	DECQ          itr2
1535	JNE           sealSSEIntroLoop
1536
1537	// Add in the state
1538	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
1539	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
1540	PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
1541	PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
1542
1543	// Clamp and store the key
1544	PAND ·polyClampMask<>(SB), A0
1545	MOVO A0, rStore
1546	MOVO B0, sStore
1547
1548	// Hash AAD
1549	MOVQ ad_len+80(FP), itr2
1550	CALL polyHashADInternal<>(SB)
1551
1552	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1553	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
1554	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
1555	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
1556	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
1557	MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
1558
1559	MOVQ $128, itr1
1560	SUBQ $128, inl
1561	LEAQ 128(inp), inp
1562
1563	MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
1564
1565	CMPQ inl, $64
1566	JBE  sealSSE128SealHash
1567
1568	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1569	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
1570	MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
1571
1572	ADDQ $64, itr1
1573	SUBQ $64, inl
1574	LEAQ 64(inp), inp
1575
1576	MOVQ $2, itr1
1577	MOVQ $8, itr2
1578
1579	CMPQ inl, $64
1580	JBE  sealSSETail64
1581	CMPQ inl, $128
1582	JBE  sealSSETail128
1583	CMPQ inl, $192
1584	JBE  sealSSETail192
1585
1586sealSSEMainLoop:
1587	// Load state, increment counter blocks
1588	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
1589	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1590	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1591	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
1592
1593	// Store counters
1594	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
1595
1596sealSSEInnerLoop:
1597	MOVO          C3, tmpStore
1598	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1599	MOVO          tmpStore, C3
1600	MOVO          C1, tmpStore
1601	chachaQR(A3, B3, C3, D3, C1)
1602	MOVO          tmpStore, C1
1603	polyAdd(0(oup))
1604	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
1605	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
1606	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
1607	polyMulStage1
1608	polyMulStage2
1609	LEAQ          (2*8)(oup), oup
1610	MOVO          C3, tmpStore
1611	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1612	MOVO          tmpStore, C3
1613	MOVO          C1, tmpStore
1614	polyMulStage3
1615	chachaQR(A3, B3, C3, D3, C1)
1616	MOVO          tmpStore, C1
1617	polyMulReduceStage
1618	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
1619	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
1620	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
1621	DECQ          itr2
1622	JGE           sealSSEInnerLoop
1623	polyAdd(0(oup))
1624	polyMul
1625	LEAQ          (2*8)(oup), oup
1626	DECQ          itr1
1627	JG            sealSSEInnerLoop
1628
1629	// Add in the state
1630	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
1631	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
1632	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
1633	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
1634	MOVO  D3, tmpStore
1635
1636	// Load - xor - store
1637	MOVOU (0*16)(inp), D3; PXOR D3, A0
1638	MOVOU (1*16)(inp), D3; PXOR D3, B0
1639	MOVOU (2*16)(inp), D3; PXOR D3, C0
1640	MOVOU (3*16)(inp), D3; PXOR D3, D0
1641	MOVOU A0, (0*16)(oup)
1642	MOVOU B0, (1*16)(oup)
1643	MOVOU C0, (2*16)(oup)
1644	MOVOU D0, (3*16)(oup)
1645	MOVO  tmpStore, D3
1646
1647	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
1648	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
1649	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
1650	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
1651	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
1652	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
1653	ADDQ  $192, inp
1654	MOVQ  $192, itr1
1655	SUBQ  $192, inl
1656	MOVO  A3, A1
1657	MOVO  B3, B1
1658	MOVO  C3, C1
1659	MOVO  D3, D1
1660	CMPQ  inl, $64
1661	JBE   sealSSE128SealHash
1662	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1663	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
1664	MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
1665	LEAQ  64(inp), inp
1666	SUBQ  $64, inl
1667	MOVQ  $6, itr1
1668	MOVQ  $4, itr2
1669	CMPQ  inl, $192
1670	JG    sealSSEMainLoop
1671
1672	MOVQ  inl, itr1
1673	TESTQ inl, inl
1674	JE    sealSSE128SealHash
1675	MOVQ  $6, itr1
1676	CMPQ  inl, $64
1677	JBE   sealSSETail64
1678	CMPQ  inl, $128
1679	JBE   sealSSETail128
1680	JMP   sealSSETail192
1681
1682// ----------------------------------------------------------------------------
1683// Special optimization for the last 64 bytes of plaintext
1684sealSSETail64:
1685	// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
1686	MOVO  ·chacha20Constants<>(SB), A1
1687	MOVO  state1Store, B1
1688	MOVO  state2Store, C1
1689	MOVO  ctr3Store, D1
1690	PADDL ·sseIncMask<>(SB), D1
1691	MOVO  D1, ctr0Store
1692
1693sealSSETail64LoopA:
1694	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1695	polyAdd(0(oup))
1696	polyMul
1697	LEAQ 16(oup), oup
1698
1699sealSSETail64LoopB:
1700	chachaQR(A1, B1, C1, D1, T1)
1701	shiftB1Left;  shiftC1Left; shiftD1Left
1702	chachaQR(A1, B1, C1, D1, T1)
1703	shiftB1Right; shiftC1Right; shiftD1Right
1704	polyAdd(0(oup))
1705	polyMul
1706	LEAQ          16(oup), oup
1707
1708	DECQ itr1
1709	JG   sealSSETail64LoopA
1710
1711	DECQ  itr2
1712	JGE   sealSSETail64LoopB
1713	PADDL ·chacha20Constants<>(SB), A1
1714	PADDL state1Store, B1
1715	PADDL state2Store, C1
1716	PADDL ctr0Store, D1
1717
1718	JMP sealSSE128Seal
1719
1720// ----------------------------------------------------------------------------
1721// Special optimization for the last 128 bytes of plaintext
1722sealSSETail128:
1723	// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
1724	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
1725	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
1726
1727sealSSETail128LoopA:
1728	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1729	polyAdd(0(oup))
1730	polyMul
1731	LEAQ 16(oup), oup
1732
1733sealSSETail128LoopB:
1734	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
1735	shiftB0Left;  shiftC0Left; shiftD0Left
1736	shiftB1Left;  shiftC1Left; shiftD1Left
1737	polyAdd(0(oup))
1738	polyMul
1739	LEAQ          16(oup), oup
1740	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
1741	shiftB0Right; shiftC0Right; shiftD0Right
1742	shiftB1Right; shiftC1Right; shiftD1Right
1743
1744	DECQ itr1
1745	JG   sealSSETail128LoopA
1746
1747	DECQ itr2
1748	JGE  sealSSETail128LoopB
1749
1750	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
1751	PADDL state1Store, B0; PADDL state1Store, B1
1752	PADDL state2Store, C0; PADDL state2Store, C1
1753	PADDL ctr0Store, D0; PADDL ctr1Store, D1
1754
1755	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
1756	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
1757	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
1758
1759	MOVQ $64, itr1
1760	LEAQ 64(inp), inp
1761	SUBQ $64, inl
1762
1763	JMP sealSSE128SealHash
1764
1765// ----------------------------------------------------------------------------
1766// Special optimization for the last 192 bytes of plaintext
1767sealSSETail192:
1768	// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
1769	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
1770	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
1771	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
1772
1773sealSSETail192LoopA:
1774	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1775	polyAdd(0(oup))
1776	polyMul
1777	LEAQ 16(oup), oup
1778
1779sealSSETail192LoopB:
1780	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1781	shiftB0Left; shiftC0Left; shiftD0Left
1782	shiftB1Left; shiftC1Left; shiftD1Left
1783	shiftB2Left; shiftC2Left; shiftD2Left
1784
1785	polyAdd(0(oup))
1786	polyMul
1787	LEAQ 16(oup), oup
1788
1789	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1790	shiftB0Right; shiftC0Right; shiftD0Right
1791	shiftB1Right; shiftC1Right; shiftD1Right
1792	shiftB2Right; shiftC2Right; shiftD2Right
1793
1794	DECQ itr1
1795	JG   sealSSETail192LoopA
1796
1797	DECQ itr2
1798	JGE  sealSSETail192LoopB
1799
1800	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
1801	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
1802	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
1803	PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
1804
1805	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
1806	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
1807	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
1808	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
1809	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
1810	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
1811
1812	MOVO A2, A1
1813	MOVO B2, B1
1814	MOVO C2, C1
1815	MOVO D2, D1
1816	MOVQ $128, itr1
1817	LEAQ 128(inp), inp
1818	SUBQ $128, inl
1819
1820	JMP sealSSE128SealHash
1821
1822// ----------------------------------------------------------------------------
1823// Special seal optimization for buffers smaller than 129 bytes
1824sealSSE128:
1825	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
1826	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
1827	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1828	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1829	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
1830	MOVQ  $10, itr2
1831
1832sealSSE128InnerCipherLoop:
1833	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1834	shiftB0Left;  shiftB1Left; shiftB2Left
1835	shiftC0Left;  shiftC1Left; shiftC2Left
1836	shiftD0Left;  shiftD1Left; shiftD2Left
1837	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1838	shiftB0Right; shiftB1Right; shiftB2Right
1839	shiftC0Right; shiftC1Right; shiftC2Right
1840	shiftD0Right; shiftD1Right; shiftD2Right
1841	DECQ          itr2
1842	JNE           sealSSE128InnerCipherLoop
1843
1844	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
1845	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
1846	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
1847	PADDL T2, C1; PADDL T2, C2
1848	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
1849	PAND  ·polyClampMask<>(SB), A0
1850	MOVOU A0, rStore
1851	MOVOU B0, sStore
1852
1853	// Hash
1854	MOVQ ad_len+80(FP), itr2
1855	CALL polyHashADInternal<>(SB)
1856	XORQ itr1, itr1
1857
1858sealSSE128SealHash:
1859	// itr1 holds the number of bytes encrypted but not yet hashed
1860	CMPQ itr1, $16
1861	JB   sealSSE128Seal
1862	polyAdd(0(oup))
1863	polyMul
1864
1865	SUBQ $16, itr1
1866	ADDQ $16, oup
1867
1868	JMP sealSSE128SealHash
1869
1870sealSSE128Seal:
1871	CMPQ inl, $16
1872	JB   sealSSETail
1873	SUBQ $16, inl
1874
1875	// Load for decryption
1876	MOVOU (inp), T0
1877	PXOR  T0, A1
1878	MOVOU A1, (oup)
1879	LEAQ  (1*16)(inp), inp
1880	LEAQ  (1*16)(oup), oup
1881
1882	// Extract for hashing
1883	MOVQ   A1, t0
1884	PSRLDQ $8, A1
1885	MOVQ A1, t1
1886	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
1887	polyMul
1888
1889	// Shift the stream "left"
1890	MOVO B1, A1
1891	MOVO C1, B1
1892	MOVO D1, C1
1893	MOVO A2, D1
1894	MOVO B2, A2
1895	MOVO C2, B2
1896	MOVO D2, C2
1897	JMP  sealSSE128Seal
1898
1899sealSSETail:
1900	TESTQ inl, inl
1901	JE    sealSSEFinalize
1902
1903	// We can only load the PT one byte at a time to avoid read after end of buffer
1904	MOVQ inl, itr2
1905	SHLQ $4, itr2
1906	LEAQ ·andMask<>(SB), t0
1907	MOVQ inl, itr1
1908	LEAQ -1(inp)(inl*1), inp
1909	XORQ t2, t2
1910	XORQ t3, t3
1911	XORQ AX, AX
1912
1913sealSSETailLoadLoop:
1914	SHLQ $8, t2, t3
1915	SHLQ $8, t2
1916	MOVB (inp), AX
1917	XORQ AX, t2
1918	LEAQ   -1(inp), inp
1919	DECQ   itr1
1920	JNE    sealSSETailLoadLoop
1921	MOVQ t2, 0+tmpStore
1922	MOVQ t3, 8+tmpStore
1923	PXOR 0+tmpStore, A1
1924	MOVOU  A1, (oup)
1925	MOVOU  -16(t0)(itr2*1), T0
1926	PAND   T0, A1
1927	MOVQ   A1, t0
1928	PSRLDQ $8, A1
1929	MOVQ   A1, t1
1930	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
1931	polyMul
1932
1933	ADDQ inl, oup
1934
1935sealSSEFinalize:
1936	// Hash in the buffer lengths
1937	ADDQ ad_len+80(FP), acc0
1938	ADCQ src_len+56(FP), acc1
1939	ADCQ $1, acc2
1940	polyMul
1941
1942	// Final reduce
1943	MOVQ    acc0, t0
1944	MOVQ    acc1, t1
1945	MOVQ    acc2, t2
1946	SUBQ    $-5, acc0
1947	SBBQ    $-1, acc1
1948	SBBQ    $3, acc2
1949	CMOVQCS t0, acc0
1950	CMOVQCS t1, acc1
1951	CMOVQCS t2, acc2
1952
1953	// Add in the "s" part of the key
1954	ADDQ 0+sStore, acc0
1955	ADCQ 8+sStore, acc1
1956
1957	// Finally store the tag at the end of the message
1958	MOVQ acc0, (0*8)(oup)
1959	MOVQ acc1, (1*8)(oup)
1960	RET
1961
1962// ----------------------------------------------------------------------------
1963// ------------------------- AVX2 Code ----------------------------------------
1964chacha20Poly1305Seal_AVX2:
1965	VZEROUPPER
1966	VMOVDQU ·chacha20Constants<>(SB), AA0
1967	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
1968	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
1969	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
1970	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
1971
1972	// Special optimizations, for very short buffers
1973	CMPQ inl, $192
1974	JBE  seal192AVX2 // 33% faster
1975	CMPQ inl, $320
1976	JBE  seal320AVX2 // 17% faster
1977
1978	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
1979	VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
1980	VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
1981	VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
1982	VPADDD  ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
1983	VPADDD  ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
1984	VPADDD  ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
1985	VMOVDQA DD3, ctr3StoreAVX2
1986	MOVQ    $10, itr2
1987
1988sealAVX2IntroLoop:
1989	VMOVDQA CC3, tmpStoreAVX2
1990	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
1991	VMOVDQA tmpStoreAVX2, CC3
1992	VMOVDQA CC1, tmpStoreAVX2
1993	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
1994	VMOVDQA tmpStoreAVX2, CC1
1995
1996	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
1997	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
1998	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
1999	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
2000
2001	VMOVDQA CC3, tmpStoreAVX2
2002	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2003	VMOVDQA tmpStoreAVX2, CC3
2004	VMOVDQA CC1, tmpStoreAVX2
2005	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2006	VMOVDQA tmpStoreAVX2, CC1
2007
2008	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
2009	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
2010	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
2011	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
2012	DECQ     itr2
2013	JNE      sealAVX2IntroLoop
2014
2015	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2016	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2017	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2018	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2019
2020	VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
2021	VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
2022	VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
2023
2024	// Clamp and store poly key
2025	VPAND   ·polyClampMask<>(SB), DD0, DD0
2026	VMOVDQA DD0, rsStoreAVX2
2027
2028	// Hash AD
2029	MOVQ ad_len+80(FP), itr2
2030	CALL polyHashADInternal<>(SB)
2031
2032	// Can store at least 320 bytes
2033	VPXOR   (0*32)(inp), AA0, AA0
2034	VPXOR   (1*32)(inp), CC0, CC0
2035	VMOVDQU AA0, (0*32)(oup)
2036	VMOVDQU CC0, (1*32)(oup)
2037
2038	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
2039	VPXOR      (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
2040	VMOVDQU    AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
2041	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
2042	VPXOR      (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
2043	VMOVDQU    AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
2044
2045	MOVQ $320, itr1
2046	SUBQ $320, inl
2047	LEAQ 320(inp), inp
2048
2049	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
2050	CMPQ       inl, $128
2051	JBE        sealAVX2SealHash
2052
2053	VPXOR   (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
2054	VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
2055	SUBQ    $128, inl
2056	LEAQ    128(inp), inp
2057
2058	MOVQ $8, itr1
2059	MOVQ $2, itr2
2060
2061	CMPQ inl, $128
2062	JBE  sealAVX2Tail128
2063	CMPQ inl, $256
2064	JBE  sealAVX2Tail256
2065	CMPQ inl, $384
2066	JBE  sealAVX2Tail384
2067	CMPQ inl, $512
2068	JBE  sealAVX2Tail512
2069
2070	// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
2071	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2072	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2073	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2074	VMOVDQA ctr3StoreAVX2, DD0
2075	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2076	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2077
2078	VMOVDQA CC3, tmpStoreAVX2
2079	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2080	VMOVDQA tmpStoreAVX2, CC3
2081	VMOVDQA CC1, tmpStoreAVX2
2082	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2083	VMOVDQA tmpStoreAVX2, CC1
2084
2085	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
2086	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
2087	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
2088	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
2089
2090	VMOVDQA CC3, tmpStoreAVX2
2091	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2092	VMOVDQA tmpStoreAVX2, CC3
2093	VMOVDQA CC1, tmpStoreAVX2
2094	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2095	VMOVDQA tmpStoreAVX2, CC1
2096
2097	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
2098	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
2099	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
2100	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
2101	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2102	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2103	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2104	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2105	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2106	VMOVDQA  CC3, tmpStoreAVX2
2107	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2108	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2109	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2110	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2111	VMOVDQA  tmpStoreAVX2, CC3
2112
2113	SUBQ $16, oup                  // Adjust the pointer
2114	MOVQ $9, itr1
2115	JMP  sealAVX2InternalLoopStart
2116
2117sealAVX2MainLoop:
2118	// Load state, increment counter blocks, store the incremented counters
2119	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2120	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2121	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2122	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2123	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2124	MOVQ    $10, itr1
2125
2126sealAVX2InternalLoop:
2127	polyAdd(0*8(oup))
2128	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2129	polyMulStage1_AVX2
2130	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2131	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2132	polyMulStage2_AVX2
2133	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2134	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2135	polyMulStage3_AVX2
2136	VMOVDQA CC3, tmpStoreAVX2
2137	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2138	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2139	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2140	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2141	VMOVDQA tmpStoreAVX2, CC3
2142	polyMulReduceStage
2143
2144sealAVX2InternalLoopStart:
2145	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2146	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2147	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2148	polyAdd(2*8(oup))
2149	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2150	polyMulStage1_AVX2
2151	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2152	VMOVDQA  CC3, tmpStoreAVX2
2153	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2154	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2155	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2156	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2157	VMOVDQA  tmpStoreAVX2, CC3
2158	polyMulStage2_AVX2
2159	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
2160	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2161	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
2162	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2163	polyMulStage3_AVX2
2164	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2165	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2166	polyMulReduceStage
2167	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2168	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2169	polyAdd(4*8(oup))
2170	LEAQ     (6*8)(oup), oup
2171	VMOVDQA  CC3, tmpStoreAVX2
2172	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2173	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2174	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2175	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2176	VMOVDQA  tmpStoreAVX2, CC3
2177	polyMulStage1_AVX2
2178	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2179	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2180	polyMulStage2_AVX2
2181	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2182	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2183	polyMulStage3_AVX2
2184	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2185	VMOVDQA  CC3, tmpStoreAVX2
2186	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2187	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2188	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2189	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2190	VMOVDQA  tmpStoreAVX2, CC3
2191	polyMulReduceStage
2192	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
2193	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2194	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
2195	DECQ     itr1
2196	JNE      sealAVX2InternalLoop
2197
2198	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2199	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2200	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2201	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2202	VMOVDQA CC3, tmpStoreAVX2
2203
2204	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
2205	polyAdd(0*8(oup))
2206	polyMulAVX2
2207	LEAQ       (4*8)(oup), oup
2208	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
2209	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
2210	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
2211	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
2212	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
2213	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
2214
2215	// and here
2216	polyAdd(-2*8(oup))
2217	polyMulAVX2
2218	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
2219	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
2220	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
2221	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
2222	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
2223	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
2224	LEAQ       (32*16)(inp), inp
2225	SUBQ       $(32*16), inl
2226	CMPQ       inl, $512
2227	JG         sealAVX2MainLoop
2228
2229	// Tail can only hash 480 bytes
2230	polyAdd(0*8(oup))
2231	polyMulAVX2
2232	polyAdd(2*8(oup))
2233	polyMulAVX2
2234	LEAQ 32(oup), oup
2235
2236	MOVQ $10, itr1
2237	MOVQ $0, itr2
2238	CMPQ inl, $128
2239	JBE  sealAVX2Tail128
2240	CMPQ inl, $256
2241	JBE  sealAVX2Tail256
2242	CMPQ inl, $384
2243	JBE  sealAVX2Tail384
2244	JMP  sealAVX2Tail512
2245
2246// ----------------------------------------------------------------------------
2247// Special optimization for buffers smaller than 193 bytes
2248seal192AVX2:
2249	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
2250	VMOVDQA AA0, AA1
2251	VMOVDQA BB0, BB1
2252	VMOVDQA CC0, CC1
2253	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
2254	VMOVDQA AA0, AA2
2255	VMOVDQA BB0, BB2
2256	VMOVDQA CC0, CC2
2257	VMOVDQA DD0, DD2
2258	VMOVDQA DD1, TT3
2259	MOVQ    $10, itr2
2260
2261sealAVX2192InnerCipherLoop:
2262	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2263	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
2264	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2265	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
2266	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2267	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
2268	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2269	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
2270	DECQ       itr2
2271	JNE        sealAVX2192InnerCipherLoop
2272	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
2273	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
2274	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
2275	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
2276	VPERM2I128 $0x02, AA0, BB0, TT0
2277
2278	// Clamp and store poly key
2279	VPAND   ·polyClampMask<>(SB), TT0, TT0
2280	VMOVDQA TT0, rsStoreAVX2
2281
2282	// Stream for up to 192 bytes
2283	VPERM2I128 $0x13, AA0, BB0, AA0
2284	VPERM2I128 $0x13, CC0, DD0, BB0
2285	VPERM2I128 $0x02, AA1, BB1, CC0
2286	VPERM2I128 $0x02, CC1, DD1, DD0
2287	VPERM2I128 $0x13, AA1, BB1, AA1
2288	VPERM2I128 $0x13, CC1, DD1, BB1
2289
2290sealAVX2ShortSeal:
2291	// Hash aad
2292	MOVQ ad_len+80(FP), itr2
2293	CALL polyHashADInternal<>(SB)
2294	XORQ itr1, itr1
2295
2296sealAVX2SealHash:
2297	// itr1 holds the number of bytes encrypted but not yet hashed
2298	CMPQ itr1, $16
2299	JB   sealAVX2ShortSealLoop
2300	polyAdd(0(oup))
2301	polyMul
2302	SUBQ $16, itr1
2303	ADDQ $16, oup
2304	JMP  sealAVX2SealHash
2305
2306sealAVX2ShortSealLoop:
2307	CMPQ inl, $32
2308	JB   sealAVX2ShortTail32
2309	SUBQ $32, inl
2310
2311	// Load for encryption
2312	VPXOR   (inp), AA0, AA0
2313	VMOVDQU AA0, (oup)
2314	LEAQ    (1*32)(inp), inp
2315
2316	// Now can hash
2317	polyAdd(0*8(oup))
2318	polyMulAVX2
2319	polyAdd(2*8(oup))
2320	polyMulAVX2
2321	LEAQ (1*32)(oup), oup
2322
2323	// Shift stream left
2324	VMOVDQA BB0, AA0
2325	VMOVDQA CC0, BB0
2326	VMOVDQA DD0, CC0
2327	VMOVDQA AA1, DD0
2328	VMOVDQA BB1, AA1
2329	VMOVDQA CC1, BB1
2330	VMOVDQA DD1, CC1
2331	VMOVDQA AA2, DD1
2332	VMOVDQA BB2, AA2
2333	JMP     sealAVX2ShortSealLoop
2334
2335sealAVX2ShortTail32:
2336	CMPQ    inl, $16
2337	VMOVDQA A0, A1
2338	JB      sealAVX2ShortDone
2339
2340	SUBQ $16, inl
2341
2342	// Load for encryption
2343	VPXOR   (inp), A0, T0
2344	VMOVDQU T0, (oup)
2345	LEAQ    (1*16)(inp), inp
2346
2347	// Hash
2348	polyAdd(0*8(oup))
2349	polyMulAVX2
2350	LEAQ       (1*16)(oup), oup
2351	VPERM2I128 $0x11, AA0, AA0, AA0
2352	VMOVDQA    A0, A1
2353
2354sealAVX2ShortDone:
2355	VZEROUPPER
2356	JMP sealSSETail
2357
2358// ----------------------------------------------------------------------------
2359// Special optimization for buffers smaller than 321 bytes
2360seal320AVX2:
2361	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
2362	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
2363	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
2364	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
2365	MOVQ    $10, itr2
2366
2367sealAVX2320InnerCipherLoop:
2368	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2369	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
2370	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2371	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
2372	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2373	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
2374	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2375	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
2376	DECQ     itr2
2377	JNE      sealAVX2320InnerCipherLoop
2378
2379	VMOVDQA ·chacha20Constants<>(SB), TT0
2380	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
2381	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
2382	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
2383	VMOVDQA ·avx2IncMask<>(SB), TT0
2384	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
2385	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
2386	VPADDD  TT3, DD2, DD2
2387
2388	// Clamp and store poly key
2389	VPERM2I128 $0x02, AA0, BB0, TT0
2390	VPAND      ·polyClampMask<>(SB), TT0, TT0
2391	VMOVDQA    TT0, rsStoreAVX2
2392
2393	// Stream for up to 320 bytes
2394	VPERM2I128 $0x13, AA0, BB0, AA0
2395	VPERM2I128 $0x13, CC0, DD0, BB0
2396	VPERM2I128 $0x02, AA1, BB1, CC0
2397	VPERM2I128 $0x02, CC1, DD1, DD0
2398	VPERM2I128 $0x13, AA1, BB1, AA1
2399	VPERM2I128 $0x13, CC1, DD1, BB1
2400	VPERM2I128 $0x02, AA2, BB2, CC1
2401	VPERM2I128 $0x02, CC2, DD2, DD1
2402	VPERM2I128 $0x13, AA2, BB2, AA2
2403	VPERM2I128 $0x13, CC2, DD2, BB2
2404	JMP        sealAVX2ShortSeal
2405
2406// ----------------------------------------------------------------------------
2407// Special optimization for the last 128 bytes of ciphertext
2408sealAVX2Tail128:
2409	// Need to decrypt up to 128 bytes - prepare two blocks
2410	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2411	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
2412	VMOVDQA ·chacha20Constants<>(SB), AA0
2413	VMOVDQA state1StoreAVX2, BB0
2414	VMOVDQA state2StoreAVX2, CC0
2415	VMOVDQA ctr3StoreAVX2, DD0
2416	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
2417	VMOVDQA DD0, DD1
2418
2419sealAVX2Tail128LoopA:
2420	polyAdd(0(oup))
2421	polyMul
2422	LEAQ 16(oup), oup
2423
2424sealAVX2Tail128LoopB:
2425	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
2426	polyAdd(0(oup))
2427	polyMul
2428	VPALIGNR $4, BB0, BB0, BB0
2429	VPALIGNR $8, CC0, CC0, CC0
2430	VPALIGNR $12, DD0, DD0, DD0
2431	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
2432	polyAdd(16(oup))
2433	polyMul
2434	LEAQ     32(oup), oup
2435	VPALIGNR $12, BB0, BB0, BB0
2436	VPALIGNR $8, CC0, CC0, CC0
2437	VPALIGNR $4, DD0, DD0, DD0
2438	DECQ     itr1
2439	JG       sealAVX2Tail128LoopA
2440	DECQ     itr2
2441	JGE      sealAVX2Tail128LoopB
2442
2443	VPADDD ·chacha20Constants<>(SB), AA0, AA1
2444	VPADDD state1StoreAVX2, BB0, BB1
2445	VPADDD state2StoreAVX2, CC0, CC1
2446	VPADDD DD1, DD0, DD1
2447
2448	VPERM2I128 $0x02, AA1, BB1, AA0
2449	VPERM2I128 $0x02, CC1, DD1, BB0
2450	VPERM2I128 $0x13, AA1, BB1, CC0
2451	VPERM2I128 $0x13, CC1, DD1, DD0
2452	JMP        sealAVX2ShortSealLoop
2453
2454// ----------------------------------------------------------------------------
2455// Special optimization for the last 256 bytes of ciphertext
2456sealAVX2Tail256:
2457	// Need to decrypt up to 256 bytes - prepare two blocks
2458	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2459	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
2460	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
2461	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
2462	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
2463	VMOVDQA ctr3StoreAVX2, DD0
2464	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
2465	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
2466	VMOVDQA DD0, TT1
2467	VMOVDQA DD1, TT2
2468
2469sealAVX2Tail256LoopA:
2470	polyAdd(0(oup))
2471	polyMul
2472	LEAQ 16(oup), oup
2473
2474sealAVX2Tail256LoopB:
2475	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2476	polyAdd(0(oup))
2477	polyMul
2478	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
2479	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2480	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
2481	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2482	polyAdd(16(oup))
2483	polyMul
2484	LEAQ     32(oup), oup
2485	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
2486	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2487	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
2488	DECQ     itr1
2489	JG       sealAVX2Tail256LoopA
2490	DECQ     itr2
2491	JGE      sealAVX2Tail256LoopB
2492
2493	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
2494	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
2495	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
2496	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
2497	VPERM2I128 $0x02, AA0, BB0, TT0
2498	VPERM2I128 $0x02, CC0, DD0, TT1
2499	VPERM2I128 $0x13, AA0, BB0, TT2
2500	VPERM2I128 $0x13, CC0, DD0, TT3
2501	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
2502	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
2503	MOVQ       $128, itr1
2504	LEAQ       128(inp), inp
2505	SUBQ       $128, inl
2506	VPERM2I128 $0x02, AA1, BB1, AA0
2507	VPERM2I128 $0x02, CC1, DD1, BB0
2508	VPERM2I128 $0x13, AA1, BB1, CC0
2509	VPERM2I128 $0x13, CC1, DD1, DD0
2510
2511	JMP sealAVX2SealHash
2512
2513// ----------------------------------------------------------------------------
2514// Special optimization for the last 384 bytes of ciphertext
2515sealAVX2Tail384:
2516	// Need to decrypt up to 384 bytes - prepare two blocks
2517	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2518	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
2519	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
2520	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
2521	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
2522	VMOVDQA ctr3StoreAVX2, DD0
2523	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
2524	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
2525
2526sealAVX2Tail384LoopA:
2527	polyAdd(0(oup))
2528	polyMul
2529	LEAQ 16(oup), oup
2530
2531sealAVX2Tail384LoopB:
2532	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2533	polyAdd(0(oup))
2534	polyMul
2535	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
2536	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2537	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
2538	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2539	polyAdd(16(oup))
2540	polyMul
2541	LEAQ     32(oup), oup
2542	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
2543	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2544	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
2545	DECQ     itr1
2546	JG       sealAVX2Tail384LoopA
2547	DECQ     itr2
2548	JGE      sealAVX2Tail384LoopB
2549
2550	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
2551	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
2552	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
2553	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
2554	VPERM2I128 $0x02, AA0, BB0, TT0
2555	VPERM2I128 $0x02, CC0, DD0, TT1
2556	VPERM2I128 $0x13, AA0, BB0, TT2
2557	VPERM2I128 $0x13, CC0, DD0, TT3
2558	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
2559	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
2560	VPERM2I128 $0x02, AA1, BB1, TT0
2561	VPERM2I128 $0x02, CC1, DD1, TT1
2562	VPERM2I128 $0x13, AA1, BB1, TT2
2563	VPERM2I128 $0x13, CC1, DD1, TT3
2564	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
2565	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
2566	MOVQ       $256, itr1
2567	LEAQ       256(inp), inp
2568	SUBQ       $256, inl
2569	VPERM2I128 $0x02, AA2, BB2, AA0
2570	VPERM2I128 $0x02, CC2, DD2, BB0
2571	VPERM2I128 $0x13, AA2, BB2, CC0
2572	VPERM2I128 $0x13, CC2, DD2, DD0
2573
2574	JMP sealAVX2SealHash
2575
2576// ----------------------------------------------------------------------------
2577// Special optimization for the last 512 bytes of ciphertext
2578sealAVX2Tail512:
2579	// Need to decrypt up to 512 bytes - prepare two blocks
2580	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2581	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
2582	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2583	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2584	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2585	VMOVDQA ctr3StoreAVX2, DD0
2586	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2587	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2588
2589sealAVX2Tail512LoopA:
2590	polyAdd(0(oup))
2591	polyMul
2592	LEAQ 16(oup), oup
2593
2594sealAVX2Tail512LoopB:
2595	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2596	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2597	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2598	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2599	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2600	VMOVDQA  CC3, tmpStoreAVX2
2601	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2602	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2603	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2604	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2605	VMOVDQA  tmpStoreAVX2, CC3
2606	polyAdd(0*8(oup))
2607	polyMulAVX2
2608	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2609	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2610	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2611	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2612	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2613	VMOVDQA  CC3, tmpStoreAVX2
2614	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2615	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2616	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2617	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2618	VMOVDQA  tmpStoreAVX2, CC3
2619	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
2620	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2621	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
2622	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2623	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2624	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2625	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2626	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2627	polyAdd(2*8(oup))
2628	polyMulAVX2
2629	LEAQ     (4*8)(oup), oup
2630	VMOVDQA  CC3, tmpStoreAVX2
2631	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2632	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2633	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2634	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2635	VMOVDQA  tmpStoreAVX2, CC3
2636	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2637	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2638	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2639	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2640	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2641	VMOVDQA  CC3, tmpStoreAVX2
2642	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2643	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2644	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2645	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2646	VMOVDQA  tmpStoreAVX2, CC3
2647	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
2648	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2649	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
2650
2651	DECQ itr1
2652	JG   sealAVX2Tail512LoopA
2653	DECQ itr2
2654	JGE  sealAVX2Tail512LoopB
2655
2656	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2657	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2658	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2659	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2660	VMOVDQA    CC3, tmpStoreAVX2
2661	VPERM2I128 $0x02, AA0, BB0, CC3
2662	VPXOR      (0*32)(inp), CC3, CC3
2663	VMOVDQU    CC3, (0*32)(oup)
2664	VPERM2I128 $0x02, CC0, DD0, CC3
2665	VPXOR      (1*32)(inp), CC3, CC3
2666	VMOVDQU    CC3, (1*32)(oup)
2667	VPERM2I128 $0x13, AA0, BB0, CC3
2668	VPXOR      (2*32)(inp), CC3, CC3
2669	VMOVDQU    CC3, (2*32)(oup)
2670	VPERM2I128 $0x13, CC0, DD0, CC3
2671	VPXOR      (3*32)(inp), CC3, CC3
2672	VMOVDQU    CC3, (3*32)(oup)
2673
2674	VPERM2I128 $0x02, AA1, BB1, AA0
2675	VPERM2I128 $0x02, CC1, DD1, BB0
2676	VPERM2I128 $0x13, AA1, BB1, CC0
2677	VPERM2I128 $0x13, CC1, DD1, DD0
2678	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
2679	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
2680
2681	VPERM2I128 $0x02, AA2, BB2, AA0
2682	VPERM2I128 $0x02, CC2, DD2, BB0
2683	VPERM2I128 $0x13, AA2, BB2, CC0
2684	VPERM2I128 $0x13, CC2, DD2, DD0
2685	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
2686	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
2687
2688	MOVQ       $384, itr1
2689	LEAQ       384(inp), inp
2690	SUBQ       $384, inl
2691	VPERM2I128 $0x02, AA3, BB3, AA0
2692	VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
2693	VPERM2I128 $0x13, AA3, BB3, CC0
2694	VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
2695
2696	JMP sealAVX2SealHash
2697