// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

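// AES-GCM for arm64, using the Armv8 Crypto Extensions: AESE/AESMC for the
// AES rounds and PMULL/PMULL2 (carry-less multiply) for GHASH.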
#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26
#define K8 V27
#define K9 V28
#define K10 V29
#define K11 V30
#define KLAST V31

#define reduce() \
	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR	ACC1.B16, ACC0.B16, ACC0.B16     \

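// reduce() folds the 256-bit Karatsuba product held in ACC0 and ACC1 (the
// two 128-bit halves) plus ACCM (the middle term) down to a single 128-bit
// value left in ACC0: the middle term is first XORed into the two halves,
// then the excess bits are folded in twice by carry-less multiplication
// with POLY.D[0] = 0xC2<<56, the reduction constant of the reflected GCM
// polynomial x^128 + x^127 + x^126 + x^121 + 1.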
// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl R0
#define tMsk R1
#define tPtr R2
#define plen R3
#define dlen R4

	MOVD	$0xC2, R1
	LSL	$56, R1
	MOVD	$1, R0
	VMOV	R1, POLY.D[0]
	VMOV	R0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	tagMask+8(FP), tMsk
	MOVD	T+16(FP), tPtr
	MOVD	pLen+24(FP), plen
	MOVD	dLen+32(FP), dlen

	VLD1	(tPtr), [ACC0.B16]
	VLD1	(tMsk), [B1.B16]

	LSL	$3, plen
	LSL	$3, dlen

	VMOV	dlen, B0.D[0]
	VMOV	plen, B0.D[1]
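	// B0 now holds the GHASH length block: the bit lengths of the
	// additional data and of the plaintext (converted from byte counts
	// above), laid out in the reversed order used throughout this file.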

	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	VREV64	ACC0.B16, ACC0.B16
	VEOR	B1.B16, ACC0.B16, ACC0.B16

	VST1	[ACC0.B16], (tPtr)
	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

// func gcmAesInit(productTable *[256]byte, ks []uint32)
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define pTbl R0
#define KS R1
#define NR R2
#define I R3
	MOVD	productTable+0(FP), pTbl
	MOVD	ks_base+8(FP), KS
	MOVD	ks_len+16(FP), NR

	MOVD	$0xC2, I
	LSL	$56, I
	VMOV	I, POLY.D[0]
	MOVD	$1, I
	VMOV	I, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	// Encrypt block 0 with the AES key to generate the hash key H
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	VEOR	B0.B16, B0.B16, B0.B16
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	TBZ	$4, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	TBZ	$3, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
initEncFinish:
	VLD1	(KS), [T0.B16, T1.B16, T2.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16

	VREV64	B0.B16, B0.B16

	// Multiply by 2 modulo P
	VMOV	B0.D[0], I
	ASR	$63, I
	VMOV	I, T1.D[0]
	VMOV	I, T1.D[1]
	VAND	POLY.B16, T1.B16, T1.B16
	VUSHR	$63, B0.D2, T2.D2
	VEXT	$8, ZERO.B16, T2.B16, T2.B16
	VSHL	$1, B0.D2, B0.D2
	VEOR	T1.B16, B0.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available
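	// The sequence above is a 128-bit shift by one bit (VSHL for the
	// in-lane shifts, VUSHR plus VEXT to carry across the doubleword
	// boundary), with POLY XORed in when the bit shifted out of the
	// reflected representation was set.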

	// Karatsuba pre-computation
	VEXT	$8, B0.B16, B0.B16, B1.B16
	VEOR	B0.B16, B1.B16, B1.B16

	ADD	$14*16, pTbl
	VST1	[B0.B16, B1.B16], (pTbl)
	SUB	$2*16, pTbl

	VMOV	B0.B16, B2.B16
	VMOV	B1.B16, B3.B16

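	// The loop below fills in the rest of the table, walking backwards
	// from pTbl+14*16: each 32-byte entry holds a power of H followed by
	// the XOR of its two halves (its Karatsuba pre-computation), so the
	// table ends up holding H^8 .. H^1, highest power first.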
	MOVD	$7, I

initLoop:
	// Compute powers of H
	SUBS	$1, I

	VPMULL	B0.D1, B2.D1, T1.Q1
	VPMULL2	B0.D2, B2.D2, T0.Q1
	VPMULL	B1.D1, B3.D1, T2.Q1
	VEOR	T0.B16, T2.B16, T2.B16
	VEOR	T1.B16, T2.B16, T2.B16
	VEXT	$8, ZERO.B16, T2.B16, T3.B16
	VEXT	$8, T2.B16, ZERO.B16, T2.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T3.B16, T1.B16, T1.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T1.B16, T0.B16, B2.B16
	VMOV	B2.B16, B3.B16
	VEXT	$8, B2.B16, B2.B16, B2.B16
	VEOR	B2.B16, B3.B16, B3.B16

	VST1	[B2.B16, B3.B16], (pTbl)
	SUB	$2*16, pTbl

	BNE	initLoop
	RET
#undef I
#undef NR
#undef KS
#undef pTbl

// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl R0
#define aut R1
#define tPtr R2
#define autLen R3
#define H0 R4
#define pTblSave R5

#define mulRound(X) \
	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
	VREV64	X.B16, X.B16               \
	VEXT	$8, X.B16, X.B16, T0.B16   \
	VEOR	X.B16, T0.B16, T0.B16      \
	VPMULL	X.D1, T1.D1, T3.Q1         \
	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2	X.D2, T1.D2, T3.Q1         \
	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL	T0.D1, T2.D1, T3.Q1        \
	VEOR	T3.B16, ACCM.B16, ACCM.B16

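// mulRound(X) folds one more 16-byte block into the GHASH accumulators: it
// byte-reverses the block, multiplies it by the next power of H from the
// table (three PMULLs: low half, high half, and the Karatsuba middle term)
// and XORs the partial products into ACC1, ACC0 and ACCM. reduce() then
// runs once per group of blocks rather than once per block.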
	MOVD	productTable+0(FP), pTbl
	MOVD	data_base+8(FP), aut
	MOVD	data_len+16(FP), autLen
	MOVD	T+32(FP), tPtr

	VEOR	ACC0.B16, ACC0.B16, ACC0.B16
	CBZ	autLen, dataBail

	MOVD	$0xC2, H0
	LSL	$56, H0
	VMOV	H0, POLY.D[0]
	MOVD	$1, H0
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	MOVD	pTbl, pTblSave

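	// 13 bytes is the length of the TLS 1.2 additional data (an 8-byte
	// sequence number plus a 5-byte record header), so that common case
	// gets a dedicated path below.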
	CMP	$13, autLen
	BEQ	dataTLS
	CMP	$128, autLen
	BLT	startSinglesLoop
	B	octetsLoop

dataTLS:
	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	VEOR	B0.B16, B0.B16, B0.B16

	MOVD	(aut), H0
	VMOV	H0, B0.D[0]
	MOVW	8(aut), H0
	VMOV	H0, B0.S[2]
	MOVB	12(aut), H0
	VMOV	H0, B0.B[12]

	MOVD	$0, autLen
	B	dataMul

octetsLoop:
		CMP	$128, autLen
		BLT	startSinglesLoop
		SUB	$128, autLen

		VLD1.P	32(aut), [B0.B16, B1.B16]

		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		mulRound(B1)
		VLD1.P  32(aut), [B2.B16, B3.B16]
		mulRound(B2)
		mulRound(B3)
		VLD1.P  32(aut), [B4.B16, B5.B16]
		mulRound(B4)
		mulRound(B5)
		VLD1.P  32(aut), [B6.B16, B7.B16]
		mulRound(B6)
		mulRound(B7)

		MOVD	pTblSave, pTbl
		reduce()
	B	octetsLoop

startSinglesLoop:

	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:

		CMP	$16, autLen
		BLT	dataEnd
		SUB	$16, autLen

		VLD1.P	16(aut), [B0.B16]
dataMul:
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16

		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		reduce()

	B	singlesLoop

dataEnd:

	CBZ	autLen, dataBail
	VEOR	B0.B16, B0.B16, B0.B16
	ADD	autLen, aut

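	// Load the 1..15 leftover bytes one at a time, from the end of the
	// data backwards, shifting them into B0 so that it ends up holding
	// the final block, zero padded.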
dataLoadLoop:
		MOVB.W	-1(aut), H0
		VEXT	$15, B0.B16, ZERO.B16, B0.B16
		VMOV	H0, B0.B[0]
		SUBS	$1, autLen
		BNE	dataLoadLoop
	B	dataMul

dataBail:
	VST1	[ACC0.B16], (tPtr)
	RET

#undef pTbl
#undef aut
#undef tPtr
#undef autLen
#undef H0
#undef pTblSave

// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define ks R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define aluK R9
#define NR R10
#define H0 R11
#define H1 R12
#define curK R13
#define pTblSave R14

#define aesrndx8(K) \
	AESE	K.B16, B0.B16    \
	AESMC	B0.B16, B0.B16   \
	AESE	K.B16, B1.B16    \
	AESMC	B1.B16, B1.B16   \
	AESE	K.B16, B2.B16    \
	AESMC	B2.B16, B2.B16   \
	AESE	K.B16, B3.B16    \
	AESMC	B3.B16, B3.B16   \
	AESE	K.B16, B4.B16    \
	AESMC	B4.B16, B4.B16   \
	AESE	K.B16, B5.B16    \
	AESMC	B5.B16, B5.B16   \
	AESE	K.B16, B6.B16    \
	AESMC	B6.B16, B6.B16   \
	AESE	K.B16, B7.B16    \
	AESMC	B7.B16, B7.B16

#define aesrndlastx8(K) \
	AESE	K.B16, B0.B16    \
	AESE	K.B16, B1.B16    \
	AESE	K.B16, B2.B16    \
	AESE	K.B16, B3.B16    \
	AESE	K.B16, B4.B16    \
	AESE	K.B16, B5.B16    \
	AESE	K.B16, B6.B16    \
	AESE	K.B16, B7.B16

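// aesrndx8 and aesrndlastx8 run one AES round over eight independent blocks
// (B0..B7); interleaving eight counter blocks hides the AESE/AESMC latency.
// AESE already XORs in the round key, so the final AddRoundKey must be
// applied separately, as a VEOR with KLAST.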
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// NR holds len(ks), used below to distinguish AES-128/192/256
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4
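	// The counter is kept byte-swapped (VREV32) so it can be incremented
	// with a plain vector add on INC. A scalar Go model of the increment
	// applied to each 16-byte counter block (a sketch, assuming the
	// standard 32-bit big-endian GCM counter):
	//
	//	func incCounter(ctr *[16]byte) {
	//		n := binary.BigEndian.Uint32(ctr[12:]) + 1
	//		binary.BigEndian.PutUint32(ctr[12:], n)
	//	}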
	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen

	MOVD	ks, H0
	// For AES-128 round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
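	// KLAST always ends up holding the last round key, applied with VEOR
	// after the final AESE (see the note at aesrndlastx8 above).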

	BLT	startSingles
	// There are at least 8 blocks to encrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK

octetsLoop:
		SUB	$128, srcPtrLen

		VMOV	CTR.B16, B0.B16
		VADD	B0.S4, INC.S4, B1.S4
		VREV32	B0.B16, B0.B16
		VADD	B1.S4, INC.S4, B2.S4
		VREV32	B1.B16, B1.B16
		VADD	B2.S4, INC.S4, B3.S4
		VREV32	B2.B16, B2.B16
		VADD	B3.S4, INC.S4, B4.S4
		VREV32	B3.B16, B3.B16
		VADD	B4.S4, INC.S4, B5.S4
		VREV32	B4.B16, B4.B16
		VADD	B5.S4, INC.S4, B6.S4
		VREV32	B5.B16, B5.B16
		VADD	B6.S4, INC.S4, B7.S4
		VREV32	B6.B16, B6.B16
		VADD	B7.S4, INC.S4, CTR.S4
		VREV32	B7.B16, B7.B16

		aesrndx8(K0)
		aesrndx8(K1)
		aesrndx8(K2)
		aesrndx8(K3)
		aesrndx8(K4)
		aesrndx8(K5)
		aesrndx8(K6)
		aesrndx8(K7)
		TBZ	$4, NR, octetsFinish
		aesrndx8(K10)
		aesrndx8(K11)
		TBZ	$3, NR, octetsFinish
		VLD1.P	32(curK), [T1.B16, T2.B16]
		aesrndx8(T1)
		aesrndx8(T2)
		MOVD	H0, curK
octetsFinish:
		aesrndx8(K8)
		aesrndlastx8(K9)

		VEOR	KLAST.B16, B0.B16, B0.B16
		VEOR	KLAST.B16, B1.B16, B1.B16
		VEOR	KLAST.B16, B2.B16, B2.B16
		VEOR	KLAST.B16, B3.B16, B3.B16
		VEOR	KLAST.B16, B4.B16, B4.B16
		VEOR	KLAST.B16, B5.B16, B5.B16
		VEOR	KLAST.B16, B6.B16, B6.B16
		VEOR	KLAST.B16, B7.B16, B7.B16

		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B0.B16, T1.B16, B0.B16
		VEOR	B1.B16, T2.B16, B1.B16
		VST1.P  [B0.B16, B1.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B2.B16, T1.B16, B2.B16
		VEOR	B3.B16, T2.B16, B3.B16
		VST1.P  [B2.B16, B3.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B4.B16, T1.B16, B4.B16
		VEOR	B5.B16, T2.B16, B5.B16
		VST1.P  [B4.B16, B5.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B6.B16, T1.B16, B6.B16
		VEOR	B7.B16, T2.B16, B7.B16
		VST1.P  [B6.B16, B7.B16], 32(dstPtr)

		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		mulRound(B1)
		mulRound(B2)
		mulRound(B3)
		mulRound(B4)
		mulRound(B5)
		mulRound(B6)
		mulRound(B7)
		MOVD	pTblSave, pTbl
		reduce()

		CMP	$128, srcPtrLen
		BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16
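	// For AES-192/256 the extra round keys were just loaded into B1..B4,
	// which are free in this single-block path. Note that the loop below
	// folds the last round key into the plaintext block (VEOR KLAST into
	// T0), so one VEOR finishes both the cipher and the CTR XOR.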

singlesLoop:
		CMP	$16, srcPtrLen
		BLT	tail
		SUB	$16, srcPtrLen

		VLD1.P	16(srcPtr), [T0.B16]
		VEOR	KLAST.B16, T0.B16, T0.B16

		VREV32	CTR.B16, B0.B16
		VADD	CTR.S4, INC.S4, CTR.S4

		AESE	K0.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K1.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K3.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K4.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K5.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K6.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K7.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K8.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K9.B16, B0.B16
		TBZ	$4, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	K10.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B1.B16, B0.B16
		TBZ	$3, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	B2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B3.B16, B0.B16
singlesLast:
		VEOR	T0.B16, B0.B16, B0.B16
encReduce:
		VST1.P	[B0.B16], 16(dstPtr)

		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16

		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VEOR	T0.B16, T0.B16, T0.B16
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1
	ADD	srcPtrLen, srcPtr
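	// Build the last, partial block zero padded in T0, and a mask of its
	// valid bytes in T3. A Go model of what the backwards loads below
	// produce (a sketch):
	//
	//	func tailBlock(rem []byte) (blk, mask [16]byte) {
	//		n := copy(blk[:], rem) // the 1..15 leftover bytes
	//		for i := range mask[:n] {
	//			mask[i] = 0xFF
	//		}
	//		return
	//	}
	//
	// tailLast below computes (keystream ^ blk) & mask before the store
	// and the final GHASH update, so bytes beyond the message are zero.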

	TBZ	$3, srcPtrLen, ld4
	MOVD.W	-8(srcPtr), H0
	VMOV	H0, T0.D[0]
	VMOV	H1, T3.D[0]
ld4:
	TBZ	$2, srcPtrLen, ld2
	MOVW.W	-4(srcPtr), H0
	VEXT	$12, T0.B16, ZERO.B16, T0.B16
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.S[0]
	VMOV	H1, T3.S[0]
ld2:
	TBZ	$1, srcPtrLen, ld1
	MOVH.W	-2(srcPtr), H0
	VEXT	$14, T0.B16, ZERO.B16, T0.B16
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.H[0]
	VMOV	H1, T3.H[0]
ld1:
	TBZ	$0, srcPtrLen, ld0
	MOVB.W	-1(srcPtr), H0
	VEXT	$15, T0.B16, ZERO.B16, T0.B16
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.B[0]
	VMOV	H1, T3.B[0]
ld0:

	MOVD	ZR, srcPtrLen
	VEOR	KLAST.B16, T0.B16, T0.B16
	VREV32	CTR.B16, B0.B16

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16

tailLast:
	VEOR	T0.B16, B0.B16, B0.B16
	VAND	T3.B16, B0.B16, B0.B16
	B	encReduce

done:
	VST1	[ACC0.B16], (tPtr)
	RET

// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesDec(SB),NOSPLIT,$0
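	// Structured like gcmAesEnc, except that GHASH is computed over the
	// ciphertext, which here is the input rather than the output: each
	// block is saved for the multiplier before it is decrypted.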
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// NR holds len(ks), used below to distinguish AES-128/192/256
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	MOVD	ks, H0
	// For AES-128 round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen
	BLT	startSingles
	// There are at least 8 blocks to decrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK

octetsLoop:
		SUB	$128, srcPtrLen

		VMOV	CTR.B16, B0.B16
		VADD	B0.S4, INC.S4, B1.S4
		VREV32	B0.B16, B0.B16
		VADD	B1.S4, INC.S4, B2.S4
		VREV32	B1.B16, B1.B16
		VADD	B2.S4, INC.S4, B3.S4
		VREV32	B2.B16, B2.B16
		VADD	B3.S4, INC.S4, B4.S4
		VREV32	B3.B16, B3.B16
		VADD	B4.S4, INC.S4, B5.S4
		VREV32	B4.B16, B4.B16
		VADD	B5.S4, INC.S4, B6.S4
		VREV32	B5.B16, B5.B16
		VADD	B6.S4, INC.S4, B7.S4
		VREV32	B6.B16, B6.B16
		VADD	B7.S4, INC.S4, CTR.S4
		VREV32	B7.B16, B7.B16

		aesrndx8(K0)
		aesrndx8(K1)
		aesrndx8(K2)
		aesrndx8(K3)
		aesrndx8(K4)
		aesrndx8(K5)
		aesrndx8(K6)
		aesrndx8(K7)
		TBZ	$4, NR, octetsFinish
		aesrndx8(K10)
		aesrndx8(K11)
		TBZ	$3, NR, octetsFinish
		VLD1.P	32(curK), [T1.B16, T2.B16]
		aesrndx8(T1)
		aesrndx8(T2)
		MOVD	H0, curK
octetsFinish:
		aesrndx8(K8)
		aesrndlastx8(K9)

		VEOR	KLAST.B16, B0.B16, T1.B16
		VEOR	KLAST.B16, B1.B16, T2.B16
		VEOR	KLAST.B16, B2.B16, B2.B16
		VEOR	KLAST.B16, B3.B16, B3.B16
		VEOR	KLAST.B16, B4.B16, B4.B16
		VEOR	KLAST.B16, B5.B16, B5.B16
		VEOR	KLAST.B16, B6.B16, B6.B16
		VEOR	KLAST.B16, B7.B16, B7.B16

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B0.B16, T1.B16, T1.B16
		VEOR	B1.B16, T2.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)

		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B2.B16, B0.B16, T1.B16
		VEOR	B3.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B4.B16, B0.B16, T1.B16
		VEOR	B5.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B6.B16, B0.B16, T1.B16
		VEOR	B7.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		MOVD	pTblSave, pTbl
		reduce()

		CMP	$128, srcPtrLen
		BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16

singlesLoop:
		CMP	$16, srcPtrLen
		BLT	tail
		SUB	$16, srcPtrLen

		VLD1.P	16(srcPtr), [T0.B16]
		VREV64	T0.B16, B5.B16
		VEOR	KLAST.B16, T0.B16, T0.B16

		VREV32	CTR.B16, B0.B16
		VADD	CTR.S4, INC.S4, CTR.S4

		AESE	K0.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K1.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K3.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K4.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K5.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K6.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K7.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K8.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K9.B16, B0.B16
		TBZ	$4, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	K10.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B1.B16, B0.B16
		TBZ	$3, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	B2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B3.B16, B0.B16
singlesLast:
		VEOR	T0.B16, B0.B16, B0.B16

		VST1.P	[B0.B16], 16(dstPtr)

		VEOR	ACC0.B16, B5.B16, B5.B16
		VEXT	$8, B5.B16, B5.B16, T0.B16
		VEOR	B5.B16, T0.B16, T0.B16
		VPMULL	B5.D1, T1.D1, ACC1.Q1
		VPMULL2	B5.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1
		reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VREV32	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16
tailLast:
	VEOR	KLAST.B16, B0.B16, B0.B16

	// Assuming it is safe to load past srcPtr due to the presence of the tag
	VLD1	(srcPtr), [B5.B16]

	VEOR	B5.B16, B0.B16, B0.B16

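	// Below, only the valid plaintext bytes are stored to dstPtr (one
	// doubleword, word, halfword or byte at a time), while T3 is built up
	// as a mask for the ciphertext bytes that belong to the message, so
	// that the tag bytes loaded above never enter GHASH.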
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1

	TBZ	$3, srcPtrLen, ld4
	VMOV	B0.D[0], H0
	MOVD.P	H0, 8(dstPtr)
	VMOV	H1, T3.D[0]
	VEXT	$8, ZERO.B16, B0.B16, B0.B16
ld4:
	TBZ	$2, srcPtrLen, ld2
	VMOV	B0.S[0], H0
	MOVW.P	H0, 4(dstPtr)
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.S[0]
	VEXT	$4, ZERO.B16, B0.B16, B0.B16
ld2:
	TBZ	$1, srcPtrLen, ld1
	VMOV	B0.H[0], H0
	MOVH.P	H0, 2(dstPtr)
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.H[0]
	VEXT	$2, ZERO.B16, B0.B16, B0.B16
ld1:
	TBZ	$0, srcPtrLen, ld0
	VMOV	B0.B[0], H0
	MOVB.P	H0, 1(dstPtr)
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.B[0]
ld0:

	VAND	T3.B16, B5.B16, B5.B16
	VREV64	B5.B16, B5.B16

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()
done:
	VST1	[ACC0.B16], (tPtr)

	RET
