// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI.
// The implementation uses some optimizations as described in:
// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
//     Hardware

#include "textflag.h"

// Register aliases used by the routines in this file.
//
// B0-B7 are block-sized working registers: they carry AES state (up to eight
// blocks at a time) and, in the single-block paths, cached round keys.
#define B0 X0
#define B1 X1
#define B2 X2
#define B3 X3
#define B4 X4
#define B5 X5
#define B6 X6
#define B7 X7

// Accumulators for the carry-less multiplication: ACC0 collects the
// low*low products, ACC1 the high*high products, and ACCM the products of
// the xor-ed halves (the Karatsuba middle term).
#define ACC0 X8
#define ACC1 X9
#define ACCM X10

// T0-T2 are scratch registers. POLY and BSWAP are loaded by each routine
// from gcmPoly<> and bswapMask<> respectively.
#define T0 X11
#define T1 X12
#define T2 X13
#define POLY X14
#define BSWAP X15

// bswapMask is a PSHUFB control mask that reverses the 16 bytes of a
// register (destination byte i takes source byte 15-i).
DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607

// gcmPoly is the 128-bit constant 0xc2000000000000000000000000000001.
// It is loaded into POLY and used by the reduction steps and by the
// "H * 2" step in gcmAesInit.
DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000

// andMask is a table of fifteen 16-byte masks. The entry at offset
// 16*(k-1) has its k lowest bytes set to 0xff and the rest zero, for
// k = 1..15. The tail code indexes it with -16 + 16*len to keep only the
// first len bytes of a partial block.
DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff

GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
GLOBL andMask<>(SB), (NOPTR+RODATA), $240

// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
//
// gcmAesFinish completes the tag held in T, in place:
//   1. builds a length block with pLen*8 in the low quadword and dLen*8 in
//      the high quadword and xors it into the running hash loaded from T;
//   2. multiplies the result by the productTable entry at offset 16*14
//      (its Karatsuba companion is at 16*15) and reduces it;
//   3. byte-reverses the result, xors it with tagMask and stores it to T.
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl DI
#define tMsk SI
#define tPtr DX
#define plen AX
#define dlen CX

	MOVQ productTable+0(FP), pTbl
	MOVQ tagMask+8(FP), tMsk
	MOVQ T+16(FP), tPtr
	MOVQ pLen+24(FP), plen
	MOVQ dLen+32(FP), dlen

	// ACC0 = running hash, T2 = tag mask (kept until the final xor).
	MOVOU (tPtr), ACC0
	MOVOU (tMsk), T2

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Convert byte lengths to bit lengths.
	SHLQ $3, plen
	SHLQ $3, dlen

	// B0 = dlen*8 : plen*8 (high : low), then fold in the running hash.
	MOVQ plen, B0
	PINSRQ $1, dlen, B0

	PXOR ACC0, B0

	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	// Karatsuba multiplication: low, high and middle products.
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	// Fold the middle term into the 256-bit product ACC1:ACC0.
	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	// Two reduction rounds against POLY.
	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	PXOR ACC1, ACC0

	// Back to memory byte order, apply the tag mask, store the tag.
	PSHUFB BSWAP, ACC0
	PXOR T2, ACC0
	MOVOU ACC0, (tPtr)

	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

// func gcmAesInit(productTable *[256]byte, ks []uint32)
//
// gcmAesInit fills productTable from the expanded AES key schedule ks.
// It derives the hash key H with the key schedule, doubles it, and then
// stores eight successive powers. Each power occupies 32 bytes: the value
// itself followed by the xor of its two 64-bit halves (the Karatsuba
// pre-computation). The first power is written at offset 16*14, the next
// at 16*12, and so on down to offset 0.
//
// NR is computed as len(ks)/4 - 1 and selects how many AES rounds are run
// (the code distinguishes NR < 12, NR == 12 and NR > 12).
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define dst DI
#define KS SI
#define NR DX

	MOVQ productTable+0(FP), dst
	MOVQ ks_base+8(FP), KS
	MOVQ ks_len+16(FP), NR

	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Encrypt block 0, with the AES key to generate the hash key H.
	// Starting from round key 0 is the same as xoring it into a zero block.
	MOVOU (16*0)(KS), B0
	MOVOU (16*1)(KS), T0
	AESENC T0, B0
	MOVOU (16*2)(KS), T0
	AESENC T0, B0
	MOVOU (16*3)(KS), T0
	AESENC T0, B0
	MOVOU (16*4)(KS), T0
	AESENC T0, B0
	MOVOU (16*5)(KS), T0
	AESENC T0, B0
	MOVOU (16*6)(KS), T0
	AESENC T0, B0
	MOVOU (16*7)(KS), T0
	AESENC T0, B0
	MOVOU (16*8)(KS), T0
	AESENC T0, B0
	MOVOU (16*9)(KS), T0
	AESENC T0, B0
	MOVOU (16*10)(KS), T0
	CMPQ NR, $12
	JB initEncLast
	AESENC T0, B0
	MOVOU (16*11)(KS), T0
	AESENC T0, B0
	MOVOU (16*12)(KS), T0
	// Still testing the flags set by the CMPQ above.
	JE initEncLast
	AESENC T0, B0
	MOVOU (16*13)(KS), T0
	AESENC T0, B0
	MOVOU (16*14)(KS), T0
initEncLast:
	AESENCLAST T0, B0

	PSHUFB BSWAP, B0
	// H * 2
	// T0 becomes an all-ones/all-zeros mask from the top bit of B0 and
	// selects POLY; T1 carries each dword's top bit into the next dword.
	PSHUFD $0xff, B0, T0
	MOVOU B0, T1
	PSRAL $31, T0
	PAND POLY, T0
	PSRLL $31, T1
	PSLLDQ $4, T1
	PSLLL $1, B0
	PXOR T0, B0
	PXOR T1, B0
	// Karatsuba pre-computations
	MOVOU B0, (16*14)(dst)
	PSHUFD $78, B0, B1
	PXOR B0, B1
	MOVOU B1, (16*15)(dst)

	// B0/B1 stay fixed as the first power; B2/B3 track the latest power.
	MOVOU B0, B2
	MOVOU B1, B3
	// Now prepare powers of H and pre-computations for them
	MOVQ $7, AX

initLoop:
		// (T1:T0, T2) = B2 * B0 as low, high and middle products.
		MOVOU B2, T0
		MOVOU B2, T1
		MOVOU B3, T2
		PCLMULQDQ $0x00, B0, T0
		PCLMULQDQ $0x11, B0, T1
		PCLMULQDQ $0x00, B1, T2

		PXOR T0, T2
		PXOR T1, T2
		MOVOU T2, B4
		PSLLDQ $8, B4
		PSRLDQ $8, T2
		PXOR B4, T0
		PXOR T2, T1

		// Two reduction rounds; the new power ends up in B2.
		MOVOU POLY, B2
		PCLMULQDQ $0x01, T0, B2
		PSHUFD $78, T0, T0
		PXOR B2, T0
		MOVOU POLY, B2
		PCLMULQDQ $0x01, T0, B2
		PSHUFD $78, T0, T0
		PXOR T0, B2
		PXOR T1, B2

		MOVOU B2, (16*12)(dst)
		PSHUFD $78, B2, B3
		PXOR B2, B3
		MOVOU B3, (16*13)(dst)

		// LEAQ leaves the flags from DECQ intact for the JNE below.
		DECQ AX
		LEAQ (-16*2)(dst), dst
	JNE initLoop

	RET
#undef NR
#undef KS
#undef dst

// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
//
// gcmAesData hashes data starting from a zero accumulator and stores the
// result to T. The stored value is not byte-reversed back; gcmAesFinish
// applies the byte swap when it produces the final tag.
//
// Paths:
//   - len(data) == 0: T is set to zero.
//   - len(data) == 13: a dedicated load sequence (the "TLS case").
//   - otherwise: 128-byte chunks are processed eight blocks at a time using
//     productTable entries 0..15, then 16-byte blocks one at a time using
//     the entry at 16*14, then a zero-padded partial block, if any.
//
// The reduceRound macro defined here is also used by later routines in
// this file. mulRoundAAD(X, i) accumulates block X times table entry i into
// ACC0/ACC1/ACCM and clobbers X, T1 and T2.
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl DI
#define aut SI
#define tPtr CX
#define autLen DX

#define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
#define mulRoundAAD(X ,i) \
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	PCLMULQDQ $0x00, X, T1;\
	PXOR T1, ACC0;\
	PCLMULQDQ $0x11, X, T2;\
	PXOR T2, ACC1;\
	PSHUFD $78, X, T1;\
	PXOR T1, X;\
	MOVOU (16*(i*2+1))(pTbl), T1;\
	PCLMULQDQ $0x00, X, T1;\
	PXOR T1, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ data_base+8(FP), aut
	MOVQ data_len+16(FP), autLen
	MOVQ T+32(FP), tPtr

	PXOR ACC0, ACC0
	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	TESTQ autLen, autLen
	JEQ dataBail

	CMPQ autLen, $13	// optimize the TLS case
	JE dataTLS
	CMPQ autLen, $128
	JB startSinglesLoop
	JMP dataOctaLoop

dataTLS:
	// Load exactly 13 bytes (8 + 4 + 1) into a zeroed B0, then reuse the
	// single-block multiply with no bytes left to process afterwards.
	MOVOU (16*14)(pTbl), T1
	MOVOU (16*15)(pTbl), T2
	PXOR B0, B0
	MOVQ (aut), B0
	PINSRD $2, 8(aut), B0
	PINSRB $12, 12(aut), B0
	XORQ autLen, autLen
	JMP dataMul

dataOctaLoop:
		CMPQ autLen, $128
		JB startSinglesLoop
		SUBQ $128, autLen

		MOVOU (16*0)(aut), X0
		MOVOU (16*1)(aut), X1
		MOVOU (16*2)(aut), X2
		MOVOU (16*3)(aut), X3
		MOVOU (16*4)(aut), X4
		MOVOU (16*5)(aut), X5
		MOVOU (16*6)(aut), X6
		MOVOU (16*7)(aut), X7
		LEAQ (16*8)(aut), aut
		PSHUFB BSWAP, X0
		PSHUFB BSWAP, X1
		PSHUFB BSWAP, X2
		PSHUFB BSWAP, X3
		PSHUFB BSWAP, X4
		PSHUFB BSWAP, X5
		PSHUFB BSWAP, X6
		PSHUFB BSWAP, X7
		// The running hash is folded into the first block only.
		PXOR ACC0, X0

		MOVOU (16*0)(pTbl), ACC0
		MOVOU (16*1)(pTbl), ACCM
		MOVOU ACC0, ACC1
		PSHUFD $78, X0, T1
		PXOR X0, T1
		PCLMULQDQ $0x00, X0, ACC0
		PCLMULQDQ $0x11, X0, ACC1
		PCLMULQDQ $0x00, T1, ACCM

		mulRoundAAD(X1, 1)
		mulRoundAAD(X2, 2)
		mulRoundAAD(X3, 3)
		mulRoundAAD(X4, 4)
		mulRoundAAD(X5, 5)
		mulRoundAAD(X6, 6)
		mulRoundAAD(X7, 7)

		// One combined fold and reduction for all eight products.
		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0
		reduceRound(ACC0)
		reduceRound(ACC0)
		PXOR ACC1, ACC0
	JMP dataOctaLoop

startSinglesLoop:
	// T1/T2 cache the table entry used for every single-block multiply.
	MOVOU (16*14)(pTbl), T1
	MOVOU (16*15)(pTbl), T2

dataSinglesLoop:

		CMPQ autLen, $16
		JB dataEnd
		SUBQ $16, autLen

		MOVOU (aut), B0
dataMul:
		// Also entered from dataTLS and dataLoadLoop with B0 preloaded
		// and autLen == 0.
		PSHUFB BSWAP, B0
		PXOR ACC0, B0

		MOVOU T1, ACC0
		MOVOU T2, ACCM
		MOVOU T1, ACC1

		PSHUFD $78, B0, T0
		PXOR B0, T0
		PCLMULQDQ $0x00, B0, ACC0
		PCLMULQDQ $0x11, B0, ACC1
		PCLMULQDQ $0x00, T0, ACCM

		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0

		MOVOU POLY, T0
		PCLMULQDQ $0x01, ACC0, T0
		PSHUFD $78, ACC0, ACC0
		PXOR T0, ACC0

		MOVOU POLY, T0
		PCLMULQDQ $0x01, ACC0, T0
		PSHUFD $78, ACC0, ACC0
		PXOR T0, ACC0
		PXOR ACC1, ACC0

		LEAQ 16(aut), aut

	JMP dataSinglesLoop

dataEnd:

	TESTQ autLen, autLen
	JEQ dataBail

	// Assemble the trailing 1..15 bytes into a zero-padded block, reading
	// from the last byte backwards so the first byte lands in byte 0.
	PXOR B0, B0
	LEAQ -1(aut)(autLen*1), aut

dataLoadLoop:

		PSLLDQ $1, B0
		PINSRB $0, (aut), B0

		LEAQ -1(aut), aut
		DECQ autLen
		JNE dataLoadLoop

	JMP dataMul

dataBail:
	MOVOU ACC0, (tPtr)
	RET
#undef pTbl
#undef aut
#undef tPtr
#undef autLen

// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
//
// gcmAesEnc encrypts src into dst in counter mode and folds the produced
// ciphertext into the running hash loaded from T, storing the updated hash
// back to T on exit. Only len(src) is consulted for the length.
//
// Stack frame (256 bytes):
//   0*16 .. 7*16   byte-reversed ciphertext of the previous 8-block batch,
//                  hashed while the next batch is being encrypted;
//   8*16 .. 15*16  eight counter blocks already xor-ed with round key 0.
//
// Counter handling: aluCTR holds the last 32 bits of the counter block in
// host order and aluK the byte-swapped last 32 bits of round key 0.
// increment(i) bumps aluCTR and rewrites only the last dword of counter
// slot i, so each slot always holds (counter block xor round key 0).
//
// NR is len(ks)/4 - 1 and selects the number of AES rounds
// (NR < 12, NR == 12, NR > 12). The JE after each CMPQ NR, $12 relies on
// the flags from that CMPQ still being in place.
//
// Macros:
//   aesRnd(k) / aesRndLast(k)  apply round key register k to B0..B7;
//   aesRound(i)                loads round key i into T0 and applies it;
//   combinedRound(i)           aesRound(i) interleaved with hashing of the
//                              saved block i from the stack;
//   mulRound(i)                hashing of saved block i only.
// The macros other than increment stay defined after this routine and are
// used by gcmAesDec, as are the register aliases below.
TEXT ·gcmAesEnc(SB),0,$256-96
#define pTbl DI
#define ctx DX
#define ctrPtr CX
#define ptx SI
#define ks AX
#define tPtr R8
#define ptxLen R9
#define aluCTR R10
#define aluTMP R11
#define aluK R12
#define NR R13

#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
#define combinedRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	 MOVOU (16*(i*2))(pTbl), T1;\
	 MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	 MOVOU (16*i)(SP), T0;\
	 PCLMULQDQ $0x00, T0, T1;\
	 PXOR T1, ACC0;\
	 PSHUFD $78, T0, T1;\
	 PCLMULQDQ $0x11, T0, T2;\
	 PXOR T1, T0;\
	 PXOR T2, ACC1;\
	 MOVOU (16*(i*2+1))(pTbl), T2;\
	 PCLMULQDQ $0x00, T2, T0;\
	 PXOR T0, ACCM
#define mulRound(i) \
	MOVOU (16*i)(SP), T0;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T2, ACC1;\
	PSHUFD $78, T0, T1;\
	PXOR T1, T0;\
	MOVOU (16*(i*2+1))(pTbl), T1;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ dst+8(FP), ctx
	MOVQ src_base+32(FP), ptx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	MOVOU (ctrPtr), B0
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	// T0 = counter block xor round key 0; slot 0 gets the current counter
	// and increment(0) immediately prepares the following one in place.
	PXOR B0, T0
	MOVOU T0, (8*16 + 0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesEncSingles
	SUBQ $128, ptxLen

	// We have at least 8 blocks to encrypt, prepare the rest of the counters
	MOVOU T0, (8*16 + 1*16)(SP)
	increment(1)
	MOVOU T0, (8*16 + 2*16)(SP)
	increment(2)
	MOVOU T0, (8*16 + 3*16)(SP)
	increment(3)
	MOVOU T0, (8*16 + 4*16)(SP)
	increment(4)
	MOVOU T0, (8*16 + 5*16)(SP)
	increment(5)
	MOVOU T0, (8*16 + 6*16)(SP)
	increment(6)
	MOVOU T0, (8*16 + 7*16)(SP)
	increment(7)

	MOVOU (8*16 + 0*16)(SP), B0
	MOVOU (8*16 + 1*16)(SP), B1
	MOVOU (8*16 + 2*16)(SP), B2
	MOVOU (8*16 + 3*16)(SP), B3
	MOVOU (8*16 + 4*16)(SP), B4
	MOVOU (8*16 + 5*16)(SP), B5
	MOVOU (8*16 + 6*16)(SP), B6
	MOVOU (8*16 + 7*16)(SP), B7

	// First batch: encrypt only (nothing to hash yet) while refreshing the
	// counter slots for the next batch.
	aesRound(1)
	increment(0)
	aesRound(2)
	increment(1)
	aesRound(3)
	increment(2)
	aesRound(4)
	increment(3)
	aesRound(5)
	increment(4)
	aesRound(6)
	increment(5)
	aesRound(7)
	increment(6)
	aesRound(8)
	increment(7)
	aesRound(9)
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast1
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE encLast1
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
encLast1:
	aesRndLast(T0)

	MOVOU (16*0)(ptx), T0
	PXOR T0, B0
	MOVOU (16*1)(ptx), T0
	PXOR T0, B1
	MOVOU (16*2)(ptx), T0
	PXOR T0, B2
	MOVOU (16*3)(ptx), T0
	PXOR T0, B3
	MOVOU (16*4)(ptx), T0
	PXOR T0, B4
	MOVOU (16*5)(ptx), T0
	PXOR T0, B5
	MOVOU (16*6)(ptx), T0
	PXOR T0, B6
	MOVOU (16*7)(ptx), T0
	PXOR T0, B7

	// Write the ciphertext, then keep a byte-reversed copy for hashing.
	// The running hash is folded into block 0 only.
	MOVOU B0, (16*0)(ctx)
	PSHUFB BSWAP, B0
	PXOR ACC0, B0
	MOVOU B1, (16*1)(ctx)
	PSHUFB BSWAP, B1
	MOVOU B2, (16*2)(ctx)
	PSHUFB BSWAP, B2
	MOVOU B3, (16*3)(ctx)
	PSHUFB BSWAP, B3
	MOVOU B4, (16*4)(ctx)
	PSHUFB BSWAP, B4
	MOVOU B5, (16*5)(ctx)
	PSHUFB BSWAP, B5
	MOVOU B6, (16*6)(ctx)
	PSHUFB BSWAP, B6
	MOVOU B7, (16*7)(ctx)
	PSHUFB BSWAP, B7

	MOVOU B0, (16*0)(SP)
	MOVOU B1, (16*1)(SP)
	MOVOU B2, (16*2)(SP)
	MOVOU B3, (16*3)(SP)
	MOVOU B4, (16*4)(SP)
	MOVOU B5, (16*5)(SP)
	MOVOU B6, (16*6)(SP)
	MOVOU B7, (16*7)(SP)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

gcmAesEncOctetsLoop:

		CMPQ ptxLen, $128
		JB gcmAesEncOctetsEnd
		SUBQ $128, ptxLen

		MOVOU (8*16 + 0*16)(SP), B0
		MOVOU (8*16 + 1*16)(SP), B1
		MOVOU (8*16 + 2*16)(SP), B2
		MOVOU (8*16 + 3*16)(SP), B3
		MOVOU (8*16 + 4*16)(SP), B4
		MOVOU (8*16 + 5*16)(SP), B5
		MOVOU (8*16 + 6*16)(SP), B6
		MOVOU (8*16 + 7*16)(SP), B7

		// Start hashing the previous batch with its block 0.
		MOVOU (16*0)(SP), T0
		PSHUFD $78, T0, T1
		PXOR T0, T1

		MOVOU (16*0)(pTbl), ACC0
		MOVOU (16*1)(pTbl), ACCM
		MOVOU ACC0, ACC1

		PCLMULQDQ $0x00, T1, ACCM
		PCLMULQDQ $0x00, T0, ACC0
		PCLMULQDQ $0x11, T0, ACC1

		// AES rounds 1..7 interleaved with hashing of saved blocks 1..7.
		combinedRound(1)
		increment(0)
		combinedRound(2)
		increment(1)
		combinedRound(3)
		increment(2)
		combinedRound(4)
		increment(3)
		combinedRound(5)
		increment(4)
		combinedRound(6)
		increment(5)
		combinedRound(7)
		increment(6)

		aesRound(8)
		increment(7)

		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0

		reduceRound(ACC0)
		aesRound(9)

		reduceRound(ACC0)
		PXOR ACC1, ACC0

		MOVOU (16*10)(ks), T0
		CMPQ NR, $12
		JB encLast2
		aesRnd(T0)
		aesRound(11)
		MOVOU (16*12)(ks), T0
		JE encLast2
		aesRnd(T0)
		aesRound(13)
		MOVOU (16*14)(ks), T0
encLast2:
		aesRndLast(T0)

		MOVOU (16*0)(ptx), T0
		PXOR T0, B0
		MOVOU (16*1)(ptx), T0
		PXOR T0, B1
		MOVOU (16*2)(ptx), T0
		PXOR T0, B2
		MOVOU (16*3)(ptx), T0
		PXOR T0, B3
		MOVOU (16*4)(ptx), T0
		PXOR T0, B4
		MOVOU (16*5)(ptx), T0
		PXOR T0, B5
		MOVOU (16*6)(ptx), T0
		PXOR T0, B6
		MOVOU (16*7)(ptx), T0
		PXOR T0, B7

		MOVOU B0, (16*0)(ctx)
		PSHUFB BSWAP, B0
		PXOR ACC0, B0
		MOVOU B1, (16*1)(ctx)
		PSHUFB BSWAP, B1
		MOVOU B2, (16*2)(ctx)
		PSHUFB BSWAP, B2
		MOVOU B3, (16*3)(ctx)
		PSHUFB BSWAP, B3
		MOVOU B4, (16*4)(ctx)
		PSHUFB BSWAP, B4
		MOVOU B5, (16*5)(ctx)
		PSHUFB BSWAP, B5
		MOVOU B6, (16*6)(ctx)
		PSHUFB BSWAP, B6
		MOVOU B7, (16*7)(ctx)
		PSHUFB BSWAP, B7

		MOVOU B0, (16*0)(SP)
		MOVOU B1, (16*1)(SP)
		MOVOU B2, (16*2)(SP)
		MOVOU B3, (16*3)(SP)
		MOVOU B4, (16*4)(SP)
		MOVOU B5, (16*5)(SP)
		MOVOU B6, (16*6)(SP)
		MOVOU B7, (16*7)(SP)

		LEAQ 128(ptx), ptx
		LEAQ 128(ctx), ctx

		JMP gcmAesEncOctetsLoop

gcmAesEncOctetsEnd:

	// Hash the last saved 8-block batch; no more AES work to overlap with.
	MOVOU (16*0)(SP), T0
	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1
	PSHUFD $78, T0, T1
	PXOR T0, T1
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1
	PCLMULQDQ $0x00, T1, ACCM

	mulRound(1)
	mulRound(2)
	mulRound(3)
	mulRound(4)
	mulRound(5)
	mulRound(6)
	mulRound(7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	// Slots 1..7 were pre-incremented for a batch that will not run; rewind
	// aluCTR so that slot 0 continues the sequence in the single-block path.
	SUBQ $7, aluCTR

gcmAesEncSingles:

	// Cache round keys 1..7 in B1..B7 and the single-block table entry in T2.
	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesEncSinglesLoop:

		CMPQ ptxLen, $16
		JB gcmAesEncTail
		SUBQ $16, ptxLen

		MOVOU (8*16 + 0*16)(SP), B0
		increment(0)

		AESENC B1, B0
		AESENC B2, B0
		AESENC B3, B0
		AESENC B4, B0
		AESENC B5, B0
		AESENC B6, B0
		AESENC B7, B0
		MOVOU (16*8)(ks), T0
		AESENC T0, B0
		MOVOU (16*9)(ks), T0
		AESENC T0, B0
		MOVOU (16*10)(ks), T0
		CMPQ NR, $12
		JB encLast3
		AESENC T0, B0
		MOVOU (16*11)(ks), T0
		AESENC T0, B0
		MOVOU (16*12)(ks), T0
		JE encLast3
		AESENC T0, B0
		MOVOU (16*13)(ks), T0
		AESENC T0, B0
		MOVOU (16*14)(ks), T0
encLast3:
		AESENCLAST T0, B0

		MOVOU (ptx), T0
		PXOR T0, B0
		MOVOU B0, (ctx)

		PSHUFB BSWAP, B0
		PXOR ACC0, B0

		MOVOU T2, ACC0
		MOVOU T2, ACC1
		MOVOU (16*15)(pTbl), ACCM

		PSHUFD $78, B0, T0
		PXOR B0, T0
		PCLMULQDQ $0x00, B0, ACC0
		PCLMULQDQ $0x11, B0, ACC1
		PCLMULQDQ $0x00, T0, ACCM

		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0

		reduceRound(ACC0)
		reduceRound(ACC0)
		PXOR ACC1, ACC0

		LEAQ (16*1)(ptx), ptx
		LEAQ (16*1)(ctx), ctx

	JMP gcmAesEncSinglesLoop

gcmAesEncTail:
	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	// 1..15 bytes remain: generate one more keystream block into T0.
	MOVOU (8*16 + 0*16)(SP), B0
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast4
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE encLast4
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
encLast4:
	AESENCLAST T0, B0
	MOVOU B0, T0

	// Point at the last plaintext byte; the loop below reads backwards.
	LEAQ -1(ptx)(ptxLen*1), ptx

	// T1 = andMask entry keeping the first ptxLen bytes. aluCTR is reused
	// as a plain pointer register here; the counter is no longer needed.
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP

	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	// Gather the remaining plaintext bytes one at a time so that nothing
	// past the end of src is read.
	PXOR B0, B0
ptxLoadLoop:
		PSLLDQ $1, B0
		PINSRB $0, (ptx), B0
		LEAQ -1(ptx), ptx
		DECQ ptxLen
	JNE ptxLoadLoop

	PXOR T0, B0
	PAND T1, B0
	// Full 16-byte store: this assumes dst always has room past the partial
	// block, because the tag is appended at the end of the ciphertext.
	MOVOU B0, (ctx)

	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

gcmAesEncDone:
	MOVOU ACC0, (tPtr)
	RET
#undef increment

// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
//
// gcmAesDec decrypts src into dst in counter mode and folds the ciphertext
// (read from src) into the running hash loaded from T, storing the updated
// hash back to T on exit. Only len(src) is consulted for the length.
//
// This routine reuses the register aliases and the aesRnd / aesRound /
// aesRndLast / reduceRound macros defined by earlier routines in this file.
// Note the roles here: ptx points into dst (plaintext out) and ctx points
// into src (ciphertext in).
//
// Stack frame (128 bytes): eight counter blocks already xor-ed with round
// key 0, at 0*16 .. 7*16. increment(i) bumps aluCTR and rewrites only the
// last dword of slot i. Unlike the encrypt side, no ciphertext copy is
// kept on the stack: combinedDecRound(i) hashes block i straight from ctx
// while applying AES round i to B0..B7.
TEXT ·gcmAesDec(SB),0,$128-96
#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
#define combinedDecRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	MOVOU (16*i)(ctx), T0;\
	PSHUFB BSWAP, T0;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PSHUFD $78, T0, T1;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T1, T0;\
	PXOR T2, ACC1;\
	MOVOU (16*(i*2+1))(pTbl), T2;\
	PCLMULQDQ $0x00, T2, T0;\
	PXOR T0, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ dst+8(FP), ptx
	MOVQ src_base+32(FP), ctx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	MOVOU (ctrPtr), B0
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	// T0 = counter block xor round key 0; slot 0 gets the current counter
	// and increment(0) immediately prepares the following one in place.
	PXOR B0, T0
	MOVOU T0, (0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesDecSingles

	// At least 8 blocks: prepare the remaining seven counter slots.
	MOVOU T0, (1*16)(SP)
	increment(1)
	MOVOU T0, (2*16)(SP)
	increment(2)
	MOVOU T0, (3*16)(SP)
	increment(3)
	MOVOU T0, (4*16)(SP)
	increment(4)
	MOVOU T0, (5*16)(SP)
	increment(5)
	MOVOU T0, (6*16)(SP)
	increment(6)
	MOVOU T0, (7*16)(SP)
	increment(7)

gcmAesDecOctetsLoop:

		CMPQ ptxLen, $128
		JB gcmAesDecEndOctets
		SUBQ $128, ptxLen

		MOVOU (0*16)(SP), B0
		MOVOU (1*16)(SP), B1
		MOVOU (2*16)(SP), B2
		MOVOU (3*16)(SP), B3
		MOVOU (4*16)(SP), B4
		MOVOU (5*16)(SP), B5
		MOVOU (6*16)(SP), B6
		MOVOU (7*16)(SP), B7

		// Hash ciphertext block 0 of this batch, with the running hash
		// folded in.
		MOVOU (16*0)(ctx), T0
		PSHUFB BSWAP, T0
		PXOR ACC0, T0
		PSHUFD $78, T0, T1
		PXOR T0, T1

		MOVOU (16*0)(pTbl), ACC0
		MOVOU (16*1)(pTbl), ACCM
		MOVOU ACC0, ACC1

		PCLMULQDQ $0x00, T1, ACCM
		PCLMULQDQ $0x00, T0, ACC0
		PCLMULQDQ $0x11, T0, ACC1

		// AES rounds 1..7 interleaved with hashing of ciphertext blocks 1..7.
		combinedDecRound(1)
		increment(0)
		combinedDecRound(2)
		increment(1)
		combinedDecRound(3)
		increment(2)
		combinedDecRound(4)
		increment(3)
		combinedDecRound(5)
		increment(4)
		combinedDecRound(6)
		increment(5)
		combinedDecRound(7)
		increment(6)

		aesRound(8)
		increment(7)

		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0

		reduceRound(ACC0)
		aesRound(9)

		reduceRound(ACC0)
		PXOR ACC1, ACC0

		MOVOU (16*10)(ks), T0
		CMPQ NR, $12
		JB decLast1
		aesRnd(T0)
		aesRound(11)
		MOVOU (16*12)(ks), T0
		// Still testing the flags set by the CMPQ above.
		JE decLast1
		aesRnd(T0)
		aesRound(13)
		MOVOU (16*14)(ks), T0
decLast1:
		aesRndLast(T0)

		MOVOU (16*0)(ctx), T0
		PXOR T0, B0
		MOVOU (16*1)(ctx), T0
		PXOR T0, B1
		MOVOU (16*2)(ctx), T0
		PXOR T0, B2
		MOVOU (16*3)(ctx), T0
		PXOR T0, B3
		MOVOU (16*4)(ctx), T0
		PXOR T0, B4
		MOVOU (16*5)(ctx), T0
		PXOR T0, B5
		MOVOU (16*6)(ctx), T0
		PXOR T0, B6
		MOVOU (16*7)(ctx), T0
		PXOR T0, B7

		MOVOU B0, (16*0)(ptx)
		MOVOU B1, (16*1)(ptx)
		MOVOU B2, (16*2)(ptx)
		MOVOU B3, (16*3)(ptx)
		MOVOU B4, (16*4)(ptx)
		MOVOU B5, (16*5)(ptx)
		MOVOU B6, (16*6)(ptx)
		MOVOU B7, (16*7)(ptx)

		LEAQ 128(ptx), ptx
		LEAQ 128(ctx), ctx

		JMP gcmAesDecOctetsLoop

gcmAesDecEndOctets:

	// Slots 1..7 were pre-incremented for a batch that will not run; rewind
	// aluCTR so that slot 0 continues the sequence in the single-block path.
	SUBQ $7, aluCTR

gcmAesDecSingles:

	// Cache round keys 1..7 in B1..B7 and the single-block table entry in T2.
	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesDecSinglesLoop:

		CMPQ ptxLen, $16
		JB gcmAesDecTail
		SUBQ $16, ptxLen

		// T1 keeps the raw ciphertext block for the final xor; B0 is used
		// for hashing first and is reloaded with the counter afterwards.
		MOVOU (ctx), B0
		MOVOU B0, T1
		PSHUFB BSWAP, B0
		PXOR ACC0, B0

		MOVOU T2, ACC0
		MOVOU T2, ACC1
		MOVOU (16*15)(pTbl), ACCM

		PCLMULQDQ $0x00, B0, ACC0
		PCLMULQDQ $0x11, B0, ACC1
		PSHUFD $78, B0, T0
		PXOR B0, T0
		PCLMULQDQ $0x00, T0, ACCM

		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0

		reduceRound(ACC0)
		reduceRound(ACC0)
		PXOR ACC1, ACC0

		MOVOU (0*16)(SP), B0
		increment(0)
		AESENC B1, B0
		AESENC B2, B0
		AESENC B3, B0
		AESENC B4, B0
		AESENC B5, B0
		AESENC B6, B0
		AESENC B7, B0
		MOVOU (16*8)(ks), T0
		AESENC T0, B0
		MOVOU (16*9)(ks), T0
		AESENC T0, B0
		MOVOU (16*10)(ks), T0
		CMPQ NR, $12
		JB decLast2
		AESENC T0, B0
		MOVOU (16*11)(ks), T0
		AESENC T0, B0
		MOVOU (16*12)(ks), T0
		JE decLast2
		AESENC T0, B0
		MOVOU (16*13)(ks), T0
		AESENC T0, B0
		MOVOU (16*14)(ks), T0
decLast2:
		AESENCLAST T0, B0

		PXOR T1, B0
		MOVOU B0, (ptx)

		LEAQ (16*1)(ptx), ptx
		LEAQ (16*1)(ctx), ctx

	JMP gcmAesDecSinglesLoop

gcmAesDecTail:

	TESTQ ptxLen, ptxLen
	JE gcmAesDecDone

	// T1 = andMask entry keeping the first ptxLen bytes. aluCTR is reused
	// as a plain pointer register here; increment(0) below still executes
	// on it, but the resulting counter value is never consumed.
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP
	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	// Full 16-byte load: this assumes the tag is attached after the
	// ciphertext in src, so the read does not run past the buffer.
	MOVOU (ctx), B0
	PAND T1, B0

	// T1 now holds the masked ciphertext for the final xor.
	MOVOU B0, T1
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (0*16)(SP), B0
	increment(0)
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast3
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE decLast3
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
decLast3:
	AESENCLAST T0, B0
	PXOR T1, B0

	// Store exactly ptxLen plaintext bytes, one at a time, so nothing past
	// the end of the partial block in dst is written.
ptxStoreLoop:
		PEXTRB $0, B0, (ptx)
		PSRLDQ $1, B0
		LEAQ 1(ptx), ptx
		DECQ ptxLen

	JNE ptxStoreLoop

gcmAesDecDone:

	MOVOU ACC0, (tPtr)
	RET
