#include "x86_arch.h"
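/*
 * GHASH for x86_64 in Win64 (SEH) form, generated from the CRYPTOGAMS
 * ghash-x86_64 module (see the attribution string near the end of this
 * file). The table-driven entry points save the caller's %rdi/%rsi and
 * move the Win64 argument registers (%rcx,%rdx,%r8,%r9) into the SysV
 * ones their bodies were written for; the CLMUL routines use the Win64
 * registers directly.
 */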
.text

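/*
 * gcm_gmult_4bit: Xi (arg1) *= H in GF(2^128), using the 4-bit
 * table-driven method. Htable (arg2) holds 16 precomputed multiples
 * of H; the .Lrem_4bit lookup folds the nibble shifted out on the
 * right back in, i.e. performs the reduction modulo the GHASH
 * polynomial x^128 + x^7 + x^2 + x + 1.
 */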
.globl	gcm_gmult_4bit
.def	gcm_gmult_4bit;	.scl 2;	.type 32;	.endef
.p2align	4
gcm_gmult_4bit:
	movq	%rdi,8(%rsp)
	movq	%rsi,16(%rsp)
	movq	%rsp,%rax
.LSEH_begin_gcm_gmult_4bit:
	movq	%rcx,%rdi
	movq	%rdx,%rsi

	pushq	%rbx
	pushq	%rbp
	pushq	%r12
.Lgmult_prologue:

	movzbq	15(%rdi),%r8
	leaq	.Lrem_4bit(%rip),%r11
	xorq	%rax,%rax
	xorq	%rbx,%rbx
	movb	%r8b,%al
	movb	%r8b,%bl
	shlb	$4,%al
	movq	$14,%rcx
	movq	8(%rsi,%rax,1),%r8
	movq	(%rsi,%rax,1),%r9
	andb	$240,%bl
	movq	%r8,%rdx
	jmp	.Loop1

.p2align	4
.Loop1:
	shrq	$4,%r8
	andq	$15,%rdx
	movq	%r9,%r10
	movb	(%rdi,%rcx,1),%al
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rbx,1),%r9
	movb	%al,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	shlb	$4,%al
	xorq	%r10,%r8
	decq	%rcx
	js	.Lbreak1

	shrq	$4,%r8
	andq	$15,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$240,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8
	jmp	.Loop1

.p2align	4
.Lbreak1:
	shrq	$4,%r8
	andq	$15,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$240,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8

	shrq	$4,%r8
	andq	$15,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rbx,1),%r9
	xorq	%r10,%r8
	xorq	(%r11,%rdx,8),%r9

	bswapq	%r8
	bswapq	%r9
	movq	%r8,8(%rdi)
	movq	%r9,(%rdi)

	movq	16(%rsp),%rbx
	leaq	24(%rsp),%rsp
.Lgmult_epilogue:
	movq	8(%rsp),%rdi
	movq	16(%rsp),%rsi
	retq
.LSEH_end_gcm_gmult_4bit:
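/*
 * gcm_ghash_4bit: hash `len` (arg4) bytes at `inp` (arg3) into Xi
 * (arg1): for each 16-byte block, Xi = (Xi ^ block) * H. This is the
 * byte-at-a-time ("8-bit") variant of the table method, so it first
 * builds a rescaled copy of Htable (arg2) on the stack and reduces
 * through .Lrem_8bit instead of .Lrem_4bit.
 */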
.globl	gcm_ghash_4bit
.def	gcm_ghash_4bit;	.scl 2;	.type 32;	.endef
.p2align	4
gcm_ghash_4bit:
	movq	%rdi,8(%rsp)
	movq	%rsi,16(%rsp)
	movq	%rsp,%rax
.LSEH_begin_gcm_ghash_4bit:
	movq	%rcx,%rdi
	movq	%rdx,%rsi
	movq	%r8,%rdx
	movq	%r9,%rcx

	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$280,%rsp
.Lghash_prologue:
	movq	%rdx,%r14
	movq	%rcx,%r15
	subq	$-128,%rsi
	leaq	16+128(%rsp),%rbp
	xorl	%edx,%edx
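/*
 * Unrolled setup: split each of the 16 Htable entries into its low
 * nibble (pre-shifted left by 4 and parked at 0..15(%rsp)) and its
 * value shifted right by 4 bits (the 256-byte table at %rbp +/- 128),
 * so the byte-wise loop below can index both with one nibble value.
 */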
	movq	0+0-128(%rsi),%r8
	movq	0+8-128(%rsi),%rax
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	16+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	16+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,0(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,0(%rbp)
	movq	32+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,0-128(%rbp)
	movq	32+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,1(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,8(%rbp)
	movq	48+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,8-128(%rbp)
	movq	48+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,2(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,16(%rbp)
	movq	64+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,16-128(%rbp)
	movq	64+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,3(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,24(%rbp)
	movq	80+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,24-128(%rbp)
	movq	80+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,4(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,32(%rbp)
	movq	96+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,32-128(%rbp)
	movq	96+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,5(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,40(%rbp)
	movq	112+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,40-128(%rbp)
	movq	112+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,6(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,48(%rbp)
	movq	128+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,48-128(%rbp)
	movq	128+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,7(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,56(%rbp)
	movq	144+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,56-128(%rbp)
	movq	144+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,8(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,64(%rbp)
	movq	160+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,64-128(%rbp)
	movq	160+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,9(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,72(%rbp)
	movq	176+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,72-128(%rbp)
	movq	176+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,10(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,80(%rbp)
	movq	192+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,80-128(%rbp)
	movq	192+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,11(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,88(%rbp)
	movq	208+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,88-128(%rbp)
	movq	208+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,12(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,96(%rbp)
	movq	224+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,96-128(%rbp)
	movq	224+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,13(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,104(%rbp)
	movq	240+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,104-128(%rbp)
	movq	240+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,14(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,112(%rbp)
	shlb	$4,%dl
	movq	%rax,112-128(%rbp)
	shlq	$60,%r10
	movb	%dl,15(%rsp)
	orq	%r10,%rbx
	movq	%r9,120(%rbp)
	movq	%rbx,120-128(%rbp)
	addq	$-128,%rsi
	movq	8(%rdi),%r8
	movq	0(%rdi),%r9
	addq	%r14,%r15
	leaq	.Lrem_8bit(%rip),%r11
	jmp	.Louter_loop
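/*
 * Main loop: XOR the next 16-byte input block into Xi, then run eight
 * byte-sized multiply steps per 64-bit half. %r12/%r13 collect the
 * byte shifted out on the right; .Lrem_8bit maps it to its 16-bit
 * polynomial remainder, which the shlq $48 folds back into the top.
 */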
.p2align	4
.Louter_loop:
	xorq	(%r14),%r9
	movq	8(%r14),%rdx
	leaq	16(%r14),%r14
	xorq	%r8,%rdx
	movq	%r9,(%rdi)
	movq	%rdx,8(%rdi)
	shrq	$32,%rdx
	xorq	%rax,%rax
	roll	$8,%edx
	movb	%dl,%al
	movzbl	%dl,%ebx
	shlb	$4,%al
	shrl	$4,%ebx
	roll	$8,%edx
	movq	8(%rsi,%rax,1),%r8
	movq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	xorq	%r8,%r12
	movq	%r9,%r10
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	8(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	4(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	0(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	andl	$240,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	-4(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	movzwq	(%r11,%r12,2),%r12
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	shlq	$48,%r12
	xorq	%r10,%r8
	xorq	%r12,%r9
	movzbq	%r8b,%r13
	shrq	$4,%r8
	movq	%r9,%r10
	shlb	$4,%r13b
	shrq	$4,%r9
	xorq	8(%rsi,%rcx,1),%r8
	movzwq	(%r11,%r13,2),%r13
	shlq	$60,%r10
	xorq	(%rsi,%rcx,1),%r9
	xorq	%r10,%r8
	shlq	$48,%r13
	bswapq	%r8
	xorq	%r13,%r9
	bswapq	%r9
	cmpq	%r15,%r14
	jb	.Louter_loop
	movq	%r8,8(%rdi)
	movq	%r9,(%rdi)

	leaq	280(%rsp),%rsi
	movq	0(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp
.Lghash_epilogue:
	movq	8(%rsp),%rdi
	movq	16(%rsp),%rsi
	retq
.LSEH_end_gcm_ghash_4bit:
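/*
 * gcm_init_clmul: PCLMULQDQ path. Reads H from (%rdx), shifts it left
 * by one bit modulo the GHASH polynomial (the pcmpgtd/pand against
 * .L0x1c2_polynomial handles the carried-out bit), then squares it and
 * stores H at (%rcx) and H^2 at 16(%rcx). The psllq 1/5/57 and
 * psrlq 5/1/1 sequences that follow each carry-less multiply in this
 * file implement the two-phase reduction modulo
 * x^128 + x^7 + x^2 + x + 1.
 */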
.globl	gcm_init_clmul
.def	gcm_init_clmul;	.scl 2;	.type 32;	.endef
.p2align	4
gcm_init_clmul:
	movdqu	(%rdx),%xmm2
	pshufd	$78,%xmm2,%xmm2


	pshufd	$255,%xmm2,%xmm4
	movdqa	%xmm2,%xmm3
	psllq	$1,%xmm2
	pxor	%xmm5,%xmm5
	psrlq	$63,%xmm3
	pcmpgtd	%xmm4,%xmm5
	pslldq	$8,%xmm3
	por	%xmm3,%xmm2


	pand	.L0x1c2_polynomial(%rip),%xmm5
	pxor	%xmm5,%xmm2


	movdqa	%xmm2,%xmm0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pshufd	$78,%xmm2,%xmm4
	pxor	%xmm0,%xmm3
	pxor	%xmm2,%xmm4
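/*
 * The .byte sequences below are PCLMULQDQ instructions emitted as raw
 * opcodes for old assemblers (e.g. 102,15,58,68,194,0 is
 * pclmulqdq $0x00,%xmm2,%xmm0); with the pshufd/pxor above they form
 * one Karatsuba-style 128x128-bit carry-less multiply (lo*lo, hi*hi,
 * and the cross term from the XORed halves).
 */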
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,220,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$5,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm4
	pslldq	$8,%xmm0
	psrldq	$8,%xmm4
	pxor	%xmm3,%xmm0
	pxor	%xmm4,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
	pxor	%xmm1,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
	movdqu	%xmm2,(%rcx)
	movdqu	%xmm0,16(%rcx)
	retq

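/*
 * gcm_gmult_clmul: one GHASH multiplication Xi = Xi * H via PCLMULQDQ.
 * Xi lives at (%rcx), Htable at (%rdx). .Lbswap_mask converts between
 * the big-endian byte order of Xi and the bit order the carry-less
 * multiply works in; the 102,15,56,0,197 byte sequence is
 * pshufb %xmm5,%xmm0.
 */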
.globl	gcm_gmult_clmul
.def	gcm_gmult_clmul;	.scl 2;	.type 32;	.endef
.p2align	4
gcm_gmult_clmul:
	movdqu	(%rcx),%xmm0
	movdqa	.Lbswap_mask(%rip),%xmm5
	movdqu	(%rdx),%xmm2
.byte	102,15,56,0,197
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pshufd	$78,%xmm2,%xmm4
	pxor	%xmm0,%xmm3
	pxor	%xmm2,%xmm4
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,220,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$5,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm4
	pslldq	$8,%xmm0
	psrldq	$8,%xmm4
	pxor	%xmm3,%xmm0
	pxor	%xmm4,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
	pxor	%xmm1,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
.byte	102,15,56,0,197
	movdqu	%xmm0,(%rcx)
	retq

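/*
 * gcm_ghash_clmul: PCLMULQDQ bulk hashing. Win64 arguments: Xi in
 * %rcx, Htable in %rdx, input in %r8, byte count in %r9. Processes
 * two blocks per .Lmod_loop iteration using H (%xmm2) and H^2 (%xmm8)
 * as stored by gcm_init_clmul, with single-block handling in
 * .Lodd_tail.
 */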
.globl	gcm_ghash_clmul
.def	gcm_ghash_clmul;	.scl 2;	.type 32;	.endef
.p2align	4
gcm_ghash_clmul:
.LSEH_begin_gcm_ghash_clmul:

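/*
 * Raw-opcode prologue: sub $0x58,%rsp followed by movaps saves of
 * %xmm6-%xmm10, which are non-volatile under the Win64 ABI (matched
 * by the unwind codes in .LSEH_info_gcm_ghash_clmul below).
 */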
.byte	0x48,0x83,0xec,0x58
.byte	0x0f,0x29,0x34,0x24
.byte	0x0f,0x29,0x7c,0x24,0x10
.byte	0x44,0x0f,0x29,0x44,0x24,0x20
.byte	0x44,0x0f,0x29,0x4c,0x24,0x30
.byte	0x44,0x0f,0x29,0x54,0x24,0x40
	movdqa	.Lbswap_mask(%rip),%xmm5

	movdqu	(%rcx),%xmm0
	movdqu	(%rdx),%xmm2
.byte	102,15,56,0,197

	subq	$16,%r9
	jz	.Lodd_tail

	movdqu	16(%rdx),%xmm8


	movdqu	(%r8),%xmm3
	movdqu	16(%r8),%xmm6
.byte	102,15,56,0,221
.byte	102,15,56,0,245
	pxor	%xmm3,%xmm0
	movdqa	%xmm6,%xmm7
	pshufd	$78,%xmm6,%xmm3
	pshufd	$78,%xmm2,%xmm4
	pxor	%xmm6,%xmm3
	pxor	%xmm2,%xmm4
.byte	102,15,58,68,242,0
.byte	102,15,58,68,250,17
.byte	102,15,58,68,220,0
	pxor	%xmm6,%xmm3
	pxor	%xmm7,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm7
	pxor	%xmm4,%xmm6
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pshufd	$78,%xmm8,%xmm4
	pxor	%xmm0,%xmm3
	pxor	%xmm8,%xmm4

	leaq	32(%r8),%r8
	subq	$32,%r9
	jbe	.Leven_tail

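/*
 * Two blocks per iteration: block n is multiplied by H^2 (%xmm8,
 * pclmulqdq encoded as 102,65,15,58,68,...) and block n+1 by H
 * (%xmm2), while the previous product is reduced; the pclmulqdq
 * opcodes are interleaved with the shift/xor reduction to hide
 * instruction latency.
 */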
.Lmod_loop:
.byte	102,65,15,58,68,192,0
.byte	102,65,15,58,68,200,17
.byte	102,15,58,68,220,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0
	movdqu	(%r8),%xmm3
	pxor	%xmm6,%xmm0
	pxor	%xmm7,%xmm1

	movdqu	16(%r8),%xmm6
.byte	102,15,56,0,221
.byte	102,15,56,0,245

	movdqa	%xmm6,%xmm7
	pshufd	$78,%xmm6,%xmm9
	pshufd	$78,%xmm2,%xmm10
	pxor	%xmm6,%xmm9
	pxor	%xmm2,%xmm10
	pxor	%xmm3,%xmm1

	movdqa	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$5,%xmm0
	pxor	%xmm3,%xmm0
.byte	102,15,58,68,242,0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm4
	pslldq	$8,%xmm0
	psrldq	$8,%xmm4
	pxor	%xmm3,%xmm0
	pxor	%xmm4,%xmm1

.byte	102,15,58,68,250,17
	movdqa	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
	pxor	%xmm1,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0

.byte	102,69,15,58,68,202,0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pshufd	$78,%xmm8,%xmm4
	pxor	%xmm0,%xmm3
	pxor	%xmm8,%xmm4

	pxor	%xmm6,%xmm9
	pxor	%xmm7,%xmm9
	movdqa	%xmm9,%xmm10
	psrldq	$8,%xmm9
	pslldq	$8,%xmm10
	pxor	%xmm9,%xmm7
	pxor	%xmm10,%xmm6

	leaq	32(%r8),%r8
	subq	$32,%r9
	ja	.Lmod_loop

.Leven_tail:
.byte	102,65,15,58,68,192,0
.byte	102,65,15,58,68,200,17
.byte	102,15,58,68,220,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0
	pxor	%xmm6,%xmm0
	pxor	%xmm7,%xmm1

	movdqa	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$5,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm4
	pslldq	$8,%xmm0
	psrldq	$8,%xmm4
	pxor	%xmm3,%xmm0
	pxor	%xmm4,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
	pxor	%xmm1,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
	testq	%r9,%r9
	jnz	.Ldone

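/* A single block remains: Xi = (Xi ^ block) * H, then reduce. */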
.Lodd_tail:
	movdqu	(%r8),%xmm3
.byte	102,15,56,0,221
	pxor	%xmm3,%xmm0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pshufd	$78,%xmm2,%xmm4
	pxor	%xmm0,%xmm3
	pxor	%xmm2,%xmm4
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,220,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$5,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm4
	pslldq	$8,%xmm0
	psrldq	$8,%xmm4
	pxor	%xmm3,%xmm0
	pxor	%xmm4,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
	pxor	%xmm1,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
.Ldone:
.byte	102,15,56,0,197
	movdqu	%xmm0,(%rcx)
	movaps	(%rsp),%xmm6
	movaps	16(%rsp),%xmm7
	movaps	32(%rsp),%xmm8
	movaps	48(%rsp),%xmm9
	movaps	64(%rsp),%xmm10
	addq	$88,%rsp
	retq
.LSEH_end_gcm_ghash_clmul:

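/*
 * Constants: .Lbswap_mask is the pshufb pattern that byte-reverses a
 * 128-bit lane; .L0x1c2_polynomial encodes the GHASH reduction
 * constant (0xc2 in the top byte, 1 in the bottom) used by
 * gcm_init_clmul.
 */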
.p2align	6
.Lbswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.p2align	6

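/*
 * .Lrem_4bit: for each 4-bit value shifted off the low end of the
 * accumulator, the corresponding polynomial remainder (a GF(2)
 * multiple of 0x1c2) pre-positioned in the top bits of a 64-bit word.
 */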
.Lrem_4bit:
.long	0,0,0,471859200,0,943718400,0,610271232
.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160

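/*
 * .Lrem_8bit: the same idea at byte granularity; entry i is evidently
 * the 16-bit remainder contributed by byte value i (a GF(2) multiple
 * of 0x1c2), consumed via movzwq and shifted into the top 16 bits
 * (shlq $48) by the hash loop above.
 */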
.Lrem_8bit:
.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align	6

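/*
 * se_handler: Win64 structured-exception handler shared by the two
 * 4-bit routines. If the fault lies between a function's prologue and
 * epilogue labels, it recovers %rbx/%rbp/%r12 from the stack frame and
 * the caller's %rdi/%rsi from their home slots, then hands the rest of
 * the unwind to RtlVirtualUnwind.
 */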
.def	se_handler;	.scl 3;	.type 32;	.endef
.p2align	4
se_handler:
	pushq	%rsi
	pushq	%rdi
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushfq
	subq	$64,%rsp

	movq	120(%r8),%rax
	movq	248(%r8),%rbx

	movq	8(%r9),%rsi
	movq	56(%r9),%r11

	movl	0(%r11),%r10d
	leaq	(%rsi,%r10,1),%r10
	cmpq	%r10,%rbx
	jb	.Lin_prologue

	movq	152(%r8),%rax

	movl	4(%r11),%r10d
	leaq	(%rsi,%r10,1),%r10
	cmpq	%r10,%rbx
	jae	.Lin_prologue

	leaq	24(%rax),%rax

	movq	-8(%rax),%rbx
	movq	-16(%rax),%rbp
	movq	-24(%rax),%r12
	movq	%rbx,144(%r8)
	movq	%rbp,160(%r8)
	movq	%r12,216(%r8)

.Lin_prologue:
	movq	8(%rax),%rdi
	movq	16(%rax),%rsi
	movq	%rax,152(%r8)
	movq	%rsi,168(%r8)
	movq	%rdi,176(%r8)

	movq	40(%r9),%rdi
	movq	%r8,%rsi
	movl	$154,%ecx
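/*
 * .long 0xa548f3fc assembles to cld; rep movsq: copy the 154-quadword
 * CONTEXT record into the dispatcher's context before calling
 * RtlVirtualUnwind.
 */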
.long	0xa548f3fc

	movq	%r9,%rsi
	xorq	%rcx,%rcx
	movq	8(%rsi),%rdx
	movq	0(%rsi),%r8
	movq	16(%rsi),%r9
	movq	40(%rsi),%r10
	leaq	56(%rsi),%r11
	leaq	24(%rsi),%r12
	movq	%r10,32(%rsp)
	movq	%r11,40(%rsp)
	movq	%r12,48(%rsp)
	movq	%rcx,56(%rsp)
	call	*__imp_RtlVirtualUnwind(%rip)

	movl	$1,%eax
	addq	$64,%rsp
	popfq
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	popq	%rdi
	popq	%rsi
	retq


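/*
 * Win64 unwind metadata: .pdata maps each function's code range to its
 * unwind info in .xdata. The 4-bit routines point at se_handler, while
 * gcm_ghash_clmul uses plain UNWIND_CODE entries describing the
 * sub $0x58,%rsp and the five movaps saves of %xmm6-%xmm10.
 */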
.section	.pdata
.p2align	2
.rva	.LSEH_begin_gcm_gmult_4bit
.rva	.LSEH_end_gcm_gmult_4bit
.rva	.LSEH_info_gcm_gmult_4bit

.rva	.LSEH_begin_gcm_ghash_4bit
.rva	.LSEH_end_gcm_ghash_4bit
.rva	.LSEH_info_gcm_ghash_4bit

.rva	.LSEH_begin_gcm_ghash_clmul
.rva	.LSEH_end_gcm_ghash_clmul
.rva	.LSEH_info_gcm_ghash_clmul

.section	.xdata
.p2align	3
.LSEH_info_gcm_gmult_4bit:
.byte	9,0,0,0
.rva	se_handler
.rva	.Lgmult_prologue,.Lgmult_epilogue
.LSEH_info_gcm_ghash_4bit:
.byte	9,0,0,0
.rva	se_handler
.rva	.Lghash_prologue,.Lghash_epilogue
.LSEH_info_gcm_ghash_clmul:
.byte	0x01,0x1f,0x0b,0x00
.byte	0x1f,0xa8,0x04,0x00
.byte	0x19,0x98,0x03,0x00
.byte	0x13,0x88,0x02,0x00
.byte	0x0d,0x78,0x01,0x00
.byte	0x08,0x68,0x00,0x00
.byte	0x04,0xa2,0x00,0x00
