1#include "x86_arch.h"
2.text
3
.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,@function
.align	16
/*
 * void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
 *
 * ABI: System V AMD64.  %rdi = Xi (16-byte hash value, stored
 * big-endian), %rsi = Htable (16-byte entries indexed by nibble*16).
 *
 * Multiplies Xi by H in GF(2^128) using the 4-bit windowed table
 * method: Xi is consumed one byte at a time from offset 15 down to 0,
 * with two Htable lookups per byte; the four bits shifted out of the
 * low end on each step are folded back in through .Lrem_4bit.
 */
gcm_gmult_4bit:
	pushq	%rbx
	pushq	%rbp			/* %rbp and %r12 are saved but never */
	pushq	%r12			/* modified below; only %rbx is live  */
.Lgmult_prologue:

	movzbq	15(%rdi),%r8		/* r8 = last byte of Xi */
	leaq	.Lrem_4bit(%rip),%r11	/* r11 = reduction table base */
	xorq	%rax,%rax
	xorq	%rbx,%rbx
	movb	%r8b,%al
	movb	%r8b,%bl
	shlb	$4,%al			/* al = (byte & 15) * 16: low-nibble index */
	movq	$14,%rcx		/* byte counter: 14 down to 0 */
	movq	8(%rsi,%rax,1),%r8	/* {r9:r8} = Htable[low nibble] */
	movq	(%rsi,%rax,1),%r9
	andb	$240,%bl		/* bl = (byte >> 4) * 16: high-nibble index */
	movq	%r8,%rdx
	jmp	.Loop1

.align	16
.Loop1:
	/* Shift the accumulator right 4 bits, fold in Htable[high nibble]
	   and the .Lrem_4bit term for the bits shifted out, then load the
	   next input byte; the second half (below) does the same for the
	   low nibble. */
	shrq	$4,%r8
	andq	$15,%rdx		/* rdx = bits shifted out of r8 */
	movq	%r9,%r10
	movb	(%rdi,%rcx,1),%al	/* fetch next byte of Xi */
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10		/* carry from high half into low half */
	xorq	(%rsi,%rbx,1),%r9
	movb	%al,%bl
	xorq	(%r11,%rdx,8),%r9	/* reduction via .Lrem_4bit */
	movq	%r8,%rdx
	shlb	$4,%al
	xorq	%r10,%r8
	decq	%rcx
	js	.Lbreak1		/* all 16 bytes consumed */

	shrq	$4,%r8
	andq	$15,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8	/* fold in Htable[low nibble] */
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$240,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8
	jmp	.Loop1

.align	16
.Lbreak1:
	/* Final byte: same two half-steps as the loop, but no further
	   input byte is loaded. */
	shrq	$4,%r8
	andq	$15,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rax,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rax,1),%r9
	andb	$240,%bl
	xorq	(%r11,%rdx,8),%r9
	movq	%r8,%rdx
	xorq	%r10,%r8

	shrq	$4,%r8
	andq	$15,%rdx
	movq	%r9,%r10
	shrq	$4,%r9
	xorq	8(%rsi,%rbx,1),%r8
	shlq	$60,%r10
	xorq	(%rsi,%rbx,1),%r9
	xorq	%r10,%r8
	xorq	(%r11,%rdx,8),%r9

	bswapq	%r8
	bswapq	%r9
	movq	%r8,8(%rdi)		/* store Xi back big-endian */
	movq	%r9,(%rdi)

	movq	16(%rsp),%rbx		/* restore %rbx (rbp/r12 untouched) */
	leaq	24(%rsp),%rsp		/* discard the three saved slots */
.Lgmult_epilogue:
	retq
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
.globl	gcm_ghash_4bit
.type	gcm_ghash_4bit,@function
.align	16
/*
 * void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
 *                     const u8 *inp, size_t len)
 *
 * ABI: System V AMD64.  %rdi = Xi, %rsi = Htable, %rdx = inp,
 * %rcx = len (presumably a multiple of 16 — TODO confirm against
 * callers; the loop consumes exactly 16 bytes per iteration).
 *
 * GHASH over `len` bytes of input: for each 16-byte block,
 * Xi = (Xi ^ block) * H, computed with 8-bit windows.  The prologue
 * builds, on the stack, a 4-bit-pre-shifted copy of Htable plus a
 * 16-byte nibble scratch table so that each input byte needs only a
 * handful of table lookups, with reduction through .Lrem_8bit.
 */
gcm_ghash_4bit:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$280,%rsp		/* 16B nibble scratch + 256B shifted Htable + pad */
.Lghash_prologue:
	movq	%rdx,%r14		/* r14 = inp */
	movq	%rcx,%r15		/* r15 = len (becomes end pointer below) */
	subq	$-128,%rsi		/* bias Htable pointer by +128 (shorter disp8) */
	leaq	16+128(%rsp),%rbp	/* rbp = middle of the shifted-table area */
	xorl	%edx,%edx
	/*
	 * Table preparation, fully unrolled for i = 0..15:
	 * each 128-bit Htable[i] is shifted right by 4 bits and stored at
	 * %rbp-128+8i (low half) / %rbp+8i (high half); the nibble shifted
	 * out of the low end, pre-multiplied by 16, is stored at i(%rsp).
	 */
	movq	0+0-128(%rsi),%r8
	movq	0+8-128(%rsi),%rax
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	16+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	16+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,0(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,0(%rbp)
	movq	32+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,0-128(%rbp)
	movq	32+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,1(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,8(%rbp)
	movq	48+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,8-128(%rbp)
	movq	48+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,2(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,16(%rbp)
	movq	64+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,16-128(%rbp)
	movq	64+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,3(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,24(%rbp)
	movq	80+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,24-128(%rbp)
	movq	80+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,4(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,32(%rbp)
	movq	96+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,32-128(%rbp)
	movq	96+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,5(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,40(%rbp)
	movq	112+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,40-128(%rbp)
	movq	112+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,6(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,48(%rbp)
	movq	128+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,48-128(%rbp)
	movq	128+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,7(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,56(%rbp)
	movq	144+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,56-128(%rbp)
	movq	144+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,8(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,64(%rbp)
	movq	160+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,64-128(%rbp)
	movq	160+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,9(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,72(%rbp)
	movq	176+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,72-128(%rbp)
	movq	176+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,10(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,80(%rbp)
	movq	192+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,80-128(%rbp)
	movq	192+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,11(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,88(%rbp)
	movq	208+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,88-128(%rbp)
	movq	208+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,12(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,96(%rbp)
	movq	224+0-128(%rsi),%r8
	shlb	$4,%dl
	movq	%rax,96-128(%rbp)
	movq	224+8-128(%rsi),%rax
	shlq	$60,%r10
	movb	%dl,13(%rsp)
	orq	%r10,%rbx
	movb	%al,%dl
	shrq	$4,%rax
	movq	%r8,%r10
	shrq	$4,%r8
	movq	%r9,104(%rbp)
	movq	240+0-128(%rsi),%r9
	shlb	$4,%dl
	movq	%rbx,104-128(%rbp)
	movq	240+8-128(%rsi),%rbx
	shlq	$60,%r10
	movb	%dl,14(%rsp)
	orq	%r10,%rax
	movb	%bl,%dl
	shrq	$4,%rbx
	movq	%r9,%r10
	shrq	$4,%r9
	movq	%r8,112(%rbp)
	shlb	$4,%dl
	movq	%rax,112-128(%rbp)
	shlq	$60,%r10
	movb	%dl,15(%rsp)
	orq	%r10,%rbx
	movq	%r9,120(%rbp)
	movq	%rbx,120-128(%rbp)
	addq	$-128,%rsi		/* undo the +128 bias on Htable */
	movq	8(%rdi),%r8		/* {r9:r8} = current Xi */
	movq	0(%rdi),%r9
	addq	%r14,%r15		/* r15 = inp + len (end pointer) */
	leaq	.Lrem_8bit(%rip),%r11	/* r11 = 8-bit reduction table */
	jmp	.Louter_loop
.align	16
.Louter_loop:
	/*
	 * One 16-byte block per iteration: Xi ^= inp[0..15] (stored back
	 * to (%rdi) so later dword reloads of %edx pick the bytes up),
	 * then an unrolled byte-at-a-time multiply by H.  Each byte step
	 * does a low-nibble Htable lookup (via %rax), a high-nibble lookup
	 * into the pre-shifted stack table (via %rbx/%rcx and %rbp), and a
	 * .Lrem_8bit lookup (via %r12/%r13) for the byte shifted out.
	 */
	xorq	(%r14),%r9
	movq	8(%r14),%rdx
	leaq	16(%r14),%r14		/* advance input pointer */
	xorq	%r8,%rdx
	movq	%r9,(%rdi)
	movq	%rdx,8(%rdi)
	shrq	$32,%rdx
	xorq	%rax,%rax
	roll	$8,%edx
	movb	%dl,%al
	movzbl	%dl,%ebx
	shlb	$4,%al
	shrl	$4,%ebx
	roll	$8,%edx
	movq	8(%rsi,%rax,1),%r8
	movq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	xorq	%r8,%r12
	movq	%r9,%r10
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	8(%rdi),%edx		/* next dword of (Xi ^ block) */
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	4(%rdi),%edx		/* next dword of (Xi ^ block) */
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	0(%rdi),%edx		/* last dword of (Xi ^ block) */
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	shrl	$4,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r12,2),%r12
	movzbl	%dl,%ebx
	shlb	$4,%al
	movzbq	(%rsp,%rcx,1),%r13
	shrl	$4,%ebx
	shlq	$48,%r12
	xorq	%r8,%r13
	movq	%r9,%r10
	xorq	%r12,%r9
	shrq	$8,%r8
	movzbq	%r13b,%r13
	shrq	$8,%r9
	xorq	-128(%rbp,%rcx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rcx,8),%r9
	roll	$8,%edx
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	movb	%dl,%al
	xorq	%r10,%r8
	movzwq	(%r11,%r13,2),%r13
	movzbl	%dl,%ecx
	shlb	$4,%al
	movzbq	(%rsp,%rbx,1),%r12
	andl	$240,%ecx
	shlq	$48,%r13
	xorq	%r8,%r12
	movq	%r9,%r10
	xorq	%r13,%r9
	shrq	$8,%r8
	movzbq	%r12b,%r12
	movl	-4(%rdi),%edx
	shrq	$8,%r9
	xorq	-128(%rbp,%rbx,8),%r8
	shlq	$56,%r10
	xorq	(%rbp,%rbx,8),%r9
	movzwq	(%r11,%r12,2),%r12
	xorq	8(%rsi,%rax,1),%r8
	xorq	(%rsi,%rax,1),%r9
	shlq	$48,%r12
	xorq	%r10,%r8
	xorq	%r12,%r9
	movzbq	%r8b,%r13
	shrq	$4,%r8
	movq	%r9,%r10
	shlb	$4,%r13b
	shrq	$4,%r9
	xorq	8(%rsi,%rcx,1),%r8	/* final high-nibble fold-in */
	movzwq	(%r11,%r13,2),%r13
	shlq	$60,%r10
	xorq	(%rsi,%rcx,1),%r9
	xorq	%r10,%r8
	shlq	$48,%r13
	bswapq	%r8
	xorq	%r13,%r9
	bswapq	%r9
	cmpq	%r15,%r14		/* more input? */
	jb	.Louter_loop
	movq	%r8,8(%rdi)		/* store final Xi big-endian */
	movq	%r9,(%rdi)

	/* Restore callee-saved registers (saved above the 280-byte frame). */
	leaq	280(%rsp),%rsi
	movq	0(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp
.Lghash_epilogue:
	retq
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
.globl	gcm_init_clmul
.type	gcm_init_clmul,@function
.align	16
/*
 * void gcm_init_clmul(u128 Htable[], const u64 H[2])
 *
 * ABI: System V AMD64.  %rdi = output table, %rsi = raw hash key H.
 *
 * PCLMULQDQ setup: computes H' = H<<1 reduced by the GHASH polynomial
 * (the .L0x1c2_polynomial constant) and stores it at (%rdi), then
 * squares it with a Karatsuba carry-less multiply and stores H'^2 at
 * 16(%rdi).  PCLMULQDQ/PSHUFB are emitted as raw .byte sequences
 * (perlasm style, for old assemblers); decodings noted per line.
 */
gcm_init_clmul:
	movdqu	(%rsi),%xmm2
	pshufd	$78,%xmm2,%xmm2		/* swap the two 64-bit halves */


	pshufd	$255,%xmm2,%xmm4	/* broadcast most-significant dword */
	movdqa	%xmm2,%xmm3
	psllq	$1,%xmm2		/* H <<= 1 ... */
	pxor	%xmm5,%xmm5
	psrlq	$63,%xmm3
	pcmpgtd	%xmm4,%xmm5		/* xmm5 = mask if top bit of H was set */
	pslldq	$8,%xmm3
	por	%xmm3,%xmm2		/* ... carrying across the halves */


	pand	.L0x1c2_polynomial(%rip),%xmm5
	pxor	%xmm5,%xmm2		/* conditional reduction: xmm2 = H' */


	/* Karatsuba carry-less square: (xmm1:xmm0) = xmm0 * xmm2. */
	movdqa	%xmm2,%xmm0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pshufd	$78,%xmm2,%xmm4
	pxor	%xmm0,%xmm3		/* xmm3 = lo(a) ^ hi(a) */
	pxor	%xmm2,%xmm4		/* xmm4 = lo(b) ^ hi(b) */
.byte	102,15,58,68,194,0		/* pclmulqdq $0x00,%xmm2,%xmm0 — low  */
.byte	102,15,58,68,202,17		/* pclmulqdq $0x11,%xmm2,%xmm1 — high */
.byte	102,15,58,68,220,0		/* pclmulqdq $0x00,%xmm4,%xmm3 — mid  */
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3		/* xmm3 = middle term */

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1		/* fold middle term into the */
	pxor	%xmm4,%xmm0		/* 256-bit product xmm1:xmm0 */

	/* Reduce xmm1:xmm0 modulo the GHASH polynomial (shift-and-xor). */
	movdqa	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$5,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm4
	pslldq	$8,%xmm0
	psrldq	$8,%xmm4
	pxor	%xmm3,%xmm0
	pxor	%xmm4,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
	pxor	%xmm1,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
	movdqu	%xmm2,(%rdi)		/* store H'   */
	movdqu	%xmm0,16(%rdi)		/* store H'^2 */
	retq
.size	gcm_init_clmul,.-gcm_init_clmul
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,@function
.align	16
/*
 * void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[])
 *
 * ABI: System V AMD64.  %rdi = Xi (big-endian), %rsi = table from
 * gcm_init_clmul (H at offset 0).
 *
 * Xi = Xi * H via one Karatsuba PCLMULQDQ multiply followed by the
 * shift-and-xor GHASH reduction.  Input/output are byte-swapped with
 * .Lbswap_mask.  .byte sequences are raw PCLMULQDQ/PSHUFB encodings.
 */
gcm_gmult_clmul:
	movdqu	(%rdi),%xmm0
	movdqa	.Lbswap_mask(%rip),%xmm5
	movdqu	(%rsi),%xmm2
.byte	102,15,56,0,197			/* pshufb %xmm5,%xmm0 — to little-endian */
	/* Karatsuba multiply: (xmm1:xmm0) = xmm0 * xmm2. */
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pshufd	$78,%xmm2,%xmm4
	pxor	%xmm0,%xmm3
	pxor	%xmm2,%xmm4
.byte	102,15,58,68,194,0		/* pclmulqdq $0x00,%xmm2,%xmm0 — low  */
.byte	102,15,58,68,202,17		/* pclmulqdq $0x11,%xmm2,%xmm1 — high */
.byte	102,15,58,68,220,0		/* pclmulqdq $0x00,%xmm4,%xmm3 — mid  */
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3		/* xmm3 = middle term */

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1		/* fold middle term into */
	pxor	%xmm4,%xmm0		/* 256-bit product xmm1:xmm0 */

	/* Reduce xmm1:xmm0 modulo the GHASH polynomial. */
	movdqa	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$5,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm4
	pslldq	$8,%xmm0
	psrldq	$8,%xmm4
	pxor	%xmm3,%xmm0
	pxor	%xmm4,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
	pxor	%xmm1,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
.byte	102,15,56,0,197			/* pshufb %xmm5,%xmm0 — back to big-endian */
	movdqu	%xmm0,(%rdi)
	retq
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,@function
.align	16
/*
 * void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[],
 *                      const u8 *inp, size_t len)
 *
 * ABI: System V AMD64.  %rdi = Xi, %rsi = table from gcm_init_clmul
 * (H at 0, H^2 at 16), %rdx = inp, %rcx = len.
 *
 * GHASH via PCLMULQDQ.  The main path (.Lmod_loop) consumes 32 bytes
 * per iteration, multiplying by H^2 (xmm8) and H (xmm2) so two blocks
 * are folded per round, with the reduction interleaved between the
 * carry-less multiplies; .Leven_tail / .Lodd_tail finish the last two
 * or one block(s).  .byte sequences are raw PCLMULQDQ/PSHUFB encodings.
 */
gcm_ghash_clmul:
	movdqa	.Lbswap_mask(%rip),%xmm5

	movdqu	(%rdi),%xmm0		/* xmm0 = Xi */
	movdqu	(%rsi),%xmm2		/* xmm2 = H */
.byte	102,15,56,0,197			/* pshufb %xmm5,%xmm0 — to little-endian */

	subq	$16,%rcx
	jz	.Lodd_tail		/* exactly one block */

	movdqu	16(%rsi),%xmm8		/* xmm8 = H^2 */




	/* First double block: Xi ^= inp[0], start (inp[1] * H) into
	   xmm7:xmm6 and set up Karatsuba operands for (Xi * H^2). */
	movdqu	(%rdx),%xmm3
	movdqu	16(%rdx),%xmm6
.byte	102,15,56,0,221			/* pshufb %xmm5,%xmm3 */
.byte	102,15,56,0,245			/* pshufb %xmm5,%xmm6 */
	pxor	%xmm3,%xmm0		/* Xi ^= first block */
	movdqa	%xmm6,%xmm7
	pshufd	$78,%xmm6,%xmm3
	pshufd	$78,%xmm2,%xmm4
	pxor	%xmm6,%xmm3
	pxor	%xmm2,%xmm4
.byte	102,15,58,68,242,0		/* pclmulqdq $0x00,%xmm2,%xmm6 — low  */
.byte	102,15,58,68,250,17		/* pclmulqdq $0x11,%xmm2,%xmm7 — high */
.byte	102,15,58,68,220,0		/* pclmulqdq $0x00,%xmm4,%xmm3 — mid  */
	pxor	%xmm6,%xmm3
	pxor	%xmm7,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm7		/* xmm7:xmm6 = inp[1] * H (unreduced) */
	pxor	%xmm4,%xmm6
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pshufd	$78,%xmm8,%xmm4
	pxor	%xmm0,%xmm3
	pxor	%xmm8,%xmm4

	leaq	32(%rdx),%rdx
	subq	$32,%rcx
	jbe	.Leven_tail

.Lmod_loop:
	/* (xmm1:xmm0) = Xi * H^2, folded together with the previous
	   iteration's xmm7:xmm6 product; reduction of the result is
	   interleaved with the multiplies for the next double block. */
.byte	102,65,15,58,68,192,0		/* pclmulqdq $0x00,%xmm8,%xmm0 */
.byte	102,65,15,58,68,200,17		/* pclmulqdq $0x11,%xmm8,%xmm1 */
.byte	102,15,58,68,220,0		/* pclmulqdq $0x00,%xmm4,%xmm3 */
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0
	movdqu	(%rdx),%xmm3		/* next pair of input blocks */
	pxor	%xmm6,%xmm0		/* accumulate (prev block * H) */
	pxor	%xmm7,%xmm1

	movdqu	16(%rdx),%xmm6
.byte	102,15,56,0,221			/* pshufb %xmm5,%xmm3 */
.byte	102,15,56,0,245			/* pshufb %xmm5,%xmm6 */

	movdqa	%xmm6,%xmm7
	pshufd	$78,%xmm6,%xmm9
	pshufd	$78,%xmm2,%xmm10
	pxor	%xmm6,%xmm9
	pxor	%xmm2,%xmm10
	pxor	%xmm3,%xmm1		/* fold first new block into high half */

	/* Reduction, interleaved with the (new block * H) multiply. */
	movdqa	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$5,%xmm0
	pxor	%xmm3,%xmm0
.byte	102,15,58,68,242,0		/* pclmulqdq $0x00,%xmm2,%xmm6 */
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm4
	pslldq	$8,%xmm0
	psrldq	$8,%xmm4
	pxor	%xmm3,%xmm0
	pxor	%xmm4,%xmm1

.byte	102,15,58,68,250,17		/* pclmulqdq $0x11,%xmm2,%xmm7 */
	movdqa	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
	pxor	%xmm1,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0		/* xmm0 = reduced Xi */

.byte	102,69,15,58,68,202,0		/* pclmulqdq $0x00,%xmm10,%xmm9 */
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pshufd	$78,%xmm8,%xmm4
	pxor	%xmm0,%xmm3
	pxor	%xmm8,%xmm4

	pxor	%xmm6,%xmm9
	pxor	%xmm7,%xmm9
	movdqa	%xmm9,%xmm10
	psrldq	$8,%xmm9
	pslldq	$8,%xmm10
	pxor	%xmm9,%xmm7		/* xmm7:xmm6 = new block * H */
	pxor	%xmm10,%xmm6

	leaq	32(%rdx),%rdx
	subq	$32,%rcx
	ja	.Lmod_loop

.Leven_tail:
	/* Final double block: multiply, fold, reduce — no new input. */
.byte	102,65,15,58,68,192,0		/* pclmulqdq $0x00,%xmm8,%xmm0 */
.byte	102,65,15,58,68,200,17		/* pclmulqdq $0x11,%xmm8,%xmm1 */
.byte	102,15,58,68,220,0		/* pclmulqdq $0x00,%xmm4,%xmm3 */
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0
	pxor	%xmm6,%xmm0
	pxor	%xmm7,%xmm1

	movdqa	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$5,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm4
	pslldq	$8,%xmm0
	psrldq	$8,%xmm4
	pxor	%xmm3,%xmm0
	pxor	%xmm4,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
	pxor	%xmm1,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
	testq	%rcx,%rcx
	jnz	.Ldone			/* len was even in blocks: finished */

.Lodd_tail:
	/* One remaining block: Xi = (Xi ^ block) * H, then reduce. */
	movdqu	(%rdx),%xmm3
.byte	102,15,56,0,221			/* pshufb %xmm5,%xmm3 */
	pxor	%xmm3,%xmm0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pshufd	$78,%xmm2,%xmm4
	pxor	%xmm0,%xmm3
	pxor	%xmm2,%xmm4
.byte	102,15,58,68,194,0		/* pclmulqdq $0x00,%xmm2,%xmm0 */
.byte	102,15,58,68,202,17		/* pclmulqdq $0x11,%xmm2,%xmm1 */
.byte	102,15,58,68,220,0		/* pclmulqdq $0x00,%xmm4,%xmm3 */
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$5,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm4
	pslldq	$8,%xmm0
	psrldq	$8,%xmm4
	pxor	%xmm3,%xmm0
	pxor	%xmm4,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
	pxor	%xmm1,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm0
.Ldone:
.byte	102,15,56,0,197			/* pshufb %xmm5,%xmm0 — back to big-endian */
	movdqu	%xmm0,(%rdi)
	retq
.LSEH_end_gcm_ghash_clmul:
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
.align	64
/* PSHUFB mask reversing all 16 bytes (big-endian <-> little-endian). */
.Lbswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
/* GHASH reduction constant (the 0xc2... polynomial), used by the
   CLMUL key setup above. */
.L0x1c2_polynomial:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.align	64
.type	.Lrem_4bit,@object
/* Remainder table for the 4-bit method: 16 entries of {u64 lo, u64 hi},
   indexed by the nibble shifted out of the accumulator. */
.Lrem_4bit:
.long	0,0,0,471859200,0,943718400,0,610271232
.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
.type	.Lrem_8bit,@object
/* Remainder table for the 8-bit method: 256 16-bit entries, indexed by
   the byte shifted out of the accumulator in gcm_ghash_4bit. */
.Lrem_8bit:
.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

/* ASCII banner: "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>" */
.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	64
1028#if defined(HAVE_GNU_STACK)
1029.section .note.GNU-stack,"",%progbits
1030#endif
1031