1.text
2
3
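# GHASH for x86_64: GF(2^128) multiply/accumulate routines for AES-GCM
# (CRYPTOGAMS-style generated code; see the .byte identification string at
# the end of the file).  Approximate C prototypes, with names following
# OpenSSL's GCM code and argument order matching the register usage below:
#   void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
#   void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp, size_t len);
#
# gcm_gmult_4bit: Xi := Xi * H using the 4-bit table-lookup method and the
# .Lrem_4bit reduction table.  %rdi = Xi (updated in place), %rsi = Htable.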
4.globl	gcm_gmult_4bit
5.type	gcm_gmult_4bit,@function
6.align	16
7gcm_gmult_4bit:
8.cfi_startproc
9	pushq	%rbx
10.cfi_adjust_cfa_offset	8
11.cfi_offset	%rbx,-16
12	pushq	%rbp
13.cfi_adjust_cfa_offset	8
14.cfi_offset	%rbp,-24
15	pushq	%r12
16.cfi_adjust_cfa_offset	8
17.cfi_offset	%r12,-32
18	pushq	%r13
19.cfi_adjust_cfa_offset	8
20.cfi_offset	%r13,-40
21	pushq	%r14
22.cfi_adjust_cfa_offset	8
23.cfi_offset	%r14,-48
24	pushq	%r15
25.cfi_adjust_cfa_offset	8
26.cfi_offset	%r15,-56
27	subq	$280,%rsp
28.cfi_adjust_cfa_offset	280
29.Lgmult_prologue:
30
31	movzbq	15(%rdi),%r8
32	leaq	.Lrem_4bit(%rip),%r11
33	xorq	%rax,%rax
34	xorq	%rbx,%rbx
35	movb	%r8b,%al
36	movb	%r8b,%bl
37	shlb	$4,%al
38	movq	$14,%rcx
39	movq	8(%rsi,%rax,1),%r8
40	movq	(%rsi,%rax,1),%r9
41	andb	$0xf0,%bl
42	movq	%r8,%rdx
43	jmp	.Loop1
44
45.align	16
46.Loop1:
47	shrq	$4,%r8
48	andq	$0xf,%rdx
49	movq	%r9,%r10
50	movb	(%rdi,%rcx,1),%al
51	shrq	$4,%r9
52	xorq	8(%rsi,%rbx,1),%r8
53	shlq	$60,%r10
54	xorq	(%rsi,%rbx,1),%r9
55	movb	%al,%bl
56	xorq	(%r11,%rdx,8),%r9
57	movq	%r8,%rdx
58	shlb	$4,%al
59	xorq	%r10,%r8
60	decq	%rcx
61	js	.Lbreak1
62
63	shrq	$4,%r8
64	andq	$0xf,%rdx
65	movq	%r9,%r10
66	shrq	$4,%r9
67	xorq	8(%rsi,%rax,1),%r8
68	shlq	$60,%r10
69	xorq	(%rsi,%rax,1),%r9
70	andb	$0xf0,%bl
71	xorq	(%r11,%rdx,8),%r9
72	movq	%r8,%rdx
73	xorq	%r10,%r8
74	jmp	.Loop1
75
76.align	16
77.Lbreak1:
78	shrq	$4,%r8
79	andq	$0xf,%rdx
80	movq	%r9,%r10
81	shrq	$4,%r9
82	xorq	8(%rsi,%rax,1),%r8
83	shlq	$60,%r10
84	xorq	(%rsi,%rax,1),%r9
85	andb	$0xf0,%bl
86	xorq	(%r11,%rdx,8),%r9
87	movq	%r8,%rdx
88	xorq	%r10,%r8
89
90	shrq	$4,%r8
91	andq	$0xf,%rdx
92	movq	%r9,%r10
93	shrq	$4,%r9
94	xorq	8(%rsi,%rbx,1),%r8
95	shlq	$60,%r10
96	xorq	(%rsi,%rbx,1),%r9
97	xorq	%r10,%r8
98	xorq	(%r11,%rdx,8),%r9
99
100	bswapq	%r8
101	bswapq	%r9
102	movq	%r8,8(%rdi)
103	movq	%r9,(%rdi)
104
105	leaq	280+48(%rsp),%rsi
106.cfi_def_cfa	%rsi,8
107	movq	-8(%rsi),%rbx
108.cfi_restore	%rbx
109	leaq	(%rsi),%rsp
110.cfi_def_cfa_register	%rsp
111.Lgmult_epilogue:
112	.byte	0xf3,0xc3
113.cfi_endproc
114.size	gcm_gmult_4bit,.-gcm_gmult_4bit
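# gcm_ghash_4bit: fold 'len' bytes of input into Xi, one 16-byte block at a
# time: Xi := (Xi ^ block) * H.  The prologue pre-shifts and rearranges Htable
# into a scratch area on the stack; reduction uses the .Lrem_8bit table.
#   %rdi = Xi, %rsi = Htable, %rdx = inp, %rcx = len (a multiple of 16 bytes)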
115.globl	gcm_ghash_4bit
116.type	gcm_ghash_4bit,@function
117.align	16
118gcm_ghash_4bit:
119.cfi_startproc
120	pushq	%rbx
121.cfi_adjust_cfa_offset	8
122.cfi_offset	%rbx,-16
123	pushq	%rbp
124.cfi_adjust_cfa_offset	8
125.cfi_offset	%rbp,-24
126	pushq	%r12
127.cfi_adjust_cfa_offset	8
128.cfi_offset	%r12,-32
129	pushq	%r13
130.cfi_adjust_cfa_offset	8
131.cfi_offset	%r13,-40
132	pushq	%r14
133.cfi_adjust_cfa_offset	8
134.cfi_offset	%r14,-48
135	pushq	%r15
136.cfi_adjust_cfa_offset	8
137.cfi_offset	%r15,-56
138	subq	$280,%rsp
139.cfi_adjust_cfa_offset	280
140.Lghash_prologue:
141	movq	%rdx,%r14
142	movq	%rcx,%r15
143	subq	$-128,%rsi
144	leaq	16+128(%rsp),%rbp
145	xorl	%edx,%edx
146	movq	0+0-128(%rsi),%r8
147	movq	0+8-128(%rsi),%rax
148	movb	%al,%dl
149	shrq	$4,%rax
150	movq	%r8,%r10
151	shrq	$4,%r8
152	movq	16+0-128(%rsi),%r9
153	shlb	$4,%dl
154	movq	16+8-128(%rsi),%rbx
155	shlq	$60,%r10
156	movb	%dl,0(%rsp)
157	orq	%r10,%rax
158	movb	%bl,%dl
159	shrq	$4,%rbx
160	movq	%r9,%r10
161	shrq	$4,%r9
162	movq	%r8,0(%rbp)
163	movq	32+0-128(%rsi),%r8
164	shlb	$4,%dl
165	movq	%rax,0-128(%rbp)
166	movq	32+8-128(%rsi),%rax
167	shlq	$60,%r10
168	movb	%dl,1(%rsp)
169	orq	%r10,%rbx
170	movb	%al,%dl
171	shrq	$4,%rax
172	movq	%r8,%r10
173	shrq	$4,%r8
174	movq	%r9,8(%rbp)
175	movq	48+0-128(%rsi),%r9
176	shlb	$4,%dl
177	movq	%rbx,8-128(%rbp)
178	movq	48+8-128(%rsi),%rbx
179	shlq	$60,%r10
180	movb	%dl,2(%rsp)
181	orq	%r10,%rax
182	movb	%bl,%dl
183	shrq	$4,%rbx
184	movq	%r9,%r10
185	shrq	$4,%r9
186	movq	%r8,16(%rbp)
187	movq	64+0-128(%rsi),%r8
188	shlb	$4,%dl
189	movq	%rax,16-128(%rbp)
190	movq	64+8-128(%rsi),%rax
191	shlq	$60,%r10
192	movb	%dl,3(%rsp)
193	orq	%r10,%rbx
194	movb	%al,%dl
195	shrq	$4,%rax
196	movq	%r8,%r10
197	shrq	$4,%r8
198	movq	%r9,24(%rbp)
199	movq	80+0-128(%rsi),%r9
200	shlb	$4,%dl
201	movq	%rbx,24-128(%rbp)
202	movq	80+8-128(%rsi),%rbx
203	shlq	$60,%r10
204	movb	%dl,4(%rsp)
205	orq	%r10,%rax
206	movb	%bl,%dl
207	shrq	$4,%rbx
208	movq	%r9,%r10
209	shrq	$4,%r9
210	movq	%r8,32(%rbp)
211	movq	96+0-128(%rsi),%r8
212	shlb	$4,%dl
213	movq	%rax,32-128(%rbp)
214	movq	96+8-128(%rsi),%rax
215	shlq	$60,%r10
216	movb	%dl,5(%rsp)
217	orq	%r10,%rbx
218	movb	%al,%dl
219	shrq	$4,%rax
220	movq	%r8,%r10
221	shrq	$4,%r8
222	movq	%r9,40(%rbp)
223	movq	112+0-128(%rsi),%r9
224	shlb	$4,%dl
225	movq	%rbx,40-128(%rbp)
226	movq	112+8-128(%rsi),%rbx
227	shlq	$60,%r10
228	movb	%dl,6(%rsp)
229	orq	%r10,%rax
230	movb	%bl,%dl
231	shrq	$4,%rbx
232	movq	%r9,%r10
233	shrq	$4,%r9
234	movq	%r8,48(%rbp)
235	movq	128+0-128(%rsi),%r8
236	shlb	$4,%dl
237	movq	%rax,48-128(%rbp)
238	movq	128+8-128(%rsi),%rax
239	shlq	$60,%r10
240	movb	%dl,7(%rsp)
241	orq	%r10,%rbx
242	movb	%al,%dl
243	shrq	$4,%rax
244	movq	%r8,%r10
245	shrq	$4,%r8
246	movq	%r9,56(%rbp)
247	movq	144+0-128(%rsi),%r9
248	shlb	$4,%dl
249	movq	%rbx,56-128(%rbp)
250	movq	144+8-128(%rsi),%rbx
251	shlq	$60,%r10
252	movb	%dl,8(%rsp)
253	orq	%r10,%rax
254	movb	%bl,%dl
255	shrq	$4,%rbx
256	movq	%r9,%r10
257	shrq	$4,%r9
258	movq	%r8,64(%rbp)
259	movq	160+0-128(%rsi),%r8
260	shlb	$4,%dl
261	movq	%rax,64-128(%rbp)
262	movq	160+8-128(%rsi),%rax
263	shlq	$60,%r10
264	movb	%dl,9(%rsp)
265	orq	%r10,%rbx
266	movb	%al,%dl
267	shrq	$4,%rax
268	movq	%r8,%r10
269	shrq	$4,%r8
270	movq	%r9,72(%rbp)
271	movq	176+0-128(%rsi),%r9
272	shlb	$4,%dl
273	movq	%rbx,72-128(%rbp)
274	movq	176+8-128(%rsi),%rbx
275	shlq	$60,%r10
276	movb	%dl,10(%rsp)
277	orq	%r10,%rax
278	movb	%bl,%dl
279	shrq	$4,%rbx
280	movq	%r9,%r10
281	shrq	$4,%r9
282	movq	%r8,80(%rbp)
283	movq	192+0-128(%rsi),%r8
284	shlb	$4,%dl
285	movq	%rax,80-128(%rbp)
286	movq	192+8-128(%rsi),%rax
287	shlq	$60,%r10
288	movb	%dl,11(%rsp)
289	orq	%r10,%rbx
290	movb	%al,%dl
291	shrq	$4,%rax
292	movq	%r8,%r10
293	shrq	$4,%r8
294	movq	%r9,88(%rbp)
295	movq	208+0-128(%rsi),%r9
296	shlb	$4,%dl
297	movq	%rbx,88-128(%rbp)
298	movq	208+8-128(%rsi),%rbx
299	shlq	$60,%r10
300	movb	%dl,12(%rsp)
301	orq	%r10,%rax
302	movb	%bl,%dl
303	shrq	$4,%rbx
304	movq	%r9,%r10
305	shrq	$4,%r9
306	movq	%r8,96(%rbp)
307	movq	224+0-128(%rsi),%r8
308	shlb	$4,%dl
309	movq	%rax,96-128(%rbp)
310	movq	224+8-128(%rsi),%rax
311	shlq	$60,%r10
312	movb	%dl,13(%rsp)
313	orq	%r10,%rbx
314	movb	%al,%dl
315	shrq	$4,%rax
316	movq	%r8,%r10
317	shrq	$4,%r8
318	movq	%r9,104(%rbp)
319	movq	240+0-128(%rsi),%r9
320	shlb	$4,%dl
321	movq	%rbx,104-128(%rbp)
322	movq	240+8-128(%rsi),%rbx
323	shlq	$60,%r10
324	movb	%dl,14(%rsp)
325	orq	%r10,%rax
326	movb	%bl,%dl
327	shrq	$4,%rbx
328	movq	%r9,%r10
329	shrq	$4,%r9
330	movq	%r8,112(%rbp)
331	shlb	$4,%dl
332	movq	%rax,112-128(%rbp)
333	shlq	$60,%r10
334	movb	%dl,15(%rsp)
335	orq	%r10,%rbx
336	movq	%r9,120(%rbp)
337	movq	%rbx,120-128(%rbp)
338	addq	$-128,%rsi
339	movq	8(%rdi),%r8
340	movq	0(%rdi),%r9
341	addq	%r14,%r15
342	leaq	.Lrem_8bit(%rip),%r11
343	jmp	.Louter_loop
344.align	16
345.Louter_loop:
346	xorq	(%r14),%r9
347	movq	8(%r14),%rdx
348	leaq	16(%r14),%r14
349	xorq	%r8,%rdx
350	movq	%r9,(%rdi)
351	movq	%rdx,8(%rdi)
352	shrq	$32,%rdx
353	xorq	%rax,%rax
354	roll	$8,%edx
355	movb	%dl,%al
356	movzbl	%dl,%ebx
357	shlb	$4,%al
358	shrl	$4,%ebx
359	roll	$8,%edx
360	movq	8(%rsi,%rax,1),%r8
361	movq	(%rsi,%rax,1),%r9
362	movb	%dl,%al
363	movzbl	%dl,%ecx
364	shlb	$4,%al
365	movzbq	(%rsp,%rbx,1),%r12
366	shrl	$4,%ecx
367	xorq	%r8,%r12
368	movq	%r9,%r10
369	shrq	$8,%r8
370	movzbq	%r12b,%r12
371	shrq	$8,%r9
372	xorq	-128(%rbp,%rbx,8),%r8
373	shlq	$56,%r10
374	xorq	(%rbp,%rbx,8),%r9
375	roll	$8,%edx
376	xorq	8(%rsi,%rax,1),%r8
377	xorq	(%rsi,%rax,1),%r9
378	movb	%dl,%al
379	xorq	%r10,%r8
380	movzwq	(%r11,%r12,2),%r12
381	movzbl	%dl,%ebx
382	shlb	$4,%al
383	movzbq	(%rsp,%rcx,1),%r13
384	shrl	$4,%ebx
385	shlq	$48,%r12
386	xorq	%r8,%r13
387	movq	%r9,%r10
388	xorq	%r12,%r9
389	shrq	$8,%r8
390	movzbq	%r13b,%r13
391	shrq	$8,%r9
392	xorq	-128(%rbp,%rcx,8),%r8
393	shlq	$56,%r10
394	xorq	(%rbp,%rcx,8),%r9
395	roll	$8,%edx
396	xorq	8(%rsi,%rax,1),%r8
397	xorq	(%rsi,%rax,1),%r9
398	movb	%dl,%al
399	xorq	%r10,%r8
400	movzwq	(%r11,%r13,2),%r13
401	movzbl	%dl,%ecx
402	shlb	$4,%al
403	movzbq	(%rsp,%rbx,1),%r12
404	shrl	$4,%ecx
405	shlq	$48,%r13
406	xorq	%r8,%r12
407	movq	%r9,%r10
408	xorq	%r13,%r9
409	shrq	$8,%r8
410	movzbq	%r12b,%r12
411	movl	8(%rdi),%edx
412	shrq	$8,%r9
413	xorq	-128(%rbp,%rbx,8),%r8
414	shlq	$56,%r10
415	xorq	(%rbp,%rbx,8),%r9
416	roll	$8,%edx
417	xorq	8(%rsi,%rax,1),%r8
418	xorq	(%rsi,%rax,1),%r9
419	movb	%dl,%al
420	xorq	%r10,%r8
421	movzwq	(%r11,%r12,2),%r12
422	movzbl	%dl,%ebx
423	shlb	$4,%al
424	movzbq	(%rsp,%rcx,1),%r13
425	shrl	$4,%ebx
426	shlq	$48,%r12
427	xorq	%r8,%r13
428	movq	%r9,%r10
429	xorq	%r12,%r9
430	shrq	$8,%r8
431	movzbq	%r13b,%r13
432	shrq	$8,%r9
433	xorq	-128(%rbp,%rcx,8),%r8
434	shlq	$56,%r10
435	xorq	(%rbp,%rcx,8),%r9
436	roll	$8,%edx
437	xorq	8(%rsi,%rax,1),%r8
438	xorq	(%rsi,%rax,1),%r9
439	movb	%dl,%al
440	xorq	%r10,%r8
441	movzwq	(%r11,%r13,2),%r13
442	movzbl	%dl,%ecx
443	shlb	$4,%al
444	movzbq	(%rsp,%rbx,1),%r12
445	shrl	$4,%ecx
446	shlq	$48,%r13
447	xorq	%r8,%r12
448	movq	%r9,%r10
449	xorq	%r13,%r9
450	shrq	$8,%r8
451	movzbq	%r12b,%r12
452	shrq	$8,%r9
453	xorq	-128(%rbp,%rbx,8),%r8
454	shlq	$56,%r10
455	xorq	(%rbp,%rbx,8),%r9
456	roll	$8,%edx
457	xorq	8(%rsi,%rax,1),%r8
458	xorq	(%rsi,%rax,1),%r9
459	movb	%dl,%al
460	xorq	%r10,%r8
461	movzwq	(%r11,%r12,2),%r12
462	movzbl	%dl,%ebx
463	shlb	$4,%al
464	movzbq	(%rsp,%rcx,1),%r13
465	shrl	$4,%ebx
466	shlq	$48,%r12
467	xorq	%r8,%r13
468	movq	%r9,%r10
469	xorq	%r12,%r9
470	shrq	$8,%r8
471	movzbq	%r13b,%r13
472	shrq	$8,%r9
473	xorq	-128(%rbp,%rcx,8),%r8
474	shlq	$56,%r10
475	xorq	(%rbp,%rcx,8),%r9
476	roll	$8,%edx
477	xorq	8(%rsi,%rax,1),%r8
478	xorq	(%rsi,%rax,1),%r9
479	movb	%dl,%al
480	xorq	%r10,%r8
481	movzwq	(%r11,%r13,2),%r13
482	movzbl	%dl,%ecx
483	shlb	$4,%al
484	movzbq	(%rsp,%rbx,1),%r12
485	shrl	$4,%ecx
486	shlq	$48,%r13
487	xorq	%r8,%r12
488	movq	%r9,%r10
489	xorq	%r13,%r9
490	shrq	$8,%r8
491	movzbq	%r12b,%r12
492	movl	4(%rdi),%edx
493	shrq	$8,%r9
494	xorq	-128(%rbp,%rbx,8),%r8
495	shlq	$56,%r10
496	xorq	(%rbp,%rbx,8),%r9
497	roll	$8,%edx
498	xorq	8(%rsi,%rax,1),%r8
499	xorq	(%rsi,%rax,1),%r9
500	movb	%dl,%al
501	xorq	%r10,%r8
502	movzwq	(%r11,%r12,2),%r12
503	movzbl	%dl,%ebx
504	shlb	$4,%al
505	movzbq	(%rsp,%rcx,1),%r13
506	shrl	$4,%ebx
507	shlq	$48,%r12
508	xorq	%r8,%r13
509	movq	%r9,%r10
510	xorq	%r12,%r9
511	shrq	$8,%r8
512	movzbq	%r13b,%r13
513	shrq	$8,%r9
514	xorq	-128(%rbp,%rcx,8),%r8
515	shlq	$56,%r10
516	xorq	(%rbp,%rcx,8),%r9
517	roll	$8,%edx
518	xorq	8(%rsi,%rax,1),%r8
519	xorq	(%rsi,%rax,1),%r9
520	movb	%dl,%al
521	xorq	%r10,%r8
522	movzwq	(%r11,%r13,2),%r13
523	movzbl	%dl,%ecx
524	shlb	$4,%al
525	movzbq	(%rsp,%rbx,1),%r12
526	shrl	$4,%ecx
527	shlq	$48,%r13
528	xorq	%r8,%r12
529	movq	%r9,%r10
530	xorq	%r13,%r9
531	shrq	$8,%r8
532	movzbq	%r12b,%r12
533	shrq	$8,%r9
534	xorq	-128(%rbp,%rbx,8),%r8
535	shlq	$56,%r10
536	xorq	(%rbp,%rbx,8),%r9
537	roll	$8,%edx
538	xorq	8(%rsi,%rax,1),%r8
539	xorq	(%rsi,%rax,1),%r9
540	movb	%dl,%al
541	xorq	%r10,%r8
542	movzwq	(%r11,%r12,2),%r12
543	movzbl	%dl,%ebx
544	shlb	$4,%al
545	movzbq	(%rsp,%rcx,1),%r13
546	shrl	$4,%ebx
547	shlq	$48,%r12
548	xorq	%r8,%r13
549	movq	%r9,%r10
550	xorq	%r12,%r9
551	shrq	$8,%r8
552	movzbq	%r13b,%r13
553	shrq	$8,%r9
554	xorq	-128(%rbp,%rcx,8),%r8
555	shlq	$56,%r10
556	xorq	(%rbp,%rcx,8),%r9
557	roll	$8,%edx
558	xorq	8(%rsi,%rax,1),%r8
559	xorq	(%rsi,%rax,1),%r9
560	movb	%dl,%al
561	xorq	%r10,%r8
562	movzwq	(%r11,%r13,2),%r13
563	movzbl	%dl,%ecx
564	shlb	$4,%al
565	movzbq	(%rsp,%rbx,1),%r12
566	shrl	$4,%ecx
567	shlq	$48,%r13
568	xorq	%r8,%r12
569	movq	%r9,%r10
570	xorq	%r13,%r9
571	shrq	$8,%r8
572	movzbq	%r12b,%r12
573	movl	0(%rdi),%edx
574	shrq	$8,%r9
575	xorq	-128(%rbp,%rbx,8),%r8
576	shlq	$56,%r10
577	xorq	(%rbp,%rbx,8),%r9
578	roll	$8,%edx
579	xorq	8(%rsi,%rax,1),%r8
580	xorq	(%rsi,%rax,1),%r9
581	movb	%dl,%al
582	xorq	%r10,%r8
583	movzwq	(%r11,%r12,2),%r12
584	movzbl	%dl,%ebx
585	shlb	$4,%al
586	movzbq	(%rsp,%rcx,1),%r13
587	shrl	$4,%ebx
588	shlq	$48,%r12
589	xorq	%r8,%r13
590	movq	%r9,%r10
591	xorq	%r12,%r9
592	shrq	$8,%r8
593	movzbq	%r13b,%r13
594	shrq	$8,%r9
595	xorq	-128(%rbp,%rcx,8),%r8
596	shlq	$56,%r10
597	xorq	(%rbp,%rcx,8),%r9
598	roll	$8,%edx
599	xorq	8(%rsi,%rax,1),%r8
600	xorq	(%rsi,%rax,1),%r9
601	movb	%dl,%al
602	xorq	%r10,%r8
603	movzwq	(%r11,%r13,2),%r13
604	movzbl	%dl,%ecx
605	shlb	$4,%al
606	movzbq	(%rsp,%rbx,1),%r12
607	shrl	$4,%ecx
608	shlq	$48,%r13
609	xorq	%r8,%r12
610	movq	%r9,%r10
611	xorq	%r13,%r9
612	shrq	$8,%r8
613	movzbq	%r12b,%r12
614	shrq	$8,%r9
615	xorq	-128(%rbp,%rbx,8),%r8
616	shlq	$56,%r10
617	xorq	(%rbp,%rbx,8),%r9
618	roll	$8,%edx
619	xorq	8(%rsi,%rax,1),%r8
620	xorq	(%rsi,%rax,1),%r9
621	movb	%dl,%al
622	xorq	%r10,%r8
623	movzwq	(%r11,%r12,2),%r12
624	movzbl	%dl,%ebx
625	shlb	$4,%al
626	movzbq	(%rsp,%rcx,1),%r13
627	shrl	$4,%ebx
628	shlq	$48,%r12
629	xorq	%r8,%r13
630	movq	%r9,%r10
631	xorq	%r12,%r9
632	shrq	$8,%r8
633	movzbq	%r13b,%r13
634	shrq	$8,%r9
635	xorq	-128(%rbp,%rcx,8),%r8
636	shlq	$56,%r10
637	xorq	(%rbp,%rcx,8),%r9
638	roll	$8,%edx
639	xorq	8(%rsi,%rax,1),%r8
640	xorq	(%rsi,%rax,1),%r9
641	movb	%dl,%al
642	xorq	%r10,%r8
643	movzwq	(%r11,%r13,2),%r13
644	movzbl	%dl,%ecx
645	shlb	$4,%al
646	movzbq	(%rsp,%rbx,1),%r12
647	andl	$240,%ecx
648	shlq	$48,%r13
649	xorq	%r8,%r12
650	movq	%r9,%r10
651	xorq	%r13,%r9
652	shrq	$8,%r8
653	movzbq	%r12b,%r12
654	movl	-4(%rdi),%edx
655	shrq	$8,%r9
656	xorq	-128(%rbp,%rbx,8),%r8
657	shlq	$56,%r10
658	xorq	(%rbp,%rbx,8),%r9
659	movzwq	(%r11,%r12,2),%r12
660	xorq	8(%rsi,%rax,1),%r8
661	xorq	(%rsi,%rax,1),%r9
662	shlq	$48,%r12
663	xorq	%r10,%r8
664	xorq	%r12,%r9
665	movzbq	%r8b,%r13
666	shrq	$4,%r8
667	movq	%r9,%r10
668	shlb	$4,%r13b
669	shrq	$4,%r9
670	xorq	8(%rsi,%rcx,1),%r8
671	movzwq	(%r11,%r13,2),%r13
672	shlq	$60,%r10
673	xorq	(%rsi,%rcx,1),%r9
674	xorq	%r10,%r8
675	shlq	$48,%r13
676	bswapq	%r8
677	xorq	%r13,%r9
678	bswapq	%r9
679	cmpq	%r15,%r14
680	jb	.Louter_loop
681	movq	%r8,8(%rdi)
682	movq	%r9,(%rdi)
683
684	leaq	280+48(%rsp),%rsi
685.cfi_def_cfa	%rsi,8
686	movq	-48(%rsi),%r15
687.cfi_restore	%r15
688	movq	-40(%rsi),%r14
689.cfi_restore	%r14
690	movq	-32(%rsi),%r13
691.cfi_restore	%r13
692	movq	-24(%rsi),%r12
693.cfi_restore	%r12
694	movq	-16(%rsi),%rbp
695.cfi_restore	%rbp
696	movq	-8(%rsi),%rbx
697.cfi_restore	%rbx
698	leaq	0(%rsi),%rsp
699.cfi_def_cfa_register	%rsp
700.Lghash_epilogue:
701	.byte	0xf3,0xc3
702.cfi_endproc
703.size	gcm_ghash_4bit,.-gcm_ghash_4bit
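# gcm_init_clmul: build the key schedule for the PCLMULQDQ path.
#   %rdi = Htable (output), %rsi = H (input, 16 bytes)
# H is first brought into the representation used by the carry-less multiply
# (shifted left by one bit and conditionally reduced with .L0x1c2_polynomial);
# H, H^2 and a packed Karatsuba helper are stored at 0/16/32(%rdi), and H^3,
# H^4 and their helper at 48/64/80(%rdi).  The .byte 102,15,58,68,... sequences
# are pclmulqdq instructions emitted as raw opcodes.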
704.globl	gcm_init_clmul
705.type	gcm_init_clmul,@function
706.align	16
707gcm_init_clmul:
708.cfi_startproc
709.L_init_clmul:
710	movdqu	(%rsi),%xmm2
711	pshufd	$78,%xmm2,%xmm2
712
713
714	pshufd	$255,%xmm2,%xmm4
715	movdqa	%xmm2,%xmm3
716	psllq	$1,%xmm2
717	pxor	%xmm5,%xmm5
718	psrlq	$63,%xmm3
719	pcmpgtd	%xmm4,%xmm5
720	pslldq	$8,%xmm3
721	por	%xmm3,%xmm2
722
723
724	pand	.L0x1c2_polynomial(%rip),%xmm5
725	pxor	%xmm5,%xmm2
726
727
728	pshufd	$78,%xmm2,%xmm6
729	movdqa	%xmm2,%xmm0
730	pxor	%xmm2,%xmm6
731	movdqa	%xmm0,%xmm1
732	pshufd	$78,%xmm0,%xmm3
733	pxor	%xmm0,%xmm3
734.byte	102,15,58,68,194,0
735.byte	102,15,58,68,202,17
736.byte	102,15,58,68,222,0
737	pxor	%xmm0,%xmm3
738	pxor	%xmm1,%xmm3
739
740	movdqa	%xmm3,%xmm4
741	psrldq	$8,%xmm3
742	pslldq	$8,%xmm4
743	pxor	%xmm3,%xmm1
744	pxor	%xmm4,%xmm0
745
746	movdqa	%xmm0,%xmm4
747	movdqa	%xmm0,%xmm3
748	psllq	$5,%xmm0
749	pxor	%xmm0,%xmm3
750	psllq	$1,%xmm0
751	pxor	%xmm3,%xmm0
752	psllq	$57,%xmm0
753	movdqa	%xmm0,%xmm3
754	pslldq	$8,%xmm0
755	psrldq	$8,%xmm3
756	pxor	%xmm4,%xmm0
757	pxor	%xmm3,%xmm1
758
759
760	movdqa	%xmm0,%xmm4
761	psrlq	$1,%xmm0
762	pxor	%xmm4,%xmm1
763	pxor	%xmm0,%xmm4
764	psrlq	$5,%xmm0
765	pxor	%xmm4,%xmm0
766	psrlq	$1,%xmm0
767	pxor	%xmm1,%xmm0
768	pshufd	$78,%xmm2,%xmm3
769	pshufd	$78,%xmm0,%xmm4
770	pxor	%xmm2,%xmm3
771	movdqu	%xmm2,0(%rdi)
772	pxor	%xmm0,%xmm4
773	movdqu	%xmm0,16(%rdi)
774.byte	102,15,58,15,227,8
775	movdqu	%xmm4,32(%rdi)
776	movdqa	%xmm0,%xmm1
777	pshufd	$78,%xmm0,%xmm3
778	pxor	%xmm0,%xmm3
779.byte	102,15,58,68,194,0
780.byte	102,15,58,68,202,17
781.byte	102,15,58,68,222,0
782	pxor	%xmm0,%xmm3
783	pxor	%xmm1,%xmm3
784
785	movdqa	%xmm3,%xmm4
786	psrldq	$8,%xmm3
787	pslldq	$8,%xmm4
788	pxor	%xmm3,%xmm1
789	pxor	%xmm4,%xmm0
790
791	movdqa	%xmm0,%xmm4
792	movdqa	%xmm0,%xmm3
793	psllq	$5,%xmm0
794	pxor	%xmm0,%xmm3
795	psllq	$1,%xmm0
796	pxor	%xmm3,%xmm0
797	psllq	$57,%xmm0
798	movdqa	%xmm0,%xmm3
799	pslldq	$8,%xmm0
800	psrldq	$8,%xmm3
801	pxor	%xmm4,%xmm0
802	pxor	%xmm3,%xmm1
803
804
805	movdqa	%xmm0,%xmm4
806	psrlq	$1,%xmm0
807	pxor	%xmm4,%xmm1
808	pxor	%xmm0,%xmm4
809	psrlq	$5,%xmm0
810	pxor	%xmm4,%xmm0
811	psrlq	$1,%xmm0
812	pxor	%xmm1,%xmm0
813	movdqa	%xmm0,%xmm5
814	movdqa	%xmm0,%xmm1
815	pshufd	$78,%xmm0,%xmm3
816	pxor	%xmm0,%xmm3
817.byte	102,15,58,68,194,0
818.byte	102,15,58,68,202,17
819.byte	102,15,58,68,222,0
820	pxor	%xmm0,%xmm3
821	pxor	%xmm1,%xmm3
822
823	movdqa	%xmm3,%xmm4
824	psrldq	$8,%xmm3
825	pslldq	$8,%xmm4
826	pxor	%xmm3,%xmm1
827	pxor	%xmm4,%xmm0
828
829	movdqa	%xmm0,%xmm4
830	movdqa	%xmm0,%xmm3
831	psllq	$5,%xmm0
832	pxor	%xmm0,%xmm3
833	psllq	$1,%xmm0
834	pxor	%xmm3,%xmm0
835	psllq	$57,%xmm0
836	movdqa	%xmm0,%xmm3
837	pslldq	$8,%xmm0
838	psrldq	$8,%xmm3
839	pxor	%xmm4,%xmm0
840	pxor	%xmm3,%xmm1
841
842
843	movdqa	%xmm0,%xmm4
844	psrlq	$1,%xmm0
845	pxor	%xmm4,%xmm1
846	pxor	%xmm0,%xmm4
847	psrlq	$5,%xmm0
848	pxor	%xmm4,%xmm0
849	psrlq	$1,%xmm0
850	pxor	%xmm1,%xmm0
851	pshufd	$78,%xmm5,%xmm3
852	pshufd	$78,%xmm0,%xmm4
853	pxor	%xmm5,%xmm3
854	movdqu	%xmm5,48(%rdi)
855	pxor	%xmm0,%xmm4
856	movdqu	%xmm0,64(%rdi)
857.byte	102,15,58,15,227,8
858	movdqu	%xmm4,80(%rdi)
859	.byte	0xf3,0xc3
860.cfi_endproc
861.size	gcm_init_clmul,.-gcm_init_clmul
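# gcm_gmult_clmul: single-block Xi := Xi * H via PCLMULQDQ.
#   %rdi = Xi (updated in place), %rsi = Htable (from gcm_init_clmul)
# .Lbswap_mask converts Xi to and from the byte order expected by the
# carry-less multiply; .byte 102,15,56,0,... encodes pshufb and
# .byte 102,15,58,68,... encodes pclmulqdq.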
862.globl	gcm_gmult_clmul
863.type	gcm_gmult_clmul,@function
864.align	16
865gcm_gmult_clmul:
866.cfi_startproc
867.L_gmult_clmul:
868	movdqu	(%rdi),%xmm0
869	movdqa	.Lbswap_mask(%rip),%xmm5
870	movdqu	(%rsi),%xmm2
871	movdqu	32(%rsi),%xmm4
872.byte	102,15,56,0,197
873	movdqa	%xmm0,%xmm1
874	pshufd	$78,%xmm0,%xmm3
875	pxor	%xmm0,%xmm3
876.byte	102,15,58,68,194,0
877.byte	102,15,58,68,202,17
878.byte	102,15,58,68,220,0
879	pxor	%xmm0,%xmm3
880	pxor	%xmm1,%xmm3
881
882	movdqa	%xmm3,%xmm4
883	psrldq	$8,%xmm3
884	pslldq	$8,%xmm4
885	pxor	%xmm3,%xmm1
886	pxor	%xmm4,%xmm0
887
888	movdqa	%xmm0,%xmm4
889	movdqa	%xmm0,%xmm3
890	psllq	$5,%xmm0
891	pxor	%xmm0,%xmm3
892	psllq	$1,%xmm0
893	pxor	%xmm3,%xmm0
894	psllq	$57,%xmm0
895	movdqa	%xmm0,%xmm3
896	pslldq	$8,%xmm0
897	psrldq	$8,%xmm3
898	pxor	%xmm4,%xmm0
899	pxor	%xmm3,%xmm1
900
901
902	movdqa	%xmm0,%xmm4
903	psrlq	$1,%xmm0
904	pxor	%xmm4,%xmm1
905	pxor	%xmm0,%xmm4
906	psrlq	$5,%xmm0
907	pxor	%xmm4,%xmm0
908	psrlq	$1,%xmm0
909	pxor	%xmm1,%xmm0
910.byte	102,15,56,0,197
911	movdqu	%xmm0,(%rdi)
912	.byte	0xf3,0xc3
913.cfi_endproc
914.size	gcm_gmult_clmul,.-gcm_gmult_clmul
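# gcm_ghash_clmul: bulk GHASH over 'len' bytes using PCLMULQDQ.
#   %rdi = Xi, %rsi = Htable, %rdx = inp, %rcx = len (a multiple of 16 bytes)
# Processes four blocks per iteration of .Lmod4_loop with a deferred
# reduction, subject to the OPENSSL_ia32cap_P capability check; shorter
# inputs fall through to the two-block .Lmod_loop and one-block .Lodd_tail.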
915.globl	gcm_ghash_clmul
916.type	gcm_ghash_clmul,@function
917.align	32
918gcm_ghash_clmul:
919.cfi_startproc
920.L_ghash_clmul:
921	movdqa	.Lbswap_mask(%rip),%xmm10
922
923	movdqu	(%rdi),%xmm0
924	movdqu	(%rsi),%xmm2
925	movdqu	32(%rsi),%xmm7
926.byte	102,65,15,56,0,194
927
928	subq	$0x10,%rcx
929	jz	.Lodd_tail
930
931	movdqu	16(%rsi),%xmm6
932	movl	OPENSSL_ia32cap_P+4(%rip),%eax
933	cmpq	$0x30,%rcx
934	jb	.Lskip4x
935
936	andl	$71303168,%eax
937	cmpl	$4194304,%eax
938	je	.Lskip4x
939
940	subq	$0x30,%rcx
941	movq	$0xA040608020C0E000,%rax
942	movdqu	48(%rsi),%xmm14
943	movdqu	64(%rsi),%xmm15
944
945
946
947
948	movdqu	48(%rdx),%xmm3
949	movdqu	32(%rdx),%xmm11
950.byte	102,65,15,56,0,218
951.byte	102,69,15,56,0,218
952	movdqa	%xmm3,%xmm5
953	pshufd	$78,%xmm3,%xmm4
954	pxor	%xmm3,%xmm4
955.byte	102,15,58,68,218,0
956.byte	102,15,58,68,234,17
957.byte	102,15,58,68,231,0
958
959	movdqa	%xmm11,%xmm13
960	pshufd	$78,%xmm11,%xmm12
961	pxor	%xmm11,%xmm12
962.byte	102,68,15,58,68,222,0
963.byte	102,68,15,58,68,238,17
964.byte	102,68,15,58,68,231,16
965	xorps	%xmm11,%xmm3
966	xorps	%xmm13,%xmm5
967	movups	80(%rsi),%xmm7
968	xorps	%xmm12,%xmm4
969
970	movdqu	16(%rdx),%xmm11
971	movdqu	0(%rdx),%xmm8
972.byte	102,69,15,56,0,218
973.byte	102,69,15,56,0,194
974	movdqa	%xmm11,%xmm13
975	pshufd	$78,%xmm11,%xmm12
976	pxor	%xmm8,%xmm0
977	pxor	%xmm11,%xmm12
978.byte	102,69,15,58,68,222,0
979	movdqa	%xmm0,%xmm1
980	pshufd	$78,%xmm0,%xmm8
981	pxor	%xmm0,%xmm8
982.byte	102,69,15,58,68,238,17
983.byte	102,68,15,58,68,231,0
984	xorps	%xmm11,%xmm3
985	xorps	%xmm13,%xmm5
986
987	leaq	64(%rdx),%rdx
988	subq	$0x40,%rcx
989	jc	.Ltail4x
990
991	jmp	.Lmod4_loop
992.align	32
993.Lmod4_loop:
994.byte	102,65,15,58,68,199,0
995	xorps	%xmm12,%xmm4
996	movdqu	48(%rdx),%xmm11
997.byte	102,69,15,56,0,218
998.byte	102,65,15,58,68,207,17
999	xorps	%xmm3,%xmm0
1000	movdqu	32(%rdx),%xmm3
1001	movdqa	%xmm11,%xmm13
1002.byte	102,68,15,58,68,199,16
1003	pshufd	$78,%xmm11,%xmm12
1004	xorps	%xmm5,%xmm1
1005	pxor	%xmm11,%xmm12
1006.byte	102,65,15,56,0,218
1007	movups	32(%rsi),%xmm7
1008	xorps	%xmm4,%xmm8
1009.byte	102,68,15,58,68,218,0
1010	pshufd	$78,%xmm3,%xmm4
1011
1012	pxor	%xmm0,%xmm8
1013	movdqa	%xmm3,%xmm5
1014	pxor	%xmm1,%xmm8
1015	pxor	%xmm3,%xmm4
1016	movdqa	%xmm8,%xmm9
1017.byte	102,68,15,58,68,234,17
1018	pslldq	$8,%xmm8
1019	psrldq	$8,%xmm9
1020	pxor	%xmm8,%xmm0
1021	movdqa	.L7_mask(%rip),%xmm8
1022	pxor	%xmm9,%xmm1
1023.byte	102,76,15,110,200
1024
1025	pand	%xmm0,%xmm8
1026.byte	102,69,15,56,0,200
1027	pxor	%xmm0,%xmm9
1028.byte	102,68,15,58,68,231,0
1029	psllq	$57,%xmm9
1030	movdqa	%xmm9,%xmm8
1031	pslldq	$8,%xmm9
1032.byte	102,15,58,68,222,0
1033	psrldq	$8,%xmm8
1034	pxor	%xmm9,%xmm0
1035	pxor	%xmm8,%xmm1
1036	movdqu	0(%rdx),%xmm8
1037
1038	movdqa	%xmm0,%xmm9
1039	psrlq	$1,%xmm0
1040.byte	102,15,58,68,238,17
1041	xorps	%xmm11,%xmm3
1042	movdqu	16(%rdx),%xmm11
1043.byte	102,69,15,56,0,218
1044.byte	102,15,58,68,231,16
1045	xorps	%xmm13,%xmm5
1046	movups	80(%rsi),%xmm7
1047.byte	102,69,15,56,0,194
1048	pxor	%xmm9,%xmm1
1049	pxor	%xmm0,%xmm9
1050	psrlq	$5,%xmm0
1051
1052	movdqa	%xmm11,%xmm13
1053	pxor	%xmm12,%xmm4
1054	pshufd	$78,%xmm11,%xmm12
1055	pxor	%xmm9,%xmm0
1056	pxor	%xmm8,%xmm1
1057	pxor	%xmm11,%xmm12
1058.byte	102,69,15,58,68,222,0
1059	psrlq	$1,%xmm0
1060	pxor	%xmm1,%xmm0
1061	movdqa	%xmm0,%xmm1
1062.byte	102,69,15,58,68,238,17
1063	xorps	%xmm11,%xmm3
1064	pshufd	$78,%xmm0,%xmm8
1065	pxor	%xmm0,%xmm8
1066
1067.byte	102,68,15,58,68,231,0
1068	xorps	%xmm13,%xmm5
1069
1070	leaq	64(%rdx),%rdx
1071	subq	$0x40,%rcx
1072	jnc	.Lmod4_loop
1073
1074.Ltail4x:
1075.byte	102,65,15,58,68,199,0
1076.byte	102,65,15,58,68,207,17
1077.byte	102,68,15,58,68,199,16
1078	xorps	%xmm12,%xmm4
1079	xorps	%xmm3,%xmm0
1080	xorps	%xmm5,%xmm1
1081	pxor	%xmm0,%xmm1
1082	pxor	%xmm4,%xmm8
1083
1084	pxor	%xmm1,%xmm8
1085	pxor	%xmm0,%xmm1
1086
1087	movdqa	%xmm8,%xmm9
1088	psrldq	$8,%xmm8
1089	pslldq	$8,%xmm9
1090	pxor	%xmm8,%xmm1
1091	pxor	%xmm9,%xmm0
1092
1093	movdqa	%xmm0,%xmm4
1094	movdqa	%xmm0,%xmm3
1095	psllq	$5,%xmm0
1096	pxor	%xmm0,%xmm3
1097	psllq	$1,%xmm0
1098	pxor	%xmm3,%xmm0
1099	psllq	$57,%xmm0
1100	movdqa	%xmm0,%xmm3
1101	pslldq	$8,%xmm0
1102	psrldq	$8,%xmm3
1103	pxor	%xmm4,%xmm0
1104	pxor	%xmm3,%xmm1
1105
1106
1107	movdqa	%xmm0,%xmm4
1108	psrlq	$1,%xmm0
1109	pxor	%xmm4,%xmm1
1110	pxor	%xmm0,%xmm4
1111	psrlq	$5,%xmm0
1112	pxor	%xmm4,%xmm0
1113	psrlq	$1,%xmm0
1114	pxor	%xmm1,%xmm0
1115	addq	$0x40,%rcx
1116	jz	.Ldone
1117	movdqu	32(%rsi),%xmm7
1118	subq	$0x10,%rcx
1119	jz	.Lodd_tail
1120.Lskip4x:
1121
1122
1123
1124
1125
1126	movdqu	(%rdx),%xmm8
1127	movdqu	16(%rdx),%xmm3
1128.byte	102,69,15,56,0,194
1129.byte	102,65,15,56,0,218
1130	pxor	%xmm8,%xmm0
1131
1132	movdqa	%xmm3,%xmm5
1133	pshufd	$78,%xmm3,%xmm4
1134	pxor	%xmm3,%xmm4
1135.byte	102,15,58,68,218,0
1136.byte	102,15,58,68,234,17
1137.byte	102,15,58,68,231,0
1138
1139	leaq	32(%rdx),%rdx
1140	nop
1141	subq	$0x20,%rcx
1142	jbe	.Leven_tail
1143	nop
1144	jmp	.Lmod_loop
1145
1146.align	32
1147.Lmod_loop:
1148	movdqa	%xmm0,%xmm1
1149	movdqa	%xmm4,%xmm8
1150	pshufd	$78,%xmm0,%xmm4
1151	pxor	%xmm0,%xmm4
1152
1153.byte	102,15,58,68,198,0
1154.byte	102,15,58,68,206,17
1155.byte	102,15,58,68,231,16
1156
1157	pxor	%xmm3,%xmm0
1158	pxor	%xmm5,%xmm1
1159	movdqu	(%rdx),%xmm9
1160	pxor	%xmm0,%xmm8
1161.byte	102,69,15,56,0,202
1162	movdqu	16(%rdx),%xmm3
1163
1164	pxor	%xmm1,%xmm8
1165	pxor	%xmm9,%xmm1
1166	pxor	%xmm8,%xmm4
1167.byte	102,65,15,56,0,218
1168	movdqa	%xmm4,%xmm8
1169	psrldq	$8,%xmm8
1170	pslldq	$8,%xmm4
1171	pxor	%xmm8,%xmm1
1172	pxor	%xmm4,%xmm0
1173
1174	movdqa	%xmm3,%xmm5
1175
1176	movdqa	%xmm0,%xmm9
1177	movdqa	%xmm0,%xmm8
1178	psllq	$5,%xmm0
1179	pxor	%xmm0,%xmm8
1180.byte	102,15,58,68,218,0
1181	psllq	$1,%xmm0
1182	pxor	%xmm8,%xmm0
1183	psllq	$57,%xmm0
1184	movdqa	%xmm0,%xmm8
1185	pslldq	$8,%xmm0
1186	psrldq	$8,%xmm8
1187	pxor	%xmm9,%xmm0
1188	pshufd	$78,%xmm5,%xmm4
1189	pxor	%xmm8,%xmm1
1190	pxor	%xmm5,%xmm4
1191
1192	movdqa	%xmm0,%xmm9
1193	psrlq	$1,%xmm0
1194.byte	102,15,58,68,234,17
1195	pxor	%xmm9,%xmm1
1196	pxor	%xmm0,%xmm9
1197	psrlq	$5,%xmm0
1198	pxor	%xmm9,%xmm0
1199	leaq	32(%rdx),%rdx
1200	psrlq	$1,%xmm0
1201.byte	102,15,58,68,231,0
1202	pxor	%xmm1,%xmm0
1203
1204	subq	$0x20,%rcx
1205	ja	.Lmod_loop
1206
1207.Leven_tail:
1208	movdqa	%xmm0,%xmm1
1209	movdqa	%xmm4,%xmm8
1210	pshufd	$78,%xmm0,%xmm4
1211	pxor	%xmm0,%xmm4
1212
1213.byte	102,15,58,68,198,0
1214.byte	102,15,58,68,206,17
1215.byte	102,15,58,68,231,16
1216
1217	pxor	%xmm3,%xmm0
1218	pxor	%xmm5,%xmm1
1219	pxor	%xmm0,%xmm8
1220	pxor	%xmm1,%xmm8
1221	pxor	%xmm8,%xmm4
1222	movdqa	%xmm4,%xmm8
1223	psrldq	$8,%xmm8
1224	pslldq	$8,%xmm4
1225	pxor	%xmm8,%xmm1
1226	pxor	%xmm4,%xmm0
1227
1228	movdqa	%xmm0,%xmm4
1229	movdqa	%xmm0,%xmm3
1230	psllq	$5,%xmm0
1231	pxor	%xmm0,%xmm3
1232	psllq	$1,%xmm0
1233	pxor	%xmm3,%xmm0
1234	psllq	$57,%xmm0
1235	movdqa	%xmm0,%xmm3
1236	pslldq	$8,%xmm0
1237	psrldq	$8,%xmm3
1238	pxor	%xmm4,%xmm0
1239	pxor	%xmm3,%xmm1
1240
1241
1242	movdqa	%xmm0,%xmm4
1243	psrlq	$1,%xmm0
1244	pxor	%xmm4,%xmm1
1245	pxor	%xmm0,%xmm4
1246	psrlq	$5,%xmm0
1247	pxor	%xmm4,%xmm0
1248	psrlq	$1,%xmm0
1249	pxor	%xmm1,%xmm0
1250	testq	%rcx,%rcx
1251	jnz	.Ldone
1252
1253.Lodd_tail:
1254	movdqu	(%rdx),%xmm8
1255.byte	102,69,15,56,0,194
1256	pxor	%xmm8,%xmm0
1257	movdqa	%xmm0,%xmm1
1258	pshufd	$78,%xmm0,%xmm3
1259	pxor	%xmm0,%xmm3
1260.byte	102,15,58,68,194,0
1261.byte	102,15,58,68,202,17
1262.byte	102,15,58,68,223,0
1263	pxor	%xmm0,%xmm3
1264	pxor	%xmm1,%xmm3
1265
1266	movdqa	%xmm3,%xmm4
1267	psrldq	$8,%xmm3
1268	pslldq	$8,%xmm4
1269	pxor	%xmm3,%xmm1
1270	pxor	%xmm4,%xmm0
1271
1272	movdqa	%xmm0,%xmm4
1273	movdqa	%xmm0,%xmm3
1274	psllq	$5,%xmm0
1275	pxor	%xmm0,%xmm3
1276	psllq	$1,%xmm0
1277	pxor	%xmm3,%xmm0
1278	psllq	$57,%xmm0
1279	movdqa	%xmm0,%xmm3
1280	pslldq	$8,%xmm0
1281	psrldq	$8,%xmm3
1282	pxor	%xmm4,%xmm0
1283	pxor	%xmm3,%xmm1
1284
1285
1286	movdqa	%xmm0,%xmm4
1287	psrlq	$1,%xmm0
1288	pxor	%xmm4,%xmm1
1289	pxor	%xmm0,%xmm4
1290	psrlq	$5,%xmm0
1291	pxor	%xmm4,%xmm0
1292	psrlq	$1,%xmm0
1293	pxor	%xmm1,%xmm0
1294.Ldone:
1295.byte	102,65,15,56,0,194
1296	movdqu	%xmm0,(%rdi)
1297	.byte	0xf3,0xc3
1298.cfi_endproc
1299.size	gcm_ghash_clmul,.-gcm_ghash_clmul
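# gcm_init_avx: AVX (VEX-encoded) variant of gcm_init_clmul.
#   %rdi = Htable (output), %rsi = H (input)
# The .Linit_loop_avx loop runs four times and appears to emit two powers of H
# (plus a packed Karatsuba helper) per iteration, giving H^1..H^8 for the
# eight-block bulk loop in gcm_ghash_avx.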
1300.globl	gcm_init_avx
1301.type	gcm_init_avx,@function
1302.align	32
1303gcm_init_avx:
1304.cfi_startproc
1305	vzeroupper
1306
1307	vmovdqu	(%rsi),%xmm2
1308	vpshufd	$78,%xmm2,%xmm2
1309
1310
1311	vpshufd	$255,%xmm2,%xmm4
1312	vpsrlq	$63,%xmm2,%xmm3
1313	vpsllq	$1,%xmm2,%xmm2
1314	vpxor	%xmm5,%xmm5,%xmm5
1315	vpcmpgtd	%xmm4,%xmm5,%xmm5
1316	vpslldq	$8,%xmm3,%xmm3
1317	vpor	%xmm3,%xmm2,%xmm2
1318
1319
1320	vpand	.L0x1c2_polynomial(%rip),%xmm5,%xmm5
1321	vpxor	%xmm5,%xmm2,%xmm2
1322
1323	vpunpckhqdq	%xmm2,%xmm2,%xmm6
1324	vmovdqa	%xmm2,%xmm0
1325	vpxor	%xmm2,%xmm6,%xmm6
1326	movq	$4,%r10
1327	jmp	.Linit_start_avx
1328.align	32
1329.Linit_loop_avx:
1330	vpalignr	$8,%xmm3,%xmm4,%xmm5
1331	vmovdqu	%xmm5,-16(%rdi)
1332	vpunpckhqdq	%xmm0,%xmm0,%xmm3
1333	vpxor	%xmm0,%xmm3,%xmm3
1334	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
1335	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
1336	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
1337	vpxor	%xmm0,%xmm1,%xmm4
1338	vpxor	%xmm4,%xmm3,%xmm3
1339
1340	vpslldq	$8,%xmm3,%xmm4
1341	vpsrldq	$8,%xmm3,%xmm3
1342	vpxor	%xmm4,%xmm0,%xmm0
1343	vpxor	%xmm3,%xmm1,%xmm1
1344	vpsllq	$57,%xmm0,%xmm3
1345	vpsllq	$62,%xmm0,%xmm4
1346	vpxor	%xmm3,%xmm4,%xmm4
1347	vpsllq	$63,%xmm0,%xmm3
1348	vpxor	%xmm3,%xmm4,%xmm4
1349	vpslldq	$8,%xmm4,%xmm3
1350	vpsrldq	$8,%xmm4,%xmm4
1351	vpxor	%xmm3,%xmm0,%xmm0
1352	vpxor	%xmm4,%xmm1,%xmm1
1353
1354	vpsrlq	$1,%xmm0,%xmm4
1355	vpxor	%xmm0,%xmm1,%xmm1
1356	vpxor	%xmm4,%xmm0,%xmm0
1357	vpsrlq	$5,%xmm4,%xmm4
1358	vpxor	%xmm4,%xmm0,%xmm0
1359	vpsrlq	$1,%xmm0,%xmm0
1360	vpxor	%xmm1,%xmm0,%xmm0
1361.Linit_start_avx:
1362	vmovdqa	%xmm0,%xmm5
1363	vpunpckhqdq	%xmm0,%xmm0,%xmm3
1364	vpxor	%xmm0,%xmm3,%xmm3
1365	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
1366	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
1367	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
1368	vpxor	%xmm0,%xmm1,%xmm4
1369	vpxor	%xmm4,%xmm3,%xmm3
1370
1371	vpslldq	$8,%xmm3,%xmm4
1372	vpsrldq	$8,%xmm3,%xmm3
1373	vpxor	%xmm4,%xmm0,%xmm0
1374	vpxor	%xmm3,%xmm1,%xmm1
1375	vpsllq	$57,%xmm0,%xmm3
1376	vpsllq	$62,%xmm0,%xmm4
1377	vpxor	%xmm3,%xmm4,%xmm4
1378	vpsllq	$63,%xmm0,%xmm3
1379	vpxor	%xmm3,%xmm4,%xmm4
1380	vpslldq	$8,%xmm4,%xmm3
1381	vpsrldq	$8,%xmm4,%xmm4
1382	vpxor	%xmm3,%xmm0,%xmm0
1383	vpxor	%xmm4,%xmm1,%xmm1
1384
1385	vpsrlq	$1,%xmm0,%xmm4
1386	vpxor	%xmm0,%xmm1,%xmm1
1387	vpxor	%xmm4,%xmm0,%xmm0
1388	vpsrlq	$5,%xmm4,%xmm4
1389	vpxor	%xmm4,%xmm0,%xmm0
1390	vpsrlq	$1,%xmm0,%xmm0
1391	vpxor	%xmm1,%xmm0,%xmm0
1392	vpshufd	$78,%xmm5,%xmm3
1393	vpshufd	$78,%xmm0,%xmm4
1394	vpxor	%xmm5,%xmm3,%xmm3
1395	vmovdqu	%xmm5,0(%rdi)
1396	vpxor	%xmm0,%xmm4,%xmm4
1397	vmovdqu	%xmm0,16(%rdi)
1398	leaq	48(%rdi),%rdi
1399	subq	$1,%r10
1400	jnz	.Linit_loop_avx
1401
1402	vpalignr	$8,%xmm4,%xmm3,%xmm5
1403	vmovdqu	%xmm5,-16(%rdi)
1404
1405	vzeroupper
1406	.byte	0xf3,0xc3
1407.cfi_endproc
1408.size	gcm_init_avx,.-gcm_init_avx
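# gcm_gmult_avx: single-block multiply; jumps straight into .L_gmult_clmul
# above, since the leading Htable entries share the same layout on both paths.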
1409.globl	gcm_gmult_avx
1410.type	gcm_gmult_avx,@function
1411.align	32
1412gcm_gmult_avx:
1413.cfi_startproc
1414	jmp	.L_gmult_clmul
1415.cfi_endproc
1416.size	gcm_gmult_avx,.-gcm_gmult_avx
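# gcm_ghash_avx: AVX bulk GHASH.
#   %rdi = Xi, %rsi = Htable (from gcm_init_avx), %rdx = inp, %rcx = len
# Hashes eight blocks (0x80 bytes) per iteration of .Loop8x_avx while enough
# input remains; .Lshort_avx/.Ltail_avx handle the remaining 1..7 blocks.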
1417.globl	gcm_ghash_avx
1418.type	gcm_ghash_avx,@function
1419.align	32
1420gcm_ghash_avx:
1421.cfi_startproc
1422	vzeroupper
1423
1424	vmovdqu	(%rdi),%xmm10
1425	leaq	.L0x1c2_polynomial(%rip),%r10
1426	leaq	64(%rsi),%rsi
1427	vmovdqu	.Lbswap_mask(%rip),%xmm13
1428	vpshufb	%xmm13,%xmm10,%xmm10
1429	cmpq	$0x80,%rcx
1430	jb	.Lshort_avx
1431	subq	$0x80,%rcx
1432
1433	vmovdqu	112(%rdx),%xmm14
1434	vmovdqu	0-64(%rsi),%xmm6
1435	vpshufb	%xmm13,%xmm14,%xmm14
1436	vmovdqu	32-64(%rsi),%xmm7
1437
1438	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1439	vmovdqu	96(%rdx),%xmm15
1440	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1441	vpxor	%xmm14,%xmm9,%xmm9
1442	vpshufb	%xmm13,%xmm15,%xmm15
1443	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1444	vmovdqu	16-64(%rsi),%xmm6
1445	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1446	vmovdqu	80(%rdx),%xmm14
1447	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1448	vpxor	%xmm15,%xmm8,%xmm8
1449
1450	vpshufb	%xmm13,%xmm14,%xmm14
1451	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1452	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1453	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1454	vmovdqu	48-64(%rsi),%xmm6
1455	vpxor	%xmm14,%xmm9,%xmm9
1456	vmovdqu	64(%rdx),%xmm15
1457	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1458	vmovdqu	80-64(%rsi),%xmm7
1459
1460	vpshufb	%xmm13,%xmm15,%xmm15
1461	vpxor	%xmm0,%xmm3,%xmm3
1462	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1463	vpxor	%xmm1,%xmm4,%xmm4
1464	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1465	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1466	vmovdqu	64-64(%rsi),%xmm6
1467	vpxor	%xmm2,%xmm5,%xmm5
1468	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1469	vpxor	%xmm15,%xmm8,%xmm8
1470
1471	vmovdqu	48(%rdx),%xmm14
1472	vpxor	%xmm3,%xmm0,%xmm0
1473	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1474	vpxor	%xmm4,%xmm1,%xmm1
1475	vpshufb	%xmm13,%xmm14,%xmm14
1476	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1477	vmovdqu	96-64(%rsi),%xmm6
1478	vpxor	%xmm5,%xmm2,%xmm2
1479	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1480	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1481	vmovdqu	128-64(%rsi),%xmm7
1482	vpxor	%xmm14,%xmm9,%xmm9
1483
1484	vmovdqu	32(%rdx),%xmm15
1485	vpxor	%xmm0,%xmm3,%xmm3
1486	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1487	vpxor	%xmm1,%xmm4,%xmm4
1488	vpshufb	%xmm13,%xmm15,%xmm15
1489	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1490	vmovdqu	112-64(%rsi),%xmm6
1491	vpxor	%xmm2,%xmm5,%xmm5
1492	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1493	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1494	vpxor	%xmm15,%xmm8,%xmm8
1495
1496	vmovdqu	16(%rdx),%xmm14
1497	vpxor	%xmm3,%xmm0,%xmm0
1498	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1499	vpxor	%xmm4,%xmm1,%xmm1
1500	vpshufb	%xmm13,%xmm14,%xmm14
1501	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1502	vmovdqu	144-64(%rsi),%xmm6
1503	vpxor	%xmm5,%xmm2,%xmm2
1504	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1505	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1506	vmovdqu	176-64(%rsi),%xmm7
1507	vpxor	%xmm14,%xmm9,%xmm9
1508
1509	vmovdqu	(%rdx),%xmm15
1510	vpxor	%xmm0,%xmm3,%xmm3
1511	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1512	vpxor	%xmm1,%xmm4,%xmm4
1513	vpshufb	%xmm13,%xmm15,%xmm15
1514	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1515	vmovdqu	160-64(%rsi),%xmm6
1516	vpxor	%xmm2,%xmm5,%xmm5
1517	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
1518
1519	leaq	128(%rdx),%rdx
1520	cmpq	$0x80,%rcx
1521	jb	.Ltail_avx
1522
1523	vpxor	%xmm10,%xmm15,%xmm15
1524	subq	$0x80,%rcx
1525	jmp	.Loop8x_avx
1526
1527.align	32
1528.Loop8x_avx:
1529	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1530	vmovdqu	112(%rdx),%xmm14
1531	vpxor	%xmm0,%xmm3,%xmm3
1532	vpxor	%xmm15,%xmm8,%xmm8
1533	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
1534	vpshufb	%xmm13,%xmm14,%xmm14
1535	vpxor	%xmm1,%xmm4,%xmm4
1536	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
1537	vmovdqu	0-64(%rsi),%xmm6
1538	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1539	vpxor	%xmm2,%xmm5,%xmm5
1540	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
1541	vmovdqu	32-64(%rsi),%xmm7
1542	vpxor	%xmm14,%xmm9,%xmm9
1543
1544	vmovdqu	96(%rdx),%xmm15
1545	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1546	vpxor	%xmm3,%xmm10,%xmm10
1547	vpshufb	%xmm13,%xmm15,%xmm15
1548	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1549	vxorps	%xmm4,%xmm11,%xmm11
1550	vmovdqu	16-64(%rsi),%xmm6
1551	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1552	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1553	vpxor	%xmm5,%xmm12,%xmm12
1554	vxorps	%xmm15,%xmm8,%xmm8
1555
1556	vmovdqu	80(%rdx),%xmm14
1557	vpxor	%xmm10,%xmm12,%xmm12
1558	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1559	vpxor	%xmm11,%xmm12,%xmm12
1560	vpslldq	$8,%xmm12,%xmm9
1561	vpxor	%xmm0,%xmm3,%xmm3
1562	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1563	vpsrldq	$8,%xmm12,%xmm12
1564	vpxor	%xmm9,%xmm10,%xmm10
1565	vmovdqu	48-64(%rsi),%xmm6
1566	vpshufb	%xmm13,%xmm14,%xmm14
1567	vxorps	%xmm12,%xmm11,%xmm11
1568	vpxor	%xmm1,%xmm4,%xmm4
1569	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1570	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1571	vmovdqu	80-64(%rsi),%xmm7
1572	vpxor	%xmm14,%xmm9,%xmm9
1573	vpxor	%xmm2,%xmm5,%xmm5
1574
1575	vmovdqu	64(%rdx),%xmm15
1576	vpalignr	$8,%xmm10,%xmm10,%xmm12
1577	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1578	vpshufb	%xmm13,%xmm15,%xmm15
1579	vpxor	%xmm3,%xmm0,%xmm0
1580	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1581	vmovdqu	64-64(%rsi),%xmm6
1582	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1583	vpxor	%xmm4,%xmm1,%xmm1
1584	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1585	vxorps	%xmm15,%xmm8,%xmm8
1586	vpxor	%xmm5,%xmm2,%xmm2
1587
1588	vmovdqu	48(%rdx),%xmm14
1589	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
1590	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1591	vpshufb	%xmm13,%xmm14,%xmm14
1592	vpxor	%xmm0,%xmm3,%xmm3
1593	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1594	vmovdqu	96-64(%rsi),%xmm6
1595	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1596	vpxor	%xmm1,%xmm4,%xmm4
1597	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1598	vmovdqu	128-64(%rsi),%xmm7
1599	vpxor	%xmm14,%xmm9,%xmm9
1600	vpxor	%xmm2,%xmm5,%xmm5
1601
1602	vmovdqu	32(%rdx),%xmm15
1603	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1604	vpshufb	%xmm13,%xmm15,%xmm15
1605	vpxor	%xmm3,%xmm0,%xmm0
1606	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1607	vmovdqu	112-64(%rsi),%xmm6
1608	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1609	vpxor	%xmm4,%xmm1,%xmm1
1610	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
1611	vpxor	%xmm15,%xmm8,%xmm8
1612	vpxor	%xmm5,%xmm2,%xmm2
1613	vxorps	%xmm12,%xmm10,%xmm10
1614
1615	vmovdqu	16(%rdx),%xmm14
1616	vpalignr	$8,%xmm10,%xmm10,%xmm12
1617	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
1618	vpshufb	%xmm13,%xmm14,%xmm14
1619	vpxor	%xmm0,%xmm3,%xmm3
1620	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
1621	vmovdqu	144-64(%rsi),%xmm6
1622	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
1623	vxorps	%xmm11,%xmm12,%xmm12
1624	vpunpckhqdq	%xmm14,%xmm14,%xmm9
1625	vpxor	%xmm1,%xmm4,%xmm4
1626	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
1627	vmovdqu	176-64(%rsi),%xmm7
1628	vpxor	%xmm14,%xmm9,%xmm9
1629	vpxor	%xmm2,%xmm5,%xmm5
1630
1631	vmovdqu	(%rdx),%xmm15
1632	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
1633	vpshufb	%xmm13,%xmm15,%xmm15
1634	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
1635	vmovdqu	160-64(%rsi),%xmm6
1636	vpxor	%xmm12,%xmm15,%xmm15
1637	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
1638	vpxor	%xmm10,%xmm15,%xmm15
1639
1640	leaq	128(%rdx),%rdx
1641	subq	$0x80,%rcx
1642	jnc	.Loop8x_avx
1643
1644	addq	$0x80,%rcx
1645	jmp	.Ltail_no_xor_avx
1646
1647.align	32
1648.Lshort_avx:
1649	vmovdqu	-16(%rdx,%rcx,1),%xmm14
1650	leaq	(%rdx,%rcx,1),%rdx
1651	vmovdqu	0-64(%rsi),%xmm6
1652	vmovdqu	32-64(%rsi),%xmm7
1653	vpshufb	%xmm13,%xmm14,%xmm15
1654
1655	vmovdqa	%xmm0,%xmm3
1656	vmovdqa	%xmm1,%xmm4
1657	vmovdqa	%xmm2,%xmm5
1658	subq	$0x10,%rcx
1659	jz	.Ltail_avx
1660
1661	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1662	vpxor	%xmm0,%xmm3,%xmm3
1663	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1664	vpxor	%xmm15,%xmm8,%xmm8
1665	vmovdqu	-32(%rdx),%xmm14
1666	vpxor	%xmm1,%xmm4,%xmm4
1667	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1668	vmovdqu	16-64(%rsi),%xmm6
1669	vpshufb	%xmm13,%xmm14,%xmm15
1670	vpxor	%xmm2,%xmm5,%xmm5
1671	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1672	vpsrldq	$8,%xmm7,%xmm7
1673	subq	$0x10,%rcx
1674	jz	.Ltail_avx
1675
1676	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1677	vpxor	%xmm0,%xmm3,%xmm3
1678	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1679	vpxor	%xmm15,%xmm8,%xmm8
1680	vmovdqu	-48(%rdx),%xmm14
1681	vpxor	%xmm1,%xmm4,%xmm4
1682	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1683	vmovdqu	48-64(%rsi),%xmm6
1684	vpshufb	%xmm13,%xmm14,%xmm15
1685	vpxor	%xmm2,%xmm5,%xmm5
1686	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1687	vmovdqu	80-64(%rsi),%xmm7
1688	subq	$0x10,%rcx
1689	jz	.Ltail_avx
1690
1691	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1692	vpxor	%xmm0,%xmm3,%xmm3
1693	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1694	vpxor	%xmm15,%xmm8,%xmm8
1695	vmovdqu	-64(%rdx),%xmm14
1696	vpxor	%xmm1,%xmm4,%xmm4
1697	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1698	vmovdqu	64-64(%rsi),%xmm6
1699	vpshufb	%xmm13,%xmm14,%xmm15
1700	vpxor	%xmm2,%xmm5,%xmm5
1701	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1702	vpsrldq	$8,%xmm7,%xmm7
1703	subq	$0x10,%rcx
1704	jz	.Ltail_avx
1705
1706	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1707	vpxor	%xmm0,%xmm3,%xmm3
1708	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1709	vpxor	%xmm15,%xmm8,%xmm8
1710	vmovdqu	-80(%rdx),%xmm14
1711	vpxor	%xmm1,%xmm4,%xmm4
1712	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1713	vmovdqu	96-64(%rsi),%xmm6
1714	vpshufb	%xmm13,%xmm14,%xmm15
1715	vpxor	%xmm2,%xmm5,%xmm5
1716	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1717	vmovdqu	128-64(%rsi),%xmm7
1718	subq	$0x10,%rcx
1719	jz	.Ltail_avx
1720
1721	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1722	vpxor	%xmm0,%xmm3,%xmm3
1723	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1724	vpxor	%xmm15,%xmm8,%xmm8
1725	vmovdqu	-96(%rdx),%xmm14
1726	vpxor	%xmm1,%xmm4,%xmm4
1727	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1728	vmovdqu	112-64(%rsi),%xmm6
1729	vpshufb	%xmm13,%xmm14,%xmm15
1730	vpxor	%xmm2,%xmm5,%xmm5
1731	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1732	vpsrldq	$8,%xmm7,%xmm7
1733	subq	$0x10,%rcx
1734	jz	.Ltail_avx
1735
1736	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1737	vpxor	%xmm0,%xmm3,%xmm3
1738	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1739	vpxor	%xmm15,%xmm8,%xmm8
1740	vmovdqu	-112(%rdx),%xmm14
1741	vpxor	%xmm1,%xmm4,%xmm4
1742	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1743	vmovdqu	144-64(%rsi),%xmm6
1744	vpshufb	%xmm13,%xmm14,%xmm15
1745	vpxor	%xmm2,%xmm5,%xmm5
1746	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1747	vmovq	184-64(%rsi),%xmm7
1748	subq	$0x10,%rcx
1749	jmp	.Ltail_avx
1750
1751.align	32
1752.Ltail_avx:
1753	vpxor	%xmm10,%xmm15,%xmm15
1754.Ltail_no_xor_avx:
1755	vpunpckhqdq	%xmm15,%xmm15,%xmm8
1756	vpxor	%xmm0,%xmm3,%xmm3
1757	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
1758	vpxor	%xmm15,%xmm8,%xmm8
1759	vpxor	%xmm1,%xmm4,%xmm4
1760	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
1761	vpxor	%xmm2,%xmm5,%xmm5
1762	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
1763
1764	vmovdqu	(%r10),%xmm12
1765
1766	vpxor	%xmm0,%xmm3,%xmm10
1767	vpxor	%xmm1,%xmm4,%xmm11
1768	vpxor	%xmm2,%xmm5,%xmm5
1769
1770	vpxor	%xmm10,%xmm5,%xmm5
1771	vpxor	%xmm11,%xmm5,%xmm5
1772	vpslldq	$8,%xmm5,%xmm9
1773	vpsrldq	$8,%xmm5,%xmm5
1774	vpxor	%xmm9,%xmm10,%xmm10
1775	vpxor	%xmm5,%xmm11,%xmm11
1776
1777	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
1778	vpalignr	$8,%xmm10,%xmm10,%xmm10
1779	vpxor	%xmm9,%xmm10,%xmm10
1780
1781	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
1782	vpalignr	$8,%xmm10,%xmm10,%xmm10
1783	vpxor	%xmm11,%xmm10,%xmm10
1784	vpxor	%xmm9,%xmm10,%xmm10
1785
1786	cmpq	$0,%rcx
1787	jne	.Lshort_avx
1788
1789	vpshufb	%xmm13,%xmm10,%xmm10
1790	vmovdqu	%xmm10,(%rdi)
1791	vzeroupper
1792	.byte	0xf3,0xc3
1793.cfi_endproc
1794.size	gcm_ghash_avx,.-gcm_ghash_avx
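# Constant data: .Lbswap_mask (byte-order reversal for pshufb/vpshufb),
# .L0x1c2_polynomial (the GHASH reduction constant, 0xc2 in the top byte),
# .L7_mask and .L7_mask_poly (masks/constants for the CLMUL reduction), and
# the .Lrem_4bit/.Lrem_8bit remainder tables used by the table-driven code.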
1795.align	64
1796.Lbswap_mask:
1797.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
1798.L0x1c2_polynomial:
1799.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
1800.L7_mask:
1801.long	7,0,7,0
1802.L7_mask_poly:
1803.long	7,0,450,0
1804.align	64
1805.type	.Lrem_4bit,@object
1806.Lrem_4bit:
1807.long	0,0,0,471859200,0,943718400,0,610271232
1808.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
1809.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
1810.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
1811.type	.Lrem_8bit,@object
1812.Lrem_8bit:
1813.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
1814.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
1815.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
1816.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
1817.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
1818.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
1819.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
1820.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
1821.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
1822.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
1823.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
1824.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
1825.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
1826.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
1827.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
1828.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
1829.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
1830.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
1831.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
1832.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
1833.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
1834.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
1835.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
1836.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
1837.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
1838.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
1839.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
1840.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
1841.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
1842.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
1843.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
1844.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
1845
1846.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1847.align	64
1848