1# This file is generated from a similarly-named Perl script in the BoringSSL
2# source tree. Do not edit by hand.
3
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
11.text
12
13.extern	GFp_ia32cap_P
14.hidden GFp_ia32cap_P
15
# -----------------------------------------------------------------------------
# GFp_bn_mul_mont_gather5 — Montgomery multiplication where the multiplier is
# fetched from a precomputed table with a constant-time (cache-timing safe)
# gather, so the table index never influences the memory-access pattern.
#
# NOTE(review): argument layout inferred from the SysV registers actually read
# below — rdi=rp (result), rsi=ap, rdx=table of cached multipliers, rcx=np
# (modulus), r8=&n0 (Montgomery constant, dereferenced at line 212), r9d=num
# (limb count), and one stack argument at 8(%rsp) (the table index, loaded at
# line 33). Confirm against the C prototype.
# -----------------------------------------------------------------------------
16.globl	GFp_bn_mul_mont_gather5
17.hidden GFp_bn_mul_mont_gather5
18.type	GFp_bn_mul_mont_gather5,@function
19.align	64
20GFp_bn_mul_mont_gather5:
21.cfi_startproc
# Zero-extend num; keep the incoming rsp in rax for the CFI-tracked frame base.
22	movl	%r9d,%r9d
23	movq	%rsp,%rax
24.cfi_def_cfa_register	%rax
# If num is a multiple of 8, take the 4x-unrolled path; r11d gets the second
# word of GFp_ia32cap_P, which .Lmul4x_enter tests for the mulx/adx variant.
25	testl	$7,%r9d
26	jnz	.Lmul_enter
27	leaq	GFp_ia32cap_P(%rip),%r11
28	movl	8(%r11),%r11d
29	jmp	.Lmul4x_enter
30
31.align	16
32.Lmul_enter:
# xmm5 = table index (7th argument, still at 8(%rsp) before any push).
33	movd	8(%rsp),%xmm5
# Save all callee-saved GPRs (SysV: rbx, rbp, r12-r15).
34	pushq	%rbx
35.cfi_offset	%rbx,-16
36	pushq	%rbp
37.cfi_offset	%rbp,-24
38	pushq	%r12
39.cfi_offset	%r12,-32
40	pushq	%r13
41.cfi_offset	%r13,-40
42	pushq	%r14
43.cfi_offset	%r14,-48
44	pushq	%r15
45.cfi_offset	%r15,-56
46
# Carve out a scratch frame of num*8 bytes plus slack below rsp, rounded to a
# 1024-byte boundary (r10 = new stack base).
47	negq	%r9
48	movq	%rsp,%r11
49	leaq	-280(%rsp,%r9,8),%r10
50	negq	%r9
51	andq	$-1024,%r10
52
53
54
55
56
57
58
59
60
61	subq	%r10,%r11
62	andq	$-4096,%r11
63	leaq	(%r10,%r11,1),%rsp
64	movq	(%rsp),%r11
65	cmpq	%r10,%rsp
66	ja	.Lmul_page_walk
67	jmp	.Lmul_page_walk_done
68
# Touch each 4096-byte page of the newly claimed stack area one page at a
# time so guard pages are hit in order (stack-probe for large frames).
69.Lmul_page_walk:
70	leaq	-4096(%rsp),%rsp
71	movq	(%rsp),%r11
72	cmpq	%r10,%rsp
73	ja	.Lmul_page_walk
74.Lmul_page_walk_done:
75
# Save the original rsp (rax) at the top of the scratch frame; the cfi_escape
# expresses the CFA as an expression over that saved slot.
76	leaq	.Linc(%rip),%r10
77	movq	%rax,8(%rsp,%r9,8)
78.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
79.Lmul_body:
80
# Build 16 comparison masks: xmm5 holds the broadcast index, xmm0/xmm1 start
# from the .Linc counter pair, and each pcmpeqd leaves all-ones only in the
# mask whose counter equals the index.  Masks land at 112..352(%r10).
81	leaq	128(%rdx),%r12
82	movdqa	0(%r10),%xmm0
83	movdqa	16(%r10),%xmm1
84	leaq	24-112(%rsp,%r9,8),%r10
85	andq	$-16,%r10
86
87	pshufd	$0,%xmm5,%xmm5
88	movdqa	%xmm1,%xmm4
89	movdqa	%xmm1,%xmm2
90	paddd	%xmm0,%xmm1
91	pcmpeqd	%xmm5,%xmm0
92.byte	0x67
93	movdqa	%xmm4,%xmm3
94	paddd	%xmm1,%xmm2
95	pcmpeqd	%xmm5,%xmm1
96	movdqa	%xmm0,112(%r10)
97	movdqa	%xmm4,%xmm0
98
99	paddd	%xmm2,%xmm3
100	pcmpeqd	%xmm5,%xmm2
101	movdqa	%xmm1,128(%r10)
102	movdqa	%xmm4,%xmm1
103
104	paddd	%xmm3,%xmm0
105	pcmpeqd	%xmm5,%xmm3
106	movdqa	%xmm2,144(%r10)
107	movdqa	%xmm4,%xmm2
108
109	paddd	%xmm0,%xmm1
110	pcmpeqd	%xmm5,%xmm0
111	movdqa	%xmm3,160(%r10)
112	movdqa	%xmm4,%xmm3
113	paddd	%xmm1,%xmm2
114	pcmpeqd	%xmm5,%xmm1
115	movdqa	%xmm0,176(%r10)
116	movdqa	%xmm4,%xmm0
117
118	paddd	%xmm2,%xmm3
119	pcmpeqd	%xmm5,%xmm2
120	movdqa	%xmm1,192(%r10)
121	movdqa	%xmm4,%xmm1
122
123	paddd	%xmm3,%xmm0
124	pcmpeqd	%xmm5,%xmm3
125	movdqa	%xmm2,208(%r10)
126	movdqa	%xmm4,%xmm2
127
128	paddd	%xmm0,%xmm1
129	pcmpeqd	%xmm5,%xmm0
130	movdqa	%xmm3,224(%r10)
131	movdqa	%xmm4,%xmm3
132	paddd	%xmm1,%xmm2
133	pcmpeqd	%xmm5,%xmm1
134	movdqa	%xmm0,240(%r10)
135	movdqa	%xmm4,%xmm0
136
137	paddd	%xmm2,%xmm3
138	pcmpeqd	%xmm5,%xmm2
139	movdqa	%xmm1,256(%r10)
140	movdqa	%xmm4,%xmm1
141
142	paddd	%xmm3,%xmm0
143	pcmpeqd	%xmm5,%xmm3
144	movdqa	%xmm2,272(%r10)
145	movdqa	%xmm4,%xmm2
146
147	paddd	%xmm0,%xmm1
148	pcmpeqd	%xmm5,%xmm0
149	movdqa	%xmm3,288(%r10)
150	movdqa	%xmm4,%xmm3
151	paddd	%xmm1,%xmm2
152	pcmpeqd	%xmm5,%xmm1
153	movdqa	%xmm0,304(%r10)
154
155	paddd	%xmm2,%xmm3
156.byte	0x67
157	pcmpeqd	%xmm5,%xmm2
158	movdqa	%xmm1,320(%r10)
159
160	pcmpeqd	%xmm5,%xmm3
161	movdqa	%xmm2,336(%r10)
# Constant-time gather: AND every table slot (r12 = table+128) with its mask
# and OR everything together, so all slots are read regardless of the index.
162	pand	64(%r12),%xmm0
163
164	pand	80(%r12),%xmm1
165	pand	96(%r12),%xmm2
166	movdqa	%xmm3,352(%r10)
167	pand	112(%r12),%xmm3
168	por	%xmm2,%xmm0
169	por	%xmm3,%xmm1
170	movdqa	-128(%r12),%xmm4
171	movdqa	-112(%r12),%xmm5
172	movdqa	-96(%r12),%xmm2
173	pand	112(%r10),%xmm4
174	movdqa	-80(%r12),%xmm3
175	pand	128(%r10),%xmm5
176	por	%xmm4,%xmm0
177	pand	144(%r10),%xmm2
178	por	%xmm5,%xmm1
179	pand	160(%r10),%xmm3
180	por	%xmm2,%xmm0
181	por	%xmm3,%xmm1
182	movdqa	-64(%r12),%xmm4
183	movdqa	-48(%r12),%xmm5
184	movdqa	-32(%r12),%xmm2
185	pand	176(%r10),%xmm4
186	movdqa	-16(%r12),%xmm3
187	pand	192(%r10),%xmm5
188	por	%xmm4,%xmm0
189	pand	208(%r10),%xmm2
190	por	%xmm5,%xmm1
191	pand	224(%r10),%xmm3
192	por	%xmm2,%xmm0
193	por	%xmm3,%xmm1
194	movdqa	0(%r12),%xmm4
195	movdqa	16(%r12),%xmm5
196	movdqa	32(%r12),%xmm2
197	pand	240(%r10),%xmm4
198	movdqa	48(%r12),%xmm3
199	pand	256(%r10),%xmm5
200	por	%xmm4,%xmm0
201	pand	272(%r10),%xmm2
202	por	%xmm5,%xmm1
203	pand	288(%r10),%xmm3
204	por	%xmm2,%xmm0
205	por	%xmm3,%xmm1
206	por	%xmm1,%xmm0
# Fold the two 64-bit halves together; the encoded bytes below are
# movq %xmm0,%rbx — rbx now holds the gathered multiplier word b[0].
207	pshufd	$0x4e,%xmm0,%xmm1
208	por	%xmm1,%xmm0
209	leaq	256(%r12),%r12
210.byte	102,72,15,126,195
211
# First outer iteration of the schoolbook Montgomery loop:
# r8 = n0, rbp = m = (t[0]*n0) mod 2^64, scratch t[] lives at (%rsp).
212	movq	(%r8),%r8
213	movq	(%rsi),%rax
214
215	xorq	%r14,%r14
216	xorq	%r15,%r15
217
218	movq	%r8,%rbp
219	mulq	%rbx
220	movq	%rax,%r10
221	movq	(%rcx),%rax
222
223	imulq	%r10,%rbp
224	movq	%rdx,%r11
225
226	mulq	%rbp
227	addq	%rax,%r10
228	movq	8(%rsi),%rax
229	adcq	$0,%rdx
230	movq	%rdx,%r13
231
232	leaq	1(%r15),%r15
233	jmp	.L1st_enter
234
# Inner loop of the first outer iteration: t[j] = a[j]*b[0] + m*n[j] + carry.
235.align	16
236.L1st:
237	addq	%rax,%r13
238	movq	(%rsi,%r15,8),%rax
239	adcq	$0,%rdx
240	addq	%r11,%r13
241	movq	%r10,%r11
242	adcq	$0,%rdx
243	movq	%r13,-16(%rsp,%r15,8)
244	movq	%rdx,%r13
245
246.L1st_enter:
247	mulq	%rbx
248	addq	%rax,%r11
249	movq	(%rcx,%r15,8),%rax
250	adcq	$0,%rdx
251	leaq	1(%r15),%r15
252	movq	%rdx,%r10
253
254	mulq	%rbp
255	cmpq	%r9,%r15
256	jne	.L1st
257
258
# Flush the tail carries of the first pass into t[num-1] and the extra top
# word t[num].
259	addq	%rax,%r13
260	adcq	$0,%rdx
261	addq	%r11,%r13
262	adcq	$0,%rdx
263	movq	%r13,-16(%rsp,%r9,8)
264	movq	%rdx,%r13
265	movq	%r10,%r11
266
267	xorq	%rdx,%rdx
268	addq	%r11,%r13
269	adcq	$0,%rdx
270	movq	%r13,-8(%rsp,%r9,8)
271	movq	%rdx,(%rsp,%r9,8)
272
273	leaq	1(%r14),%r14
274	jmp	.Louter
275.align	16
# Outer loop (r14 = i in 1..num-1): re-gather b[i] constant-time using the
# masks saved earlier (rdx points into the mask area), then accumulate
# t[] = (t[] + a[]*b[i] + m*n[]) / 2^64.
276.Louter:
277	leaq	24+128(%rsp,%r9,8),%rdx
278	andq	$-16,%rdx
279	pxor	%xmm4,%xmm4
280	pxor	%xmm5,%xmm5
281	movdqa	-128(%r12),%xmm0
282	movdqa	-112(%r12),%xmm1
283	movdqa	-96(%r12),%xmm2
284	movdqa	-80(%r12),%xmm3
285	pand	-128(%rdx),%xmm0
286	pand	-112(%rdx),%xmm1
287	por	%xmm0,%xmm4
288	pand	-96(%rdx),%xmm2
289	por	%xmm1,%xmm5
290	pand	-80(%rdx),%xmm3
291	por	%xmm2,%xmm4
292	por	%xmm3,%xmm5
293	movdqa	-64(%r12),%xmm0
294	movdqa	-48(%r12),%xmm1
295	movdqa	-32(%r12),%xmm2
296	movdqa	-16(%r12),%xmm3
297	pand	-64(%rdx),%xmm0
298	pand	-48(%rdx),%xmm1
299	por	%xmm0,%xmm4
300	pand	-32(%rdx),%xmm2
301	por	%xmm1,%xmm5
302	pand	-16(%rdx),%xmm3
303	por	%xmm2,%xmm4
304	por	%xmm3,%xmm5
305	movdqa	0(%r12),%xmm0
306	movdqa	16(%r12),%xmm1
307	movdqa	32(%r12),%xmm2
308	movdqa	48(%r12),%xmm3
309	pand	0(%rdx),%xmm0
310	pand	16(%rdx),%xmm1
311	por	%xmm0,%xmm4
312	pand	32(%rdx),%xmm2
313	por	%xmm1,%xmm5
314	pand	48(%rdx),%xmm3
315	por	%xmm2,%xmm4
316	por	%xmm3,%xmm5
317	movdqa	64(%r12),%xmm0
318	movdqa	80(%r12),%xmm1
319	movdqa	96(%r12),%xmm2
320	movdqa	112(%r12),%xmm3
321	pand	64(%rdx),%xmm0
322	pand	80(%rdx),%xmm1
323	por	%xmm0,%xmm4
324	pand	96(%rdx),%xmm2
325	por	%xmm1,%xmm5
326	pand	112(%rdx),%xmm3
327	por	%xmm2,%xmm4
328	por	%xmm3,%xmm5
329	por	%xmm5,%xmm4
330	pshufd	$0x4e,%xmm4,%xmm0
331	por	%xmm4,%xmm0
332	leaq	256(%r12),%r12
333
# movq %xmm0,%rbx — rbx = gathered b[i].
334	movq	(%rsi),%rax
335.byte	102,72,15,126,195
336
337	xorq	%r15,%r15
338	movq	%r8,%rbp
339	movq	(%rsp),%r10
340
341	mulq	%rbx
342	addq	%rax,%r10
343	movq	(%rcx),%rax
344	adcq	$0,%rdx
345
346	imulq	%r10,%rbp
347	movq	%rdx,%r11
348
349	mulq	%rbp
350	addq	%rax,%r10
351	movq	8(%rsi),%rax
352	adcq	$0,%rdx
353	movq	8(%rsp),%r10
354	movq	%rdx,%r13
355
356	leaq	1(%r15),%r15
357	jmp	.Linner_enter
358
# Inner loop: t[j] = t[j] + a[j]*b[i] + m*n[j] + carry, stored shifted one
# limb down (the /2^64 of the Montgomery step).
359.align	16
360.Linner:
361	addq	%rax,%r13
362	movq	(%rsi,%r15,8),%rax
363	adcq	$0,%rdx
364	addq	%r10,%r13
365	movq	(%rsp,%r15,8),%r10
366	adcq	$0,%rdx
367	movq	%r13,-16(%rsp,%r15,8)
368	movq	%rdx,%r13
369
370.Linner_enter:
371	mulq	%rbx
372	addq	%rax,%r11
373	movq	(%rcx,%r15,8),%rax
374	adcq	$0,%rdx
375	addq	%r11,%r10
376	movq	%rdx,%r11
377	adcq	$0,%r11
378	leaq	1(%r15),%r15
379
380	mulq	%rbp
381	cmpq	%r9,%r15
382	jne	.Linner
383
384	addq	%rax,%r13
385	adcq	$0,%rdx
386	addq	%r10,%r13
387	movq	(%rsp,%r9,8),%r10
388	adcq	$0,%rdx
389	movq	%r13,-16(%rsp,%r9,8)
390	movq	%rdx,%r13
391
392	xorq	%rdx,%rdx
393	addq	%r11,%r13
394	adcq	$0,%rdx
395	addq	%r10,%r13
396	adcq	$0,%rdx
397	movq	%r13,-8(%rsp,%r9,8)
398	movq	%rdx,(%rsp,%r9,8)
399
400	leaq	1(%r14),%r14
401	cmpq	%r9,%r14
402	jb	.Louter
403
# Final reduction, part 1: unconditionally compute t - n into rp.
# The xorq clears CF so the first sbbq starts with no borrow (mov/lea/jmp
# below preserve flags).
404	xorq	%r14,%r14
405	movq	(%rsp),%rax
406	leaq	(%rsp),%rsi
407	movq	%r9,%r15
408	jmp	.Lsub
409.align	16
410.Lsub:	sbbq	(%rcx,%r14,8),%rax
411	movq	%rax,(%rdi,%r14,8)
412	movq	8(%rsi,%r14,8),%rax
413	leaq	1(%r14),%r14
414	decq	%r15
415	jnz	.Lsub
416
# Final reduction, part 2: rax/rbx become complementary all-zeros/all-ones
# masks from the final borrow; select between t (in scratch) and t-n (already
# in rp) without branching, and wipe the scratch with the loop counter as we
# go (clears the intermediate secret-dependent values).
417	sbbq	$0,%rax
418	movq	$-1,%rbx
419	xorq	%rax,%rbx
420	xorq	%r14,%r14
421	movq	%r9,%r15
422
423.Lcopy:
424	movq	(%rdi,%r14,8),%rcx
425	movq	(%rsp,%r14,8),%rdx
426	andq	%rbx,%rcx
427	andq	%rax,%rdx
428	movq	%r14,(%rsp,%r14,8)
429	orq	%rcx,%rdx
430	movq	%rdx,(%rdi,%r14,8)
431	leaq	1(%r14),%r14
432	subq	$1,%r15
433	jnz	.Lcopy
434
# Epilogue: recover the original rsp saved at 8(%rsp,%r9,8), restore the six
# callee-saved GPRs from just below it, return 1 in rax.
435	movq	8(%rsp,%r9,8),%rsi
436.cfi_def_cfa	%rsi,8
437	movq	$1,%rax
438
439	movq	-48(%rsi),%r15
440.cfi_restore	%r15
441	movq	-40(%rsi),%r14
442.cfi_restore	%r14
443	movq	-32(%rsi),%r13
444.cfi_restore	%r13
445	movq	-24(%rsi),%r12
446.cfi_restore	%r12
447	movq	-16(%rsi),%rbp
448.cfi_restore	%rbp
449	movq	-8(%rsi),%rbx
450.cfi_restore	%rbx
451	leaq	(%rsi),%rsp
452.cfi_def_cfa_register	%rsp
453.Lmul_epilogue:
# 0xf3,0xc3 = rep ret.
454	.byte	0xf3,0xc3
455.cfi_endproc
456.size	GFp_bn_mul_mont_gather5,.-GFp_bn_mul_mont_gather5
# -----------------------------------------------------------------------------
# bn_mul4x_mont_gather5 — file-local wrapper for the 4x-unrolled Montgomery
# multiply.  Reached from GFp_bn_mul_mont_gather5 via .Lmul4x_enter with the
# same arguments and r11d = word 1 of GFp_ia32cap_P; it sets up an aligned
# scratch frame and calls mul4x_internal.
# -----------------------------------------------------------------------------
457.type	bn_mul4x_mont_gather5,@function
458.align	32
459bn_mul4x_mont_gather5:
460.cfi_startproc
# 0x67 bytes here and below are address-size prefixes used only as padding to
# shape instruction alignment; they do not change semantics.
461.byte	0x67
462	movq	%rsp,%rax
463.cfi_def_cfa_register	%rax
464.Lmul4x_enter:
# If both feature bit groups in mask 0x80108 are set in the ia32cap word,
# dispatch to the mulx/adx implementation (.Lmulx4x_enter, defined elsewhere
# in this file).  NOTE(review): mask matches upstream's BMI2+ADX check —
# confirm against the GFp_ia32cap_P layout.
465	andl	$0x80108,%r11d
466	cmpl	$0x80108,%r11d
467	je	.Lmulx4x_enter
468	pushq	%rbx
469.cfi_offset	%rbx,-16
470	pushq	%rbp
471.cfi_offset	%rbp,-24
472	pushq	%r12
473.cfi_offset	%r12,-32
474	pushq	%r13
475.cfi_offset	%r13,-40
476	pushq	%r14
477.cfi_offset	%r14,-48
478	pushq	%r15
479.cfi_offset	%r15,-56
480.Lmul4x_prologue:
481
# r9 = num*8 (bytes); r10 = 3*num*8, used to decide how far the frame may be
# shifted for alias avoidance.
482.byte	0x67
483	shll	$3,%r9d
484	leaq	(%r9,%r9,2),%r10
485	negq	%r9
486
487
488
489
490
491
492
493
494
495
# Choose a scratch frame base (rbp) of 2*num*8+320 bytes below rsp, nudged so
# its low 12 bits differ from rp's (subq %rdi / andq $4095): keeps the frame
# and the output from colliding in the same cache/page slots.
496	leaq	-320(%rsp,%r9,2),%r11
497	movq	%rsp,%rbp
498	subq	%rdi,%r11
499	andq	$4095,%r11
500	cmpq	%r11,%r10
501	jb	.Lmul4xsp_alt
502	subq	%r11,%rbp
503	leaq	-320(%rbp,%r9,2),%rbp
504	jmp	.Lmul4xsp_done
505
506.align	32
.Lmul4xsp_alt appears below; alternate adjustment when the simple shift would
# overshoot (cmov clamps the displacement to zero on carry).
507.Lmul4xsp_alt:
508	leaq	4096-320(,%r9,2),%r10
509	leaq	-320(%rbp,%r9,2),%rbp
510	subq	%r10,%r11
511	movq	$0,%r10
512	cmovcq	%r10,%r11
513	subq	%r11,%rbp
514.Lmul4xsp_done:
# 64-byte-align the frame, then commit rsp and probe each new stack page in
# order (same guard-page walk as the 1x path).
515	andq	$-64,%rbp
516	movq	%rsp,%r11
517	subq	%rbp,%r11
518	andq	$-4096,%r11
519	leaq	(%r11,%rbp,1),%rsp
520	movq	(%rsp),%r10
521	cmpq	%rbp,%rsp
522	ja	.Lmul4x_page_walk
523	jmp	.Lmul4x_page_walk_done
524
525.Lmul4x_page_walk:
526	leaq	-4096(%rsp),%rsp
527	movq	(%rsp),%r10
528	cmpq	%rbp,%rsp
529	ja	.Lmul4x_page_walk
530.Lmul4x_page_walk_done:
531
532	negq	%r9
533
# Save the caller's rsp (rax) at 40(%rsp) for the epilogue; cfi_escape
# re-expresses the CFA through that slot.
534	movq	%rax,40(%rsp)
535.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
536.Lmul4x_body:
537
538	call	mul4x_internal
539
# Epilogue: restore callee-saved registers from below the saved rsp and
# return 1.
540	movq	40(%rsp),%rsi
541.cfi_def_cfa	%rsi,8
542	movq	$1,%rax
543
544	movq	-48(%rsi),%r15
545.cfi_restore	%r15
546	movq	-40(%rsi),%r14
547.cfi_restore	%r14
548	movq	-32(%rsi),%r13
549.cfi_restore	%r13
550	movq	-24(%rsi),%r12
551.cfi_restore	%r12
552	movq	-16(%rsi),%rbp
553.cfi_restore	%rbp
554	movq	-8(%rsi),%rbx
555.cfi_restore	%rbx
556	leaq	(%rsi),%rsp
557.cfi_def_cfa_register	%rsp
558.Lmul4x_epilogue:
# rep ret.
559	.byte	0xf3,0xc3
560.cfi_endproc
561.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
562
# -----------------------------------------------------------------------------
# mul4x_internal — 4x-unrolled Montgomery multiplication core.
# Called by bn_mul4x_mont_gather5 and GFp_bn_power5 with:
#   rdi=rp, rsi=ap, rdx=multiplier table, rcx=np, r8=&n0, r9=num*8 (bytes,
#   negated by the caller's frame setup path), rax=caller's original rsp —
#   8(%rax) is the 7th stack argument (table index), loaded at line 568.
# The same constant-time mask-gather as the 1x path selects table entries.
# Falls out through .Lsqr4x_sub_entry (defined elsewhere in this file) for the
# final conditional subtraction.
# -----------------------------------------------------------------------------
563.type	mul4x_internal,@function
564.align	32
565mul4x_internal:
566.cfi_startproc
567	shlq	$5,%r9
# xmm5 = table index from the caller's original stack frame.
568	movd	8(%rax),%xmm5
569	leaq	.Linc(%rip),%rax
# r13 = one-past-the-end sentinel of the multiplier table (checked at 1069).
570	leaq	128(%rdx,%r9,1),%r13
571	shrq	$5,%r9
572	movdqa	0(%rax),%xmm0
573	movdqa	16(%rax),%xmm1
574	leaq	88-112(%rsp,%r9,1),%r10
575	leaq	128(%rdx),%r12
576
# Build the 16 index-comparison masks at 112..352(%r10), exactly as in
# GFp_bn_mul_mont_gather5 (.byte 0x67 prefixes are alignment padding).
577	pshufd	$0,%xmm5,%xmm5
578	movdqa	%xmm1,%xmm4
579.byte	0x67,0x67
580	movdqa	%xmm1,%xmm2
581	paddd	%xmm0,%xmm1
582	pcmpeqd	%xmm5,%xmm0
583.byte	0x67
584	movdqa	%xmm4,%xmm3
585	paddd	%xmm1,%xmm2
586	pcmpeqd	%xmm5,%xmm1
587	movdqa	%xmm0,112(%r10)
588	movdqa	%xmm4,%xmm0
589
590	paddd	%xmm2,%xmm3
591	pcmpeqd	%xmm5,%xmm2
592	movdqa	%xmm1,128(%r10)
593	movdqa	%xmm4,%xmm1
594
595	paddd	%xmm3,%xmm0
596	pcmpeqd	%xmm5,%xmm3
597	movdqa	%xmm2,144(%r10)
598	movdqa	%xmm4,%xmm2
599
600	paddd	%xmm0,%xmm1
601	pcmpeqd	%xmm5,%xmm0
602	movdqa	%xmm3,160(%r10)
603	movdqa	%xmm4,%xmm3
604	paddd	%xmm1,%xmm2
605	pcmpeqd	%xmm5,%xmm1
606	movdqa	%xmm0,176(%r10)
607	movdqa	%xmm4,%xmm0
608
609	paddd	%xmm2,%xmm3
610	pcmpeqd	%xmm5,%xmm2
611	movdqa	%xmm1,192(%r10)
612	movdqa	%xmm4,%xmm1
613
614	paddd	%xmm3,%xmm0
615	pcmpeqd	%xmm5,%xmm3
616	movdqa	%xmm2,208(%r10)
617	movdqa	%xmm4,%xmm2
618
619	paddd	%xmm0,%xmm1
620	pcmpeqd	%xmm5,%xmm0
621	movdqa	%xmm3,224(%r10)
622	movdqa	%xmm4,%xmm3
623	paddd	%xmm1,%xmm2
624	pcmpeqd	%xmm5,%xmm1
625	movdqa	%xmm0,240(%r10)
626	movdqa	%xmm4,%xmm0
627
628	paddd	%xmm2,%xmm3
629	pcmpeqd	%xmm5,%xmm2
630	movdqa	%xmm1,256(%r10)
631	movdqa	%xmm4,%xmm1
632
633	paddd	%xmm3,%xmm0
634	pcmpeqd	%xmm5,%xmm3
635	movdqa	%xmm2,272(%r10)
636	movdqa	%xmm4,%xmm2
637
638	paddd	%xmm0,%xmm1
639	pcmpeqd	%xmm5,%xmm0
640	movdqa	%xmm3,288(%r10)
641	movdqa	%xmm4,%xmm3
642	paddd	%xmm1,%xmm2
643	pcmpeqd	%xmm5,%xmm1
644	movdqa	%xmm0,304(%r10)
645
646	paddd	%xmm2,%xmm3
647.byte	0x67
648	pcmpeqd	%xmm5,%xmm2
649	movdqa	%xmm1,320(%r10)
650
651	pcmpeqd	%xmm5,%xmm3
652	movdqa	%xmm2,336(%r10)
# Constant-time gather of entry 0 of the multiplier: read every slot, keep
# only the masked one.
653	pand	64(%r12),%xmm0
654
655	pand	80(%r12),%xmm1
656	pand	96(%r12),%xmm2
657	movdqa	%xmm3,352(%r10)
658	pand	112(%r12),%xmm3
659	por	%xmm2,%xmm0
660	por	%xmm3,%xmm1
661	movdqa	-128(%r12),%xmm4
662	movdqa	-112(%r12),%xmm5
663	movdqa	-96(%r12),%xmm2
664	pand	112(%r10),%xmm4
665	movdqa	-80(%r12),%xmm3
666	pand	128(%r10),%xmm5
667	por	%xmm4,%xmm0
668	pand	144(%r10),%xmm2
669	por	%xmm5,%xmm1
670	pand	160(%r10),%xmm3
671	por	%xmm2,%xmm0
672	por	%xmm3,%xmm1
673	movdqa	-64(%r12),%xmm4
674	movdqa	-48(%r12),%xmm5
675	movdqa	-32(%r12),%xmm2
676	pand	176(%r10),%xmm4
677	movdqa	-16(%r12),%xmm3
678	pand	192(%r10),%xmm5
679	por	%xmm4,%xmm0
680	pand	208(%r10),%xmm2
681	por	%xmm5,%xmm1
682	pand	224(%r10),%xmm3
683	por	%xmm2,%xmm0
684	por	%xmm3,%xmm1
685	movdqa	0(%r12),%xmm4
686	movdqa	16(%r12),%xmm5
687	movdqa	32(%r12),%xmm2
688	pand	240(%r10),%xmm4
689	movdqa	48(%r12),%xmm3
690	pand	256(%r10),%xmm5
691	por	%xmm4,%xmm0
692	pand	272(%r10),%xmm2
693	por	%xmm5,%xmm1
694	pand	288(%r10),%xmm3
695	por	%xmm2,%xmm0
696	por	%xmm3,%xmm1
697	por	%xmm1,%xmm0
698	pshufd	$0x4e,%xmm0,%xmm1
699	por	%xmm1,%xmm0
700	leaq	256(%r12),%r12
# movq %xmm0,%rbx — rbx = gathered b[0].
701.byte	102,72,15,126,195
702
# Stash the table-end sentinel and rp; rdi is repurposed as a carry limb
# inside the unrolled loops below.
703	movq	%r13,16+8(%rsp)
704	movq	%rdi,56+8(%rsp)
705
# First outer pass: r8 = n0, rbp = m, rsi biased to the end of ap so the
# (negative) r9/r15 counters index it; t[] accumulates at 64+8(%rsp) via r14.
706	movq	(%r8),%r8
707	movq	(%rsi),%rax
708	leaq	(%rsi,%r9,1),%rsi
709	negq	%r9
710
711	movq	%r8,%rbp
712	mulq	%rbx
713	movq	%rax,%r10
714	movq	(%rcx),%rax
715
716	imulq	%r10,%rbp
717	leaq	64+8(%rsp),%r14
718	movq	%rdx,%r11
719
720	mulq	%rbp
721	addq	%rax,%r10
722	movq	8(%rsi,%r9,1),%rax
723	adcq	$0,%rdx
724	movq	%rdx,%rdi
725
726	mulq	%rbx
727	addq	%rax,%r11
728	movq	8(%rcx),%rax
729	adcq	$0,%rdx
730	movq	%rdx,%r10
731
732	mulq	%rbp
733	addq	%rax,%rdi
734	movq	16(%rsi,%r9,1),%rax
735	adcq	$0,%rdx
736	addq	%r11,%rdi
737	leaq	32(%r9),%r15
738	leaq	32(%rcx),%rcx
739	adcq	$0,%rdx
740	movq	%rdi,(%r14)
741	movq	%rdx,%r13
742	jmp	.L1st4x
743
# First pass, 4-limbs-per-iteration inner loop:
# t[j..j+3] = a[j..j+3]*b[0] + m*n[j..j+3] + carries.
744.align	32
745.L1st4x:
746	mulq	%rbx
747	addq	%rax,%r10
748	movq	-16(%rcx),%rax
749	leaq	32(%r14),%r14
750	adcq	$0,%rdx
751	movq	%rdx,%r11
752
753	mulq	%rbp
754	addq	%rax,%r13
755	movq	-8(%rsi,%r15,1),%rax
756	adcq	$0,%rdx
757	addq	%r10,%r13
758	adcq	$0,%rdx
759	movq	%r13,-24(%r14)
760	movq	%rdx,%rdi
761
762	mulq	%rbx
763	addq	%rax,%r11
764	movq	-8(%rcx),%rax
765	adcq	$0,%rdx
766	movq	%rdx,%r10
767
768	mulq	%rbp
769	addq	%rax,%rdi
770	movq	(%rsi,%r15,1),%rax
771	adcq	$0,%rdx
772	addq	%r11,%rdi
773	adcq	$0,%rdx
774	movq	%rdi,-16(%r14)
775	movq	%rdx,%r13
776
777	mulq	%rbx
778	addq	%rax,%r10
779	movq	0(%rcx),%rax
780	adcq	$0,%rdx
781	movq	%rdx,%r11
782
783	mulq	%rbp
784	addq	%rax,%r13
785	movq	8(%rsi,%r15,1),%rax
786	adcq	$0,%rdx
787	addq	%r10,%r13
788	adcq	$0,%rdx
789	movq	%r13,-8(%r14)
790	movq	%rdx,%rdi
791
792	mulq	%rbx
793	addq	%rax,%r11
794	movq	8(%rcx),%rax
795	adcq	$0,%rdx
796	movq	%rdx,%r10
797
798	mulq	%rbp
799	addq	%rax,%rdi
800	movq	16(%rsi,%r15,1),%rax
801	adcq	$0,%rdx
802	addq	%r11,%rdi
803	leaq	32(%rcx),%rcx
804	adcq	$0,%rdx
805	movq	%rdi,(%r14)
806	movq	%rdx,%r13
807
808	addq	$32,%r15
809	jnz	.L1st4x
810
# First pass tail: last two limb pairs outside the loop, then rewind np.
811	mulq	%rbx
812	addq	%rax,%r10
813	movq	-16(%rcx),%rax
814	leaq	32(%r14),%r14
815	adcq	$0,%rdx
816	movq	%rdx,%r11
817
818	mulq	%rbp
819	addq	%rax,%r13
820	movq	-8(%rsi),%rax
821	adcq	$0,%rdx
822	addq	%r10,%r13
823	adcq	$0,%rdx
824	movq	%r13,-24(%r14)
825	movq	%rdx,%rdi
826
827	mulq	%rbx
828	addq	%rax,%r11
829	movq	-8(%rcx),%rax
830	adcq	$0,%rdx
831	movq	%rdx,%r10
832
833	mulq	%rbp
834	addq	%rax,%rdi
835	movq	(%rsi,%r9,1),%rax
836	adcq	$0,%rdx
837	addq	%r11,%rdi
838	adcq	$0,%rdx
839	movq	%rdi,-16(%r14)
840	movq	%rdx,%r13
841
842	leaq	(%rcx,%r9,1),%rcx
843
844	xorq	%rdi,%rdi
845	addq	%r10,%r13
846	adcq	$0,%rdi
847	movq	%r13,-8(%r14)
848
849	jmp	.Louter4x
850
# Outer loop: constant-time re-gather of the next b[i] (masks live at
# 16+128(%r14)), then one Montgomery pass over t[].
851.align	32
852.Louter4x:
853	leaq	16+128(%r14),%rdx
854	pxor	%xmm4,%xmm4
855	pxor	%xmm5,%xmm5
856	movdqa	-128(%r12),%xmm0
857	movdqa	-112(%r12),%xmm1
858	movdqa	-96(%r12),%xmm2
859	movdqa	-80(%r12),%xmm3
860	pand	-128(%rdx),%xmm0
861	pand	-112(%rdx),%xmm1
862	por	%xmm0,%xmm4
863	pand	-96(%rdx),%xmm2
864	por	%xmm1,%xmm5
865	pand	-80(%rdx),%xmm3
866	por	%xmm2,%xmm4
867	por	%xmm3,%xmm5
868	movdqa	-64(%r12),%xmm0
869	movdqa	-48(%r12),%xmm1
870	movdqa	-32(%r12),%xmm2
871	movdqa	-16(%r12),%xmm3
872	pand	-64(%rdx),%xmm0
873	pand	-48(%rdx),%xmm1
874	por	%xmm0,%xmm4
875	pand	-32(%rdx),%xmm2
876	por	%xmm1,%xmm5
877	pand	-16(%rdx),%xmm3
878	por	%xmm2,%xmm4
879	por	%xmm3,%xmm5
880	movdqa	0(%r12),%xmm0
881	movdqa	16(%r12),%xmm1
882	movdqa	32(%r12),%xmm2
883	movdqa	48(%r12),%xmm3
884	pand	0(%rdx),%xmm0
885	pand	16(%rdx),%xmm1
886	por	%xmm0,%xmm4
887	pand	32(%rdx),%xmm2
888	por	%xmm1,%xmm5
889	pand	48(%rdx),%xmm3
890	por	%xmm2,%xmm4
891	por	%xmm3,%xmm5
892	movdqa	64(%r12),%xmm0
893	movdqa	80(%r12),%xmm1
894	movdqa	96(%r12),%xmm2
895	movdqa	112(%r12),%xmm3
896	pand	64(%rdx),%xmm0
897	pand	80(%rdx),%xmm1
898	por	%xmm0,%xmm4
899	pand	96(%rdx),%xmm2
900	por	%xmm1,%xmm5
901	pand	112(%rdx),%xmm3
902	por	%xmm2,%xmm4
903	por	%xmm3,%xmm5
904	por	%xmm5,%xmm4
905	pshufd	$0x4e,%xmm4,%xmm0
906	por	%xmm4,%xmm0
907	leaq	256(%r12),%r12
# movq %xmm0,%rbx — rbx = gathered b[i].
908.byte	102,72,15,126,195
909
910	movq	(%r14,%r9,1),%r10
911	movq	%r8,%rbp
912	mulq	%rbx
913	addq	%rax,%r10
914	movq	(%rcx),%rax
915	adcq	$0,%rdx
916
917	imulq	%r10,%rbp
918	movq	%rdx,%r11
# Store the previous pass's top carry (rdi) before r14 is rewound.
919	movq	%rdi,(%r14)
920
921	leaq	(%r14,%r9,1),%r14
922
923	mulq	%rbp
924	addq	%rax,%r10
925	movq	8(%rsi,%r9,1),%rax
926	adcq	$0,%rdx
927	movq	%rdx,%rdi
928
929	mulq	%rbx
930	addq	%rax,%r11
931	movq	8(%rcx),%rax
932	adcq	$0,%rdx
933	addq	8(%r14),%r11
934	adcq	$0,%rdx
935	movq	%rdx,%r10
936
937	mulq	%rbp
938	addq	%rax,%rdi
939	movq	16(%rsi,%r9,1),%rax
940	adcq	$0,%rdx
941	addq	%r11,%rdi
942	leaq	32(%r9),%r15
943	leaq	32(%rcx),%rcx
944	adcq	$0,%rdx
945	movq	%rdx,%r13
946	jmp	.Linner4x
947
# Inner loop, 4 limbs per iteration:
# t[j..j+3] = t[j..j+3] + a[j..j+3]*b[i] + m*n[j..j+3], stored one limb down.
948.align	32
949.Linner4x:
950	mulq	%rbx
951	addq	%rax,%r10
952	movq	-16(%rcx),%rax
953	adcq	$0,%rdx
954	addq	16(%r14),%r10
955	leaq	32(%r14),%r14
956	adcq	$0,%rdx
957	movq	%rdx,%r11
958
959	mulq	%rbp
960	addq	%rax,%r13
961	movq	-8(%rsi,%r15,1),%rax
962	adcq	$0,%rdx
963	addq	%r10,%r13
964	adcq	$0,%rdx
965	movq	%rdi,-32(%r14)
966	movq	%rdx,%rdi
967
968	mulq	%rbx
969	addq	%rax,%r11
970	movq	-8(%rcx),%rax
971	adcq	$0,%rdx
972	addq	-8(%r14),%r11
973	adcq	$0,%rdx
974	movq	%rdx,%r10
975
976	mulq	%rbp
977	addq	%rax,%rdi
978	movq	(%rsi,%r15,1),%rax
979	adcq	$0,%rdx
980	addq	%r11,%rdi
981	adcq	$0,%rdx
982	movq	%r13,-24(%r14)
983	movq	%rdx,%r13
984
985	mulq	%rbx
986	addq	%rax,%r10
987	movq	0(%rcx),%rax
988	adcq	$0,%rdx
989	addq	(%r14),%r10
990	adcq	$0,%rdx
991	movq	%rdx,%r11
992
993	mulq	%rbp
994	addq	%rax,%r13
995	movq	8(%rsi,%r15,1),%rax
996	adcq	$0,%rdx
997	addq	%r10,%r13
998	adcq	$0,%rdx
999	movq	%rdi,-16(%r14)
1000	movq	%rdx,%rdi
1001
1002	mulq	%rbx
1003	addq	%rax,%r11
1004	movq	8(%rcx),%rax
1005	adcq	$0,%rdx
1006	addq	8(%r14),%r11
1007	adcq	$0,%rdx
1008	movq	%rdx,%r10
1009
1010	mulq	%rbp
1011	addq	%rax,%rdi
1012	movq	16(%rsi,%r15,1),%rax
1013	adcq	$0,%rdx
1014	addq	%r11,%rdi
1015	leaq	32(%rcx),%rcx
1016	adcq	$0,%rdx
1017	movq	%r13,-8(%r14)
1018	movq	%rdx,%r13
1019
1020	addq	$32,%r15
1021	jnz	.Linner4x
1022
# Inner-loop tail: final two limb pairs; note m (rbp) and a-limb swap through
# rax/rbp at 1043-1044 to reuse the multiplier register.
1023	mulq	%rbx
1024	addq	%rax,%r10
1025	movq	-16(%rcx),%rax
1026	adcq	$0,%rdx
1027	addq	16(%r14),%r10
1028	leaq	32(%r14),%r14
1029	adcq	$0,%rdx
1030	movq	%rdx,%r11
1031
1032	mulq	%rbp
1033	addq	%rax,%r13
1034	movq	-8(%rsi),%rax
1035	adcq	$0,%rdx
1036	addq	%r10,%r13
1037	adcq	$0,%rdx
1038	movq	%rdi,-32(%r14)
1039	movq	%rdx,%rdi
1040
1041	mulq	%rbx
1042	addq	%rax,%r11
1043	movq	%rbp,%rax
1044	movq	-8(%rcx),%rbp
1045	adcq	$0,%rdx
1046	addq	-8(%r14),%r11
1047	adcq	$0,%rdx
1048	movq	%rdx,%r10
1049
1050	mulq	%rbp
1051	addq	%rax,%rdi
1052	movq	(%rsi,%r9,1),%rax
1053	adcq	$0,%rdx
1054	addq	%r11,%rdi
1055	adcq	$0,%rdx
1056	movq	%r13,-24(%r14)
1057	movq	%rdx,%r13
1058
1059	movq	%rdi,-16(%r14)
1060	leaq	(%rcx,%r9,1),%rcx
1061
1062	xorq	%rdi,%rdi
1063	addq	%r10,%r13
1064	adcq	$0,%rdi
1065	addq	(%r14),%r13
1066	adcq	$0,%rdi
1067	movq	%r13,-8(%r14)
1068
# Loop until r12 has walked the whole multiplier table (sentinel saved at
# 16+8(%rsp)).  Then build the borrow mask in rax and hand off to the shared
# constant-time subtraction tail .Lsqr4x_sub_entry (defined elsewhere in this
# file): rbx=t, rbp=np, rcx=limb count/4 iterations, rdi=rp.
1069	cmpq	16+8(%rsp),%r12
1070	jb	.Louter4x
1071	xorq	%rax,%rax
1072	subq	%r13,%rbp
1073	adcq	%r15,%r15
1074	orq	%r15,%rdi
1075	subq	%rdi,%rax
1076	leaq	(%r14,%r9,1),%rbx
1077	movq	(%rcx),%r12
1078	leaq	(%rcx),%rbp
1079	movq	%r9,%rcx
1080	sarq	$3+2,%rcx
1081	movq	56+8(%rsp),%rdi
1082	decq	%r12
1083	xorq	%r10,%r10
1084	movq	8(%rbp),%r13
1085	movq	16(%rbp),%r14
1086	movq	24(%rbp),%r15
1087	jmp	.Lsqr4x_sub_entry
1088.cfi_endproc
1089.size	mul4x_internal,.-mul4x_internal
# -----------------------------------------------------------------------------
# GFp_bn_power5 — five Montgomery squarings followed by one gather-multiply:
# it calls __bn_sqr8x_internal + __bn_post4x_internal five times and then
# mul4x_internal once, i.e. out = (a^(2^5)) * table[power] in the Montgomery
# domain.  NOTE(review): exact contract depends on __bn_sqr8x_internal /
# __bn_post4x_internal (defined elsewhere in this file) — confirm against the
# C caller.  Arguments mirror GFp_bn_mul_mont_gather5 (rdi=rp, rsi=ap,
# rdx=table, rcx=np, r8=&n0, r9d=num, power on the stack).
# -----------------------------------------------------------------------------
1090.globl	GFp_bn_power5
1091.hidden GFp_bn_power5
1092.type	GFp_bn_power5,@function
1093.align	32
1094GFp_bn_power5:
1095.cfi_startproc
1096	movq	%rsp,%rax
1097.cfi_def_cfa_register	%rax
# Dispatch to the mulx/adx variant (.Lpowerx5_enter, elsewhere in this file)
# when the 0x80108 feature bits are all set in word 1 of GFp_ia32cap_P.
1098	leaq	GFp_ia32cap_P(%rip),%r11
1099	movl	8(%r11),%r11d
1100	andl	$0x80108,%r11d
1101	cmpl	$0x80108,%r11d
1102	je	.Lpowerx5_enter
1103	pushq	%rbx
1104.cfi_offset	%rbx,-16
1105	pushq	%rbp
1106.cfi_offset	%rbp,-24
1107	pushq	%r12
1108.cfi_offset	%r12,-32
1109	pushq	%r13
1110.cfi_offset	%r13,-40
1111	pushq	%r14
1112.cfi_offset	%r14,-48
1113	pushq	%r15
1114.cfi_offset	%r15,-56
1115.Lpower5_prologue:
1116
# r9 = num*8 bytes (negated); r8 = n0 value; r10d = 3*num*8 for the frame
# placement heuristic below.
1117	shll	$3,%r9d
1118	leal	(%r9,%r9,2),%r10d
1119	negq	%r9
1120	movq	(%r8),%r8
1121
1122
1123
1124
1125
1126
1127
1128
# Same alias-avoiding frame placement as bn_mul4x_mont_gather5: shift the
# 2*num*8+320-byte scratch frame so its page offset differs from rp's.
1129	leaq	-320(%rsp,%r9,2),%r11
1130	movq	%rsp,%rbp
1131	subq	%rdi,%r11
1132	andq	$4095,%r11
1133	cmpq	%r11,%r10
1134	jb	.Lpwr_sp_alt
1135	subq	%r11,%rbp
1136	leaq	-320(%rbp,%r9,2),%rbp
1137	jmp	.Lpwr_sp_done
1138
1139.align	32
1140.Lpwr_sp_alt:
1141	leaq	4096-320(,%r9,2),%r10
1142	leaq	-320(%rbp,%r9,2),%rbp
1143	subq	%r10,%r11
1144	movq	$0,%r10
1145	cmovcq	%r10,%r11
1146	subq	%r11,%rbp
1147.Lpwr_sp_done:
# Commit the 64-byte-aligned frame and probe each new stack page in order.
1148	andq	$-64,%rbp
1149	movq	%rsp,%r11
1150	subq	%rbp,%r11
1151	andq	$-4096,%r11
1152	leaq	(%r11,%rbp,1),%rsp
1153	movq	(%rsp),%r10
1154	cmpq	%rbp,%rsp
1155	ja	.Lpwr_page_walk
1156	jmp	.Lpwr_page_walk_done
1157
1158.Lpwr_page_walk:
1159	leaq	-4096(%rsp),%rsp
1160	movq	(%rsp),%r10
1161	cmpq	%rbp,%rsp
1162	ja	.Lpwr_page_walk
1163.Lpwr_page_walk_done:
1164
1165	movq	%r9,%r10
1166	negq	%r9
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
# Save n0 at 32(%rsp) and the caller's rsp at 40(%rsp) for mul4x_internal and
# the epilogue.
1177	movq	%r8,32(%rsp)
1178	movq	%rax,40(%rsp)
1179.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
1180.Lpower5_body:
# Preserve pointer arguments across the squaring calls in xmm registers:
# movq %rdi,%xmm1 / movq %rcx,%xmm2 / movq %r10,%xmm3 / movq %rdx,%xmm4.
1181.byte	102,72,15,110,207
1182.byte	102,72,15,110,209
1183.byte	102,73,15,110,218
1184.byte	102,72,15,110,226
1185
# Five modular squarings (square + post-processing each time).
1186	call	__bn_sqr8x_internal
1187	call	__bn_post4x_internal
1188	call	__bn_sqr8x_internal
1189	call	__bn_post4x_internal
1190	call	__bn_sqr8x_internal
1191	call	__bn_post4x_internal
1192	call	__bn_sqr8x_internal
1193	call	__bn_post4x_internal
1194	call	__bn_sqr8x_internal
1195	call	__bn_post4x_internal
1196
# Restore np and the table pointer (movq %xmm2,%rcx / movq %xmm4,%rdx), then
# multiply the squared value by the gathered table entry.
1197.byte	102,72,15,126,209
1198.byte	102,72,15,126,226
1199	movq	%rsi,%rdi
1200	movq	40(%rsp),%rax
1201	leaq	32(%rsp),%r8
1202
1203	call	mul4x_internal
1204
# Epilogue: restore callee-saved registers from below the saved rsp; return 1.
1205	movq	40(%rsp),%rsi
1206.cfi_def_cfa	%rsi,8
1207	movq	$1,%rax
1208	movq	-48(%rsi),%r15
1209.cfi_restore	%r15
1210	movq	-40(%rsi),%r14
1211.cfi_restore	%r14
1212	movq	-32(%rsi),%r13
1213.cfi_restore	%r13
1214	movq	-24(%rsi),%r12
1215.cfi_restore	%r12
1216	movq	-16(%rsi),%rbp
1217.cfi_restore	%rbp
1218	movq	-8(%rsi),%rbx
1219.cfi_restore	%rbx
1220	leaq	(%rsi),%rsp
1221.cfi_def_cfa_register	%rsp
1222.Lpower5_epilogue:
# rep ret.
1223	.byte	0xf3,0xc3
1224.cfi_endproc
1225.size	GFp_bn_power5,.-GFp_bn_power5
1226
1227.globl	GFp_bn_sqr8x_internal
1228.hidden GFp_bn_sqr8x_internal
1229.hidden	GFp_bn_sqr8x_internal
1230.type	GFp_bn_sqr8x_internal,@function
1231.align	32
1232GFp_bn_sqr8x_internal:
1233__bn_sqr8x_internal:
1234.cfi_startproc
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308	leaq	32(%r10),%rbp
1309	leaq	(%rsi,%r9,1),%rsi
1310
1311	movq	%r9,%rcx
1312
1313
1314	movq	-32(%rsi,%rbp,1),%r14
1315	leaq	48+8(%rsp,%r9,2),%rdi
1316	movq	-24(%rsi,%rbp,1),%rax
1317	leaq	-32(%rdi,%rbp,1),%rdi
1318	movq	-16(%rsi,%rbp,1),%rbx
1319	movq	%rax,%r15
1320
1321	mulq	%r14
1322	movq	%rax,%r10
1323	movq	%rbx,%rax
1324	movq	%rdx,%r11
1325	movq	%r10,-24(%rdi,%rbp,1)
1326
1327	mulq	%r14
1328	addq	%rax,%r11
1329	movq	%rbx,%rax
1330	adcq	$0,%rdx
1331	movq	%r11,-16(%rdi,%rbp,1)
1332	movq	%rdx,%r10
1333
1334
1335	movq	-8(%rsi,%rbp,1),%rbx
1336	mulq	%r15
1337	movq	%rax,%r12
1338	movq	%rbx,%rax
1339	movq	%rdx,%r13
1340
1341	leaq	(%rbp),%rcx
1342	mulq	%r14
1343	addq	%rax,%r10
1344	movq	%rbx,%rax
1345	movq	%rdx,%r11
1346	adcq	$0,%r11
1347	addq	%r12,%r10
1348	adcq	$0,%r11
1349	movq	%r10,-8(%rdi,%rcx,1)
1350	jmp	.Lsqr4x_1st
1351
1352.align	32
1353.Lsqr4x_1st:
1354	movq	(%rsi,%rcx,1),%rbx
1355	mulq	%r15
1356	addq	%rax,%r13
1357	movq	%rbx,%rax
1358	movq	%rdx,%r12
1359	adcq	$0,%r12
1360
1361	mulq	%r14
1362	addq	%rax,%r11
1363	movq	%rbx,%rax
1364	movq	8(%rsi,%rcx,1),%rbx
1365	movq	%rdx,%r10
1366	adcq	$0,%r10
1367	addq	%r13,%r11
1368	adcq	$0,%r10
1369
1370
1371	mulq	%r15
1372	addq	%rax,%r12
1373	movq	%rbx,%rax
1374	movq	%r11,(%rdi,%rcx,1)
1375	movq	%rdx,%r13
1376	adcq	$0,%r13
1377
1378	mulq	%r14
1379	addq	%rax,%r10
1380	movq	%rbx,%rax
1381	movq	16(%rsi,%rcx,1),%rbx
1382	movq	%rdx,%r11
1383	adcq	$0,%r11
1384	addq	%r12,%r10
1385	adcq	$0,%r11
1386
1387	mulq	%r15
1388	addq	%rax,%r13
1389	movq	%rbx,%rax
1390	movq	%r10,8(%rdi,%rcx,1)
1391	movq	%rdx,%r12
1392	adcq	$0,%r12
1393
1394	mulq	%r14
1395	addq	%rax,%r11
1396	movq	%rbx,%rax
1397	movq	24(%rsi,%rcx,1),%rbx
1398	movq	%rdx,%r10
1399	adcq	$0,%r10
1400	addq	%r13,%r11
1401	adcq	$0,%r10
1402
1403
1404	mulq	%r15
1405	addq	%rax,%r12
1406	movq	%rbx,%rax
1407	movq	%r11,16(%rdi,%rcx,1)
1408	movq	%rdx,%r13
1409	adcq	$0,%r13
1410	leaq	32(%rcx),%rcx
1411
1412	mulq	%r14
1413	addq	%rax,%r10
1414	movq	%rbx,%rax
1415	movq	%rdx,%r11
1416	adcq	$0,%r11
1417	addq	%r12,%r10
1418	adcq	$0,%r11
1419	movq	%r10,-8(%rdi,%rcx,1)
1420
1421	cmpq	$0,%rcx
1422	jne	.Lsqr4x_1st
1423
1424	mulq	%r15
1425	addq	%rax,%r13
1426	leaq	16(%rbp),%rbp
1427	adcq	$0,%rdx
1428	addq	%r11,%r13
1429	adcq	$0,%rdx
1430
1431	movq	%r13,(%rdi)
1432	movq	%rdx,%r12
1433	movq	%rdx,8(%rdi)
1434	jmp	.Lsqr4x_outer
1435
1436.align	32
1437.Lsqr4x_outer:
1438	movq	-32(%rsi,%rbp,1),%r14
1439	leaq	48+8(%rsp,%r9,2),%rdi
1440	movq	-24(%rsi,%rbp,1),%rax
1441	leaq	-32(%rdi,%rbp,1),%rdi
1442	movq	-16(%rsi,%rbp,1),%rbx
1443	movq	%rax,%r15
1444
1445	mulq	%r14
1446	movq	-24(%rdi,%rbp,1),%r10
1447	addq	%rax,%r10
1448	movq	%rbx,%rax
1449	adcq	$0,%rdx
1450	movq	%r10,-24(%rdi,%rbp,1)
1451	movq	%rdx,%r11
1452
1453	mulq	%r14
1454	addq	%rax,%r11
1455	movq	%rbx,%rax
1456	adcq	$0,%rdx
1457	addq	-16(%rdi,%rbp,1),%r11
1458	movq	%rdx,%r10
1459	adcq	$0,%r10
1460	movq	%r11,-16(%rdi,%rbp,1)
1461
1462	xorq	%r12,%r12
1463
1464	movq	-8(%rsi,%rbp,1),%rbx
1465	mulq	%r15
1466	addq	%rax,%r12
1467	movq	%rbx,%rax
1468	adcq	$0,%rdx
1469	addq	-8(%rdi,%rbp,1),%r12
1470	movq	%rdx,%r13
1471	adcq	$0,%r13
1472
1473	mulq	%r14
1474	addq	%rax,%r10
1475	movq	%rbx,%rax
1476	adcq	$0,%rdx
1477	addq	%r12,%r10
1478	movq	%rdx,%r11
1479	adcq	$0,%r11
1480	movq	%r10,-8(%rdi,%rbp,1)
1481
1482	leaq	(%rbp),%rcx
1483	jmp	.Lsqr4x_inner
1484
1485.align	32
1486.Lsqr4x_inner:
1487	movq	(%rsi,%rcx,1),%rbx
1488	mulq	%r15
1489	addq	%rax,%r13
1490	movq	%rbx,%rax
1491	movq	%rdx,%r12
1492	adcq	$0,%r12
1493	addq	(%rdi,%rcx,1),%r13
1494	adcq	$0,%r12
1495
1496.byte	0x67
1497	mulq	%r14
1498	addq	%rax,%r11
1499	movq	%rbx,%rax
1500	movq	8(%rsi,%rcx,1),%rbx
1501	movq	%rdx,%r10
1502	adcq	$0,%r10
1503	addq	%r13,%r11
1504	adcq	$0,%r10
1505
1506	mulq	%r15
1507	addq	%rax,%r12
1508	movq	%r11,(%rdi,%rcx,1)
1509	movq	%rbx,%rax
1510	movq	%rdx,%r13
1511	adcq	$0,%r13
1512	addq	8(%rdi,%rcx,1),%r12
1513	leaq	16(%rcx),%rcx
1514	adcq	$0,%r13
1515
1516	mulq	%r14
1517	addq	%rax,%r10
1518	movq	%rbx,%rax
1519	adcq	$0,%rdx
1520	addq	%r12,%r10
1521	movq	%rdx,%r11
1522	adcq	$0,%r11
1523	movq	%r10,-8(%rdi,%rcx,1)
1524
1525	cmpq	$0,%rcx
1526	jne	.Lsqr4x_inner
1527
1528.byte	0x67
1529	mulq	%r15
1530	addq	%rax,%r13
1531	adcq	$0,%rdx
1532	addq	%r11,%r13
1533	adcq	$0,%rdx
1534
1535	movq	%r13,(%rdi)
1536	movq	%rdx,%r12
1537	movq	%rdx,8(%rdi)
1538
1539	addq	$16,%rbp
1540	jnz	.Lsqr4x_outer
1541
1542
1543	movq	-32(%rsi),%r14
1544	leaq	48+8(%rsp,%r9,2),%rdi
1545	movq	-24(%rsi),%rax
1546	leaq	-32(%rdi,%rbp,1),%rdi
1547	movq	-16(%rsi),%rbx
1548	movq	%rax,%r15
1549
1550	mulq	%r14
1551	addq	%rax,%r10
1552	movq	%rbx,%rax
1553	movq	%rdx,%r11
1554	adcq	$0,%r11
1555
1556	mulq	%r14
1557	addq	%rax,%r11
1558	movq	%rbx,%rax
1559	movq	%r10,-24(%rdi)
1560	movq	%rdx,%r10
1561	adcq	$0,%r10
1562	addq	%r13,%r11
1563	movq	-8(%rsi),%rbx
1564	adcq	$0,%r10
1565
1566	mulq	%r15
1567	addq	%rax,%r12
1568	movq	%rbx,%rax
1569	movq	%r11,-16(%rdi)
1570	movq	%rdx,%r13
1571	adcq	$0,%r13
1572
1573	mulq	%r14
1574	addq	%rax,%r10
1575	movq	%rbx,%rax
1576	movq	%rdx,%r11
1577	adcq	$0,%r11
1578	addq	%r12,%r10
1579	adcq	$0,%r11
1580	movq	%r10,-8(%rdi)
1581
1582	mulq	%r15
1583	addq	%rax,%r13
1584	movq	-16(%rsi),%rax
1585	adcq	$0,%rdx
1586	addq	%r11,%r13
1587	adcq	$0,%rdx
1588
1589	movq	%r13,(%rdi)
1590	movq	%rdx,%r12
1591	movq	%rdx,8(%rdi)
1592
1593	mulq	%rbx
1594	addq	$16,%rbp
1595	xorq	%r14,%r14
1596	subq	%r9,%rbp
1597	xorq	%r15,%r15
1598
1599	addq	%r12,%rax
1600	adcq	$0,%rdx
1601	movq	%rax,8(%rdi)
1602	movq	%rdx,16(%rdi)
1603	movq	%r15,24(%rdi)
1604
1605	movq	-16(%rsi,%rbp,1),%rax
1606	leaq	48+8(%rsp),%rdi
1607	xorq	%r10,%r10
1608	movq	8(%rdi),%r11
1609
1610	leaq	(%r14,%r10,2),%r12
1611	shrq	$63,%r10
1612	leaq	(%rcx,%r11,2),%r13
1613	shrq	$63,%r11
1614	orq	%r10,%r13
1615	movq	16(%rdi),%r10
1616	movq	%r11,%r14
1617	mulq	%rax
1618	negq	%r15
1619	movq	24(%rdi),%r11
1620	adcq	%rax,%r12
1621	movq	-8(%rsi,%rbp,1),%rax
1622	movq	%r12,(%rdi)
1623	adcq	%rdx,%r13
1624
1625	leaq	(%r14,%r10,2),%rbx
1626	movq	%r13,8(%rdi)
1627	sbbq	%r15,%r15
1628	shrq	$63,%r10
1629	leaq	(%rcx,%r11,2),%r8
1630	shrq	$63,%r11
1631	orq	%r10,%r8
1632	movq	32(%rdi),%r10
1633	movq	%r11,%r14
1634	mulq	%rax
1635	negq	%r15
1636	movq	40(%rdi),%r11
1637	adcq	%rax,%rbx
1638	movq	0(%rsi,%rbp,1),%rax
1639	movq	%rbx,16(%rdi)
1640	adcq	%rdx,%r8
1641	leaq	16(%rbp),%rbp
1642	movq	%r8,24(%rdi)
1643	sbbq	%r15,%r15
1644	leaq	64(%rdi),%rdi
1645	jmp	.Lsqr4x_shift_n_add
1646
1647.align	32
1648.Lsqr4x_shift_n_add:
1649	leaq	(%r14,%r10,2),%r12
1650	shrq	$63,%r10
1651	leaq	(%rcx,%r11,2),%r13
1652	shrq	$63,%r11
1653	orq	%r10,%r13
1654	movq	-16(%rdi),%r10
1655	movq	%r11,%r14
1656	mulq	%rax
1657	negq	%r15
1658	movq	-8(%rdi),%r11
1659	adcq	%rax,%r12
1660	movq	-8(%rsi,%rbp,1),%rax
1661	movq	%r12,-32(%rdi)
1662	adcq	%rdx,%r13
1663
1664	leaq	(%r14,%r10,2),%rbx
1665	movq	%r13,-24(%rdi)
1666	sbbq	%r15,%r15
1667	shrq	$63,%r10
1668	leaq	(%rcx,%r11,2),%r8
1669	shrq	$63,%r11
1670	orq	%r10,%r8
1671	movq	0(%rdi),%r10
1672	movq	%r11,%r14
1673	mulq	%rax
1674	negq	%r15
1675	movq	8(%rdi),%r11
1676	adcq	%rax,%rbx
1677	movq	0(%rsi,%rbp,1),%rax
1678	movq	%rbx,-16(%rdi)
1679	adcq	%rdx,%r8
1680
1681	leaq	(%r14,%r10,2),%r12
1682	movq	%r8,-8(%rdi)
1683	sbbq	%r15,%r15
1684	shrq	$63,%r10
1685	leaq	(%rcx,%r11,2),%r13
1686	shrq	$63,%r11
1687	orq	%r10,%r13
1688	movq	16(%rdi),%r10
1689	movq	%r11,%r14
1690	mulq	%rax
1691	negq	%r15
1692	movq	24(%rdi),%r11
1693	adcq	%rax,%r12
1694	movq	8(%rsi,%rbp,1),%rax
1695	movq	%r12,0(%rdi)
1696	adcq	%rdx,%r13
1697
1698	leaq	(%r14,%r10,2),%rbx
1699	movq	%r13,8(%rdi)
1700	sbbq	%r15,%r15
1701	shrq	$63,%r10
1702	leaq	(%rcx,%r11,2),%r8
1703	shrq	$63,%r11
1704	orq	%r10,%r8
1705	movq	32(%rdi),%r10
1706	movq	%r11,%r14
1707	mulq	%rax
1708	negq	%r15
1709	movq	40(%rdi),%r11
1710	adcq	%rax,%rbx
1711	movq	16(%rsi,%rbp,1),%rax
1712	movq	%rbx,16(%rdi)
1713	adcq	%rdx,%r8
1714	movq	%r8,24(%rdi)
1715	sbbq	%r15,%r15
1716	leaq	64(%rdi),%rdi
1717	addq	$32,%rbp
1718	jnz	.Lsqr4x_shift_n_add
1719
1720	leaq	(%r14,%r10,2),%r12
1721.byte	0x67
1722	shrq	$63,%r10
1723	leaq	(%rcx,%r11,2),%r13
1724	shrq	$63,%r11
1725	orq	%r10,%r13
1726	movq	-16(%rdi),%r10
1727	movq	%r11,%r14
1728	mulq	%rax
1729	negq	%r15
1730	movq	-8(%rdi),%r11
1731	adcq	%rax,%r12
1732	movq	-8(%rsi),%rax
1733	movq	%r12,-32(%rdi)
1734	adcq	%rdx,%r13
1735
1736	leaq	(%r14,%r10,2),%rbx
1737	movq	%r13,-24(%rdi)
1738	sbbq	%r15,%r15
1739	shrq	$63,%r10
1740	leaq	(%rcx,%r11,2),%r8
1741	shrq	$63,%r11
1742	orq	%r10,%r8
1743	mulq	%rax
1744	negq	%r15
1745	adcq	%rax,%rbx
1746	adcq	%rdx,%r8
1747	movq	%rbx,-16(%rdi)
1748	movq	%r8,-8(%rdi)
1749.byte	102,72,15,126,213
1750__bn_sqr8x_reduction:
1751	xorq	%rax,%rax
1752	leaq	(%r9,%rbp,1),%rcx
1753	leaq	48+8(%rsp,%r9,2),%rdx
1754	movq	%rcx,0+8(%rsp)
1755	leaq	48+8(%rsp,%r9,1),%rdi
1756	movq	%rdx,8+8(%rsp)
1757	negq	%r9
1758	jmp	.L8x_reduction_loop
1759
1760.align	32
1761.L8x_reduction_loop:
1762	leaq	(%rdi,%r9,1),%rdi
1763.byte	0x66
1764	movq	0(%rdi),%rbx
1765	movq	8(%rdi),%r9
1766	movq	16(%rdi),%r10
1767	movq	24(%rdi),%r11
1768	movq	32(%rdi),%r12
1769	movq	40(%rdi),%r13
1770	movq	48(%rdi),%r14
1771	movq	56(%rdi),%r15
1772	movq	%rax,(%rdx)
1773	leaq	64(%rdi),%rdi
1774
1775.byte	0x67
1776	movq	%rbx,%r8
1777	imulq	32+8(%rsp),%rbx
1778	movq	0(%rbp),%rax
1779	movl	$8,%ecx
1780	jmp	.L8x_reduce
1781
1782.align	32
1783.L8x_reduce:
1784	mulq	%rbx
1785	movq	8(%rbp),%rax
1786	negq	%r8
1787	movq	%rdx,%r8
1788	adcq	$0,%r8
1789
1790	mulq	%rbx
1791	addq	%rax,%r9
1792	movq	16(%rbp),%rax
1793	adcq	$0,%rdx
1794	addq	%r9,%r8
1795	movq	%rbx,48-8+8(%rsp,%rcx,8)
1796	movq	%rdx,%r9
1797	adcq	$0,%r9
1798
1799	mulq	%rbx
1800	addq	%rax,%r10
1801	movq	24(%rbp),%rax
1802	adcq	$0,%rdx
1803	addq	%r10,%r9
1804	movq	32+8(%rsp),%rsi
1805	movq	%rdx,%r10
1806	adcq	$0,%r10
1807
1808	mulq	%rbx
1809	addq	%rax,%r11
1810	movq	32(%rbp),%rax
1811	adcq	$0,%rdx
1812	imulq	%r8,%rsi
1813	addq	%r11,%r10
1814	movq	%rdx,%r11
1815	adcq	$0,%r11
1816
1817	mulq	%rbx
1818	addq	%rax,%r12
1819	movq	40(%rbp),%rax
1820	adcq	$0,%rdx
1821	addq	%r12,%r11
1822	movq	%rdx,%r12
1823	adcq	$0,%r12
1824
1825	mulq	%rbx
1826	addq	%rax,%r13
1827	movq	48(%rbp),%rax
1828	adcq	$0,%rdx
1829	addq	%r13,%r12
1830	movq	%rdx,%r13
1831	adcq	$0,%r13
1832
1833	mulq	%rbx
1834	addq	%rax,%r14
1835	movq	56(%rbp),%rax
1836	adcq	$0,%rdx
1837	addq	%r14,%r13
1838	movq	%rdx,%r14
1839	adcq	$0,%r14
1840
1841	mulq	%rbx
1842	movq	%rsi,%rbx
1843	addq	%rax,%r15
1844	movq	0(%rbp),%rax
1845	adcq	$0,%rdx
1846	addq	%r15,%r14
1847	movq	%rdx,%r15
1848	adcq	$0,%r15
1849
1850	decl	%ecx
1851	jnz	.L8x_reduce
1852
1853	leaq	64(%rbp),%rbp
1854	xorq	%rax,%rax
1855	movq	8+8(%rsp),%rdx
1856	cmpq	0+8(%rsp),%rbp
1857	jae	.L8x_no_tail
1858
1859.byte	0x66
1860	addq	0(%rdi),%r8
1861	adcq	8(%rdi),%r9
1862	adcq	16(%rdi),%r10
1863	adcq	24(%rdi),%r11
1864	adcq	32(%rdi),%r12
1865	adcq	40(%rdi),%r13
1866	adcq	48(%rdi),%r14
1867	adcq	56(%rdi),%r15
1868	sbbq	%rsi,%rsi
1869
1870	movq	48+56+8(%rsp),%rbx
1871	movl	$8,%ecx
1872	movq	0(%rbp),%rax
1873	jmp	.L8x_tail
1874
1875.align	32
1876.L8x_tail:
1877	mulq	%rbx
1878	addq	%rax,%r8
1879	movq	8(%rbp),%rax
1880	movq	%r8,(%rdi)
1881	movq	%rdx,%r8
1882	adcq	$0,%r8
1883
1884	mulq	%rbx
1885	addq	%rax,%r9
1886	movq	16(%rbp),%rax
1887	adcq	$0,%rdx
1888	addq	%r9,%r8
1889	leaq	8(%rdi),%rdi
1890	movq	%rdx,%r9
1891	adcq	$0,%r9
1892
1893	mulq	%rbx
1894	addq	%rax,%r10
1895	movq	24(%rbp),%rax
1896	adcq	$0,%rdx
1897	addq	%r10,%r9
1898	movq	%rdx,%r10
1899	adcq	$0,%r10
1900
1901	mulq	%rbx
1902	addq	%rax,%r11
1903	movq	32(%rbp),%rax
1904	adcq	$0,%rdx
1905	addq	%r11,%r10
1906	movq	%rdx,%r11
1907	adcq	$0,%r11
1908
1909	mulq	%rbx
1910	addq	%rax,%r12
1911	movq	40(%rbp),%rax
1912	adcq	$0,%rdx
1913	addq	%r12,%r11
1914	movq	%rdx,%r12
1915	adcq	$0,%r12
1916
1917	mulq	%rbx
1918	addq	%rax,%r13
1919	movq	48(%rbp),%rax
1920	adcq	$0,%rdx
1921	addq	%r13,%r12
1922	movq	%rdx,%r13
1923	adcq	$0,%r13
1924
1925	mulq	%rbx
1926	addq	%rax,%r14
1927	movq	56(%rbp),%rax
1928	adcq	$0,%rdx
1929	addq	%r14,%r13
1930	movq	%rdx,%r14
1931	adcq	$0,%r14
1932
1933	mulq	%rbx
1934	movq	48-16+8(%rsp,%rcx,8),%rbx
1935	addq	%rax,%r15
1936	adcq	$0,%rdx
1937	addq	%r15,%r14
1938	movq	0(%rbp),%rax
1939	movq	%rdx,%r15
1940	adcq	$0,%r15
1941
1942	decl	%ecx
1943	jnz	.L8x_tail
1944
1945	leaq	64(%rbp),%rbp
1946	movq	8+8(%rsp),%rdx
1947	cmpq	0+8(%rsp),%rbp
1948	jae	.L8x_tail_done
1949
1950	movq	48+56+8(%rsp),%rbx
1951	negq	%rsi
1952	movq	0(%rbp),%rax
1953	adcq	0(%rdi),%r8
1954	adcq	8(%rdi),%r9
1955	adcq	16(%rdi),%r10
1956	adcq	24(%rdi),%r11
1957	adcq	32(%rdi),%r12
1958	adcq	40(%rdi),%r13
1959	adcq	48(%rdi),%r14
1960	adcq	56(%rdi),%r15
1961	sbbq	%rsi,%rsi
1962
1963	movl	$8,%ecx
1964	jmp	.L8x_tail
1965
1966.align	32
1967.L8x_tail_done:
1968	xorq	%rax,%rax
1969	addq	(%rdx),%r8
1970	adcq	$0,%r9
1971	adcq	$0,%r10
1972	adcq	$0,%r11
1973	adcq	$0,%r12
1974	adcq	$0,%r13
1975	adcq	$0,%r14
1976	adcq	$0,%r15
1977	adcq	$0,%rax
1978
1979	negq	%rsi
1980.L8x_no_tail:
1981	adcq	0(%rdi),%r8
1982	adcq	8(%rdi),%r9
1983	adcq	16(%rdi),%r10
1984	adcq	24(%rdi),%r11
1985	adcq	32(%rdi),%r12
1986	adcq	40(%rdi),%r13
1987	adcq	48(%rdi),%r14
1988	adcq	56(%rdi),%r15
1989	adcq	$0,%rax
1990	movq	-8(%rbp),%rcx
1991	xorq	%rsi,%rsi
1992
1993.byte	102,72,15,126,213
1994
1995	movq	%r8,0(%rdi)
1996	movq	%r9,8(%rdi)
1997.byte	102,73,15,126,217
1998	movq	%r10,16(%rdi)
1999	movq	%r11,24(%rdi)
2000	movq	%r12,32(%rdi)
2001	movq	%r13,40(%rdi)
2002	movq	%r14,48(%rdi)
2003	movq	%r15,56(%rdi)
2004	leaq	64(%rdi),%rdi
2005
2006	cmpq	%rdx,%rdi
2007	jb	.L8x_reduction_loop
2008	.byte	0xf3,0xc3
2009.cfi_endproc
2010.size	GFp_bn_sqr8x_internal,.-GFp_bn_sqr8x_internal
# __bn_post4x_internal: constant-time conditional final subtraction after
# a Montgomery reduction.  Processes four 64-bit limbs per iteration,
# computing rp[i] = tp[i] + (~n[i] & mask) + borrow — i.e. it subtracts
# the modulus exactly when the all-ones mask in %rax selects it, without
# any secret-dependent branch.
# NOTE(review): register contract (rbp = modulus, r9 = byte count,
# rax = 0/-1 select mask, xmm1 = saved output pointer, r10 = borrow)
# is inferred from the visible code — confirm against the callers.
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
.cfi_startproc
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx		# rbx = end of temporary result
	movq	%r9,%rcx
.byte	102,72,15,126,207		# movq %xmm1,%rdi — restore output ptr
	negq	%rax
.byte	102,72,15,126,206		# movq %xmm1,%rsi — same saved ptr
	sarq	$3+2,%rcx			# rcx = count of 4-limb (32-byte) groups
	decq	%r12				# pre-bias so the shared notq below
	xorq	%r10,%r10			# yields the exact two's complement
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12			# mask complemented modulus limbs:
	andq	%rax,%r13			# rax = -1 selects the subtraction,
	andq	%rax,%r14			# rax = 0 passes tp[] through
	andq	%rax,%r15

	negq	%r10				# reload borrow from previous group
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10			# capture borrow for the next group
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx				# rcx counts up toward zero
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9				# restore %r9 with flipped sign
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	__bn_post4x_internal,.-__bn_post4x_internal
# GFp_bn_from_montgomery: convert a value out of Montgomery form.
# Tail-calls the 8x-limb implementation when the limb count in %r9d is
# a multiple of 8; otherwise returns 0 in %eax to signal "not handled".
.globl	GFp_bn_from_montgomery
.hidden GFp_bn_from_montgomery
.type	GFp_bn_from_montgomery,@function
.align	32
GFp_bn_from_montgomery:
.cfi_startproc
	testl	$7,%r9d			# limb count divisible by 8?
	jz	bn_from_mont8x		# yes: tail-call the 8x path
	xorl	%eax,%eax		# no: return 0
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	GFp_bn_from_montgomery,.-GFp_bn_from_montgomery
2079
# bn_from_mont8x: Montgomery "from" conversion for operands whose limb
# count is a multiple of 8 (reached only via GFp_bn_from_montgomery).
# Strategy: copy the input into a zero-padded scratch buffer, then run a
# single 8x Montgomery reduction (MULX/ADX path when the CPU supports
# BMI1+BMI2+ADX per GFp_ia32cap_P) followed by the conditional final
# subtraction, and finally wipe the scratch area.
# NOTE(review): argument registers appear to follow the usual
# bn_mul_mont layout (rdi=rp, rsi=ap, rcx=np, r8=&n0, r9=num) —
# inferred from sibling routines; confirm at the call sites.
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.cfi_startproc
.byte	0x67
	movq	%rsp,%rax		# keep original rsp for the epilogue
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lfrom_prologue:

	shll	$3,%r9d			# num: limbs -> bytes
	leaq	(%r9,%r9,2),%r10	# r10 = 3*num bytes
	negq	%r9
	movq	(%r8),%r8		# r8 = n0 (dereference &n0)

# Choose a frame placement that avoids a cache-bank alias between the
# scratch area and the output buffer (%rdi): keep the low 12 address
# bits of the two at least 4KB-ish apart.




	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp		# shift frame down past the alias window
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp		# 64-byte align the new stack base
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

# Touch each page of the newly claimed stack region so the guard page
# is grown one page at a time (avoids skipping the stack guard).
.Lfrom_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10
	negq	%r9

# Frame layout (from here on): 32(%rsp) = n0, 40(%rsp) = saved rsp,
# scratch "tp" buffer begins at 48(%rsp).







	movq	%r8,32(%rsp)		# stash n0
	movq	%rax,40(%rsp)		# stash caller rsp for unwinding
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

# Copy the input (64 bytes/iteration) into the low half of tp while
# zeroing the mirrored high half — the value times 1, ready to be
# Montgomery-reduced once.
.align	32
.Lmul_by_1:
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)	# zero high half
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# leaq 64(%rsi),%rsi
	movdqa	%xmm1,(%rax)		# store low half
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207		# movq %rdi,%xmm1 — save rp
.byte	102,72,15,110,209		# movq %rcx,%xmm2 — save np
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218		# movq %r10,%xmm3 — save num
	leaq	GFp_ia32cap_P(%rip),%r11
	movl	8(%r11),%r11d
	andl	$0x80108,%r11d		# BMI1 | BMI2 | ADX feature bits
	cmpl	$0x80108,%r11d
	jne	.Lfrom_mont_nox		# fall back if any is missing

	leaq	(%rax,%r9,1),%rdi
	call	__bn_sqrx8x_reduction	# MULX/ADX reduction path
	call	__bn_postx4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_nox:
	call	__bn_sqr8x_reduction	# classic mul/adc reduction path
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

# Wipe the scratch buffer (it held secret intermediates), then restore
# callee-saved registers and return 1.
.align	32
.Lfrom_mont_zero:
	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax			# return 1 (success)
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_from_mont8x,.-bn_from_mont8x
# bn_mulx4x_mont_gather5: MULX/ADX (BMI2+ADX) variant of the Montgomery
# multiply with cache-timing-safe table gather.  This wrapper only sets
# up the stack frame — anti-alias placement, guard-page walk, saving n0
# and the original rsp — and delegates all arithmetic to mulx4x_internal.
.type	bn_mulx4x_mont_gather5,@function
.align	32
bn_mulx4x_mont_gather5:
.cfi_startproc
	movq	%rsp,%rax		# keep original rsp for the epilogue
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d			# num: limbs -> bytes
	leaq	(%r9,%r9,2),%r10	# r10 = 3*num bytes
	negq	%r9
	movq	(%r8),%r8		# r8 = n0 (dereference &n0)

# Place the frame so its low 12 address bits don't collide with the
# output buffer (%rdi) — avoids cache-bank aliasing between tp and rp.







	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmulx4xsp_alt
	subq	%r11,%rbp		# shift frame past the alias window
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmulx4xsp_done:
	andq	$-64,%rbp		# 64-byte align the new stack base
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

# Touch each page of the claimed region so the stack guard page is
# grown one page at a time.
.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

# Frame layout: 32(%rsp) = n0, 40(%rsp) = saved rsp; the rest is
# scratch used by mulx4x_internal.










	movq	%r8,32(%rsp)		# stash n0
	movq	%rax,40(%rsp)		# stash caller rsp for unwinding
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			# return 1 (success)

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2358
# mulx4x_internal: core of the MULX/ADX Montgomery multiplication with
# "gather-5" constant-time table lookup.  The multiplier words are
# fetched from a table of powers: equality masks against .Linc-derived
# counters are built for all table slots with pcmpeqd, and the selected
# word is reconstructed with pand/por across the entire table so no
# secret-dependent address is ever formed.  The multiply itself uses
# mulx with the dual adcx/adox carry chains, four limbs per iteration.
# NOTE(review): stack layout (8(%rsp)=num, 16+8(%rsp)=table end,
# 24+8(%rsp)=inner count, 32+8(%rsp)=n0, 56+8(%rsp)=rp) is inferred
# from the visible stores/loads — confirm against the wrapper.
.type	mulx4x_internal,@function
.align	32
mulx4x_internal:
.cfi_startproc
	movq	%r9,8(%rsp)		# save num (bytes)
	movq	%r9,%r10
	negq	%r9
	shlq	$5,%r9
	negq	%r10
	leaq	128(%rdx,%r9,1),%r13	# r13 = end of power table
	shrq	$5+5,%r9
	movd	8(%rax),%xmm5		# xmm5 = table index (power to gather)
	subq	$1,%r9
	leaq	.Linc(%rip),%rax
	movq	%r13,16+8(%rsp)
	movq	%r9,24+8(%rsp)
	movq	%rdi,56+8(%rsp)		# save rp
	movdqa	0(%rax),%xmm0		# {0,1,..} increment constants
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r10,1),%r10	# mask scratch area
	leaq	128(%rdx),%rdi		# rdi = table + 128

# Build 16 slot-selection masks: xmm5 holds the wanted index, the
# counters advance by pcmpeqd/paddd, and each comparison result (all
# ones for the matching slot) is spilled to the scratch area at
# 112(%r10)..352(%r10).
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67
	movdqa	%xmm1,%xmm2
.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

# First gather: AND every table entry with its slot mask and OR the
# results together; only the selected entry survives.
	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1	# fold the two 64-bit halves together
	por	%xmm1,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		# movq %xmm0,%rdx — gathered b[0]

	leaq	64+32+8(%rsp),%rbx

# First 4-limb column: t = a[0..3]*b[0], then fold in the reduction
# word m = t[0]*n0 against the modulus (%rcx).
	movq	%rdx,%r9
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		# r8 = t[0]*n0 (Montgomery factor)
	xorq	%rbp,%rbp		# rbp = 0: spare zero + clears CF/OF
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)		# save table position

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15		# annihilates t[0] (becomes zero)
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi		# rdi = inner-loop counter
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	.Lmulx4x_1st

# Remaining columns of the first outer iteration: dual carry chains,
# a[] products on adcx, n[] reduction products on adox.
.align	32
.Lmulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	8(%rsp),%rax		# rax = num
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi	# rewind ap
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi		# restore table position
	adcq	%rbp,%rbp		# rbp = top carry
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

# Outer loop: gather the next multiplier word b[i] (same constant-time
# pand/por scheme, masks reused from the scratch area), then multiply-
# accumulate it into the running result and reduce.
.align	32
.Lmulx4x_outer:
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0	# fold the two 64-bit halves together
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		# movq %xmm0,%rdx — gathered b[i]

	movq	%rbp,(%rbx)		# store top carry
	leaq	32(%rbx,%rax,1),%rbx	# rewind tp
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp		# rbp = 0: spare zero + clears CF/OF
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8		# accumulate previous column
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx	# rewind np
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		# r8 = t[0]*n0 (Montgomery factor)

	movq	%r8,%rdx
	xorq	%rbp,%rbp		# clear CF/OF again for reduction
	movq	%rdi,8+8(%rsp)		# save table position

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15		# annihilates t[0]
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	24+8(%rsp),%rdi		# rdi = inner-loop counter
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	.Lmulx4x_inner

# Inner loop: like .Lmulx4x_1st but also adds the previous iteration's
# partial result back in from tp (%rbx).
.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0+8(%rsp),%rax		# rax = num
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi		# compare-only: sets CF from tp top word
	movq	8+8(%rsp),%rdi		# restore table position
	movq	16+8(%rsp),%r10		# r10 = table end
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi	# rewind ap
	adcq	%rbp,%rbp		# rbp = top carry
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi
	jb	.Lmulx4x_outer		# more multiplier words left

# Done multiplying: derive the 0/-1 subtraction mask from the top carry
# and the comparison against the top modulus word, then fall into the
# shared .Lsqrx4x_sub_entry conditional-subtraction tail.
	movq	-8(%rcx),%r10
	movq	%rbp,%r8
	movq	(%rcx,%rax,1),%r12
	leaq	(%rcx,%rax,1),%rbp	# rbp = np (rewound)
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10		# does the modulus top exceed the result?
	adcq	%r15,%r15
	orq	%r15,%r8
	sarq	$3+2,%rcx
	subq	%r8,%rax		# rax = 0 or -1 select mask
	movq	56+8(%rsp),%rdx		# rdx = rp
	decq	%r12			# pre-bias (matches notq in sub tail)
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry
.cfi_endproc
.size	mulx4x_internal,.-mulx4x_internal
# bn_powerx5: MULX/ADX path used by fixed-window modular exponentiation:
# performs five consecutive Montgomery squarings (__bn_sqrx8x_internal +
# __bn_postx4x_internal each time) and then one gather-multiply
# (mulx4x_internal) — i.e. raises the accumulator to the 32nd power and
# folds in the next windowed multiplier.  This wrapper handles frame
# setup (anti-alias placement, guard-page walk, argument stashing in
# xmm registers) around those calls.
.type	bn_powerx5,@function
.align	32
bn_powerx5:
.cfi_startproc
	movq	%rsp,%rax		# keep original rsp for the epilogue
.cfi_def_cfa_register	%rax
.Lpowerx5_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpowerx5_prologue:

	shll	$3,%r9d			# num: limbs -> bytes
	leaq	(%r9,%r9,2),%r10	# r10 = 3*num bytes
	negq	%r9
	movq	(%r8),%r8		# r8 = n0 (dereference &n0)

# Place the frame so its low 12 address bits don't collide with the
# output buffer (%rdi) — avoids cache-bank aliasing.





	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwrx_sp_alt
	subq	%r11,%rbp		# shift frame past the alias window
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwrx_sp_done:
	andq	$-64,%rbp		# 64-byte align the new stack base
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

# Touch each page of the claimed region so the stack guard page is
# grown one page at a time.
.Lpwrx_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	movq	%r9,%r10
	negq	%r9

# Stash the pointer arguments in xmm registers so they survive the
# internal calls: xmm1=rp, xmm2=np, xmm3=num, xmm4=bp (presumably —
# NOTE(review): confirm against the called routines' expectations).
# Frame layout: 32(%rsp)=n0, 40(%rsp)=saved rsp.








	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		# movq %rdi,%xmm1
.byte	102,72,15,110,209		# movq %rcx,%xmm2
.byte	102,73,15,110,218		# movq %r10,%xmm3
.byte	102,72,15,110,226		# movq %rdx,%xmm4
	movq	%r8,32(%rsp)		# stash n0
	movq	%rax,40(%rsp)		# stash caller rsp for unwinding
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpowerx5_body:

	call	__bn_sqrx8x_internal	# square #1
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal	# square #2
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal	# square #3
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal	# square #4
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal	# square #5
	call	__bn_postx4x_internal

	movq	%r10,%r9
	movq	%rsi,%rdi
.byte	102,72,15,126,209		# movq %xmm2,%rcx — restore np
.byte	102,72,15,126,226		# movq %xmm4,%rdx — restore bp
	movq	40(%rsp),%rax

	call	mulx4x_internal		# final gather-multiply

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			# return 1 (success)

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpowerx5_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_powerx5,.-bn_powerx5
2917
2918.globl	GFp_bn_sqrx8x_internal
2919.hidden GFp_bn_sqrx8x_internal
2920.type	GFp_bn_sqrx8x_internal,@function
2921.align	32
2922GFp_bn_sqrx8x_internal:
2923__bn_sqrx8x_internal:
2924.cfi_startproc
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965	leaq	48+8(%rsp),%rdi
2966	leaq	(%rsi,%r9,1),%rbp
2967	movq	%r9,0+8(%rsp)
2968	movq	%rbp,8+8(%rsp)
2969	jmp	.Lsqr8x_zero_start
2970
2971.align	32
2972.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2973.Lsqrx8x_zero:
2974.byte	0x3e
2975	movdqa	%xmm0,0(%rdi)
2976	movdqa	%xmm0,16(%rdi)
2977	movdqa	%xmm0,32(%rdi)
2978	movdqa	%xmm0,48(%rdi)
2979.Lsqr8x_zero_start:
2980	movdqa	%xmm0,64(%rdi)
2981	movdqa	%xmm0,80(%rdi)
2982	movdqa	%xmm0,96(%rdi)
2983	movdqa	%xmm0,112(%rdi)
2984	leaq	128(%rdi),%rdi
2985	subq	$64,%r9
2986	jnz	.Lsqrx8x_zero
2987
2988	movq	0(%rsi),%rdx
2989
2990	xorq	%r10,%r10
2991	xorq	%r11,%r11
2992	xorq	%r12,%r12
2993	xorq	%r13,%r13
2994	xorq	%r14,%r14
2995	xorq	%r15,%r15
2996	leaq	48+8(%rsp),%rdi
2997	xorq	%rbp,%rbp
2998	jmp	.Lsqrx8x_outer_loop
2999
3000.align	32
3001.Lsqrx8x_outer_loop:
3002	mulxq	8(%rsi),%r8,%rax
3003	adcxq	%r9,%r8
3004	adoxq	%rax,%r10
3005	mulxq	16(%rsi),%r9,%rax
3006	adcxq	%r10,%r9
3007	adoxq	%rax,%r11
3008.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
3009	adcxq	%r11,%r10
3010	adoxq	%rax,%r12
3011.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
3012	adcxq	%r12,%r11
3013	adoxq	%rax,%r13
3014	mulxq	40(%rsi),%r12,%rax
3015	adcxq	%r13,%r12
3016	adoxq	%rax,%r14
3017	mulxq	48(%rsi),%r13,%rax
3018	adcxq	%r14,%r13
3019	adoxq	%r15,%rax
3020	mulxq	56(%rsi),%r14,%r15
3021	movq	8(%rsi),%rdx
3022	adcxq	%rax,%r14
3023	adoxq	%rbp,%r15
3024	adcq	64(%rdi),%r15
3025	movq	%r8,8(%rdi)
3026	movq	%r9,16(%rdi)
3027	sbbq	%rcx,%rcx
3028	xorq	%rbp,%rbp
3029
3030
3031	mulxq	16(%rsi),%r8,%rbx
3032	mulxq	24(%rsi),%r9,%rax
3033	adcxq	%r10,%r8
3034	adoxq	%rbx,%r9
3035	mulxq	32(%rsi),%r10,%rbx
3036	adcxq	%r11,%r9
3037	adoxq	%rax,%r10
3038.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
3039	adcxq	%r12,%r10
3040	adoxq	%rbx,%r11
3041.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
3042	adcxq	%r13,%r11
3043	adoxq	%r14,%r12
3044.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
3045	movq	16(%rsi),%rdx
3046	adcxq	%rax,%r12
3047	adoxq	%rbx,%r13
3048	adcxq	%r15,%r13
3049	adoxq	%rbp,%r14
3050	adcxq	%rbp,%r14
3051
3052	movq	%r8,24(%rdi)
3053	movq	%r9,32(%rdi)
3054
3055	mulxq	24(%rsi),%r8,%rbx
3056	mulxq	32(%rsi),%r9,%rax
3057	adcxq	%r10,%r8
3058	adoxq	%rbx,%r9
3059	mulxq	40(%rsi),%r10,%rbx
3060	adcxq	%r11,%r9
3061	adoxq	%rax,%r10
3062.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
3063	adcxq	%r12,%r10
3064	adoxq	%r13,%r11
3065.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
3066.byte	0x3e
3067	movq	24(%rsi),%rdx
3068	adcxq	%rbx,%r11
3069	adoxq	%rax,%r12
3070	adcxq	%r14,%r12
3071	movq	%r8,40(%rdi)
3072	movq	%r9,48(%rdi)
3073	mulxq	32(%rsi),%r8,%rax
3074	adoxq	%rbp,%r13
3075	adcxq	%rbp,%r13
3076
3077	mulxq	40(%rsi),%r9,%rbx
3078	adcxq	%r10,%r8
3079	adoxq	%rax,%r9
3080	mulxq	48(%rsi),%r10,%rax
3081	adcxq	%r11,%r9
3082	adoxq	%r12,%r10
3083	mulxq	56(%rsi),%r11,%r12
3084	movq	32(%rsi),%rdx
3085	movq	40(%rsi),%r14
3086	adcxq	%rbx,%r10
3087	adoxq	%rax,%r11
3088	movq	48(%rsi),%r15
3089	adcxq	%r13,%r11
3090	adoxq	%rbp,%r12
3091	adcxq	%rbp,%r12
3092
3093	movq	%r8,56(%rdi)
3094	movq	%r9,64(%rdi)
3095
3096	mulxq	%r14,%r9,%rax
3097	movq	56(%rsi),%r8
3098	adcxq	%r10,%r9
3099	mulxq	%r15,%r10,%rbx
3100	adoxq	%rax,%r10
3101	adcxq	%r11,%r10
3102	mulxq	%r8,%r11,%rax
3103	movq	%r14,%rdx
3104	adoxq	%rbx,%r11
3105	adcxq	%r12,%r11
3106
3107	adcxq	%rbp,%rax
3108
3109	mulxq	%r15,%r14,%rbx
3110	mulxq	%r8,%r12,%r13
3111	movq	%r15,%rdx
3112	leaq	64(%rsi),%rsi
3113	adcxq	%r14,%r11
3114	adoxq	%rbx,%r12
3115	adcxq	%rax,%r12
3116	adoxq	%rbp,%r13
3117
3118.byte	0x67,0x67
3119	mulxq	%r8,%r8,%r14
3120	adcxq	%r8,%r13
3121	adcxq	%rbp,%r14
3122
3123	cmpq	8+8(%rsp),%rsi
3124	je	.Lsqrx8x_outer_break
3125
3126	negq	%rcx
3127	movq	$-8,%rcx
3128	movq	%rbp,%r15
3129	movq	64(%rdi),%r8
3130	adcxq	72(%rdi),%r9
3131	adcxq	80(%rdi),%r10
3132	adcxq	88(%rdi),%r11
3133	adcq	96(%rdi),%r12
3134	adcq	104(%rdi),%r13
3135	adcq	112(%rdi),%r14
3136	adcq	120(%rdi),%r15
3137	leaq	(%rsi),%rbp
3138	leaq	128(%rdi),%rdi
3139	sbbq	%rax,%rax
3140
3141	movq	-64(%rsi),%rdx
3142	movq	%rax,16+8(%rsp)
3143	movq	%rdi,24+8(%rsp)
3144
3145
3146	xorl	%eax,%eax
3147	jmp	.Lsqrx8x_loop
3148
3149.align	32
3150.Lsqrx8x_loop:
3151	movq	%r8,%rbx
3152	mulxq	0(%rbp),%rax,%r8
3153	adcxq	%rax,%rbx
3154	adoxq	%r9,%r8
3155
3156	mulxq	8(%rbp),%rax,%r9
3157	adcxq	%rax,%r8
3158	adoxq	%r10,%r9
3159
3160	mulxq	16(%rbp),%rax,%r10
3161	adcxq	%rax,%r9
3162	adoxq	%r11,%r10
3163
3164	mulxq	24(%rbp),%rax,%r11
3165	adcxq	%rax,%r10
3166	adoxq	%r12,%r11
3167
3168.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3169	adcxq	%rax,%r11
3170	adoxq	%r13,%r12
3171
3172	mulxq	40(%rbp),%rax,%r13
3173	adcxq	%rax,%r12
3174	adoxq	%r14,%r13
3175
3176	mulxq	48(%rbp),%rax,%r14
3177	movq	%rbx,(%rdi,%rcx,8)
3178	movl	$0,%ebx
3179	adcxq	%rax,%r13
3180	adoxq	%r15,%r14
3181
3182.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3183	movq	8(%rsi,%rcx,8),%rdx
3184	adcxq	%rax,%r14
3185	adoxq	%rbx,%r15
3186	adcxq	%rbx,%r15
3187
3188.byte	0x67
3189	incq	%rcx
3190	jnz	.Lsqrx8x_loop
3191
3192	leaq	64(%rbp),%rbp
3193	movq	$-8,%rcx
3194	cmpq	8+8(%rsp),%rbp
3195	je	.Lsqrx8x_break
3196
3197	subq	16+8(%rsp),%rbx
3198.byte	0x66
3199	movq	-64(%rsi),%rdx
3200	adcxq	0(%rdi),%r8
3201	adcxq	8(%rdi),%r9
3202	adcq	16(%rdi),%r10
3203	adcq	24(%rdi),%r11
3204	adcq	32(%rdi),%r12
3205	adcq	40(%rdi),%r13
3206	adcq	48(%rdi),%r14
3207	adcq	56(%rdi),%r15
3208	leaq	64(%rdi),%rdi
3209.byte	0x67
3210	sbbq	%rax,%rax
3211	xorl	%ebx,%ebx
3212	movq	%rax,16+8(%rsp)
3213	jmp	.Lsqrx8x_loop
3214
3215.align	32
3216.Lsqrx8x_break:
3217	xorq	%rbp,%rbp
3218	subq	16+8(%rsp),%rbx
3219	adcxq	%rbp,%r8
3220	movq	24+8(%rsp),%rcx
3221	adcxq	%rbp,%r9
3222	movq	0(%rsi),%rdx
3223	adcq	$0,%r10
3224	movq	%r8,0(%rdi)
3225	adcq	$0,%r11
3226	adcq	$0,%r12
3227	adcq	$0,%r13
3228	adcq	$0,%r14
3229	adcq	$0,%r15
3230	cmpq	%rcx,%rdi
3231	je	.Lsqrx8x_outer_loop
3232
3233	movq	%r9,8(%rdi)
3234	movq	8(%rcx),%r9
3235	movq	%r10,16(%rdi)
3236	movq	16(%rcx),%r10
3237	movq	%r11,24(%rdi)
3238	movq	24(%rcx),%r11
3239	movq	%r12,32(%rdi)
3240	movq	32(%rcx),%r12
3241	movq	%r13,40(%rdi)
3242	movq	40(%rcx),%r13
3243	movq	%r14,48(%rdi)
3244	movq	48(%rcx),%r14
3245	movq	%r15,56(%rdi)
3246	movq	56(%rcx),%r15
3247	movq	%rcx,%rdi
3248	jmp	.Lsqrx8x_outer_loop
3249
3250.align	32
3251.Lsqrx8x_outer_break:
3252	movq	%r9,72(%rdi)
3253.byte	102,72,15,126,217
3254	movq	%r10,80(%rdi)
3255	movq	%r11,88(%rdi)
3256	movq	%r12,96(%rdi)
3257	movq	%r13,104(%rdi)
3258	movq	%r14,112(%rdi)
3259	leaq	48+8(%rsp),%rdi
3260	movq	(%rsi,%rcx,1),%rdx
3261
3262	movq	8(%rdi),%r11
3263	xorq	%r10,%r10
3264	movq	0+8(%rsp),%r9
3265	adoxq	%r11,%r11
3266	movq	16(%rdi),%r12
3267	movq	24(%rdi),%r13
3268
3269
3270.align	32
3271.Lsqrx4x_shift_n_add:
3272	mulxq	%rdx,%rax,%rbx
3273	adoxq	%r12,%r12
3274	adcxq	%r10,%rax
3275.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3276.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3277	adoxq	%r13,%r13
3278	adcxq	%r11,%rbx
3279	movq	40(%rdi),%r11
3280	movq	%rax,0(%rdi)
3281	movq	%rbx,8(%rdi)
3282
3283	mulxq	%rdx,%rax,%rbx
3284	adoxq	%r10,%r10
3285	adcxq	%r12,%rax
3286	movq	16(%rsi,%rcx,1),%rdx
3287	movq	48(%rdi),%r12
3288	adoxq	%r11,%r11
3289	adcxq	%r13,%rbx
3290	movq	56(%rdi),%r13
3291	movq	%rax,16(%rdi)
3292	movq	%rbx,24(%rdi)
3293
3294	mulxq	%rdx,%rax,%rbx
3295	adoxq	%r12,%r12
3296	adcxq	%r10,%rax
3297	movq	24(%rsi,%rcx,1),%rdx
3298	leaq	32(%rcx),%rcx
3299	movq	64(%rdi),%r10
3300	adoxq	%r13,%r13
3301	adcxq	%r11,%rbx
3302	movq	72(%rdi),%r11
3303	movq	%rax,32(%rdi)
3304	movq	%rbx,40(%rdi)
3305
3306	mulxq	%rdx,%rax,%rbx
3307	adoxq	%r10,%r10
3308	adcxq	%r12,%rax
3309	jrcxz	.Lsqrx4x_shift_n_add_break
3310.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3311	adoxq	%r11,%r11
3312	adcxq	%r13,%rbx
3313	movq	80(%rdi),%r12
3314	movq	88(%rdi),%r13
3315	movq	%rax,48(%rdi)
3316	movq	%rbx,56(%rdi)
3317	leaq	64(%rdi),%rdi
3318	nop
3319	jmp	.Lsqrx4x_shift_n_add
3320
3321.align	32
3322.Lsqrx4x_shift_n_add_break:
3323	adcxq	%r13,%rbx
3324	movq	%rax,48(%rdi)
3325	movq	%rbx,56(%rdi)
3326	leaq	64(%rdi),%rdi
3327.byte	102,72,15,126,213
3328__bn_sqrx8x_reduction:
3329	xorl	%eax,%eax
3330	movq	32+8(%rsp),%rbx
3331	movq	48+8(%rsp),%rdx
3332	leaq	-64(%rbp,%r9,1),%rcx
3333
3334	movq	%rcx,0+8(%rsp)
3335	movq	%rdi,8+8(%rsp)
3336
3337	leaq	48+8(%rsp),%rdi
3338	jmp	.Lsqrx8x_reduction_loop
3339
3340.align	32
3341.Lsqrx8x_reduction_loop:
3342	movq	8(%rdi),%r9
3343	movq	16(%rdi),%r10
3344	movq	24(%rdi),%r11
3345	movq	32(%rdi),%r12
3346	movq	%rdx,%r8
3347	imulq	%rbx,%rdx
3348	movq	40(%rdi),%r13
3349	movq	48(%rdi),%r14
3350	movq	56(%rdi),%r15
3351	movq	%rax,24+8(%rsp)
3352
3353	leaq	64(%rdi),%rdi
3354	xorq	%rsi,%rsi
3355	movq	$-8,%rcx
3356	jmp	.Lsqrx8x_reduce
3357
3358.align	32
3359.Lsqrx8x_reduce:
3360	movq	%r8,%rbx
3361	mulxq	0(%rbp),%rax,%r8
3362	adcxq	%rbx,%rax
3363	adoxq	%r9,%r8
3364
3365	mulxq	8(%rbp),%rbx,%r9
3366	adcxq	%rbx,%r8
3367	adoxq	%r10,%r9
3368
3369	mulxq	16(%rbp),%rbx,%r10
3370	adcxq	%rbx,%r9
3371	adoxq	%r11,%r10
3372
3373	mulxq	24(%rbp),%rbx,%r11
3374	adcxq	%rbx,%r10
3375	adoxq	%r12,%r11
3376
3377.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
3378	movq	%rdx,%rax
3379	movq	%r8,%rdx
3380	adcxq	%rbx,%r11
3381	adoxq	%r13,%r12
3382
3383	mulxq	32+8(%rsp),%rbx,%rdx
3384	movq	%rax,%rdx
3385	movq	%rax,64+48+8(%rsp,%rcx,8)
3386
3387	mulxq	40(%rbp),%rax,%r13
3388	adcxq	%rax,%r12
3389	adoxq	%r14,%r13
3390
3391	mulxq	48(%rbp),%rax,%r14
3392	adcxq	%rax,%r13
3393	adoxq	%r15,%r14
3394
3395	mulxq	56(%rbp),%rax,%r15
3396	movq	%rbx,%rdx
3397	adcxq	%rax,%r14
3398	adoxq	%rsi,%r15
3399	adcxq	%rsi,%r15
3400
3401.byte	0x67,0x67,0x67
3402	incq	%rcx
3403	jnz	.Lsqrx8x_reduce
3404
3405	movq	%rsi,%rax
3406	cmpq	0+8(%rsp),%rbp
3407	jae	.Lsqrx8x_no_tail
3408
3409	movq	48+8(%rsp),%rdx
3410	addq	0(%rdi),%r8
3411	leaq	64(%rbp),%rbp
3412	movq	$-8,%rcx
3413	adcxq	8(%rdi),%r9
3414	adcxq	16(%rdi),%r10
3415	adcq	24(%rdi),%r11
3416	adcq	32(%rdi),%r12
3417	adcq	40(%rdi),%r13
3418	adcq	48(%rdi),%r14
3419	adcq	56(%rdi),%r15
3420	leaq	64(%rdi),%rdi
3421	sbbq	%rax,%rax
3422
3423	xorq	%rsi,%rsi
3424	movq	%rax,16+8(%rsp)
3425	jmp	.Lsqrx8x_tail
3426
3427.align	32
3428.Lsqrx8x_tail:
3429	movq	%r8,%rbx
3430	mulxq	0(%rbp),%rax,%r8
3431	adcxq	%rax,%rbx
3432	adoxq	%r9,%r8
3433
3434	mulxq	8(%rbp),%rax,%r9
3435	adcxq	%rax,%r8
3436	adoxq	%r10,%r9
3437
3438	mulxq	16(%rbp),%rax,%r10
3439	adcxq	%rax,%r9
3440	adoxq	%r11,%r10
3441
3442	mulxq	24(%rbp),%rax,%r11
3443	adcxq	%rax,%r10
3444	adoxq	%r12,%r11
3445
3446.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3447	adcxq	%rax,%r11
3448	adoxq	%r13,%r12
3449
3450	mulxq	40(%rbp),%rax,%r13
3451	adcxq	%rax,%r12
3452	adoxq	%r14,%r13
3453
3454	mulxq	48(%rbp),%rax,%r14
3455	adcxq	%rax,%r13
3456	adoxq	%r15,%r14
3457
3458	mulxq	56(%rbp),%rax,%r15
3459	movq	72+48+8(%rsp,%rcx,8),%rdx
3460	adcxq	%rax,%r14
3461	adoxq	%rsi,%r15
3462	movq	%rbx,(%rdi,%rcx,8)
3463	movq	%r8,%rbx
3464	adcxq	%rsi,%r15
3465
3466	incq	%rcx
3467	jnz	.Lsqrx8x_tail
3468
3469	cmpq	0+8(%rsp),%rbp
3470	jae	.Lsqrx8x_tail_done
3471
3472	subq	16+8(%rsp),%rsi
3473	movq	48+8(%rsp),%rdx
3474	leaq	64(%rbp),%rbp
3475	adcq	0(%rdi),%r8
3476	adcq	8(%rdi),%r9
3477	adcq	16(%rdi),%r10
3478	adcq	24(%rdi),%r11
3479	adcq	32(%rdi),%r12
3480	adcq	40(%rdi),%r13
3481	adcq	48(%rdi),%r14
3482	adcq	56(%rdi),%r15
3483	leaq	64(%rdi),%rdi
3484	sbbq	%rax,%rax
3485	subq	$8,%rcx
3486
3487	xorq	%rsi,%rsi
3488	movq	%rax,16+8(%rsp)
3489	jmp	.Lsqrx8x_tail
3490
3491.align	32
3492.Lsqrx8x_tail_done:
3493	xorq	%rax,%rax
3494	addq	24+8(%rsp),%r8
3495	adcq	$0,%r9
3496	adcq	$0,%r10
3497	adcq	$0,%r11
3498	adcq	$0,%r12
3499	adcq	$0,%r13
3500	adcq	$0,%r14
3501	adcq	$0,%r15
3502	adcq	$0,%rax
3503
3504	subq	16+8(%rsp),%rsi
3505.Lsqrx8x_no_tail:
3506	adcq	0(%rdi),%r8
3507.byte	102,72,15,126,217
3508	adcq	8(%rdi),%r9
3509	movq	56(%rbp),%rsi
3510.byte	102,72,15,126,213
3511	adcq	16(%rdi),%r10
3512	adcq	24(%rdi),%r11
3513	adcq	32(%rdi),%r12
3514	adcq	40(%rdi),%r13
3515	adcq	48(%rdi),%r14
3516	adcq	56(%rdi),%r15
3517	adcq	$0,%rax
3518
3519	movq	32+8(%rsp),%rbx
3520	movq	64(%rdi,%rcx,1),%rdx
3521
3522	movq	%r8,0(%rdi)
3523	leaq	64(%rdi),%r8
3524	movq	%r9,8(%rdi)
3525	movq	%r10,16(%rdi)
3526	movq	%r11,24(%rdi)
3527	movq	%r12,32(%rdi)
3528	movq	%r13,40(%rdi)
3529	movq	%r14,48(%rdi)
3530	movq	%r15,56(%rdi)
3531
3532	leaq	64(%rdi,%rcx,1),%rdi
3533	cmpq	8+8(%rsp),%r8
3534	jb	.Lsqrx8x_reduction_loop
3535	.byte	0xf3,0xc3
3536.cfi_endproc
3537.size	GFp_bn_sqrx8x_internal,.-GFp_bn_sqrx8x_internal
/*
 * __bn_postx4x_internal — constant-time final reduction step for the
 * MULX/ADX (sqrx) code path.  Copies the reduced value from the temporary
 * area to the result buffer, subtracting the modulus exactly when the
 * preceding reduction left a borrow.  The selection is done with ANDN
 * masks and an ADC chain, so there are no secret-dependent branches or
 * memory addresses.
 *
 * NOTE(review): register roles inferred from this code and its callers
 * elsewhere in the file — confirm against the generating perlasm script:
 *   %rbp  = modulus n (advanced in 32-byte steps)
 *   %rdi  = source: reduced result in the temporary area
 *   %rax  = borrow flag from the reduction (0 or 1 on entry)
 *   %rcx  = negated byte count; shifted below into a 4-word chunk count
 *   %xmm1 = result pointer rp (restored into %rdx/%rsi below)
 * Clobbers: %rax, %rdx, %rsi, %r8, %r10, %r12-%r15, flags.
 */
.align	32
.type	__bn_postx4x_internal,@function
__bn_postx4x_internal:
.cfi_startproc
	movq	0(%rbp),%r12	/* r12 = n[0] */
	movq	%rcx,%r10
	movq	%rcx,%r9	/* keep a copy to restore the count on exit */
	negq	%rax	/* 0/1 borrow -> 0/-1 all-ones selection mask */
	sarq	$3+2,%rcx	/* byte count -> number of 4-word iterations */

.byte	102,72,15,126,202	/* movq %xmm1,%rdx: restore rp */
.byte	102,72,15,126,206	/* movq %xmm1,%rsi: restore rp */
	decq	%r12	/* n[0]-1: ANDN's complement then yields -n[0], */
			/* supplying the +1 of two's-complement subtraction */
	movq	8(%rbp),%r13
	xorq	%r8,%r8	/* running borrow = 0 */
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	/* Load the next four modulus words. */
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqrx4x_sub_entry:
	/* rN = ~n[i] & mask: all-ones mask selects the subtraction, */
	/* zero mask degenerates the ADC chain into a plain copy.     */
	andnq	%rax,%r12,%r12
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13
	andnq	%rax,%r14,%r14
	andnq	%rax,%r15,%r15

	negq	%r8	/* re-materialize CF from the saved borrow */
	adcq	0(%rdi),%r12
	adcq	8(%rdi),%r13
	adcq	16(%rdi),%r14
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8	/* save borrow across the loop-control arithmetic */
	movq	%r14,16(%rdx)
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx	/* one 4-word chunk done */
	jnz	.Lsqrx4x_sub

	negq	%r9	/* restore (positive) count for the caller */

	.byte	0xf3,0xc3	/* rep ret (branch-predictor-friendly return) */
.cfi_endproc
.size	__bn_postx4x_internal,.-__bn_postx4x_internal
/*
 * void GFp_bn_scatter5(const BN_ULONG *inp, size_t num,
 *                      BN_ULONG *table, size_t power);
 * In: %rdi = inp, %esi = num (word count), %rdx = table, %rcx = power.
 *
 * Stores num 64-bit words of inp into column `power` of the table,
 * one word every 256 bytes.  256/8 = 32 values are thus interleaved
 * word-by-word, the layout the constant-time gather below depends on.
 */
.globl	GFp_bn_scatter5
.hidden GFp_bn_scatter5
.type	GFp_bn_scatter5,@function
.align	16
GFp_bn_scatter5:
.cfi_startproc
	cmpl	$0,%esi	/* nothing to store for num == 0 */
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx	/* %rdx = &table word slot `power` */
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx	/* advance one interleaved row (32 * 8 bytes) */
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3	/* rep ret */
.cfi_endproc
.size	GFp_bn_scatter5,.-GFp_bn_scatter5
3611
/*
 * void GFp_bn_gather5(BN_ULONG *out, size_t num,
 *                     const void *table, size_t power);
 * In: %rdi = out, %esi = num (word count), %rdx = table, %ecx = power.
 *
 * Constant-time (cache-timing safe) gather: instead of indexing the
 * table with the secret `power`, every iteration reads ALL 32 candidate
 * words of a 256-byte row, ANDs them with precomputed equality masks and
 * ORs the results together, so the memory access pattern is independent
 * of `power`.  Counterpart of GFp_bn_scatter5's interleaved layout.
 */
.globl	GFp_bn_gather5
.hidden GFp_bn_gather5
.type	GFp_bn_gather5,@function
.align	32
GFp_bn_gather5:
.cfi_startproc
.LSEH_begin_GFp_bn_gather5:

.byte	0x4c,0x8d,0x14,0x24	/* lea (%rsp),%r10: save incoming %rsp */
.cfi_def_cfa_register	%r10
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	/* sub $0x108,%rsp: mask scratch */
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp	/* align scratch for movdqa stores */

	/*
	 * Build 16 aligned 128-bit masks at -128(%rax)..112(%rax).
	 * .Linc seeds xmm0/xmm1 with indices {0,0,1,1} and step {2,2,2,2}
	 * (each index duplicated across two dwords so a pcmpeqd result
	 * covers a full 64-bit lane); xmm5 is `power` broadcast.  Exactly
	 * one 64-bit lane among all masks ends up all-ones.
	 */
	movd	%ecx,%xmm5
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	128(%rdx),%r11	/* bias table pointer for disp8 addressing */
	leaq	128(%rsp),%rax	/* bias mask pointer likewise */

	pshufd	$0,%xmm5,%xmm5	/* broadcast power to all dword lanes */
	movdqa	%xmm1,%xmm4	/* xmm4 = constant increment {2,2,2,2} */
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0	/* mask for indices {0,1} */
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1	/* mask for indices {2,3} */
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)	/* mask for indices {30,31} */
	jmp	.Lgather

	/*
	 * Per output word: touch the whole 256-byte row (all 32 candidate
	 * values at %r11), AND with the 16 masks, OR-reduce into xmm4,
	 * fold the two qword lanes together and emit the selected word.
	 */
.align	32
.Lgather:
	pxor	%xmm4,%xmm4	/* OR-accumulators */
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	leaq	256(%r11),%r11	/* next interleaved row */
	pshufd	$0x4e,%xmm4,%xmm0	/* swap qword halves */
	por	%xmm4,%xmm0	/* selected word now in low qword */
	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp	/* restore caller's stack pointer */
.cfi_def_cfa_register	%rsp
	.byte	0xf3,0xc3	/* rep ret */
.LSEH_end_GFp_bn_gather5:
.cfi_endproc
.size	GFp_bn_gather5,.-GFp_bn_gather5
/*
 * Seed constants for the gather mask build: the first 16 bytes hold the
 * initial index pair {0,0,1,1} (each index duplicated across two dwords
 * to cover a 64-bit lane), the second 16 bytes the per-step increment
 * {2,2,2,2}.
 */
.align	64
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
/* "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>" */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3785#endif
3786.section	.note.GNU-stack,"",@progbits
3787