.text

# Montgomery multiplication for x86_64, GAS / AT&T syntax, ELF,
# System V AMD64 ABI.
#
# bn_mul_mont(rp=%rdi, ap=%rsi, bp=%rdx, np=%rcx, n0p=%r8, num=%r9d)
# Computes the Montgomery product of ap[] and bp[] modulo np[] into rp[];
# returns 1 in %rax.  %r8 points at n0 (presumably -1/np[0] mod 2^64 as in
# the classic algorithm -- the value is only loaded and used as the
# reduction multiplier here).  num is the limb count.
#
# Entry dispatches to faster code paths: num a multiple of 4 and >= 8 goes
# to bn_mul4x_mont (or bn_mulx4x_mont when MULX+ADX are available);
# squaring (ap == bp) with 8 | num goes to bn_sqr8x_mont.
.globl	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
.cfi_startproc
	movl	%r9d,%r9d		# zero-extend num to 64 bits
	movq	%rsp,%rax		# remember caller %rsp for CFA/epilogue
.cfi_def_cfa_register	%rax
	testl	$3,%r9d
	jnz	.Lmul_enter		# num not a multiple of 4: generic path
	cmpl	$8,%r9d
	jb	.Lmul_enter		# num < 8: generic path
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d	# capability word for mulx/adx dispatch
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter		# ap != bp: 4x unrolled multiply
	testl	$7,%r9d
	jz	.Lsqr8x_enter		# squaring with 8 | num: dedicated code
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10	# tentative frame: num limbs + 2 qwords
	negq	%r9			# restore num
	andq	$-1024,%r10		# 1KB-align the temporary vector

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		# touch the page we landed on
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:			# probe the stack one page at a time so
	leaq	-4096(%rsp),%rsp	# the guard page is never skipped over
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# save caller %rsp just above tp[num]
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
	movq	%rdx,%r12		# bp
	movq	(%r8),%r8		# n0 = *n0p
	movq	(%r12),%rbx		# bp[0]
	movq	(%rsi),%rax		# ap[0]

	xorq	%r14,%r14		# i = 0 (outer index)
	xorq	%r15,%r15		# j = 0 (inner index)

	movq	%r8,%rbp
	mulq	%rbx			# ap[0]*bp[0]
	movq	%rax,%r10
	movq	(%rcx),%rax		# np[0]

	imulq	%r10,%rbp		# m = lo(t[0]*n0), reduction multiplier
	movq	%rdx,%r11

	mulq	%rbp			# np[0]*m; low result limb is discarded
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15		# j = 1
	jmp	.L1st_enter

.align	16
.L1st:					# tp[j-1] = ap[j]*bp[0] + np[j]*m + carries
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx			# ap[j]*bp[0]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp			# np[j]*m
	cmpq	%r9,%r15
	jne	.L1st

	addq	%rax,%r13		# finish last limb of the first pass
	movq	(%rsi),%rax		# preload ap[0] for the outer loop
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	# tp[num-1]
	movq	%rdx,(%rsp,%r9,8)	# tp[num]: top carry word

	leaq	1(%r14),%r14		# i = 1
	jmp	.Louter
.align	16
.Louter:				# accumulate bp[i] row and reduce
	movq	(%r12,%r14,8),%rbx	# bp[i]
	xorq	%r15,%r15		# j = 0
	movq	%r8,%rbp
	movq	(%rsp),%r10		# tp[0]
	mulq	%rbx			# ap[0]*bp[i]
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m = lo(tp[0]*n0)
	movq	%rdx,%r11

	mulq	%rbp			# np[0]*m
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10		# tp[1]
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13		# += tp[j]
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx			# ap[j]*bp[i]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp			# np[j]*m
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10	# old top carry tp[num]
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13		# fold in previous top carry
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)	# new top carry

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	xorq	%r14,%r14		# also clears CF for the first sbb below
	movq	(%rsp),%rax
	movq	%r9,%r15

.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax	# rp[] = tp[] - np[], borrow propagates
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsp,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax			# %rax = all-ones iff tp < np (keep tp)
	movq	$-1,%rbx
	xorq	%rax,%rbx		# %rbx = ~mask (keep tp - np)
	xorq	%r14,%r14
	movq	%r9,%r15

.Lcopy:					# constant-time select, no data-dependent
	movq	(%rdi,%r14,8),%rcx	# branch: rp = borrow ? tp : tp - np
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r9,(%rsp,%r14,8)	# wipe tp[] as we go
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	# restore saved caller %rsp
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			# return value 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
# bn_mul4x_mont: 4-way unrolled Montgomery multiplication, taken when num
# is a multiple of 4 and >= 8.  Same arguments and return as bn_mul_mont.
# When entered through .Lmul4x_enter, %r11d holds a capability word; if
# the MULX and ADX feature bits (0x80100) are both set, control transfers
# to bn_mulx4x_mont instead.
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.cfi_startproc
	movl	%r9d,%r9d		# zero-extend num
	movq	%rsp,%rax		# remember caller %rsp
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	andl	$0x80100,%r11d		# BMI2(mulx) | ADX feature bits
	cmpl	$0x80100,%r11d
	je	.Lmulx4x_enter
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10	# frame: num limbs + 4 qwords
	negq	%r9
	andq	$-1024,%r10		# 1KB-align the temporary vector

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:			# stack probe, one page at a time
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# save caller %rsp
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	# save rp; %rdi becomes scratch below
	movq	%rdx,%r12		# bp
	movq	(%r8),%r8		# n0
	movq	(%r12),%rbx		# bp[0]
	movq	(%rsi),%rax		# ap[0]

	xorq	%r14,%r14		# i
	xorq	%r15,%r15		# j

	movq	%r8,%rbp
	mulq	%rbx			# ap[0]*bp[0]
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		# m = lo(t[0]*n0)
	movq	%rdx,%r11

	mulq	%rbp			# np[0]*m
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx			# ap[1]*bp[0]
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[1]*m
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15		# j = 4
	adcq	$0,%rdx
	movq	%rdi,(%rsp)		# tp[0]
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:				# four limbs of ap*bp[0] + np*m per pass
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15		# j += 4
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	mulq	%rbx			# tail: the final two limbs
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# preload ap[0] for next outer pass
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	# tp[num]: top carry

	leaq	1(%r14),%r14		# i = 1
.align	4
.Louter4x:				# remaining rows: tp += ap*bp[i] + np*m
	movq	(%r12,%r14,8),%rbx	# bp[i]
	xorq	%r15,%r15
	movq	(%rsp),%r10		# tp[0]
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m = lo(tp[0]*n0)
	movq	%rdx,%r11

	mulq	%rbp			# np[0]*m
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11		# += tp[1]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10	# += tp[j]
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15		# j += 4
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	mulq	%rbx			# tail of the inner loop
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14		# i += 1
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13	# fold in previous top carry
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	# new top carry

	cmpq	%r9,%r14
	jb	.Louter4x
	movq	16(%rsp,%r9,8),%rdi	# restore rp
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdx
	shrq	$2,%r15			# (num-4)/4 loop passes
	leaq	(%rsp),%rsi		# tp
	xorq	%r14,%r14

	subq	0(%rcx),%rax		# tp - np, four limbs per pass, software-
	movq	16(%rsi),%rbx		# pipelined with the stores below
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx

.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)	# drain the pipeline: last four limbs
	movq	32(%rsi,%r14,8),%rax	# tp[num] (top carry word)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	sbbq	$0,%rax			# all-ones iff tp < np -> selection mask
	movq	%rbp,24(%rdi,%r14,8)
	pxor	%xmm0,%xmm0		# zero, used to wipe tp[]
.byte	102,72,15,110,224		# movq %rax,%xmm4
	pcmpeqd	%xmm5,%xmm5
	pshufd	$0,%xmm4,%xmm4		# broadcast mask
	movq	%r9,%r15
	pxor	%xmm4,%xmm5		# xmm5 = ~mask
	shrq	$2,%r15			# num/4 16-byte pairs
	xorl	%eax,%eax

	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# constant-time select rp = mask ? tp :
	movdqa	(%rsp,%rax,1),%xmm1	# tp-np, wiping tp[] as we go
	movdqu	(%rdi,%rax,1),%xmm2
	pand	%xmm4,%xmm1
	pand	%xmm5,%xmm2
	movdqa	16(%rsp,%rax,1),%xmm3
	movdqa	%xmm0,(%rsp,%rax,1)
	por	%xmm2,%xmm1
	movdqu	16(%rdi,%rax,1),%xmm2
	movdqu	%xmm1,(%rdi,%rax,1)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax,1)
	por	%xmm2,%xmm3
	movdqu	%xmm3,16(%rdi,%rax,1)
	leaq	32(%rax),%rax
	decq	%r15
	jnz	.Lcopy4x
	movq	8(%rsp,%r9,8),%rsi	# restore saved caller %rsp
.cfi_def_cfa	%rsi, 8
	movq	$1,%rax			# return value 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont


# bn_sqr8x_mont: Montgomery squaring (ap == bp) for 8 | num, reached via
# .Lsqr8x_enter from bn_mul_mont; same arguments and return value.
# Delegates the squaring+reduction to bn_sqr8x_internal, or to
# bn_sqrx8x_internal when MULX+ADX are available, then performs the final
# conditional subtraction and constant-time copy-out here.
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
.cfi_startproc
	movq	%rsp,%rax		# remember caller %rsp
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lsqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d			# num in bytes
	shlq	$3+2,%r10		# 4*num bytes
	negq	%r9

	leaq	-64(%rsp,%r9,2),%r11	# frame for 2*num limbs + 64 bytes
	movq	%rsp,%rbp
	movq	(%r8),%r8		# n0
	subq	%rsi,%r11		# choose the frame so tp does not
	andq	$4095,%r11		# alias ap modulo 4KB (cache/TLB sets)
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:				# alternative placement when the offset
	leaq	4096-64(,%r9,2),%r10	# would not fit inside the frame
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lsqr8x_sp_done:
	andq	$-64,%rbp		# 64-byte-align the frame
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:			# stack probe, one page at a time
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	movq	%r9,%r10		# -num*8
	negq	%r9			# num*8

	movq	%r8,32(%rsp)		# save n0
	movq	%rax,40(%rsp)		# save caller %rsp
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lsqr8x_body:

.byte	102,72,15,110,209		# movq %rcx,%xmm2 (stash np)
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		# movq %rdi,%xmm1 (stash rp)
.byte	102,73,15,110,218		# movq %r10,%xmm3 (stash -num*8)
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
	andl	$0x80100,%eax		# MULX and ADX both available?
	cmpl	$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# register contract (%rcx,%r8,%rbp,%rax
					# on return) is defined by that routine

	leaq	(%r8,%rcx,1),%rbx	# start of the upper half of tp
	movq	%rcx,%r9
	movq	%rcx,%rdx
.byte	102,72,15,126,207		# movq %xmm1,%rdi (recover rp)
	sarq	$3+2,%rcx		# negative limb count / 4
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
	call	bn_sqr8x_internal	# same contract via %rdi/%r9/%rbp/%rax

	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207		# movq %xmm1,%rdi (recover rp)
	sarq	$3+2,%rcx
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:				# rp = upper tp - np, borrow propagating
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx			# counts up from negative toward zero
	jnz	.Lsqr8x_sub

	sbbq	$0,%rax			# fold borrow into top carry -> mask
	leaq	(%rbx,%r9,1),%rbx	# rewind tp cursor
	leaq	(%rdi,%r9,1),%rdi	# rewind rp cursor

.byte	102,72,15,110,200		# movq %rax,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		# broadcast selection mask
	movq	40(%rsp),%rsi		# recover caller %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:			# constant-time select between tp and
	movdqa	0(%rbx),%xmm2		# the subtracted value, wiping tp[]
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)		# zero lower half of tp
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)	# zero upper half of tp
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0		# xmm0 = complement of the mask
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9			# %r9 = -num*8, counts up to zero
	jnz	.Lsqr8x_cond_copy

	movq	$1,%rax			# return value 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
# bn_mulx4x_mont: Montgomery multiplication using BMI2 MULX plus the ADX
# dual carry chains (ADCX drives CF, ADOX drives OF, so two independent
# carry streams run interleaved).  Reached via .Lmulx4x_enter when both
# features were detected; same arguments and return as bn_mul_mont.
.type	bn_mulx4x_mont,@function
.align	32
bn_mulx4x_mont:
.cfi_startproc
	movq	%rsp,%rax		# remember caller %rsp
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d			# num in bytes
	xorq	%r10,%r10
	subq	%r9,%r10		# -num*8
	movq	(%r8),%r8		# n0
	leaq	-72(%rsp,%r10,1),%rbp	# frame: num limbs + 72 bytes
	andq	$-128,%rbp		# 128-byte aligned
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:			# stack probe, one page at a time
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	leaq	(%rdx,%r9,1),%r10	# &bp[num]: end-of-input sentinel

	# Frame layout: 0(%rsp)=num*8, 8=bp cursor, 16=&bp[num], 24=n0,
	# 32=rp, 40=saved caller %rsp, 48=inner-loop counter; tp at 64+.
	movq	%r9,0(%rsp)
	shrq	$5,%r9			# (num*8)/32 = num/4
	movq	%r10,16(%rsp)
	subq	$1,%r9			# num/4 - 1 inner-loop passes
	movq	%r8,24(%rsp)
	movq	%rdi,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
	movq	%r9,48(%rsp)
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
	leaq	8(%rdx),%rdi		# bp cursor
	movq	(%rdx),%rdx		# b0; mulx multiplies by %rdx implicitly
	leaq	64+32(%rsp),%rbx	# tp
	movq	%rdx,%r9		# keep b0 across the m-multiplications

	mulxq	0(%rsi),%r8,%rax	# a[0]*b0
	mulxq	8(%rsi),%r11,%r14	# a[1]*b0
	addq	%rax,%r11
	movq	%rdi,8(%rsp)
	mulxq	16(%rsi),%r12,%r13	# a[2]*b0
	adcq	%r14,%r12
	adcq	$0,%r13

	movq	%r8,%rdi		# keep t[0]
	imulq	24(%rsp),%r8		# m = lo(t[0]*n0)
	xorq	%rbp,%rbp		# constant zero; also clears CF and OF

	mulxq	24(%rsi),%rax,%r14	# a[3]*b0
	movq	%r8,%rdx		# switch multiplier to m
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10	# n[0]*m; annihilates the low limb
	adcxq	%rax,%rdi
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11	# n[1]*m
	adcxq	%rax,%r10
	adoxq	%r12,%r11
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulxq 16(%rcx),%rax,%r12
	movq	48(%rsp),%rdi		# inner-loop counter
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15	# n[3]*m
	movq	%r9,%rdx		# back to b0
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:				# first pass: tp = a*b0 + n*m,
	adcxq	%rbp,%r15		# four limbs per iteration
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67			# 0x67 prefixes; presumably code-
	movq	%r8,%rdx		# alignment padding -- confirm vs perlasm
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10		# second chain: accumulate n[]*m
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	0(%rsp),%rax		# num*8
	movq	8(%rsp),%rdi		# bp cursor
	adcq	%rbp,%r15
	addq	%r15,%r14
	sbbq	%r15,%r15		# top-most carry as an all-ones mask
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:				# remaining rows: tp += a*b[i] + n*m
	movq	(%rdi),%rdx		# b[i]
	leaq	8(%rdi),%rdi
	subq	%rax,%rsi		# rewind ap
	movq	%r15,(%rbx)		# stash top carry at tp[num]
	leaq	64+32(%rsp),%rbx
	subq	%rax,%rcx		# rewind np

	mulxq	0(%rsi),%r8,%r11
	xorl	%ebp,%ebp		# zero scratch; clears CF and OF
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8		# += tp[j] on the OF chain
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	-16(%rbx),%r12
	adcxq	%rbp,%r13
	adoxq	%rbp,%r13

	movq	%rdi,8(%rsp)
	movq	%r8,%r15
	imulq	24(%rsp),%r8		# m = lo(t[0]*n0)
	xorl	%ebp,%ebp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx		# switch multiplier to m
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10	# n[0]*m annihilates the low limb
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# back to b[i]
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	movq	48(%rsp),%rdi		# inner-loop counter
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10		# += tp[j] on the CF chain
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10		# accumulate n[]*m
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0(%rsp),%rax		# num*8
	movq	8(%rsp),%rdi		# bp cursor
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp		# pull stashed top carry into CF
	adcq	%r15,%r14
	sbbq	%r15,%r15		# new top carry as mask
	movq	%r14,-8(%rbx)

	cmpq	16(%rsp),%rdi		# reached &bp[num]?
	jne	.Lmulx4x_outer

	leaq	64(%rsp),%rbx		# tp
	subq	%rax,%rcx		# rewind np
	negq	%r15			# top carry as 0/1
	movq	%rax,%rdx
	shrq	$3+2,%rax		# num/4 subtraction passes
	movq	32(%rsp),%rdi		# rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:				# rp = tp - np, borrow propagating
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	.Lmulx4x_sub

	sbbq	$0,%r15			# top carry minus borrow -> mask
	leaq	64(%rsp),%rbx
	subq	%rdx,%rdi		# rewind rp

.byte	102,73,15,110,207		# movq %r15,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		# broadcast selection mask
	movq	40(%rsp),%rsi		# recover caller %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:			# constant-time select between tp and
	movdqa	0(%rbx),%xmm2		# the subtracted value, wiping tp[]
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0		# xmm0 = complement of the mask
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx		# num*8 counts down to zero
	jnz	.Lmulx4x_cond_copy

	movq	%rdx,(%rbx)		# zero the final tp word

	movq	$1,%rax			# return value 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
# ASCII: "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16