xref: /freebsd/sys/crypto/openssl/amd64/x86_64-mont.S (revision 9768746b)
1/* $FreeBSD$ */
2/* Do not modify. This file is auto-generated from x86_64-mont.pl. */
3.text
4
5
6
/*
 * bn_mul_mont: exported entry point and the generic word-by-word path.
 * (The identification string at the end of this file names the routine
 * "Montgomery Multiplication for x86_64".)
 *
 * Registers as this code uses them on entry:
 *   %rdi  destination vector, written by .Lsub and .Lcopy
 *   %rsi  first source vector "a" (qwords, indexed up to the word count)
 *   %rdx  second source vector "b" (kept in %r12, one qword per outer pass)
 *   %rcx  vector "n": multiplied by the per-pass factor in %rbp and
 *         finally subtracted from the accumulated result
 *   %r8   pointer to a single qword, loaded once at .Lmul_body
 *   %r9d  word count "num"
 * The accumulator "t" lives on the stack at (%rsp).  %rax is 1 on return.
 * NOTE(review): presumably OpenSSL's bn_mul_mont(rp, ap, bp, np, n0, num);
 * confirm against the generating x86_64-mont.pl.
 */
7.globl	bn_mul_mont
8.type	bn_mul_mont,@function
9.align	16
10bn_mul_mont:
11.cfi_startproc
/* A 32-bit register write zero-extends the word count into all of %r9. */
12	movl	%r9d,%r9d
/* %rax carries the caller's %rsp until it is saved in the new frame. */
13	movq	%rsp,%rax
14.cfi_def_cfa_register	%rax
/* Word count not a multiple of 4, or below 8: use the generic path. */
15	testl	$3,%r9d
16	jnz	.Lmul_enter
17	cmpl	$8,%r9d
18	jb	.Lmul_enter
/* %r11d is tested at .Lmul4x_enter to choose between the 4x and mulx paths. */
19	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
/*
 * Distinct source pointers -> 4x multiply.  Same pointer and a word count
 * that is a multiple of 8 -> 8x squaring path.  Otherwise 4x multiply.
 */
20	cmpq	%rsi,%rdx
21	jne	.Lmul4x_enter
22	testl	$7,%r9d
23	jz	.Lsqr8x_enter
24	jmp	.Lmul4x_enter
25
26.align	16
27.Lmul_enter:
/* Save the callee-saved registers this path clobbers. */
28	pushq	%rbx
29.cfi_offset	%rbx,-16
30	pushq	%rbp
31.cfi_offset	%rbp,-24
32	pushq	%r12
33.cfi_offset	%r12,-32
34	pushq	%r13
35.cfi_offset	%r13,-40
36	pushq	%r14
37.cfi_offset	%r14,-48
38	pushq	%r15
39.cfi_offset	%r15,-56
40
/*
 * %r10 = (%rsp - 16 - 8*num) rounded down to a 1024-byte boundary: room
 * for at least num+2 qwords.  %r9 is negated only for the lea and then
 * restored.
 */
41	negq	%r9
42	movq	%rsp,%r11
43	leaq	-16(%rsp,%r9,8),%r10
44	negq	%r9
45	andq	$-1024,%r10
46
47
48
49
50
51
52
53
54
/*
 * Move %rsp down to %r10 in steps of at most 4096 bytes, reading one word
 * at every step.  NOTE(review): presumably a stack probe so that guard
 * pages are touched in order - confirm against x86_64-mont.pl.
 */
55	subq	%r10,%r11
56	andq	$-4096,%r11
57	leaq	(%r10,%r11,1),%rsp
58	movq	(%rsp),%r11
59	cmpq	%r10,%rsp
60	ja	.Lmul_page_walk
61	jmp	.Lmul_page_walk_done
62
63.align	16
64.Lmul_page_walk:
65	leaq	-4096(%rsp),%rsp
66	movq	(%rsp),%r11
67	cmpq	%r10,%rsp
68	ja	.Lmul_page_walk
69.Lmul_page_walk_done:
70
/* Save the caller's %rsp just above t[num]; the epilogue reloads it. */
71	movq	%rax,8(%rsp,%r9,8)
/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 8 + %r9*8) + 8. */
72.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
73.Lmul_body:
/* %r12 = b, %r8 = qword behind the fifth argument, %rbx = b[0], %rax = a[0]. */
74	movq	%rdx,%r12
75	movq	(%r8),%r8
76	movq	(%r12),%rbx
77	movq	(%rsi),%rax
78
/* %r14 = outer index i = 0, %r15 = inner index j = 0. */
79	xorq	%r14,%r14
80	xorq	%r15,%r15
81
/*
 * First pass (i = 0): t = a*b[0] + n*m, where
 * m = %rbp = low 64 bits of %r8 * low64(a[0]*b[0]).
 */
82	movq	%r8,%rbp
83	mulq	%rbx
84	movq	%rax,%r10
85	movq	(%rcx),%rax
86
87	imulq	%r10,%rbp
88	movq	%rdx,%r11
89
/* Only the carry out of this low-word sum is kept (via %rdx); %r10 is overwritten before it is read again. */
90	mulq	%rbp
91	addq	%rax,%r10
92	movq	8(%rsi),%rax
93	adcq	$0,%rdx
94	movq	%rdx,%r13
95
96	leaq	1(%r15),%r15
97	jmp	.L1st_enter
98
99.align	16
100.L1st:
/* Fold the n[]*m product and the pending a[]*b[0] word into %r13 and store one word of t. */
101	addq	%rax,%r13
102	movq	(%rsi,%r15,8),%rax
103	adcq	$0,%rdx
104	addq	%r11,%r13
105	movq	%r10,%r11
106	adcq	$0,%rdx
107	movq	%r13,-16(%rsp,%r15,8)
108	movq	%rdx,%r13
109
110.L1st_enter:
/* a[j]*b[0] into %r11 (low, accumulated) and %r10 (high); then n[j]*m. */
111	mulq	%rbx
112	addq	%rax,%r11
113	movq	(%rcx,%r15,8),%rax
114	adcq	$0,%rdx
115	leaq	1(%r15),%r15
116	movq	%rdx,%r10
117
118	mulq	%rbp
119	cmpq	%r9,%r15
120	jne	.L1st
121
/* Loop tail: store the last product word, then the top word t[num-1] and the carry word t[num]. %rax is reloaded with a[0] for the next pass. */
122	addq	%rax,%r13
123	movq	(%rsi),%rax
124	adcq	$0,%rdx
125	addq	%r11,%r13
126	adcq	$0,%rdx
127	movq	%r13,-16(%rsp,%r15,8)
128	movq	%rdx,%r13
129	movq	%r10,%r11
130
131	xorq	%rdx,%rdx
132	addq	%r11,%r13
133	adcq	$0,%rdx
134	movq	%r13,-8(%rsp,%r9,8)
135	movq	%rdx,(%rsp,%r9,8)
136
137	leaq	1(%r14),%r14
138	jmp	.Louter
139.align	16
140.Louter:
/*
 * Passes i = 1 .. num-1: t = (t + a*b[i] + n*m) shifted down one word,
 * with m = low64(%r8 * low64(t[0] + a[0]*b[i])).
 */
141	movq	(%r12,%r14,8),%rbx
142	xorq	%r15,%r15
143	movq	%r8,%rbp
144	movq	(%rsp),%r10
145	mulq	%rbx
146	addq	%rax,%r10
147	movq	(%rcx),%rax
148	adcq	$0,%rdx
149
150	imulq	%r10,%rbp
151	movq	%rdx,%r11
152
/* As in the first pass, only the carry of the low-word sum survives; %r10 is then reloaded with t[1]. */
153	mulq	%rbp
154	addq	%rax,%r10
155	movq	8(%rsi),%rax
156	adcq	$0,%rdx
157	movq	8(%rsp),%r10
158	movq	%rdx,%r13
159
160	leaq	1(%r15),%r15
161	jmp	.Linner_enter
162
163.align	16
164.Linner:
/* Same shape as .L1st, with the previous t word (%r10) added in. */
165	addq	%rax,%r13
166	movq	(%rsi,%r15,8),%rax
167	adcq	$0,%rdx
168	addq	%r10,%r13
169	movq	(%rsp,%r15,8),%r10
170	adcq	$0,%rdx
171	movq	%r13,-16(%rsp,%r15,8)
172	movq	%rdx,%r13
173
174.Linner_enter:
175	mulq	%rbx
176	addq	%rax,%r11
177	movq	(%rcx,%r15,8),%rax
178	adcq	$0,%rdx
179	addq	%r11,%r10
180	movq	%rdx,%r11
181	adcq	$0,%r11
182	leaq	1(%r15),%r15
183
184	mulq	%rbp
185	cmpq	%r9,%r15
186	jne	.Linner
187
/* Pass tail: last product word, then the two top words including the old t[num] (loaded into %r10). */
188	addq	%rax,%r13
189	movq	(%rsi),%rax
190	adcq	$0,%rdx
191	addq	%r10,%r13
192	movq	(%rsp,%r15,8),%r10
193	adcq	$0,%rdx
194	movq	%r13,-16(%rsp,%r15,8)
195	movq	%rdx,%r13
196
197	xorq	%rdx,%rdx
198	addq	%r11,%r13
199	adcq	$0,%rdx
200	addq	%r10,%r13
201	adcq	$0,%rdx
202	movq	%r13,-8(%rsp,%r9,8)
203	movq	%rdx,(%rsp,%r9,8)
204
205	leaq	1(%r14),%r14
206	cmpq	%r9,%r14
207	jb	.Louter
208
/* The xorq also clears CF, so the first sbbq below starts with no borrow. */
209	xorq	%r14,%r14
210	movq	(%rsp),%rax
211	movq	%r9,%r15
212
213.align	16
/*
 * Destination = t - n over num words.  The loop control uses only
 * mov/lea/dec, which leave CF alone, so the borrow chains across iterations.
 */
214.Lsub:	sbbq	(%rcx,%r14,8),%rax
215	movq	%rax,(%rdi,%r14,8)
216	movq	8(%rsp,%r14,8),%rax
217	leaq	1(%r14),%r14
218	decq	%r15
219	jnz	.Lsub
220
/*
 * %rax holds t[num] here; subtracting the final borrow yields the select
 * mask (presumably 0 or all-ones), and %rbx is its complement.
 */
221	sbbq	$0,%rax
222	movq	$-1,%rbx
223	xorq	%rax,%rbx
224	xorq	%r14,%r14
225	movq	%r9,%r15
226
227.Lcopy:
/*
 * Branch-free select per word: destination = (destination & %rbx) |
 * (t & %rax).  Each t word is overwritten with the value of %r9, so the
 * intermediate result does not stay on the stack.
 */
228	movq	(%rdi,%r14,8),%rcx
229	movq	(%rsp,%r14,8),%rdx
230	andq	%rbx,%rcx
231	andq	%rax,%rdx
232	movq	%r9,(%rsp,%r14,8)
233	orq	%rcx,%rdx
234	movq	%rdx,(%rdi,%r14,8)
235	leaq	1(%r14),%r14
236	subq	$1,%r15
237	jnz	.Lcopy
238
/* Reload the caller's %rsp, set the return value to 1, restore the saved registers from below it. */
239	movq	8(%rsp,%r9,8),%rsi
240.cfi_def_cfa	%rsi,8
241	movq	$1,%rax
242	movq	-48(%rsi),%r15
243.cfi_restore	%r15
244	movq	-40(%rsi),%r14
245.cfi_restore	%r14
246	movq	-32(%rsi),%r13
247.cfi_restore	%r13
248	movq	-24(%rsi),%r12
249.cfi_restore	%r12
250	movq	-16(%rsi),%rbp
251.cfi_restore	%rbp
252	movq	-8(%rsi),%rbx
253.cfi_restore	%rbx
254	leaq	(%rsi),%rsp
255.cfi_def_cfa_register	%rsp
256.Lmul_epilogue:
/* 0xf3,0xc3 encodes "rep ret". */
257	.byte	0xf3,0xc3
258.cfi_endproc
259.size	bn_mul_mont,.-bn_mul_mont
/*
 * bn_mul4x_mont: same computation as the generic path of bn_mul_mont, with
 * the inner loops unrolled to four words per iteration.
 *
 * The symbol is not exported; bn_mul_mont jumps to .Lmul4x_enter with
 * %rax = caller's %rsp, %r9 = zero-extended word count and
 * %r11d = dword at OPENSSL_ia32cap_P+8.  The function's own first
 * instructions set up %rax and %r9 but do not load %r11d.
 * Arguments are otherwise as for bn_mul_mont (%rdi destination, %rsi = a,
 * %rdx = b, %rcx = n, %r8 = pointer to one qword).  %rax is 1 on return.
 */
260.type	bn_mul4x_mont,@function
261.align	16
262bn_mul4x_mont:
263.cfi_startproc
264	movl	%r9d,%r9d
265	movq	%rsp,%rax
266.cfi_def_cfa_register	%rax
267.Lmul4x_enter:
/*
 * Both bits of mask 0x80100 set -> use the mulx path.
 * NOTE(review): presumably the BMI2 and ADX feature bits, since the target
 * uses mulx/adcx/adox - confirm against the OPENSSL_ia32cap documentation.
 */
268	andl	$0x80100,%r11d
269	cmpl	$0x80100,%r11d
270	je	.Lmulx4x_enter
271	pushq	%rbx
272.cfi_offset	%rbx,-16
273	pushq	%rbp
274.cfi_offset	%rbp,-24
275	pushq	%r12
276.cfi_offset	%r12,-32
277	pushq	%r13
278.cfi_offset	%r13,-40
279	pushq	%r14
280.cfi_offset	%r14,-48
281	pushq	%r15
282.cfi_offset	%r15,-56
283
/* %r10 = (%rsp - 32 - 8*num) rounded down to a 1024-byte boundary: room for at least num+4 qwords. */
284	negq	%r9
285	movq	%rsp,%r11
286	leaq	-32(%rsp,%r9,8),%r10
287	negq	%r9
288	andq	$-1024,%r10
289
/* Move %rsp down to %r10 in steps of at most 4096 bytes, reading a word at each step (same scheme as in bn_mul_mont). */
290	subq	%r10,%r11
291	andq	$-4096,%r11
292	leaq	(%r10,%r11,1),%rsp
293	movq	(%rsp),%r11
294	cmpq	%r10,%rsp
295	ja	.Lmul4x_page_walk
296	jmp	.Lmul4x_page_walk_done
297
298.Lmul4x_page_walk:
299	leaq	-4096(%rsp),%rsp
300	movq	(%rsp),%r11
301	cmpq	%r10,%rsp
302	ja	.Lmul4x_page_walk
303.Lmul4x_page_walk_done:
304
/* Caller's %rsp goes just above t[num]. */
305	movq	%rax,8(%rsp,%r9,8)
/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 8 + %r9*8) + 8. */
306.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
307.Lmul4x_body:
/* The destination pointer is parked in the frame because %rdi is used as an accumulator below; it is reloaded before .Lsub4x. */
308	movq	%rdi,16(%rsp,%r9,8)
309	movq	%rdx,%r12
310	movq	(%r8),%r8
311	movq	(%r12),%rbx
312	movq	(%rsi),%rax
313
/* %r14 = outer index i = 0, %r15 = inner index j = 0. */
314	xorq	%r14,%r14
315	xorq	%r15,%r15
316
/* First pass (b[0]): %rbp = m = low64(%r8 * low64(a[0]*b[0])). Words 0 and 1 are handled here, ahead of the unrolled loop. */
317	movq	%r8,%rbp
318	mulq	%rbx
319	movq	%rax,%r10
320	movq	(%rcx),%rax
321
322	imulq	%r10,%rbp
323	movq	%rdx,%r11
324
325	mulq	%rbp
326	addq	%rax,%r10
327	movq	8(%rsi),%rax
328	adcq	$0,%rdx
329	movq	%rdx,%rdi
330
331	mulq	%rbx
332	addq	%rax,%r11
333	movq	8(%rcx),%rax
334	adcq	$0,%rdx
335	movq	%rdx,%r10
336
337	mulq	%rbp
338	addq	%rax,%rdi
339	movq	16(%rsi),%rax
340	adcq	$0,%rdx
341	addq	%r11,%rdi
342	leaq	4(%r15),%r15
343	adcq	$0,%rdx
344	movq	%rdi,(%rsp)
345	movq	%rdx,%r13
346	jmp	.L1st4x
347.align	16
348.L1st4x:
/*
 * Four a[]*b[0] / n[]*m product pairs per iteration.  %r10/%r11 alternate
 * as the a*b carry words and %r13/%rdi as the n*m accumulators; one word of
 * t is stored after each pair.  %r15 advances by 4.
 */
349	mulq	%rbx
350	addq	%rax,%r10
351	movq	-16(%rcx,%r15,8),%rax
352	adcq	$0,%rdx
353	movq	%rdx,%r11
354
355	mulq	%rbp
356	addq	%rax,%r13
357	movq	-8(%rsi,%r15,8),%rax
358	adcq	$0,%rdx
359	addq	%r10,%r13
360	adcq	$0,%rdx
361	movq	%r13,-24(%rsp,%r15,8)
362	movq	%rdx,%rdi
363
364	mulq	%rbx
365	addq	%rax,%r11
366	movq	-8(%rcx,%r15,8),%rax
367	adcq	$0,%rdx
368	movq	%rdx,%r10
369
370	mulq	%rbp
371	addq	%rax,%rdi
372	movq	(%rsi,%r15,8),%rax
373	adcq	$0,%rdx
374	addq	%r11,%rdi
375	adcq	$0,%rdx
376	movq	%rdi,-16(%rsp,%r15,8)
377	movq	%rdx,%r13
378
379	mulq	%rbx
380	addq	%rax,%r10
381	movq	(%rcx,%r15,8),%rax
382	adcq	$0,%rdx
383	movq	%rdx,%r11
384
385	mulq	%rbp
386	addq	%rax,%r13
387	movq	8(%rsi,%r15,8),%rax
388	adcq	$0,%rdx
389	addq	%r10,%r13
390	adcq	$0,%rdx
391	movq	%r13,-8(%rsp,%r15,8)
392	movq	%rdx,%rdi
393
394	mulq	%rbx
395	addq	%rax,%r11
396	movq	8(%rcx,%r15,8),%rax
397	adcq	$0,%rdx
398	leaq	4(%r15),%r15
399	movq	%rdx,%r10
400
401	mulq	%rbp
402	addq	%rax,%rdi
403	movq	-16(%rsi,%r15,8),%rax
404	adcq	$0,%rdx
405	addq	%r11,%rdi
406	adcq	$0,%rdx
407	movq	%rdi,-32(%rsp,%r15,8)
408	movq	%rdx,%r13
409	cmpq	%r9,%r15
410	jb	.L1st4x
411
/* Tail of the first pass: the last two product pairs, then the top word and the carry word. %rax is reloaded with a[0] for the next pass. */
412	mulq	%rbx
413	addq	%rax,%r10
414	movq	-16(%rcx,%r15,8),%rax
415	adcq	$0,%rdx
416	movq	%rdx,%r11
417
418	mulq	%rbp
419	addq	%rax,%r13
420	movq	-8(%rsi,%r15,8),%rax
421	adcq	$0,%rdx
422	addq	%r10,%r13
423	adcq	$0,%rdx
424	movq	%r13,-24(%rsp,%r15,8)
425	movq	%rdx,%rdi
426
427	mulq	%rbx
428	addq	%rax,%r11
429	movq	-8(%rcx,%r15,8),%rax
430	adcq	$0,%rdx
431	movq	%rdx,%r10
432
433	mulq	%rbp
434	addq	%rax,%rdi
435	movq	(%rsi),%rax
436	adcq	$0,%rdx
437	addq	%r11,%rdi
438	adcq	$0,%rdx
439	movq	%rdi,-16(%rsp,%r15,8)
440	movq	%rdx,%r13
441
442	xorq	%rdi,%rdi
443	addq	%r10,%r13
444	adcq	$0,%rdi
445	movq	%r13,-8(%rsp,%r15,8)
446	movq	%rdi,(%rsp,%r15,8)
447
448	leaq	1(%r14),%r14
449.align	4
450.Louter4x:
/*
 * Remaining passes, one per word of b (%rbx = b[i]).  Same structure as
 * the first pass, with the previous contents of t added into each a*b word.
 */
451	movq	(%r12,%r14,8),%rbx
452	xorq	%r15,%r15
453	movq	(%rsp),%r10
454	movq	%r8,%rbp
455	mulq	%rbx
456	addq	%rax,%r10
457	movq	(%rcx),%rax
458	adcq	$0,%rdx
459
460	imulq	%r10,%rbp
461	movq	%rdx,%r11
462
463	mulq	%rbp
464	addq	%rax,%r10
465	movq	8(%rsi),%rax
466	adcq	$0,%rdx
467	movq	%rdx,%rdi
468
469	mulq	%rbx
470	addq	%rax,%r11
471	movq	8(%rcx),%rax
472	adcq	$0,%rdx
473	addq	8(%rsp),%r11
474	adcq	$0,%rdx
475	movq	%rdx,%r10
476
477	mulq	%rbp
478	addq	%rax,%rdi
479	movq	16(%rsi),%rax
480	adcq	$0,%rdx
481	addq	%r11,%rdi
482	leaq	4(%r15),%r15
483	adcq	$0,%rdx
484	movq	%rdi,(%rsp)
485	movq	%rdx,%r13
486	jmp	.Linner4x
487.align	16
488.Linner4x:
/* Four words per iteration; each a[]*b[i] word also absorbs the matching old t word from the stack. */
489	mulq	%rbx
490	addq	%rax,%r10
491	movq	-16(%rcx,%r15,8),%rax
492	adcq	$0,%rdx
493	addq	-16(%rsp,%r15,8),%r10
494	adcq	$0,%rdx
495	movq	%rdx,%r11
496
497	mulq	%rbp
498	addq	%rax,%r13
499	movq	-8(%rsi,%r15,8),%rax
500	adcq	$0,%rdx
501	addq	%r10,%r13
502	adcq	$0,%rdx
503	movq	%r13,-24(%rsp,%r15,8)
504	movq	%rdx,%rdi
505
506	mulq	%rbx
507	addq	%rax,%r11
508	movq	-8(%rcx,%r15,8),%rax
509	adcq	$0,%rdx
510	addq	-8(%rsp,%r15,8),%r11
511	adcq	$0,%rdx
512	movq	%rdx,%r10
513
514	mulq	%rbp
515	addq	%rax,%rdi
516	movq	(%rsi,%r15,8),%rax
517	adcq	$0,%rdx
518	addq	%r11,%rdi
519	adcq	$0,%rdx
520	movq	%rdi,-16(%rsp,%r15,8)
521	movq	%rdx,%r13
522
523	mulq	%rbx
524	addq	%rax,%r10
525	movq	(%rcx,%r15,8),%rax
526	adcq	$0,%rdx
527	addq	(%rsp,%r15,8),%r10
528	adcq	$0,%rdx
529	movq	%rdx,%r11
530
531	mulq	%rbp
532	addq	%rax,%r13
533	movq	8(%rsi,%r15,8),%rax
534	adcq	$0,%rdx
535	addq	%r10,%r13
536	adcq	$0,%rdx
537	movq	%r13,-8(%rsp,%r15,8)
538	movq	%rdx,%rdi
539
540	mulq	%rbx
541	addq	%rax,%r11
542	movq	8(%rcx,%r15,8),%rax
543	adcq	$0,%rdx
544	addq	8(%rsp,%r15,8),%r11
545	adcq	$0,%rdx
546	leaq	4(%r15),%r15
547	movq	%rdx,%r10
548
549	mulq	%rbp
550	addq	%rax,%rdi
551	movq	-16(%rsi,%r15,8),%rax
552	adcq	$0,%rdx
553	addq	%r11,%rdi
554	adcq	$0,%rdx
555	movq	%rdi,-32(%rsp,%r15,8)
556	movq	%rdx,%r13
557	cmpq	%r9,%r15
558	jb	.Linner4x
559
/* Pass tail: last two product pairs; the outer index %r14 is advanced in the middle of it. */
560	mulq	%rbx
561	addq	%rax,%r10
562	movq	-16(%rcx,%r15,8),%rax
563	adcq	$0,%rdx
564	addq	-16(%rsp,%r15,8),%r10
565	adcq	$0,%rdx
566	movq	%rdx,%r11
567
568	mulq	%rbp
569	addq	%rax,%r13
570	movq	-8(%rsi,%r15,8),%rax
571	adcq	$0,%rdx
572	addq	%r10,%r13
573	adcq	$0,%rdx
574	movq	%r13,-24(%rsp,%r15,8)
575	movq	%rdx,%rdi
576
577	mulq	%rbx
578	addq	%rax,%r11
579	movq	-8(%rcx,%r15,8),%rax
580	adcq	$0,%rdx
581	addq	-8(%rsp,%r15,8),%r11
582	adcq	$0,%rdx
583	leaq	1(%r14),%r14
584	movq	%rdx,%r10
585
586	mulq	%rbp
587	addq	%rax,%rdi
588	movq	(%rsi),%rax
589	adcq	$0,%rdx
590	addq	%r11,%rdi
591	adcq	$0,%rdx
592	movq	%rdi,-16(%rsp,%r15,8)
593	movq	%rdx,%r13
594
/* Top word plus the previous carry word t[num]; the new carry goes back to t[num]. */
595	xorq	%rdi,%rdi
596	addq	%r10,%r13
597	adcq	$0,%rdi
598	addq	(%rsp,%r9,8),%r13
599	adcq	$0,%rdi
600	movq	%r13,-8(%rsp,%r15,8)
601	movq	%rdi,(%rsp,%r15,8)
602
603	cmpq	%r9,%r14
604	jb	.Louter4x
/*
 * Final subtraction: destination = t - n.  %rdi is reloaded with the saved
 * destination pointer, %rsi now points at t, and %r15 = (num-4)/4 counts
 * the loop iterations; the last four words are finished after the loop.
 */
605	movq	16(%rsp,%r9,8),%rdi
606	leaq	-4(%r9),%r15
607	movq	0(%rsp),%rax
608	movq	8(%rsp),%rdx
609	shrq	$2,%r15
610	leaq	(%rsp),%rsi
611	xorq	%r14,%r14
612
/* subq starts the borrow chain; everything up to the final sbbq $0 uses only mov/lea/dec between the sbbq steps, which leave CF alone. */
613	subq	0(%rcx),%rax
614	movq	16(%rsi),%rbx
615	movq	24(%rsi),%rbp
616	sbbq	8(%rcx),%rdx
617
618.Lsub4x:
619	movq	%rax,0(%rdi,%r14,8)
620	movq	%rdx,8(%rdi,%r14,8)
621	sbbq	16(%rcx,%r14,8),%rbx
622	movq	32(%rsi,%r14,8),%rax
623	movq	40(%rsi,%r14,8),%rdx
624	sbbq	24(%rcx,%r14,8),%rbp
625	movq	%rbx,16(%rdi,%r14,8)
626	movq	%rbp,24(%rdi,%r14,8)
627	sbbq	32(%rcx,%r14,8),%rax
628	movq	48(%rsi,%r14,8),%rbx
629	movq	56(%rsi,%r14,8),%rbp
630	sbbq	40(%rcx,%r14,8),%rdx
631	leaq	4(%r14),%r14
632	decq	%r15
633	jnz	.Lsub4x
634
635	movq	%rax,0(%rdi,%r14,8)
636	movq	32(%rsi,%r14,8),%rax
637	sbbq	16(%rcx,%r14,8),%rbx
638	movq	%rdx,8(%rdi,%r14,8)
639	sbbq	24(%rcx,%r14,8),%rbp
640	movq	%rbx,16(%rdi,%r14,8)
641
/* %rax = carry word t[num] minus the final borrow: the select mask (presumably 0 or all-ones). */
642	sbbq	$0,%rax
643	movq	%rbp,24(%rdi,%r14,8)
644	pxor	%xmm0,%xmm0
/* The bytes encode "movq %rax,%xmm4"; the mask is then broadcast to every dword of %xmm4 and %xmm5 becomes its complement. */
645.byte	102,72,15,110,224
646	pcmpeqd	%xmm5,%xmm5
647	pshufd	$0,%xmm4,%xmm4
648	movq	%r9,%r15
649	pxor	%xmm4,%xmm5
650	shrq	$2,%r15
651	xorl	%eax,%eax
652
653	jmp	.Lcopy4x
654.align	16
655.Lcopy4x:
/*
 * Branch-free select, 32 bytes per iteration for num/4 iterations:
 * destination = (t & %xmm4) | (destination & %xmm5).  Each t chunk is
 * overwritten with zero (%xmm0) once it has been read.  %rax is a byte
 * offset here.
 */
656	movdqa	(%rsp,%rax,1),%xmm1
657	movdqu	(%rdi,%rax,1),%xmm2
658	pand	%xmm4,%xmm1
659	pand	%xmm5,%xmm2
660	movdqa	16(%rsp,%rax,1),%xmm3
661	movdqa	%xmm0,(%rsp,%rax,1)
662	por	%xmm2,%xmm1
663	movdqu	16(%rdi,%rax,1),%xmm2
664	movdqu	%xmm1,(%rdi,%rax,1)
665	pand	%xmm4,%xmm3
666	pand	%xmm5,%xmm2
667	movdqa	%xmm0,16(%rsp,%rax,1)
668	por	%xmm2,%xmm3
669	movdqu	%xmm3,16(%rdi,%rax,1)
670	leaq	32(%rax),%rax
671	decq	%r15
672	jnz	.Lcopy4x
/* Reload the caller's %rsp, return 1, restore the saved registers from below it. */
673	movq	8(%rsp,%r9,8),%rsi
674.cfi_def_cfa	%rsi, 8
675	movq	$1,%rax
676	movq	-48(%rsi),%r15
677.cfi_restore	%r15
678	movq	-40(%rsi),%r14
679.cfi_restore	%r14
680	movq	-32(%rsi),%r13
681.cfi_restore	%r13
682	movq	-24(%rsi),%r12
683.cfi_restore	%r12
684	movq	-16(%rsi),%rbp
685.cfi_restore	%rbp
686	movq	-8(%rsi),%rbx
687.cfi_restore	%rbx
688	leaq	(%rsi),%rsp
689.cfi_def_cfa_register	%rsp
690.Lmul4x_epilogue:
/* 0xf3,0xc3 encodes "rep ret". */
691	.byte	0xf3,0xc3
692.cfi_endproc
693.size	bn_mul4x_mont,.-bn_mul4x_mont
694
695
696
/*
 * bn_sqr8x_mont: frame setup and post-processing around the external
 * helpers bn_sqr8x_internal / bn_sqrx8x_internal, which are not defined in
 * this file.
 *
 * The symbol is not exported; bn_mul_mont jumps to .Lsqr8x_enter when both
 * source pointers are equal and the word count is a multiple of 8, with
 * %rax = caller's %rsp and %r9 = zero-extended word count.
 * Arguments are as for bn_mul_mont (%rdi destination, %rsi source,
 * %rcx = n, %r8 = pointer to one qword).  %rax is 1 on return.
 */
697.type	bn_sqr8x_mont,@function
698.align	32
699bn_sqr8x_mont:
700.cfi_startproc
701	movq	%rsp,%rax
702.cfi_def_cfa_register	%rax
703.Lsqr8x_enter:
704	pushq	%rbx
705.cfi_offset	%rbx,-16
706	pushq	%rbp
707.cfi_offset	%rbp,-24
708	pushq	%r12
709.cfi_offset	%r12,-32
710	pushq	%r13
711.cfi_offset	%r13,-40
712	pushq	%r14
713.cfi_offset	%r14,-48
714	pushq	%r15
715.cfi_offset	%r15,-56
716.Lsqr8x_prologue:
717
/* %r10 = 32*num, %r9 = -8*num (byte count, negated). */
718	movl	%r9d,%r10d
719	shll	$3,%r9d
720	shlq	$3+2,%r10
721	negq	%r9
722
723
724
725
726
727
/*
 * Candidate frame base: %rsp - 64 - 16*num.  The final base (%rbp) is
 * shifted according to the distance between that candidate and %rsi
 * modulo 4096.
 * NOTE(review): presumably keeps the stack temporary from aliasing the
 * source vector modulo 4096 - confirm against x86_64-mont.pl.
 */
728	leaq	-64(%rsp,%r9,2),%r11
729	movq	%rsp,%rbp
730	movq	(%r8),%r8
731	subq	%rsi,%r11
732	andq	$4095,%r11
733	cmpq	%r11,%r10
734	jb	.Lsqr8x_sp_alt
735	subq	%r11,%rbp
736	leaq	-64(%rbp,%r9,2),%rbp
737	jmp	.Lsqr8x_sp_done
738
739.align	32
740.Lsqr8x_sp_alt:
/* Alternative placement: the extra displacement is clamped to 0 when the subtraction borrows. */
741	leaq	4096-64(,%r9,2),%r10
742	leaq	-64(%rbp,%r9,2),%rbp
743	subq	%r10,%r11
744	movq	$0,%r10
745	cmovcq	%r10,%r11
746	subq	%r11,%rbp
747.Lsqr8x_sp_done:
/* Align the frame base to 64 bytes, then walk %rsp down to it in steps of at most 4096 bytes, reading a word at each step. */
748	andq	$-64,%rbp
749	movq	%rsp,%r11
750	subq	%rbp,%r11
751	andq	$-4096,%r11
752	leaq	(%r11,%rbp,1),%rsp
753	movq	(%rsp),%r10
754	cmpq	%rbp,%rsp
755	ja	.Lsqr8x_page_walk
756	jmp	.Lsqr8x_page_walk_done
757
758.align	16
759.Lsqr8x_page_walk:
760	leaq	-4096(%rsp),%rsp
761	movq	(%rsp),%r10
762	cmpq	%rbp,%rsp
763	ja	.Lsqr8x_page_walk
764.Lsqr8x_page_walk_done:
765
/* %r10 = -8*num, %r9 = 8*num. */
766	movq	%r9,%r10
767	negq	%r9
768
/* Frame slots: 32(%rsp) = qword loaded through %r8, 40(%rsp) = caller's %rsp. */
769	movq	%r8,32(%rsp)
770	movq	%rax,40(%rsp)
/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 40) + 8. */
771.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
772.Lsqr8x_body:
773
/* The .byte sequences encode, in order: movq %rcx,%xmm2; movq %rdi,%xmm1; movq %r10,%xmm3. They park values in SSE registers across the call. */
774.byte	102,72,15,110,209
775	pxor	%xmm0,%xmm0
776.byte	102,72,15,110,207
777.byte	102,73,15,110,218
/* Same capability test as at .Lmul4x_enter: both bits of 0x80100 set selects the "x" helper. */
778	movl	OPENSSL_ia32cap_P+8(%rip),%eax
779	andl	$0x80100,%eax
780	cmpl	$0x80100,%eax
781	jne	.Lsqr8x_nox
782
783	call	bn_sqrx8x_internal
784
785
786
787
/*
 * NOTE(review): the registers read after each call (%r8/%rcx here,
 * %rdi/%r9 in the other branch, plus %rbp and %rax in .Lsqr8x_sub) are
 * whatever the helper leaves behind; its contract is not visible in this
 * file.  The .byte sequence encodes "movq %xmm1,%rdi", restoring the
 * destination pointer parked above.
 */
788	leaq	(%r8,%rcx,1),%rbx
789	movq	%rcx,%r9
790	movq	%rcx,%rdx
791.byte	102,72,15,126,207
792	sarq	$3+2,%rcx
793	jmp	.Lsqr8x_sub
794
795.align	32
796.Lsqr8x_nox:
797	call	bn_sqr8x_internal
798
799
800
801
802	leaq	(%rdi,%r9,1),%rbx
803	movq	%r9,%rcx
804	movq	%r9,%rdx
805.byte	102,72,15,126,207
806	sarq	$3+2,%rcx
807	jmp	.Lsqr8x_sub
808
809.align	32
810.Lsqr8x_sub:
/*
 * Destination = (%rbx vector) - (%rbp vector), four words per iteration.
 * %rcx counts up to zero via incq, which leaves CF alone, so the borrow
 * chains across iterations.  CF on first entry is the last bit shifted
 * out by the sarq above.
 */
811	movq	0(%rbx),%r12
812	movq	8(%rbx),%r13
813	movq	16(%rbx),%r14
814	movq	24(%rbx),%r15
815	leaq	32(%rbx),%rbx
816	sbbq	0(%rbp),%r12
817	sbbq	8(%rbp),%r13
818	sbbq	16(%rbp),%r14
819	sbbq	24(%rbp),%r15
820	leaq	32(%rbp),%rbp
821	movq	%r12,0(%rdi)
822	movq	%r13,8(%rdi)
823	movq	%r14,16(%rdi)
824	movq	%r15,24(%rdi)
825	leaq	32(%rdi),%rdi
826	incq	%rcx
827	jnz	.Lsqr8x_sub
828
/* Fold the final borrow into %rax to form the select mask, then rewind %rbx and %rdi by adding %r9. */
829	sbbq	$0,%rax
830	leaq	(%rbx,%r9,1),%rbx
831	leaq	(%rdi,%r9,1),%rdi
832
/* The bytes encode "movq %rax,%xmm1"; the mask is broadcast to every dword of %xmm1. */
833.byte	102,72,15,110,200
834	pxor	%xmm0,%xmm0
835	pshufd	$0,%xmm1,%xmm1
836	movq	40(%rsp),%rsi
837.cfi_def_cfa	%rsi,8
838	jmp	.Lsqr8x_cond_copy
839
840.align	32
841.Lsqr8x_cond_copy:
/*
 * Branch-free select, 32 bytes per iteration:
 * destination = (temporary & %xmm1) | (destination & %xmm0), where
 * pcmpeqd against the zeroed %xmm0 produces the complement of a
 * 0/all-ones mask.  The temporary chunk just read, and the chunk at the
 * same position offset by %rdx, are overwritten with zero.  %r9 counts up
 * to zero in steps of 32.
 */
842	movdqa	0(%rbx),%xmm2
843	movdqa	16(%rbx),%xmm3
844	leaq	32(%rbx),%rbx
845	movdqu	0(%rdi),%xmm4
846	movdqu	16(%rdi),%xmm5
847	leaq	32(%rdi),%rdi
848	movdqa	%xmm0,-32(%rbx)
849	movdqa	%xmm0,-16(%rbx)
850	movdqa	%xmm0,-32(%rbx,%rdx,1)
851	movdqa	%xmm0,-16(%rbx,%rdx,1)
852	pcmpeqd	%xmm1,%xmm0
853	pand	%xmm1,%xmm2
854	pand	%xmm1,%xmm3
855	pand	%xmm0,%xmm4
856	pand	%xmm0,%xmm5
857	pxor	%xmm0,%xmm0
858	por	%xmm2,%xmm4
859	por	%xmm3,%xmm5
860	movdqu	%xmm4,-32(%rdi)
861	movdqu	%xmm5,-16(%rdi)
862	addq	$32,%r9
863	jnz	.Lsqr8x_cond_copy
864
/* Return 1; %rsi already holds the caller's %rsp (loaded before the copy loop). */
865	movq	$1,%rax
866	movq	-48(%rsi),%r15
867.cfi_restore	%r15
868	movq	-40(%rsi),%r14
869.cfi_restore	%r14
870	movq	-32(%rsi),%r13
871.cfi_restore	%r13
872	movq	-24(%rsi),%r12
873.cfi_restore	%r12
874	movq	-16(%rsi),%rbp
875.cfi_restore	%rbp
876	movq	-8(%rsi),%rbx
877.cfi_restore	%rbx
878	leaq	(%rsi),%rsp
879.cfi_def_cfa_register	%rsp
880.Lsqr8x_epilogue:
/* 0xf3,0xc3 encodes "rep ret". */
881	.byte	0xf3,0xc3
882.cfi_endproc
883.size	bn_sqr8x_mont,.-bn_sqr8x_mont
/*
 * bn_mulx4x_mont: four-words-per-iteration multiply path built on
 * mulx/adcx/adox.  mulx takes %rdx as its implicit multiplicand and does
 * not touch flags; adcx uses only CF and adox only OF, so two carry chains
 * run interleaved.
 *
 * The symbol is not exported; .Lmul4x_enter jumps to .Lmulx4x_enter when
 * both bits of mask 0x80100 are set in the dword at OPENSSL_ia32cap_P+8,
 * with %rax = caller's %rsp and %r9 = zero-extended word count.
 * Arguments are as for bn_mul_mont (%rdi destination, %rsi = a, %rdx = b,
 * %rcx = n, %r8 = pointer to one qword).  %rax is 1 on return.
 *
 * Frame slots filled in below:
 *    0(%rsp)  8*num (byte length of the vectors)
 *    8(%rsp)  pointer to the next word of b
 *   16(%rsp)  b + 8*num (end of b)
 *   24(%rsp)  the qword loaded through %r8
 *   32(%rsp)  destination pointer
 *   40(%rsp)  caller's %rsp
 *   48(%rsp)  num/4 - 1 (inner loop count)
 *   64(%rsp)  start of the temporary vector t
 */
884.type	bn_mulx4x_mont,@function
885.align	32
886bn_mulx4x_mont:
887.cfi_startproc
888	movq	%rsp,%rax
889.cfi_def_cfa_register	%rax
890.Lmulx4x_enter:
891	pushq	%rbx
892.cfi_offset	%rbx,-16
893	pushq	%rbp
894.cfi_offset	%rbp,-24
895	pushq	%r12
896.cfi_offset	%r12,-32
897	pushq	%r13
898.cfi_offset	%r13,-40
899	pushq	%r14
900.cfi_offset	%r14,-48
901	pushq	%r15
902.cfi_offset	%r15,-56
903.Lmulx4x_prologue:
904
/* %r9 = 8*num, %r10 = -8*num; frame base %rbp = (%rsp - 72 - 8*num) rounded down to a 128-byte boundary. */
905	shll	$3,%r9d
906	xorq	%r10,%r10
907	subq	%r9,%r10
908	movq	(%r8),%r8
909	leaq	-72(%rsp,%r10,1),%rbp
910	andq	$-128,%rbp
/* Walk %rsp down to %rbp in steps of at most 4096 bytes, reading a word at each step. */
911	movq	%rsp,%r11
912	subq	%rbp,%r11
913	andq	$-4096,%r11
914	leaq	(%r11,%rbp,1),%rsp
915	movq	(%rsp),%r10
916	cmpq	%rbp,%rsp
917	ja	.Lmulx4x_page_walk
918	jmp	.Lmulx4x_page_walk_done
919
920.align	16
921.Lmulx4x_page_walk:
922	leaq	-4096(%rsp),%rsp
923	movq	(%rsp),%r10
924	cmpq	%rbp,%rsp
925	ja	.Lmulx4x_page_walk
926.Lmulx4x_page_walk_done:
927
/* %r10 = end of b. */
928	leaq	(%rdx,%r9,1),%r10
929
930
931
932
933
934
935
936
937
938
939
940
/* Fill the frame slots listed in the header comment. */
941	movq	%r9,0(%rsp)
942	shrq	$5,%r9
943	movq	%r10,16(%rsp)
944	subq	$1,%r9
945	movq	%r8,24(%rsp)
946	movq	%rdi,32(%rsp)
947	movq	%rax,40(%rsp)
/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 40) + 8. */
948.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
949	movq	%r9,48(%rsp)
950	jmp	.Lmulx4x_body
951
952.align	32
953.Lmulx4x_body:
/* %rdi = pointer to b[1] (saved to 8(%rsp) below), %rdx = %r9 = b[0], %rbx = t + 32. */
954	leaq	8(%rdx),%rdi
955	movq	(%rdx),%rdx
956	leaq	64+32(%rsp),%rbx
957	movq	%rdx,%r9
958
/* a[0..2] * b[0]. */
959	mulxq	0(%rsi),%r8,%rax
960	mulxq	8(%rsi),%r11,%r14
961	addq	%rax,%r11
962	movq	%rdi,8(%rsp)
963	mulxq	16(%rsi),%r12,%r13
964	adcq	%r14,%r12
965	adcq	$0,%r13
966
/* %r8 = m = low64(low word * qword at 24(%rsp)); the xorq zeroes %rbp and clears CF and OF for the adcx/adox chains. */
967	movq	%r8,%rdi
968	imulq	24(%rsp),%r8
969	xorq	%rbp,%rbp
970
/* a[3]*b[0], then %rdx switches to m for the n[] products. */
971	mulxq	24(%rsi),%rax,%r14
972	movq	%r8,%rdx
973	leaq	32(%rsi),%rsi
974	adcxq	%rax,%r13
975	adcxq	%rbp,%r14
976
977	mulxq	0(%rcx),%rax,%r10
978	adcxq	%rax,%rdi
979	adoxq	%r11,%r10
980	mulxq	8(%rcx),%rax,%r11
981	adcxq	%rax,%r10
982	adoxq	%r12,%r11
/* The bytes encode "mulxq 16(%rcx),%rax,%r12" using a 32-bit displacement. */
983.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
/* %rdi becomes the inner loop counter. */
984	movq	48(%rsp),%rdi
985	movq	%r10,-32(%rbx)
986	adcxq	%rax,%r11
987	adoxq	%r13,%r12
988	mulxq	24(%rcx),%rax,%r15
989	movq	%r9,%rdx
990	movq	%r11,-24(%rbx)
991	adcxq	%rax,%r12
992	adoxq	%rbp,%r15
993	leaq	32(%rcx),%rcx
994	movq	%r12,-16(%rbx)
995
996	jmp	.Lmulx4x_1st
997
998.align	32
999.Lmulx4x_1st:
/*
 * First pass, four words per iteration: a[]*b[0] with %rdx = b[0] (kept
 * in %r9), then n[]*m with %rdx = m (kept in %r8).  %rbp stays zero and
 * is used to flush carries.
 */
1000	adcxq	%rbp,%r15
1001	mulxq	0(%rsi),%r10,%rax
1002	adcxq	%r14,%r10
1003	mulxq	8(%rsi),%r11,%r14
1004	adcxq	%rax,%r11
1005	mulxq	16(%rsi),%r12,%rax
1006	adcxq	%r14,%r12
1007	mulxq	24(%rsi),%r13,%r14
/* Two 0x67 (address-size) prefixes ahead of a register-to-register mov; presumably padding - TODO confirm. */
1008.byte	0x67,0x67
1009	movq	%r8,%rdx
1010	adcxq	%rax,%r13
1011	adcxq	%rbp,%r14
1012	leaq	32(%rsi),%rsi
1013	leaq	32(%rbx),%rbx
1014
1015	adoxq	%r15,%r10
1016	mulxq	0(%rcx),%rax,%r15
1017	adcxq	%rax,%r10
1018	adoxq	%r15,%r11
1019	mulxq	8(%rcx),%rax,%r15
1020	adcxq	%rax,%r11
1021	adoxq	%r15,%r12
1022	mulxq	16(%rcx),%rax,%r15
1023	movq	%r10,-40(%rbx)
1024	adcxq	%rax,%r12
1025	movq	%r11,-32(%rbx)
1026	adoxq	%r15,%r13
1027	mulxq	24(%rcx),%rax,%r15
1028	movq	%r9,%rdx
1029	movq	%r12,-24(%rbx)
1030	adcxq	%rax,%r13
1031	adoxq	%rbp,%r15
1032	leaq	32(%rcx),%rcx
1033	movq	%r13,-16(%rbx)
1034
/* decq leaves CF alone, so the adcx chain continues into the next iteration. */
1035	decq	%rdi
1036	jnz	.Lmulx4x_1st
1037
/* %rax = 8*num, %rdi = pointer to the next word of b. Store the top word; %r15 = 0 or -1 records the carry out of it. */
1038	movq	0(%rsp),%rax
1039	movq	8(%rsp),%rdi
1040	adcq	%rbp,%r15
1041	addq	%r15,%r14
1042	sbbq	%r15,%r15
1043	movq	%r14,-8(%rbx)
1044	jmp	.Lmulx4x_outer
1045
1046.align	32
1047.Lmulx4x_outer:
/*
 * One pass per remaining word of b.  %rsi and %rcx are rewound by 8*num,
 * the carry word from the previous pass is stored at (%rbx), and %rbx is
 * reset to t + 32.
 */
1048	movq	(%rdi),%rdx
1049	leaq	8(%rdi),%rdi
1050	subq	%rax,%rsi
1051	movq	%r15,(%rbx)
1052	leaq	64+32(%rsp),%rbx
1053	subq	%rax,%rcx
1054
/* a[0..2]*b[i] plus the old t words; the xorl zeroes %rbp and clears CF and OF. */
1055	mulxq	0(%rsi),%r8,%r11
1056	xorl	%ebp,%ebp
1057	movq	%rdx,%r9
1058	mulxq	8(%rsi),%r14,%r12
1059	adoxq	-32(%rbx),%r8
1060	adcxq	%r14,%r11
1061	mulxq	16(%rsi),%r15,%r13
1062	adoxq	-24(%rbx),%r11
1063	adcxq	%r15,%r12
1064	adoxq	-16(%rbx),%r12
1065	adcxq	%rbp,%r13
1066	adoxq	%rbp,%r13
1067
/* Save the advanced b pointer; %r8 = m for this pass, with the low word kept in %r15. */
1068	movq	%rdi,8(%rsp)
1069	movq	%r8,%r15
1070	imulq	24(%rsp),%r8
1071	xorl	%ebp,%ebp
1072
1073	mulxq	24(%rsi),%rax,%r14
1074	movq	%r8,%rdx
1075	adcxq	%rax,%r13
1076	adoxq	-8(%rbx),%r13
1077	adcxq	%rbp,%r14
1078	leaq	32(%rsi),%rsi
1079	adoxq	%rbp,%r14
1080
1081	mulxq	0(%rcx),%rax,%r10
1082	adcxq	%rax,%r15
1083	adoxq	%r11,%r10
1084	mulxq	8(%rcx),%rax,%r11
1085	adcxq	%rax,%r10
1086	adoxq	%r12,%r11
1087	mulxq	16(%rcx),%rax,%r12
1088	movq	%r10,-32(%rbx)
1089	adcxq	%rax,%r11
1090	adoxq	%r13,%r12
1091	mulxq	24(%rcx),%rax,%r15
1092	movq	%r9,%rdx
1093	movq	%r11,-24(%rbx)
1094	leaq	32(%rcx),%rcx
1095	adcxq	%rax,%r12
1096	adoxq	%rbp,%r15
1097	movq	48(%rsp),%rdi
1098	movq	%r12,-16(%rbx)
1099
1100	jmp	.Lmulx4x_inner
1101
1102.align	32
1103.Lmulx4x_inner:
/* Four words per iteration: a[]*b[i] plus old t words (adcx from 0..24(%rbx)), then n[]*m. */
1104	mulxq	0(%rsi),%r10,%rax
1105	adcxq	%rbp,%r15
1106	adoxq	%r14,%r10
1107	mulxq	8(%rsi),%r11,%r14
1108	adcxq	0(%rbx),%r10
1109	adoxq	%rax,%r11
1110	mulxq	16(%rsi),%r12,%rax
1111	adcxq	8(%rbx),%r11
1112	adoxq	%r14,%r12
1113	mulxq	24(%rsi),%r13,%r14
1114	movq	%r8,%rdx
1115	adcxq	16(%rbx),%r12
1116	adoxq	%rax,%r13
1117	adcxq	24(%rbx),%r13
1118	adoxq	%rbp,%r14
1119	leaq	32(%rsi),%rsi
1120	leaq	32(%rbx),%rbx
1121	adcxq	%rbp,%r14
1122
1123	adoxq	%r15,%r10
1124	mulxq	0(%rcx),%rax,%r15
1125	adcxq	%rax,%r10
1126	adoxq	%r15,%r11
1127	mulxq	8(%rcx),%rax,%r15
1128	adcxq	%rax,%r11
1129	adoxq	%r15,%r12
1130	mulxq	16(%rcx),%rax,%r15
1131	movq	%r10,-40(%rbx)
1132	adcxq	%rax,%r12
1133	adoxq	%r15,%r13
1134	mulxq	24(%rcx),%rax,%r15
1135	movq	%r9,%rdx
1136	movq	%r11,-32(%rbx)
1137	movq	%r12,-24(%rbx)
1138	adcxq	%rax,%r13
1139	adoxq	%rbp,%r15
1140	leaq	32(%rcx),%rcx
1141	movq	%r13,-16(%rbx)
1142
1143	decq	%rdi
1144	jnz	.Lmulx4x_inner
1145
/*
 * Pass tail.  "subq 0(%rbx),%rbp" with %rbp = 0 sets CF when the carry
 * word saved from the previous pass is non-zero, so the following adcq
 * adds it in.  %r15 = 0 or -1 records the new carry.
 */
1146	movq	0(%rsp),%rax
1147	movq	8(%rsp),%rdi
1148	adcq	%rbp,%r15
1149	subq	0(%rbx),%rbp
1150	adcq	%r15,%r14
1151	sbbq	%r15,%r15
1152	movq	%r14,-8(%rbx)
1153
/* Loop until the b pointer reaches the end of b. */
1154	cmpq	16(%rsp),%rdi
1155	jne	.Lmulx4x_outer
1156
/*
 * Set up the final subtraction: %rbx = t, %rcx rewound to n, %r15 = top
 * carry as 0 or 1, %rdx = 8*num, %rax = num/4 iterations, %rdi =
 * destination.
 */
1157	leaq	64(%rsp),%rbx
1158	subq	%rax,%rcx
1159	negq	%r15
1160	movq	%rax,%rdx
1161	shrq	$3+2,%rax
1162	movq	32(%rsp),%rdi
1163	jmp	.Lmulx4x_sub
1164
1165.align	32
1166.Lmulx4x_sub:
/*
 * Destination = t - n, four words per iteration.  CF on first entry is the
 * last bit shifted out by the shrq above; decq leaves CF alone, so the
 * borrow chains across iterations.
 */
1167	movq	0(%rbx),%r11
1168	movq	8(%rbx),%r12
1169	movq	16(%rbx),%r13
1170	movq	24(%rbx),%r14
1171	leaq	32(%rbx),%rbx
1172	sbbq	0(%rcx),%r11
1173	sbbq	8(%rcx),%r12
1174	sbbq	16(%rcx),%r13
1175	sbbq	24(%rcx),%r14
1176	leaq	32(%rcx),%rcx
1177	movq	%r11,0(%rdi)
1178	movq	%r12,8(%rdi)
1179	movq	%r13,16(%rdi)
1180	movq	%r14,24(%rdi)
1181	leaq	32(%rdi),%rdi
1182	decq	%rax
1183	jnz	.Lmulx4x_sub
1184
/* %r15 = top carry minus final borrow: the select mask (presumably 0 or all-ones). Rewind %rbx to t and %rdi to the destination start. */
1185	sbbq	$0,%r15
1186	leaq	64(%rsp),%rbx
1187	subq	%rdx,%rdi
1188
/* The bytes encode "movq %r15,%xmm1"; the mask is broadcast to every dword of %xmm1. */
1189.byte	102,73,15,110,207
1190	pxor	%xmm0,%xmm0
1191	pshufd	$0,%xmm1,%xmm1
1192	movq	40(%rsp),%rsi
1193.cfi_def_cfa	%rsi,8
1194	jmp	.Lmulx4x_cond_copy
1195
1196.align	32
1197.Lmulx4x_cond_copy:
/*
 * Branch-free select, 32 bytes per iteration:
 * destination = (t & %xmm1) | (destination & %xmm0), where pcmpeqd against
 * the zeroed %xmm0 produces the complement of a 0/all-ones mask.  Each t
 * chunk is overwritten with zero after it is read.  %rdx counts the
 * remaining bytes.
 */
1198	movdqa	0(%rbx),%xmm2
1199	movdqa	16(%rbx),%xmm3
1200	leaq	32(%rbx),%rbx
1201	movdqu	0(%rdi),%xmm4
1202	movdqu	16(%rdi),%xmm5
1203	leaq	32(%rdi),%rdi
1204	movdqa	%xmm0,-32(%rbx)
1205	movdqa	%xmm0,-16(%rbx)
1206	pcmpeqd	%xmm1,%xmm0
1207	pand	%xmm1,%xmm2
1208	pand	%xmm1,%xmm3
1209	pand	%xmm0,%xmm4
1210	pand	%xmm0,%xmm5
1211	pxor	%xmm0,%xmm0
1212	por	%xmm2,%xmm4
1213	por	%xmm3,%xmm5
1214	movdqu	%xmm4,-32(%rdi)
1215	movdqu	%xmm5,-16(%rdi)
1216	subq	$32,%rdx
1217	jnz	.Lmulx4x_cond_copy
1218
/* %rdx is zero here; this clears the word just past the copied part of t (where the carry word was kept). */
1219	movq	%rdx,(%rbx)
1220
/* Return 1; %rsi already holds the caller's %rsp. */
1221	movq	$1,%rax
1222	movq	-48(%rsi),%r15
1223.cfi_restore	%r15
1224	movq	-40(%rsi),%r14
1225.cfi_restore	%r14
1226	movq	-32(%rsi),%r13
1227.cfi_restore	%r13
1228	movq	-24(%rsi),%r12
1229.cfi_restore	%r12
1230	movq	-16(%rsi),%rbp
1231.cfi_restore	%rbp
1232	movq	-8(%rsi),%rbx
1233.cfi_restore	%rbx
1234	leaq	(%rsi),%rsp
1235.cfi_def_cfa_register	%rsp
1236.Lmulx4x_epilogue:
/* 0xf3,0xc3 encodes "rep ret". */
1237	.byte	0xf3,0xc3
1238.cfi_endproc
1239.size	bn_mulx4x_mont,.-bn_mulx4x_mont
/*
 * NUL-terminated ASCII identification string embedded in .text:
 * "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
 */
1240.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1241.align	16
1242