1# This file is generated from a similarly-named Perl script in the BoringSSL
2# source tree. Do not edit by hand.
3
/* MemorySanitizer cannot see into hand-written assembly, so when MSan is
   enabled the assembly implementations are compiled out by defining
   OPENSSL_NO_ASM (the C fallbacks are used instead). */
#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

/* x86-64 only.  BORINGSSL_PREFIX optionally renames all exported symbols
   via the generated prefix-symbols header. */
#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

/* Runtime CPU capability vector; word 2 (offset 8) is tested below to
   select the BMI2/ADX code paths. */
.extern	OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P
18
/*
 * Montgomery multiplication, generic (1-limb-at-a-time) path plus the
 * dispatcher for the unrolled variants.
 *
 * Presumed C signature (matches OpenSSL's bn_mul_mont contract — confirm
 * against bn.h):
 *   int bn_mul_mont(BN_ULONG *rp,        rdi  result
 *                   const BN_ULONG *ap,  rsi  operand a
 *                   const BN_ULONG *bp,  rdx  operand b
 *                   const BN_ULONG *np,  rcx  modulus n
 *                   const BN_ULONG *n0,  r8   -> n0 = -n^-1 mod 2^64
 *                   int num);            r9   number of 64-bit limbs
 * Returns 1 in %rax.
 *
 * Register roles in the main loops below:
 *   r12 = bp, rbx = current b[i], rbp = Montgomery factor m,
 *   r14 = outer index i, r15 = inner index j,
 *   (%rsp) = num+2 qwords of scratch tp[], r10/r11/r13 = carry chain.
 */
.globl	bn_mul_mont
.hidden bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
.cfi_startproc
	movl	%r9d,%r9d	/* num is a 32-bit int: zero-extend into %r9 */
	movq	%rsp,%rax	/* keep original %rsp for frame restore / CFI */
.cfi_def_cfa_register	%rax
	/* Dispatch: num divisible by 4 and >= 8 can use the unrolled paths. */
	testl	$3,%r9d
	jnz	.Lmul_enter
	cmpl	$8,%r9d
	jb	.Lmul_enter
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	movl	8(%r11),%r11d	/* capability word for the BMI2/ADX check in mul4x */
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter	/* a != b: plain multiply, 4x unrolled */
	testl	$7,%r9d
	jz	.Lsqr8x_enter	/* squaring with num % 8 == 0: 8x path */
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	/* Reserve num+2 qwords of scratch below %rsp, rounded down to a
	   1KB boundary so the scratch area's alignment is data-independent. */
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	/* Drop %rsp to the scratch base, then touch every 4K page on the
	   way down (stack probing) so guard-page growth works. */




	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11	/* probe the page */
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	/* save original %rsp above tp[num] */
	/* DWARF expression: CFA = *(rsp + 8 + r9*8) + 8 */
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
	/* First outer iteration (i == 0): tp[] = a[]*b[0] + m*n[] */
	movq	%rdx,%r12	/* r12 = bp (rdx is clobbered by mulq) */
	movq	(%r8),%r8	/* r8 = n0 value */
	movq	(%r12),%rbx	/* rbx = b[0] */
	movq	(%rsi),%rax	/* rax = a[0] */

	xorq	%r14,%r14	/* i = 0 */
	xorq	%r15,%r15	/* j = 0 */

	movq	%r8,%rbp
	mulq	%rbx		/* a[0]*b[0] */
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp	/* m = lo(a[0]*b[0]) * n0 mod 2^64 */
	movq	%rdx,%r11

	mulq	%rbp		/* n[0]*m */
	addq	%rax,%r10	/* low limb becomes 0 by choice of m; discard */
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15	/* j = 1 */
	jmp	.L1st_enter

.align	16
.L1st:
	addq	%rax,%r13	/* r13 = n[j]*m + carry */
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13	/* + high half of a[j]*b[0] chain */
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	/* tp[j-1] */
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx		/* a[j]*b[0] */
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15	/* j++ */
	movq	%rdx,%r10

	mulq	%rbp		/* n[j]*m */
	cmpq	%r9,%r15
	jne	.L1st

	/* Loop tail: fold the final carries and store tp[num-1], tp[num]. */
	addq	%rax,%r13
	movq	(%rsi),%rax	/* rax = a[0] for the next outer iteration */
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	/* tp[num-1] */
	movq	%rdx,(%rsp,%r9,8)	/* tp[num] = top carry */

	leaq	1(%r14),%r14	/* i = 1 */
	jmp	.Louter
.align	16
.Louter:
	/* Outer iterations (i >= 1): tp[] = (tp[] + a[]*b[i] + m*n[]) / 2^64 */
	movq	(%r12,%r14,8),%rbx	/* rbx = b[i] */
	xorq	%r15,%r15	/* j = 0 */
	movq	%r8,%rbp
	movq	(%rsp),%r10	/* tp[0] */
	mulq	%rbx		/* a[0]*b[i] */
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp	/* m = lo(tp[0] + a[0]*b[i]) * n0 */
	movq	%rdx,%r11

	mulq	%rbp		/* n[0]*m — zeroes the low limb */
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10	/* tp[1] */
	movq	%rdx,%r13

	leaq	1(%r15),%r15	/* j = 1 */
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13	/* + tp[j] */
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	/* tp[j-1] (shift down one limb) */
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx		/* a[j]*b[i] */
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15	/* j++ */

	mulq	%rbp		/* n[j]*m */
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax	/* reload a[0] for the next iteration */
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10	/* previous top carry tp[num] */
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	/* tp[num-1] */
	movq	%rdx,(%rsp,%r9,8)	/* tp[num] = new top carry */

	leaq	1(%r14),%r14	/* i++ */
	cmpq	%r9,%r14
	jb	.Louter

	/* Final reduction: rp[] = tp[] - n[], recording the borrow. */
	xorq	%r14,%r14
	movq	(%rsp),%rax
	movq	%r9,%r15	/* limb counter */

.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)	/* rp[i] = tp[i] - n[i] - borrow */
	movq	8(%rsp,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	/* rax now holds tp[num] minus the borrow: 0 if tp >= n (keep the
	   subtracted result), all-ones if the subtraction underflowed
	   (keep the original tp).  rbx = ~rax is the complementary mask. */
	sbbq	$0,%rax
	movq	$-1,%rbx
	xorq	%rax,%rbx
	xorq	%r14,%r14
	movq	%r9,%r15

.Lcopy:
	/* Constant-time select between rp[] and tp[]; also scrub tp[]. */
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r9,(%rsp,%r14,8)	/* overwrite the secret scratch */
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	/* Epilogue: recover the original %rsp and pop callee-saved regs. */
	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax		/* return 1 */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3	/* rep ret */
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
/*
 * 4-way unrolled Montgomery multiplication.  Same arguments as
 * bn_mul_mont (rdi=rp, rsi=ap, rdx=bp, rcx=np, r8=&n0, r9=num);
 * requires num % 4 == 0.  Normally entered at .Lmul4x_enter from the
 * bn_mul_mont dispatcher with %r11d = OPENSSL_ia32cap_P[2].
 *
 * %rdi is repurposed as a carry/accumulator register inside the loops,
 * so rp is spilled to 16(%rsp,%r9,8).
 */
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.cfi_startproc
	movl	%r9d,%r9d	/* zero-extend num */
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	/* 0x80100: BMI2 + ADX feature bits (presumably bits 8 and 19 of
	   capability word 2) — both present selects the mulx path. */
	andl	$0x80100,%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx4x_enter
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	/* Reserve num+4 qwords of scratch, 1KB-aligned, then stack-probe
	   down one 4K page at a time (same scheme as bn_mul_mont). */
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11	/* probe */
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	/* save original %rsp */
	/* CFA = *(rsp + 8 + r9*8) + 8 */
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	/* spill rp; %rdi becomes an accumulator */
	movq	%rdx,%r12	/* r12 = bp */
	movq	(%r8),%r8	/* r8 = n0 */
	movq	(%r12),%rbx	/* rbx = b[0] */
	movq	(%rsi),%rax	/* rax = a[0] */

	xorq	%r14,%r14	/* i = 0 */
	xorq	%r15,%r15	/* j = 0 */

	/* First outer iteration, first 2 limbs (loop prologue). */
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp	/* m = lo(a[0]*b[0]) * n0 */
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15	/* j = 4 */
	adcq	$0,%rdx
	movq	%rdi,(%rsp)	/* tp[0] */
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:
	/* 4 limbs per iteration of a[j]*b[0] + m*n[j], carry-chained. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15	/* j += 4 */
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	/* Loop tail: last two limbs plus top carry word. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax	/* rax = a[0] for the next outer iteration */
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	/* tp[num] = top carry */

	leaq	1(%r14),%r14	/* i = 1 */
.align	4
.Louter4x:
	/* Outer iterations: tp[] = (tp[] + a[]*b[i] + m*n[]) / 2^64 */
	movq	(%r12,%r14,8),%rbx	/* rbx = b[i] */
	xorq	%r15,%r15
	movq	(%rsp),%r10	/* tp[0] */
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp	/* m = lo(tp[0] + a[0]*b[i]) * n0 */
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11	/* + tp[1] */
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15	/* j = 4 */
	adcq	$0,%rdx
	movq	%rdi,(%rsp)	/* tp[0] (shifted down) */
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	/* 4 limbs per iteration, folding in the previous tp[] values. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15	/* j += 4 */
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	/* Outer-iteration tail. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14	/* i++ */
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax	/* reload a[0] */
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13	/* + previous top carry */
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	/* new top carry */

	cmpq	%r9,%r14
	jb	.Louter4x
	/* Final reduction: rp[] = tp[] - n[] (4 limbs per iteration). */
	movq	16(%rsp,%r9,8),%rdi	/* restore rp */
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdx
	shrq	$2,%r15	/* r15 = num/4 - 1 iterations */
	leaq	(%rsp),%rsi	/* rsi = tp */
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx

.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	/* Last 4 limbs outside the loop; rax picks up tp[num]. */
	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax	/* tp[num] (top carry word) */
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	sbbq	$0,%rax	/* rax = all-ones if tp < n, else 0 */
	movq	%rbp,24(%rdi,%r14,8)
	/* Build SIMD select masks: xmm4 = broadcast(rax), xmm5 = ~xmm4. */
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,224	/* movq %rax,%xmm4 (byte-encoded) */
	pcmpeqd	%xmm5,%xmm5	/* xmm5 = all-ones */
	pshufd	$0,%xmm4,%xmm4
	movq	%r9,%r15
	pxor	%xmm4,%xmm5
	shrq	$2,%r15	/* num/4 iterations, 4 limbs (32 bytes) each */
	xorl	%eax,%eax	/* byte offset */

	jmp	.Lcopy4x
.align	16
.Lcopy4x:
	/* Constant-time select between tp[] (mask xmm4) and the already
	   stored subtracted rp[] (mask xmm5); scrub tp[] with zeros. */
	movdqa	(%rsp,%rax,1),%xmm1
	movdqu	(%rdi,%rax,1),%xmm2
	pand	%xmm4,%xmm1
	pand	%xmm5,%xmm2
	movdqa	16(%rsp,%rax,1),%xmm3
	movdqa	%xmm0,(%rsp,%rax,1)	/* zero the scratch */
	por	%xmm2,%xmm1
	movdqu	16(%rdi,%rax,1),%xmm2
	movdqu	%xmm1,(%rdi,%rax,1)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax,1)	/* zero the scratch */
	por	%xmm2,%xmm3
	movdqu	%xmm3,16(%rdi,%rax,1)
	leaq	32(%rax),%rax
	decq	%r15
	jnz	.Lcopy4x
	/* Epilogue: restore saved %rsp and callee-saved registers. */
	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi, 8
	movq	$1,%rax	/* return 1 */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3	/* rep ret */
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
708.extern	bn_sqrx8x_internal
709.hidden bn_sqrx8x_internal
710.extern	bn_sqr8x_internal
711.hidden bn_sqr8x_internal
712
/*
 * Montgomery squaring, 8-limb-unrolled wrapper.  Reached from the
 * bn_mul_mont dispatcher (.Lsqr8x_enter) when ap == bp and num % 8 == 0.
 * Arguments as bn_mul_mont: rdi=rp, rsi=ap, rcx=np, r8=&n0, r9=num.
 * The heavy lifting is done by bn_sqr8x_internal / bn_sqrx8x_internal
 * (defined elsewhere); this wrapper allocates scratch, dispatches on
 * BMI2+ADX, then performs the final reduction and constant-time copy.
 */
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
.cfi_startproc
	movq	%rsp,%rax	/* keep original %rsp */
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lsqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d		/* r9 = num in bytes */
	shlq	$3+2,%r10	/* r10 = num*32 = full scratch size in bytes */
	negq	%r9

	/* Choose the scratch base (rbp).  Prefer an offset that keeps the
	   scratch away from ap's 4K-page offset (presumably to avoid
	   cache/page aliasing penalties — confirm against the generating
	   perl script); fall back to .Lsqr8x_sp_alt when that placement
	   would not leave enough room. */
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8	/* r8 = n0 */
	subq	%rsi,%r11
	andq	$4095,%r11	/* distance to ap modulo page size */
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp	/* 2*num qwords + 64 bytes */
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lsqr8x_sp_done:
	andq	$-64,%rbp	/* 64-byte align the scratch */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10	/* stack probe */
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	movq	%r9,%r10
	negq	%r9		/* r9 = +num*8 again */

	movq	%r8,32(%rsp)	/* save n0 */
	movq	%rax,40(%rsp)	/* save original %rsp */
	/* CFA = *(rsp + 0x28) + 8 */
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lsqr8x_body:

	/* Stash pointers in xmm regs across the internal call:
	   xmm2 = np, xmm1 = rp, xmm3 = -num*8 (byte-encoded movq). */
.byte	102,72,15,110,209	/* movq %rcx,%xmm2 */
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207	/* movq %rdi,%xmm1 */
.byte	102,73,15,110,218	/* movq %r10,%xmm3 */
	leaq	OPENSSL_ia32cap_P(%rip),%rax
	movl	8(%rax),%eax
	andl	$0x80100,%eax	/* BMI2 + ADX feature bits */
	cmpl	$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	/* mulx/adcx/adox variant */

	/* Returns with rbx-relative tail set up via r8/rcx (internal
	   contract of bn_sqrx8x_internal). */
	leaq	(%r8,%rcx,1),%rbx
	movq	%rcx,%r9
	movq	%rcx,%rdx
.byte	102,72,15,126,207	/* movq %xmm1,%rdi — restore rp */
	sarq	$3+2,%rcx	/* negative count of 32-byte groups */
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
	call	bn_sqr8x_internal	/* classic mulq variant */

	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207	/* movq %xmm1,%rdi — restore rp */
	sarq	$3+2,%rcx	/* negative count of 32-byte groups */
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	/* rp[] = upper-half(tp[]) - n[], 4 limbs per iteration. */
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx		/* count up toward zero */
	jnz	.Lsqr8x_sub

	sbbq	$0,%rax		/* rax -> select mask (borrow-dependent) */
	leaq	(%rbx,%r9,1),%rbx	/* rewind tp pointer */
	leaq	(%rdi,%r9,1),%rdi	/* rewind rp pointer */

.byte	102,72,15,110,200	/* movq %rax,%xmm1 */
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1	/* broadcast the mask */
	movq	40(%rsp),%rsi	/* original %rsp (frame handle) */
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:
	/* Constant-time select between tp[] and rp[], scrubbing both
	   halves of the scratch with zeros as we go. */
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)	/* zero scratch */
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0	/* xmm0 = complement mask (xmm1 == 0) */
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9		/* r9 counts up from -num*8 to 0 */
	jnz	.Lsqr8x_cond_copy

	/* Epilogue. */
	movq	$1,%rax		/* return 1 */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	.byte	0xf3,0xc3	/* rep ret */
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
/*
 * 4-way unrolled Montgomery multiplication using BMI2 (mulx) and ADX
 * (adcx/adox).  mulx leaves flags untouched, and adcx/adox carry through
 * CF and OF respectively, giving two independent carry chains per pass.
 * Reached from bn_mul4x_mont when both feature bits are present.
 * Arguments as bn_mul_mont: rdi=rp, rsi=ap, rdx=bp, rcx=np, r8=&n0, r9=num.
 *
 * Stack frame (after setup):
 *   0(%rsp)=num*8  8(%rsp)=bp iterator  16(%rsp)=bp end  24(%rsp)=n0
 *   32(%rsp)=rp    40(%rsp)=saved rsp   48(%rsp)=inner-loop count
 *   64(%rsp)...    = tp[] scratch
 */
.type	bn_mulx4x_mont,@function
.align	32
bn_mulx4x_mont:
.cfi_startproc
	movq	%rsp,%rax	/* keep original %rsp */
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d		/* r9 = num in bytes */
	xorq	%r10,%r10
	subq	%r9,%r10	/* r10 = -num*8 */
	movq	(%r8),%r8	/* r8 = n0 */
	leaq	-72(%rsp,%r10,1),%rbp	/* frame + scratch */
	andq	$-128,%rbp	/* 128-byte align */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10	/* stack probe */
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	leaq	(%rdx,%r9,1),%r10	/* r10 = &bp[num] (end sentinel) */

	/* Lay out the frame described in the header comment. */
	movq	%r9,0(%rsp)
	shrq	$5,%r9
	movq	%r10,16(%rsp)
	subq	$1,%r9		/* inner-loop trip count = num/4 - 1 */
	movq	%r8,24(%rsp)
	movq	%rdi,32(%rsp)
	movq	%rax,40(%rsp)	/* original %rsp */
	/* CFA = *(rsp + 0x28) + 8 */
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
	movq	%r9,48(%rsp)
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
	/* First outer iteration: tp[] = a[]*b[0] + m*n[].  rdx is the
	   implicit mulx multiplicand: b[0] first, then m; r9 shadows b[0]. */
	leaq	8(%rdx),%rdi	/* rdi = &bp[1] (iterator) */
	movq	(%rdx),%rdx	/* rdx = b[0] */
	leaq	64+32(%rsp),%rbx	/* rbx = tp write pointer */
	movq	%rdx,%r9

	mulxq	0(%rsi),%r8,%rax	/* a[0]*b[0] */
	mulxq	8(%rsi),%r11,%r14	/* a[1]*b[0] */
	addq	%rax,%r11
	movq	%rdi,8(%rsp)	/* save bp iterator */
	mulxq	16(%rsi),%r12,%r13	/* a[2]*b[0] */
	adcq	%r14,%r12
	adcq	$0,%r13

	movq	%r8,%rdi	/* rdi = low limb accumulator */
	imulq	24(%rsp),%r8	/* m = lo * n0 */
	xorq	%rbp,%rbp	/* rbp = constant 0 for carry flushing */

	mulxq	24(%rsi),%rax,%r14	/* a[3]*b[0] */
	movq	%r8,%rdx	/* switch multiplicand to m */
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14	/* flush CF */

	mulxq	0(%rcx),%rax,%r10	/* n[0]*m */
	adcxq	%rax,%rdi	/* low limb becomes 0 by choice of m */
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11	/* n[1]*m */
	adcxq	%rax,%r10
	adoxq	%r12,%r11
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	/* mulxq 16(%rcx),%rax,%r12 (byte-encoded) */
	movq	48(%rsp),%rdi	/* rdi = inner-loop counter */
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15	/* n[3]*m */
	movq	%r9,%rdx	/* back to multiplying by b[0] */
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15	/* flush OF */
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	/* 4 limbs per iteration: CF chain accumulates products, OF chain
	   folds in n[]*m; both flushed into r14/r15 at the boundaries. */
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67	/* two addr-size prefixes — presumably alignment padding */
	movq	%r8,%rdx	/* multiplicand = m */
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx	/* multiplicand = b[0] */
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi		/* inner-loop counter */
	jnz	.Lmulx4x_1st

	/* First-pass tail: fold final carries, record borrow mask in r15. */
	movq	0(%rsp),%rax	/* num*8 */
	movq	8(%rsp),%rdi	/* bp iterator */
	adcq	%rbp,%r15
	addq	%r15,%r14
	sbbq	%r15,%r15	/* r15 = -(carry out) */
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	/* Outer iterations: fold previous tp[] into a[]*b[i] + m*n[]. */
	movq	(%rdi),%rdx	/* rdx = b[i] */
	leaq	8(%rdi),%rdi
	subq	%rax,%rsi	/* rewind ap */
	movq	%r15,(%rbx)	/* store previous top word */
	leaq	64+32(%rsp),%rbx
	subq	%rax,%rcx	/* rewind np */

	mulxq	0(%rsi),%r8,%r11
	xorl	%ebp,%ebp	/* clears both CF and OF chains */
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8	/* + tp[0] */
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	-16(%rbx),%r12
	adcxq	%rbp,%r13
	adoxq	%rbp,%r13

	movq	%rdi,8(%rsp)	/* save bp iterator */
	movq	%r8,%r15
	imulq	24(%rsp),%r8	/* m = lo * n0 */
	xorl	%ebp,%ebp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx	/* multiplicand = m */
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15	/* low limb zeroed by m */
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx	/* multiplicand = b[i] */
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	movq	48(%rsp),%rdi	/* inner-loop counter */
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	/* As .Lmulx4x_1st but additionally adds the previous tp[] words. */
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx	/* multiplicand = m */
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx	/* multiplicand = b[i] */
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	/* Outer tail: fold carries and the stored top word. */
	movq	0(%rsp),%rax	/* num*8 */
	movq	8(%rsp),%rdi	/* bp iterator */
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp	/* sets CF from the stored top word */
	adcq	%r15,%r14
	sbbq	%r15,%r15	/* r15 = -(carry out) */
	movq	%r14,-8(%rbx)

	cmpq	16(%rsp),%rdi	/* reached &bp[num]? */
	jne	.Lmulx4x_outer

	/* Final reduction: rp[] = tp[] - n[]. */
	leaq	64(%rsp),%rbx	/* rbx = tp */
	subq	%rax,%rcx	/* rewind np */
	negq	%r15		/* r15 = top carry (0 or 1) */
	movq	%rax,%rdx	/* rdx = num*8 */
	shrq	$3+2,%rax	/* num/4 iterations */
	movq	32(%rsp),%rdi	/* restore rp */
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	.Lmulx4x_sub

	sbbq	$0,%r15		/* r15 -> all-ones if tp < n, else 0 */
	leaq	64(%rsp),%rbx	/* rewind tp */
	subq	%rdx,%rdi	/* rewind rp */

.byte	102,73,15,110,207	/* movq %r15,%xmm1 (byte-encoded) */
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1	/* broadcast the select mask */
	movq	40(%rsp),%rsi	/* original %rsp */
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:
	/* Constant-time select between tp[] and rp[], scrubbing tp[]. */
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)	/* zero the scratch */
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0	/* complement mask (xmm1 == 0) */
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx	/* rdx counts down from num*8 to 0 */
	jnz	.Lmulx4x_cond_copy

	movq	%rdx,(%rbx)	/* rdx == 0 here: scrub the last scratch word */

	/* Epilogue. */
	movq	$1,%rax		/* return 1 */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3	/* rep ret */
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
/* ASCII: "Montgomery Multiplication for x86_64, CRYPTOGAMS by
   <appro@openssl.org>", NUL-terminated. */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16
1259#endif
1260.section	.note.GNU-stack,"",@progbits
1261