# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text

.extern	GFp_ia32cap_P
.hidden GFp_ia32cap_P

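# GFp_bn_mul_mont computes rp = ap*bp*2^(-64*num) mod np (Montgomery
# multiplication) and returns 1.  The register assignment follows the
# System V AMD64 argument order; parameter names are those used by the
# upstream x86_64-mont.pl script:
#   %rdi  rp   result
#   %rsi  ap   first operand
#   %rdx  bp   second operand (bp == ap selects the squaring path)
#   %rcx  np   modulus
#   %r8   n0   pointer to -np^-1 mod 2^64 (only the first word is used)
#   %r9d  num  number of 64-bit limbs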
.globl	GFp_bn_mul_mont
.hidden GFp_bn_mul_mont
.type	GFp_bn_mul_mont,@function
.align	16
GFp_bn_mul_mont:
.cfi_startproc
	movl	%r9d,%r9d
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	testl	$3,%r9d
	jnz	.Lmul_enter
	cmpl	$8,%r9d
	jb	.Lmul_enter
	movl	GFp_ia32cap_P+8(%rip),%r11d
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter
	testl	$7,%r9d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter
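# Dispatch: limb counts that are not a multiple of 4, or below 8, are
# handled by the generic one-word loop at .Lmul_enter; squaring (bp == ap)
# with a multiple of 8 limbs takes the sqr8x path; everything else takes
# the 4x path, which switches to the MULX/ADX variant when the capability
# word loaded above has bits 8 and 19 set (the BMI2 and ADX feature bits).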

.align	16
.Lmul_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10









	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

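# Touch the newly reserved stack area one 4096-byte page at a time so the
# kernel's guard page is never skipped over.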
.align	16
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
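# Word-by-word Montgomery multiplication: %rbx holds the current word of
# bp, %rbp the Montgomery reduction factor for the current iteration (the
# low word of the intermediate result times n0), %r15 the inner index,
# %r14 the outer index, %r9 the limb count; the running result t[] lives
# at (%rsp).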
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
.Louter:
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

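# Final reduction: subtract the modulus from t[], then keep either the
# difference (already written to rp) or t[] itself depending on the final
# borrow.  The choice is made with masks rather than branches so it is
# constant-time, and the stack temporary is overwritten as it is copied
# out.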
	xorq	%r14,%r14
	movq	(%rsp),%rax
	movq	%r9,%r15

.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsp,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax
	movq	$-1,%rbx
	xorq	%rax,%rbx
	xorq	%r14,%r14
	movq	%r9,%r15

.Lcopy:
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r9,(%rsp,%r14,8)
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	GFp_bn_mul_mont,.-GFp_bn_mul_mont
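# bn_mul4x_mont: the same calculation, unrolled to process four limbs per
# inner-loop iteration.  It is reached from GFp_bn_mul_mont via
# .Lmul4x_enter, and from there branches to the MULX/ADX routine at
# .Lmulx4x_enter when the capability word shows both BMI2 and ADX.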
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.cfi_startproc
	movl	%r9d,%r9d
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	andl	$0x80100,%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx4x_enter
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	leaq	1(%r14),%r14
.align	4
.Louter4x:
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	(%rsp),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	.Louter4x
	movq	16(%rsp,%r9,8),%rdi
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdx
	shrq	$2,%r15
	leaq	(%rsp),%rsi
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx

.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

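# The final borrow is broadcast into %xmm4 (with its complement in %xmm5)
# and .Lcopy4x uses the pair as masks to select, in constant time, between
# the subtracted result already in rp and the unreduced value still on the
# stack; the stack temporary is cleared with %xmm0 as it goes.  The .byte
# sequence below encodes movq %rax,%xmm4.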
	sbbq	$0,%rax
	movq	%rbp,24(%rdi,%r14,8)
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,224
	pcmpeqd	%xmm5,%xmm5
	pshufd	$0,%xmm4,%xmm4
	movq	%r9,%r15
	pxor	%xmm4,%xmm5
	shrq	$2,%r15
	xorl	%eax,%eax

	jmp	.Lcopy4x
.align	16
.Lcopy4x:
	movdqa	(%rsp,%rax,1),%xmm1
	movdqu	(%rdi,%rax,1),%xmm2
	pand	%xmm4,%xmm1
	pand	%xmm5,%xmm2
	movdqa	16(%rsp,%rax,1),%xmm3
	movdqa	%xmm0,(%rsp,%rax,1)
	por	%xmm2,%xmm1
	movdqu	16(%rdi,%rax,1),%xmm2
	movdqu	%xmm1,(%rdi,%rax,1)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax,1)
	por	%xmm2,%xmm3
	movdqu	%xmm3,16(%rdi,%rax,1)
	leaq	32(%rax),%rax
	decq	%r15
	jnz	.Lcopy4x
	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi, 8
	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
.extern	GFp_bn_sqrx8x_internal
.hidden GFp_bn_sqrx8x_internal
.extern	GFp_bn_sqr8x_internal
.hidden GFp_bn_sqr8x_internal

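# bn_sqr8x_mont: squaring wrapper for operands that are a multiple of 8
# limbs.  It sets up the frame, calls GFp_bn_sqrx8x_internal when BMI2 and
# ADX are available (GFp_bn_sqr8x_internal otherwise), then subtracts the
# modulus and does a masked, constant-time copy of the reduced result back
# into rp.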
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lsqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d
	shlq	$3+2,%r10
	negq	%r9






	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lsqr8x_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lsqr8x_body:

.byte	102,72,15,110,209
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207
.byte	102,73,15,110,218
	movl	GFp_ia32cap_P+8(%rip),%eax
	andl	$0x80100,%eax
	cmpl	$0x80100,%eax
	jne	.Lsqr8x_nox

	call	GFp_bn_sqrx8x_internal




	leaq	(%r8,%rcx,1),%rbx
	movq	%rcx,%r9
	movq	%rcx,%rdx
.byte	102,72,15,126,207
	sarq	$3+2,%rcx
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
	call	GFp_bn_sqr8x_internal




	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207
	sarq	$3+2,%rcx
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	.Lsqr8x_sub

	sbbq	$0,%rax
	leaq	(%rbx,%r9,1),%rbx
	leaq	(%rdi,%r9,1),%rdi

.byte	102,72,15,110,200
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1
	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	.Lsqr8x_cond_copy

	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
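# bn_mulx4x_mont: 4x Montgomery multiplication built on the BMI2 MULX and
# ADX ADCX/ADOX instructions, which allow two independent carry chains.
# Only reached when the capability check found both features.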
.type	bn_mulx4x_mont,@function
.align	32
bn_mulx4x_mont:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d
	xorq	%r10,%r10
	subq	%r9,%r10
	movq	(%r8),%r8
	leaq	-72(%rsp,%r10,1),%rbp
	andq	$-128,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	leaq	(%rdx,%r9,1),%r10












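# Local frame used by the MULX path (offsets from %rsp):
#   +0   num*8, the byte length of the operands
#   +8   current position in bp
#   +16  end of bp
#   +24  n0
#   +32  rp
#   +40  saved %rsp for the epilogue
#   +48  reload value for the inner-loop counter (num/4 - 1)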
	movq	%r9,0(%rsp)
	shrq	$5,%r9
	movq	%r10,16(%rsp)
	subq	$1,%r9
	movq	%r8,24(%rsp)
	movq	%rdi,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
	movq	%r9,48(%rsp)
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
	leaq	8(%rdx),%rdi
	movq	(%rdx),%rdx
	leaq	64+32(%rsp),%rbx
	movq	%rdx,%r9

	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r14
	addq	%rax,%r11
	movq	%rdi,8(%rsp)
	mulxq	16(%rsi),%r12,%r13
	adcq	%r14,%r12
	adcq	$0,%r13

	movq	%r8,%rdi
	imulq	24(%rsp),%r8
	xorq	%rbp,%rbp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%rdi
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
	movq	48(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	0(%rsp),%rax
	movq	8(%rsp),%rdi
	adcq	%rbp,%r15
	addq	%r15,%r14
	sbbq	%r15,%r15
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	movq	(%rdi),%rdx
	leaq	8(%rdi),%rdi
	subq	%rax,%rsi
	movq	%r15,(%rbx)
	leaq	64+32(%rsp),%rbx
	subq	%rax,%rcx

	mulxq	0(%rsi),%r8,%r11
	xorl	%ebp,%ebp
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	-16(%rbx),%r12
	adcxq	%rbp,%r13
	adoxq	%rbp,%r13

	movq	%rdi,8(%rsp)
	movq	%r8,%r15
	imulq	24(%rsp),%r8
	xorl	%ebp,%ebp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	movq	48(%rsp),%rdi
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0(%rsp),%rax
	movq	8(%rsp),%rdi
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp
	adcq	%r15,%r14
	sbbq	%r15,%r15
	movq	%r14,-8(%rbx)

	cmpq	16(%rsp),%rdi
	jne	.Lmulx4x_outer

	leaq	64(%rsp),%rbx
	subq	%rax,%rcx
	negq	%r15
	movq	%rax,%rdx
	shrq	$3+2,%rax
	movq	32(%rsp),%rdi
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	.Lmulx4x_sub

	sbbq	$0,%r15
	leaq	64(%rsp),%rbx
	subq	%rdx,%rdi

.byte	102,73,15,110,207
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1
	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx
	jnz	.Lmulx4x_cond_copy

	movq	%rdx,(%rbx)

	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16
#endif
.section	.note.GNU-stack,"",@progbits
