/*
 * x86-64 assembly, AT&T (GAS) syntax, run through the C preprocessor.
 * NOTE(review): the contents of x86_arch.h are not visible from this file;
 * nothing below references a macro from it directly - confirm it is needed.
 */
#include "x86_arch.h"
/* All routines below are placed in the code section. */
.text
3
4
/*
 * MULADD_128x512 - file-local helper with a private register convention.
 *
 * Adds (128-bit multiplier) x (512-bit operand) to a 512-bit accumulator
 * that lives in registers.
 *
 * In:
 *   %rsi      -> 512-bit operand: eight 64-bit words, least significant first
 *   %rbp       = low multiplier word (must already be loaded by the caller)
 *   8(%rdi)    = high multiplier word (loaded by this routine)
 *   %r8..%r15  = accumulator words 0..7, %r8 least significant
 *   %rcx      -> two-word output area
 * Out:
 *   0(%rcx), 8(%rcx)      = the two lowest words of the 640-bit result
 *   %r10..%r15, %r8, %r9  = the upper eight words, least significant first
 *   %rbp                  = 8(%rdi)
 * Clobbers: %rax, %rbx, %rdx, flags.
 * Leaves %rsi, %rdi, %rcx and %rsp untouched.
 */
.p2align	4
MULADD_128x512:
	/* Pass 1: accumulator += operand * %rbp.  Word 0 is final after the */
	/* first step and is written out; %rbx carries the high half forward. */
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	movq	%r8,0(%rcx)
	movq	%rdx,%rbx

	/* Each later step adds the low product half, then the carry word   */
	/* from the previous step, folding both carries into %rdx.          */
	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	/* %r8 was written out above, so it is reused for the new top word. */
	movq	%rdx,%r8

	/* Pass 2: same again with the high multiplier word, one word higher. */
	/* The accumulator is now %r9..%r15,%r8; %r9 is written out first.    */
	movq	8(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	movq	%r9,8(%rcx)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	/* Top word of the result. */
	movq	%rdx,%r9
	retq
133
134
/*
 * mont_reduce - file-local helper; reduces a 1024-bit value to 512 bits.
 *
 * Reached by call from _mod_exp_512 and by jmp from mont_mul_a3b and
 * sqr_reduce (which were themselves called), so every %rsp offset used here
 * includes the 8-byte return address: offset K here is K-8 in the frame of
 * _mod_exp_512.
 *
 * In (memory):
 *   520..647(%rsp)  the 16-word value to reduce, least significant word first
 *   32(%rsp)        pointer to a constant data block ("base") laid out as:
 *                     base+0   .. +511  8x8 table of 64-bit words; entry i,
 *                                       word j is at base + 64*j + 8*i
 *                     base+512 .. +575  8-word constant (the value that is
 *                                       conditionally subtracted at the end)
 *                     base+576 .. +639  8-word constant used in stage 1
 *                     base+640 .. +703  8-word constant used in stage 2
 *                     base+704,  +712   2-word constant used in stage 3
 *                   NOTE(review): presumably base+512 is the modulus m and the
 *                   other entries are precomputed powers of 2 mod m and the
 *                   Montgomery constant - confirm against the C-side struct.
 *   144(%rsp)       pointer to the 8-word output buffer
 * Scratch (memory): 192..280(%rsp), 296..368(%rsp), 384(%rsp)
 * Out:
 *   %r10..%r15, %r8, %r9 = result words 0..7, also stored to the output buffer
 * Clobbers: %rax, %rbx, %rcx, %rdx, %rsi, %rdi, %rbp, flags.
 */
.p2align	4
mont_reduce:
	leaq	192(%rsp),%rdi
	movq	32(%rsp),%rsi
	addq	$576,%rsi
	leaq	520(%rsp),%rcx

	/* Stage 1: input words 0..7 + input words 12..15 * [base+576].   */
	/* Pass for input word 12; accumulator words are loaded from the   */
	/* input as they are first needed.                                 */
	movq	96(%rcx),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	movq	(%rcx),%r8
	addq	%rax,%r8
	adcq	$0,%rdx
	movq	%r8,0(%rdi)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	movq	8(%rcx),%r9
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	movq	16(%rcx),%r10
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	movq	24(%rcx),%r11
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	movq	32(%rcx),%r12
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	movq	40(%rcx),%r13
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	movq	48(%rcx),%r14
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	movq	56(%rcx),%r15
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%r8

	/* Pass for input word 13; result word 1 goes to scratch. */
	movq	104(%rcx),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	movq	%r9,8(%rdi)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%r9

	/* Pass for input word 14; result word 2 goes to scratch. */
	movq	112(%rcx),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	movq	%r10,16(%rdi)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%r10

	/* Pass for input word 15; result word 3 goes to scratch. */
	movq	120(%rcx),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	movq	%r11,24(%rdi)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	/* Add input words 8..11 to the four top words; %rax = carry out. */
	xorq	%rax,%rax

	addq	64(%rcx),%r8
	adcq	72(%rcx),%r9
	adcq	80(%rcx),%r10
	adcq	88(%rcx),%r11
	adcq	$0,%rax

	/* Words 8,9 are parked in scratch.  Words 10,11 become the 128-bit */
	/* multiplier of stage 2: word 10 in %rbp, word 11 at 88(%rdi),     */
	/* which is 8(%rdi) once %rdi has been advanced by 80 below.        */
	movq	%r8,64(%rdi)
	movq	%r9,72(%rdi)
	movq	%r10,%rbp
	movq	%r11,88(%rdi)

	/* Save the carry; it becomes part of the table index later. */
	movq	%rax,384(%rsp)

	/* Reload result words 0..3 so that %r8..%r15 hold words 0..7. */
	movq	0(%rdi),%r8
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11

	/* Stage 2: accumulator += (words 10,11) * [base+640]. */
	addq	$80,%rdi

	addq	$64,%rsi
	leaq	296(%rsp),%rcx

	call	MULADD_128x512

	movq	384(%rsp),%rax

	/* Add the parked words 8,9 to the two top words and keep them in */
	/* memory; the two lowest words are at 0(%rcx), 8(%rcx).          */
	addq	-16(%rdi),%r8
	adcq	-8(%rdi),%r9
	movq	%r8,64(%rcx)
	movq	%r9,72(%rcx)

	/* carries = 2*carries + CF */
	adcq	%rax,%rax
	movq	%rax,384(%rsp)

	leaq	192(%rsp),%rdi
	addq	$64,%rsi

	/* Stage 3 multiplier: low 128 bits of (two lowest words) * the    */
	/* two-word constant at base+704.  Low word -> %rbp, high word ->  */
	/* 8(%rdi), as MULADD_128x512 expects.                             */
	movq	(%rsi),%r8
	movq	8(%rsi),%rbx

	movq	(%rcx),%rax
	mulq	%r8
	movq	%rax,%rbp
	movq	%rdx,%r9

	movq	8(%rcx),%rax
	mulq	%r8
	addq	%rax,%r9

	movq	(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r9

	movq	%r9,8(%rdi)

	/* %rsi = base+512 */
	subq	$192,%rsi

	/* Accumulator = two lowest words (from memory) followed by */
	/* %r10..%r15 left over from stage 2.                       */
	movq	(%rcx),%r8
	movq	8(%rcx),%r9

	call	MULADD_128x512

	/* Low four words of [base+512] for the conditional subtraction. */
	movq	0(%rsi),%rax
	movq	8(%rsi),%rbx
	movq	16(%rsi),%rdi
	movq	24(%rsi),%rdx

	movq	384(%rsp),%rbp

	/* Add the two top words saved after stage 2. */
	addq	64(%rcx),%r8
	adcq	72(%rcx),%r9

	/* index = 2*carries + CF, in the range 0..7 */
	adcq	%rbp,%rbp

	/* %rbp = base + 8*index: start of the selected table entry. */
	shlq	$3,%rbp
	movq	32(%rsp),%rcx
	addq	%rcx,%rbp

	xorq	%rsi,%rsi

	/* Add the selected table entry; its words are 64 bytes apart. */
	addq	0(%rbp),%r10
	adcq	64(%rbp),%r11
	adcq	128(%rbp),%r12
	adcq	192(%rbp),%r13
	adcq	256(%rbp),%r14
	adcq	320(%rbp),%r15
	adcq	384(%rbp),%r8
	adcq	448(%rbp),%r9

	/* %rsi = all-ones if that addition carried out, else zero. */
	sbbq	$0,%rsi

	/* Branch-free conditional subtraction of [base+512]: the mask */
	/* turns the subtrahend into zero when there was no carry.     */
	andq	%rsi,%rax
	andq	%rsi,%rbx
	andq	%rsi,%rdi
	andq	%rsi,%rdx

	movq	$1,%rbp
	subq	%rax,%r10
	sbbq	%rbx,%r11
	sbbq	%rdi,%r12
	sbbq	%rdx,%r13

	/* Park the borrow: %rbp = 1 - borrow.  The and instructions */
	/* below would otherwise destroy CF.                         */
	sbbq	$0,%rbp

	/* %rcx = base+512; load its high four words. */
	addq	$512,%rcx
	movq	32(%rcx),%rax
	movq	40(%rcx),%rbx
	movq	48(%rcx),%rdi
	movq	56(%rcx),%rdx

	andq	%rsi,%rax
	andq	%rsi,%rbx
	andq	%rsi,%rdi
	andq	%rsi,%rdx

	/* Recreate the borrow in CF: 0 - 1 borrows exactly when the */
	/* parked value is 0.                                        */
	subq	$1,%rbp

	sbbq	%rax,%r14
	sbbq	%rbx,%r15
	sbbq	%rdi,%r8
	sbbq	%rdx,%r9

	/* Store the eight result words through the output pointer. */
	movq	144(%rsp),%rsi
	movq	%r10,0(%rsi)
	movq	%r11,8(%rsi)
	movq	%r12,16(%rsi)
	movq	%r13,24(%rsi)
	movq	%r14,32(%rsi)
	movq	%r15,40(%rsi)
	movq	%r8,48(%rsi)
	movq	%r9,56(%rsi)

	retq
576
577
/*
 * mont_mul_a3b - file-local helper: 512x512 -> 1024-bit multiply, followed
 * by a tail jump into mont_reduce.
 *
 * %rsp offsets include the 8-byte return address of the call that entered
 * this routine (same convention as mont_reduce, which it jumps to).
 *
 * In:
 *   %rdi                  -> multiplier, eight 64-bit words, low word first
 *   %rsi                  -> multiplicand, eight 64-bit words, low word first
 *   %r10..%r15, %r8, %r9   = multiplicand words 0..7 again, in registers
 *                            (used for the first pass only; later passes
 *                            read through %rsi)
 * Out:
 *   520..647(%rsp)         = the 16-word product, low word first
 *   then everything mont_reduce produces; control returns to the caller of
 *   this routine from the retq in mont_reduce.
 * Clobbers: %rax, %rbx, %rdx, %rbp, %r8..%r15, flags, plus whatever
 * mont_reduce clobbers.
 */
.p2align	4
mont_mul_a3b:
	/* Pass 0: multiplicand (in registers) * multiplier word 0. */
	movq	0(%rdi),%rbp

	movq	%r10,%rax
	mulq	%rbp
	movq	%rax,520(%rsp)
	movq	%rdx,%r10
	movq	%r11,%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	movq	%rdx,%r11
	movq	%r12,%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	movq	%rdx,%r12
	movq	%r13,%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	movq	%rdx,%r13
	movq	%r14,%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	movq	%rdx,%r14
	movq	%r15,%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	movq	%rdx,%r15
	movq	%r8,%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	movq	%rdx,%r8
	movq	%r9,%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	movq	%rdx,%r9

	/* Passes 1..7: accumulate multiplicand * multiplier word k.  Each  */
	/* pass finalises one product word (stored at 520+8k(%rsp)) and the */
	/* register that held it is reused for the new top word, so the     */
	/* accumulator rotates through %r10..%r15,%r8,%r9.                  */

	/* Pass 1 */
	movq	8(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	movq	%r10,528(%rsp)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%r10

	/* Pass 2 */
	movq	16(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	movq	%r11,536(%rsp)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	/* Pass 3 */
	movq	24(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	movq	%r12,544(%rsp)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%r12

	/* Pass 4 */
	movq	32(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	movq	%r13,552(%rsp)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%r13

	/* Pass 5 */
	movq	40(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	movq	%r14,560(%rsp)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%r14

	/* Pass 6 */
	movq	48(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	movq	%r15,568(%rsp)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	addq	%rbx,%r8
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%r15

	/* Pass 7 */
	movq	56(%rdi),%rbp
	movq	0(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r8
	adcq	$0,%rdx
	movq	%r8,576(%rsp)
	movq	%rdx,%rbx

	movq	8(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r9
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	adcq	$0,%rdx
	addq	%rbx,%r10
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	24(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	32(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%rbx,%r12
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	40(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%rbx,%r13
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	48(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%rbx,%r14
	adcq	$0,%rdx
	movq	%rdx,%rbx

	movq	56(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx
	movq	%rdx,%r8

	/* Write out the upper eight product words (words 8..15). */
	movq	%r9,584(%rsp)
	movq	%r10,592(%rsp)
	movq	%r11,600(%rsp)
	movq	%r12,608(%rsp)
	movq	%r13,616(%rsp)
	movq	%r14,624(%rsp)
	movq	%r15,632(%rsp)
	movq	%r8,640(%rsp)

	/* Tail call: mont_reduce reads the product from 520(%rsp) and */
	/* returns straight to our caller.                              */
	jmp	mont_reduce
1080
1081
1082
1083
/*
 * sqr_reduce - file-local helper: squares a 512-bit value into a 1024-bit
 * product, then tail-jumps into mont_reduce.
 *
 * %rsp offsets include the 8-byte return address of the call that entered
 * this routine (same convention as mont_reduce, which it jumps to).
 *
 * In:
 *   16(%rsp)               pointer to the operand a[0..7] in memory
 *   %r10..%r15, %r8, %r9    = a[0]..a[7], the same operand in registers
 * Out:
 *   520..647(%rsp)          = the 16-word square, low word first
 *   then everything mont_reduce produces; control returns to the caller of
 *   this routine from the retq in mont_reduce.
 * Clobbers: %rax, %rbx, %rcx, %rdx, %rsi, %rdi, %rbp, %r8..%r15, flags,
 * plus whatever mont_reduce clobbers.
 *
 * Method: the off-diagonal products a[i]*a[j] (i < j) are summed once into
 * words 1..14, that sum is doubled, and the squares a[i]*a[i] are added.
 */
.p2align	4
sqr_reduce:
	movq	16(%rsp),%rcx

	/* Row 0: a[0] * a[1..7] -> words 1..8.  Word 1 is final for this */
	/* phase and is stored; the top word ends up in %rsi.             */
	movq	%r10,%rbx

	movq	%r11,%rax
	mulq	%rbx
	movq	%rax,528(%rsp)
	movq	%rdx,%r10
	movq	%r12,%rax
	mulq	%rbx
	addq	%rax,%r10
	adcq	$0,%rdx
	movq	%rdx,%r11
	movq	%r13,%rax
	mulq	%rbx
	addq	%rax,%r11
	adcq	$0,%rdx
	movq	%rdx,%r12
	movq	%r14,%rax
	mulq	%rbx
	addq	%rax,%r12
	adcq	$0,%rdx
	movq	%rdx,%r13
	movq	%r15,%rax
	mulq	%rbx
	addq	%rax,%r13
	adcq	$0,%rdx
	movq	%rdx,%r14
	movq	%r8,%rax
	mulq	%rbx
	addq	%rax,%r14
	adcq	$0,%rdx
	movq	%rdx,%r15
	movq	%r9,%rax
	mulq	%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	movq	%rdx,%rsi

	/* Word 2 */
	movq	%r10,536(%rsp)

	/* Row 1: a[1] * a[2..7], added at words 3..9.  a[6] and a[7] are */
	/* still in %r8 and %r9; the others are re-read from memory.      */
	movq	8(%rcx),%rbx

	movq	16(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r11
	adcq	$0,%rdx
	movq	%r11,544(%rsp)

	movq	%rdx,%r10
	movq	24(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%r10,%r12
	adcq	$0,%rdx
	movq	%r12,552(%rsp)

	movq	%rdx,%r10
	movq	32(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx

	movq	%rdx,%r10
	movq	40(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%r10,%r14
	adcq	$0,%rdx

	movq	%rdx,%r10
	movq	%r8,%rax
	mulq	%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r10,%r15
	adcq	$0,%rdx

	movq	%rdx,%r10
	movq	%r9,%rax
	mulq	%rbx
	addq	%rax,%rsi
	adcq	$0,%rdx
	addq	%r10,%rsi
	adcq	$0,%rdx

	movq	%rdx,%r11

	/* Row 2: a[2] * a[3..7], added at words 5..10. */
	movq	16(%rcx),%rbx

	movq	24(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r13
	adcq	$0,%rdx
	movq	%r13,560(%rsp)

	movq	%rdx,%r10
	movq	32(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r14
	adcq	$0,%rdx
	addq	%r10,%r14
	adcq	$0,%rdx
	movq	%r14,568(%rsp)

	movq	%rdx,%r10
	movq	40(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r10,%r15
	adcq	$0,%rdx

	movq	%rdx,%r10
	movq	%r8,%rax
	mulq	%rbx
	addq	%rax,%rsi
	adcq	$0,%rdx
	addq	%r10,%rsi
	adcq	$0,%rdx

	movq	%rdx,%r10
	movq	%r9,%rax
	mulq	%rbx
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%r10,%r11
	adcq	$0,%rdx

	movq	%rdx,%r12

	/* Row 3: a[3] * a[4..7], added at words 7..11. */
	movq	24(%rcx),%rbx

	movq	32(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	movq	%r15,576(%rsp)

	movq	%rdx,%r10
	movq	40(%rcx),%rax
	mulq	%rbx
	addq	%rax,%rsi
	adcq	$0,%rdx
	addq	%r10,%rsi
	adcq	$0,%rdx
	movq	%rsi,584(%rsp)

	movq	%rdx,%r10
	movq	%r8,%rax
	mulq	%rbx
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%r10,%r11
	adcq	$0,%rdx

	movq	%rdx,%r10
	movq	%r9,%rax
	mulq	%rbx
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%r10,%r12
	adcq	$0,%rdx

	movq	%rdx,%r15

	/* Row 4: a[4] * a[5..7], added at words 9..12. */
	movq	32(%rcx),%rbx

	movq	40(%rcx),%rax
	mulq	%rbx
	addq	%rax,%r11
	adcq	$0,%rdx
	movq	%r11,592(%rsp)

	movq	%rdx,%r10
	movq	%r8,%rax
	mulq	%rbx
	addq	%rax,%r12
	adcq	$0,%rdx
	addq	%r10,%r12
	adcq	$0,%rdx
	movq	%r12,600(%rsp)

	movq	%rdx,%r10
	movq	%r9,%rax
	mulq	%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r10,%r15
	adcq	$0,%rdx

	movq	%rdx,%r11

	/* Row 5: a[5] * a[6..7], added at words 11..13. */
	movq	40(%rcx),%rbx

	movq	%r8,%rax
	mulq	%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	movq	%r15,608(%rsp)

	movq	%rdx,%r10
	movq	%r9,%rax
	mulq	%rbx
	addq	%rax,%r11
	adcq	$0,%rdx
	addq	%r10,%r11
	adcq	$0,%rdx
	movq	%r11,616(%rsp)

	movq	%rdx,%r12

	/* Row 6: a[6] * a[7], added at words 13..14. */
	movq	%r8,%rbx

	movq	%r9,%rax
	mulq	%rbx
	addq	%rax,%r12
	adcq	$0,%rdx
	movq	%r12,624(%rsp)

	movq	%rdx,632(%rsp)

	/* Lower half: reload off-diagonal words 1..6. */
	movq	528(%rsp),%r10
	movq	536(%rsp),%r11
	movq	544(%rsp),%r12
	movq	552(%rsp),%r13
	movq	560(%rsp),%r14
	movq	568(%rsp),%r15

	/* a[3]^2: low half kept in %rdi, high half in %r8. */
	movq	24(%rcx),%rax
	mulq	%rax
	movq	%rax,%rdi
	movq	%rdx,%r8

	/* Double words 1..6; the bit shifted out is folded into %r8. */
	addq	%r10,%r10
	adcq	%r11,%r11
	adcq	%r12,%r12
	adcq	%r13,%r13
	adcq	%r14,%r14
	adcq	%r15,%r15
	adcq	$0,%r8

	/* a[0]^2: low half is word 0 of the result. */
	movq	0(%rcx),%rax
	mulq	%rax
	movq	%rax,520(%rsp)
	movq	%rdx,%rbx

	/* Words 1,2 += high(a[0]^2), low(a[1]^2). */
	movq	8(%rcx),%rax
	mulq	%rax

	addq	%rbx,%r10
	adcq	%rax,%r11
	adcq	$0,%rdx

	movq	%rdx,%rbx
	movq	%r10,528(%rsp)
	movq	%r11,536(%rsp)

	/* Words 3,4 += high(a[1]^2), low(a[2]^2). */
	movq	16(%rcx),%rax
	mulq	%rax

	addq	%rbx,%r12
	adcq	%rax,%r13
	adcq	$0,%rdx

	movq	%rdx,%rbx

	movq	%r12,544(%rsp)
	movq	%r13,552(%rsp)

	/* Words 5,6 += high(a[2]^2), low(a[3]^2); %rbp = carry out. */
	xorq	%rbp,%rbp
	addq	%rbx,%r14
	adcq	%rdi,%r15
	adcq	$0,%rbp

	movq	%r14,560(%rsp)
	movq	%r15,568(%rsp)

	/* Upper half: reload off-diagonal words 7..14. */
	movq	576(%rsp),%r10
	movq	584(%rsp),%r11
	movq	592(%rsp),%r12
	movq	600(%rsp),%r13
	movq	608(%rsp),%r14
	movq	616(%rsp),%r15
	movq	624(%rsp),%rdi
	movq	632(%rsp),%rsi

	/* a[7]^2: low half in %r9, high half in %rbx. */
	movq	%r9,%rax
	mulq	%rax
	movq	%rax,%r9
	movq	%rdx,%rbx

	/* Double words 7..14; the bit shifted out is folded into %rbx. */
	addq	%r10,%r10
	adcq	%r11,%r11
	adcq	%r12,%r12
	adcq	%r13,%r13
	adcq	%r14,%r14
	adcq	%r15,%r15
	adcq	%rdi,%rdi
	adcq	%rsi,%rsi
	adcq	$0,%rbx

	/* Carry from the lower half. */
	addq	%rbp,%r10

	/* Words 7,8 += high(a[3]^2) (with folded carry), low(a[4]^2). */
	movq	32(%rcx),%rax
	mulq	%rax

	addq	%r8,%r10
	adcq	%rax,%r11
	adcq	$0,%rdx

	movq	%rdx,%rbp

	movq	%r10,576(%rsp)
	movq	%r11,584(%rsp)

	/* Words 9,10 += high(a[4]^2), low(a[5]^2). */
	movq	40(%rcx),%rax
	mulq	%rax

	addq	%rbp,%r12
	adcq	%rax,%r13
	adcq	$0,%rdx

	movq	%rdx,%rbp

	movq	%r12,592(%rsp)
	movq	%r13,600(%rsp)

	/* Words 11,12 += high(a[5]^2), low(a[6]^2).  a[6] is re-read from */
	/* memory because %r8 was reused above.                            */
	movq	48(%rcx),%rax
	mulq	%rax

	addq	%rbp,%r14
	adcq	%rax,%r15
	adcq	$0,%rdx

	movq	%r14,608(%rsp)
	movq	%r15,616(%rsp)

	/* Words 13,14,15 += high(a[6]^2), low(a[7]^2), high(a[7]^2). */
	addq	%rdx,%rdi
	adcq	%r9,%rsi
	adcq	$0,%rbx

	movq	%rdi,624(%rsp)
	movq	%rsi,632(%rsp)
	movq	%rbx,640(%rsp)

	/* Tail call: mont_reduce reads the square from 520(%rsp) and */
	/* returns straight to our caller.                             */
	jmp	mont_reduce
1463
1464
1465
/*
 * _mod_exp_512 - exported entry point (leading underscore = Mach-O C symbol
 * mod_exp_512).  512-bit modular exponentiation with a fixed 5-bit window.
 *
 * ABI: System V AMD64 argument registers are used.
 * In:
 *   %rdi -> 64-byte result buffer (also used as the working accumulator)
 *   %rsi -> 64-byte base operand
 *   %rdx -> 64-byte exponent
 *   %rcx -> precomputed constant data block consumed by mont_reduce (see the
 *           layout notes there); the 8 words at +512 are subtracted in the
 *           final correction step below.
 *   NOTE(review): the C prototype is not visible in this file; presumably
 *   (result, base, exponent, modulus-data) - confirm against the caller.
 * Out:  result written to the buffer passed in %rdi.  No register return
 *       value is set up.
 * Saves/restores: %rbp, %rbx, %r12..%r15, %rsp.
 *
 * Frame (offsets from the 64-byte aligned %rsp of this function; inside the
 * helpers the same slots appear 8 bytes higher because of the return address):
 *      0        original %rsp (after the six pushes)
 *      8        arg 1, result pointer
 *     16        arg 2, base pointer
 *     24        arg 4, data block pointer
 *     32        table-build loop counter
 *     40        pointer to the table column being filled
 *     48        current exponent bit position
 *     64..127   copy of the exponent; 128..135 zero padding
 *    136        output pointer handed to mont_reduce
 *    184..376   scratch used by mont_reduce
 *    384..447   reduced form of the base (multiplier for the table build)
 *    448..511   working value / table entry gathered for a multiply
 *    512..639   16-word value handed to mont_reduce
 *    640..2687  table of 32 entries x 8 words, stored as 16-bit pieces:
 *               piece p of word w of entry e is at 640 + 256*w + 64*p + 2*e
 */
.globl	_mod_exp_512

_mod_exp_512:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* Allocate the frame and align it to 64 bytes (movdqa below */
	/* needs at least 16-byte alignment).                        */
	movq	%rsp,%r8
	subq	$2688,%rsp
	andq	$-64,%rsp

	movq	%r8,0(%rsp)
	movq	%rdi,8(%rsp)
	movq	%rsi,16(%rsp)
	movq	%rcx,24(%rsp)
L$body:

	/* Build the 1024-bit value (base << 256) at 512(%rsp): words 0..3 */
	/* and 12..15 are zero, words 4..11 are the base.                  */
	pxor	%xmm4,%xmm4
	movdqu	0(%rsi),%xmm0
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm3
	movdqa	%xmm4,512(%rsp)
	movdqa	%xmm4,528(%rsp)
	movdqa	%xmm4,608(%rsp)
	movdqa	%xmm4,624(%rsp)
	movdqa	%xmm0,544(%rsp)
	movdqa	%xmm1,560(%rsp)
	movdqa	%xmm2,576(%rsp)
	movdqa	%xmm3,592(%rsp)

	/* Keep the exponent in %xmm0..%xmm3 for now; the helpers in this */
	/* file do not use vector registers.                              */
	movdqu	0(%rdx),%xmm0
	movdqu	16(%rdx),%xmm1
	movdqu	32(%rdx),%xmm2
	movdqu	48(%rdx),%xmm3

	/* Reduce it into 384(%rsp). */
	leaq	384(%rsp),%rbx
	movq	%rbx,136(%rsp)
	call	mont_reduce

	/* Table entry 0: the 8-word value with only word 2 set to 1,     */
	/* i.e. 2^128, built at 448(%rsp).  Also zero the padding word    */
	/* that follows the exponent copy.                                */
	leaq	448(%rsp),%rcx
	xorq	%rax,%rax
	movq	%rax,0(%rcx)
	movq	%rax,8(%rcx)
	movq	%rax,24(%rcx)
	movq	%rax,32(%rcx)
	movq	%rax,40(%rcx)
	movq	%rax,48(%rcx)
	movq	%rax,56(%rcx)
	movq	%rax,128(%rsp)
	movq	$1,16(%rcx)

	/* Scatter entry 0 into the table: every word is split into four */
	/* 16-bit pieces stored 64 bytes apart, words 256 bytes apart.   */
	leaq	640(%rsp),%rbp
	movq	%rcx,%rsi
	movq	%rbp,%rdi
	movq	$8,%rax
loop_0:
	movq	(%rcx),%rbx
	movw	%bx,(%rdi)
	shrq	$16,%rbx
	movw	%bx,64(%rdi)
	shrq	$16,%rbx
	movw	%bx,128(%rdi)
	shrq	$16,%rbx
	movw	%bx,192(%rdi)
	leaq	8(%rcx),%rcx
	leaq	256(%rdi),%rdi
	decq	%rax
	jnz	loop_0

	/* 31 more entries to build; remember the current table column. */
	movq	$31,%rax
	movq	%rax,32(%rsp)
	movq	%rbp,40(%rsp)

	/* mont_reduce output now goes to 448(%rsp); preload the working */
	/* value into the registers mont_mul_a3b expects.                */
	movq	%rsi,136(%rsp)
	movq	0(%rsi),%r10
	movq	8(%rsi),%r11
	movq	16(%rsi),%r12
	movq	24(%rsi),%r13
	movq	32(%rsi),%r14
	movq	40(%rsi),%r15
	movq	48(%rsi),%r8
	movq	56(%rsi),%r9
init_loop:
	/* working = reduce(working * [384(%rsp)]).  The result is stored  */
	/* at 448(%rsp) and also left in %r10..%r15,%r8,%r9, which the     */
	/* scatter loop below does not touch.                              */
	leaq	384(%rsp),%rdi
	call	mont_mul_a3b
	leaq	448(%rsp),%rsi
	/* Next table column is 2 bytes further on. */
	movq	40(%rsp),%rbp
	addq	$2,%rbp
	movq	%rbp,40(%rsp)
	movq	%rsi,%rcx
	movq	$8,%rax
loop_1:
	movq	(%rcx),%rbx
	movw	%bx,(%rbp)
	shrq	$16,%rbx
	movw	%bx,64(%rbp)
	shrq	$16,%rbx
	movw	%bx,128(%rbp)
	shrq	$16,%rbx
	movw	%bx,192(%rbp)
	leaq	8(%rcx),%rcx
	leaq	256(%rbp),%rbp
	decq	%rax
	jnz	loop_1
	movq	32(%rsp),%rax
	subq	$1,%rax
	/* movq leaves the flags from subq intact for the jne. */
	movq	%rax,32(%rsp)
	jne	init_loop

	/* Spill the exponent to 64..127(%rsp). */
	movdqa	%xmm0,64(%rsp)
	movdqa	%xmm1,80(%rsp)
	movdqa	%xmm2,96(%rsp)
	movdqa	%xmm3,112(%rsp)

	/* First window: the top 5 bits of the exponent.  The 32-bit load  */
	/* covers the top 16 exponent bits plus 16 bits of zero padding.   */
	/* The consumed bits are cleared in the stored copy.               */
	movl	126(%rsp),%eax
	movq	%rax,%rdx
	shrq	$11,%rax
	andl	$2047,%edx
	movl	%edx,126(%rsp)
	/* Gather table entry %rax into the result buffer. */
	leaq	640(%rsp,%rax,2),%rsi
	movq	8(%rsp),%rdx
	movq	$4,%rbp
loop_2:
	/* Reassemble two words per iteration from their 16-bit pieces, */
	/* highest piece first.                                         */
	movzwq	192(%rsi),%rbx
	movzwq	448(%rsi),%rax
	shlq	$16,%rbx
	shlq	$16,%rax
	movw	128(%rsi),%bx
	movw	384(%rsi),%ax
	shlq	$16,%rbx
	shlq	$16,%rax
	movw	64(%rsi),%bx
	movw	320(%rsi),%ax
	shlq	$16,%rbx
	shlq	$16,%rax
	movw	0(%rsi),%bx
	movw	256(%rsi),%ax
	movq	%rbx,0(%rdx)
	movq	%rax,8(%rdx)
	leaq	512(%rsi),%rsi
	leaq	16(%rdx),%rdx
	subq	$1,%rbp
	jnz	loop_2

	/* Next window starts at bit 505.  Bits 507 and up were cleared    */
	/* above, so only two bits of it are new and the loop is entered   */
	/* at sqr_2 to do just two squarings first.                        */
	movq	$505,48(%rsp)

	/* From here on mont_reduce writes to the result buffer, which is */
	/* also the operand sqr_reduce reads.                              */
	movq	8(%rsp),%rcx
	movq	%rcx,136(%rsp)
	movq	0(%rcx),%r10
	movq	8(%rcx),%r11
	movq	16(%rcx),%r12
	movq	24(%rcx),%r13
	movq	32(%rcx),%r14
	movq	40(%rcx),%r15
	movq	48(%rcx),%r8
	movq	56(%rcx),%r9
	jmp	sqr_2

main_loop_a3b:
	/* Five squarings per 5-bit window. */
	call	sqr_reduce
	call	sqr_reduce
	call	sqr_reduce
sqr_2:
	call	sqr_reduce
	call	sqr_reduce

	/* Extract the 5-bit window at bit position 48(%rsp): load 32 bits */
	/* starting at the 16-bit unit containing it, shift, mask.         */
	movq	48(%rsp),%rcx
	movq	%rcx,%rax
	shrq	$4,%rax
	movl	64(%rsp,%rax,2),%edx
	andq	$15,%rcx
	shrq	%cl,%rdx
	andq	$31,%rdx

	/* Gather that table entry into 448(%rsp); %rdi keeps pointing at */
	/* it as the multiplier for mont_mul_a3b.                         */
	leaq	640(%rsp,%rdx,2),%rsi
	leaq	448(%rsp),%rdx
	movq	%rdx,%rdi
	movq	$4,%rbp
loop_3:
	movzwq	192(%rsi),%rbx
	movzwq	448(%rsi),%rax
	shlq	$16,%rbx
	shlq	$16,%rax
	movw	128(%rsi),%bx
	movw	384(%rsi),%ax
	shlq	$16,%rbx
	shlq	$16,%rax
	movw	64(%rsi),%bx
	movw	320(%rsi),%ax
	shlq	$16,%rbx
	shlq	$16,%rax
	movw	0(%rsi),%bx
	movw	256(%rsi),%ax
	movq	%rbx,0(%rdx)
	movq	%rax,8(%rdx)
	leaq	512(%rsi),%rsi
	leaq	16(%rdx),%rdx
	subq	$1,%rbp
	jnz	loop_3
	/* result = reduce(result * entry).  %r10..%r15,%r8,%r9 still hold */
	/* the result words from the last sqr_reduce; loop_3 does not      */
	/* touch them.                                                     */
	movq	8(%rsp),%rsi
	call	mont_mul_a3b

	/* Move down one window; stop once the position goes negative. */
	movq	48(%rsp),%rcx
	subq	$5,%rcx
	movq	%rcx,48(%rsp)
	jge	main_loop_a3b

end_main_loop_a3b:

	/* Final reduction: hand mont_reduce the result zero-extended to */
	/* 1024 bits (result in words 0..7, zeros in words 8..15).       */
	movq	8(%rsp),%rdx
	pxor	%xmm4,%xmm4
	movdqu	0(%rdx),%xmm0
	movdqu	16(%rdx),%xmm1
	movdqu	32(%rdx),%xmm2
	movdqu	48(%rdx),%xmm3
	movdqa	%xmm4,576(%rsp)
	movdqa	%xmm4,592(%rsp)
	movdqa	%xmm4,608(%rsp)
	movdqa	%xmm4,624(%rsp)
	movdqa	%xmm0,512(%rsp)
	movdqa	%xmm1,528(%rsp)
	movdqa	%xmm2,544(%rsp)
	movdqa	%xmm3,560(%rsp)
	call	mont_reduce

	/* Final correction: compute result - [data+512] and keep the */
	/* difference only if the subtraction did not borrow.         */
	movq	8(%rsp),%rax
	movq	0(%rax),%r8
	movq	8(%rax),%r9
	movq	16(%rax),%r10
	movq	24(%rax),%r11
	movq	32(%rax),%r12
	movq	40(%rax),%r13
	movq	48(%rax),%r14
	movq	56(%rax),%r15

	movq	24(%rsp),%rbx
	addq	$512,%rbx

	subq	0(%rbx),%r8
	sbbq	8(%rbx),%r9
	sbbq	16(%rbx),%r10
	sbbq	24(%rbx),%r11
	sbbq	32(%rbx),%r12
	sbbq	40(%rbx),%r13
	sbbq	48(%rbx),%r14
	sbbq	56(%rbx),%r15

	/* Branch-free select.  Only movq and cmov follow, so CF from the */
	/* last sbbq stays valid through both groups of four words.       */
	movq	0(%rax),%rsi
	movq	8(%rax),%rdi
	movq	16(%rax),%rcx
	movq	24(%rax),%rdx
	cmovncq	%r8,%rsi
	cmovncq	%r9,%rdi
	cmovncq	%r10,%rcx
	cmovncq	%r11,%rdx
	movq	%rsi,0(%rax)
	movq	%rdi,8(%rax)
	movq	%rcx,16(%rax)
	movq	%rdx,24(%rax)

	movq	32(%rax),%rsi
	movq	40(%rax),%rdi
	movq	48(%rax),%rcx
	movq	56(%rax),%rdx
	cmovncq	%r12,%rsi
	cmovncq	%r13,%rdi
	cmovncq	%r14,%rcx
	cmovncq	%r15,%rdx
	movq	%rsi,32(%rax)
	movq	%rdi,40(%rax)
	movq	%rcx,48(%rax)
	movq	%rdx,56(%rax)

	/* Restore the callee-saved registers from the original stack */
	/* (reverse order of the pushes) and release the frame.       */
	movq	0(%rsp),%rsi
	movq	0(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbx
	movq	40(%rsi),%rbp
	leaq	48(%rsi),%rsp
L$epilogue:
	retq
1774
1775