;  AMD64 mpn_mul_basecase

;  Copyright 2008,2009 Jason Moxham

;  This file is part of the MPIR Library.

;  The MPIR Library is free software; you can redistribute it and/or modify
;  it under the terms of the GNU Lesser General Public License as published
;  by the Free Software Foundation; either version 2.1 of the License, or (at
;  your option) any later version.

;  The MPIR Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
;  License for more details.

;  You should have received a copy of the GNU Lesser General Public License
;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
;  Boston, MA 02110-1301, USA.

%include 'yasm_mac.inc'

; C	(rdi,rdx+r8)=(rsi,rdx)*(rcx,r8)
; C Version 1.0.7
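
; In the notation above: rdi = rp, rsi = up, rdx = un, rcx = vp, r8 = vn,
; with un >= vn >= 1, and the un+vn limb product is written to rp.  As a
; hedged C reference of the overall effect only (the code below instead
; consumes two limbs of vp per pass), using the standard mpn primitives:
;
;   void ref_mul_basecase(mp_limb_t *rp, const mp_limb_t *up, mp_size_t un,
;                         const mp_limb_t *vp, mp_size_t vn)
;   {
;       rp[un] = mpn_mul_1(rp, up, un, vp[0]);          /* first row: rp = up * v0 */
;       for (mp_size_t i = 1; i < vn; i++)              /* remaining rows accumulate */
;           rp[un + i] = mpn_addmul_1(rp + i, up, un, vp[i]);
;   }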

; addmul2lp: 4-way unrolled inner loop of an addmul_2 pass, accumulating
; up[] * {v0,v1} (v0 in rcx, v1 in r8) into rp[]; rbx is a negative limb
; index counting up towards zero
%macro addmul2lp 1
	align   16
%%1:
	mov     rax, [rsi+rbx*8]
	mul     r8
	add     r9, rax
	mov     rax, [rsi+rbx*8+8]
	adc     r10, rdx
	mov     r11, 0
	mul     rcx
	add     [rdi+rbx*8], r12
	adc     r9, rax
	mov     r12, 0
	adc     r10, rdx
	mov     rax, [rsi+rbx*8+8]
	adc     r11, 0
	mul     r8
	add     [rdi+rbx*8+8], r9
	adc     r10, rax
	adc     r11, rdx
	mov     rax, [rsi+rbx*8+16]
	mul     rcx
	add     r10, rax
	mov     rax, [rsi+rbx*8+16]
	adc     r11, rdx
	adc     r12, 0
	mul     r8
	add     [rdi+rbx*8+16], r10
	mov     r9, 0
	adc     r11, rax
	mov     r10, 0
	mov     rax, [rsi+rbx*8+24]
	adc     r12, rdx
	mov     r15, r15        ; no-op, presumably kept as scheduling filler
	mul     rcx
	add     r11, rax
	mov     rax, [rsi+rbx*8+24]
	adc     r12, rdx
	adc     r9, 0
	mul     r8
	add     [rdi+rbx*8+24], r11
	adc     r12, rax
	adc     r9, rdx
	mov     rax, [rsi+rbx*8+32]
	mul     rcx
	add     r12, rax
	adc     r9, rdx
	adc     r10, 0
	add     rbx, 4
	jnc     %%1
%endmacro
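
; One full addmul_2 pass (prologue + addmul2lp + epilogue) has the same
; effect as two chained addmul_1 rows, assuming the two high limbs rp[n]
; and rp[n+1] are free to be overwritten; a hedged C sketch (ref_addmul_2
; is illustrative, not MPIR API):
;
;   void ref_addmul_2(mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
;                     const mp_limb_t v[2])
;   {
;       rp[n]     = mpn_addmul_1(rp,     up, n, v[0]);  /* row for v0 */
;       rp[n + 1] = mpn_addmul_1(rp + 1, up, n, v[1]);  /* row for v1 */
;   }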

; addmul2proN / addmul2epiN: prologue and epilogue for an addmul_2 pass;
; the suffix N selects the variant for one of the four possible loop-tail
; alignments.  Each prologue loads the next two multiplier limbs
; (v0 -> rcx, v1 -> r8) and starts the first product; each epilogue
; finishes the last limbs and steps rdi and r15 forward.
%macro addmul2pro0 0
	mov     rcx, [r13+r15*8]
	mul     rcx
	mov     r12, rax
	mov     r9, rdx
	mov     r10, 0
	mov     r8, [r13+r15*8+8]
%endmacro

%macro addmul2epi0 0
	mov     rbx, r14
	mov     rax, [rsi+24]
	mul     r8
	add     [rdi+24], r12
	adc     r9, rax
	adc     r10, rdx
	add     r15, 2
	mov     rax, [rsi+r14*8]
	mov     [rdi+32], r9
	lea     rdi, [rdi+16]
	mov     [rdi+24], r10
%endmacro

%macro addmul2pro1 0
	mov     rcx, [r13+r15*8]
	mul     rcx
	mov     r12, rax
	mov     r10, 0
	mov     r9, rdx
	mov     r8, [r13+r15*8+8]
%endmacro

%macro addmul2epi1 0
	mov     rax, [rsi+16]
	lea     rdi, [rdi+16]
	mul     r8
	add     r9, rax
	mov     rax, [rsi+24]
	mov     r11, 0
	adc     r10, rdx
	mul     rcx
	add     [rdi], r12
	adc     r9, rax
	adc     r10, rdx
	adc     r11, 0
	mov     rax, [rsi+24]
	mul     r8
	add     [rdi+8], r9
	adc     r10, rax
	adc     r11, rdx
	add     r15, 2
	mov     rbx, r14
	mov     rax, [rsi+r14*8]
	mov     [rdi+24], r11
	mov     [rdi+16], r10
%endmacro

%macro addmul2pro2 0
	mov     rcx, [r13+r15*8]
	mul     rcx
	mov     r10, 0
	mov     r12, rax
	mov     r9, rdx
	mov     r8, [r13+r15*8+8]
%endmacro

%macro addmul2epi2 0
	mov     rax, [rsi+8]
	lea     rdi, [rdi+16]
	mul     r8
	add     r9, rax
	mov     rax, [rsi+16]
	adc     r10, rdx
	mov     r11, 0
	mul     rcx
	add     [rdi-8], r12
	adc     r9, rax
	mov     r12, 0
	adc     r10, rdx
	mov     rax, [rsi+16]
	adc     r11, 0
	mul     r8
	add     [rdi], r9
	adc     r10, rax
	adc     r11, rdx
	mov     rax, [rsi+24]
	mul     rcx
	add     r10, rax
	mov     rax, [rsi+24]
	adc     r11, rdx
	adc     r12, 0
	mul     r8
	add     [rdi+8], r10
	adc     r11, rax
	adc     r12, rdx
	mov     rax, [rsi+r14*8]
	mov     [rdi+16], r11
	mov     [rdi+24], r12
	add     r15, 2
	mov     rbx, r14
%endmacro

%macro addmul2pro3 0
	mov     rcx, [r13+r15*8]
	mul     rcx
	mov     r12, rax
	mov     r9, rdx
	mov     r8, [r13+r15*8+8]
	mov     r10, 0
%endmacro

%macro addmul2epi3 0
	mov     rax, [rsi]
	lea     rdi, [rdi+16]
	mul     r8
	add     r9, rax
	mov     rax, [rsi+8]
	adc     r10, rdx
	mov     r11, 0
	mul     rcx
	add     [rdi-16], r12
	adc     r9, rax
	mov     r12, 0
	adc     r10, rdx
	mov     rax, [rsi+8]
	adc     r11, 0
	mul     r8
	add     [rdi-8], r9
	adc     r10, rax
	adc     r11, rdx
	mov     rax, [rsi+16]
	mul     rcx
	add     r10, rax
	mov     rax, [rsi+16]
	adc     r11, rdx
	adc     r12, 0
	mul     r8
	add     [rdi], r10
	mov     r9, 0
	adc     r11, rax
	mov     r10, 0
	mov     rax, [rsi+24]
	adc     r12, rdx
	mov     r15, r15        ; no-op, presumably kept as scheduling filler
	mul     rcx
	add     r11, rax
	mov     rax, [rsi+24]
	adc     r12, rdx
	adc     r9, 0
	mul     r8
	add     [rdi+8], r11
	adc     r12, rax
	adc     r9, rdx
	mov     rax, [rsi+r14*8]
	mov     [rdi+16], r12
	mov     [rdi+24], r9
	add     r15, 2
	mov     rbx, r14
%endmacro

; mul2lp: like addmul2lp, but for the first pass over rp, storing
; rp[] = up[] * {v0,v1} instead of accumulating into it
%macro mul2lp 0
	align   16
%%1:
	mov     rax, [rsi+rbx*8]
	mul     r8
	add     r9, rax
	mov     rax, [rsi+rbx*8+8]
	adc     r10, rdx
	mov     r11, 0
	mul     rcx
	mov     [rdi+rbx*8], r12
	add     r9, rax
	mov     r12, 0
	adc     r10, rdx
	mov     rax, [rsi+rbx*8+8]
	adc     r11, 0
	mul     r8
	mov     [rdi+rbx*8+8], r9
	add     r10, rax
	adc     r11, rdx
	mov     rax, [rsi+rbx*8+16]
	mul     rcx
	add     r10, rax
	mov     rax, [rsi+rbx*8+16]
	adc     r11, rdx
	adc     r12, 0
	mul     r8
	mov     [rdi+rbx*8+16], r10
	mov     r9, 0
	add     r11, rax
	mov     r10, 0
	mov     rax, [rsi+rbx*8+24]
	adc     r12, rdx
	mov     r15, r15        ; no-op, presumably kept as scheduling filler
	mul     rcx
	add     r11, rax
	mov     rax, [rsi+rbx*8+24]
	adc     r12, rdx
	adc     r9, 0
	mul     r8
	mov     [rdi+rbx*8+24], r11
	add     r12, rax
	adc     r9, rdx
	mov     rax, [rsi+rbx*8+32]
	mul     rcx
	add     r12, rax
	adc     r9, rdx
	adc     r10, 0
	add     rbx, 4
	jnc     %%1
%endmacro

; mul2proN / mul2epiN: store-only counterparts of addmul2proN / addmul2epiN,
; used together with mul2lp for the first (mul_2) pass over rp
%macro mul2pro0 0
	mov     rcx, [r13+r15*8]
	mul     rcx
	mov     r12, rax
	mov     r9, rdx
	mov     r10, 0
	mov     r8, [r13+r15*8+8]
%endmacro

%macro mul2epi0 0
	mov     rbx, r14
	mov     rax, [rsi+24]
	mul     r8
	mov     [rdi+24], r12
	add     r9, rax
	adc     r10, rdx
	add     r15, 2
	mov     rax, [rsi+r14*8]
	mov     [rdi+32], r9
	lea     rdi, [rdi+16]
	mov     [rdi+24], r10
%endmacro

%macro mul2pro1 0
	mov     rcx, [r13+r15*8]
	mul     rcx
	mov     r12, rax
	mov     r10, 0
	mov     r9, rdx
	mov     r8, [r13+r15*8+8]
%endmacro

%macro mul2epi1 0
	mov     rax, [rsi+16]
	lea     rdi, [rdi+16]
	mul     r8
	add     r9, rax
	mov     rax, [rsi+24]
	mov     r11, 0
	adc     r10, rdx
	mul     rcx
	mov     [rdi], r12
	add     r9, rax
	adc     r10, rdx
	adc     r11, 0
	mov     rax, [rsi+24]
	mul     r8
	mov     [rdi+8], r9
	add     r10, rax
	adc     r11, rdx
	add     r15, 2
	mov     rbx, r14
	mov     rax, [rsi+r14*8]
	mov     [rdi+24], r11
	mov     [rdi+16], r10
%endmacro

%macro mul2pro2 0
	mov     rcx, [r13+r15*8]
	mul     rcx
	mov     r10, 0
	mov     r12, rax
	mov     r9, rdx
	mov     r8, [r13+r15*8+8]
%endmacro

%macro mul2epi2 0
	mov     rax, [rsi+8]
	lea     rdi, [rdi+16]
	mul     r8
	add     r9, rax
	mov     rax, [rsi+16]
	adc     r10, rdx
	mov     r11, 0
	mul     rcx
	mov     [rdi-8], r12
	add     r9, rax
	mov     r12, 0
	adc     r10, rdx
	mov     rax, [rsi+16]
	adc     r11, 0
	mul     r8
	mov     [rdi], r9
	add     r10, rax
	adc     r11, rdx
	mov     rax, [rsi+24]
	mul     rcx
	add     r10, rax
	mov     rax, [rsi+24]
	adc     r11, rdx
	adc     r12, 0
	mul     r8
	mov     [rdi+8], r10
	add     r11, rax
	adc     r12, rdx
	mov     rax, [rsi+r14*8]
	mov     [rdi+16], r11
	mov     [rdi+24], r12
	add     r15, 2
	mov     rbx, r14
%endmacro

%macro mul2pro3 0
	mov     rcx, [r13+r15*8]
	mul     rcx
	mov     r12, rax
	mov     r9, rdx
	mov     r8, [r13+r15*8+8]
	mov     r10, 0
%endmacro

%macro mul2epi3 0
	mov     rax, [rsi]
	lea     rdi, [rdi+16]
	mul     r8
	add     r9, rax
	mov     rax, [rsi+8]
	adc     r10, rdx
	mov     r11, 0
	mul     rcx
	mov     [rdi-16], r12
	add     r9, rax
	mov     r12, 0
	adc     r10, rdx
	mov     rax, [rsi+8]
	adc     r11, 0
	mul     r8
	mov     [rdi-8], r9
	add     r10, rax
	adc     r11, rdx
	mov     rax, [rsi+16]
	mul     rcx
	add     r10, rax
	mov     rax, [rsi+16]
	adc     r11, rdx
	adc     r12, 0
	mul     r8
	mov     [rdi], r10
	mov     r9, 0
	add     r11, rax
	mov     r10, 0
	mov     rax, [rsi+24]
	adc     r12, rdx
	mov     r15, r15        ; no-op, presumably kept as scheduling filler
	mul     rcx
	add     r11, rax
	mov     rax, [rsi+24]
	adc     r12, rdx
	adc     r9, 0
	mul     r8
	mov     [rdi+8], r11
	add     r12, rax
	adc     r9, rdx
	mov     rax, [rsi+r14*8]
	mov     [rdi+16], r12
	mov     [rdi+24], r9
	add     r15, 2
	mov     rbx, r14
%endmacro

; mul1lp: 4-way unrolled mpn_mul_1 style loop, rp[] = up[] * r8; used on
; the odd-vn entry path so that the remaining passes pair up for addmul_2
%macro mul1lp 0
	align   16
%%1:
	mov     r10, 0
	mul     r8
	mov     [rdi+rbx*8-8], r12
	add     r9, rax
	db      0x26            ; bare ES-prefix byte: harmless one-byte padding
	adc     r10, rdx
	mov     rax, [rsi+rbx*8+8]
	mul     r8
	mov     [rdi+rbx*8], r9
	add     r10, rax
	mov     r11d, 0
	adc     r11, rdx
	mov     rax, [rsi+rbx*8+16]
	mov     r12, 0
	mov     r9, 0
	mul     r8
	mov     [rdi+rbx*8+8], r10
	db      0x26
	add     r11, rax
	db      0x26
	adc     r12, rdx
	mov     rax, [rsi+rbx*8+24]
	mul     r8
	mov     [rdi+rbx*8+16], r11
	db      0x26
	add     r12, rax
	db      0x26
	adc     r9, rdx
	add     rbx, 4
	mov     rax, [rsi+rbx*8]
	jnc     %%1
%endmacro
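
; The loop above computes a plain mul_1 row; a hedged C reference of the
; per-limb step (using unsigned __int128 for the double-limb product):
;
;   mp_limb_t cy = 0;
;   for (mp_size_t j = 0; j < n; j++) {
;       unsigned __int128 t = (unsigned __int128) up[j] * v + cy;
;       rp[j] = (mp_limb_t) t;          /* low limb of the product  */
;       cy    = (mp_limb_t) (t >> 64);  /* high limb carries onward */
;   }
;   rp[n] = cy;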

; mulnext0..3: finish the mul_1 row after mul1lp (or directly, for short
; operands); the variant is selected by the final value of rbx

; rbx is 0
%macro mulnext0 0
	mov     rax, [rsi+8]
	mul     r8
	mov     [rdi], r9
	add     r10, rax
	mov     r11d, 0
	adc     r11, rdx
	mov     rax, [rsi+16]
	mov     r12d, 0
	mul     r8
	mov     [rdi+8], r10
	add     r11, rax
	adc     r12, rdx
	mov     rax, [rsi+24]
	mul     r8
	mov     [rdi+16], r11
	add     r12, rax
	adc     rdx, 0
	mov     [rdi+24], r12
	mov     rax, [rsi+r14*8]
	mov     [rdi+32], rdx
	inc     r15
	lea     rdi, [rdi+8]
	mov     rbx, r14
%endmacro

; rbx is 1
%macro mulnext1 0
	mov     rax, [rsi+16]
	mul     r8
	mov     [rdi+8], r9
	add     r10, rax
	mov     r12d, 0
	adc     r12, rdx
	mov     rax, [rsi+24]
	mul     r8
	mov     [rdi+16], r10
	add     r12, rax
	adc     rdx, 0
	mov     [rdi+24], r12
	mov     [rdi+32], rdx
	inc     r15
	lea     rdi, [rdi+8]
	mov     rbx, r14
	mov     rax, [rsi+r14*8]
%endmacro

; rbx is 2
%macro mulnext2 0
	mov     rax, [rsi+24]
	mul     r8
	mov     [rdi+16], r9
	add     r10, rax
	mov     r11d, 0
	adc     r11, rdx
	mov     [rdi+24], r10
	mov     [rdi+32], r11
	inc     r15
	lea     rdi, [rdi+8]
	mov     rax, [rsi+r14*8]
	mov     rbx, r14
%endmacro

; rbx is 3
%macro mulnext3 0
	mov     [rdi+24], r9
	mov     [rdi+32], r10
	inc     r15
	lea     rdi, [rdi+8]
	mov     rax, [rsi+r14*8]
	mov     rbx, r14
%endmacro

; mpn_addmul_2_int: outer loop of the large-operand path; repeats
; addmul_2 passes (variant %1) until r15 reaches zero, then restores the
; callee-saved registers from the red zone and returns
%macro mpn_addmul_2_int 1
	jz      %%2
	align   16
%%1:
	addmul2pro%1
	addmul2lp %1
	addmul2epi%1
	jnz     %%1
%%2:
	mov     r13, [rsp-8]
	mov     r14, [rsp-16]
	mov     rbx, [rsp-24]
	mov     r12, [rsp-32]
	mov     r15, [rsp-40]
	ret
%endmacro
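
; Schematically (hypothetical names, two limbs of vp consumed per pass):
;
;   while (vn_left != 0) {             /* r15 counts up from -vn towards 0 */
;       addmul_2_pass(rp, up, un, v);  /* prologue + addmul2lp + epilogue  */
;       rp += 2; v += 2; vn_left -= 2;
;   }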

; oldmulnext0..3: mul_1 tails for the small-operand path (2 <= rdx <= 4),
; with r13 as the multiplier limb and r11 as the negative index
%macro oldmulnext0 0
	mov     rax, [rsi+r11*8+16]
	mul     r13
	mov     [rdi+r11*8+8], r9
	add     r10, rax
	mov     ebx, 0
	adc     rbx, rdx
	mov     rax, [rsi+r11*8+24]
	mov     r12d, 0
	mul     r13
	mov     [rdi+r11*8+16], r10
	add     rbx, rax
	adc     r12, rdx
	mov     rax, [rsi+r11*8+32]
	mul     r13
	mov     [rdi+r11*8+24], rbx
	add     r12, rax
	adc     rdx, 0
	mov     [rdi+r11*8+32], r12
	mov     rax, [rsi+r14*8]
	mov     [rdi+r11*8+40], rdx
	inc     r8
	mov     r11, r14
%endmacro

%macro oldmulnext1 0
	mov     rax, [rsi+r11*8+16]
	mul     r13
	mov     [rdi+r11*8+8], r9
	add     r10, rax
	mov     r12d, 0
	adc     r12, rdx
	mov     rax, [rsi+r11*8+24]
	mul     r13
	mov     [rdi+r11*8+16], r10
	add     r12, rax
	adc     rdx, 0
	mov     [rdi+r11*8+24], r12
	mov     [rdi+r11*8+32], rdx
	inc     r8
	lea     rdi, [rdi+8]
	mov     r11, r14
	mov     rax, [rsi+r14*8]
%endmacro

%macro oldmulnext2 0
	mov     rax, [rsi+r11*8+16]
	mul     r13
	mov     [rdi+r11*8+8], r9
	add     r10, rax
	mov     ebx, 0
	adc     rbx, rdx
	mov     [rdi+r11*8+16], r10
	mov     [rdi+r11*8+24], rbx
	inc     r8
	mov     rax, [rsi+r14*8]
	mov     r11, r14
%endmacro

%macro oldmulnext3 0
	mov     [rdi+r11*8+8], r9
	mov     [rdi+r11*8+16], r10
	inc     r8
	mov     rax, [rsi+r14*8]
	mov     r11, r14
%endmacro

; oldaddmulpro0..3 / oldaddmulnext0..3: addmul_1 passes for the small-
; operand path, consuming one multiplier limb of vp per pass
%macro oldaddmulpro0 0
	mov     r13, [rcx+r8*8]
	db      0x26            ; bare ES-prefix byte: harmless one-byte padding
	mul     r13
	db      0x26
	mov     r12, rax
	mov     rax, [rsi+r14*8+8]
	db      0x26
	mov     r9, rdx
	lea     rdi, [rdi+8]
%endmacro

%macro oldaddmulnext0 0
	mov     r10d, 0
	mul     r13
	add     [rdi], r12
	adc     r9, rax
	adc     r10, rdx
	mov     rax, [rsi+16]
	mul     r13
	add     [rdi+8], r9
	adc     r10, rax
	mov     ebx, 0
	adc     rbx, rdx
	mov     rax, [rsi+24]
	mov     r12d, 0
	mov     r11, r14
	mul     r13
	add     [rdi+16], r10
	adc     rbx, rax
	adc     r12, rdx
	mov     rax, [rsi+32]
	mul     r13
	add     [rdi+24], rbx
	adc     r12, rax
	adc     rdx, 0
	add     [rdi+32], r12
	mov     rax, [rsi+r14*8]
	adc     rdx, 0
	inc     r8
	mov     [rdi+40], rdx
%endmacro

%macro oldaddmulpro1 0
	mov     r13, [rcx+r8*8]
	mul     r13
	mov     r12, rax
	mov     rax, [rsi+r14*8+8]
	mov     r9, rdx
%endmacro

%macro oldaddmulnext1 0
	mov     r10d, 0
	mul     r13
	add     [rdi+8], r12
	adc     r9, rax
	adc     r10, rdx
	mov     rax, [rsi+24]
	mul     r13
	lea     rdi, [rdi+8]
	add     [rdi+8], r9
	adc     r10, rax
	mov     r12d, 0
	mov     rax, [rsi+32]
	adc     r12, rdx
	mov     r11, r14
	mul     r13
	add     [rdi+16], r10
	adc     r12, rax
	adc     rdx, 0
	add     [rdi+24], r12
	adc     rdx, 0
	mov     [rdi+32], rdx
	inc     r8
	mov     rax, [rsi+r14*8]
%endmacro

%macro oldaddmulpro2 0
	mov     r13, [rcx+r8*8]
	lea     rdi, [rdi+8]
	mul     r13
	mov     r12, rax
	mov     rax, [rsi+r14*8+8]
	mov     r9, rdx
%endmacro

%macro oldaddmulnext2 0
	mov     r10d, 0
	mul     r13
	add     [rdi+r11*8], r12
	adc     r9, rax
	adc     r10, rdx
	mov     rax, [rsi+r11*8+16]
	mul     r13
	mov     ebx, 0
	add     [rdi+r11*8+8], r9
	adc     r10, rax
	adc     rbx, rdx
	mov     rax, [rsi+r14*8]
	add     [rdi+r11*8+16], r10
	adc     rbx, 0
	mov     [rdi+r11*8+24], rbx
	inc     r8
	mov     r11, r14
%endmacro

%macro oldaddmulpro3 0
	mov     r13, [rcx+r8*8]
	db      0x26
	mul     r13
	db      0x26
	mov     r12, rax
	db      0x26
	lea     rdi, [rdi+8]
	db      0x26
	mov     r9, rdx
	mov     rax, [rsi+r14*8+8]
%endmacro

%macro oldaddmulnext3 0
	mov     r11, r14
	mul     r13
	add     [rdi+24], r12
	adc     r9, rax
	adc     rdx, 0
	add     [rdi+32], r9
	mov     rax, [rsi+r14*8]
	adc     rdx, 0
	inc     r8
	mov     [rdi+40], rdx
%endmacro

; oldmpn_muladdmul_1_int: driver for the small-operand path; one mul_1
; tail, then addmul_1 passes until r8 reaches zero
%macro oldmpn_muladdmul_1_int 1
	oldmulnext%1
	jz      %%2
	align   16
%%1:
	oldaddmulpro%1
	oldaddmulnext%1
	jnz     %%1
%%2:
	mov     r13, [rsp-8]
	mov     r14, [rsp-16]
	mov     rbx, [rsp-24]
	mov     r12, [rsp-32]
	ret
%endmacro

	ASM_START
	GLOBAL_FUNC mpn_mul_basecase
; the current mul code cannot handle the rdx = 1 case, so it is special-cased
	cmp     rdx, 4
	jg      L_fiveormore
	cmp     rdx, 1
	je      L_one
	mov     [rsp-8], r13
	mov     [rsp-16], r14
	mov     [rsp-24], rbx
	mov     [rsp-32], r12
	mov     r14, 5
	sub     r14, rdx
	lea     rdi, [rdi+rdx*8-40]
	lea     rcx, [rcx+r8*8]
	neg     r8
	lea     rsi, [rsi+rdx*8-40]
	mov     rax, [rsi+r14*8]
	mov     r13, [rcx+r8*8]
	mov     r11, r14
	mul     r13
	mov     r12, rax
	mov     rax, [rsi+r14*8+8]
	mov     r9, rdx
	mov     r10d, 0
	mul     r13
	mov     [rdi+r11*8], r12
	add     r9, rax
	adc     r10, rdx
	cmp     r11, 2
	ja      L_oldcase3
	jz      L_oldcase2
	jp      L_oldcase1
L_oldcase0:
	oldmpn_muladdmul_1_int 0
L_oldcase1:
	oldmpn_muladdmul_1_int 1
L_oldcase2:
	oldmpn_muladdmul_1_int 2
L_oldcase3:
	oldmpn_muladdmul_1_int 3
	align   16
L_fiveormore:
; rdx >= 5, as we don't have an inner jump
; (rdi,rdx+r8)=(rsi,rdx)*(rcx,r8)
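; Pointers are biased towards the ends of their operands so that the loop
; counters (rbx, r15) are negative and count up to zero, letting add/jnc
; and add/jnz close the loops without separate compares.  Schematically:
;
;   up += un; rp += un; vp += vn;    /* here biased by -32/-40 bytes      */
;   for (i = -un; i < 0; i++)        /* [rsi+i*8] then addresses up[...]  */
;       ... up[i] ...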
	mov     [rsp-8], r13
	mov     [rsp-16], r14
	mov     [rsp-24], rbx
	mov     [rsp-32], r12
	mov     [rsp-40], r15
	mov     r14, 4
	sub     r14, rdx
	lea     rdi, [rdi+rdx*8-32]
	lea     rsi, [rsi+rdx*8-32]
	mov     r13, rcx
	mov     r15, r8
	lea     r13, [r13+r15*8]
	neg     r15
	mov     rbx, r14
	mov     rax, [rsi+r14*8]
	bt      r15, 0
	jnc     L_even
L_odd:
	inc     rbx
	mov     r8, [r13+r15*8]
	mul     r8
	mov     r12, rax
	mov     rax, [rsi+r14*8+8]
	mov     r9, rdx
	cmp     rbx, 0
	jge     L_mulskiploop
	mul1lp
L_mulskiploop:
	mov     r10d, 0
	mul     r8
	mov     [rdi+rbx*8-8], r12
	add     r9, rax
	adc     r10, rdx
	cmp     rbx, 2
	ja      L_mul1case3
	jz      L_mul1case2
	jp      L_mul1case1
L_mul1case0:
	mulnext0
	jmp     L_case0
L_mul1case1:
	mulnext1
	jmp     L_case3
L_mul1case2:
	mulnext2
	jmp     L_case2
L_mul1case3:
	mulnext3
	jmp     L_case1
L_even:
; all the mul2pro? variants are equivalent, so use mul2pro0 here
	mul2pro0
	mul2lp
	cmp     rbx, 2
	ja      L_mul2case0
	jz      L_mul2case1
	jp      L_mul2case2
L_mul2case3:
	mul2epi3
L_case3:
	mpn_addmul_2_int 3
L_mul2case2:
	mul2epi2
L_case2:
	mpn_addmul_2_int 2
L_mul2case1:
	mul2epi1
L_case1:
	mpn_addmul_2_int 1
L_mul2case0:
	mul2epi0
L_case0:
	mpn_addmul_2_int 0
	align   16
L_one:
	mov     rax, [rsi]
	mul     qword [rcx]
	mov     [rdi], rax
	mov     [rdi+8], rdx
	ret
	end