1; PROLOGUE(mpn_mul_basecase)
2
3;  Copyright 2009 Jason Moxham
4;
5;  Windows Conversion Copyright 2008 Brian Gladman
6;
7;  This file is part of the MPIR Library.
8;
9;  The MPIR Library is free software; you can redistribute it and/or modify
10;  it under the terms of the GNU Lesser General Public License as published
11;  by the Free Software Foundation; either version 2.1 of the License, or (at
12;  your option) any later version.
13
14;  The MPIR Library is distributed in the hope that it will be useful, but
15;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17;  License for more details.
18
19;  You should have received a copy of the GNU Lesser General Public License
20;  along with the MPIR Library; see the file COPYING.LIB.  If not, write
21;  to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22;  Boston, MA 02110-1301, USA.
23;
24;  mp_limb_t mpn_mul_basecase(mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t)
25;  rax                           rdi     rsi        rdx     rcx         r8
26;  rax                           rcx     rdx         r8      r9   [rsp+40]
27
28%include "yasm_mac.inc"
29
; addmul2lp: inner loop of the addmul_2 stage.
; Each iteration multiplies four source limbs {rsi+rbx*8..} by the two
; multiplier limbs held in rcx and r8 and ADDS the products into the
; destination {rdi+rbx*8..}.  rbx is a negative limb index counting up;
; the loop exits when rbx wraps past zero (carry out of 'add rbx, 4').
; r9/r10/r11/r12 pipeline the partial sums across iterations.
; NOTE(review): the add/adc carry chains are order-critical - do not
; reorder instructions inside this macro.
%macro addmul2lp 0
    xalign  16
%%1:mov     rax, [rsi+rbx*8]
    mul     r8                          ; src[i] * high multiplier limb
    add     r9, rax
    mov     rax, [rsi+rbx*8+8]
    adc     r10, rdx
    mov     r11, 0
    mul     rcx                         ; src[i+1] * low multiplier limb
    add     [rdi+rbx*8], r12            ; fold completed limb into dst[i]
    adc     r9, rax
    mov     r12, 0
    adc     r10, rdx
    mov     rax, [rsi+rbx*8+8]
    adc     r11, 0
    mul     r8                          ; src[i+1] * high limb
    add     [rdi+rbx*8+8], r9
    adc     r10, rax
    adc     r11, rdx
    mov     rax, [rsi+rbx*8+16]
    mul     rcx                         ; src[i+2] * low limb
    add     r10, rax
    mov     rax, [rsi+rbx*8+16]
    adc     r11, rdx
    adc     r12, 0
    mul     r8                          ; src[i+2] * high limb
    add     [rdi+rbx*8+16], r10
    mov     r9, 0
    adc     r11, rax
    mov     r10, 0
    mov     rax, [rsi+rbx*8+24]
    adc     r12, rdx
    mov     r15, r15                    ; no-op; presumably decoder/alignment padding - keep
    mul     rcx                         ; src[i+3] * low limb
    add     r11, rax
    mov     rax, [rsi+rbx*8+24]
    adc     r12, rdx
    adc     r9, 0
    mul     r8                          ; src[i+3] * high limb
    add     [rdi+rbx*8+24], r11
    adc     r12, rax
    adc     r9, rdx
    mov     rax, [rsi+rbx*8+32]
    mul     rcx                         ; src[i+4] * low limb (lead-in for next iteration)
    add     r12, rax
    adc     r9, rdx
    adc     r10, 0
    add     rbx, 4                      ; advance 4 limbs; CF set only when rbx reaches 0
    jnc     %%1
%endmacro
80
; addmul2pro0: prologue of an addmul_2 pass (alignment case 0).
; Loads the next two multiplier limbs from {r13+r15*8} into rcx (low)
; and r8 (high) and forms the first product; rax holds the first source
; limb on entry.  The pro0..pro3 variants differ only in scheduling.
%macro addmul2pro0 0
    mov     rcx, [r13+r15*8]            ; low multiplier limb
    mul     rcx
    mov     r12, rax
    mov     r9, rdx
    mov     r10, 0
    mov     r8, [r13+r15*8+8]           ; high multiplier limb
%endmacro

; addmul2epi0: epilogue for alignment case 0 - finishes the final partial
; iteration, stores the top limbs, consumes two multiplier limbs
; (r15 += 2; its ZF is tested by the driver loop) and resets rbx/rax/rdi
; for the next pass.
%macro addmul2epi0 0
    mov     rbx, r14                    ; reset negative limb index
    mov     rax, [rsi+24]
    mul     r8
    add     [rdi+24], r12
    adc     r9, rax
    adc     r10, rdx
    add     r15, 2                      ; two multiplier limbs done; sets ZF for caller
    mov     rax, [rsi+r14*8]            ; preload first source limb of next pass
    mov     [rdi+32], r9
    lea     rdi, [rdi+16]               ; advance destination by two limbs
    mov     [rdi+24], r10
%endmacro

; addmul2pro1: as addmul2pro0 with different scheduling (alignment case 1).
%macro addmul2pro1 0
    mov     rcx, [r13+r15*8]
    mul     rcx
    mov     r12, rax
    mov     r10, 0
    mov     r9, rdx
    mov     r8, [r13+r15*8+8]
%endmacro

; addmul2epi1: epilogue for alignment case 1 - two source limbs remain;
; finish them against both multiplier limbs, store results, then advance
; r15/rdi and reset rbx/rax as in addmul2epi0.
%macro addmul2epi1 0
    mov     rax, [rsi+16]
    lea     rdi, [rdi+16]
    mul     r8
    add     r9, rax
    mov     rax, [rsi+24]
    mov     r11, 0
    adc     r10, rdx
    mul     rcx
    add     [rdi], r12
    adc     r9, rax
    adc     r10, rdx
    adc     r11, 0
    mov     rax, [rsi+24]
    mul     r8
    add     [rdi+8], r9
    adc     r10, rax
    adc     r11, rdx
    add     r15, 2                      ; sets ZF for the driver loop
    mov     rbx, r14
    mov     rax, [rsi+r14*8]            ; preload for next pass
    mov     [rdi+24], r11
    mov     [rdi+16], r10
%endmacro
137
; addmul2pro2: prologue of an addmul_2 pass (alignment case 2); same
; effect as addmul2pro0, different scheduling.
%macro addmul2pro2 0
    mov     rcx, [r13+r15*8]            ; low multiplier limb
    mul     rcx
    mov     r10, 0
    mov     r12, rax
    mov     r9, rdx
    mov     r8, [r13+r15*8+8]           ; high multiplier limb
%endmacro

; addmul2epi2: epilogue for alignment case 2 - three source limbs remain;
; finish them against rcx/r8, store the top limbs, advance r15/rdi and
; reset rbx/rax for the next pass.
%macro addmul2epi2 0
    mov     rax, [rsi+8]
    lea     rdi, [rdi+16]
    mul     r8
    add     r9, rax
    mov     rax, [rsi+16]
    adc     r10, rdx
    mov     r11, 0
    mul     rcx
    add     [rdi-8], r12
    adc     r9, rax
    mov     r12, 0
    adc     r10, rdx
    mov     rax, [rsi+16]
    adc     r11, 0
    mul     r8
    add     [rdi], r9
    adc     r10, rax
    adc     r11, rdx
    mov     rax, [rsi+24]
    mul     rcx
    add     r10, rax
    mov     rax, [rsi+24]
    adc     r11, rdx
    adc     r12, 0
    mul     r8
    add     [rdi+8], r10
    adc     r11, rax
    adc     r12, rdx
    mov     rax, [rsi+r14*8]            ; preload for next pass
    mov     [rdi+16], r11
    mov     [rdi+24], r12
    add     r15, 2                      ; sets ZF for the driver loop
    mov     rbx, r14
%endmacro

; addmul2pro3: prologue of an addmul_2 pass (alignment case 3); same
; effect as addmul2pro0, different scheduling.
%macro addmul2pro3 0
    mov     rcx, [r13+r15*8]
    mul     rcx
    mov     r12, rax
    mov     r9, rdx
    mov     r8, [r13+r15*8+8]
    mov     r10, 0
%endmacro
191
; addmul2epi3: epilogue for alignment case 3 - four source limbs remain;
; finish them against both multiplier limbs in rcx/r8, adding into the
; destination, then store the top limbs, consume two multiplier limbs
; (r15 += 2, sets ZF for the driver loop) and reset rbx/rax for the
; next pass.  Carry chain is order-critical.
%macro addmul2epi3 0
    mov     rax, [rsi]
    lea     rdi, [rdi+16]
    mul     r8
    add     r9, rax
    mov     rax, [rsi+8]
    adc     r10, rdx
    mov     r11, 0
    mul     rcx
    add     [rdi-16], r12
    adc     r9, rax
    mov     r12, 0
    adc     r10, rdx
    mov     rax, [rsi+8]
    adc     r11, 0
    mul     r8
    add     [rdi-8], r9
    adc     r10, rax
    adc     r11, rdx
    mov     rax, [rsi+16]
    mul     rcx
    add     r10, rax
    mov     rax, [rsi+16]
    adc     r11, rdx
    adc     r12, 0
    mul     r8
    add     [rdi], r10
    mov     r9, 0
    adc     r11, rax
    mov     r10, 0
    mov     rax, [rsi+24]
    adc     r12, rdx
    mov     r15, r15                    ; no-op; presumably decoder/alignment padding - keep
    mul     rcx
    add     r11, rax
    mov     rax, [rsi+24]
    adc     r12, rdx
    adc     r9, 0
    mul     r8
    add     [rdi+8], r11
    adc     r12, rax
    adc     r9, rdx
    mov     rax, [rsi+r14*8]            ; preload for next pass
    mov     [rdi+16], r12
    mov     [rdi+24], r9
    add     r15, 2                      ; sets ZF for the driver loop
    mov     rbx, r14
%endmacro
240
; mul2lp: inner loop of the initial mul_2 pass.  Identical structure to
; addmul2lp but STORES into the destination (mov) rather than adding,
; since the first pass writes the product from scratch.
; rcx/r8 = the two multiplier limbs; rbx = negative limb index counting
; up; r9..r12 pipeline partial sums.  Order-critical carry chains.
%macro mul2lp 0
    xalign  16
%%1:mov     rax, [rsi+rbx*8]
    mul     r8                          ; src[i] * high limb
    add     r9, rax
    mov     rax, [rsi+rbx*8+8]
    adc     r10, rdx
    mov     r11, 0
    mul     rcx                         ; src[i+1] * low limb
    mov     [rdi+rbx*8], r12            ; store (not add) completed limb
    add     r9, rax
    mov     r12, 0
    adc     r10, rdx
    mov     rax, [rsi+rbx*8+8]
    adc     r11, 0
    mul     r8                          ; src[i+1] * high limb
    mov     [rdi+rbx*8+8], r9
    add     r10, rax
    adc     r11, rdx
    mov     rax, [rsi+rbx*8+16]
    mul     rcx                         ; src[i+2] * low limb
    add     r10, rax
    mov     rax, [rsi+rbx*8+16]
    adc     r11, rdx
    adc     r12, 0
    mul     r8                          ; src[i+2] * high limb
    mov     [rdi+rbx*8+16], r10
    mov     r9, 0
    add     r11, rax
    mov     r10, 0
    mov     rax, [rsi+rbx*8+24]
    adc     r12, rdx
    mov     r15, r15                    ; no-op; presumably decoder/alignment padding - keep
    mul     rcx                         ; src[i+3] * low limb
    add     r11, rax
    mov     rax, [rsi+rbx*8+24]
    adc     r12, rdx
    adc     r9, 0
    mul     r8                          ; src[i+3] * high limb
    mov     [rdi+rbx*8+24], r11
    add     r12, rax
    adc     r9, rdx
    mov     rax, [rsi+rbx*8+32]
    mul     rcx                         ; src[i+4] * low limb (next iteration's lead-in)
    add     r12, rax
    adc     r9, rdx
    adc     r10, 0
    add     rbx, 4                      ; advance 4 limbs; CF set only when rbx reaches 0
    jnc     %%1
%endmacro
291
; mul2pro0: prologue of the initial mul_2 pass (alignment case 0);
; loads the first two multiplier limbs into rcx (low) / r8 (high) and
; forms the first product.  rax holds the first source limb on entry.
%macro mul2pro0 0
    mov     rcx, [r13+r15*8]            ; low multiplier limb
    mul     rcx
    mov     r12, rax
    mov     r9, rdx
    mov     r10, 0
    mov     r8, [r13+r15*8+8]           ; high multiplier limb
%endmacro

; mul2epi0: epilogue of the mul_2 pass for alignment case 0 - mirrors
; addmul2epi0 but stores rather than adds.  Advances r15 by 2 (sets ZF
; for the following addmul_2 driver), advances rdi and resets rbx/rax.
%macro mul2epi0 0
    mov     rbx, r14                    ; reset negative limb index
    mov     rax, [rsi+24]
    mul     r8
    mov     [rdi+24], r12               ; store (not add)
    add     r9, rax
    adc     r10, rdx
    add     r15, 2                      ; sets ZF for mpn_addmul_2_int
    mov     rax, [rsi+r14*8]            ; preload for next pass
    mov     [rdi+32], r9
    lea     rdi, [rdi+16]
    mov     [rdi+24], r10
%endmacro

; mul2pro1: as mul2pro0 with different scheduling (alignment case 1).
%macro mul2pro1 0
    mov     rcx, [r13+r15*8]
    mul     rcx
    mov     r12, rax
    mov     r10, 0
    mov     r9, rdx
    mov     r8, [r13+r15*8+8]
%endmacro

; mul2epi1: epilogue of the mul_2 pass for alignment case 1 - two source
; limbs remain; mirrors addmul2epi1 but stores rather than adds.
%macro mul2epi1 0
    mov     rax, [rsi+16]
    lea     rdi, [rdi+16]
    mul     r8
    add     r9, rax
    mov     rax, [rsi+24]
    mov     r11, 0
    adc     r10, rdx
    mul     rcx
    mov     [rdi], r12                  ; store (not add)
    add     r9, rax
    adc     r10, rdx
    adc     r11, 0
    mov     rax, [rsi+24]
    mul     r8
    mov     [rdi+8], r9
    add     r10, rax
    adc     r11, rdx
    add     r15, 2                      ; sets ZF for mpn_addmul_2_int
    mov     rbx, r14
    mov     rax, [rsi+r14*8]            ; preload for next pass
    mov     [rdi+24], r11
    mov     [rdi+16], r10
%endmacro
348
; mul2pro2: prologue of the initial mul_2 pass (alignment case 2); same
; effect as mul2pro0, different scheduling.
%macro mul2pro2 0
    mov     rcx, [r13+r15*8]            ; low multiplier limb
    mul     rcx
    mov     r10, 0
    mov     r12, rax
    mov     r9, rdx
    mov     r8, [r13+r15*8+8]           ; high multiplier limb
%endmacro

; mul2epi2: epilogue of the mul_2 pass for alignment case 2 - three
; source limbs remain; mirrors addmul2epi2 but stores rather than adds.
%macro mul2epi2 0
    mov     rax, [rsi+8]
    lea     rdi, [rdi+16]
    mul     r8
    add     r9, rax
    mov     rax, [rsi+16]
    adc     r10, rdx
    mov     r11, 0
    mul     rcx
    mov     [rdi-8], r12                ; store (not add)
    add     r9, rax
    mov     r12, 0
    adc     r10, rdx
    mov     rax, [rsi+16]
    adc     r11, 0
    mul     r8
    mov     [rdi], r9
    add     r10, rax
    adc     r11, rdx
    mov     rax, [rsi+24]
    mul     rcx
    add     r10, rax
    mov     rax, [rsi+24]
    adc     r11, rdx
    adc     r12, 0
    mul     r8
    mov     [rdi+8], r10
    add     r11, rax
    adc     r12, rdx
    mov     rax, [rsi+r14*8]            ; preload for next pass
    mov     [rdi+16], r11
    mov     [rdi+24], r12
    add     r15, 2                      ; sets ZF for mpn_addmul_2_int
    mov     rbx, r14
%endmacro

; mul2pro3: prologue of the initial mul_2 pass (alignment case 3); same
; effect as mul2pro0, different scheduling.
%macro mul2pro3 0
    mov     rcx, [r13+r15*8]
    mul     rcx
    mov     r12, rax
    mov     r9, rdx
    mov     r8, [r13+r15*8+8]
    mov     r10, 0
%endmacro
402
; mul2epi3: epilogue of the mul_2 pass for alignment case 3 - four
; source limbs remain; mirrors addmul2epi3 but stores rather than adds.
; Advances r15 by 2 (sets ZF for the addmul_2 driver), advances rdi and
; resets rbx/rax for the next pass.  Carry chain is order-critical.
%macro mul2epi3 0
    mov     rax, [rsi]
    lea     rdi, [rdi+16]
    mul     r8
    add     r9, rax
    mov     rax, [rsi+8]
    adc     r10, rdx
    mov     r11, 0
    mul     rcx
    mov     [rdi-16], r12               ; store (not add)
    add     r9, rax
    mov     r12, 0
    adc     r10, rdx
    mov     rax, [rsi+8]
    adc     r11, 0
    mul     r8
    mov     [rdi-8], r9
    add     r10, rax
    adc     r11, rdx
    mov     rax, [rsi+16]
    mul     rcx
    add     r10, rax
    mov     rax, [rsi+16]
    adc     r11, rdx
    adc     r12, 0
    mul     r8
    mov     [rdi], r10
    mov     r9, 0
    add     r11, rax
    mov     r10, 0
    mov     rax, [rsi+24]
    adc     r12, rdx
    mov     r15, r15                    ; no-op; presumably decoder/alignment padding - keep
    mul     rcx
    add     r11, rax
    mov     rax, [rsi+24]
    adc     r12, rdx
    adc     r9, 0
    mul     r8
    mov     [rdi+8], r11
    add     r12, rax
    adc     r9, rdx
    mov     rax, [rsi+r14*8]            ; preload for next pass
    mov     [rdi+16], r12
    mov     [rdi+24], r9
    add     r15, 2                      ; sets ZF for mpn_addmul_2_int
    mov     rbx, r14
%endmacro
451
; mul1lp: mul_1 inner loop - multiplies four source limbs per iteration
; by the single multiplier limb in r8, STORING the results into the
; destination.  rbx = negative limb index; r9..r12 pipeline partial
; products; rax holds the next source limb at the loop head.
; The 'db 0x26' bytes emit an ES segment-override prefix on the next
; instruction - presumably deliberate code-size padding for decoder
; alignment on this microarchitecture; keep them.
%macro mul1lp 0
    xalign  16
%%1:
    mov     r10, 0
    mul     r8
    mov     [rdi+rbx*8-8], r12          ; store limb completed last iteration
    add     r9, rax
    db      0x26
    adc     r10, rdx
    mov     rax, [rsi+rbx*8+8]
    mul     r8
    mov     [rdi+rbx*8], r9
    add     r10, rax
    mov     r11d, 0
    adc     r11, rdx
    mov     rax, [rsi+rbx*8+16]
    mov     r12, 0
    mov     r9, 0
    mul     r8
    mov     [rdi+rbx*8+8], r10
    db      0x26
    add     r11, rax
    db      0x26
    adc     r12, rdx
    mov     rax, [rsi+rbx*8+24]
    mul     r8
    mov     [rdi+rbx*8+16], r11
    db      0x26
    add     r12, rax
    db      0x26
    adc     r9, rdx
    add     rbx, 4                      ; advance 4 limbs; CF set only when rbx reaches 0
    mov     rax, [rsi+rbx*8]            ; preload next source limb
    jnc     %%1
%endmacro
487
; mulnext0..mulnext3: tails of the initial mul_1 pass in the >= 5 path.
; Variant k finishes the remaining limbs when rbx == k, stores the final
; limbs plus the carry-out, then advances r15 (one multiplier limb
; consumed; sets ZF/SF consumed via later 'bt'/loop logic), bumps rdi by
; one limb and resets rbx/rax for the following addmul_2 phase.

; rbx is 0
%macro mulnext0 0
    mov     rax, [rsi+8]
    mul     r8
    mov     [rdi], r9
    add     r10, rax
    mov     r11d, 0
    adc     r11, rdx
    mov     rax, [rsi+16]
    mov     r12d, 0
    mul     r8
    mov     [rdi+8], r10
    add     r11, rax
    adc     r12, rdx
    mov     rax, [rsi+24]
    mul     r8
    mov     [rdi+16], r11
    add     r12, rax
    adc     rdx, 0
    mov     [rdi+24], r12
    mov     rax, [rsi+r14*8]            ; preload for next pass
    mov     [rdi+32], rdx               ; top limb of the mul_1 pass
    inc     r15                         ; one multiplier limb consumed
    lea     rdi, [rdi+8]
    mov     rbx, r14
%endmacro

; rbx is 1
%macro mulnext1 0
    mov     rax, [rsi+16]
    mul     r8
    mov     [rdi+8], r9
    add     r10, rax
    mov     r12d, 0
    adc     r12, rdx
    mov     rax, [rsi+24]
    mul     r8
    mov     [rdi+16], r10
    add     r12, rax
    adc     rdx, 0
    mov     [rdi+24], r12
    mov     [rdi+32], rdx               ; top limb
    inc     r15
    lea     rdi, [rdi+8]
    mov     rbx, r14
    mov     rax, [rsi+r14*8]            ; preload for next pass
%endmacro

; rbx is 2
%macro mulnext2 0
    mov     rax, [rsi+24]
    mul     r8
    mov     [rdi+16], r9
    add     r10, rax
    mov     r11d, 0
    adc     r11, rdx
    mov     [rdi+24], r10
    mov     [rdi+32], r11               ; top limb
    inc     r15
    lea     rdi, [rdi+8]
    mov     rax, [rsi+r14*8]            ; preload for next pass
    mov     rbx, r14
%endmacro

; rbx is 3
%macro mulnext3 0
    mov     [rdi+24], r9
    mov     [rdi+32], r10               ; top limb
    inc     r15
    lea     rdi, [rdi+8]
    mov     rax, [rsi+r14*8]            ; preload for next pass
    mov     rbx, r14
%endmacro
561
; mpn_addmul_2_int k: driver for the addmul_2 passes with alignment
; case k (0..3).  On entry ZF reflects whether any multiplier limb
; pairs remain (set by the preceding r15 update); each iteration runs
; prologue / inner loop / epilogue, where the epilogue's 'add r15, 2'
; sets ZF again, terminating when r15 reaches zero.
%macro mpn_addmul_2_int 1
    jz      %%2                         ; no multiplier limbs left - skip
    xalign  16
%%1:addmul2pro%1
    addmul2lp
    addmul2epi%1                        ; advances r15 by 2, sets ZF
    jnz     %%1
%%2:
%endmacro
571
; oldmulnext0..oldmulnext3: tails of the first mul_1 pass in the small
; (n <= 4) path.  r13 = multiplier limb, r11 = negative limb index,
; r14 = index reset value, r8 = negative multiplier counter.  Variant k
; finishes the remaining limbs when r11 == k+... (dispatched by the
; caller's flag tests), stores the carry-out, then advances r8 (sets ZF
; for the driver) and resets r11/rax for the next addmul_1 pass.
%macro oldmulnext0 0
    mov     rax, [rsi+r11*8+16]
    mul     r13
    mov     [rdi+r11*8+8], r9
    add     r10, rax
    mov     ebx, 0
    adc     rbx, rdx
    mov     rax, [rsi+r11*8+24]
    mov     r12d, 0
    mul     r13
    mov     [rdi+r11*8+16], r10
    add     rbx, rax
    adc     r12, rdx
    mov     rax, [rsi+r11*8+32]
    mul     r13
    mov     [rdi+r11*8+24], rbx
    add     r12, rax
    adc     rdx, 0
    mov     [rdi+r11*8+32], r12
    mov     rax, [rsi+r14*8]            ; preload for next pass
    mov     [rdi+r11*8+40], rdx         ; top limb
    inc     r8                          ; one multiplier limb consumed; sets ZF
    mov     r11, r14
%endmacro

%macro oldmulnext1 0
    mov     rax, [rsi+r11*8+16]
    mul     r13
    mov     [rdi+r11*8+8], r9
    add     r10, rax
    mov     r12d, 0
    adc     r12, rdx
    mov     rax, [rsi+r11*8+24]
    mul     r13
    mov     [rdi+r11*8+16], r10
    add     r12, rax
    adc     rdx, 0
    mov     [rdi+r11*8+24], r12
    mov     [rdi+r11*8+32], rdx         ; top limb
    inc     r8
    lea     rdi, [rdi+8]
    mov     r11, r14
    mov     rax, [rsi+r14*8]            ; preload for next pass
%endmacro

%macro oldmulnext2 0
    mov     rax, [rsi+r11*8+16]
    mul     r13
    mov     [rdi+r11*8+8], r9
    add     r10, rax
    mov     ebx, 0
    adc     rbx, rdx
    mov     [rdi+r11*8+16], r10
    mov     [rdi+r11*8+24], rbx         ; top limb
    inc     r8
    mov     rax, [rsi+r14*8]            ; preload for next pass
    mov     r11, r14
%endmacro

%macro oldmulnext3 0
    mov     [rdi+r11*8+8], r9
    mov     [rdi+r11*8+16], r10         ; top limb
    inc     r8
    mov     rax, [rsi+r14*8]            ; preload for next pass
    mov     r11, r14
%endmacro
638
; oldaddmulpro0..3 / oldaddmulnext0..3: addmul_1 passes for the small
; (n <= 4) path.  The pro macro loads the next multiplier limb from
; {rcx+r8*8} into r13 and starts the first product; the next macro adds
; the remaining products into the destination, stores the carry-out,
; and advances r8 (sets ZF for the driver loop).
; The 'db 0x26' bytes emit an ES segment-override prefix - presumably
; deliberate code-size padding for decoder alignment; keep them.
%macro oldaddmulpro0 0
    mov     r13, [rcx+r8*8]             ; next multiplier limb
    db      0x26
    mul     r13
    db      0x26
    mov     r12, rax
    mov     rax, [rsi+r14*8+8]
    db      0x26
    mov     r9, rdx
    lea     rdi, [rdi+8]                ; advance destination one limb
%endmacro

; oldaddmulnext0: addmul_1 tail for alignment case 0 (4 source limbs).
%macro oldaddmulnext0 0
    mov     r10d, 0
    mul     r13
    add     [rdi], r12
    adc     r9, rax
    adc     r10, rdx
    mov     rax, [rsi+16]
    mul     r13
    add     [rdi+8], r9
    adc     r10, rax
    mov     ebx, 0
    adc     rbx, rdx
    mov     rax, [rsi+24]
    mov     r12d, 0
    mov     r11, r14                    ; reset limb index
    mul     r13
    add     [rdi+16], r10
    adc     rbx, rax
    adc     r12, rdx
    mov     rax, [rsi+32]
    mul     r13
    add     [rdi+24], rbx
    adc     r12, rax
    adc     rdx, 0
    add     [rdi+32], r12
    mov     rax, [rsi+r14*8]            ; preload for next pass
    adc     rdx, 0
    inc     r8                          ; one multiplier limb consumed; sets ZF
    mov     [rdi+40], rdx               ; top limb
%endmacro

%macro oldaddmulpro1 0
    mov     r13, [rcx+r8*8]
    mul     r13
    mov     r12, rax
    mov     rax, [rsi+r14*8+8]
    mov     r9, rdx
%endmacro

; oldaddmulnext1: addmul_1 tail for alignment case 1 (3 source limbs).
%macro oldaddmulnext1 0
    mov     r10d, 0
    mul     r13
    add     [rdi+8], r12
    adc     r9, rax
    adc     r10, rdx
    mov     rax, [rsi+24]
    mul     r13
    lea     rdi, [rdi+8]
    add     [rdi+8], r9
    adc     r10, rax
    mov     r12d, 0
    mov     rax, [rsi+32]
    adc     r12, rdx
    mov     r11, r14                    ; reset limb index
    mul     r13
    add     [rdi+16], r10
    adc     r12, rax
    adc     rdx, 0
    add     [rdi+24], r12
    adc     rdx, 0
    mov     [rdi+32], rdx               ; top limb
    inc     r8                          ; sets ZF for the driver
    mov     rax, [rsi+r14*8]            ; preload for next pass
%endmacro
715
; oldaddmulpro2: load next multiplier limb and start the first product
; (alignment case 2 of the small-size addmul_1 path).
%macro oldaddmulpro2 0
    mov     r13, [rcx+r8*8]             ; next multiplier limb
    lea     rdi, [rdi+8]                ; advance destination one limb
    mul     r13
    mov     r12, rax
    mov     rax, [rsi+r14*8+8]
    mov     r9, rdx
%endmacro

; oldaddmulnext2: addmul_1 tail for alignment case 2 (2 source limbs);
; adds products into the destination, stores the carry-out, advances r8
; (sets ZF for the driver) and resets r11/rax.
%macro oldaddmulnext2 0
    mov     r10d, 0
    mul     r13
    add     [rdi+r11*8], r12
    adc     r9, rax
    adc     r10, rdx
    mov     rax, [rsi+r11*8+16]
    mul     r13
    mov     ebx, 0
    add     [rdi+r11*8+8], r9
    adc     r10, rax
    adc     rbx, rdx
    mov     rax, [rsi+r14*8]            ; preload for next pass
    add     [rdi+r11*8+16], r10
    adc     rbx, 0
    mov     [rdi+r11*8+24], rbx         ; top limb
    inc     r8                          ; sets ZF for the driver
    mov     r11, r14
%endmacro

; oldaddmulpro3: load next multiplier limb and start the first product
; (alignment case 3).  'db 0x26' = ES prefix padding - keep.
%macro oldaddmulpro3 0
    mov     r13, [rcx+r8*8]
    db      0x26
    mul     r13
    db      0x26
    mov     r12, rax
    db      0x26
    lea     rdi, [rdi+8]
    db      0x26
    mov     r9, rdx
    mov     rax, [rsi+r14*8+8]
%endmacro

; oldaddmulnext3: addmul_1 tail for alignment case 3 (1 source limb).
%macro oldaddmulnext3 0
    mov     r11, r14                    ; reset limb index
    mul     r13
    add     [rdi+24], r12
    adc     r9, rax
    adc     rdx, 0
    add     [rdi+32], r9
    mov     rax, [rsi+r14*8]            ; preload for next pass
    adc     rdx, 0
    inc     r8                          ; sets ZF for the driver
    mov     [rdi+40], rdx               ; top limb
%endmacro
770
; oldmpn_muladdmul_1_int k: driver for the small (n <= 4) path with
; alignment case k.  Finishes the initial mul_1 pass via oldmulnextk
; (whose 'inc r8' sets ZF), then loops addmul_1 passes (pro + next)
; until the negative multiplier counter r8 reaches zero.
%macro oldmpn_muladdmul_1_int 1
    oldmulnext%1
    jz      %%2                         ; single multiplier limb - done
    xalign  16
%%1:oldaddmulpro%1
    oldaddmulnext%1                     ; advances r8, sets ZF
    jnz     %%1
%%2:
%endmacro
780
    CPU  Core2
    BITS 64

;  mp_limb_t mpn_mul_basecase(mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t)
;  rax                           rdi     rsi        rdx     rcx         r8
;  rax                           rcx     rdx         r8      r9   [rsp+40]
;
; Full (rdx x r8)-limb product: {rdi, rdx+r8} = {rsi, rdx} * {rcx, r8},
; with rdx >= r8 >= 1 (standard mpn_mul_basecase contract - assumed from
; the header above; TODO confirm against callers).  Entered via the
; Windows-ABI wrapper macros (WIN64_GCC_PROC remaps the Windows argument
; registers to the SysV names shown on the first comment line).

%define reg_save_list   rbx, rsi, rdi, r12, r13, r14

    LEAF_PROC mpn_mul_basecase
    ; the current mul does not handle case one
    cmp     r8d, 4
    jg      fiveormore              ; large sizes use the mul_2/addmul_2 code
    cmp     r8d, 1
    je      one                     ; 1x1 product handled separately below

    ; ---- multiplier size 2..4: mul_1 pass + repeated addmul_1 passes ----
    WIN64_GCC_PROC mpn_bobcat_mbc1, 5, frame

    mov     r14, 5
    sub     r14, rdx                ; r14 = 5 - n (negative/small index base)
    lea     rdi, [rdi+rdx*8-40]     ; bias dst so indexing ends at offset 0
    lea     rcx, [rcx+r8*8]         ; rcx -> one past end of multiplier
    neg     r8                      ; r8 = -m, counts up to 0
    lea     rsi, [rsi+rdx*8-40]     ; bias src likewise
    mov     rax, [rsi+r14*8]        ; first source limb
    mov     r13, [rcx+r8*8]         ; first multiplier limb
    mov     r11, r14
    mul     r13
    mov     r12, rax
    mov     rax, [rsi+r14*8+8]
    mov     r9, rdx
    mov     r10d, 0
    mul     r13
    mov     [rdi+r11*8], r12        ; store low limb of first product
    add     r9, rax
    adc     r10, rdx
    cmp     r11, 2                  ; dispatch on alignment via CF/ZF/PF of this cmp
    ja      .4
    jz      .3
    jp      .2                      ; PF distinguishes the remaining two cases
.1:	oldmpn_muladdmul_1_int 0
    jmp     .5
.2:	oldmpn_muladdmul_1_int 1
    jmp     .5
.3:	oldmpn_muladdmul_1_int 2
    jmp     .5
.4:	oldmpn_muladdmul_1_int 3
.5:	WIN64_GCC_END frame

; rdx >= 5  as we dont have an inner jump
; (rdi,rdx+r8)=(rsi,rdx)*(rcx,r8)

%undef  reg_save_list
%define reg_save_list   rbx, rsi, rdi, r12, r13, r14, r15

    xalign  16
fiveormore:
    WIN64_GCC_PROC mpn_bobcat_mbc2, 5, frame
    movsxd  rdx, edx                ; sizes arrive as 32-bit values; sign-extend
    movsxd  r8, r8d

    mov     r14, 4
    sub     r14, rdx                ; r14 = 4 - n (negative index base)
    lea     rdi, [rdi+rdx*8-32]     ; bias dst pointer
    lea     rsi, [rsi+rdx*8-32]     ; bias src pointer
    mov     r13, rcx
    mov     r15, r8
    lea     r13, [r13+r15*8]        ; r13 -> one past end of multiplier
    neg     r15                     ; r15 = -m, counts up to 0
    mov     rbx, r14
    mov     rax, [rsi+r14*8]        ; first source limb
    bt      r15, 0                  ; odd multiplier count?
    jnc     .12                     ; even: begin directly with a mul_2 pass
.6:	inc     rbx                     ; odd: do one mul_1 pass first
    mov     r8, [r13+r15*8]         ; single multiplier limb for the mul_1 pass
    mul     r8
    mov     r12, rax
    mov     rax, [rsi+r14*8+8]
    mov     r9, rdx
    cmp     rbx, 0
    jge     .7
    mul1lp                          ; bulk of the mul_1 pass, 4 limbs/iteration
.7:	mov     r10d, 0
    mul     r8
    mov     [rdi+rbx*8-8], r12
    add     r9, rax
    adc     r10, rdx
    cmp     rbx, 2                  ; dispatch on residual alignment (CF/ZF/PF)
    ja      .11
    jz      .10
    jp      .9
.8:	mulnext0
    jmp     .20
.9:	mulnext1
    jmp     .14
.10:mulnext2
    jmp     .16
.11:mulnext3
    jmp     .18
     ; as all the mul2pro? are the same
.12:mul2pro0
    mul2lp                          ; initial mul_2 pass over the whole source
    cmp     rbx, 2                  ; dispatch on alignment for the epilogue
    ja      .19
    jz      .17
    jp      .15
.13:mul2epi3
.14:mpn_addmul_2_int 3              ; remaining addmul_2 passes, alignment 3
    WIN64_GCC_EXIT frame

.15:mul2epi2
.16:mpn_addmul_2_int 2              ; alignment 2
    WIN64_GCC_EXIT frame

.17:mul2epi1
.18:mpn_addmul_2_int 1              ; alignment 1
    WIN64_GCC_EXIT frame

.19:mul2epi0
.20:mpn_addmul_2_int 0              ; alignment 0

    xalign  16
.21:WIN64_GCC_END frame

    xalign  16
; 1x1 case: a single 64x64->128 multiply.  Uses raw Windows-ABI
; registers (rcx = dst, rdx = src1, r9 = src2) - no wrapper needed.
one:mov     rax, [rdx]
    mul     qword [r9]
    mov     [rcx], rax              ; low limb
    mov     [rcx+8], rdx            ; high limb
    ret

    end
913