;  AMD64 mpn_submul_1 optimised for Intel Haswell.

;  Contributed to the GNU project by Torbjörn Granlund.
;  Converted to MPIR by Alexander Kruppa.

;  Copyright 2013 Free Software Foundation, Inc.

;  This file is part of the GNU MP Library.
;
;  The GNU MP Library is free software; you can redistribute it and/or modify
;  it under the terms of either:
;
;    * the GNU Lesser General Public License as published by the Free
;      Software Foundation; either version 3 of the License, or (at your
;      option) any later version.
;
;  or
;
;    * the GNU General Public License as published by the Free Software
;      Foundation; either version 2 of the License, or (at your option) any
;      later version.
;
;  or both in parallel, as here.
;
;  The GNU MP Library is distributed in the hope that it will be useful, but
;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
;  for more details.
;
;  You should have received copies of the GNU General Public License and the
;  GNU Lesser General Public License along with the GNU MP Library.  If not,
;  see https://www.gnu.org/licenses/.
;
;  mp_limb_t mpn_submul_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
;  rax                       rdi     rsi        rdx        rcx    (System V AMD64)
;  rax                       rcx     rdx         r8        r9     (Microsoft x64; the registers this file reads)
;  Assembler set-up.  FRAME_PROC and END_PROC, used by the procedure below,
;  are not defined in this file; they are expected to come from yasm_mac.inc.

%include 'yasm_mac.inc'

BITS 64

;  Register list handed to FRAME_PROC / END_PROC.  These are the registers the
;  procedure overwrites that the Microsoft x64 convention treats as
;  callee-saved (presumably pushed/popped by those macros -- see yasm_mac.inc).
%define reg_save_list rbx, rbp, rsi, rdi, r12, r13

;  Operand roles inside the procedure.
;    RP    - destination limb pointer (read, modified and written back)
;    S1P   - source limb pointer (read only)
;    Size  - limb count on entry; becomes the 4-limb group counter after the
;            "shr Size, 2" in the dispatch code
;    Sizeb - low byte of Size, used to test the parity of the limb count
;  NOTE(review): Limb is not referenced by the procedure in this file and names
;  the same register as Tmp8; the multiplier is actually moved into rdx, the
;  implicit source operand of mulx.
%define RP      rdi
%define S1P     rsi
%define Size    rbp
%define Sizeb   bpl
%define Limb    rcx

;  Scratch registers.  They are used in pairs as mulx destinations
;  (high limb, low limb) and as holders for limbs loaded from [RP].
%define Tmp0    r12
%define Tmp1    r13
%define Tmp2    rax
%define Tmp3    rbx
%define Tmp4    r8
%define Tmp5    r9
%define Tmp6    r10
%define Tmp7    r11
%define Tmp8    rcx

;  Operation selectors: this file is the subtracting variant, so the
;  destination update uses sub / sbb.
%define ADDSUB sub
%define ADCSBB sbb

align 16
;-----------------------------------------------------------------------
;  mp_limb_t mpn_submul_1(mp_ptr, mp_ptr, mp_size_t, mp_limb_t)
;
;  Multiplies the Size limbs at S1P by the 64-bit multiplier and subtracts
;  the product from the Size limbs at RP, storing the result back at RP.
;
;  In:    rcx = destination pointer (copied to RP)
;         rdx = source pointer      (copied to S1P)
;         r8  = limb count          (copied to Size)
;         r9  = multiplier          (copied to rdx for mulx)
;  Out:   rax = the limb that falls off the top: the high half of the last
;               product plus the carries/borrows propagated into it.
;  Uses:  rax, rcx, rdx, r8-r11 and flags, plus the registers named in
;         reg_save_list (handed to FRAME_PROC / END_PROC for preservation).
;
;  The limb count is assumed to be at least 1.  A count of 0 is not
;  special-cased: it would take the .Lb00 path and decrement Size below zero.
;
;  Structure: the main loop handles four limbs per iteration.  The code first
;  dispatches on Size mod 4 and jumps into the middle of the loop body
;  (.Llo0 / .Llo1 / .Llo3) or to its top/tail (.Ltop / .Lend), so that the
;  remaining work is always a whole number of iterations followed by the
;  two-limb tail at .Lend.
;
;  Flag discipline: one carry flag serves two interleaved chains.  The "adc"
;  chain adds each product's high half into the next product's low half; the
;  ADDSUB/ADCSBB chain subtracts those sums from the destination limbs.  The
;  borrow left by the last ADCSBB of a group is consumed by the first "adc"
;  of the next group.  Only instructions that leave CF intact (mov, lea,
;  mulx, dec, jcc) are placed between the links, which is why registers are
;  cleared with "mov eax, 0" and not with xor.
;-----------------------------------------------------------------------
FRAME_PROC mpn_submul_1, 0, reg_save_list
    mov     rdi, rcx
    mov     rsi, rdx
	mov 	rbp, r8
	mov 	rdx, r9 ; mulx requires one input in rdx

	; Bit 0 of the count selects odd/even; bit 1 is recovered below as the
	; carry out of "shr Size, 2", which also leaves Size = count / 4.
	test 	Sizeb, 1
	jnz 	.Lbx1

.Lbx0:  shr 	Size, 2
	jc 	.Lb10 ;ajs:notshortform

	; count mod 4 == 0: form the first two products, fold the high half of
	; product 0 into the low half of product 1, load the first two
	; destination limbs, start product 2, and enter the loop at .Llo0.
	; RP is biased by -16 so that [RP+16] is the first destination limb.
.Lb00:	mulx 	Tmp0, Tmp1, [S1P]
	mulx 	Tmp2, Tmp3, [S1P+8]
	add 	Tmp3, Tmp0
	adc 	Tmp2, 0
	mov 	Tmp0, [RP]
	mov 	Tmp8, [RP+8]
	mulx 	Tmp4, Tmp5, [S1P+16]
	lea 	RP, [RP-16]
	lea 	S1P, [S1P+16]
	ADDSUB 	Tmp0, Tmp1
	jmp 	.Llo0 ;ajs:notshortform

.Lbx1:	shr 	Size, 2
	jc 	.Lb11

	; count mod 4 == 1.  ZF from the shr above is still valid after mulx,
	; so jnz separates count == 1 (Size is now 0) from count >= 5.
.Lb01:	mulx 	Tmp6, Tmp7, [S1P]
	jnz 	.Lgt1
	; Single limb: subtract the low product half in memory, then return the
	; high half plus the borrow.  "mov eax, 0" keeps the borrow in CF.
.Ln1:	ADDSUB 	[RP], Tmp7
	mov 	eax, 0
	adc 	Tmp2, Tmp6
	jmp 	.Lret ;ajs:notshortform

	; count mod 4 == 1 and count >= 5: form products 0..2, chain their
	; halves together, load three destination limbs and enter at .Llo1.
	; RP is biased by -8 so that [RP+8] is the first destination limb.
.Lgt1:	mulx 	Tmp0, Tmp1, [S1P+8]
	mulx 	Tmp2, Tmp3, [S1P+16]
	lea 	S1P, [S1P+24]
	add 	Tmp1, Tmp6
	adc 	Tmp3, Tmp0
	adc 	Tmp2, 0
	mov 	Tmp6, [RP]
	mov 	Tmp0, [RP+8]
	mov 	Tmp8, [RP+16]
	lea 	RP, [RP-8]
	ADDSUB 	Tmp6, Tmp7
	jmp 	.Llo1

	; count mod 4 == 3: only the first limb is finished inside the loop body
	; (at .Llo3); the other two are handled by the next pass or by .Lend.
	; Size is incremented because .Llo3 falls straight into "dec Size".
	; RP is biased by -24 so that [RP+24] is the first destination limb.
.Lb11:	mulx 	Tmp2, Tmp3, [S1P]
	mov 	Tmp8, [RP]
	mulx 	Tmp4, Tmp5, [S1P+8]
	lea 	S1P, [S1P+8]
	lea 	RP, [RP-24]
	inc 	Size
	ADDSUB 	Tmp8, Tmp3
	jmp 	.Llo3

	; count mod 4 == 2: start the two leading products and fall into the
	; loop top, or go straight to the tail when count == 2 (ZF still comes
	; from the shr; mulx, lea, mov and clc do not change it).  rax = 0 and
	; CF = 0 stand in for "no previous high limb, no pending borrow".
	; RP is biased by -32 because .Ltop adds 32 back and .Lend uses [RP+32].
.Lb10:	mulx 	Tmp4, Tmp5, [S1P]
	mulx 	Tmp6, Tmp7, [S1P+8]
	lea 	RP, [RP-32]
	mov 	eax, 0
	clc
	jz 	.Lend ;ajs:notshortform

	align 16
	; Main loop, four limbs per pass.  On entry Tmp4:Tmp5 and Tmp6:Tmp7 hold
	; two pending products, Tmp2 holds the high limb carried over from the
	; previous group, and CF holds the borrow from the previous ADCSBB.
.Ltop:	adc 	Tmp5, Tmp2
	lea 	RP, [RP+32]
	adc 	Tmp7, Tmp4
	mulx 	Tmp0, Tmp1, [S1P+16]
	mov 	Tmp4, [RP]
	mulx 	Tmp2, Tmp3, [S1P+24]
	lea 	S1P, [S1P+32]
	adc 	Tmp1, Tmp6
	adc 	Tmp3, Tmp0
	adc 	Tmp2, 0
	mov 	Tmp6, [RP+8]
	mov 	Tmp0, [RP+16]
	; Product-sum chain for this group is complete; start the borrow chain.
	ADDSUB 	Tmp4, Tmp5
	mov 	Tmp8, [RP+24]
	mov 	[RP], Tmp4
	ADCSBB 	Tmp6, Tmp7
	; The products for the next group are started here while the current
	; group's results are still being stored.
.Llo1:	mulx 	Tmp4, Tmp5, [S1P]
	mov 	[RP+8], Tmp6
	ADCSBB 	Tmp0, Tmp1
.Llo0:	mov 	[RP+16], Tmp0
	ADCSBB 	Tmp8, Tmp3
.Llo3:	mulx 	Tmp6, Tmp7, [S1P+8]
	mov 	[RP+24], Tmp8
	; dec leaves CF alone, so the borrow survives to the adc at .Ltop/.Lend.
	dec 	Size
	jnz 	.Ltop

	; Tail: the last two limbs.  Tmp4:Tmp5 and Tmp6:Tmp7 hold their products.
	; The return value starts as the high half of the last product (Tmp6),
	; then takes the carry of the product chain and finally the borrow of the
	; subtraction chain.
.Lend:	adc 	Tmp5, Tmp2
	adc 	Tmp7, Tmp4
	mov 	Tmp4, [RP+32]
	mov 	Tmp2, Tmp6
	adc 	Tmp2, 0
	mov 	Tmp6, [RP+40]
	ADDSUB 	Tmp4, Tmp5
	mov 	[RP+32], Tmp4
	ADCSBB 	Tmp6, Tmp7
	mov 	[RP+40], Tmp6
	adc 	Tmp2, 0

.Lret:
    END_PROC reg_save_list
167