1dnl  AMD64 mpn_add_n, mpn_sub_n
2
3dnl  Copyright 2017 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb
34C AMD K8,K9
35C AMD K10
36C AMD bd1	 1.5  with fluctuations
37C AMD bd2	 1.5  with fluctuations
38C AMD bd3
39C AMD bd4	 1.6
40C AMD zen
41C AMD bt1
42C AMD bt2
43C Intel P4
44C Intel PNR
45C Intel NHM
46C Intel SBR
47C Intel IBR
48C Intel HWL	 1.21
49C Intel BWL	 1.04
50C Intel SKL
51C Intel atom
52C Intel SLM
53C VIA nano
54
55C The loop of this code is the result of running a code generation and
56C optimization tool suite written by David Harvey and Torbjorn Granlund.
57
58C INPUT PARAMETERS
C Symbolic names for the argument registers used by the code below.  The
C register inside each define is the one the instructions actually use.  The
C trailing comment on each line names a second location for the same
C argument; presumably that is where the DOS64 ABI passes it before
C FUNC_ENTRY remaps it -- TODO confirm against the included m4 definitions.
59define(`rp',	`%rdi')	C rcx
60define(`up',	`%rsi')	C rdx
61define(`vp',	`%rdx')	C r8
62define(`n',	`%rcx')	C r9
63define(`cy',	`%r8')	C rsp+40    (mpn_add_nc and mpn_sub_nc)
64
C Operation selection.  When OPERATION_add_n is defined the carry instruction
C ADCSBB is adc and the entry points are mpn_add_n / mpn_add_nc; when
C OPERATION_sub_n is defined ADCSBB is sbb and the entry points are
C mpn_sub_n / mpn_sub_nc.  The build is presumably expected to define
C exactly one of the two -- TODO confirm.
65ifdef(`OPERATION_add_n', `
66	define(ADCSBB,	      adc)
67	define(func,	      mpn_add_n)
68	define(func_nc,	      mpn_add_nc)')
69ifdef(`OPERATION_sub_n', `
70	define(ADCSBB,	      sbb)
71	define(func,	      mpn_sub_n)
72	define(func_nc,	      mpn_sub_nc)')
73
C Build declarations: the four entry-point names this file can be built as
C and the two ABI names it is marked for.  The macros used here come from the
C included m4 definitions; their exact effect is not visible in this file.
74MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
75
76ABI_SUPPORT(DOS64)
77ABI_SUPPORT(STD64)
78
C Start of the code section; the first entry point is aligned to 16 bytes.
79ASM_START()
80	TEXT
81	ALIGN(16)
C func_nc(rp, up, vp, n, cy) -- entry point taking a carry-in.  The name
C expands to mpn_add_nc or mpn_sub_nc depending on the selected operation.
C
C This entry point only prepares state and dispatches.  It leaves
C   eax = n mod 8   index into the jump table L(tab)
C   n   = n / 8     number of 8-limb groups
C   r9  = address of L(tab)
C   CF  = 1 if cy was nonzero, 0 otherwise
C and jumps through the table to one of the feed-in labels L(0)..L(7), which
C are located inside the body of func further down.  neg is the last
C flag-writing instruction before the jump, so CF reaches the first ADCSBB
C of the feed-in code unchanged.
82PROLOGUE(func_nc)
83	FUNC_ENTRY(4)
C Load the carry-in from the stack into r8; by its name IFDOS presumably
C emits this only for the DOS64 ABI -- TODO confirm.
84IFDOS(`	mov	56(%rsp), %r8	')
85
86	mov	R32(n), R32(%rax)
87	shr	$3, n			C n = number of 8-limb groups
88	and	$7, R32(%rax)		C eax = n mod 8, selects the feed-in code
89
90	lea	L(tab)(%rip), %r9
91	neg	%r8			C set carry
92ifdef(`PIC',`
93	movslq	(%r9,%rax,4), %rax	C rax = table entry (32 bits sign-extended)
94	lea	(%r9,%rax), %rax	C lea not add to preserve carry
95	jmp	*%rax
96',`
97	jmp	*(%r9,%rax,8)		C non-PIC: 8-byte entries used directly as target
98')
99EPILOGUE()
100
101	ALIGN(16)
C func(rp, up, vp, n) -- entry point without carry-in.  The name expands to
C mpn_add_n or mpn_sub_n depending on the selected operation.
C
C Each limb is loaded from up, combined by ADCSBB (adc or sbb) with the limb
C at the same offset of vp, and stored at the same offset of rp.  The carry
C flag links consecutive limbs.  The final carry or borrow (0 or 1) is
C returned in eax.
C
C Structure:
C   * Dispatch: eax = n mod 8 indexes L(tab); n becomes n / 8.
C   * Feed-in code L(0)..L(7): computes the first limb(s) and either returns
C     or enters the loop at the matching L(e0)..L(e7) label.
C   * L(top) loop: 8 limbs per pass.
C   * L(end), L(x1), L(x2), L(x3): final stores and return value.
C
C CF must survive from the first ADCSBB to the final adc.  Between them the
C code uses only mov, lea, inc, dec, jrcxz and jumps, none of which modify
C CF.
C
C Registers: r8-r11 hold limbs between load and store (r9 is first used as
C the table base); eax is the table index and later the return value.
C
C NOTE(review): n = 0 dispatches to L(0), which reads and writes limbs before
C the count is tested, so callers presumably guarantee n >= 1 -- confirm.
102PROLOGUE(func)
103	FUNC_ENTRY(4)
104
105	mov	R32(n), R32(%rax)
106	shr	$3, n
107	and	$7, R32(%rax)		C clear cy as side-effect
108
109	lea	L(tab)(%rip), %r9
110ifdef(`PIC',`
111	movslq	(%r9,%rax,4), %rax
112	lea	(%r9,%rax), %rax	C lea not add to preserve carry
113	jmp	*%rax
114',`
115	jmp	*(%r9,%rax,8)
116')
117
C n mod 8 = 0: begin a full pass.  Limb 0 is computed in r8 and limb 1 is
C loaded in r9, which is the state expected at L(e0).
118L(0):	mov	(up), %r8
119	mov	8(up), %r9
120	ADCSBB	(vp), %r8
121	jmp	L(e0)
122
C n mod 8 = 4: limb 0 computed in r8, limb 1 loaded in r9.  The pointers are
C moved back 32 bytes so that loop offsets 32..56 address limbs 0..3.  n is
C incremented because this partial pass ends with the dec at the loop bottom.
123L(4):	mov	(up), %r8
124	mov	8(up), %r9
125	ADCSBB	(vp), %r8
126	lea	-32(up), up
127	lea	-32(vp), vp
128	lea	-32(rp), rp
129	inc	n
130	jmp	L(e4)
131
C n mod 8 = 5: limb 0 computed in r11, limbs 1 and 2 loaded in r8 and r9.
C Pointers are moved back 24 bytes so loop offsets 24..56 address limbs 0..4.
132L(5):	mov	(up), %r11
133	mov	8(up), %r8
134	mov	16(up), %r9
135	ADCSBB	(vp), %r11
136	lea	-24(up), up
137	lea	-24(vp), vp
138	lea	-24(rp), rp
139	inc	n
140	jmp	L(e5)
141
C n mod 8 = 6: limb 0 computed in r10, limb 1 loaded in r11.  Pointers are
C moved back 16 bytes so loop offsets 16..56 address limbs 0..5.
142L(6):	mov	(up), %r10
143	ADCSBB	(vp), %r10
144	mov	8(up), %r11
145	lea	-16(up), up
146	lea	-16(vp), vp
147	lea	-16(rp), rp
148	inc	n
149	jmp	L(e6)
150
C n mod 8 = 7: limbs 0 and 1 computed in r9 and r10.  Pointers are moved back
C 8 bytes so loop offsets 8..56 address limbs 0..6.
151L(7):	mov	(up), %r9
152	mov	8(up), %r10
153	ADCSBB	(vp), %r9
154	ADCSBB	8(vp), %r10
155	lea	-8(up), up
156	lea	-8(vp), vp
157	lea	-8(rp), rp
158	inc	n
159	jmp	L(e7)
160
C Main loop, 8 limbs per pass.  The results for offsets 40, 48 and 56 are
C produced by the last three ADCSBB of a pass but are stored only at the top
C of the next pass (or at L(end) after the last one).  rp is advanced right
C after the 56(rp) store; up and vp are advanced near the bottom.
161	ALIGN(16)
162L(top):
163L(e3):	mov	%r9, 40(rp)
164L(e2):	mov	%r10, 48(rp)
165L(e1):	mov	(up), %r8
166	mov	8(up), %r9
167	ADCSBB	(vp), %r8
168	mov	%r11, 56(rp)
169	lea	64(rp), rp
170L(e0):	mov	16(up), %r10
171	ADCSBB	8(vp), %r9
172	ADCSBB	16(vp), %r10
173	mov	%r8, (rp)
174L(e7):	mov	24(up), %r11
175	mov	%r9, 8(rp)
176L(e6):	mov	32(up), %r8
177	mov	40(up), %r9
178	ADCSBB	24(vp), %r11
179	mov	%r10, 16(rp)
180L(e5):	ADCSBB	32(vp), %r8
181	mov	%r11, 24(rp)
182L(e4):	mov	48(up), %r10
183	mov	56(up), %r11
184	mov	%r8, 32(rp)
185	lea	64(up), up
186	ADCSBB	40(vp), %r9
187	ADCSBB	48(vp), %r10
188	ADCSBB	56(vp), %r11
189	lea	64(vp), vp
190	dec	n			C dec leaves CF alone
191	jnz	L(top)
192
C Loop exit: write the three pending results.  n is zero here, so the
C mov/adc pair yields eax = 0 + 0 + CF, the carry or borrow out.
193L(end):	mov	%r9, 40(rp)
194	mov	%r10, 48(rp)
195	mov	%r11, 56(rp)
196	mov	R32(n), R32(%rax)
197	adc	R32(n), R32(%rax)
198	FUNC_EXIT()
199	ret
200
C n mod 8 = 3: limbs 0..2 are computed in r9, r10, r11.  If no 8-limb groups
C remain (rcx = 0) they are stored and the function returns at L(x3).
C Otherwise up and vp step past the 3 limbs and rp is biased by -40 so that
C the stores at L(e3), L(e2) and 56(rp) land on rp[0..2].
201	ALIGN(16)
202L(3):	mov	(up), %r9
203	mov	8(up), %r10
204	mov	16(up), %r11
205	ADCSBB	(vp), %r9
206	ADCSBB	8(vp), %r10
207	ADCSBB	16(vp), %r11
208	jrcxz	L(x3)
209	lea	24(up), up
210	lea	24(vp), vp
211	lea	-40(rp), rp
212	jmp	L(e3)
213L(x3):	mov	%r9, (rp)
214	mov	%r10, 8(rp)
215	mov	%r11, 16(rp)
216	mov	R32(n), R32(%rax)	C n is zero here
217	adc	R32(n), R32(%rax)	C eax = CF
218	FUNC_EXIT()
219	ret
220
C n mod 8 = 1: limb 0 is computed in r11.  With no groups remaining it is
C stored and the function returns at L(x1).  Otherwise up and vp step past
C the limb and rp is biased by -56 so that the 56(rp) store lands on rp[0].
221	ALIGN(16)
222L(1):	mov	(up), %r11
223	ADCSBB	(vp), %r11
224	jrcxz	L(x1)
225	lea	8(up), up
226	lea	8(vp), vp
227	lea	-56(rp), rp
228	jmp	L(e1)
229L(x1):	mov	%r11, (rp)
230	mov	R32(n), R32(%rax)	C n is zero here
231	adc	R32(n), R32(%rax)	C eax = CF
232	FUNC_EXIT()
233	ret
234
C n mod 8 = 2: limbs 0 and 1 are computed in r10 and r11.  With no groups
C remaining they are stored and the function returns at L(x2).  Otherwise up
C and vp step past the 2 limbs and rp is biased by -48 so that the 48(rp) and
C 56(rp) stores land on rp[0] and rp[1].
235	ALIGN(16)
236L(2):	mov	(up), %r10
237	mov	8(up), %r11
238	ADCSBB	(vp), %r10
239	ADCSBB	8(vp), %r11
240	jrcxz	L(x2)
241	lea	16(up), up
242	lea	16(vp), vp
243	lea	-48(rp), rp
244	jmp	L(e2)
245L(x2):	mov	%r10, (rp)
246	mov	%r11, 8(rp)
247	mov	R32(n), R32(%rax)	C n is zero here
248	adc	R32(n), R32(%rax)	C eax = CF
249	FUNC_EXIT()
250	ret
251EPILOGUE()
C Dispatch table indexed by n mod 8; entry i refers to feed-in label L(i).
C Each entry also names the table label itself.  The PIC dispatch code reads
C a 32-bit entry and adds it to the table address, while the non-PIC code
C jumps through 8-byte entries, so JMPENT presumably emits a table-relative
C offset in the first case and an absolute address in the second -- TODO
C confirm against the included m4 definitions.
252	JUMPTABSECT
253	ALIGN(8)
254L(tab):	JMPENT(	L(0), L(tab))
255	JMPENT(	L(1), L(tab))
256	JMPENT(	L(2), L(tab))
257	JMPENT(	L(3), L(tab))
258	JMPENT(	L(4), L(tab))
259	JMPENT(	L(5), L(tab))
260	JMPENT(	L(6), L(tab))
261	JMPENT(	L(7), L(tab))
262