1dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
2
3dnl  Copyright 1999-2002, 2005 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C			    cycles/limb
35C P5
36C P6 model 0-8,10-12		 6.44
37C P6 model 9  (Banias)		 6.15
38C P6 model 13 (Dothan)		 6.11
39C P4 model 0  (Willamette)
40C P4 model 1  (?)
41C P4 model 2  (Northwood)
42C P4 model 3  (Prescott)
43C P4 model 4  (Nocona)
44C AMD K6
45C AMD K7
46C AMD K8
47
48
49dnl  P6 UNROLL_COUNT cycles/limb
50dnl          8           6.7
51dnl         16           6.35
52dnl         32           6.3
53dnl         64           6.3
54dnl  Maximum possible with the current code is 64.
dnl
dnl  UNROLL_COUNT is the number of limbs handled by one pass of the unrolled
dnl  loop at L(top): the forloop there emits UNROLL_COUNT/CHUNK_COUNT chunks
dnl  of CHUNK_COUNT limbs each.  UNROLL_BYTES, UNROLL_LOG2 and UNROLL_MASK are
dnl  used below but are not defined in this file; presumably they are derived
dnl  from UNROLL_COUNT by the included m4 definitions (TODO confirm).
55
56deflit(UNROLL_COUNT, 16)
57
58
dnl  Select the operation at build time.  One of OPERATION_addmul_1 or
dnl  OPERATION_submul_1 must be defined (addmul is tested first).  The choice
dnl  sets the instruction that combines each product limb with dst (addl or
dnl  subl), the names of the two entry points, and the words substituted into
dnl  the description comment below.  If neither is defined the problem is
dnl  reported through m4_error.
59ifdef(`OPERATION_addmul_1', `
60	define(M4_inst,        addl)
61	define(M4_function_1,  mpn_addmul_1)
62	define(M4_function_1c, mpn_addmul_1c)
63	define(M4_description, add it to)
64	define(M4_desc_retval, carry)
65',`ifdef(`OPERATION_submul_1', `
66	define(M4_inst,        subl)
67	define(M4_function_1,  mpn_submul_1)
68	define(M4_function_1c, mpn_submul_1c)
69	define(M4_description, subtract it from)
70	define(M4_desc_retval, borrow)
71',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
72')')')
73
dnl  Lists all four entry points this source can provide.
dnl  NOTE(review): MULFUNC_PROLOGUE is defined elsewhere; presumably the
dnl  build system uses this list to map function names to this file -- confirm.
74MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
75
76
77C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
78C                            mp_limb_t mult);
79C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
80C                             mp_limb_t mult, mp_limb_t carry);
81C
82C Calculate src,size multiplied by mult and M4_description dst,size.
83C Return the M4_desc_retval limb from the top of the result.
84C
85C This code is pretty much the same as the K6 code.  The unrolled loop is
86C the same, but there are a few scheduling tweaks in the setups and the
87C simple loop.
88C
89C A number of variations have been tried for the unrolled loop, with one or
90C two carries, and with loads scheduled earlier, but nothing faster than 6
91C cycles/limb has been found.
92
dnl  Sizes of at least UNROLL_THRESHOLD limbs take the unrolled loop (see the
dnl  cmpl/jae just before L(simple)); smaller sizes use the simple loop.  The
dnl  PIC and non-PIC values are currently identical; presumably the ifdef is
dnl  kept so that the two builds can be tuned separately.
93ifdef(`PIC',`
94deflit(UNROLL_THRESHOLD, 5)
95',`
96deflit(UNROLL_THRESHOLD, 5)
97')
98
dnl  Stack offsets of the arguments, matching the order of the C prototypes
dnl  above: dst, src, size, mult, and carry (read by the 1c entry point only).
dnl  NOTE(review): defframe is defined elsewhere; presumably the offsets are
dnl  relative to esp at function entry and are adjusted by the current FRAME
dnl  value as registers are pushed -- confirm in the x86 m4 definitions.
99defframe(PARAM_CARRY,     20)
100defframe(PARAM_MULTIPLIER,16)
101defframe(PARAM_SIZE,      12)
102defframe(PARAM_SRC,       8)
103defframe(PARAM_DST,       4)
104
105	TEXT
106	ALIGN(32)
107
C M4_function_1c: entry point taking an explicit initial carry limb.
C It saves ebx, loads the carry parameter into it, and joins the common
C code at L(start_nc) inside M4_function_1.  That code is entered with ebx
C already pushed (FRAME is 4) and ebx holding the initial carry.
108PROLOGUE(M4_function_1c)
109	pushl	%ebx
110deflit(`FRAME',4)
111	movl	PARAM_CARRY, %ebx
112	jmp	L(start_nc)
113EPILOGUE()
114
C M4_function_1: entry point with no carry parameter; the initial carry is
C zero.  It also holds the code shared with M4_function_1c, which jumps in
C at L(start_nc).
C
C On reaching L(start_nc), ebx has been pushed (FRAME is 4) and holds the
C initial carry limb.  esi, edi and ebp are pushed here as well, and all
C four registers are popped again on both return paths.  The result limb is
C returned in eax.
115PROLOGUE(M4_function_1)
116	push	%ebx
117deflit(`FRAME',4)
118	xorl	%ebx, %ebx	C initial carry
119
120L(start_nc):
121	movl	PARAM_SIZE, %ecx
122	pushl	%esi
123deflit(`FRAME',8)
124
125	movl	PARAM_SRC, %esi
126	pushl	%edi
127deflit(`FRAME',12)
128
129	movl	PARAM_DST, %edi
130	pushl	%ebp
131deflit(`FRAME',16)
132	cmpl	$UNROLL_THRESHOLD, %ecx
133
134	movl	PARAM_MULTIPLIER, %ebp	C movl leaves the cmpl flags intact
135	jae	L(unroll)		C size >= UNROLL_THRESHOLD, unsigned
136
137
138	C simple loop
139	C this is offset 0x22, so close enough to aligned
	C
	C One limb per iteration.  The low limb of src[i]*mult + carry is
	C added to or subtracted from dst[i], and the new carry is the high
	C limb plus the carry or borrow out of that dst update.
	C There is no check for a size of zero: ecx is decremented before
	C it is tested.
140L(simple):
141	C eax	scratch
142	C ebx	carry
143	C ecx	counter
144	C edx	scratch
145	C esi	src
146	C edi	dst
147	C ebp	multiplier
148
149	movl	(%esi), %eax
150	addl	$4, %edi		C dst advanced early, hence -4 below
151
152	mull	%ebp			C edx:eax = src limb * multiplier
153
154	addl	%ebx, %eax		C add incoming carry to the low limb
155	adcl	$0, %edx
156
157	M4_inst	%eax, -4(%edi)		C addl or subl into the dst limb
158	movl	%edx, %ebx
159
160	adcl	$0, %ebx		C carry or borrow out of the dst update
161	decl	%ecx
162
163	leal	4(%esi), %esi		C leal advances src without touching flags
164	jnz	L(simple)		C tests the zero flag left by decl
165
166
167	popl	%ebp
168	popl	%edi
169
170	popl	%esi
171	movl	%ebx, %eax		C return the final carry limb
172
173	popl	%ebx
174	ret
175
176
177
178C------------------------------------------------------------------------------
179C VAR_JUMP holds the computed jump temporarily because there's not enough
180C registers when doing the mul for the initial two carry limbs.
181C
182C The add/adc for the initial carry in %ebx is necessary only for the
183C mpn_add/submul_1c entry points.  Duplicating the startup code to
184C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
185C idea.
C
C Setup for the unrolled loop.  Let k = (-(size-1)) & UNROLL_MASK.  The
C first pass through the loop body skips its first k limb slots by jumping
C to L(entry) + 15*k, each limb slot being 15 code bytes.  src and dst are
C moved back by k limbs so the fixed displacements in the loop body still
C address the right limbs; src is also moved forward by one limb because
C the low src limb is multiplied here, before the jump.  The low and high
C limbs of that first product enter the loop as the two carries.
C
C VAR_COUNTER and VAR_JUMP reuse the stack slots of the size and dst
C arguments, whose values have already been loaded into registers.
186
187dnl  overlapping with parameters already fetched
188define(VAR_COUNTER,`PARAM_SIZE')
189define(VAR_JUMP,   `PARAM_DST')
190
191	C this is offset 0x43, so close enough to aligned
192L(unroll):
193	C eax
194	C ebx	initial carry
195	C ecx	size
196	C edx
197	C esi	src
198	C edi	dst
199	C ebp	multiplier
200
201	movl	%ecx, %edx
202	decl	%ecx
203
204	subl	$2, %edx
205	negl	%ecx			C ecx = -(size-1)
206
207	shrl	$UNROLL_LOG2, %edx	C edx = (size-2) >> UNROLL_LOG2
208	andl	$UNROLL_MASK, %ecx	C ecx = k, limb slots to skip
209
210	movl	%edx, VAR_COUNTER	C passes remaining after the first
211	movl	%ecx, %edx
212
213	C 15 code bytes per limb
	C Both variants below leave edx = L(entry) + 16*k - k and ecx = -k.
	C The PIC variant does the same arithmetic in L(pic_calc), using the
	C return address pushed by the call as the runtime address of L(here).
214ifdef(`PIC',`
215	call	L(pic_calc)
216L(here):
217',`
218	shll	$4, %edx
219	negl	%ecx
220
221	leal	L(entry) (%edx,%ecx,1), %edx
222')
223	movl	(%esi), %eax		C src low limb
224
225	movl	%edx, VAR_JUMP		C edx is about to be overwritten by mull
	C esi = src + 4 - 4*k, plus 128 when UNROLL_BYTES is 256
226	leal	ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
227
228	mull	%ebp			C edx:eax = src low limb * multiplier
229
230	addl	%ebx, %eax	C initial carry (from _1c)
231	adcl	$0, %edx
232
233	movl	%edx, %ebx	C high carry
	C edi = dst - 4*k, plus 128 when UNROLL_BYTES is 256
234	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
235
236	movl	VAR_JUMP, %edx
237	testl	$1, %ecx		C is k odd?
238	movl	%eax, %ecx	C low carry
239
	C For odd k the entry point is the second limb of a two limb chunk,
	C where ebx rather than ecx is the value applied to dst, so the two
	C carries are swapped.  The movl above does not disturb the flags
	C set by testl.
240	cmovnz(	%ebx, %ecx)	C high,low carry other way around
241	cmovnz(	%eax, %ebx)
242
243	jmp	*%edx
244
245
C PIC variant of the computed jump address, called from the unroll setup
C with edx = ecx = k.  Returns with ecx = -k and edx = L(entry) + 15*k.
C The address is formed from the link time constant L(entry)-L(here) plus
C the return address on the stack, which is the runtime address of L(here).
C NOTE(review): ret_internal is defined elsewhere; presumably it is a plain
C return to the caller -- confirm.
246ifdef(`PIC',`
247L(pic_calc):
248	shll	$4, %edx
249	negl	%ecx
250
251	C See mpn/x86/README about old gas bugs
252	leal	(%edx,%ecx,1), %edx
253	addl	$L(entry)-L(here), %edx
254
255	addl	(%esp), %edx
256
257	ret_internal
258')
259
260
261C -----------------------------------------------------------
262	ALIGN(32)
263L(top):
264deflit(`FRAME',16)
265	C eax	scratch
266	C ebx	carry hi
267	C ecx	carry lo
268	C edx	scratch
269	C esi	src
270	C edi	dst
271	C ebp	multiplier
272	C
273	C VAR_COUNTER	loop counter
274	C
275	C 15 code bytes per limb
	C
	C Each forloop chunk below handles two limbs.  For the first limb,
	C ecx is applied to dst with M4_inst, then the new low product limb
	C plus the carry or borrow is added into ebx, and the new high limb
	C plus carry becomes ecx.  The second limb is the same with ebx and
	C ecx exchanged, so the register roles are back in place at the end
	C of the chunk.
	C
	C When UNROLL_BYTES is 256 the displacements are offset by -128,
	C matching the +128 applied to esi and edi in the setup code;
	C presumably this keeps every displacement within a signed byte.
	C
	C NOTE(review): Zdisp is defined elsewhere; presumably it forces an
	C explicit zero displacement so the first limb slot has the same
	C code size as the others, as the computed jump requires.
	C
	C The first pass enters at or after L(entry), skipping the addl
	C below.  Because dst is advanced at the top rather than the bottom,
	C edi still addresses the block just processed when the loop exits.
276
277	addl	$UNROLL_BYTES, %edi
278
279L(entry):
280deflit(CHUNK_COUNT,2)
281forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
282	deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
283	deflit(`disp1', eval(disp0 + 4))
284
285Zdisp(	movl,	disp0,(%esi), %eax)
286	mull	%ebp
287Zdisp(	M4_inst,%ecx, disp0,(%edi))
288	adcl	%eax, %ebx
289	movl	%edx, %ecx
290	adcl	$0, %ecx
291
292	movl	disp1(%esi), %eax
293	mull	%ebp
294	M4_inst	%ebx, disp1(%edi)
295	adcl	%eax, %ecx
296	movl	%edx, %ebx
297	adcl	$0, %ebx
298')
299
300	decl	VAR_COUNTER
301	leal	UNROLL_BYTES(%esi), %esi	C leal keeps the flags from decl
302
303	jns	L(top)			C loop while the counter is non-negative
304
305
306deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
307
	C edi was not advanced after the last pass, so the dst limb following
	C the last chunk is at UNROLL_BYTES, less the 128 bias, from edi.
308	M4_inst	%ecx, disp0(%edi)	C apply the pending low carry
309	movl	%ebx, %eax
310
311	popl	%ebp
312	popl	%edi
313
314	popl	%esi
315	popl	%ebx
316	adcl	$0, %eax		C movl and popl kept the carry from M4_inst
317
318	ret
319
320EPILOGUE()
321