1dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
2
3dnl  Copyright 1999, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 2.1 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public
18dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
19dnl  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
20dnl  Fifth Floor, Boston, MA 02110-1301, USA.
21
22include(`../config.m4')
23
24
25C                           cycles/limb
26C P5:
27C P6 model 0-8,10-12             6.44
28C P6 model 9  (Banias)
29C P6 model 13 (Dothan)           6.11
30C P4 model 0  (Willamette)
31C P4 model 1  (?)
32C P4 model 2  (Northwood)
33C P4 model 3  (Prescott)
34C P4 model 4  (Nocona)
35C K6:
36C K7:
37C K8:
38
39
40dnl  P6 UNROLL_COUNT cycles/limb
41dnl          8           6.7
42dnl         16           6.35
43dnl         32           6.3
44dnl         64           6.3
45dnl  Maximum possible with the current code is 64.
46
dnl  Number of limbs handled by one pass of the unrolled loop.  UNROLL_LOG2,
dnl  UNROLL_MASK and UNROLL_BYTES, used further down, are not defined in this
dnl  file; presumably they are derived from this value by the included m4
dnl  definitions -- TODO confirm.
47deflit(UNROLL_COUNT, 16)
48
dnl  NOTE(review): OPERATION_addmul_1 is defined unconditionally here, so the
dnl  ifdef below always takes its first branch; the submul_1 definitions and
dnl  the m4_error fallback are never used in this copy of the file.
49define(`OPERATION_addmul_1',1)
50
dnl  Select the instruction applied to dst (M4_inst), the names of the two
dnl  entry points, and the words substituted into the description below.
51ifdef(`OPERATION_addmul_1', `
52	define(M4_inst,        addl)
53	define(M4_function_1,  mpn_addmul_1)
54	define(M4_function_1c, mpn_addmul_1c)
55	define(M4_description, add it to)
56	define(M4_desc_retval, carry)
57',`ifdef(`OPERATION_submul_1', `
58	define(M4_inst,        subl)
59	define(M4_function_1,  mpn_submul_1)
60	define(M4_function_1c, mpn_submul_1c)
61	define(M4_description, subtract it from)
62	define(M4_desc_retval, borrow)
63',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
64')')')
65
dnl  NOTE(review): only the addmul entry points are listed here, which is
dnl  consistent with the operation being fixed to addmul_1 above.  The exact
dnl  role of MULFUNC_PROLOGUE is defined outside this file -- TODO confirm.
66MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c)
67
68
69C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
70C                            mp_limb_t mult);
71C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
72C                             mp_limb_t mult, mp_limb_t carry);
73C
74C Calculate src,size multiplied by mult and M4_description dst,size.
75C Return the M4_desc_retval limb from the top of the result.
76C
77C This code is pretty much the same as the K6 code.  The unrolled loop is
78C the same, but there are just a few scheduling tweaks in the setups and the
79C simple loop.
80C
81C A number of variations have been tried for the unrolled loop, with one or
82C two carries, and with loads scheduled earlier, but nothing faster than 6
83C cycles/limb has been found.
84
dnl  Sizes at or above UNROLL_THRESHOLD are handled by the unrolled loop and
dnl  smaller sizes by the simple loop (see the cmpl and jae in the entry
dnl  code).  The PIC and non-PIC builds currently use the same value.
85ifdef(`PIC',`
86deflit(UNROLL_THRESHOLD, 5)
87',`
88deflit(UNROLL_THRESHOLD, 5)
89')
90
dnl  Stack offsets of the arguments, in the order of the C prototypes above
dnl  (dst, src, size, mult, carry).  Presumably defframe adjusts these by the
dnl  current FRAME value so they stay valid after pushes -- TODO confirm.
91defframe(PARAM_CARRY,     20)
92defframe(PARAM_MULTIPLIER,16)
93defframe(PARAM_SIZE,      12)
94defframe(PARAM_SRC,       8)
95defframe(PARAM_DST,       4)
96
97	TEXT
98	ALIGN(32)
99
C M4_function_1c entry point: the same operation as M4_function_1, but the
C initial carry limb is taken from PARAM_CARRY instead of being zero.
C Only ebx is set up here; everything else is done by the code shared with
C M4_function_1, reached through L(start_nc).
100PROLOGUE(M4_function_1c)
	C ebx is saved here and restored before each ret in the shared code
101	pushl	%ebx
102deflit(`FRAME',4)
	C ebx = initial carry supplied by the caller
103	movl	PARAM_CARRY, %ebx
104	jmp	L(start_nc)
105EPILOGUE()
106
C M4_function_1 entry point: multiply src,size by the multiplier and
C M4_description dst,size, starting from a zero carry.  The M4_function_1c
C entry jumps in at L(start_nc) with its own initial carry already in ebx.
C The M4_desc_retval limb is returned in eax.
107PROLOGUE(M4_function_1)
108	push	%ebx
109deflit(`FRAME',4)
110	xorl	%ebx, %ebx	C initial carry
111
112L(start_nc):
	C Load the parameters, interleaved with saving esi, edi and ebp.
	C FRAME is updated after each push; presumably the PARAM_ offsets
	C are evaluated relative to it -- TODO confirm against defframe.
113	movl	PARAM_SIZE, %ecx
114	pushl	%esi
115deflit(`FRAME',8)
116
117	movl	PARAM_SRC, %esi
118	pushl	%edi
119deflit(`FRAME',12)
120
121	movl	PARAM_DST, %edi
122	pushl	%ebp
123deflit(`FRAME',16)
124	cmpl	$UNROLL_THRESHOLD, %ecx
125
	C movl leaves the flags from the cmpl intact, so the jae goes to
	C the unrolled code when size >= UNROLL_THRESHOLD (unsigned compare)
126	movl	PARAM_MULTIPLIER, %ebp
127	jae	L(unroll)
128
129
130	C simple loop
131	C this is offset 0x22, so close enough to aligned
132L(simple):
133	C eax	scratch
134	C ebx	carry
135	C ecx	counter
136	C edx	scratch
137	C esi	src
138	C edi	dst
139	C ebp	multiplier
140
	C fetch the next src limb; dst is advanced early, so the dst limb
	C for this iteration is addressed as -4(%edi) below
141	movl	(%esi), %eax
142	addl	$4, %edi
143
	C edx:eax = src limb * multiplier
144	mull	%ebp
145
	C add the carry limb from the previous iteration into the product
146	addl	%ebx, %eax
147	adcl	$0, %edx
148
	C apply the low product limb to dst; the high limb becomes the
	C next carry (movl does not change the flags set by M4_inst)
149	M4_inst	%eax, -4(%edi)
150	movl	%edx, %ebx
151
	C fold the carry or borrow out of the dst update into the carry limb
152	adcl	$0, %ebx
153	decl	%ecx
154
	C leal does not touch the flags, so the jnz tests the decl result
155	leal	4(%esi), %esi
156	jnz	L(simple)
157
158
	C restore the saved registers in reverse push order and return
	C the final carry limb in eax
159	popl	%ebp
160	popl	%edi
161
162	popl	%esi
163	movl	%ebx, %eax
164
165	popl	%ebx
166	ret
167
168
169
170C------------------------------------------------------------------------------
171C VAR_JUMP holds the computed jump temporarily because there's not enough
172C registers when doing the mul for the initial two carry limbs.
173C
174C The add/adc for the initial carry in %ebx is necessary only for the
175C mpn_add/submul_1c entry points.  Duplicating the startup code to
176C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
177C idea.
178
179dnl  overlapping with parameters already fetched
dnl  PARAM_SIZE and PARAM_DST have already been loaded into ecx and edi by
dnl  the entry code, so their stack slots are reused as scratch variables.
180define(VAR_COUNTER,`PARAM_SIZE')
181define(VAR_JUMP,   `PARAM_DST')
182
183	C this is offset 0x43, so close enough to aligned
184L(unroll):
185	C eax
186	C ebx	initial carry
187	C ecx	size
188	C edx
189	C esi	src
190	C edi	dst
191	C ebp
192
	C Compute two values from size:
	C   edx = (size-2) >> UNROLL_LOG2, stored as the loop counter
	C   ecx = -(size-1) & UNROLL_MASK, call it n
	C n scaled by the code size per limb gives the offset from L(entry)
	C at which the first pass starts; presumably n is the number of
	C limbs of unrolled code skipped on that pass -- TODO confirm
	C against the definition of UNROLL_MASK.
193	movl	%ecx, %edx
194	decl	%ecx
195
196	subl	$2, %edx
197	negl	%ecx
198
199	shrl	$UNROLL_LOG2, %edx
200	andl	$UNROLL_MASK, %ecx
201
202	movl	%edx, VAR_COUNTER
203	movl	%ecx, %edx
204
	C Both variants below leave edx = address of L(entry) + 16*n - n,
	C that is L(entry) + 15*n, and ecx = -n.  The PIC variant gets the
	C run-time address through the call to L(pic_calc).
205	C 15 code bytes per limb
206ifdef(`PIC',`
207	call	L(pic_calc)
208L(here):
209',`
210	shll	$4, %edx
211	negl	%ecx
212
213	leal	L(entry) (%edx,%ecx,1), %edx
214')
215	movl	(%esi), %eax		C src low limb
216
	C The jump target is parked in VAR_JUMP because the mull below
	C overwrites edx.  esi is moved to src + 4 - 4*n, one limb ahead
	C since the first src limb has just been loaded, with an extra
	C +128 bias when UNROLL_BYTES is 256.
217	movl	%edx, VAR_JUMP
218	leal	ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
219
	C edx:eax = first src limb * multiplier
220	mull	%ebp
221
222	addl	%ebx, %eax	C initial carry (from _1c)
223	adcl	$0, %edx
224
	C edi is moved to dst - 4*n, with the same optional +128 bias
225	movl	%edx, %ebx	C high carry
226	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
227
	C testl sets ZF from the low bit of ecx (n odd or even) before
	C ecx is reused for the low carry; movl does not change the flags
228	movl	VAR_JUMP, %edx
229	testl	$1, %ecx
230	movl	%eax, %ecx	C low carry
231
	C The two halves of each unrolled chunk use ebx and ecx in opposite
	C roles, so when n is odd the carries are swapped: ecx = high
	C carry and ebx = low carry.
232	cmovnz(	%ebx, %ecx)	C high,low carry other way around
233	cmovnz(	%eax, %ebx)
234
235	jmp	*%edx
236
237
C PIC helper for the computed jump, called from L(unroll) with edx = ecx = n.
C The call pushes the address of L(here) as the return address, so adding
C (%esp) to the link-time difference L(entry)-L(here) gives the run-time
C address of L(entry).  On return edx = L(entry) + 15*n and ecx = -n, the
C same values the non-PIC code computes inline.
238ifdef(`PIC',`
239L(pic_calc):
240	shll	$4, %edx
241	negl	%ecx
242
243	C See mpn/x86/README about old gas bugs
244	leal	(%edx,%ecx,1), %edx
245	addl	$L(entry)-L(here), %edx
246
247	addl	(%esp), %edx
248
249	ret_internal
250')
251
252
253C -----------------------------------------------------------
C Unrolled loop.  The first pass is entered part way through by the computed
C jump from L(unroll), which targets L(entry) plus 15 bytes per skipped limb.
C Later passes start at L(top).  The code for every limb must therefore
C stay exactly 15 bytes long; presumably Zdisp is used on the first limb so
C that a zero displacement is still encoded -- TODO confirm.
254	ALIGN(32)
255L(top):
256deflit(`FRAME',16)
257	C eax	scratch
258	C ebx	carry hi
259	C ecx	carry lo
260	C edx	scratch
261	C esi	src
262	C edi	dst
263	C ebp	multiplier
264	C
265	C VAR_COUNTER	loop counter
266	C
267	C 15 code bytes per limb
268
	C dst is advanced here at the top and src at the bottom, so the
	C first pass, which enters at or after L(entry), skips this add
269	addl	$UNROLL_BYTES, %edi
270
271L(entry):
C Each forloop iteration emits code for two limbs.  In the first half ecx
C holds the low carry applied to dst and ebx collects the next low limb.
C In the second half the roles of ebx and ecx are exchanged.  The -128 term
C in disp0 applies only when UNROLL_BYTES is 256 and matches the +128 bias
C given to esi and edi in L(unroll).
272deflit(CHUNK_COUNT,2)
273forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
274	deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
275	deflit(`disp1', eval(disp0 + 4))
276
277Zdisp(	movl,	disp0,(%esi), %eax)
278	mull	%ebp
279Zdisp(	M4_inst,%ecx, disp0,(%edi))
280	adcl	%eax, %ebx
281	movl	%edx, %ecx
282	adcl	$0, %ecx
283
284	movl	disp1(%esi), %eax
285	mull	%ebp
286	M4_inst	%ebx, disp1(%edi)
287	adcl	%eax, %ecx
288	movl	%edx, %ebx
289	adcl	$0, %ebx
290')
291
	C leal does not touch the flags, so the jns tests the result of
	C the decl: loop again while VAR_COUNTER is still non-negative
292	decl	VAR_COUNTER
293	leal	UNROLL_BYTES(%esi), %esi
294
295	jns	L(top)
296
297
C edi was not advanced for the final pass, so the dst limb following the
C last one handled in the loop is at UNROLL_BYTES(%edi), less 128 when the
C bias is in use.
298deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
299
	C apply the last low carry limb to dst; the return value is the
	C high carry limb plus the carry or borrow out of this update
300	M4_inst	%ecx, disp0(%edi)
301	movl	%ebx, %eax
302
303	popl	%ebp
304	popl	%edi
305
	C movl and popl leave the flags alone, so this adcl still sees the
	C carry flag produced by the M4_inst above
306	popl	%esi
307	popl	%ebx
308	adcl	$0, %eax
309
310	ret
311
312EPILOGUE()
313