1dnl  AMD K7 mpn_mod_1 -- mpn by limb remainder.
2
3dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 2.1 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public
18dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
19dnl  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
20dnl  Fifth Floor, Boston, MA 02110-1301, USA.
21
22include(`../config.m4')
23
24
25C K7: 17.0 cycles/limb.
26
27
28C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
29C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
30C                       mp_limb_t carry);
31C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
32C                             mp_limb_t inverse);
33C
C The code here is the same as mpn_divrem_1, but with the quotient
C discarded.  See mpn/x86/k7/mmx/divrem_1.asm for some comments.
36
37
dnl  MUL_THRESHOLD is the size at which the multiply by inverse method is
dnl  used, rather than plain "divl"s.  Minimum value 2.
dnl
dnl  The inverse takes about 50 cycles to calculate, but after that the
dnl  multiply is 17 c/l versus division at 41 c/l.
dnl
dnl  Using mul or div is about the same speed at 3 limbs, so the threshold
dnl  is set to 4 to get the smaller div code used at 3.

deflit(MUL_THRESHOLD, 4)


dnl  Incoming stack parameters.  PARAM_INVERSE and PARAM_CARRY share
dnl  offset 16: mpn_preinv_mod_1 and mpn_mod_1c are separate entry points
dnl  and each has only one fourth argument.
defframe(PARAM_INVERSE,16)  dnl mpn_preinv_mod_1
defframe(PARAM_CARRY,  16)  dnl mpn_mod_1c
defframe(PARAM_DIVISOR,12)
defframe(PARAM_SIZE,    8)
defframe(PARAM_SRC,     4)

dnl  Local frame slots (negative offsets, combined with the m4 FRAME
dnl  tracking): first the four saved callee-preserved registers, then the
dnl  variables used by the multiply-by-inverse code.
defframe(SAVE_EBX,    -4)
defframe(SAVE_ESI,    -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)

defframe(VAR_NORM,    -20)  dnl normalization shift count l
defframe(VAR_INVERSE, -24)  dnl inverse of the normalized divisor
defframe(VAR_SRC_STOP,-28)  dnl NOTE(review): not referenced in this file

dnl  4 register saves + 3 variables, 4 bytes each.
deflit(STACK_SPACE, 28)
66
	TEXT

	ALIGN(32)

C mpn_preinv_mod_1: entry point for callers with a precomputed inverse of
C the divisor.  VAR_NORM is set to 0, i.e. the divisor is used unshifted,
C so presumably the divisor must already be normalized (high bit set) --
C confirm against the mpn_preinv_mod_1 callers.  The high limb is reduced
C mod the divisor here, then control falls into the shared
C multiply-by-inverse code according to how many limbs remain.

PROLOGUE(mpn_preinv_mod_1)
deflit(`FRAME',0)
	movl	PARAM_SRC, %ecx
	movl	PARAM_SIZE, %eax
	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edi, SAVE_EDI
	movl	PARAM_INVERSE, %edx

	movl	%esi, SAVE_ESI
	movl	-4(%ecx,%eax,4), %edi		C src high limb
	leal	-16(%ecx,%eax,4), %ecx		C &src[size-4]

	movl	%ebx, SAVE_EBX
	movl	PARAM_INVERSE, %edx		C NOTE(review): redundant,
						C %edx already holds this
						C from the load above

	movl	$0, VAR_NORM			C l==0

	C n2 = high limb mod divisor, computed by a single conditional
	C subtract (high < 2*divisor when the divisor is normalized).
	movl	%edi, %esi
	subl	%ebp, %edi			C high-divisor

	cmovc(	%esi, %edi)			C restore if underflow
	decl	%eax
	jz	L(done_edi)			C size==1, high-divisor only

	movl	8(%ecx), %esi			C src second high limb
	movl	%edx, VAR_INVERSE

	movl	$32, %ebx			C 32-l
	decl	%eax
	jz	L(inverse_one_left)		C size==2, one divide

	movd	%ebx, %mm7			C 32-l
	decl	%eax
	jz	L(inverse_two_left)		C size==3, two divides

	jmp	L(inverse_top)			C size>=4


C size==1 exit: the remainder is the conditionally-subtracted high limb in
C %edi.  %ebx was saved but never clobbered on this path, so no restore.
L(done_edi):
	movl	SAVE_ESI, %esi
	movl	SAVE_EBP, %ebp
	movl	%edi, %eax

	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()
123
124
	ALIGN(32)

C mpn_mod_1c: same as mpn_mod_1 but with a caller-supplied initial carry
C (partial remainder).  Just load the carry and join the common code.

PROLOGUE(mpn_mod_1c)
deflit(`FRAME',0)
	movl	PARAM_CARRY, %edx	C initial carry -> edx
	movl	PARAM_SIZE, %ecx
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi
	jmp	L(start_1c)		C edx=carry ecx=size esi=src ebp=divisor

EPILOGUE()
141
142
	ALIGN(32)

C mpn_mod_1: remainder of {src,size} divided by divisor.  A size of 0
C returns 0.  If the high limb is already less than the divisor it becomes
C the initial carry and one division is skipped.  Small sizes use plain
C divl; sizes >= MUL_THRESHOLD use the multiply-by-inverse loop below.

PROLOGUE(mpn_mod_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	$0, %edx		C initial carry (if can't skip a div)
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	orl	%ecx, %ecx		C size==0, remainder 0
	jz	L(divide_done)

	movl	-4(%esi,%ecx,4), %eax	C src high limb

	cmpl	%ebp, %eax		C carry flag if high<divisor

	cmovc(	%eax, %edx)		C src high limb as initial carry
	sbbl	$0, %ecx		C size-1 to skip one div
	jz	L(divide_done)


	ALIGN(16)
L(start_1c):
	C Common entry, also reached from mpn_mod_1c with its carry.
	C eax
	C ebx
	C ecx	size
	C edx	carry
	C esi	src
	C edi
	C ebp	divisor

	cmpl	$MUL_THRESHOLD, %ecx
	jae	L(mul_by_inverse)



C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations,
C but it's already fast and compact, and there's nothing to gain by
C expanding it out.
C
C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp.

	orl	%ecx, %ecx
	jz	L(divide_done)


L(divide_top):
	C eax	scratch (quotient)
	C ebx
	C ecx	counter, limbs, decrementing
	C edx	scratch (remainder)
	C esi	src
	C edi
	C ebp

	movl	-4(%esi,%ecx,4), %eax

	divl	PARAM_DIVISOR		C edx = edx:eax mod divisor

	decl	%ecx
	jnz	L(divide_top)


L(divide_done):
	movl	SAVE_ESI, %esi
	movl	SAVE_EBP, %ebp
	addl	$STACK_SPACE, %esp

	movl	%edx, %eax		C return the remainder

	ret



C -----------------------------------------------------------------------------
C Multiply-by-inverse method: normalize the divisor, compute the inverse
C m = floor((b*(b-d)-1)/d) with one divl, then replace each per-limb divl
C by a mull against m.

L(mul_by_inverse):
	C eax
	C ebx
	C ecx	size
	C edx	carry
	C esi	src
	C edi
	C ebp	divisor

	bsrl	%ebp, %eax		C 31-l

	movl	%ebx, SAVE_EBX
	movl	%ecx, %ebx		C size

	movl	%edi, SAVE_EDI
	movl	$31, %ecx

	movl	%edx, %edi		C carry
	movl	$-1, %edx

	C

	xorl	%eax, %ecx		C l
	incl	%eax			C 32-l

	shll	%cl, %ebp		C d normalized
	movl	%ecx, VAR_NORM

	movd	%eax, %mm7		C 32-l

	movl	$-1, %eax
	subl	%ebp, %edx		C (b-d)-1 so  edx:eax = b*(b-d)-1

	divl	%ebp			C floor (b*(b-d)-1) / d

	C

	movl	%eax, VAR_INVERSE
	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]

	movl	8(%eax), %esi		C src high limb
	movl	4(%eax), %edx		C src second highest limb

	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

	shldl(	%cl, %edx, %esi)	C n10 = high,second << l

	movl	%eax, %ecx		C &src[size-3]


ifelse(MUL_THRESHOLD,2,`
	cmpl	$2, %ebx
	je	L(inverse_two_left)
')


C The dependent chain here is the same as in mpn_divrem_1, but a few
C instructions are saved by not needing to store the quotient limbs.
C Unfortunately this doesn't get the code down to the theoretical 16 c/l.
C
C There's four dummy instructions in the loop, all of which are necessary
C for the claimed 17 c/l.  It's a 1 to 3 cycle slowdown if any are removed,
C or changed from load to store or vice versa.  They're not completely
C random, since they correspond to what mpn_divrem_1 has, but there's no
C obvious reason why they're necessary.  Presumably they induce something
C good in the out of order execution, perhaps through some load/store
C ordering and/or decoding effects.
C
C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1.  On
C special data that comes out as q1==0xFFFFFFFF always, the loop runs at
C about 13.5 c/l.

	ALIGN(32)
L(inverse_top):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	src pointer, decrementing
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm0	scratch (src qword)
	C mm7	rshift for normalization

	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
	movl	%edi, %eax         C n2
	movl	PARAM_SIZE, %ebx   C dummy

	leal	(%ebp,%esi), %ebx
	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
	sbbl	$-1, %eax          C n2+n1

	mull	VAR_INVERSE        C m*(n2+n1)

	movq	(%ecx), %mm0       C next src limb and the one below it
	subl	$4, %ecx

	movl	%ecx, PARAM_SIZE   C dummy

	C

	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx      C n2+1
	movl	%ebp, %eax	   C d

	C

	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
	jz	L(q1_ff)
	nop			   C dummy

	mull	%ebx		   C (q1+1)*d

	psrlq	%mm7, %mm0
	leal	(%ecx), %ecx	   C dummy

	C

	C

	subl	%eax, %esi	   C low  n - (q1+1)*d
	movl	PARAM_SRC, %eax

	C

	sbbl	%edx, %edi	   C high n - (q1+1)*d, 0 or -1
	movl	%esi, %edi	   C remainder -> n2
	leal	(%ebp,%esi), %edx

	movd	%mm0, %esi

	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
	cmpl	%eax, %ecx
	jae	L(inverse_top)


L(inverse_loop_done):
	C Falls through into the two-limb tail below.


C -----------------------------------------------------------------------------

L(inverse_two_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	&src[-1]
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm0	scratch (src dword)
	C mm7	rshift

	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
	movl	%edi, %eax         C n2

	leal	(%ebp,%esi), %ebx
	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
	sbbl	$-1, %eax          C n2+n1

	mull	VAR_INVERSE        C m*(n2+n1)

	movd	4(%ecx), %mm0	   C src low limb

	C

	C

	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx      C n2+1
	movl	%ebp, %eax	   C d

	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx	   C q1 if q1+1 wrapped to 0 (q1==0xFFFFFFFF)

	mull	%ebx		   C (q1+1)*d

	psllq	$32, %mm0

	psrlq	%mm7, %mm0

	C

	subl	%eax, %esi

	C

	sbbl	%edx, %edi	   C n - (q1+1)*d
	movl	%esi, %edi	   C remainder -> n2
	leal	(%ebp,%esi), %edx

	movd	%mm0, %esi

	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1


C Final division step; also restores registers and denormalizes the
C remainder before returning.
L(inverse_one_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm0	src limb, shifted
	C mm7	rshift

	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
	movl	%edi, %eax         C n2

	leal	(%ebp,%esi), %ebx
	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
	sbbl	$-1, %eax          C n2+n1

	mull	VAR_INVERSE        C m*(n2+n1)

	movl	VAR_NORM, %ecx     C for final denorm

	C

	C

	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx      C n2+1
	movl	%ebp, %eax	   C d

	C

	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx	   C q1 if q1+1 wrapped to 0 (q1==0xFFFFFFFF)

	mull	%ebx		   C (q1+1)*d

	movl	SAVE_EBX, %ebx

	C

	C

	subl	%eax, %esi

	movl	%esi, %eax	   C remainder
	movl	SAVE_ESI, %esi

	sbbl	%edx, %edi	   C n - (q1+1)*d
	leal	(%ebp,%eax), %edx
	movl	SAVE_EBP, %ebp

	cmovc(	%edx, %eax)	   C n - q1*d if underflow from using q1+1
	movl	SAVE_EDI, %edi

	shrl	%cl, %eax	   C denorm remainder
	addl	$STACK_SPACE, %esp
	emms

	ret


C -----------------------------------------------------------------------------
C
C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d

L(q1_ff):
	C eax	(divisor)
	C ebx	(q1+1 == 0)
	C ecx	src pointer
	C edx
	C esi	n10
	C edi	(n2)
	C ebp	divisor

	movl	PARAM_SRC, %edx
	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
	psrlq	%mm7, %mm0

	movd	%mm0, %esi		C next n10

	cmpl	%edx, %ecx
	jae	L(inverse_top)
	jmp	L(inverse_loop_done)

EPILOGUE()
512