1dnl  x86-32 mpn_mod_1_1p, requiring cmov.
2
3dnl  Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
4
5dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C			    cycles/limb
36C P5				 ?
37C P6 model 0-8,10-12		 ?
38C P6 model 9  (Banias)		 ?
39C P6 model 13 (Dothan)		 ?
40C P4 model 0  (Willamette)	 ?
41C P4 model 1  (?)		 ?
42C P4 model 2  (Northwood)	 ?
43C P4 model 3  (Prescott)	 ?
44C P4 model 4  (Nocona)		 ?
45C AMD K6			 ?
46C AMD K7			 7
47C AMD K8			 ?
48
49define(`B2mb', `%ebx')  C B2modb - b, computed once before the loop
50define(`r0', `%esi')  C low limb of the running value
51define(`r2', `%ebp')  C carry mask or middle limb in the loop, later the pre pointer
52define(`t0', `%edi')  C scratch
53define(`ap', `%ecx')  C Also shift count
54
55C Stack frame
56C	pre	36(%esp)
57C	b	32(%esp)
58C	n	28(%esp)
59C	ap	24(%esp)
60C	return	20(%esp)
61C	%ebp	16(%esp)
62C	%edi	12(%esp)
63C	%esi	8(%esp)
64C	%ebx	4(%esp)
65C	B2modb	(%esp)
66
C Stack operands of mpn_mod_1_1p.  The offsets are valid only after its
C prologue has pushed the four saved registers and the copy of B2modb.
67define(`B2modb', `(%esp)')
68define(`n', `28(%esp)')
69define(`b', `32(%esp)')
70define(`pre', `36(%esp)')
71
72C mp_limb_t
73C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
74C
75C The pre array contains bi, cnt, B1modb, B2modb
76C Note: This implementation needs B1modb only when cnt > 0
77
78ASM_START()
79	TEXT
80	ALIGN(8)
81PROLOGUE(mpn_mod_1_1p)
	C Reduces the n-limb operand at ap modulo b with the help of the values
	C precomputed in pre[]; the result is returned in %eax.  B = 2^32 below.
	C
	C Outline:
	C   1. Prologue: save registers, copy B2modb = pre[3] to the stack.
	C   2. L(first), L(top), L(mid): repeatedly multiply the top limb by
	C      B2modb and add the product into the two limbs below it, until only
	C      a high limb in eax and a low limb in r0 remain.
	C   3. L(reduce_two): if cnt is nonzero, fold once more with B1modb and
	C      shift left by cnt; otherwise subtract b once if possible.
	C   4. L(udiv): final division step using the inverse bi = pre[0], then a
	C      right shift by cnt.
	C
	C NOTE(review): the operand is shifted left by cnt and the result right
	C by cnt, which suggests that b is expected to be passed already shifted
	C left by cnt - confirm against the callers.
82	push	%ebp
83	push	%edi
84	push	%esi
85	push	%ebx
	C Four registers pushed so far, so the pre argument is at 32(%esp) here.
86	mov	32(%esp), %ebp		C pre[]
87
88	mov	12(%ebp), %eax		C B2modb
89	push	%eax			C Put it on stack
	C From here on the stack matches the frame layout documented above the
	C operand defines, so the n, b, pre and B2modb macros are valid.
90
91	mov	n, %edx
92	mov	24(%esp), ap
93
	C Point ap just past the most significant limb and fetch that limb.
94	lea	(ap, %edx, 4), ap
95	mov	-4(ap), %eax
	C n >= 3 (unsigned compare) takes the general path at L(first).
96	cmp	$3, %edx
97	jnc	L(first)
	C n < 3: load the limb below the top one and go straight to the final
	C reduction with eax as high limb and r0 as low limb.
	C NOTE(review): for n < 2 this load reads below the start of the operand,
	C so callers presumably guarantee n >= 2 - confirm.
98	mov	-8(ap), r0
99	jmp	L(reduce_two)
100
101L(first):
102	C First iteration, no r2
	C edx:eax = top limb * B2modb, added into the two limbs below it.
103	mull	B2modb
104	mov	-12(ap), r0
105	add	%eax, r0
106	mov	-8(ap), %eax
107	adc	%edx, %eax
	C r2 = 0 or all ones, recording the carry out of the adc above.
108	sbb	r2, r2
	C Three limbs are consumed.  The lea does not touch the flags, so the jz
	C tests the result of the subl, that is whether n was exactly 3.  After
	C the lea, ap points at the next lower limb that has not been read yet.
109	subl	$3, n
110	lea	-16(ap), ap
111	jz	L(reduce_three)
112
	C Loop setup: B2mb = B2modb - b, and t0 = r0 + B2mb is the value that the
	C cmovc after L(mid) substitutes when its add carries.
113	mov	B2modb, B2mb
114	sub	b, B2mb
115	lea	(B2mb, r0), t0
116	jmp	L(mid)
117
118	ALIGN(16)
119L(top): C Loopmixed to 7 c/l on k7
	C State on arrival at L(mid): eax = limb to be multiplied by B2modb,
	C r0 = low limb, r2 = 0 or all-ones carry mask, t0 = r0 + B2mb.
	C The instruction order is scheduled for speed and several instructions
	C depend on flags set a few lines earlier - keep the order when editing.
	C
	C Add the low product word to r0, refresh t0 from the new r0, and move
	C the middle limb plus the high product word plus carry into eax.
120	add	%eax, r0
121	lea	(B2mb, r0), t0
122	mov	r2, %eax
123	adc	%edx, %eax
124	sbb	r2, r2
125L(mid):	mull	B2modb
	C Turn the carry mask into 0 or B2modb and add r0.  If that add carries,
	C the cmovc below replaces the sum by t0 = r0 + B2modb - b.
126	and	B2modb, r2
127	add	r0, r2
	C decl sets ZF for the jnz but leaves CF alone, and mov and lea do not
	C modify flags, so the cmovc still sees the carry of the add above.
128	decl	n
129	mov	(ap), r0
130	cmovc(	t0, r2)
131	lea	-4(ap), ap
132	jnz	L(top)
133
	C Loop exit: fold in the last product the same way as at L(top).
134	add	%eax, r0
135	mov	r2, %eax
136	adc	%edx, %eax
137	sbb	r2, r2
138
139L(reduce_three):
140	C Eliminate r2
	C r2 is the 0 or all-ones mask from the last sbb; b is subtracted from
	C eax exactly when the mask is set.
141	and	b, r2
142	sub	r2, %eax
143
144L(reduce_two):
	C The value is now held in eax (high limb) and r0 (low limb).  r2 is no
	C longer needed, so its register is reused for the pre pointer.
	C cl = low byte of pre[1], the shift count cnt.
145	mov	pre, %ebp
146	movb	4(%ebp), %cl
147	test	%cl, %cl
148	jz	L(normalized)
149
150	C Unnormalized, use B1modb to reduce to size < B b
	C edx:eax = eax * B1modb is added to r0; the new high limb is collected
	C in t0 and then moved to eax.
151	mull	8(%ebp)
152	xor	t0, t0
153	add	%eax, r0
154	adc	%edx, t0
155	mov	t0, %eax
156
157	C Left-shift to normalize
158	shld	%cl, r0, %eax C Always use shld?
159
160	shl	%cl, r0
161	jmp	L(udiv)
162
163L(normalized):
	C cnt = 0: replace eax by eax - b when that subtraction does not borrow,
	C that is when eax >= b.
164	mov	%eax, t0
165	sub	b, t0
166	cmovnc(	t0, %eax)
167
168L(udiv):
	C Final division step with the inverse bi = pre[0].  With q1 in edx and
	C q0 in eax:
	C   q1:q0 = eax * bi + (eax + 1) * B + r0
	C   r0    = r0 - q1 * b                  (mod B)
	C   eax   = r0 if q0 >= r0 (unsigned), else r0 + b
	C   if eax >= b then eax = eax - b       (done at L(fix))
169	lea	1(%eax), t0
170	mull	(%ebp)
171	mov	b, %ebx		C Needed in register for lea
172	add	r0, %eax
173	adc	t0, %edx
174	imul	%ebx, %edx
175	sub	%edx, r0
176	cmp	r0, %eax
177	lea	(%ebx, r0), %eax
178	cmovnc(	r0, %eax)
179	cmp	%ebx, %eax
180	jnc	L(fix)
	C Undo the normalization shift.  cl is 0 on the normalized path, so the
	C shift leaves eax unchanged there.
181L(ok):	shr	%cl, %eax
182
	C Drop the stacked copy of B2modb, then restore the saved registers.
183	add	$4, %esp
184	pop	%ebx
185	pop	%esi
186	pop	%edi
187	pop	%ebp
188
189	ret
	C Out-of-line adjustment for the case eax >= b, kept off the main path.
190L(fix):	sub	%ebx, %eax
191	jmp	L(ok)
192EPILOGUE()
193
194PROLOGUE(mpn_mod_1_1p_cps)
	C Fills a four-limb table with bi, cnt, B1modb and B2modb, in that order,
	C which is the layout mpn_mod_1_1p reads from its pre[] argument.
	C Stack arguments: the first is the pointer to the table, the second is
	C the limb whose inverse is computed (presumably the divisor b later
	C given to mpn_mod_1_1p - confirm).  B = 2^32 below.
	C NOTE(review): the second argument must be nonzero, since bsr leaves its
	C destination undefined for a zero source.
195	push	%ebp
	C One register pushed so far, so the second argument is at 12(%esp).
196	mov	12(%esp), %ebp
197	push	%esi
	C ecx = index of the highest set bit; the xor with 31 further down turns
	C it into the number of leading zero bits, cnt.
198	bsr	%ebp, %ecx
	C NOTE(review): ebx is saved here and restored at the end but is not
	C otherwise referenced in this routine.
199	push	%ebx
200	xor	$31, %ecx
	C Three registers pushed, so the first argument is at 16(%esp).
201	mov	16(%esp), %esi
	C Normalize: shift the divisor left so that its top bit is set.  The
	C names b and -b below refer to this shifted value.
202	sal	%cl, %ebp
	C Dividend edx:eax = (B - 1 - b) * B + (B - 1) = B^2 - 1 - b * B, so the
	C quotient left in eax is floor((B^2 - 1) / b) - B.
203	mov	%ebp, %edx
204	not	%edx
205	mov	$-1, %eax
206	div	%ebp			C On K7, invert_limb would be a few cycles faster.
207	mov	%eax, (%esi)		C store bi
208	mov	%ecx, 4(%esi)		C store cnt
	C ebp = -b mod B.
209	neg	%ebp
	C edx = (1 << cnt) | (bi >> (32 - cnt)); for cnt = 0 the shld leaves
	C edx = 1.
210	mov	$1, %edx
211	shld	%cl, %eax, %edx
	C B1modb = (edx * -b mod B) >> cnt and B2modb = bi * -b mod B.
212	imul	%ebp, %edx
213	shr	%cl, %edx
214	imul	%ebp, %eax
215	mov	%edx, 8(%esi)		C store B1modb
216	mov	%eax, 12(%esi)		C store B2modb
217	pop	%ebx
218	pop	%esi
219	pop	%ebp
220	ret
221EPILOGUE()
222