1dnl  AMD K7 mpn_divexact_1 -- mpn by limb exact division.
2
3dnl  Copyright 2001, 2002, 2004 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 2.1 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public
18dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
19dnl  not, write to the Free Software Foundation, Inc., 51 Franklin Street,
20dnl  Fifth Floor, Boston, MA 02110-1301, USA.
21
22include(`../config.m4')
23
24
25C          cycles/limb
26C Athlon:     11.0
27C Hammer:      9.0
28
29
30C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
31C                      mp_limb_t divisor);
32C
33C The dependent chain is mul+imul+sub for 11 cycles and that speed is
34C achieved with no special effort.  The load and shrdl latencies are hidden
35C by out of order execution.
36C
37C It's a touch faster on size==1 to use the mul-by-inverse than divl.
38
C Stack frame layout.
C
C The PARAM_ slots are at positive offsets and correspond, in order of
C increasing offset, to the dst, src, size and divisor arguments of the
C prototype in the header comment.
39defframe(PARAM_DIVISOR,16)
40defframe(PARAM_SIZE,   12)
41defframe(PARAM_SRC,    8)
42defframe(PARAM_DST,    4)
43
C The SAVE_ and VAR_ slots are at negative offsets and are written by the
C function itself: SAVE_ slots hold the incoming ebx, esi, edi and ebp so
C they can be restored before returning; VAR_INVERSE holds the computed
C inverse and VAR_DST_END holds the dst end pointer while edi is reused
C inside the loop.
44defframe(SAVE_EBX,     -4)
45defframe(SAVE_ESI,     -8)
46defframe(SAVE_EDI,    -12)
47defframe(SAVE_EBP,    -16)
48defframe(VAR_INVERSE, -20)
49defframe(VAR_DST_END, -24)
50
C Bytes taken from esp on entry and given back before each ret.  24 covers
C the six 4-byte slots above, whose lowest offset is -24.
51deflit(STACK_SPACE, 24)
52
53	TEXT
54
55	ALIGN(16)
C mpn_divexact_1 body.
C
C Outline, as implemented below:
C   1. Count and strip the trailing zero bits of the divisor, leaving an
C      odd divisor d in ebx and the shift count in ecx.
C   2. Build inv with d*inv == 1 mod 2^BITS_PER_MP_LIMB, starting from an
C      8-bit modlimb_invert_table entry and refining it twice with
C      inv = 2*inv - inv*inv*d.
C   3. For each limb from low to high: form the source limb shifted right
C      by ecx, subtract the carry bit and carry limb left by the previous
C      quotient limb, then multiply by inv to get the next quotient limb.
C
C Observations from the code (none of these are checked at run time):
C   - a zero divisor never leaves the strip_twos loop, since no set bit is
C     ever shifted out into the carry flag;
C   - src[0] is loaded unconditionally and only size==1 is special-cased,
C     so size is expected to be at least 1;
C   - the PARAM_DIVISOR stack slot is overwritten with the odd divisor d.
56PROLOGUE(mpn_divexact_1)
57deflit(`FRAME',0)
58
59	movl	PARAM_DIVISOR, %eax
60	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
61	movl	$-1, %ecx		C shift count, becomes 0 on the first incl
62
63	movl	%ebp, SAVE_EBP
64	movl	PARAM_SIZE, %ebp
65
66	movl	%esi, SAVE_ESI
67	movl	%edi, SAVE_EDI
68
69	C If there's usually only one or two trailing zero bits then this
70	C should be faster than bsfl.
71L(strip_twos):
72	incl	%ecx
73	shrl	%eax			C low bit of eax goes to the carry flag
74	jnc	L(strip_twos)		C repeat until a 1 bit has been shifted out
75
	C Here ecx is the number of trailing zero bits in the divisor and
	C eax is the odd part of the divisor shifted right by one.
76	movl	%ebx, SAVE_EBX
77	leal	1(%eax,%eax), %ebx	C d without twos, 2*eax+1 puts the low 1 bit back
78	andl	$127, %eax		C d/2, 7 bits, used as the table index
79
80ifdef(`PIC',`
81	call	L(movl_eip_edx)		C edx = return address of this call

82
83	addl	$_GLOBAL_OFFSET_TABLE_, %edx

84
85	movl	modlimb_invert_table@GOT(%edx), %edx	C edx = table address from the GOT

86
87	movzbl	(%eax,%edx), %eax			C inv 8 bits
88',`
89dnl non-PIC
90	movzbl	modlimb_invert_table(%eax), %eax	C inv 8 bits
91')
92
	C Two refinement steps of inv = 2*inv - inv*inv*d.  Each step doubles
	C the number of correct low bits, so the 8-bit table value becomes a
	C full 32-bit inverse.  The pointer setup for the main loop is
	C interleaved with the multiplies.
93	leal	(%eax,%eax), %edx	C 2*inv
94	movl	%ebx, PARAM_DIVISOR	C d without twos, replaces the caller value in the slot
95
96	imull	%eax, %eax		C inv*inv
97
98	movl	PARAM_SRC, %esi
99	movl	PARAM_DST, %edi
100
101	imull	%ebx, %eax		C inv*inv*d
102
103	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
104	leal	(%edx,%edx), %eax	C 2*inv
105
106	imull	%edx, %edx		C inv*inv
107
108	leal	(%esi,%ebp,4), %esi	C src end
109	leal	(%edi,%ebp,4), %edi	C dst end
110	negl	%ebp			C -size, so (end,ebp,4) addresses limb 0
111
112	imull	%ebx, %edx		C inv*inv*d
113
114	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
115
116	ASSERT(e,`	C expect d*inv == 1 mod 2^BITS_PER_MP_LIMB
117	pushl	%eax	FRAME_pushl()
118	imull	PARAM_DIVISOR, %eax
119	cmpl	$1, %eax
120	popl	%eax	FRAME_popl()')
121
122	movl	%eax, VAR_INVERSE
123	movl	(%esi,%ebp,4), %eax	C src[0]
124
125	incl	%ebp
126	jz	L(one)			C size==1, no second limb to shift in
127
128	movl	(%esi,%ebp,4), %edx	C src[1]
129
130	shrdl(	%cl, %edx, %eax)	C eax = src[0] >> shift with low bits of src[1] on top
131
132	movl	%edi, VAR_DST_END	C edi is reused for a source limb inside the loop
133	xorl	%ebx, %ebx		C no carry bit for the first limb
134	jmp	L(entry)		C first limb has no carry limb either, skip the mull
135
	C PIC helper: returns in edx the address it was called from.  It sits
	C after the unconditional jmp above so it is only reached by the call.
136ifdef(`PIC',`
137L(movl_eip_edx):
138	movl	(%esp), %edx
139	ret_internal
140')
141
142	ALIGN(8)
143L(top):
144	C eax	q, the quotient limb just stored
145	C ebx	carry bit, 0 or 1
146	C ecx	shift
147	C edx	scratch
148	C esi	src end
149	C edi	dst end
150	C ebp	counter, limbs, negative
151
152	mull	PARAM_DIVISOR		C carry limb in edx, the high half of q*d
153
154	movl	-4(%esi,%ebp,4), %eax	C current source limb
155	movl	(%esi,%ebp,4), %edi	C next source limb, supplies the high bits
156
157	shrdl(	%cl, %edi, %eax)	C eax = current limb of src >> shift
158
159	subl	%ebx, %eax		C apply carry bit
160	setc	%bl			C bl = borrow from that subtract
161	movl	VAR_DST_END, %edi	C restore dst end, movl leaves the flags alone
162
163	subl	%edx, %eax		C apply carry limb
164	adcl	$0, %ebx		C add the borrow from the carry limb subtract
165
166L(entry):
167	imull	VAR_INVERSE, %eax	C q = adjusted source limb times inv
168
169	movl	%eax, -4(%edi,%ebp,4)	C store quotient limb
170	incl	%ebp
171	jnz	L(top)			C loop until ebp reaches zero
172
173
	C Most significant limb.  There is no higher source limb, so a plain
	C shrl is used, and no carry is recorded after the subtracts.  The
	C callee-saved registers are restored in between the arithmetic.
174	mull	PARAM_DIVISOR		C carry limb in edx
175
176	movl	-4(%esi), %eax		C src high limb
177	shrl	%cl, %eax
178	movl	SAVE_ESI, %esi
179
180	subl	%ebx, %eax		C apply carry bit
181	movl	SAVE_EBX, %ebx
182	movl	SAVE_EBP, %ebp
183
184	subl	%edx, %eax		C apply carry limb
185
186	imull	VAR_INVERSE, %eax
187
188	movl	%eax, -4(%edi)		C store the top quotient limb
189	movl	SAVE_EDI, %edi
190	addl	$STACK_SPACE, %esp
191
192	ret
193
194
	C size==1: the single limb is shifted and multiplied by the inverse,
	C with no carry handling.  eax holds src[0] on arrival.
195L(one):
196	shrl	%cl, %eax
197	movl	SAVE_ESI, %esi
198	movl	SAVE_EBX, %ebx
199
200	imull	VAR_INVERSE, %eax
201
202	movl	SAVE_EBP, %ebp
203	movl	%eax, -4(%edi)		C dst[0], edi still holds the dst end here
204
205	movl	SAVE_EDI, %edi
206	addl	$STACK_SPACE, %esp
207
208	ret
209
210EPILOGUE()
211