1dnl  Intel P6 mpn_modexact_1_odd -- exact division style remainder.
2
3dnl  Rearranged from mpn/x86/p6/dive_1.asm by Marco Bodrato.
4
5dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35
36C       odd  even  divisor
37C P6:  10.0  12.0  cycles/limb
38
39C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
40
41C The odd case is basically the same as mpn_modexact_1_odd, just with an
42C extra store, and it runs at the same 10 cycles which is the dependent
43C chain.
44C
45C The shifts for the even case aren't on the dependent chain so in principle
46C it could run the same too, but nothing running at 10 has been found.
47C Perhaps there's too many uops (an extra 4 over the odd case).
48
49defframe(PARAM_SHIFT,  24)
50defframe(PARAM_INVERSE,20)
51defframe(PARAM_DIVISOR,16)
52defframe(PARAM_SIZE,   12)
53defframe(PARAM_SRC,     8)
54defframe(PARAM_DST,     4)
55
56defframe(SAVE_EBX,     -4)
57defframe(SAVE_ESI,     -8)
58defframe(SAVE_EDI,    -12)
59defframe(SAVE_EBP,    -16)
60deflit(STACK_SPACE, 16)
61
62dnl  re-use parameter space
63define(VAR_INVERSE,`PARAM_SRC')
64
65	TEXT
66
67C mp_limb_t
68C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
69C		    mp_limb_t inverse, int shift)
70
71	ALIGN(16)
72PROLOGUE(mpn_pi1_bdiv_q_1)
73deflit(`FRAME',0)
74
75	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
76
77	movl	%esi, SAVE_ESI
78	movl	PARAM_SRC, %esi
79
80	movl	%ebx, SAVE_EBX
81	movl	PARAM_SIZE, %ebx
82
83	movl	%ebp, SAVE_EBP
84	movl	PARAM_INVERSE, %ebp
85
86	movl	PARAM_SHIFT, %ecx	C trailing twos
87
88L(common):
89	movl	%edi, SAVE_EDI
90	movl	PARAM_DST, %edi
91
92	leal	(%esi,%ebx,4), %esi	C src end
93
94	leal	(%edi,%ebx,4), %edi	C dst end
95	negl	%ebx			C -size
96
97	movl	(%esi,%ebx,4), %eax	C src[0]
98
99	orl	%ecx, %ecx
100	jz	L(odd_entry)
101
102	movl	%edi, PARAM_DST
103	movl	%ebp, VAR_INVERSE
104
105L(even):
106	C eax	src[0]
107	C ebx	counter, limbs, negative
108	C ecx	shift
109	C edx
110	C esi
111	C edi
112	C ebp
113
114	xorl	%ebp, %ebp		C initial carry bit
115	xorl	%edx, %edx		C initial carry limb (for size==1)
116
117	incl	%ebx
118	jz	L(even_one)
119
120	movl	(%esi,%ebx,4), %edi	C src[1]
121
122	shrdl(	%cl, %edi, %eax)
123
124	jmp	L(even_entry)
125
126
127L(even_top):
128	C eax	scratch
129	C ebx	counter, limbs, negative
130	C ecx	shift
131	C edx	scratch
132	C esi	&src[size]
133	C edi	&dst[size] and scratch
134	C ebp	carry bit
135
136	movl	(%esi,%ebx,4), %edi
137
138	mull	PARAM_DIVISOR
139
140	movl	-4(%esi,%ebx,4), %eax
141	shrdl(	%cl, %edi, %eax)
142
143	subl	%ebp, %eax
144
145	sbbl	%ebp, %ebp
146	subl	%edx, %eax
147
148	sbbl	$0, %ebp
149
150L(even_entry):
151	imull	VAR_INVERSE, %eax
152
153	movl	PARAM_DST, %edi
154	negl	%ebp
155
156	movl	%eax, -4(%edi,%ebx,4)
157	incl	%ebx
158	jnz	L(even_top)
159
160	mull	PARAM_DIVISOR
161
162	movl	-4(%esi), %eax
163
164L(even_one):
165	shrl	%cl, %eax
166	movl	SAVE_ESI, %esi
167
168	subl	%ebp, %eax
169	movl	SAVE_EBP, %ebp
170
171	subl	%edx, %eax
172	movl	SAVE_EBX, %ebx
173
174	imull	VAR_INVERSE, %eax
175
176	movl	%eax, -4(%edi)
177	movl	SAVE_EDI, %edi
178	addl	$STACK_SPACE, %esp
179
180	ret
181
182C The dependent chain here is
183C
184C	subl	%edx, %eax       1
185C	imull	%ebp, %eax       4
186C	mull	PARAM_DIVISOR    5
187C			       ----
188C	total			10
189C
190C and this is the measured speed.  No special scheduling is necessary, out
191C of order execution hides the load latency.
192
193L(odd_top):
194	C eax	scratch (src limb)
195	C ebx	counter, limbs, negative
196	C ecx	carry bit
197	C edx	carry limb, high of last product
198	C esi	&src[size]
199	C edi	&dst[size]
200	C ebp	inverse
201
202	mull	PARAM_DIVISOR
203
204	movl	(%esi,%ebx,4), %eax
205	subl	%ecx, %eax
206
207	sbbl	%ecx, %ecx
208	subl	%edx, %eax
209
210	sbbl	$0, %ecx
211
212L(odd_entry):
213	imull	%ebp, %eax
214
215	movl	%eax, (%edi,%ebx,4)
216	negl	%ecx
217
218	incl	%ebx
219	jnz	L(odd_top)
220
221
222	movl	SAVE_ESI, %esi
223
224	movl	SAVE_EDI, %edi
225
226	movl	SAVE_EBP, %ebp
227
228	movl	SAVE_EBX, %ebx
229	addl	$STACK_SPACE, %esp
230
231	ret
232
233EPILOGUE()
234
235C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
236C                           mp_limb_t divisor);
237C
238
239	ALIGN(16)
240PROLOGUE(mpn_bdiv_q_1)
241deflit(`FRAME',0)
242
243	movl	PARAM_DIVISOR, %eax
244	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
245
246	movl	%esi, SAVE_ESI
247	movl	PARAM_SRC, %esi
248
249	movl	%ebx, SAVE_EBX
250	movl	PARAM_SIZE, %ebx
251
252	bsfl	%eax, %ecx		C trailing twos
253
254	movl	%ebp, SAVE_EBP
255
256	shrl	%cl, %eax		C d without twos
257
258	movl	%eax, %edx
259	shrl	%eax			C d/2 without twos
260
261	movl	%edx, PARAM_DIVISOR
262	andl	$127, %eax
263
264ifdef(`PIC',`
265	LEA(	binvert_limb_table, %ebp)
266	movzbl	(%eax,%ebp), %ebp		C inv 8 bits
267',`
268	movzbl	binvert_limb_table(%eax), %ebp	C inv 8 bits
269')
270
271	leal	(%ebp,%ebp), %eax	C 2*inv
272
273	imull	%ebp, %ebp		C inv*inv
274	imull	%edx, %ebp	C inv*inv*d
275
276	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
277	leal	(%eax,%eax), %ebp	C 2*inv
278
279	imull	%eax, %eax		C inv*inv
280	imull	%edx, %eax	C inv*inv*d
281
282	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
283
284	jmp	L(common)
285
286EPILOGUE()
287ASM_END()
288