1d25e02daSmrgdnl  Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
2d25e02daSmrg
3*f81b1c5bSmrgdnl  Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato.
4*f81b1c5bSmrg
5d25e02daSmrgdnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
6*f81b1c5bSmrg
7d25e02daSmrgdnl  This file is part of the GNU MP Library.
8d25e02daSmrgdnl
9*f81b1c5bSmrgdnl  The GNU MP Library is free software; you can redistribute it and/or modify
10*f81b1c5bSmrgdnl  it under the terms of either:
11d25e02daSmrgdnl
12*f81b1c5bSmrgdnl    * the GNU Lesser General Public License as published by the Free
13*f81b1c5bSmrgdnl      Software Foundation; either version 3 of the License, or (at your
14*f81b1c5bSmrgdnl      option) any later version.
15d25e02daSmrgdnl
16*f81b1c5bSmrgdnl  or
17d25e02daSmrgdnl
18*f81b1c5bSmrgdnl    * the GNU General Public License as published by the Free Software
19*f81b1c5bSmrgdnl      Foundation; either version 2 of the License, or (at your option) any
20*f81b1c5bSmrgdnl      later version.
21*f81b1c5bSmrgdnl
22*f81b1c5bSmrgdnl  or both in parallel, as here.
23*f81b1c5bSmrgdnl
24*f81b1c5bSmrgdnl  The GNU MP Library is distributed in the hope that it will be useful, but
25*f81b1c5bSmrgdnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26*f81b1c5bSmrgdnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27*f81b1c5bSmrgdnl  for more details.
28*f81b1c5bSmrgdnl
29*f81b1c5bSmrgdnl  You should have received copies of the GNU General Public License and the
30*f81b1c5bSmrgdnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31*f81b1c5bSmrgdnl  see https://www.gnu.org/licenses/.
32d25e02daSmrg
33d25e02daSmrginclude(`../config.m4')
34d25e02daSmrg
35d25e02daSmrg
36d25e02daSmrgC P4: 19.0 cycles/limb
37d25e02daSmrg
38d25e02daSmrgC Pairs of movd's are used to avoid unaligned loads.  Despite the loads not
39d25e02daSmrgC being on the dependent chain and there being plenty of cycles available,
40d25e02daSmrgC using an unaligned movq on every second iteration measured about 23 c/l.
41d25e02daSmrgC
42d25e02daSmrg
43d25e02daSmrgdefframe(PARAM_SHIFT,  24)
44d25e02daSmrgdefframe(PARAM_INVERSE,20)
45d25e02daSmrgdefframe(PARAM_DIVISOR,16)
46d25e02daSmrgdefframe(PARAM_SIZE,   12)
47d25e02daSmrgdefframe(PARAM_SRC,    8)
48d25e02daSmrgdefframe(PARAM_DST,    4)
49d25e02daSmrg
50d25e02daSmrg	TEXT
51d25e02daSmrg
52d25e02daSmrgC mp_limb_t
53d25e02daSmrgC mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
54d25e02daSmrgC		    mp_limb_t inverse, int shift)
55d25e02daSmrg	ALIGN(32)
56d25e02daSmrgPROLOGUE(mpn_pi1_bdiv_q_1)
57d25e02daSmrgdeflit(`FRAME',0)
58d25e02daSmrg
59d25e02daSmrg	movl	PARAM_SIZE, %edx
60d25e02daSmrg
61d25e02daSmrg	movl	PARAM_SRC, %eax
62d25e02daSmrg
63d25e02daSmrg	movl	PARAM_DIVISOR, %ecx
64d25e02daSmrg
65d25e02daSmrg	movd	%ecx, %mm6
66d25e02daSmrg	movl	PARAM_SHIFT, %ecx
67d25e02daSmrg
68d25e02daSmrg	movd	%ecx, %mm7		C shift
69d25e02daSmrg
70d25e02daSmrg	C
71d25e02daSmrg
72d25e02daSmrg	movl	PARAM_INVERSE, %ecx
73d25e02daSmrg	movd	%ecx, %mm5		C inv
74d25e02daSmrg
75d25e02daSmrg	movl	PARAM_DST, %ecx
76d25e02daSmrg	pxor	%mm1, %mm1		C initial carry limb
77d25e02daSmrg	pxor	%mm0, %mm0		C initial carry bit
78d25e02daSmrg
79d25e02daSmrg	subl	$1, %edx
80d25e02daSmrg	jz	L(done)
81d25e02daSmrg
82d25e02daSmrg	pcmpeqd	%mm4, %mm4
83d25e02daSmrg	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
84d25e02daSmrg
85d25e02daSmrgC The dependent chain here is as follows.
86d25e02daSmrgC
87d25e02daSmrgC					latency
88d25e02daSmrgC	psubq	 s = (src-cbit) - climb	   2
89d25e02daSmrgC	pmuludq	 q = s*inverse		   8
90d25e02daSmrgC	pmuludq	 prod = q*divisor	   8
91d25e02daSmrgC	psrlq	 climb = high(prod)	   2
92d25e02daSmrgC					  --
93d25e02daSmrgC					  20
94d25e02daSmrgC
95d25e02daSmrgC Yet the loop measures 19.0 c/l, so obviously there's something gained
96d25e02daSmrgC there over a straight reading of the chip documentation.
97d25e02daSmrg
98d25e02daSmrgL(top):
99d25e02daSmrg	C eax	src, incrementing
100d25e02daSmrg	C ebx
101d25e02daSmrg	C ecx	dst, incrementing
102d25e02daSmrg	C edx	counter, size-1 iterations
103d25e02daSmrg	C
104d25e02daSmrg	C mm0	carry bit
105d25e02daSmrg	C mm1	carry limb
106d25e02daSmrg	C mm4	0x00000000FFFFFFFF
107d25e02daSmrg	C mm5	inverse
108d25e02daSmrg	C mm6	divisor
109d25e02daSmrg	C mm7	shift
110d25e02daSmrg
111d25e02daSmrg	movd	(%eax), %mm2
112d25e02daSmrg	movd	4(%eax), %mm3
113d25e02daSmrg	addl	$4, %eax
114d25e02daSmrg	punpckldq %mm3, %mm2
115d25e02daSmrg
116d25e02daSmrg	psrlq	%mm7, %mm2
117d25e02daSmrg	pand	%mm4, %mm2		C src
118d25e02daSmrg	psubq	%mm0, %mm2		C src - cbit
119d25e02daSmrg
120d25e02daSmrg	psubq	%mm1, %mm2		C src - cbit - climb
121d25e02daSmrg	movq	%mm2, %mm0
122d25e02daSmrg	psrlq	$63, %mm0		C new cbit
123d25e02daSmrg
124d25e02daSmrg	pmuludq	%mm5, %mm2		C s*inverse
125d25e02daSmrg	movd	%mm2, (%ecx)		C q
126d25e02daSmrg	addl	$4, %ecx
127d25e02daSmrg
128d25e02daSmrg	movq	%mm6, %mm1
129d25e02daSmrg	pmuludq	%mm2, %mm1		C q*divisor
130d25e02daSmrg	psrlq	$32, %mm1		C new climb
131d25e02daSmrg
132d25e02daSmrgL(entry):
133d25e02daSmrg	subl	$1, %edx
134d25e02daSmrg	jnz	L(top)
135d25e02daSmrg
136d25e02daSmrgL(done):
137d25e02daSmrg	movd	(%eax), %mm2
138d25e02daSmrg	psrlq	%mm7, %mm2		C src
139d25e02daSmrg	psubq	%mm0, %mm2		C src - cbit
140d25e02daSmrg
141d25e02daSmrg	psubq	%mm1, %mm2		C src - cbit - climb
142d25e02daSmrg
143d25e02daSmrg	pmuludq	%mm5, %mm2		C s*inverse
144d25e02daSmrg	movd	%mm2, (%ecx)		C q
145d25e02daSmrg
146d25e02daSmrg	emms
147d25e02daSmrg	ret
148d25e02daSmrg
149d25e02daSmrgEPILOGUE()
150d25e02daSmrg
151d25e02daSmrg	ALIGN(16)
152d25e02daSmrgC mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
153d25e02daSmrgC                           mp_limb_t divisor);
154d25e02daSmrgC
155d25e02daSmrgPROLOGUE(mpn_bdiv_q_1)
156d25e02daSmrgdeflit(`FRAME',0)
157d25e02daSmrg
158d25e02daSmrg	movl	PARAM_SIZE, %edx
159d25e02daSmrg
160d25e02daSmrg	movl	PARAM_DIVISOR, %ecx
161d25e02daSmrg
162d25e02daSmrg	C eax	src
163d25e02daSmrg	C ebx
164d25e02daSmrg	C ecx	divisor
165d25e02daSmrg	C edx	size-1
166d25e02daSmrg
167d25e02daSmrg	movl	%ecx, %eax
168d25e02daSmrg	bsfl	%ecx, %ecx		C trailing twos
169d25e02daSmrg
170d25e02daSmrg	shrl	%cl, %eax		C d = divisor without twos
171d25e02daSmrg	movd	%eax, %mm6
172d25e02daSmrg	movd	%ecx, %mm7		C shift
173d25e02daSmrg
174d25e02daSmrg	shrl	%eax			C d/2
175d25e02daSmrg
176d25e02daSmrg	andl	$127, %eax		C d/2, 7 bits
177d25e02daSmrg
178d25e02daSmrgifdef(`PIC',`
179d25e02daSmrg	LEA(	binvert_limb_table, %ecx)
180d25e02daSmrg	movzbl	(%eax,%ecx), %eax		C inv 8 bits
181d25e02daSmrg',`
182d25e02daSmrg	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
183d25e02daSmrg')
184d25e02daSmrg
185d25e02daSmrg	C
186d25e02daSmrg
187d25e02daSmrg	movd	%eax, %mm5		C inv
188d25e02daSmrg
189d25e02daSmrg	movd	%eax, %mm0		C inv
190d25e02daSmrg
191d25e02daSmrg	pmuludq	%mm5, %mm5		C inv*inv
192d25e02daSmrg
193d25e02daSmrg	C
194d25e02daSmrg
195d25e02daSmrg	pmuludq	%mm6, %mm5		C inv*inv*d
196d25e02daSmrg	paddd	%mm0, %mm0		C 2*inv
197d25e02daSmrg
198d25e02daSmrg	C
199d25e02daSmrg
200d25e02daSmrg	psubd	%mm5, %mm0		C inv = 2*inv - inv*inv*d
201d25e02daSmrg	pxor	%mm5, %mm5
202d25e02daSmrg
203d25e02daSmrg	paddd	%mm0, %mm5
204d25e02daSmrg	pmuludq	%mm0, %mm0		C inv*inv
205d25e02daSmrg
206d25e02daSmrg	pcmpeqd	%mm4, %mm4
207d25e02daSmrg	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
208d25e02daSmrg
209d25e02daSmrg	C
210d25e02daSmrg
211d25e02daSmrg	pmuludq	%mm6, %mm0		C inv*inv*d
212d25e02daSmrg	paddd	%mm5, %mm5		C 2*inv
213d25e02daSmrg
214d25e02daSmrg	movl	PARAM_SRC, %eax
215d25e02daSmrg	movl	PARAM_DST, %ecx
216d25e02daSmrg	pxor	%mm1, %mm1		C initial carry limb
217d25e02daSmrg
218d25e02daSmrg	C
219d25e02daSmrg
220d25e02daSmrg	psubd	%mm0, %mm5		C inv = 2*inv - inv*inv*d
221d25e02daSmrg
222d25e02daSmrg	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
223d25e02daSmrg	pushl	%eax	FRAME_pushl()
224d25e02daSmrg	movq	%mm6, %mm0
225d25e02daSmrg	pmuludq	%mm5, %mm0
226d25e02daSmrg	movd	%mm0, %eax
227d25e02daSmrg	cmpl	$1, %eax
228d25e02daSmrg	popl	%eax	FRAME_popl()')
229d25e02daSmrg
230d25e02daSmrg	pxor	%mm0, %mm0		C initial carry bit
231d25e02daSmrg	jmp	L(entry)
232d25e02daSmrg
233d25e02daSmrgEPILOGUE()
234*f81b1c5bSmrgASM_END()
235