1dnl  Alpha mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
2
3dnl  Copyright 2007, 2008, 2013 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C		norm	frac
34C ev4
35C ev5		70	70
36C ev6		29	29
37
38C TODO
39C  * Perhaps inline mpn_invert_limb, that would allow us to not save/restore
40C    any registers (thus save ~10 cycles per call).
41C  * Use negated d1 and/or d0 to speed carry propagation.  Might save a cycle
42C    or two.
43C  * Check cluster delays (for ev6).  We very likely could save some cycles.
44C  * Use branch-free code for computing di.
45C  * CAVEAT: We rely on r19 not being clobbered by mpn_invert_limb call.
46
47C INPUT PARAMETERS
C  Symbolic names for the five argument registers.  The code in this file
C  refers to the raw register numbers rather than to these names.
C    qp        r16  quotient limbs are stored through this pointer
C    fn        r17  number of additional low quotient limbs, developed with
C                   zero limbs in place of dividend limbs
C    up_param  r18  dividend limbs; the two remainder limbs are written back
C                   into this array
C    un_param  r19  number of dividend limbs
C    dp        r20  divisor, low limb d0 at offset 0 and high limb d1 at
C                   offset 8
48define(`qp',		`r16')
49define(`fn',		`r17')
50define(`up_param',	`r18')
51define(`un_param',	`r19')
52define(`dp',		`r20')
53
54ASM_START()
C mpn_divrem_2 -- divide the un-limb number at up, extended at the low end by
C fn zero limbs, by the 2-limb divisor at dp.  One quotient limb is developed
C per iteration of L(loop), using an inverse di derived from mpn_invert_limb.
C
C In:   r16 = qp   quotient limbs are stored at qp[0] .. qp[fn+un-3]
C       r17 = fn   number of low quotient limbs developed from zero limbs
C       r18 = up   dividend limbs; up[0] and up[1] receive the remainder
C       r19 = un   number of dividend limbs
C       r20 = dp   divisor, d0 at 0(dp) and d1 at 8(dp)
C Out:  r0  = most significant quotient limb, 0 or 1
C
C NOTE(review): the file title says the divisor is normalized, and the code
C reads up[un-1] and up[un-2] unconditionally, so un >= 2 is presumably
C required.  Neither condition is checked here -- confirm against callers.
C
C Register roles once set-up is complete:
C   r0        di, the inverse used for quotient estimation
C   r9, r11   n1 and n2, the low and high limb of the partial remainder
C   r10, r12  d0 and d1, the divisor limbs
C   r13       address of the next dividend limb to fetch
C   r15       fn
C   r16       address where the next quotient limb is stored
C   r19       index of the quotient limb being developed, counts down to 0
C
C Frame layout, 80 bytes: 0 ra, 8..40 r9-r13, 48 spilled loop count,
C 56 r15, 64 saved qp, 72 most significant quotient limb.
55PROLOGUE(mpn_divrem_2,gp)
56	lda	r30, -80(r30)		C allocate the frame
57	stq	r26, 0(r30)		C save return address
58	stq	r9, 8(r30)		C save the callee-saved registers used below
59	stq	r10, 16(r30)
60	stq	r11, 24(r30)
61	stq	r12, 32(r30)
62	stq	r13, 40(r30)
C	r14 is not used; its slot at 48(r30) holds the loop count across the
C	call to mpn_invert_limb.
63C	stq	r14, 48(r30)
64	stq	r15, 56(r30)
65	.prologue	1
66	stq	r16, 64(r30)		C save qp, r16 is reused as call argument
67	bis	r31, r17, r15		C r15 = fn
68	s8addq	r19, r18, r13		C r13 = up + 8*un
69	lda	r13, -24(r13)		C r13 = address of up[un-3]
70	ldq	r12, 8(r20)		C d1
71	ldq	r10, 0(r20)		C d0
72	ldq	r11, 16(r13)		C n2 = up[un-1]
73	ldq	r9, 8(r13)		C n1 = up[un-2]
74
C	If the two high dividend limbs are >= the divisor, subtract the
C	divisor once and record a most significant quotient limb of 1.
75	bis	r31, r31, r3		C most_significant_q_limb = 0
76	cmpult	r11, r12, r1		C n2 < d1: nothing to subtract
77	bne	r1, L(L8)
78	cmpule	r11, r12, r1		C here n2 >= d1, so this tests n2 == d1
79	cmpult	r9, r10, r2		C n1 < d0, also the borrow used below
80	and	r1, r2, r1
81	bne	r1, L(L8)		C n2 == d1 and n1 < d0: nothing to subtract
82	subq	r11, r12, r11		C n2 -= d1
83	subq	r11, r2, r11		C n2 -= borrow from n1 - d0
84	subq	r9, r10, r9		C n1 -= d0
85	lda	r3, 1(r31)		C most_significant_q_limb = 1
86L(L8):	stq	r3, 72(r30)		C keep the return value in the frame
87
88	addq	r15, r19, r19		C r19 = fn + un
89	lda	r19, -3(r19)		C r19 = fn + un - 3, index of top q limb
90	blt	r19, L(L10)		C no quotient limbs to develop
C	r19 is an argument register and is not preserved across calls under
C	the Alpha calling standard, so spill the loop count around the call
C	instead of relying on mpn_invert_limb leaving it alone (the CAVEAT in
C	the TODO list at the top of this file).
	stq	r19, 48(r30)		C spill loop count
91	bis	r31, r12, r16		C call argument: d1
92	jsr	r26, mpn_invert_limb	C r0 = di
93	LDGP(	r29, 0(r26))
	ldq	r19, 48(r30)		C reload loop count
C	Adjust di, which was computed from d1 alone, to account for d0.
94	mulq	r0, r12, r4		C t0 = LO(di * d1)
95	umulh	r0, r10, r2		C s1 = HI(di * d0)
96	addq	r4, r10, r4		C t0 += d0
97	cmpule	r10, r4, r7		C r7 = !(t0 < d0), 1 when the add did not carry
98	addq	r4, r2, r4		C t0 += s1
99	cmpult	r4, r2, r1		C cy for: t0 += s1
100	subq	r1, r7, r7		C t1 (-1, 0, or 1)
101	blt	r7, L(L42)		C t1 < 0: di needs no adjustment
102L(L22):
103	lda	r0, -1(r0)		C di--
104	cmpult	r4, r12, r1		C cy for: t0 -= d1 (below)
105	subq	r7, r1, r7		C t1 -= cy
106	subq	r4, r12, r4		C t0 -= d1
107	bge	r7, L(L22)		C repeat until t1 goes negative
108L(L42):
109	ldq	r16, 64(r30)		C r16 = qp
110	s8addq	r19, r16, r16		C r16 = address of top quotient limb
111	ALIGN(16)
112L(loop):
113	mulq	r11, r0, r5		C q0 (early)
114	umulh	r11, r0, r6		C q  (early)
115	addq	r5, r9, r8		C q0 += n1
116	addq	r6, r11, r6		C q  += n2
117	cmpult	r8, r5, r1		C cy for: q0 += n1
118	addq	r6, r1, r6		C q  += cy
119	unop
120	mulq	r12, r6, r1		C LO(d1 * q)
121	umulh	r10, r6, r7		C t1 = HI(d0 * q)
122	subq	r9, r1, r9		C n1 -= LO(d1 * q)
123	mulq	r10, r6, r4		C t0 = LO(d0 * q)
124	unop
C	While the limb index is still >= fn the next low limb n0 comes from
C	the dividend.  Otherwise r5 is left at 0 by the compare and serves
C	directly as a zero n0.
125	cmple	r15, r19, r5		C r5 = (fn <= index), signed compare
126	beq	r5, L(L31)
127	ldq	r5, 0(r13)		C n0 = next dividend limb
128	lda	r13, -8(r13)		C step down to the following limb
129L(L31):	subq	r9, r12, r9		C n1 -= d1
130	cmpult	r5, r10, r1		C cy for: n0 -= d0 (below)
131	subq	r9, r1, r9		C n1 -= cy
132	subq	r5, r10, r5		C n0 -= d0
133	subq	r9, r7, r9		C n1 -= t1
134	cmpult	r5, r4, r1		C cy for: n0 -= t0 (below)
135	subq	r9, r1, r2		C n1 -= cy, result kept in r2
136	subq	r5, r4, r5		C n0 -= t0
137	cmpult	r2, r8, r1		C (n1 < q0)
138	addq	r6, r1, r6		C q += cond
139	lda	r1, -1(r1)		C mask = -(n1 >= q0)
140	and	r1, r10, r4		C mask & d0
141	addq	r5, r4, r9		C n0 += mask & d0, becomes the new n1
142	and	r1, r12, r1		C mask & d1
143	cmpult	r9, r5, r11		C cy for: n0 += mask & d0
144	addq	r2, r1, r1		C n1 += mask & d1
145	addq	r1, r11, r11		C n1 += cy, becomes the new n2
146	cmpult	r11, r12, r1		C new n2 < d1 ?
147	beq	r1, L(fix)		C no: remainder may still be >= divisor
148L(bck):	stq	r6, 0(r16)		C store quotient limb
149	lda	r16, -8(r16)		C next lower quotient limb
150	lda	r19, -1(r19)		C index--
151	bge	r19, L(loop)
152
C	Write the two remainder limbs back.  After all dividend limbs have
C	been consumed r13 points one limb below up, so these stores hit up[0]
C	and up[1].
153L(L10):	stq	r9, 8(r13)
154	stq	r11, 16(r13)
155	ldq	r0, 72(r30)		C return most_significant_q_limb
156	ldq	r26, 0(r30)		C restore return address and saved registers
157	ldq	r9, 8(r30)
158	ldq	r10, 16(r30)
159	ldq	r11, 24(r30)
160	ldq	r12, 32(r30)
161	ldq	r13, 40(r30)
162C	ldq	r14, 48(r30)
163	ldq	r15, 56(r30)
164	lda	r30, 80(r30)		C release the frame
165	ret	r31, (r26), 1
166
C	Out-of-line fix-up, entered with n2 >= d1.  Unless n2 == d1 and
C	n1 < d0, the remainder is still >= the divisor: subtract the divisor
C	once more and bump q.
167L(fix):	cmpule	r11, r12, r1		C here n2 >= d1, so this tests n2 == d1
168	cmpult	r9, r10, r2		C n1 < d0, also the borrow used below
169	and	r1, r2, r1
170	bne	r1, L(bck)		C remainder < divisor: no fix needed
171	subq	r11, r12, r11		C n2 -= d1
172	subq	r11, r2, r11		C n2 -= borrow from n1 - d0
173	subq	r9, r10, r9		C n1 -= d0
174	lda	r6, 1(r6)		C q++
175	br	L(bck)
176EPILOGUE()
177ASM_END()
178