dnl  Alpha mpn_modexact_1c_odd -- mpn exact remainder

dnl  Copyright 2003, 2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C      cycles/limb
C EV4:    47
C EV5:    30
C EV6:    15


C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
C                                mp_limb_t c)
C
C This code follows the "alternate" code in mpn/generic/mode1o.c,
C eliminating cbit+climb from the dependent chain.  This leaves,
C
C        ev4   ev5   ev6
C         1     3     1    subq   y = x - h
C        23    13     7    mulq   q = y * inverse
C        23    14     7    umulh  h = high (q * d)
C        --    --    --
C        47    30    15
C
C In each case, the load latency, loop control, and extra carry bit handling
C hide under the multiply latencies.  Those latencies are long enough that
C we don't need to worry about alignment or pairing to squeeze out
C performance.
C
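C As a rough C sketch of that chain, per limb (hypothetical variable names,
C not the exact mpn/generic/mode1o.c source; assumes a compiler providing
C unsigned __int128 for the high half of the 64x64 product):
C
C	y = x - h;                                            /* subq  */
C	q = y * inverse;                                      /* mulq  */
C	h = (mp_limb_t) (((unsigned __int128) q * d) >> 64);  /* umulh */
C
C The cbit and climb carries are folded in off this chain, as described in
C the notes below.
C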
C For the first limb, some of the loop code is broken out and scheduled back
C since it can be done earlier.
C
C   - The first ldq src[0] is issued near the start of the routine, to allow
C     maximum time for the load from memory.
C
C   - The subq y=x-climb can be done without waiting for the inverse.
C
C   - The mulq y*inverse is replicated after the final subq for the inverse,
C     instead of branching to the mulq in the main loop.  On ev4 a branch
C     there would cost cycles, but we can hide them under the mulq latency.
C
C For the last limb, high<divisor is tested and if that's true a subtract
C and addback is done, as per the main mpn/generic/mode1o.c code.  This is a
C data-dependent branch, but we're waiting for umulh so any penalty should
C hide there.  The multiplies saved would be worth the cost anyway.
C
C Enhancements:
C
C For size==1, a plain division (done bitwise say) might be faster than
C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
C ev5.  A call to gcc __remqu might be a possibility.

ASM_START()
PROLOGUE(mpn_modexact_1c_odd,gp)

	C r16	src
	C r17	size
	C r18	d
	C r19	c

	LEA(r0, binvert_limb_table)
	srl	r18, 1, r20		C d >> 1

	and	r20, 127, r20		C idx = d>>1 & 0x7F

	addq	r0, r20, r21		C table + idx

ifelse(bwx_available_p,1,
`	ldbu	r20, 0(r21)		C table[idx], inverse 8 bits
',`
	ldq_u	r20, 0(r21)		C table[idx] qword
	extbl	r20, r21, r20		C table[idx], inverse 8 bits
')

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	ldq	r2, 0(r16)		C x = s = src[0]
	lda	r17, -1(r17)		C size--
	clr	r0			C initial cbit=0

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 16 bits

	mull	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mull	r7, r18, r7		C i*i*d

	subq	r20, r7, r20		C 2*i-i*i*d, inverse 32 bits

	mulq	r20, r20, r7		C i*i
	addq	r20, r20, r20		C 2*i

	mulq	r7, r18, r7		C i*i*d
	subq	r2, r19, r3		C y = x - climb

	subq	r20, r7, r20		C inv = 2*i-i*i*d, inverse 64 bits

ASSERT(r7, C should have d*inv==1 mod 2^64
`	mulq	r18, r20, r7
	cmpeq	r7, 1, r7')
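
	C A hedged C sketch of the inverse construction above (binvert_limb
	C style; each Newton step i = 2*i - i*i*d doubles the number of
	C correct low bits):
	C
	C	mp_limb_t i = binvert_limb_table[(d >> 1) & 0x7F];  /* 8 bits  */
	C	i = 2*i - i*i*d;                                    /* 16 bits */
	C	i = 2*i - i*i*d;                                    /* 32 bits */
	C	i = 2*i - i*i*d;                   /* 64 bits, d*i==1 mod 2^64 */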

	mulq	r3, r20, r4		C first q = y * inv

	beq	r17, L(one)		C if size==1
	br	L(entry)


L(top):
	C r0	cbit
	C r16	src, incrementing
	C r17	size, decrementing
	C r18	d
	C r19	climb
	C r20	inv

	ldq	r1, 0(r16)		C s = src[i]
	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv
L(entry):
	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2
	lda	r16, 8(r16)		C src++
	lda	r17, -1(r17)		C size--

	umulh	r4, r18, r19		C climb = high (q * d)
	bne	r17, L(top)		C while 2 or more limbs left
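
	C A hedged C rendering of one iteration of the loop above (assumes
	C unsigned __int128 for the high product; variable names follow the
	C register comments, not actual GMP source):
	C
	C	s = src[i];
	C	x = s - cbit;
	C	cbit = (s < cbit);	/* borrow from first subtract  */
	C	y = x - climb;
	C	cbit += (x < climb);	/* borrow from second subtract */
	C	q = y * inv;		/* low 64 bits  */
	C	climb = (mp_limb_t) (((unsigned __int128) q * d) >> 64);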


	C r0	cbit
	C r18	d
	C r19	climb
	C r20	inv

	ldq	r1, 0(r16)		C s = src[size-1] high limb

	cmpult	r1, r18, r2		C test high<divisor
	bne	r2, L(skip)		C skip if so

	C can't skip a division, repeat loop code

	subq	r1, r0, r2		C x = s - cbit
	cmpult	r1, r0, r0		C new cbit = s < cbit

	subq	r2, r19, r3		C y = x - climb

	mulq	r3, r20, r4		C q = y * inv
L(one):
	cmpult	r2, r19, r5		C cbit2 = x < climb
	addq	r5, r0, r0		C cbit += cbit2
	umulh	r4, r18, r19		C climb = high (q * d)

	addq	r19, r0, r0		C return climb + cbit
	ret	r31, (r26), 1


	ALIGN(8)
L(skip):
	C with high<divisor, the final step can be just (cbit+climb)-s and
	C an addback of d if that underflows
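	C
	C A hedged C sketch of this final step (names assumed):
	C
	C	c = climb + cbit;
	C	r = c - s;
	C	if (c < s)		/* underflow: add the divisor back */
	C	  r += d;
	C	return r;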

	addq	r19, r0, r19		C c = climb + cbit

	subq	r19, r1, r2		C c - s
	cmpult	r19, r1, r3		C c < s

	addq	r2, r18, r0		C return c-s + divisor

	cmoveq	r3, r2, r0		C return c-s if no underflow
	ret	r31, (r26), 1

EPILOGUE()
ASM_END()