1* Copyright (c) 1995  Colin Plumb.  All rights reserved.
2* For licensing and other legal details, see the file legal.c.
3*
4* lbn68360.c - 32-bit bignum primitives for 683xx processors.
5*
6* This code is using InterTools calling convention, which is a bit odd.
7* One minor note is that the default variable sizes are
8* char = unsigned 8, short = 8 (in violation of ANSI!),
9* int = 16, long = 32.  Longs (including on the stack) are 16-bit aligned.
10* Arguments are padded to 16 bits.
11* A6 is used as a frame pointer, and globals are indexed off A5.
12* Return values are passed in D0 or A0 (or FP0), depending on type.
13* D0, D1, A0 and A4 (!) are volatile across function calls.  A1
14* must be preserved!
15*
16* This code assumes 16-bit ints.  Code for 32-bit ints is commented out
17* with "**".
18*
19* Regardless of UINT_MAX, only bignums up to 64K words (2 million bits)
20* are supported.  (68k hackers will recognize this as a consequence of
21* using dbra.)  This could be extended easily if anyone cares.
22*
23* These primitives use little-endian word order.
24* (The order of bytes within words is irrelevant to this issue.)
25
26* The Metrowerks C compiler (1.2.2) produces bad 68k code for the
27* following input, which happens to be the inner loop of lbnSub1,
28* so it has been rewritten in assembly, even though it is not terribly
29* speed-critical.  (Optimizer on or off does not matter.)
30*
31* unsigned
32* decrement(unsigned *num, unsigned len)
33* {
34*      do {
35*              if ((*num++)-- != 0)
36*                      return 0;
37*      } while (--len);
38*      return 1;
39* }
40
41* BNWORD32 lbnSub1_32(BNWORD32 *num, unsigned len, BNWORD32 borrow)
*
* Subtract the 32-bit value "borrow" from the little-endian number at
* "num", in place, propagating the borrow upward through at most len
* words.  Returns the borrow out of the last word touched (0 or 1) in D0.
*
* Stack on entry (16-bit ints): 4(sp) = num, 8(sp) = len, 10(sp) = borrow.
* Only D0 and A0 are used.  num[0] is modified unconditionally, so
* callers presumably always pass len >= 1.
*
* The return value is carried in the X flag.  move, moveq and dbcc do
* not change X, so at sub_done X still holds what the last sub/subq
* left there.
42	SECTION	S_lbnSub1_32,,"code"
43	XDEF	_lbnSub1_32
44_lbnSub1_32:
45	movea.l	4(sp),a0	* num
46	move.l	10(sp),d0	* borrow
47**	move.l	12(sp),d0	* borrow
48	sub.l	d0,(a0)+	* num[0] -= borrow; C and X = borrow out
49	bcc	sub_done	* no borrow out: done, X is 0
50	move.w	8(sp),d0	* len
51**	move.w	10(sp),d0	* len
52	subq.w	#2,d0	* dbcc count for the len-1 words that remain
53	bcs	sub_done	* len < 2: no more words; the subq itself left X = 1
54sub_loop:
55	subq.l	#1,(a0)+	* take the pending borrow out of the next word
56	dbcc	d0,sub_loop	* stop when a word absorbs it or the count runs out
57sub_done:
58	moveq.l	#0,d0	* clear D0 without touching X
59	addx.w	d0,d0	* D0 = 0 + 0 + X = borrow out
60	rts
61
62* BNWORD32 lbnAdd1_32(BNWORD32 *num, unsigned len, BNWORD32 carry)
*
* Add the 32-bit value "carry" to the little-endian number at "num",
* in place, propagating the carry upward through at most len words.
* Returns the carry out of the last word touched (0 or 1) in D0.
*
* Stack on entry (16-bit ints): 4(sp) = num, 8(sp) = len, 10(sp) = carry.
* Only D0 and A0 are used.  num[0] is modified unconditionally, so
* callers presumably always pass len >= 1.
*
* The return value is carried in the X flag.  move, moveq and dbcc do
* not change X, so at add_done X still holds what the last add/addq
* (or, for len < 2, the subq on the count) left there.
63	SECTION	S_lbnAdd1_32,,"code"
64	XDEF	_lbnAdd1_32
65_lbnAdd1_32:
66	movea.l	4(sp),a0	* num
67	move.l	10(sp),d0	* carry
68**	move.l	12(sp),d0	* carry
69	add.l	d0,(a0)+	* num[0] += carry; C and X = carry out
70	bcc	add_done	* no carry out: done, X is 0
71	move.w	8(sp),d0	* len
72**	move.w	10(sp),d0	* len
73	subq.w	#2,d0	* dbcc count for the len-1 words that remain
74	bcs	add_done	* len < 2: no more words; the subq itself left X = 1
75add_loop:
76	addq.l	#1,(a0)+	* add the pending carry into the next word
77	dbcc	d0,add_loop	* stop when a word absorbs it or the count runs out
78add_done:
79	moveq.l	#0,d0	* clear D0 without touching X
80	addx.w	d0,d0	* D0 = 0 + 0 + X = carry out
81	rts
82
83* void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
*
* Multiply the len-word little-endian number at "in" by the single word
* k and store the product at "out": len low words followed by one final
* carry word, len+1 words in all.  in[0] is read unconditionally, so
* callers presumably always pass len >= 1.
*
* Stack after the movem (16-bit ints): 20(sp) = out, 24(sp) = in,
* 28(sp) = len, 30(sp) = k.
*
* Registers: A0 = in pointer, A4 = out pointer, D2 = k, D3 = current
* word and low half of its product, D4 = constant 0 (so that addx can
* fold a carry into a high half), D5 = loop counter, D0/D1 = high
* halves, used alternately as the carry word into the next multiply.
*
* The loop body is unrolled twice so that D0 and D1 swap roles instead
* of being copied: the first half takes its carry in D0 and leaves the
* new one in D1; the second half takes D1 and leaves D0.
84	SECTION	S_lbnMulN1_32,,"code"
85	XDEF	_lbnMulN1_32
86_lbnMulN1_32:
87	movem.l	d2-d5,-(sp)	* 16 bytes of extra data
88	moveq.l	#0,d4	* d4 = 0 for the addx instructions below
89	move.l	20(sp),a4	* out
90	move.l	24(sp),a0	* in
91	move.w	28(sp),d5	* len
92	move.l	30(sp),d2	* k
93**	move.w	30(sp),d5	* len
94**	move.l	32(sp),d2	* k
95
96	move.l	(a0)+,d3	* First multiply
97	mulu.l	d2,d1:d3	* dc.w    0x4c02, 0x3401
98	move.l	d3,(a4)+	* out[0] = low half; the high half stays in d1
99
100	subq.w	#1,d5		* Setup for loop unrolling
101	lsr.w	#1,d5	* d5 = (len-1)/2, C = low bit of len-1
102	bcs.s	m32_even	* odd number of words left: enter at the second half
103	beq.s	m32_short	* no words left (len was 1)
104
105	subq.w	#1,d5		* Set up software pipeline properly
106	move.l	d1,d0	* the first half expects its incoming carry in d0
107
108m32_loop:
* First half: carry in d0, new carry out in d1.
109	move.l	(a0)+,d3
110	mulu.l	d2,d1:d3	* dc.w    0x4c02, 0x3401
111	add.l	d0,d3	* low half += incoming carry
112	addx.l	d4,d1	* carry from that add goes into the high half
113	move.l	d3,(a4)+
114m32_even:
115
* Second half: carry in d1, new carry out in d0.
116	move.l	(a0)+,d3
117	mulu.l	d2,d0:d3	* dc.w    0x4c02, 0x3400
118	add.l	d1,d3	* low half += incoming carry
119	addx.l	d4,d0	* carry from that add goes into the high half
120	move.l	d3,(a4)+
121
122	dbra	d5,m32_loop
123
124	move.l	d0,(a4)	* final carry word becomes the top word of the product
125	movem.l	(sp)+,d2-d5
126	rts
127m32_short:
* Single-word input: the high half of the only product is still in d1.
128	move.l	d1,(a4)
129	movem.l	(sp)+,d2-d5
130	rts
131
132* BNWORD32
133* lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
*
* Multiply the len-word little-endian number at "in" by the single word
* k and add the product into the len words at "out", in place.  The
* carry word that does not fit in those len words is returned in D0
* rather than stored.  in[0] and out[0] are accessed unconditionally,
* so callers presumably always pass len >= 1.
*
* Stack after the movem (16-bit ints): 20(sp) = out, 24(sp) = in,
* 28(sp) = len, 30(sp) = k.
*
* Registers: A0 = in pointer, A4 = out pointer, D2 = k, D3 = current
* word and low half of its product, D4 = constant 0 (so that addx can
* fold a carry into a high half), D5 = loop counter, D0/D1 = high
* halves, used alternately as the carry word into the next multiply.
*
* The loop body is unrolled twice: the first half takes its carry in D0
* and leaves the new one in D1; the second half takes D1 and leaves D0,
* which is therefore already the return value when the loop ends.
134	SECTION	S_lbnMulAdd1_32,,"code"
135	XDEF	_lbnMulAdd1_32
136_lbnMulAdd1_32:
137	movem.l	d2-d5,-(sp)	* 16 bytes of extra data
138	moveq.l	#0,d4	* d4 = 0 for the addx instructions below
139	move.l	20(sp),a4	* out
140	move.l	24(sp),a0	* in
141	move.w	28(sp),d5	* len
142	move.l	30(sp),d2	* k
143**	move.w	30(sp),d5	* len
144**	move.l	32(sp),d2	* k
145
146	move.l	(a0)+,d3	* First multiply
147	mulu.l	d2,d1:d3	* dc.w    0x4c02, 0x3401
148	add.l	d3,(a4)+	* out[0] += low half
149	addx.l	d4,d1	* carry from that add goes into the high half
150
151	subq.w	#1,d5	* Setup for loop unrolling
152	lsr.w	#1,d5	* d5 = (len-1)/2, C = low bit of len-1
153	bcs.s	ma32_even	* odd number of words left: enter at the second half
154	beq.s	ma32_short	* no words left (len was 1)
155
156	subq.w	#1,d5	* Set up software pipeline properly
157	move.l	d1,d0	* the first half expects its incoming carry in d0
158
159ma32_loop:
* First half: carry in d0, new carry out in d1.
160	move.l	(a0)+,d3
161	mulu.l	d2,d1:d3	* dc.w    0x4c02, 0x3401
162	add.l	d0,d3	* low half += incoming carry
163	addx.l	d4,d1	* carry from that add goes into the high half
164	add.l	d3,(a4)+	* accumulate into out
165	addx.l	d4,d1	* and so does the carry from the accumulate
166ma32_even:
167
* Second half: carry in d1, new carry out in d0.
168	move.l	(a0)+,d3
169	mulu.l	d2,d0:d3	* dc.w    0x4c02, 0x3400
170	add.l	d1,d3	* low half += incoming carry
171	addx.l	d4,d0	* carry from that add goes into the high half
172	add.l	d3,(a4)+	* accumulate into out
173	addx.l	d4,d0	* and so does the carry from the accumulate
174
175	dbra	d5,ma32_loop
176
* The final carry word is already in d0, the return register.
177	movem.l	(sp)+,d2-d5
178	rts
179ma32_short:
180	move.l	d1,d0	* single-word input: return the high half held in d1
181	movem.l	(sp)+,d2-d5
182	rts
183
184* BNWORD32
185* lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
*
* Multiply the len-word little-endian number at "in" by the single word
* k and subtract the product from the len words at "out", in place.
* The word that remains to be subtracted above those len words (high
* half of the last product plus the accumulated carries and borrows) is
* returned in D0 rather than applied.  in[0] and out[0] are accessed
* unconditionally, so callers presumably always pass len >= 1.
*
* Stack after the movem (16-bit ints): 20(sp) = out, 24(sp) = in,
* 28(sp) = len, 30(sp) = k.
*
* Registers: A0 = in pointer, A4 = out pointer, D2 = k, D3 = current
* word and low half of its product, D4 = constant 0 (so that addx can
* fold a carry or borrow into a high half), D5 = loop counter, D0/D1 =
* high halves, used alternately as the carry word into the next multiply.
*
* The loop body is unrolled twice: the first half takes its carry in D0
* and leaves the new one in D1; the second half takes D1 and leaves D0,
* which is therefore already the return value when the loop ends.
186	SECTION	S_lbnMulSub1_32,,"code"
187	XDEF	_lbnMulSub1_32
188_lbnMulSub1_32:
189	movem.l	d2-d5,-(sp)	* 16 bytes of extra data
190	moveq.l	#0,d4	* d4 = 0 for the addx instructions below
191	move.l	20(sp),a4	* out
192	move.l	24(sp),a0	* in
193	move.w	28(sp),d5	* len
194	move.l	30(sp),d2	* k
195**	move.w	30(sp),d5	* len
196**	move.l	32(sp),d2	* k
197
198	move.l	(a0)+,d3	* First multiply
199	mulu.l	d2,d1:d3	* dc.w    0x4c02, 0x3401
200	sub.l	d3,(a4)+	* out[0] -= low half
201	addx.l	d4,d1	* a borrow from that sub is added to the high half
202
203	subq.w	#1,d5	* Setup for loop unrolling
204	lsr.w	#1,d5	* d5 = (len-1)/2, C = low bit of len-1
205	bcs.s	ms32_even	* odd number of words left: enter at the second half
206	beq.s	ms32_short	* no words left (len was 1)
207
208	subq.w	#1,d5	* Set up software pipeline properly
209	move.l	d1,d0	* the first half expects its incoming carry in d0
210
211ms32_loop:
* First half: carry in d0, new carry out in d1.
212	move.l	(a0)+,d3
213	mulu.l	d2,d1:d3	* dc.w	0x4c02, 0x3401
214	add.l	d0,d3	* low half += incoming carry
215	addx.l	d4,d1	* carry from that add goes into the high half
216	sub.l	d3,(a4)+	* subtract from out
217	addx.l	d4,d1	* a borrow from that sub is added to the high half
218ms32_even:
219
* Second half: carry in d1, new carry out in d0.
220	move.l	(a0)+,d3
221	mulu.l	d2,d0:d3	* dc.w	0x4c02, 0x3400
222	add.l	d1,d3	* low half += incoming carry
223	addx.l	d4,d0	* carry from that add goes into the high half
224	sub.l	d3,(a4)+	* subtract from out
225	addx.l	d4,d0	* a borrow from that sub is added to the high half
226
227	dbra	d5,ms32_loop
228
* The final carry/borrow word is already in d0, the return register.
229	movem.l	(sp)+,d2-d5
230	rts
231
232ms32_short:
233	move.l	d1,d0	* single-word input: return the high half held in d1
234	movem.l	(sp)+,d2-d5
235	rts
236
237
238* BNWORD32 lbnDiv21_32(BNWORD32 *q, BNWORD32 nh, BNWORD32 nl, BNWORD32 d)
*
* Divide the 64-bit value nh:nl by the 32-bit value d.  The 32-bit
* quotient is stored at *q and the 32-bit remainder is returned in D0.
*
* Stack on entry: 4(sp) = q, 8(sp) = nh, 12(sp) = nl, 16(sp) = d.
* Every argument is 32 bits wide, so no alternate offsets for 32-bit
* ints are needed here.  Only D0, D1 and A0 are used.
*
* No check is made that the quotient fits in 32 bits; presumably
* callers guarantee nh < d.
239	SECTION	S_lbnDiv21_32,,"code"
240	XDEF	_lbnDiv21_32
241_lbnDiv21_32:
242	move.l	8(sp),d0	* nh: high half of the dividend
243	move.l	12(sp),d1	* nl: low half of the dividend
244	move.l	4(sp),a0	* q
245	divu.l	16(sp),d0:d1	*  dc.w	0x4c6f, 0x1400, 16
246	move.l	d1,(a0)	* *q = quotient; the remainder is left in d0
247	rts
248
249* unsigned lbnModQ_32(BNWORD32 const *n, unsigned len, unsigned d)
*
* Return (in D0) the remainder of the len-word little-endian number at
* "n" divided by the 16-bit value d.  The number is not modified.
* n[len-1] is read unconditionally, so callers presumably always pass
* len >= 1.
*
* Stack on entry (16-bit ints): 4(sp) = n, 8(sp) = len, 10(sp) = d.
* len and d are fetched after d2 has been pushed, hence the offsets
* 12 and 14 below.
*
* Registers: A0 = word pointer, walking from the most significant word
* downward; D1 = len, then the dbra count; D2 = d, zero-extended to 32
* bits; D3 = current word / discarded quotient; D0 = running remainder.
* D2 is saved on the stack and D3 is parked in A4, which the calling
* convention treats as volatile.
250	SECTION	S_lbnModQ_32,,"code"
251	XDEF	_lbnModQ_32
252_lbnModQ_32:
253	move.l	4(sp),a0	* n
254	move.l	d2,-(sp)	* save d2; stack arguments move up by 4
255	move.l	d3,a4	* save d3 in a4
256	moveq.l	#0,d1	* clear so the word loads below are zero-extended
257	moveq.l	#0,d2
258	move.w	12(sp),d1	* len
259	move.w	14(sp),d2	* d
260**	move.l	12(sp),d1	* len
261**	move.l	16(sp),d2	* d
262	lea  -4(a0,d1.L*4),a0	* dc.w	0x41f0, 0x1cfc
263
264* First time, divide 32/32 - may be faster than 64/32
265	move.l	(a0),d3	* most significant word, n[len-1]
266	divul.l	d2,d0:d3	* dc.w    0x4c42, 0x3000
267	subq.w	#2,d1	* dbra count for the len-1 words that remain
* The count is unsigned: test the borrow (len < 2), not the sign bit,
* so that lengths of 0x8002 words and up are still processed in full.
268	bcs	mq32_done
269
270mq32_loop:
* Divide remainder:next word (d0:d3) by d; the new remainder lands in d0.
271	move.l	-(a0),d3
272	divu.l	d2,d0:d3	* dc.w    0x4c42,0x3400
273	dbra	d1,mq32_loop
274
275mq32_done:
276	move.l	(sp)+,d2	* restore d2
277	move.l	a4,d3	* restore d3
278	rts
279
280	end
281