1#	$NetBSD: bn_asm_vax.S,v 1.1 2009/07/19 23:30:47 christos Exp $
2#
3# w.j.m. 15-jan-1999
4#
5# it's magic ...
6#
7# ULONG bn_mul_add_words(ULONG r[],ULONG a[],int n,ULONG w) {
8#	ULONG c = 0;
9#	int i;
10#	for(i = 0; i < n; i++) <c,r[i]> := r[i] + c + a[i] * w ;
11#	return c;
12# }
13
14	.globl	bn_mul_add_words
15	.type   bn_mul_add_words@function
16
17bn_mul_add_words:
18	.word	0x40
19
20	movl	4(%ap),%r2		# *r
21	movl	8(%ap),%r3		# *a
22	movl	12(%ap),%r4		# n
23	movl	16(%ap),%r5		# w
24	clrl	%r6			# return value ("carry")
25
260:	emul	%r5,(%r3),(%r2),%r0	# w * a[0] + r[0] -> r0
27
28	# fixup for "negative" r[]
29	tstl	(%r2)
30	bgeq	1f
31	incl	%r1			# add 1 to highword
32
331:	# add saved carry to result
34	addl2	%r6,%r0
35	adwc	$0,%r1
36
37	# combined fixup for "negative" w, a[]
38	tstl	%r5		# if w is negative...
39	bgeq	1f
40	addl2	(%r3),%r1	# ...add a[0] again to highword
411:	tstl	(%r3)		# if a[0] is negative...
42	bgeq	1f
43	addl2	%r5,%r1		# ...add w again to highword
441:
45	movl	%r0,(%r2)+	# save low word in dest & advance *r
46	addl2	$4,%r3		# advance *a
47	movl	%r1,%r6		# high word in r6 for return value
48
49	sobgtr	%r4,0b		# loop?
50
51	movl	%r6,%r0
52	ret
53
54#	.title	vax_bn_mul_words  unsigned multiply & add, 32*32+32=>64
55#;
56#; w.j.m. 15-jan-1999
57#;
58#; it's magic ...
59#;
60#; ULONG bn_mul_words(ULONG r[],ULONG a[],int n,ULONG w) {
61#;	ULONG c = 0;
62#;	int i;
63#;	for(i = 0; i < num; i++) <c,r[i]> := a[i] * w + c ;
64#;	return(c);
65#; }
66#
67	.globl	bn_mul_words
68	.type   bn_mul_words@function
69bn_mul_words:
70	.word	0x40
71
72	movl	4(%ap),%r2		# *r
73	movl	8(%ap),%r3		# *a
74	movl	12(%ap),%r4		# n
75	movl	16(%ap),%r5		# w
76	clrl	%r6			# carry
77
780:	emul	%r5,(%r3),%r6,%r0	# w * a[0] + carry -> r0
79
80	# fixup for "negative" carry
81	tstl	%r6
82	bgeq	1f
83	incl	%r1
84
851:	# combined fixup for "negative" w, a[]
86	tstl	%r5
87	bgeq	1f
88	addl2	(%r3),%r1
891:	tstl	(%r3)
90	bgeq	1f
91	addl2	%r5,%r1
92
931:	movl	%r0,(%r2)+
94	addl2	$4,%r3
95	movl	%r1,%r6
96
97	sobgtr	%r4,0b
98
99	movl	%r6,%r0
100	ret
101
102
103
104#	.title	vax_bn_sqr_words  unsigned square, 32*32=>64
105#;
106#; w.j.m. 15-jan-1999
107#;
108#; it's magic ...
109#;
110#; void bn_sqr_words(ULONG r[],ULONG a[],int n) {
111#;	int i;
112#;	for(i = 0; i < n; i++) <r[2*i+1],r[2*i]> := a[i] * a[i] ;
113#; }
114#
115	.globl	bn_sqr_words
116	.type   bn_sqr_words@function
117bn_sqr_words:
118	.word	0
119
120	movl	4(%ap),%r2		# r
121	movl	8(%ap),%r3		# a
122	movl	12(%ap),%r4		# n
123
1240:	movl	(%r3)+,%r5		# r5 = a[] & advance
125
126	emul	%r5,%r5,$0,%r0		# a[0] * a[0] + 0 -> r0
127
128	# fixup for "negative" a[]
129	tstl	%r5
130	bgeq	1f
131	addl2	%r5,%r1
132	addl2	%r5,%r1
133
1341:	movq	%r0,(%r2)+		# store 64-bit result
135
136	sobgtr	%r4,0b			# loop
137
138	ret
139
140
141#	.title	vax_bn_div_words  unsigned divide
142#;
143#; Richard Levitte 20-Nov-2000
144#;
145#; ULONG bn_div_words(ULONG h, ULONG l, ULONG d)
146#; {
147#;	return ((ULONG)((((ULLONG)h)<<32)|l) / (ULLONG)d);
148#; }
149#;
150#; Using EDIV would be very easy, if it didn't do signed calculations.
151#; Any time any of the input numbers are signed, there are problems,
152#; usually with integer overflow, at which point it returns useless
153#; data (the quotient gets the value of l, and the remainder becomes 0).
154#;
155#; If it was just for the dividend, it would be very easy, just divide
156#; it by 2 (unsigned), do the division, multiply the resulting quotient
157#; and remainder by 2, add the bit that was dropped when dividing by 2
158#; to the remainder, and do some adjustment so the remainder doesn't
159#; end up larger than the divisor.  For some cases when the divisor is
160#; negative (from EDIV's point of view, i.e. when the highest bit is set),
161#; dividing the dividend by 2 isn't enough, and since some operations
162#; might generate integer overflows even when the dividend is divided by
163#; 4 (when the high part of the shifted down dividend ends up being exactly
164#; half of the divisor, the result is the quotient 0x80000000, which is
165#; negative...) it needs to be divided by 8.  Furthermore, the divisor needs
166#; to be divided by 2 (unsigned) as well, to avoid more problems with the sign.
167#; In this case, a little extra fiddling with the remainder is required.
168#;
169#; So, the simplest way to handle this is always to divide the dividend
170#; by 8, and to divide the divisor by 2 if it's highest bit is set.
171#; After EDIV has been used, the quotient gets multiplied by 8 if the
172#; original divisor was positive, otherwise 4.  The remainder, oddly
173#; enough, is *always* multiplied by 8.
174#; NOTE: in the case mentioned above, where the high part of the shifted
175#; down dividend ends up being exactly half the shifted down divisor, we
176#; end up with a 33 bit quotient.  That's no problem however, it usually
177#; means we have ended up with a too large remainder as well, and the
178#; problem is fixed by the last part of the algorithm (next paragraph).
179#;
180#; The routine ends with comparing the resulting remainder with the
181#; original divisor and if the remainder is larger, subtract the
182#; original divisor from it, and increase the quotient by 1.  This is
183#; done until the remainder is smaller than the divisor.
184#;
185#; The complete algorithm looks like this:
186#;
187#; d'    = d
188#; l'    = l & 7
189#; [h,l] = [h,l] >> 3
190#; [q,r] = floor([h,l] / d)	# This is the EDIV operation
191#; if (q < 0) q = -q		# I doubt this is necessary any more
192#;
193#; r'    = r >> 29
194#; if (d' >= 0)
195#;   q'  = q >> 29
196#;   q   = q << 3
197#; else
198#;   q'  = q >> 30
199#;   q   = q << 2
200#; r     = (r << 3) + l'
201#;
202#; if (d' < 0)
203#;   {
204#;     [r',r] = [r',r] - q
205#;     while ([r',r] < 0)
206#;       {
207#;         [r',r] = [r',r] + d
208#;         [q',q] = [q',q] - 1
209#;       }
210#;   }
211#;
212#; while ([r',r] >= d')
213#;   {
214#;     [r',r] = [r',r] - d'
215#;     [q',q] = [q',q] + 1
216#;   }
217#;
218#; return q
219#
220#;r2 = l, q
221#;r3 = h, r
222#;r4 = d
223#;r5 = l'
224#;r6 = r'
225#;r7 = d'
226#;r8 = q'
227#
228	.globl	bn_div_words
229	.type   bn_div_words@function
230bn_div_words:
231	.word	0x1c0
232
233	movl	4(%ap),%r3		# h
234	movl	8(%ap),%r2		# l
235	movl	12(%ap),%r4		# d
236
237	bicl3	$-8,%r2,%r5		# l' = l & 7
238	bicl3	$7,%r2,%r2
239
240	bicl3	$-8,%r3,%r6
241	bicl3	$7,%r3,%r3
242
243	addl2	%r6,%r2
244
245	rotl	$-3,%r2,%r2		# l = l >> 3
246	rotl	$-3,%r3,%r3		# h = h >> 3
247
248	movl	%r4,%r7			# d' = d
249
250	clrl	%r6			# r' = 0
251	clrl	%r8			# q' = 0
252
253	tstl	%r4
254	beql	0f			# Uh-oh, the divisor is 0...
255	bgtr	1f
256	rotl	$-1,%r4,%r4	# If d is negative, shift it right.
257	bicl2	$0x80000000,%r4	# Since d is then a large number, the
258				# lowest bit is insignificant
259				# (contradict that, and I'll fix the problem!)
2601:
261	ediv	%r4,%r2,%r2,%r3		# Do the actual division
262
263	tstl	%r2
264	bgeq	1f
265	mnegl	%r2,%r2		# if q < 0, negate it
2661:
267	tstl	%r7
268	blss	1f
269	rotl	$3,%r2,%r2	#   q = q << 3
270	bicl3	$-8,%r2,%r8	#   q' gets the high bits from q
271	bicl3	$7,%r2,%r2
272	brb	2f
273
2741:				# else
275	rotl	$2,%r2,%r2	#   q = q << 2
276	bicl3	$-4,%r2,%r8	#   q' gets the high bits from q
277	bicl3	$3,%r2,%r2
2782:
279	rotl	$3,%r3,%r3	# r = r << 3
280	bicl3	$-8,%r3,%r6	# r' gets the high bits from r
281	bicl3	$7,%r3,%r3
282	addl2	%r5,%r3		# r = r + l'
283
284	tstl	%r7
285	bgeq	5f
286	bitl	$1,%r7
287	beql	5f		# if d' < 0 && d' & 1
288	subl2	%r2,%r3		#   [r',r] = [r',r] - [q',q]
289	sbwc	%r8,%r6
2903:
291	bgeq	5f		#   while r < 0
292	decl	%r2		#     [q',q] = [q',q] - 1
293	sbwc	$0,%r8
294	addl2	%r7,%r3		#     [r',r] = [r',r] + d'
295	adwc	$0,%r6
296	brb	3b
297
298# The return points are placed in the middle to keep a short distance from
299# all the branch points
3001:
301#	movl	%r3,%r1
302	movl	%r2,%r0
303	ret
3040:
305	movl	$-1,%r0
306	ret
3075:
308	tstl	%r6
309	bneq	6f
310	cmpl	%r3,%r7
311	blssu	1b		# while [r',r] >= d'
3126:
313	subl2	%r7,%r3		#   [r',r] = [r',r] - d'
314	sbwc	$0,%r6
315	incl	%r2		#   [q',q] = [q',q] + 1
316	adwc	$0,%r8
317	brb	5b
318
319
320
321#	.title	vax_bn_add_words  unsigned add of two arrays
322#;
323#; Richard Levitte 20-Nov-2000
324#;
325#; ULONG bn_add_words(ULONG r[], ULONG a[], ULONG b[], int n) {
326#;	ULONG c = 0;
327#;	int i;
328#;	for (i = 0; i < n; i++) <c,r[i]> = a[i] + b[i] + c;
329#;	return(c);
330#; }
331#
332
333	.globl	bn_add_words
334	.type   bn_add_words@function
335bn_add_words:
336	.word	0
337
338	movl	4(%ap),%r2	# r
339	movl	8(%ap),%r3	# a
340	movl	12(%ap),%r4	# b
341	movl	16(%ap),%r5	# n
342	clrl	%r0
343
344	tstl	%r5
345	bleq	1f
346
3470:	movl	(%r3)+,%r1	# carry untouched
348	adwc	(%r4)+,%r1	# carry used and touched
349	movl	%r1,(%r2)+	# carry untouched
350	sobgtr	%r5,0b		# carry untouched
351
352	adwc	$0,%r0
3531:	ret
354
355#;
356#; Richard Levitte 20-Nov-2000
357#;
358#; ULONG bn_sub_words(ULONG r[], ULONG a[], ULONG b[], int n) {
359#;	ULONG c = 0;
360#;	int i;
361#;	for (i = 0; i < n; i++) <c,r[i]> = a[i] - b[i] - c;
362#;	return(c);
363#; }
364#
365	.globl	bn_sub_words
366	.type   bn_sub_words@function
367bn_sub_words:
368	.word	0x40
369
370	movl	4(%ap),%r2	# r
371	movl	8(%ap),%r3	# a
372	movl	12(%ap),%r4	# b
373	movl	16(%ap),%r5	# n
374	clrl	%r0
375
376	tstl	%r5
377	bleq	1f
378
3790:	movl	(%r3)+,%r6	# carry untouched
380	sbwc	(%r4)+,%r6	# carry used and touched
381	movl	%r6,(%r2)+	# carry untouched
382	sobgtr	%r5,0b		# carry untouched
383
3841:	adwc	$0,%r0
385	ret
386
387#
388#	Ragge 20-Sep-2003
389#
390#	Multiply a vector of 4/8 longword by another.
391#	Uses two loops and 16/64 emuls.
392#
393	.globl	bn_mul_comba4
394	.type   bn_mul_comba4@function
395bn_mul_comba4:
396	.word	0x3c0
397	movl	$4,%r9		# 4*4
398	brb	6f
399
400	.globl	bn_mul_comba8
401	.type   bn_mul_comba8@function
402bn_mul_comba8:
403	.word	0x3c0
404	movl	$8,%r9		# 8*8
405
4066:	movl	8(%ap),%r3	# a[]
407	movl	12(%ap),%r7	# b[]
408	brb	5f
409
410	.globl	bn_sqr_comba4
411	.type   bn_sqr_comba4@function
412bn_sqr_comba4:
413	.word	0x3c0
414	movl	$4,%r9		# 4*4
415	brb 0f
416
417	.globl	bn_sqr_comba8
418	.type   bn_sqr_comba8@function
419bn_sqr_comba8:
420	.word	0x3c0
421	movl	$8,%r9		# 8*8
422
4230:
424	movl	8(%ap),%r3	# a[]
425	movl	%r3,%r7		# a[]
426
4275:	movl	4(%ap),%r5	# r[]
428	movl	%r9,%r8
429
430	clrq	(%r5)		# clear destinatino, for add.
431	clrq	8(%r5)
432	clrq	16(%r5)		# these only needed for comba8
433	clrq	24(%r5)
434
4352:	clrl	%r4		# carry
436	movl	%r9,%r6		# inner loop count
437	movl	(%r7)+,%r2	# value to multiply with
438
4391:	emul	%r2,(%r3),%r4,%r0
440	tstl	%r4
441	bgeq	3f
442	incl	%r1
4433:	tstl	%r2
444	bgeq	3f
445	addl2	(%r3),%r1
4463:	tstl	(%r3)
447	bgeq	3f
448	addl2	%r2,%r1
449
4503:	addl2	%r0,(%r5)+	# add to destination
451	adwc	$0,%r1		# remember carry
452	movl	%r1,%r4		# add carry in next emul
453	addl2	$4,%r3
454	sobgtr	%r6,1b
455
456	movl	%r4,(%r5)	# save highest add result
457
458	ashl	$2,%r9,%r4
459	subl2	%r4,%r3
460	subl2	$4,%r4
461	subl2	%r4,%r5
462
463	sobgtr	%r8,2b
464
465	ret
466