@ lbnarm.s - 32-bit bignum primitives for ARM processors with 32x32-bit multiply
@
@ This uses the standard ARM calling convention, which is that arguments
@ are passed, and results returned, in r0..r3.  r0..r3, r12 (IP) and r14 (LR)
@ are volatile across the function; all others are callee-save.
@ However, note that r14 (LR) is the return address, so it would be
@ wise to save it somewhere before trashing it.  Fortunately, there is
@ a neat trick possible, in that you can pop LR from the stack straight
@ into r15 (PC), effecting a return at the same time.
@
@ Also, r13 (SP) is probably best left alone, and r15 (PC) is obviously
@ reserved by hardware.  Temps should use lr, then r4..r9 in order.
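@
@ For example, the save/restore pattern used by every function below:
@
@	stmfd	sp!, {r4, r5, lr}	@ Save temps and the return address
@	@ ... body is free to clobber r4, r5 and lr ...
@	ldmfd	sp!, {r4, r5, pc}	@ Restore temps and return in one step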

	.text
	.align	2

@ out[0..len] = in[0..len-1] * k
@ void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
	.global	lbnMulN1_32
	.type	lbnMulN1_32, %function
lbnMulN1_32:
	stmfd	sp!, {r4, r5, lr}
	ldr	lr, [r1], #4		@ lr = *in++
	umull	r5, r4, lr, r3		@ (r4,r5) = lr * r3
	str	r5, [r0], #4		@ *out++ = r5
	movs	r2, r2, lsr #1		@ r2 = len/2; C = low bit of len
	bcc	m32_even		@ len was even: enter loop halfway
	mov	r5, r4			@ Get carry in the right register
	beq	m32_done		@ len was 1: just store the carry
m32_loop:
	@ carry is in r5
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r4, #0
	umlal	r5, r4, lr, r3		@ (r4,r5) += lr * r3
	str	r5, [r0], #4		@ *out++ = r5
m32_even:
	@ carry is in r4
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3
	subs	r2, r2, #1		@ Decrement pair count
	str	r4, [r0], #4		@ *out++ = r4

	bne	m32_loop
m32_done:
	str	r5, [r0, #0]		@ Store the final carry word: out[len]
	ldmfd	sp!, {r4, r5, pc}
	.size	lbnMulN1_32, .-lbnMulN1_32
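
@ For reference, a portable C sketch of the operation above (an
@ illustration only, assuming a 64-bit uint64_t intermediate; the
@ real interface is the assembly entry point):
@
@	void
@	lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in,
@		unsigned len, BNWORD32 k)
@	{
@		uint64_t t = 0;
@		while (len--) {
@			t += (uint64_t)*in++ * k;	/* product + carry in */
@			*out++ = (BNWORD32)t;		/* store low word */
@			t >>= 32;			/* keep the carry */
@		}
@		*out = (BNWORD32)t;			/* out[len] = carry */
@	}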

@ out[0..len-1] += in[0..len-1] * k, return carry
@ BNWORD32
@ lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)

	.global	lbnMulAdd1_32
	.type	lbnMulAdd1_32, %function
lbnMulAdd1_32:
	stmfd	sp!, {r4, r5, lr}

	ldr	lr, [r1], #4		@ lr = *in++
	ldr	r5, [r0, #0]		@ r5 = *out
	mov	r4, #0
	umlal	r5, r4, lr, r3		@ (r4,r5) += lr * r3
	str	r5, [r0], #4		@ *out++ = r5
	movs	r2, r2, lsr #1		@ r2 = len/2; C = low bit of len
	bcc	ma32_even		@ len was even: enter loop halfway
	beq	ma32_done		@ len was 1: carry is already in r4
ma32_loop:
	@ carry is in r4
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	adds	lr, lr, r4		@ lr += product.low
	str	lr, [r0], #4		@ *out++ = lr
	adc	r4, r5, #0		@ Compute carry and move back to r4
ma32_even:
	@ another unrolled copy
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	adds	lr, lr, r4		@ lr += product.low
	adc	r4, r5, #0		@ Compute carry and move back to r4
	str	lr, [r0], #4		@ *out++ = lr
	subs	r2, r2, #1		@ Decrement pair count

	bne	ma32_loop
ma32_done:
	mov	r0, r4			@ Return the carry
	ldmfd	sp!, {r4, r5, pc}
	.size	lbnMulAdd1_32, .-lbnMulAdd1_32
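
@ For reference, a portable C sketch of the operation above (an
@ illustration only, assuming a 64-bit uint64_t intermediate):
@
@	BNWORD32
@	lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in,
@		unsigned len, BNWORD32 k)
@	{
@		uint64_t t = 0;
@		while (len--) {
@			t += (uint64_t)*in++ * k + *out; /* product + word + carry */
@			*out++ = (BNWORD32)t;
@			t >>= 32;
@		}
@		return (BNWORD32)t;			/* carry out */
@	}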

@@@ This is a bit messy... punt for now...
@ out[0..len-1] -= in[0..len-1] * k, return carry (borrow)
@ BNWORD32
@ lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
	.global	lbnMulSub1_32
	.type	lbnMulSub1_32, %function
lbnMulSub1_32:
	stmfd	sp!, {r4, r5, lr}

	ldr	lr, [r1], #4		@ lr = *in++
	umull	r4, r5, lr, r3		@ (r5,r4) = lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	subs	lr, lr, r4		@ lr -= product.low
	str	lr, [r0], #4		@ *out++ = lr
	addcc	r5, r5, #1		@ Propagate borrow into product.high

	movs	r2, r2, lsr #1		@ r2 = len/2; C = low bit of len
	bcc	ms32_even		@ len was even: enter loop halfway
	mov	r4, r5			@ Get borrow in the right register
	beq	ms32_done		@ len was 1: just return the borrow
ms32_loop:
	@ borrow is in r4
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3 (borrow folded in)
	ldr	lr, [r0, #0]		@ lr = *out
	subs	lr, lr, r4		@ lr -= product.low
	str	lr, [r0], #4		@ *out++ = lr
	addcc	r5, r5, #1		@ Propagate borrow into product.high
ms32_even:
	@ borrow is in r5
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r4, #0
	umlal	r5, r4, lr, r3		@ (r4,r5) += lr * r3 (borrow folded in)
	ldr	lr, [r0, #0]		@ lr = *out
	subs	lr, lr, r5		@ lr -= product.low
	str	lr, [r0], #4		@ *out++ = lr
	addcc	r4, r4, #1		@ Propagate borrow into product.high

	subs	r2, r2, #1		@ Decrement pair count
	bne	ms32_loop
ms32_done:
	mov	r0, r4			@ Return the borrow
	ldmfd	sp!, {r4, r5, pc}

	.size	lbnMulSub1_32, .-lbnMulSub1_32
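
@ For reference, a portable C sketch of the operation above (an
@ illustration only; "w", "lo" and "borrow" are illustrative names):
@
@	BNWORD32
@	lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in,
@		unsigned len, BNWORD32 k)
@	{
@		BNWORD32 borrow = 0;
@		while (len--) {
@			uint64_t t = (uint64_t)*in++ * k + borrow;
@			BNWORD32 lo = (BNWORD32)t;
@			BNWORD32 w = *out;
@			*out++ = w - lo;
@			borrow = (BNWORD32)(t >> 32) + (w < lo); /* +1 on borrow */
@		}
@		return borrow;
@	}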

@@
@@ It's possible to eliminate the store traffic by doing the multiplies
@@ in a different order, forming all the partial products in one column
@@ at a time.  But it requires a 32x32 + 64 -> 65-bit multiply-accumulate.
@@ The ARM has the MAC, but no carry out.
@@ The question is, is it faster to do the add directly (3 instructions),
@@ or can we compute the carry out in 1 instruction (+1 to do the add)?
@@ Well... it takes at least 1 instruction to copy the original accumulator
@@ out of the way, and 1 to do a compare, so no.
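@@ (For reference, the direct add meant here is a 65-bit accumulate held
@@ in three registers; the register names below are illustrative only:
@@	adds	acc0, acc0, lo		@ add product.low
@@	adcs	acc1, acc1, hi		@ add product.high + carry
@@	adc	acc2, acc2, #0		@ collect the carry out
@@ which accounts for the 3 instructions mentioned above.)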
@@
@@ Now, the overall loop... this is an n x n -> 2n multiply.  For
@@ i = 0..n-1, we sum i+1 multiplies in each column (plus the carry in
@@ from the previous one).  For i = n..2*n-1, we sum 2*n-1-i, plus the
@@ previous carry (so the top word, i = 2*n-1, is just the final carry).
@@
@@ This "non-square" structure makes things more complicated.
@@
@@ void
@@ lbnMulX_32(BNWORD32 *prod, BNWORD32 const *num1, BNWORD32 const *num2,
@@ 	unsigned len)
@	.global	lbnMulX_32
@	.type	lbnMulX_32, %function
@lbnMulX_32:
@	stmfd	sp!, {r4, r5, r6, r7, lr}
@
@	mov	r4, #0
@	mov	r5, #0
@	mov	r0, r4
@	ldmfd	sp!, {r4, r5, pc}
@	.size	lbnMulX_32, .-lbnMulX_32
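@@
@@ For reference, the full operation can also be composed row by row from
@@ the primitives above; a C sketch (an illustration only, not the
@@ column-at-a-time assembly contemplated above):
@@
@@	void
@@	lbnMulX_32(BNWORD32 *prod, BNWORD32 const *num1,
@@		BNWORD32 const *num2, unsigned len)
@@	{
@@		unsigned i;
@@		lbnMulN1_32(prod, num1, len, num2[0]);	/* prod[0..len] */
@@		for (i = 1; i < len; i++)
@@			prod[len+i] = lbnMulAdd1_32(prod+i, num1, len, num2[i]);
@@	}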