@ lbnarm.s - 32-bit bignum primitives for ARM processors with 32x32-bit multiply
@
@ This uses the standard ARM calling convention, which is that arguments
@ are passed, and results returned, in r0..r3.  r0..r3, r12 (IP) and r14 (LR)
@ are volatile across the function; all others are callee-save.
@ However, note that r14 (LR) is the return address, so it would be
@ wise to save it somewhere before trashing it.  Fortunately, there is
@ a neat trick possible, in that you can pop LR from the stack straight
@ into r15 (PC), effecting a return at the same time.
@
@ Also, r13 (SP) is probably best left alone, and r15 (PC) is obviously
@ reserved by hardware.  Temps should use lr, then r4..r9 in order.

	.text
	.align	2

@ out[0..len] = in[0..len-1] * k
@ void lbnMulN1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
@
@ The loop is unrolled twice, and the two halves alternate which register
@ (r4 or r5) holds the carry word; that saves a register-to-register move
@ per product.  "movs r2, r2, lsr #1" both halves the count and puts
@ len's low bit in C, steering odd lengths into the half-unrolled entry.
@ Assumes len >= 1 (the first word is processed before the count check).
	.global	lbnMulN1_32
	.type	lbnMulN1_32, %function
lbnMulN1_32:
	stmfd	sp!, {r4, r5, lr}
	ldr	lr, [r1], #4		@ lr = *in++
	umull	r5, r4, lr, r3		@ (r4,r5) = lr * r3
	str	r5, [r0], #4		@ *out++ = r5
	movs	r2, r2, lsr #1		@ r2 = len/2, C = len & 1
	bcc	m32_even		@ len was even: carry is in r4
	mov	r5, r4			@ Get carry in the right register
	beq	m32_done		@ len == 1: nothing left but the carry
m32_loop:
	@ carry is in r5
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r4, #0
	umlal	r5, r4, lr, r3		@ (r4,r5) += lr * r3
	str	r5, [r0], #4		@ *out++ = r5
m32_even:
	@ carry is in r4
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3
	subs	r2, r2, #1		@ str does not touch flags, so test here
	str	r4, [r0], #4		@ *out++ = r4

	bne	m32_loop
m32_done:
	str	r5, [r0, #0]		@ store carry in out[len]
	ldmfd	sp!, {r4, r5, pc}
	.size	lbnMulN1_32, .-lbnMulN1_32

@ out[0..len-1] += in[0..len-1] * k, return carry
@ BNWORD32
@ lbnMulAdd1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
@
@ For the first word, *out is loaded into the low accumulator register
@ before the umlal, so adding the existing output word costs nothing.
@ Later words already carry a value in the accumulator, so they need an
@ explicit adds/adc pair (umlal has no carry out); the adc cannot
@ overflow because a 32x32 product's high word is at most 0xFFFFFFFE.
@ Unrolled twice like lbnMulN1_32.  Assumes len >= 1.
	.global	lbnMulAdd1_32
	.type	lbnMulAdd1_32, %function
lbnMulAdd1_32:
	stmfd	sp!, {r4, r5, lr}

	ldr	lr, [r1], #4		@ lr = *in++
	ldr	r5, [r0, #0]		@ r5 = *out
	mov	r4, #0
	umlal	r5, r4, lr, r3		@ (r4,r5) = *out + lr * r3
	str	r5, [r0], #4		@ *out++ = r5
	movs	r2, r2, lsr #1		@ r2 = len/2, C = len & 1
	bcc	ma32_even		@ len was even: carry is in r4
	beq	ma32_done		@ len == 1: just return the carry
ma32_loop:
	@ carry is in r4
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	adds	lr, lr, r4		@ lr += product.low
	str	lr, [r0], #4		@ *out++ = lr
	adc	r4, r5, #0		@ Compute carry and move back to r4
ma32_even:
	@ another unrolled copy; carry is in r4
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	adds	lr, lr, r4		@ lr += product.low
	adc	r4, r5, #0		@ Compute carry and move back to r4
	str	lr, [r0], #4		@ *out++ = lr
	subs	r2, r2, #1

	bne	ma32_loop
ma32_done:
	mov	r0, r4			@ return the carry word
	ldmfd	sp!, {r4, r5, pc}
	.size	lbnMulAdd1_32, .-lbnMulAdd1_32

@@@ This is a bit messy... punt for now...
@ out[0..len-1] -= in[0..len-1] * k, return carry (borrow)
@ BNWORD32
@ lbnMulSub1_32(BNWORD32 *out, BNWORD32 const *in, unsigned len, BNWORD32 k)
@
@ ARM's C flag is *clear* after a subs that borrowed, so
@ "addcc carry, carry, #1" folds the borrow into the carry word that is
@ subtracted (via the umlal accumulator) from the next output word.
@ Unrolled twice with the carry alternating r4/r5.  Assumes len >= 1.
	.global	lbnMulSub1_32
	.type	lbnMulSub1_32, %function
lbnMulSub1_32:
	stmfd	sp!, {r4, r5, lr}

	ldr	lr, [r1], #4		@ lr = *in++
	umull	r4, r5, lr, r3		@ (r5,r4) = lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	subs	lr, lr, r4		@ lr -= product.low
	str	lr, [r0], #4		@ *out++ = lr
	addcc	r5, r5, #1		@ propagate borrow into carry word

	movs	r2, r2, lsr #1		@ r2 = len/2, C = len & 1
	bcc	ms32_even		@ len was even: carry is in r5
	mov	r4, r5			@ Get carry in the right register
	beq	ms32_done		@ len == 1: just return the borrow
ms32_loop:
	@ carry is in r4
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r5, #0
	umlal	r4, r5, lr, r3		@ (r5,r4) += lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	subs	lr, lr, r4		@ lr -= product.low
	str	lr, [r0], #4		@ *out++ = lr
	addcc	r5, r5, #1		@ propagate borrow
ms32_even:
	@ carry is in r5
	ldr	lr, [r1], #4		@ lr = *in++
	mov	r4, #0
	umlal	r5, r4, lr, r3		@ (r4,r5) += lr * r3
	ldr	lr, [r0, #0]		@ lr = *out
	subs	lr, lr, r5		@ lr -= product.low
	str	lr, [r0], #4		@ *out++ = lr
	addcc	r4, r4, #1		@ Propagate borrow

	subs	r2, r2, #1
	bne	ms32_loop
ms32_done:
	mov	r0, r4			@ return the borrow word
	ldmfd	sp!, {r4, r5, pc}

	.size	lbnMulSub1_32, .-lbnMulSub1_32

@@
@@ It's possible to eliminate the store traffic by doing the multiplies
@@ in a different order, forming all the partial products in one column
@@ at a time.  But it requires 32x32 + 64 -> 65-bit MAC.  The
@@ ARM has the MAC, but no carry out.
@@
@@ The question is, is it faster to do the add directly (3 instructions),
@@ or can we compute the carry out in 1 instruction (+1 to do the add)?
@@ Well... it takes at least 1 instruction to copy the original accumulator
@@ out of the way, and 1 to do a compare, so no.
@@
@@ Now, the overall loop... this is an nxn->2n multiply.  For i=0..n-1,
@@ we sum i+1 multiplies in each (plus the carry in from the
@@ previous one).  For i = n..2*n-1 we sum 2*n-1-i, plus the previous
@@ carry.
@@
@@ This "non-square" structure makes things more complicated.
@@
@@ void
@@ lbnMulX_32(BNWORD32 *prod, BNWORD32 const *num1, BNWORD32 const *num2,
@@	unsigned len)
@	.global	lbnMulX_32
@	.type	lbnMulX_32, %function
@lbnMulX_32:
@	stmfd	sp!, {r4, r5, r6, r7, lr}
@
@	mov	r4, #0
@	mov	r5, #0
@	mov	r0, r4
@	ldmfd	sp!, {r4, r5, pc}
@	.size	lbnMulX_32, .-lbnMulX_32