#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations; ABI and instruction syntax are identical.

# Choose the output file from the command line: discard leading arguments
# until one looks like "name.ext" (word characters and dashes, a dot, an
# extension).  $output ends up holding that name, or a false value when
# the argument list ran out first.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when there is a name to open, and refuse to carry
# on if the file cannot be created, so the generated assembly never goes
# somewhere unexpected without anyone noticing.
if (defined($output) && $output ne "") {
	open STDOUT,">$output" or die "can't open $output: $!";
}

#### register map
# r0 is loaded with the num argument on entry to the generated code and is
# later repointed at &tp[num-1]; r1 and r3 keep ap and np throughout.
($num,$ap,$np)=("r0","r1","r3");
# r2 carries three names: the bp pointer, the current bp[i] word, and rp
# in the final subtract/copy phase.
($bp,$bi,$rp)=("r2")x3;
# r4..r8: tp cursor, ap[j], np[j], tp[j], n0
($tp,$aj,$nj,$tj,$n0)=map("r$_",4..8);
# r9 is skipped: reserved by ELF as platform specific, e.g. TLS pointer
($alo,$ahi,$nlo)=("r10","r11","r12");	# sl (gcc uses it to keep @GOT), fp, ip
# r13 is stack pointer
$nhi="r14";				# lr
# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
# Word offsets 12..15 from $num address rp, bp, n0 and num in that order.
# ap and np have no slot because they permanently reside in r1 and r3.
($_rp,$_bp,$_n0,$_num)=map("$num,#$_*4",12..15);
$_bpend=$_num;		# the num slot doubles as storage for the bp end marker

# ---------------------------------------------------------------------
# bn_mul_mont -- word-by-word Montgomery multiplication, emitted as ARM
# assembly text into $code.
#
# Arguments as the code consumes them:
#   r0       rp, result vector (pushed, re-read through $_rp at the end)
#   r1       ap
#   r2       bp (pushed; its stack slot then tracks the current &bp[i])
#   r3       np, the modulus words
#   [sp]     pointer to n0; dereferenced once, after which the loaded
#            value replaces the pointer in that stack slot
#   [sp,#4]  num, the word count; its slot is reused for the outer-loop
#            end marker.  NOTE: the asm comments call that marker
#            "&bp[num]", but the value stored is &bp[num-1], which is
#            what the pre-incremented bp cursor is compared against.
# Returns r0=1 on completion, or r0=0 immediately when num<2.
#
# Stack frame after the prologue, low to high address:
#   tp[0..num]   num+1 scratch words (sp points at tp[0])
#   r4-r12,lr    10 saved registers
#   r0,r2        pushed rp and bp
#   n0, num      the caller's stack arguments
# With $num pointing at &tp[num-1] this puts rp/bp/n0/num at word offsets
# 12..15, matching $_rp/$_bp/$_n0/$_num.
#
# Flow, with m = tp[0]*n0 (low 32 bits) recomputed per pass:
#   .L1st            tp[] = (ap[]*bp[0] + np[]*m) >> 32
#   .Louter/.Linner  for each further bp[i]:
#                    tp[] = (tp[] + ap[]*bp[i] + np[]*m) >> 32
#   .Lsub            rp[] = tp[] - np[], the borrow is folded into the
#                    top word by the following sbcs
#   .Lcopy           rp[j] is kept, or overwritten with tp[j] when the
#                    subtraction borrowed; tp[] is overwritten on the way.
#
# .Lcopy loads both tp[j] and rp[j] on every iteration and selects with
# movcc on the carry left by sbcs, so the addresses touched do not depend
# on the borrow.  teq (not cmp) closes the loop because it leaves the
# carry flag alone.
# ---------------------------------------------------------------------
$code=<<___;
.text

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	2
bn_mul_mont:
	stmdb	sp!,{r0,r2}		@ sp points at argument block
	ldr	$num,[sp,#3*4]		@ load num
	cmp	$num,#2
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

.Lcopy:	ldr	$tj,[$tp]		@ conditional copy
	ldr	$aj,[$rp]
	str	sp,[$tp],#4		@ zap tp
	movcc	$aj,$tj			@ borrow: keep tp[j]
	str	$aj,[$rp],#4
	teq	$tp,$num		@ preserve carry
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Swap every "bx lr" for a .word directive carrying the constant
# 0xe12fff1e, so the output does not depend on the assembler accepting
# the bx mnemonic.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
# Closing flushes the buffered output; fail loudly if that write did not
# succeed, so a truncated assembly file is never mistaken for a good one.
close STDOUT or die "error closing STDOUT: $!";