1#! /usr/bin/env perl
2# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# This module doesn't present direct interest for OpenSSL, because it
18# doesn't provide better performance for longer keys, at least not on
19# in-order-execution cores. While 512-bit RSA sign operations can be
20# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
21# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
22# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
23# verify:-( All comparisons are against bn_mul_mont-free assembler.
24# The module might be of interest to embedded system developers, as
25# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
26# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
27# code.
28
29######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
34#
# Each alias holds the assembler spelling "$<n>" of general-purpose
# register number <n>, following the NUBI numbering.
($zero, $at, $t0, $t1, $t2) = map('$' . $_, 0 .. 2, 24, 25);
($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) = map('$' . $_, 4 .. 11);
($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11)
    = map('$' . $_, 12 .. 23);
($gp, $tp, $sp, $fp, $ra) = map('$' . $_, 3, 28 .. 31);
39#
# The return value is placed in $a0. The following coding rules
# facilitate interoperability:
42#
43# - never ever touch $tp, "thread pointer", former $gp;
44# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
45#   old code];
46# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
47#
48# For reference here is register layout for N32/64 MIPS ABIs:
49#
50# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
51# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
52# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
53# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
54# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
55#
# The first command-line argument names the ABI flavour; o32 is the default.
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64

# Pointer arithmetic and register save/restore use 64-bit operations for
# any flavour matching "64" or "n32" (the 64-bit pointer ops incidentally
# work even on n32), and 32-bit operations otherwise.
($PTR_ADD, $PTR_SUB, $REG_S, $REG_L, $SZREG) =
    ($flavour =~ /64|n32/i) ? ("daddu", "dsubu", "sd", "ld", 8)
                            : ("addu",  "subu",  "sw", "lw", 4);

# Bit n of the .mask operand stands for register $n: NUBI flavours save
# $12-$23, the other flavours save $16-$23.
$SAVED_REGS_MASK = 0x00ff0000;
$SAVED_REGS_MASK = 0x00fff000 if ($flavour =~ /nubi/i);
72#
73# <appro@openssl.org>
74#
75######################################################################
76
# Consume command-line arguments until one looks like an output file name
# (word characters, a dot, then an extension). $output ends up false when
# no such argument exists.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output file was actually named, so that a
# missing name leaves the generated assembly going to the real STDOUT
# instead of an unusable handle. Three-argument open keeps the file name
# from being parsed as part of the mode, and a failed open is fatal rather
# than silently producing no output.
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
79
# Choose the BN_ULONG word size in bytes and the matching load, store,
# multiply, add and subtract mnemonics: 8-byte words for flavours matching
# "64" or "n32", 4-byte words otherwise.
{
    my %ops_for = (
        8 => [ "ld", "sd", "dmultu", "daddu", "dsubu" ],
        4 => [ "lw", "sw", "multu",  "addu",  "subu"  ],
    );
    $BNSZ = ($flavour =~ /64|n32/i) ? 8 : 4;
    ($LD, $ST, $MULTU, $ADDU, $SUBU) = @{ $ops_for{$BNSZ} };
}
95
# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                 const BN_ULONG *np, const BN_ULONG *n0, int num);
# The six arguments are taken from $a0..$a5 in declaration order.
($rp, $ap, $bp, $np, $n0, $num) = ($a0, $a1, $a2, $a3, $a4, $a5);

# Low/high halves of the two running products.
($lo0, $hi0, $lo1, $hi1) = ($a6, $a7, $t1, $t2);

# Working registers. Note that $tp is reassigned here: from this point on
# it names $s3 (used as the temporary-vector pointer), not the register it
# was given in the NUBI layout table.
($aj, $bi, $nj, $tp, $alo, $ahi, $nlo, $nhi, $tj, $i, $j, $m1) =
    ($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11);

# Stack frame size, counted in register-sized slots.
$FRAMESIZE = 14;
122
# Start the assembly template: section and assembler directives followed by
# the public entry label bn_mul_mont.
$code=<<___;
#include "mips_arch.h"

.text

.set	noat
.set	noreorder

.align	5
.globl	bn_mul_mont
.ent	bn_mul_mont
bn_mul_mont:
___

# For o32 flavours the fifth and sixth arguments (n0 and num) are loaded
# from the caller's stack at offsets 16 and 20.
if ($flavour =~ /o32/i) {
    $code.=<<___;
	lw	$n0,16($sp)
	lw	$num,20($sp)
___
}
# Remainder of the public entry point. It returns 0 (set in both $t0 and
# $a0) when num < 4 or num >= 17, and branches to bn_mul_mont_internal
# otherwise. The instruction after each branch occupies its delay slot.
$code.=<<___;
	slt	$at,$num,4
	bnez	$at,1f
	li	$t0,0
	slt	$at,$num,17	# on in-order CPU
	bnez	$at,bn_mul_mont_internal
	nop
1:	jr	$ra
	li	$a0,0
.end	bn_mul_mont

___

# Prologue of bn_mul_mont_internal: allocate $FRAMESIZE register-sized
# slots and save $fp and $s11..$s4 from the top of the frame downwards.
$code.=<<___;
.align	5
.ent	bn_mul_mont_internal
bn_mul_mont_internal:
	.frame	$fp,$FRAMESIZE*$SZREG,$ra
	.mask	0x40000000|$SAVED_REGS_MASK,-$SZREG
	$PTR_SUB $sp,$FRAMESIZE*$SZREG
	$REG_S	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_S	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_S	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_S	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_S	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_S	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_S	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_S	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_S	$s4,($FRAMESIZE-9)*$SZREG($sp)
___

# NUBI flavours additionally save $s3..$s0.
if ($flavour =~ /nubi/i) {
    $code.=<<___;
	$REG_S	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_S	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_S	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_S	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
}
# Frame setup: keep the frame base in $fp, load the n0 value and the first
# words of bp, ap and np, convert $num from a word count to a byte count,
# then reserve num bytes plus two extra words on the stack and round $sp
# down to a multiple of 4096.
$code.=<<___;
	move	$fp,$sp

	.set	reorder
	$LD	$n0,0($n0)
	$LD	$bi,0($bp)	# bp[0]
	$LD	$aj,0($ap)	# ap[0]
	$LD	$nj,0($np)	# np[0]

	$PTR_SUB $sp,2*$BNSZ	# place for two extra words
	sll	$num,`log($BNSZ)/log(2)`
	li	$at,-4096
	$PTR_SUB $sp,$num
	and	$sp,$at

___

# First pass, using bp[0]: m1 is the low word of (low word of ap[0]*bp[0])
# times n0. The .L1st loop walks ap[] and np[] with byte offset $j,
# accumulating ap[j]*bp[0] and np[j]*m1 with carries and storing result
# words through $tp into the temporary vector at $sp. The lowest sum word
# is not stored; only its carry is kept. The pass ends by storing the top
# word and the final carry.
$code.=<<___;
	$MULTU	($aj,$bi)
	$LD	$ahi,$BNSZ($ap)
	$LD	$nhi,$BNSZ($np)
	mflo	($lo0,$aj,$bi)
	mfhi	($hi0,$aj,$bi)
	$MULTU	($lo0,$n0)
	mflo	($m1,$lo0,$n0)

	$MULTU	($ahi,$bi)
	mflo	($alo,$ahi,$bi)
	mfhi	($ahi,$ahi,$bi)

	$MULTU	($nj,$m1)
	mflo	($lo1,$nj,$m1)
	mfhi	($hi1,$nj,$m1)
	$MULTU	($nhi,$m1)
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	($nlo,$nhi,$m1)
	mfhi	($nhi,$nhi,$m1)

	move	$tp,$sp
	li	$j,2*$BNSZ
.align	4
.L1st:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	($aj,$bi)
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	($alo,$aj,$bi)
	mfhi	($ahi,$aj,$bi)

	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$MULTU	($nj,$m1)
	$ADDU	$hi1,$at
	addu	$j,$BNSZ
	$ST	$lo1,($tp)
	sltu	$t0,$j,$num
	mflo	($nlo,$nj,$m1)
	mfhi	($nhi,$nj,$m1)

	bnez	$t0,.L1st
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at

	$ADDU	$lo1,$nlo,$hi1
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi1,$nhi,$t0
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at

	$ST	$lo1,($tp)

	$ADDU	$hi1,$hi0
	sltu	$at,$hi1,$hi0
	$ST	$hi1,$BNSZ($tp)
	$ST	$at,2*$BNSZ($tp)

___

# Outer loop over the remaining words of bp: $i is a byte offset that
# starts at one word and advances by one word while it is below $num.
# Each pass recomputes m1 from ap[0]*bp[i] plus the first temporary word,
# and the .Linner loop folds ap[j]*bp[i], np[j]*m1 and the previous
# temporary words (read via $tj) back into the temporary vector.
$code.=<<___;
	li	$i,$BNSZ
.align	4
.Louter:
	$PTR_ADD $bi,$bp,$i
	$LD	$bi,($bi)
	$LD	$aj,($ap)
	$LD	$ahi,$BNSZ($ap)
	$LD	$tj,($sp)

	$MULTU	($aj,$bi)
	$LD	$nj,($np)
	$LD	$nhi,$BNSZ($np)
	mflo	($lo0,$aj,$bi)
	mfhi	($hi0,$aj,$bi)
	$ADDU	$lo0,$tj
	$MULTU	($lo0,$n0)
	sltu	$at,$lo0,$tj
	$ADDU	$hi0,$at
	mflo	($m1,$lo0,$n0)

	$MULTU	($ahi,$bi)
	mflo	($alo,$ahi,$bi)
	mfhi	($ahi,$ahi,$bi)

	$MULTU	($nj,$m1)
	mflo	($lo1,$nj,$m1)
	mfhi	($hi1,$nj,$m1)

	$MULTU	($nhi,$m1)
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	($nlo,$nhi,$m1)
	mfhi	($nhi,$nhi,$m1)

	move	$tp,$sp
	li	$j,2*$BNSZ
	$LD	$tj,$BNSZ($tp)
.align	4
.Linner:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	($aj,$bi)
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	($alo,$aj,$bi)
	mfhi	($ahi,$aj,$bi)

	$ADDU	$lo0,$tj
	addu	$j,$BNSZ
	$MULTU	($nj,$m1)
	sltu	$at,$lo0,$tj
	$ADDU	$lo1,$lo0
	$ADDU	$hi0,$at
	sltu	$t0,$lo1,$lo0
	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$hi1,$t0
	sltu	$at,$j,$num
	mflo	($nlo,$nj,$m1)
	mfhi	($nhi,$nj,$m1)
	$ST	$lo1,($tp)
	bnez	$at,.Linner
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at
	$ADDU	$lo0,$tj
	sltu	$t0,$lo0,$tj
	$ADDU	$hi0,$t0

	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo1,$hi1
	$ADDU	$hi1,$nhi,$at
	$ADDU	$lo1,$lo0
	sltu	$t0,$lo1,$lo0
	$ADDU	$hi1,$t0
	$ST	$lo1,($tp)

	$ADDU	$lo1,$hi1,$hi0
	sltu	$hi1,$lo1,$hi0
	$ADDU	$lo1,$tj
	sltu	$at,$lo1,$tj
	$ADDU	$hi1,$at
	$ST	$lo1,$BNSZ($tp)
	$ST	$hi1,2*$BNSZ($tp)

	addu	$i,$BNSZ
	sltu	$t0,$i,$num
	bnez	$t0,.Louter

___

# Final reduction. .Lsub writes tp[i]-np[i] to rp[] word by word, carrying
# the borrow in $hi0. The borrow is then subtracted from the top overflow
# word in $hi1, and the result and its complement serve as AND-masks in
# .Lcopy, which merges the temporary word and the subtracted word into
# rp[] without a data-dependent branch and overwrites each temporary word
# with zero.
$code.=<<___;
	.set	noreorder
	$PTR_ADD $tj,$sp,$num	# &tp[num]
	move	$tp,$sp
	move	$ap,$sp
	li	$hi0,0		# clear borrow bit

.align	4
.Lsub:	$LD	$lo0,($tp)
	$LD	$lo1,($np)
	$PTR_ADD $tp,$BNSZ
	$PTR_ADD $np,$BNSZ
	$SUBU	$lo1,$lo0,$lo1	# tp[i]-np[i]
	sgtu	$at,$lo1,$lo0
	$SUBU	$lo0,$lo1,$hi0
	sgtu	$hi0,$lo0,$lo1
	$ST	$lo0,($rp)
	or	$hi0,$at
	sltu	$at,$tp,$tj
	bnez	$at,.Lsub
	$PTR_ADD $rp,$BNSZ

	$SUBU	$hi0,$hi1,$hi0	# handle upmost overflow bit
	move	$tp,$sp
	$PTR_SUB $rp,$num	# restore rp
	not	$hi1,$hi0

.Lcopy:	$LD	$nj,($tp)	# conditional move
	$LD	$aj,($rp)
	$ST	$zero,($tp)
	$PTR_ADD $tp,$BNSZ
	and	$nj,$hi0
	and	$aj,$hi1
	or	$aj,$nj
	sltu	$at,$tp,$tj
	$ST	$aj,($rp)
	bnez	$at,.Lcopy
	$PTR_ADD $rp,$BNSZ

___

# Set the return value 1 in both $a0 and $t0, restore $sp from $fp and
# reload $fp and $s11..$s4 from the frame.
$code.=<<___;
	li	$a0,1
	li	$t0,1

	.set	noreorder
	move	$sp,$fp
	$REG_L	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_L	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_L	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_L	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_L	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_L	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_L	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_L	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_L	$s4,($FRAMESIZE-9)*$SZREG($sp)
___
# NUBI flavours additionally reload $s3..$s0.
if ($flavour =~ /nubi/i) {
    $code.=<<___;
	$REG_L	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_L	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_L	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_L	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
}

# Return to the caller, releasing the frame in the branch delay slot, then
# close the function and emit the identification string into .rdata.
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end	bn_mul_mont_internal
.rdata
.asciiz	"Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
429
# Replace every `...` span in the template with the result of evaluating
# its contents as a Perl expression (used for the log2 shift amount).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# Buffered write errors (for example a full disk) only surface when the
# handle is closed, so a failed close must be fatal rather than leaving a
# silently truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";
434