#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
#
# Rights for redistribution and usage in source and binary forms are
# granted according to the OpenSSL license. Warranty of any kind is
# disclaimed.
# ====================================================================


# July 1999
#
# This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
#
# The module is designed to work with either of the "new" MIPS ABI(5)
# flavours, namely N32 or N64, offered by IRIX 6.x. It's not meant to
# work under IRIX 5.x, not only because 5.x doesn't support the new
# ABIs, but also because 5.x kernels put the R4x00 CPU into 32-bit
# mode, so all those 64-bit instructions (daddu, dmultu, etc.) found
# below would only cause an illegal-instruction exception:-(
#
# In addition the code depends on preprocessor flags set up by the
# MIPSpro compiler driver (either as or cc) and therefore (probably?)
# can't be compiled by the GNU assembler. The GNU C driver manages
# fine though... I mean as long as -mmips-as is specified or is the
# default option, because then it simply invokes /usr/bin/as, which in
# turn takes perfect care of the preprocessor definitions. Another
# neat feature offered by the MIPSpro assembler is an optimization
# pass. This gave me the opportunity to keep the code looking more
# regular, as all those architecture-dependent instruction
# rescheduling details were left to the assembler. Cool, huh?
#
# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
# goes way over 3 times faster!
#
#					<appro@openssl.org>

# October 2010
#
# Adapted the module for 32-bit ABIs and other OSes as well. The former
# was achieved by mechanical replacement of 64-bit arithmetic
# instructions such as dmultu, daddu, etc. with their 32-bit
# counterparts and by adjusting offsets denoting multiples of BN_ULONG.
# The above-mentioned >3x performance improvement naturally does not
# apply to 32-bit code [because there is no instruction a 32-bit
# compiler can't use]; one has to be content with a 40-85% improvement
# depending on benchmark and key length, more for longer keys.

$flavour = shift || "o32";
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
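
# A minimal usage sketch (the output file name here is hypothetical):
# the first argument selects the ABI flavour, which the code below
# matches against "64", "n32" and "nubi" and which defaults to "o32";
# the second argument names the file the generated assembly is
# written to:
#
#	perl mips.pl n32 bn-mips.s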

if ($flavour =~ /64|n32/i) {
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$DIVU="ddivu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$SRL="dsrl";
	$SLL="dsll";
	$BNSZ=8;
	$PTR_ADD="daddu";
	$PTR_SUB="dsubu";
	$SZREG=8;
	$REG_S="sd";
	$REG_L="ld";
} else {
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$DIVU="divu";
	$ADDU="addu";
	$SUBU="subu";
	$SRL="srl";
	$SLL="sll";
	$BNSZ=4;
	$PTR_ADD="addu";
	$PTR_SUB="subu";
	$SZREG=4;
	$REG_S="sw";
	$REG_L="lw";
	$code=".set	mips2\n";
}

# Below is the N32/64 register layout used in the original module.
#
($zero,$at,$v0,$v1)=map("\$$_",(0..3));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
#
# No special adaptation is required for O32. NUBI, on the other hand,
# is handled by saving/restoring ($v1,$t0..$t3).

$gp=$v1 if ($flavour =~ /nubi/i);

$minus4=$v1;
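# $minus4 doubles as the constant -4: 'and'-ing a word count with it
# rounds the count down to a multiple of 4, which is how the 4x
# unrolled loops below decide whether another full iteration is due.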

$code.=<<___;
#include "mips_arch.h"

#if defined(_MIPS_ARCH_MIPS64R6)
# define ddivu(rs,rt)
# define mfqt(rd,rs,rt)	ddivu	rd,rs,rt
# define mfrm(rd,rs,rt)	dmodu	rd,rs,rt
#elif defined(_MIPS_ARCH_MIPS32R6)
# define divu(rs,rt)
# define mfqt(rd,rs,rt)	divu	rd,rs,rt
# define mfrm(rd,rs,rt)	modu	rd,rs,rt
#else
# define $DIVU(rs,rt)	$DIVU	$zero,rs,rt
# define mfqt(rd,rs,rt)	mflo	rd
# define mfrm(rd,rs,rt)	mfhi	rd
#endif
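
/*
 * Note on the macros above: before release 6 of the MIPS architecture,
 * divu/ddivu deposit the quotient and remainder in the HI/LO register
 * pair, read back with mflo/mfhi, whereas R6 dropped HI/LO and turned
 * divu/ddivu/modu/dmodu into three-operand instructions. mfqt and mfrm
 * therefore expand to whatever "fetch quotient"/"fetch remainder"
 * means on the target, and the divide itself degenerates to a no-op
 * on R6, where it is folded into mfqt.
 */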

.rdata
.asciiz	"mips3.s, Version 1.2"
.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"

.text
.set	noat

.align	5
.globl	bn_mul_add_words
.ent	bn_mul_add_words
bn_mul_add_words:
	.set	noreorder
	bgtz	$a2,bn_mul_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words

.align	5
.ent	bn_mul_add_words_internal
bn_mul_add_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_mul_add_words_tail

.L_bn_mul_add_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,0($a0)
	$LD	$t2,$BNSZ($a1)
	$LD	$t3,$BNSZ($a0)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
				# values", but it seems to work fine
				# even on 64-bit registers.
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	 $MULTU	($t2,$a3)
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at

	$LD	$ta2,3*$BNSZ($a1)
	$LD	$ta3,3*$BNSZ($a0)
	$ADDU	$t3,$v0
	sltu	$v0,$t3,$v0
	mflo	($at,$t2,$a3)
	mfhi	($t2,$t2,$a3)
	$ADDU	$t3,$at
	$ADDU	$v0,$t2
	 $MULTU	($ta0,$a3)
	sltu	$at,$t3,$at
	$ST	$t3,$BNSZ($a0)
	$ADDU	$v0,$at

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	$ADDU	$ta1,$v0
	sltu	$v0,$ta1,$v0
	mflo	($at,$ta0,$a3)
	mfhi	($ta0,$ta0,$a3)
	$ADDU	$ta1,$at
	$ADDU	$v0,$ta0
	 $MULTU	($ta2,$a3)
	sltu	$at,$ta1,$at
	$ST	$ta1,-2*$BNSZ($a0)
	$ADDU	$v0,$at


	and	$ta0,$a2,$minus4
	$ADDU	$ta3,$v0
	sltu	$v0,$ta3,$v0
	mflo	($at,$ta2,$a3)
	mfhi	($ta2,$ta2,$a3)
	$ADDU	$ta3,$at
	$ADDU	$v0,$ta2
	sltu	$at,$ta3,$at
	$ST	$ta3,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_add_words_loop
	$ADDU	$v0,$at

	beqz	$a2,.L_bn_mul_add_words_return
	nop

.L_bn_mul_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,0($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,$BNSZ($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,2*$BNSZ($a0)
	$ADDU	$v0,$at

.L_bn_mul_add_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words_internal

.align	5
.globl	bn_mul_words
.ent	bn_mul_words
bn_mul_words:
	.set	noreorder
	bgtz	$a2,bn_mul_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words

.align	5
.ent	bn_mul_words_internal
bn_mul_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_mul_words_tail

.L_bn_mul_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	 $MULTU	($t2,$a3)
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	($at,$t2,$a3)
	mfhi	($t2,$t2,$a3)
	$ADDU	$v0,$at
	sltu	$t3,$v0,$at
	 $MULTU	($ta0,$a3)
	$ST	$v0,-3*$BNSZ($a0)
	$ADDU	$v0,$t3,$t2

	mflo	($at,$ta0,$a3)
	mfhi	($ta0,$ta0,$a3)
	$ADDU	$v0,$at
	sltu	$ta1,$v0,$at
	 $MULTU	($ta2,$a3)
	$ST	$v0,-2*$BNSZ($a0)
	$ADDU	$v0,$ta1,$ta0

	and	$ta0,$a2,$minus4
	mflo	($at,$ta2,$a3)
	mfhi	($ta2,$ta2,$a3)
	$ADDU	$v0,$at
	sltu	$ta3,$v0,$at
	$ST	$v0,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_words_loop
	$ADDU	$v0,$ta3,$ta2

	beqz	$a2,.L_bn_mul_words_return
	nop

.L_bn_mul_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	subu	$a2,1
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$a3)
	subu	$a2,1
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,$BNSZ($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$a3)
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,2*$BNSZ($a0)
	$ADDU	$v0,$t1,$t0

.L_bn_mul_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words_internal

.align	5
.globl	bn_sqr_words
.ent	bn_sqr_words
bn_sqr_words:
	.set	noreorder
	bgtz	$a2,bn_sqr_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_sqr_words

.align	5
.ent	bn_sqr_words_internal
bn_sqr_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_sqr_words_tail

.L_bn_sqr_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$t0)
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)

	$MULTU	($t2,$t2)
	subu	$a2,4
	$PTR_ADD $a0,8*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	($t3,$t2,$t2)
	mfhi	($t2,$t2,$t2)
	$ST	$t3,-6*$BNSZ($a0)
	$ST	$t2,-5*$BNSZ($a0)

	$MULTU	($ta0,$ta0)
	mflo	($ta1,$ta0,$ta0)
	mfhi	($ta0,$ta0,$ta0)
	$ST	$ta1,-4*$BNSZ($a0)
	$ST	$ta0,-3*$BNSZ($a0)


	$MULTU	($ta2,$ta2)
	and	$ta0,$a2,$minus4
	mflo	($ta3,$ta2,$ta2)
	mfhi	($ta2,$ta2,$ta2)
	$ST	$ta3,-2*$BNSZ($a0)

	.set	noreorder
	bgtz	$ta0,.L_bn_sqr_words_loop
	$ST	$ta2,-$BNSZ($a0)

	beqz	$a2,.L_bn_sqr_words_return
	nop

.L_bn_sqr_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$t0)
	subu	$a2,1
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$t0)
	subu	$a2,1
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,2*$BNSZ($a0)
	$ST	$t0,3*$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$t0)
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,4*$BNSZ($a0)
	$ST	$t0,5*$BNSZ($a0)

.L_bn_sqr_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_sqr_words_internal

.align	5
.globl	bn_add_words
.ent	bn_add_words
bn_add_words:
	.set	noreorder
	bgtz	$a3,bn_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_add_words

.align	5
.ent	bn_add_words_internal
bn_add_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_add_words_tail

.L_bn_add_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	$ADDU	$ta0,$t0
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta1,$t1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9

	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta3,$t3
	sltu	$t9,$ta3,$t3
	$ADDU	$t3,$ta3,$v0
	sltu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_add_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_add_words_return
	nop

.L_bn_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	$ADDU	$ta0,$t0
	subu	$a3,1
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_add_words_return

	$LD	$t1,$BNSZ($a1)
	$LD	$ta1,$BNSZ($a2)
	$ADDU	$ta1,$t1
	subu	$a3,1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_add_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_add_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_add_words_internal

.align	5
.globl	bn_sub_words
.ent	bn_sub_words
bn_sub_words:
	.set	noreorder
	bgtz	$a3,bn_sub_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$zero
.end	bn_sub_words

.align	5
.ent	bn_sub_words_internal
bn_sub_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_sub_words_tail

.L_bn_sub_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9


	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t3,$ta3
	$SUBU	$ta3,$t3,$ta3
	$SUBU	$t3,$ta3,$v0
	sgtu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_sub_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_sub_words_return
	nop

.L_bn_sub_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,1
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t1,$BNSZ($a1)
	subu	$a3,1
	$LD	$ta1,$BNSZ($a2)
	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_sub_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_sub_words_internal

#if 0
/*
 * The bn_div_3_words entry point is re-used for the constant-time
 * interface. The implementation is retained as a historical reference.
 */
.align 5
.globl	bn_div_3_words
.ent	bn_div_3_words
bn_div_3_words:
	.set	noreorder
	move	$a3,$a0		# we know that bn_div_words does not
				# touch $a3, $ta2, $ta3 and preserves $a2
				# so that we can save two arguments
				# and the return address in registers
				# instead of on the stack:-)

	$LD	$a0,($a3)
	move	$ta2,$a1
	bne	$a0,$a2,bn_div_3_words_internal
	$LD	$a1,-$BNSZ($a3)
	li	$v0,-1
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words

.align	5
.ent	bn_div_3_words_internal
bn_div_3_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	move	$ta3,$ra
	bal	bn_div_words_internal
	move	$ra,$ta3
	$MULTU	($ta2,$v0)
	$LD	$t2,-2*$BNSZ($a3)
	move	$ta0,$zero
	mfhi	($t1,$ta2,$v0)
	mflo	($t0,$ta2,$v0)
	sltu	$t8,$t1,$a1
.L_bn_div_3_words_inner_loop:
	bnez	$t8,.L_bn_div_3_words_inner_loop_done
	sgeu	$at,$t2,$t0
	seq	$t9,$t1,$a1
	and	$at,$t9
	sltu	$t3,$t0,$ta2
	$ADDU	$a1,$a2
	$SUBU	$t1,$t3
	$SUBU	$t0,$ta2
	sltu	$t8,$t1,$a1
	sltu	$ta0,$a1,$a2
	or	$t8,$ta0
	.set	noreorder
	beqz	$at,.L_bn_div_3_words_inner_loop
	$SUBU	$v0,1
	$ADDU	$v0,1
	.set	reorder
.L_bn_div_3_words_inner_loop_done:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words_internal
#endif

.align	5
.globl	bn_div_words
.ent	bn_div_words
bn_div_words:
	.set	noreorder
	bnez	$a2,bn_div_words_internal
	li	$v0,-1		# I would rather signal div-by-zero
				# which can be done with 'break 7'
	jr	$ra
	move	$a0,$v0
.end	bn_div_words

.align	5
.ent	bn_div_words_internal
bn_div_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	move	$v1,$zero
	bltz	$a2,.L_bn_div_words_body
	move	$t9,$v1
	$SLL	$a2,1
	bgtz	$a2,.-4
	addu	$t9,1

	.set	reorder
	negu	$t1,$t9
	li	$t2,-1
	$SLL	$t2,$t1
	and	$t2,$a0
	$SRL	$at,$a1,$t1
	.set	noreorder
	beqz	$t2,.+12
	nop
	break	6		# signal overflow
	.set	reorder
	$SLL	$a0,$t9
	$SLL	$a1,$t9
	or	$a0,$at
___
$QT=$ta0;
$HH=$ta1;
$DH=$v1;
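# Register roles in the long-division code below: $DH holds the high
# half of the divisor, $HH the high half of the current remainder, and
# $QT the quotient "digit" being estimated and then corrected in the
# inner loops.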
$code.=<<___;
.L_bn_div_words_body:
	$SRL	$DH,$a2,4*$BNSZ	# bits
	sgeu	$at,$a0,$a2
	.set	noreorder
	beqz	$at,.+12
	nop
	$SUBU	$a0,$a2
	.set	reorder

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div1
	$DIVU	($a0,$DH)
	mfqt	($QT,$a0,$DH)
.L_bn_div_words_skip_div1:
	$MULTU	($a2,$QT)
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	($t0,$a2,$QT)
	mfhi	($t1,$a2,$QT)
.L_bn_div_words_inner_loop1:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v0,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop1_done
	$SUBU	$t1,$v0
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop1
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop1_done:

	$SLL	$a1,4*$BNSZ	# bits
	$SUBU	$a0,$t3,$t0
	$SLL	$v0,$QT,4*$BNSZ	# bits

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div2
	$DIVU	($a0,$DH)
	mfqt	($QT,$a0,$DH)
.L_bn_div_words_skip_div2:
	$MULTU	($a2,$QT)
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	($t0,$a2,$QT)
	mfhi	($t1,$a2,$QT)
.L_bn_div_words_inner_loop2:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v1,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop2_done
	$SUBU	$t1,$v1
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop2
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop2_done:

	$SUBU	$a0,$t3,$t0
	or	$v0,$QT
	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
	$SRL	$a2,$t9		# restore $a2

	.set	noreorder
	move	$a1,$v1
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_words_internal
___
undef $HH; undef $QT; undef $DH;

($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);

($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2

($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);

$code.=<<___;

.align	5
.globl	bn_mul_comba8
.ent	bn_mul_comba8
bn_mul_comba8:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,12*$SZREG,$ra
	.mask	0x803ff008,-$SZREG
	$PTR_SUB $sp,12*$SZREG
	$REG_S	$ra,11*$SZREG($sp)
	$REG_S	$s5,10*$SZREG($sp)
	$REG_S	$s4,9*$SZREG($sp)
	$REG_S	$s3,8*$SZREG($sp)
	$REG_S	$s2,7*$SZREG($sp)
	$REG_S	$s1,6*$SZREG($sp)
	$REG_S	$s0,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___ if ($flavour !~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x003f0000,-$SZREG
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$s5,5*$SZREG($sp)
	$REG_S	$s4,4*$SZREG($sp)
	$REG_S	$s3,3*$SZREG($sp)
	$REG_S	$s2,2*$SZREG($sp)
	$REG_S	$s1,1*$SZREG($sp)
	$REG_S	$s0,0*$SZREG($sp)
___
$code.=<<___;

	.set	reorder
	$LD	$a_0,0($a1)	# If compiled with the -mips3 option
				# on an R5000 box, the assembler barks
				# on this line with "should not have
				# mult/div as last instruction in bb
				# (R10K bug)" warning. If anybody out
				# there has a clue about how to
				# circumvent this, do send me a note.
				#		<appro\@fy.chalmers.se>

	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	($a_0,$b_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	($c_1,$a_0,$b_0)
	mfhi	($c_2,$a_0,$b_0)

	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$MULTU	($a_0,$b_1)		# mul_add_c(a[0],b[1],c2,c3,c1);
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	$LD	$b_4,4*$BNSZ($a2)
	$LD	$b_5,5*$BNSZ($a2)
	mflo	($t_1,$a_0,$b_1)
	mfhi	($t_2,$a_0,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_0)		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	$LD	$b_6,6*$BNSZ($a2)
	$LD	$b_7,7*$BNSZ($a2)
	$ST	$c_1,0($a0)	# r[0]=c1;
	mflo	($t_1,$a_1,$b_0)
	mfhi	($t_2,$a_1,$b_0)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	($a_2,$b_0)		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)	# r[1]=c2;

	mflo	($t_1,$a_2,$b_0)
	mfhi	($t_2,$a_2,$b_0)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_1,$b_1)		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	($t_1,$a_1,$b_1)
	mfhi	($t_2,$a_1,$b_1)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$b_2)		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_0,$b_2)
	mfhi	($t_2,$a_0,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	($a_0,$b_3)		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;

	mflo	($t_1,$a_0,$b_3)
	mfhi	($t_2,$a_0,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_1,$b_2)		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	($t_1,$a_1,$b_2)
	mfhi	($t_2,$a_1,$b_2)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_2,$b_1)		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_2,$b_1)
	mfhi	($t_2,$a_2,$b_1)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_3,$b_0)		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_3,$b_0)
	mfhi	($t_2,$a_3,$b_0)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	($a_4,$b_0)		# mul_add_c(a[4],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;

	mflo	($t_1,$a_4,$b_0)
	mfhi	($t_2,$a_4,$b_0)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_3,$b_1)		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	($t_1,$a_3,$b_1)
	mfhi	($t_2,$a_3,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_2)		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_2,$b_2)
	mfhi	($t_2,$a_2,$b_2)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_3)		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_1,$b_3)
	mfhi	($t_2,$a_1,$b_3)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_0,$b_4)		# mul_add_c(a[0],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_0,$b_4)
	mfhi	($t_2,$a_0,$b_4)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	($a_0,$b_5)		# mul_add_c(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;

	mflo	($t_1,$a_0,$b_5)
	mfhi	($t_2,$a_0,$b_5)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_1,$b_4)		# mul_add_c(a[1],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_1,$b_4)
	mfhi	($t_2,$a_1,$b_4)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_2,$b_3)		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	($t_1,$a_2,$b_3)
	mfhi	($t_2,$a_2,$b_3)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_3,$b_2)		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	($t_1,$a_3,$b_2)
	mfhi	($t_2,$a_3,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_4,$b_1)		# mul_add_c(a[4],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	($t_1,$a_4,$b_1)
	mfhi	($t_2,$a_4,$b_1)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_5,$b_0)		# mul_add_c(a[5],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	($t_1,$a_5,$b_0)
	mfhi	($t_2,$a_5,$b_0)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	($a_6,$b_0)		# mul_add_c(a[6],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;

	mflo	($t_1,$a_6,$b_0)
	mfhi	($t_2,$a_6,$b_0)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_5,$b_1)		# mul_add_c(a[5],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	($t_1,$a_5,$b_1)
	mfhi	($t_2,$a_5,$b_1)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_4,$b_2)		# mul_add_c(a[4],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_4,$b_2)
	mfhi	($t_2,$a_4,$b_2)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_3,$b_3)		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_3,$b_3)
	mfhi	($t_2,$a_3,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_2,$b_4)		# mul_add_c(a[2],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_2,$b_4)
	mfhi	($t_2,$a_2,$b_4)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_1,$b_5)		# mul_add_c(a[1],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_1,$b_5)
	mfhi	($t_2,$a_1,$b_5)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_0,$b_6)		# mul_add_c(a[0],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_0,$b_6)
	mfhi	($t_2,$a_0,$b_6)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	($a_0,$b_7)		# mul_add_c(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;

	mflo	($t_1,$a_0,$b_7)
	mfhi	($t_2,$a_0,$b_7)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_6)		# mul_add_c(a[1],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	($t_1,$a_1,$b_6)
	mfhi	($t_2,$a_1,$b_6)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_5)		# mul_add_c(a[2],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_2,$b_5)
	mfhi	($t_2,$a_2,$b_5)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_3,$b_4)		# mul_add_c(a[3],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_3,$b_4)
	mfhi	($t_2,$a_3,$b_4)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_4,$b_3)		# mul_add_c(a[4],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_4,$b_3)
	mfhi	($t_2,$a_4,$b_3)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_5,$b_2)		# mul_add_c(a[5],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_5,$b_2)
	mfhi	($t_2,$a_5,$b_2)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_6,$b_1)		# mul_add_c(a[6],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_6,$b_1)
	mfhi	($t_2,$a_6,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_7,$b_0)		# mul_add_c(a[7],b[0],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_7,$b_0)
	mfhi	($t_2,$a_7,$b_0)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	($a_7,$b_1)		# mul_add_c(a[7],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;

	mflo	($t_1,$a_7,$b_1)
	mfhi	($t_2,$a_7,$b_1)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_6,$b_2)		# mul_add_c(a[6],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_6,$b_2)
	mfhi	($t_2,$a_6,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_5,$b_3)		# mul_add_c(a[5],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	($t_1,$a_5,$b_3)
	mfhi	($t_2,$a_5,$b_3)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_4,$b_4)		# mul_add_c(a[4],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	($t_1,$a_4,$b_4)
	mfhi	($t_2,$a_4,$b_4)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_3,$b_5)		# mul_add_c(a[3],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	($t_1,$a_3,$b_5)
	mfhi	($t_2,$a_3,$b_5)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_2,$b_6)		# mul_add_c(a[2],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	($t_1,$a_2,$b_6)
	mfhi	($t_2,$a_2,$b_6)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_1,$b_7)		# mul_add_c(a[1],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	($t_1,$a_1,$b_7)
	mfhi	($t_2,$a_1,$b_7)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	($a_2,$b_7)		# mul_add_c(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;

	mflo	($t_1,$a_2,$b_7)
	mfhi	($t_2,$a_2,$b_7)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_3,$b_6)		# mul_add_c(a[3],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	($t_1,$a_3,$b_6)
	mfhi	($t_2,$a_3,$b_6)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_4,$b_5)		# mul_add_c(a[4],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_4,$b_5)
	mfhi	($t_2,$a_4,$b_5)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_5,$b_4)		# mul_add_c(a[5],b[4],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_5,$b_4)
	mfhi	($t_2,$a_5,$b_4)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_6,$b_3)		# mul_add_c(a[6],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_6,$b_3)
	mfhi	($t_2,$a_6,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_7,$b_2)		# mul_add_c(a[7],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_7,$b_2)
	mfhi	($t_2,$a_7,$b_2)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	($a_7,$b_3)		# mul_add_c(a[7],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;

	mflo	($t_1,$a_7,$b_3)
	mfhi	($t_2,$a_7,$b_3)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_6,$b_4)		# mul_add_c(a[6],b[4],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	($t_1,$a_6,$b_4)
	mfhi	($t_2,$a_6,$b_4)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_5,$b_5)		# mul_add_c(a[5],b[5],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_5,$b_5)
	mfhi	($t_2,$a_5,$b_5)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_4,$b_6)		# mul_add_c(a[4],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_4,$b_6)
	mfhi	($t_2,$a_4,$b_6)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_3,$b_7)		# mul_add_c(a[3],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_3,$b_7)
	mfhi	($t_2,$a_3,$b_7)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_4,$b_7)		# mul_add_c(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;

	mflo	($t_1,$a_4,$b_7)
	mfhi	($t_2,$a_4,$b_7)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_5,$b_6)		# mul_add_c(a[5],b[6],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_5,$b_6)
	mfhi	($t_2,$a_5,$b_6)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_6,$b_5)		# mul_add_c(a[6],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	($t_1,$a_6,$b_5)
	mfhi	($t_2,$a_6,$b_5)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_7,$b_4)		# mul_add_c(a[7],b[4],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	mflo	($t_1,$a_7,$b_4)
	mfhi	($t_2,$a_7,$b_4)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	($a_7,$b_5)		# mul_add_c(a[7],b[5],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;

	mflo	($t_1,$a_7,$b_5)
	mfhi	($t_2,$a_7,$b_5)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_6,$b_6)		# mul_add_c(a[6],b[6],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	($t_1,$a_6,$b_6)
	mfhi	($t_2,$a_6,$b_6)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_5,$b_7)		# mul_add_c(a[5],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_5,$b_7)
	mfhi	($t_2,$a_5,$b_7)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	($a_6,$b_7)		# mul_add_c(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;

	mflo	($t_1,$a_6,$b_7)
	mfhi	($t_2,$a_6,$b_7)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_7,$b_6)		# mul_add_c(a[7],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	($t_1,$a_7,$b_6)
	mfhi	($t_2,$a_7,$b_6)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_7,$b_7)		# mul_add_c(a[7],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;

	mflo	($t_1,$a_7,$b_7)
	mfhi	($t_2,$a_7,$b_7)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s5,10*$SZREG($sp)
	$REG_L	$s4,9*$SZREG($sp)
	$REG_L	$s3,8*$SZREG($sp)
	$REG_L	$s2,7*$SZREG($sp)
	$REG_L	$s1,6*$SZREG($sp)
	$REG_L	$s0,5*$SZREG($sp)
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,12*$SZREG
___
$code.=<<___ if ($flavour !~ /nubi/i);
	$REG_L	$s5,5*$SZREG($sp)
	$REG_L	$s4,4*$SZREG($sp)
	$REG_L	$s3,3*$SZREG($sp)
	$REG_L	$s2,2*$SZREG($sp)
	$REG_L	$s1,1*$SZREG($sp)
	$REG_L	$s0,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
.end	bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	($a_0,$b_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	($c_1,$a_0,$b_0)
	mfhi	($c_2,$a_0,$b_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$b_1)		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$b_1)
	mfhi	($t_2,$a_0,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_0)		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	($t_1,$a_1,$b_0)
	mfhi	($t_2,$a_1,$b_0)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	($a_2,$b_0)		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)

	mflo	($t_1,$a_2,$b_0)
	mfhi	($t_2,$a_2,$b_0)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_1,$b_1)		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	($t_1,$a_1,$b_1)
	mfhi	($t_2,$a_1,$b_1)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$b_2)		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_0,$b_2)
	mfhi	($t_2,$a_0,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	($a_0,$b_3)		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	($t_1,$a_0,$b_3)
	mfhi	($t_2,$a_0,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_1,$b_2)		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	($t_1,$a_1,$b_2)
	mfhi	($t_2,$a_1,$b_2)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_2,$b_1)		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_2,$b_1)
	mfhi	($t_2,$a_2,$b_1)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_3,$b_0)		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_3,$b_0)
	mfhi	($t_2,$a_3,$b_0)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	($a_3,$b_1)		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	($t_1,$a_3,$b_1)
	mfhi	($t_2,$a_3,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_2)		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	($t_1,$a_2,$b_2)
	mfhi	($t_2,$a_2,$b_2)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_3)		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_1,$b_3)
	mfhi	($t_2,$a_1,$b_3)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	($a_2,$b_3)		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	($t_1,$a_2,$b_3)
	mfhi	($t_2,$a_2,$b_3)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_3,$b_2)		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_3,$b_2)
	mfhi	($t_2,$a_3,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	($a_3,$b_3)		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	($t_1,$a_3,$b_3)
	mfhi	($t_2,$a_3,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_mul_comba4
___

($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);

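# add_c2 emits the inner step of the squaring routines below: it
# accumulates the doubled product 2*($hi:$lo) into the running carry
# chain ($c0,$c1,$c2), propagating both possible carries, and issues
# the *next* multiplication ($an,$bn) early so that it overlaps with
# the carry arithmetic.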
sub add_c2 () {
my ($hi,$lo,$c0,$c1,$c2,
    $warm,      # !$warm denotes the first call with a specific sequence
                # of $c_[XYZ] when there is no Z-carry to accumulate yet;
    $an,$bn     # these two are the arguments for the multiplication
                # whose result is used in the *next* step [which is why
                # it's commented as "forward multiplication" below];
    )=@_;
$code.=<<___;
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	 $MULTU	($an,$bn)		# forward multiplication
	$ADDU	$c0,$lo
	$ADDU	$at,$hi
	sltu	$lo,$c0,$lo
	$ADDU	$c1,$at
	$ADDU	$hi,$lo
___
$code.=<<___	if (!$warm);
	sltu	$c2,$c1,$at
	$ADDU	$c1,$hi
___
$code.=<<___	if ($warm);
	sltu	$at,$c1,$at
	$ADDU	$c1,$hi
	$ADDU	$c2,$at
___
$code.=<<___;
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
	mflo	($lo,$an,$bn)
	mfhi	($hi,$an,$bn)
___
}

$code.=<<___;

.align	5
.globl	bn_sqr_comba8
.ent	bn_sqr_comba8
bn_sqr_comba8:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)

	$MULTU	($a_0,$a_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	mflo	($c_1,$a_0,$a_0)
	mfhi	($c_2,$a_0,$a_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$a_1)		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$a_1)
	mfhi	($t_2,$a_0,$a_1)
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	 $MULTU	($a_2,$a_0)		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
	mflo	($t_1,$a_2,$a_0)
	mfhi	($t_2,$a_2,$a_0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	($a_0,$a_3)		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_3)
	mfhi	($t_2,$a_0,$a_3)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	($a_0,$a_5)		# mul_add_c2(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_5)
	mfhi	($t_2,$a_0,$a_5)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	($a_0,$a_7)		# mul_add_c2(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_7)
	mfhi	($t_2,$a_0,$a_7)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,7*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	($a_2,$a_7)		# mul_add_c2(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)
	mflo	($t_1,$a_2,$a_7)
	mfhi	($t_2,$a_2,$a_7)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,9*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	($a_4,$a_7)		# mul_add_c2(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)
	mflo	($t_1,$a_4,$a_7)
	mfhi	($t_2,$a_4,$a_7)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,11*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
$code.=<<___;
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	 $MULTU	($a_6,$a_7)		# mul_add_c2(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)
	mflo	($t_1,$a_6,$a_7)
	mfhi	($t_2,$a_6,$a_7)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,13*$BNSZ($a0)

	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)
	$ST	$c_1,15*$BNSZ($a0)

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba8

.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$MULTU	($a_0,$a_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)
	mflo	($c_1,$a_0,$a_0)
	mfhi	($c_2,$a_0,$a_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$a_1)		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$a_1)
	mfhi	($t_2,$a_0,$a_1)
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	 $MULTU	($a_2,$a_0)		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
	mflo	($t_1,$a_2,$a_0)
	mfhi	($t_2,$a_2,$a_0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	 $MULTU	($a_0,$a_3)		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_3)
	mfhi	($t_2,$a_0,$a_3)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	 $MULTU	($a_2,$a_3)		# mul_add_c2(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
	mflo	($t_1,$a_2,$a_3)
	mfhi	($t_2,$a_2,$a_3)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)

	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba4
___
print $code;
close STDOUT;
