1#! /usr/bin/env perl
2# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project.
13#
14# Rights for redistribution and usage in source and binary forms are
15# granted according to the OpenSSL license. Warranty of any kind is
16# disclaimed.
17# ====================================================================
18
19
20# July 1999
21#
22# This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
23#
24# The module is designed to work with either of the "new" MIPS ABIs,
25# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
26# IRIX 5.x, not only because 5.x doesn't support the new ABIs, but also
27# because 5.x kernels put the R4x00 CPU into 32-bit mode, so all the
28# 64-bit instructions (daddu, dmultu, etc.) found below would only
29# raise an illegal-instruction exception:-(
30#
31# In addition the code depends on preprocessor flags set up by the
32# MIPSpro compiler driver (either as or cc) and therefore (probably?)
33# can't be compiled by the GNU assembler. The GNU C driver manages
34# fine though, as long as -mmips-as is specified or is the default
35# option, because then it simply invokes /usr/bin/as, which in turn
36# takes perfect care of the preprocessor definitions. Another neat
37# feature offered by the MIPSpro assembler is an optimization pass.
38# This gave me the opportunity to make the code look more regular, as
39# all the architecture-dependent instruction rescheduling details were
40# left to the assembler. Cool, huh?
41#
42# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
43# goes way over 3 times faster!
44#
45#					<appro@openssl.org>
46
47# October 2010
48#
49# Adapt the module even for 32-bit ABIs and other OSes. The former was
50# achieved by mechanical replacement of 64-bit arithmetic instructions
51# such as dmultu, daddu, etc. with their 32-bit counterparts and
52# adjusting offsets denoting multiples of BN_ULONG. The above-mentioned
53# >3x performance improvement naturally does not apply to 32-bit code
54# [because there is no instruction a 32-bit compiler can't use]; one
55# has to be content with a 40-85% improvement, depending on benchmark
56# and key length, more for longer keys.
57
58$flavour = shift || "o32";
59while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
60open STDOUT,">$output" or die "can't open $output: $!";
61
62if ($flavour =~ /64|n32/i) {
63	$LD="ld";
64	$ST="sd";
65	$MULTU="dmultu";
66	$DIVU="ddivu";
67	$ADDU="daddu";
68	$SUBU="dsubu";
69	$SRL="dsrl";
70	$SLL="dsll";
71	$BNSZ=8;
72	$PTR_ADD="daddu";
73	$PTR_SUB="dsubu";
74	$SZREG=8;
75	$REG_S="sd";
76	$REG_L="ld";
77} else {
78	$LD="lw";
79	$ST="sw";
80	$MULTU="multu";
81	$DIVU="divu";
82	$ADDU="addu";
83	$SUBU="subu";
84	$SRL="srl";
85	$SLL="sll";
86	$BNSZ=4;
87	$PTR_ADD="addu";
88	$PTR_SUB="subu";
89	$SZREG=4;
90	$REG_S="sw";
91	$REG_L="lw";
92	$code=".set	mips2\n";
93}
94
95# Below is the N32/64 register layout used in the original module.
96#
97($zero,$at,$v0,$v1)=map("\$$_",(0..3));
98($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
99($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
100($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
101($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
102($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
103#
104# No special adaptation is required for O32. NUBI, on the other hand,
105# is handled by saving/restoring ($v1,$t0..$t3).
106
107$gp=$v1 if ($flavour =~ /nubi/i);
108
109$minus4=$v1;
110
111$code.=<<___;
112#include "mips_arch.h"
113
114#if defined(_MIPS_ARCH_MIPS64R6)
115# define ddivu(rs,rt)
116# define mfqt(rd,rs,rt)	ddivu	rd,rs,rt
117# define mfrm(rd,rs,rt)	dmodu	rd,rs,rt
118#elif defined(_MIPS_ARCH_MIPS32R6)
119# define divu(rs,rt)
120# define mfqt(rd,rs,rt)	divu	rd,rs,rt
121# define mfrm(rd,rs,rt)	modu	rd,rs,rt
122#else
123# define $DIVU(rs,rt)	$DIVU	$zero,rs,rt
124# define mfqt(rd,rs,rt)	mflo	rd
125# define mfrm(rd,rs,rt)	mfhi	rd
126#endif
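	# On MIPS32R6/MIPS64R6 there are no hi/lo registers: the three-operand
	# divu/ddivu delivers the quotient directly and modu/dmodu the
	# remainder. The macros above therefore turn the bare divide into a
	# no-op and let mfqt/mfrm issue the real instructions, while on older
	# ISAs the divide itself writes hi/lo and mfqt/mfrm collapse to plain
	# mflo/mfhi. (mips_arch.h is expected to provide the matching
	# multu/mflo/mfhi wrappers for R6.)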
127
128.rdata
129.asciiz	"mips3.s, Version 1.2"
130.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
131
132.text
133.set	noat
134
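	# bn_mul_add_words(rp,ap,num,w): rp[i] += ap[i]*w for i<num, with
	# carry propagation; the final carry is returned in $v0. The main
	# loop is unrolled four-fold, and the _tail code picks up the
	# remaining 1..3 words.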
135.align	5
136.globl	bn_mul_add_words
137.ent	bn_mul_add_words
138bn_mul_add_words:
139	.set	noreorder
140	bgtz	$a2,bn_mul_add_words_internal
141	move	$v0,$zero
142	jr	$ra
143	move	$a0,$v0
144.end	bn_mul_add_words
145
146.align	5
147.ent	bn_mul_add_words_internal
148bn_mul_add_words_internal:
149___
150$code.=<<___ if ($flavour =~ /nubi/i);
151	.frame	$sp,6*$SZREG,$ra
152	.mask	0x8000f008,-$SZREG
153	.set	noreorder
154	$PTR_SUB $sp,6*$SZREG
155	$REG_S	$ra,5*$SZREG($sp)
156	$REG_S	$t3,4*$SZREG($sp)
157	$REG_S	$t2,3*$SZREG($sp)
158	$REG_S	$t1,2*$SZREG($sp)
159	$REG_S	$t0,1*$SZREG($sp)
160	$REG_S	$gp,0*$SZREG($sp)
161___
162$code.=<<___;
163	.set	reorder
164	li	$minus4,-4
165	and	$ta0,$a2,$minus4
166	beqz	$ta0,.L_bn_mul_add_words_tail
167
168.L_bn_mul_add_words_loop:
169	$LD	$t0,0($a1)
170	$MULTU	($t0,$a3)
171	$LD	$t1,0($a0)
172	$LD	$t2,$BNSZ($a1)
173	$LD	$t3,$BNSZ($a0)
174	$LD	$ta0,2*$BNSZ($a1)
175	$LD	$ta1,2*$BNSZ($a0)
176	$ADDU	$t1,$v0
177	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
178				# values", but it seems to work fine
179				# even on 64-bit registers.
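				# (Worked example with 8-bit words:
				# 0xF0+0x20 = 0x110 truncates to 0x10,
				# and 0x10 < 0x20 flags the carry.)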
180	mflo	($at,$t0,$a3)
181	mfhi	($t0,$t0,$a3)
182	$ADDU	$t1,$at
183	$ADDU	$v0,$t0
184	 $MULTU	($t2,$a3)
185	sltu	$at,$t1,$at
186	$ST	$t1,0($a0)
187	$ADDU	$v0,$at
188
189	$LD	$ta2,3*$BNSZ($a1)
190	$LD	$ta3,3*$BNSZ($a0)
191	$ADDU	$t3,$v0
192	sltu	$v0,$t3,$v0
193	mflo	($at,$t2,$a3)
194	mfhi	($t2,$t2,$a3)
195	$ADDU	$t3,$at
196	$ADDU	$v0,$t2
197	 $MULTU	($ta0,$a3)
198	sltu	$at,$t3,$at
199	$ST	$t3,$BNSZ($a0)
200	$ADDU	$v0,$at
201
202	subu	$a2,4
203	$PTR_ADD $a0,4*$BNSZ
204	$PTR_ADD $a1,4*$BNSZ
205	$ADDU	$ta1,$v0
206	sltu	$v0,$ta1,$v0
207	mflo	($at,$ta0,$a3)
208	mfhi	($ta0,$ta0,$a3)
209	$ADDU	$ta1,$at
210	$ADDU	$v0,$ta0
211	 $MULTU	($ta2,$a3)
212	sltu	$at,$ta1,$at
213	$ST	$ta1,-2*$BNSZ($a0)
214	$ADDU	$v0,$at
215
216
217	and	$ta0,$a2,$minus4
218	$ADDU	$ta3,$v0
219	sltu	$v0,$ta3,$v0
220	mflo	($at,$ta2,$a3)
221	mfhi	($ta2,$ta2,$a3)
222	$ADDU	$ta3,$at
223	$ADDU	$v0,$ta2
224	sltu	$at,$ta3,$at
225	$ST	$ta3,-$BNSZ($a0)
226	.set	noreorder
227	bgtz	$ta0,.L_bn_mul_add_words_loop
228	$ADDU	$v0,$at
229
230	beqz	$a2,.L_bn_mul_add_words_return
231	nop
232
233.L_bn_mul_add_words_tail:
234	.set	reorder
235	$LD	$t0,0($a1)
236	$MULTU	($t0,$a3)
237	$LD	$t1,0($a0)
238	subu	$a2,1
239	$ADDU	$t1,$v0
240	sltu	$v0,$t1,$v0
241	mflo	($at,$t0,$a3)
242	mfhi	($t0,$t0,$a3)
243	$ADDU	$t1,$at
244	$ADDU	$v0,$t0
245	sltu	$at,$t1,$at
246	$ST	$t1,0($a0)
247	$ADDU	$v0,$at
248	beqz	$a2,.L_bn_mul_add_words_return
249
250	$LD	$t0,$BNSZ($a1)
251	$MULTU	($t0,$a3)
252	$LD	$t1,$BNSZ($a0)
253	subu	$a2,1
254	$ADDU	$t1,$v0
255	sltu	$v0,$t1,$v0
256	mflo	($at,$t0,$a3)
257	mfhi	($t0,$t0,$a3)
258	$ADDU	$t1,$at
259	$ADDU	$v0,$t0
260	sltu	$at,$t1,$at
261	$ST	$t1,$BNSZ($a0)
262	$ADDU	$v0,$at
263	beqz	$a2,.L_bn_mul_add_words_return
264
265	$LD	$t0,2*$BNSZ($a1)
266	$MULTU	($t0,$a3)
267	$LD	$t1,2*$BNSZ($a0)
268	$ADDU	$t1,$v0
269	sltu	$v0,$t1,$v0
270	mflo	($at,$t0,$a3)
271	mfhi	($t0,$t0,$a3)
272	$ADDU	$t1,$at
273	$ADDU	$v0,$t0
274	sltu	$at,$t1,$at
275	$ST	$t1,2*$BNSZ($a0)
276	$ADDU	$v0,$at
277
278.L_bn_mul_add_words_return:
279	.set	noreorder
280___
281$code.=<<___ if ($flavour =~ /nubi/i);
282	$REG_L	$t3,4*$SZREG($sp)
283	$REG_L	$t2,3*$SZREG($sp)
284	$REG_L	$t1,2*$SZREG($sp)
285	$REG_L	$t0,1*$SZREG($sp)
286	$REG_L	$gp,0*$SZREG($sp)
287	$PTR_ADD $sp,6*$SZREG
288___
289$code.=<<___;
290	jr	$ra
291	move	$a0,$v0
292.end	bn_mul_add_words_internal
293
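	# bn_mul_words(rp,ap,num,w): rp[i] = low word of ap[i]*w + carry,
	# i.e. rp[] = ap[]*w; the final carry is returned in $v0.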
294.align	5
295.globl	bn_mul_words
296.ent	bn_mul_words
297bn_mul_words:
298	.set	noreorder
299	bgtz	$a2,bn_mul_words_internal
300	move	$v0,$zero
301	jr	$ra
302	move	$a0,$v0
303.end	bn_mul_words
304
305.align	5
306.ent	bn_mul_words_internal
307bn_mul_words_internal:
308___
309$code.=<<___ if ($flavour =~ /nubi/i);
310	.frame	$sp,6*$SZREG,$ra
311	.mask	0x8000f008,-$SZREG
312	.set	noreorder
313	$PTR_SUB $sp,6*$SZREG
314	$REG_S	$ra,5*$SZREG($sp)
315	$REG_S	$t3,4*$SZREG($sp)
316	$REG_S	$t2,3*$SZREG($sp)
317	$REG_S	$t1,2*$SZREG($sp)
318	$REG_S	$t0,1*$SZREG($sp)
319	$REG_S	$gp,0*$SZREG($sp)
320___
321$code.=<<___;
322	.set	reorder
323	li	$minus4,-4
324	and	$ta0,$a2,$minus4
325	beqz	$ta0,.L_bn_mul_words_tail
326
327.L_bn_mul_words_loop:
328	$LD	$t0,0($a1)
329	$MULTU	($t0,$a3)
330	$LD	$t2,$BNSZ($a1)
331	$LD	$ta0,2*$BNSZ($a1)
332	$LD	$ta2,3*$BNSZ($a1)
333	mflo	($at,$t0,$a3)
334	mfhi	($t0,$t0,$a3)
335	$ADDU	$v0,$at
336	sltu	$t1,$v0,$at
337	 $MULTU	($t2,$a3)
338	$ST	$v0,0($a0)
339	$ADDU	$v0,$t1,$t0
340
341	subu	$a2,4
342	$PTR_ADD $a0,4*$BNSZ
343	$PTR_ADD $a1,4*$BNSZ
344	mflo	($at,$t2,$a3)
345	mfhi	($t2,$t2,$a3)
346	$ADDU	$v0,$at
347	sltu	$t3,$v0,$at
348	 $MULTU	($ta0,$a3)
349	$ST	$v0,-3*$BNSZ($a0)
350	$ADDU	$v0,$t3,$t2
351
352	mflo	($at,$ta0,$a3)
353	mfhi	($ta0,$ta0,$a3)
354	$ADDU	$v0,$at
355	sltu	$ta1,$v0,$at
356	 $MULTU	($ta2,$a3)
357	$ST	$v0,-2*$BNSZ($a0)
358	$ADDU	$v0,$ta1,$ta0
359
360	and	$ta0,$a2,$minus4
361	mflo	($at,$ta2,$a3)
362	mfhi	($ta2,$ta2,$a3)
363	$ADDU	$v0,$at
364	sltu	$ta3,$v0,$at
365	$ST	$v0,-$BNSZ($a0)
366	.set	noreorder
367	bgtz	$ta0,.L_bn_mul_words_loop
368	$ADDU	$v0,$ta3,$ta2
369
370	beqz	$a2,.L_bn_mul_words_return
371	nop
372
373.L_bn_mul_words_tail:
374	.set	reorder
375	$LD	$t0,0($a1)
376	$MULTU	($t0,$a3)
377	subu	$a2,1
378	mflo	($at,$t0,$a3)
379	mfhi	($t0,$t0,$a3)
380	$ADDU	$v0,$at
381	sltu	$t1,$v0,$at
382	$ST	$v0,0($a0)
383	$ADDU	$v0,$t1,$t0
384	beqz	$a2,.L_bn_mul_words_return
385
386	$LD	$t0,$BNSZ($a1)
387	$MULTU	($t0,$a3)
388	subu	$a2,1
389	mflo	($at,$t0,$a3)
390	mfhi	($t0,$t0,$a3)
391	$ADDU	$v0,$at
392	sltu	$t1,$v0,$at
393	$ST	$v0,$BNSZ($a0)
394	$ADDU	$v0,$t1,$t0
395	beqz	$a2,.L_bn_mul_words_return
396
397	$LD	$t0,2*$BNSZ($a1)
398	$MULTU	($t0,$a3)
399	mflo	($at,$t0,$a3)
400	mfhi	($t0,$t0,$a3)
401	$ADDU	$v0,$at
402	sltu	$t1,$v0,$at
403	$ST	$v0,2*$BNSZ($a0)
404	$ADDU	$v0,$t1,$t0
405
406.L_bn_mul_words_return:
407	.set	noreorder
408___
409$code.=<<___ if ($flavour =~ /nubi/i);
410	$REG_L	$t3,4*$SZREG($sp)
411	$REG_L	$t2,3*$SZREG($sp)
412	$REG_L	$t1,2*$SZREG($sp)
413	$REG_L	$t0,1*$SZREG($sp)
414	$REG_L	$gp,0*$SZREG($sp)
415	$PTR_ADD $sp,6*$SZREG
416___
417$code.=<<___;
418	jr	$ra
419	move	$a0,$v0
420.end	bn_mul_words_internal
421
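	# bn_sqr_words(rp,ap,num): rp[2*i] and rp[2*i+1] receive the low and
	# high words of ap[i]^2; there are no cross products and hence no
	# carry chain.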
422.align	5
423.globl	bn_sqr_words
424.ent	bn_sqr_words
425bn_sqr_words:
426	.set	noreorder
427	bgtz	$a2,bn_sqr_words_internal
428	move	$v0,$zero
429	jr	$ra
430	move	$a0,$v0
431.end	bn_sqr_words
432
433.align	5
434.ent	bn_sqr_words_internal
435bn_sqr_words_internal:
436___
437$code.=<<___ if ($flavour =~ /nubi/i);
438	.frame	$sp,6*$SZREG,$ra
439	.mask	0x8000f008,-$SZREG
440	.set	noreorder
441	$PTR_SUB $sp,6*$SZREG
442	$REG_S	$ra,5*$SZREG($sp)
443	$REG_S	$t3,4*$SZREG($sp)
444	$REG_S	$t2,3*$SZREG($sp)
445	$REG_S	$t1,2*$SZREG($sp)
446	$REG_S	$t0,1*$SZREG($sp)
447	$REG_S	$gp,0*$SZREG($sp)
448___
449$code.=<<___;
450	.set	reorder
451	li	$minus4,-4
452	and	$ta0,$a2,$minus4
453	beqz	$ta0,.L_bn_sqr_words_tail
454
455.L_bn_sqr_words_loop:
456	$LD	$t0,0($a1)
457	$MULTU	($t0,$t0)
458	$LD	$t2,$BNSZ($a1)
459	$LD	$ta0,2*$BNSZ($a1)
460	$LD	$ta2,3*$BNSZ($a1)
461	mflo	($t1,$t0,$t0)
462	mfhi	($t0,$t0,$t0)
463	$ST	$t1,0($a0)
464	$ST	$t0,$BNSZ($a0)
465
466	$MULTU	($t2,$t2)
467	subu	$a2,4
468	$PTR_ADD $a0,8*$BNSZ
469	$PTR_ADD $a1,4*$BNSZ
470	mflo	($t3,$t2,$t2)
471	mfhi	($t2,$t2,$t2)
472	$ST	$t3,-6*$BNSZ($a0)
473	$ST	$t2,-5*$BNSZ($a0)
474
475	$MULTU	($ta0,$ta0)
476	mflo	($ta1,$ta0,$ta0)
477	mfhi	($ta0,$ta0,$ta0)
478	$ST	$ta1,-4*$BNSZ($a0)
479	$ST	$ta0,-3*$BNSZ($a0)
480
481
482	$MULTU	($ta2,$ta2)
483	and	$ta0,$a2,$minus4
484	mflo	($ta3,$ta2,$ta2)
485	mfhi	($ta2,$ta2,$ta2)
486	$ST	$ta3,-2*$BNSZ($a0)
487
488	.set	noreorder
489	bgtz	$ta0,.L_bn_sqr_words_loop
490	$ST	$ta2,-$BNSZ($a0)
491
492	beqz	$a2,.L_bn_sqr_words_return
493	nop
494
495.L_bn_sqr_words_tail:
496	.set	reorder
497	$LD	$t0,0($a1)
498	$MULTU	($t0,$t0)
499	subu	$a2,1
500	mflo	($t1,$t0,$t0)
501	mfhi	($t0,$t0,$t0)
502	$ST	$t1,0($a0)
503	$ST	$t0,$BNSZ($a0)
504	beqz	$a2,.L_bn_sqr_words_return
505
506	$LD	$t0,$BNSZ($a1)
507	$MULTU	($t0,$t0)
508	subu	$a2,1
509	mflo	($t1,$t0,$t0)
510	mfhi	($t0,$t0,$t0)
511	$ST	$t1,2*$BNSZ($a0)
512	$ST	$t0,3*$BNSZ($a0)
513	beqz	$a2,.L_bn_sqr_words_return
514
515	$LD	$t0,2*$BNSZ($a1)
516	$MULTU	($t0,$t0)
517	mflo	($t1,$t0,$t0)
518	mfhi	($t0,$t0,$t0)
519	$ST	$t1,4*$BNSZ($a0)
520	$ST	$t0,5*$BNSZ($a0)
521
522.L_bn_sqr_words_return:
523	.set	noreorder
524___
525$code.=<<___ if ($flavour =~ /nubi/i);
526	$REG_L	$t3,4*$SZREG($sp)
527	$REG_L	$t2,3*$SZREG($sp)
528	$REG_L	$t1,2*$SZREG($sp)
529	$REG_L	$t0,1*$SZREG($sp)
530	$REG_L	$gp,0*$SZREG($sp)
531	$PTR_ADD $sp,6*$SZREG
532___
533$code.=<<___;
534	jr	$ra
535	move	$a0,$v0
536
537.end	bn_sqr_words_internal
538
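	# bn_add_words(rp,ap,bp,num): rp[] = ap[] + bp[]; the final carry is
	# returned in $v0.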
539.align	5
540.globl	bn_add_words
541.ent	bn_add_words
542bn_add_words:
543	.set	noreorder
544	bgtz	$a3,bn_add_words_internal
545	move	$v0,$zero
546	jr	$ra
547	move	$a0,$v0
548.end	bn_add_words
549
550.align	5
551.ent	bn_add_words_internal
552bn_add_words_internal:
553___
554$code.=<<___ if ($flavour =~ /nubi/i);
555	.frame	$sp,6*$SZREG,$ra
556	.mask	0x8000f008,-$SZREG
557	.set	noreorder
558	$PTR_SUB $sp,6*$SZREG
559	$REG_S	$ra,5*$SZREG($sp)
560	$REG_S	$t3,4*$SZREG($sp)
561	$REG_S	$t2,3*$SZREG($sp)
562	$REG_S	$t1,2*$SZREG($sp)
563	$REG_S	$t0,1*$SZREG($sp)
564	$REG_S	$gp,0*$SZREG($sp)
565___
566$code.=<<___;
567	.set	reorder
568	li	$minus4,-4
569	and	$at,$a3,$minus4
570	beqz	$at,.L_bn_add_words_tail
571
572.L_bn_add_words_loop:
573	$LD	$t0,0($a1)
574	$LD	$ta0,0($a2)
575	subu	$a3,4
576	$LD	$t1,$BNSZ($a1)
577	and	$at,$a3,$minus4
578	$LD	$t2,2*$BNSZ($a1)
579	$PTR_ADD $a2,4*$BNSZ
580	$LD	$t3,3*$BNSZ($a1)
581	$PTR_ADD $a0,4*$BNSZ
582	$LD	$ta1,-3*$BNSZ($a2)
583	$PTR_ADD $a1,4*$BNSZ
584	$LD	$ta2,-2*$BNSZ($a2)
585	$LD	$ta3,-$BNSZ($a2)
586	$ADDU	$ta0,$t0
587	sltu	$t8,$ta0,$t0
588	$ADDU	$t0,$ta0,$v0
589	sltu	$v0,$t0,$ta0
590	$ST	$t0,-4*$BNSZ($a0)
591	$ADDU	$v0,$t8
592
593	$ADDU	$ta1,$t1
594	sltu	$t9,$ta1,$t1
595	$ADDU	$t1,$ta1,$v0
596	sltu	$v0,$t1,$ta1
597	$ST	$t1,-3*$BNSZ($a0)
598	$ADDU	$v0,$t9
599
600	$ADDU	$ta2,$t2
601	sltu	$t8,$ta2,$t2
602	$ADDU	$t2,$ta2,$v0
603	sltu	$v0,$t2,$ta2
604	$ST	$t2,-2*$BNSZ($a0)
605	$ADDU	$v0,$t8
606
607	$ADDU	$ta3,$t3
608	sltu	$t9,$ta3,$t3
609	$ADDU	$t3,$ta3,$v0
610	sltu	$v0,$t3,$ta3
611	$ST	$t3,-$BNSZ($a0)
612
613	.set	noreorder
614	bgtz	$at,.L_bn_add_words_loop
615	$ADDU	$v0,$t9
616
617	beqz	$a3,.L_bn_add_words_return
618	nop
619
620.L_bn_add_words_tail:
621	.set	reorder
622	$LD	$t0,0($a1)
623	$LD	$ta0,0($a2)
624	$ADDU	$ta0,$t0
625	subu	$a3,1
626	sltu	$t8,$ta0,$t0
627	$ADDU	$t0,$ta0,$v0
628	sltu	$v0,$t0,$ta0
629	$ST	$t0,0($a0)
630	$ADDU	$v0,$t8
631	beqz	$a3,.L_bn_add_words_return
632
633	$LD	$t1,$BNSZ($a1)
634	$LD	$ta1,$BNSZ($a2)
635	$ADDU	$ta1,$t1
636	subu	$a3,1
637	sltu	$t9,$ta1,$t1
638	$ADDU	$t1,$ta1,$v0
639	sltu	$v0,$t1,$ta1
640	$ST	$t1,$BNSZ($a0)
641	$ADDU	$v0,$t9
642	beqz	$a3,.L_bn_add_words_return
643
644	$LD	$t2,2*$BNSZ($a1)
645	$LD	$ta2,2*$BNSZ($a2)
646	$ADDU	$ta2,$t2
647	sltu	$t8,$ta2,$t2
648	$ADDU	$t2,$ta2,$v0
649	sltu	$v0,$t2,$ta2
650	$ST	$t2,2*$BNSZ($a0)
651	$ADDU	$v0,$t8
652
653.L_bn_add_words_return:
654	.set	noreorder
655___
656$code.=<<___ if ($flavour =~ /nubi/i);
657	$REG_L	$t3,4*$SZREG($sp)
658	$REG_L	$t2,3*$SZREG($sp)
659	$REG_L	$t1,2*$SZREG($sp)
660	$REG_L	$t0,1*$SZREG($sp)
661	$REG_L	$gp,0*$SZREG($sp)
662	$PTR_ADD $sp,6*$SZREG
663___
664$code.=<<___;
665	jr	$ra
666	move	$a0,$v0
667
668.end	bn_add_words_internal
669
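	# bn_sub_words(rp,ap,bp,num): rp[] = ap[] - bp[]; the final borrow is
	# returned in $v0.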
670.align	5
671.globl	bn_sub_words
672.ent	bn_sub_words
673bn_sub_words:
674	.set	noreorder
675	bgtz	$a3,bn_sub_words_internal
676	move	$v0,$zero
677	jr	$ra
678	move	$a0,$zero
679.end	bn_sub_words
680
681.align	5
682.ent	bn_sub_words_internal
683bn_sub_words_internal:
684___
685$code.=<<___ if ($flavour =~ /nubi/i);
686	.frame	$sp,6*$SZREG,$ra
687	.mask	0x8000f008,-$SZREG
688	.set	noreorder
689	$PTR_SUB $sp,6*$SZREG
690	$REG_S	$ra,5*$SZREG($sp)
691	$REG_S	$t3,4*$SZREG($sp)
692	$REG_S	$t2,3*$SZREG($sp)
693	$REG_S	$t1,2*$SZREG($sp)
694	$REG_S	$t0,1*$SZREG($sp)
695	$REG_S	$gp,0*$SZREG($sp)
696___
697$code.=<<___;
698	.set	reorder
699	li	$minus4,-4
700	and	$at,$a3,$minus4
701	beqz	$at,.L_bn_sub_words_tail
702
703.L_bn_sub_words_loop:
704	$LD	$t0,0($a1)
705	$LD	$ta0,0($a2)
706	subu	$a3,4
707	$LD	$t1,$BNSZ($a1)
708	and	$at,$a3,$minus4
709	$LD	$t2,2*$BNSZ($a1)
710	$PTR_ADD $a2,4*$BNSZ
711	$LD	$t3,3*$BNSZ($a1)
712	$PTR_ADD $a0,4*$BNSZ
713	$LD	$ta1,-3*$BNSZ($a2)
714	$PTR_ADD $a1,4*$BNSZ
715	$LD	$ta2,-2*$BNSZ($a2)
716	$LD	$ta3,-$BNSZ($a2)
717	sltu	$t8,$t0,$ta0
718	$SUBU	$ta0,$t0,$ta0
719	$SUBU	$t0,$ta0,$v0
720	sgtu	$v0,$t0,$ta0
721	$ST	$t0,-4*$BNSZ($a0)
722	$ADDU	$v0,$t8
723
724	sltu	$t9,$t1,$ta1
725	$SUBU	$ta1,$t1,$ta1
726	$SUBU	$t1,$ta1,$v0
727	sgtu	$v0,$t1,$ta1
728	$ST	$t1,-3*$BNSZ($a0)
729	$ADDU	$v0,$t9
730
731
732	sltu	$t8,$t2,$ta2
733	$SUBU	$ta2,$t2,$ta2
734	$SUBU	$t2,$ta2,$v0
735	sgtu	$v0,$t2,$ta2
736	$ST	$t2,-2*$BNSZ($a0)
737	$ADDU	$v0,$t8
738
739	sltu	$t9,$t3,$ta3
740	$SUBU	$ta3,$t3,$ta3
741	$SUBU	$t3,$ta3,$v0
742	sgtu	$v0,$t3,$ta3
743	$ST	$t3,-$BNSZ($a0)
744
745	.set	noreorder
746	bgtz	$at,.L_bn_sub_words_loop
747	$ADDU	$v0,$t9
748
749	beqz	$a3,.L_bn_sub_words_return
750	nop
751
752.L_bn_sub_words_tail:
753	.set	reorder
754	$LD	$t0,0($a1)
755	$LD	$ta0,0($a2)
756	subu	$a3,1
757	sltu	$t8,$t0,$ta0
758	$SUBU	$ta0,$t0,$ta0
759	$SUBU	$t0,$ta0,$v0
760	sgtu	$v0,$t0,$ta0
761	$ST	$t0,0($a0)
762	$ADDU	$v0,$t8
763	beqz	$a3,.L_bn_sub_words_return
764
765	$LD	$t1,$BNSZ($a1)
766	subu	$a3,1
767	$LD	$ta1,$BNSZ($a2)
768	sltu	$t9,$t1,$ta1
769	$SUBU	$ta1,$t1,$ta1
770	$SUBU	$t1,$ta1,$v0
771	sgtu	$v0,$t1,$ta1
772	$ST	$t1,$BNSZ($a0)
773	$ADDU	$v0,$t9
774	beqz	$a3,.L_bn_sub_words_return
775
776	$LD	$t2,2*$BNSZ($a1)
777	$LD	$ta2,2*$BNSZ($a2)
778	sltu	$t8,$t2,$ta2
779	$SUBU	$ta2,$t2,$ta2
780	$SUBU	$t2,$ta2,$v0
781	sgtu	$v0,$t2,$ta2
782	$ST	$t2,2*$BNSZ($a0)
783	$ADDU	$v0,$t8
784
785.L_bn_sub_words_return:
786	.set	noreorder
787___
788$code.=<<___ if ($flavour =~ /nubi/i);
789	$REG_L	$t3,4*$SZREG($sp)
790	$REG_L	$t2,3*$SZREG($sp)
791	$REG_L	$t1,2*$SZREG($sp)
792	$REG_L	$t0,1*$SZREG($sp)
793	$REG_L	$gp,0*$SZREG($sp)
794	$PTR_ADD $sp,6*$SZREG
795___
796$code.=<<___;
797	jr	$ra
798	move	$a0,$v0
799.end	bn_sub_words_internal
800
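	# bn_div_3_words(wnump,d1,d0): quotient estimate for BN_div. Divides
	# the three most significant dividend words wnump[0],wnump[-1],
	# wnump[-2] by the two most significant divisor words (d0 being the
	# more significant); returns all-ones if wnump[0]==d0.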
801.align 5
802.globl	bn_div_3_words
803.ent	bn_div_3_words
804bn_div_3_words:
805	.set	noreorder
806	move	$a3,$a0		# we know that bn_div_words does not
807				# touch $a3, $ta2, $ta3 and preserves $a2
808				# so that we can save two arguments
809				# and return address in registers
810				# instead of stack:-)
811
812	$LD	$a0,($a3)
813	move	$ta2,$a1
814	bne	$a0,$a2,bn_div_3_words_internal
815	$LD	$a1,-$BNSZ($a3)
816	li	$v0,-1
817	jr	$ra
818	move	$a0,$v0
819.end	bn_div_3_words
820
821.align	5
822.ent	bn_div_3_words_internal
823bn_div_3_words_internal:
824___
825$code.=<<___ if ($flavour =~ /nubi/i);
826	.frame	$sp,6*$SZREG,$ra
827	.mask	0x8000f008,-$SZREG
828	.set	noreorder
829	$PTR_SUB $sp,6*$SZREG
830	$REG_S	$ra,5*$SZREG($sp)
831	$REG_S	$t3,4*$SZREG($sp)
832	$REG_S	$t2,3*$SZREG($sp)
833	$REG_S	$t1,2*$SZREG($sp)
834	$REG_S	$t0,1*$SZREG($sp)
835	$REG_S	$gp,0*$SZREG($sp)
836___
837$code.=<<___;
838	.set	reorder
839	move	$ta3,$ra
840	bal	bn_div_words_internal
841	move	$ra,$ta3
842	$MULTU	($ta2,$v0)
843	$LD	$t2,-2*$BNSZ($a3)
844	move	$ta0,$zero
845	mfhi	($t1,$ta2,$v0)
846	mflo	($t0,$ta2,$v0)
847	sltu	$t8,$t1,$a1
848.L_bn_div_3_words_inner_loop:
849	bnez	$t8,.L_bn_div_3_words_inner_loop_done
850	sgeu	$at,$t2,$t0
851	seq	$t9,$t1,$a1
852	and	$at,$t9
853	sltu	$t3,$t0,$ta2
854	$ADDU	$a1,$a2
855	$SUBU	$t1,$t3
856	$SUBU	$t0,$ta2
857	sltu	$t8,$t1,$a1
858	sltu	$ta0,$a1,$a2
859	or	$t8,$ta0
860	.set	noreorder
861	beqz	$at,.L_bn_div_3_words_inner_loop
862	$SUBU	$v0,1
863	$ADDU	$v0,1
864	.set	reorder
865.L_bn_div_3_words_inner_loop_done:
866	.set	noreorder
867___
868$code.=<<___ if ($flavour =~ /nubi/i);
869	$REG_L	$t3,4*$SZREG($sp)
870	$REG_L	$t2,3*$SZREG($sp)
871	$REG_L	$t1,2*$SZREG($sp)
872	$REG_L	$t0,1*$SZREG($sp)
873	$REG_L	$gp,0*$SZREG($sp)
874	$PTR_ADD $sp,6*$SZREG
875___
876$code.=<<___;
877	jr	$ra
878	move	$a0,$v0
879.end	bn_div_3_words_internal
880
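	# bn_div_words(h,l,d): returns the quotient of the double word h:l
	# divided by d, expecting h<d; the remainder ends up in $a1/$v1,
	# which is what bn_div_3_words above relies on.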
881.align	5
882.globl	bn_div_words
883.ent	bn_div_words
884bn_div_words:
885	.set	noreorder
886	bnez	$a2,bn_div_words_internal
887	li	$v0,-1		# I would rather signal div-by-zero
888				# which can be done with 'break 7'
889	jr	$ra
890	move	$a0,$v0
891.end	bn_div_words
892
893.align	5
894.ent	bn_div_words_internal
895bn_div_words_internal:
896___
897$code.=<<___ if ($flavour =~ /nubi/i);
898	.frame	$sp,6*$SZREG,$ra
899	.mask	0x8000f008,-$SZREG
900	.set	noreorder
901	$PTR_SUB $sp,6*$SZREG
902	$REG_S	$ra,5*$SZREG($sp)
903	$REG_S	$t3,4*$SZREG($sp)
904	$REG_S	$t2,3*$SZREG($sp)
905	$REG_S	$t1,2*$SZREG($sp)
906	$REG_S	$t0,1*$SZREG($sp)
907	$REG_S	$gp,0*$SZREG($sp)
908___
909$code.=<<___;
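	# Normalize the divisor: shift $a2 left one bit at a time until its
	# MSB is set, counting the shifts in $t9. The dividend h:l is shifted
	# by the same amount (overflow triggers 'break 6'), which leaves the
	# quotient unchanged; the remainder and $a2 are shifted back at the
	# end.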
910	move	$v1,$zero
911	bltz	$a2,.L_bn_div_words_body
912	move	$t9,$v1
913	$SLL	$a2,1
914	bgtz	$a2,.-4
915	addu	$t9,1
916
917	.set	reorder
918	negu	$t1,$t9
919	li	$t2,-1
920	$SLL	$t2,$t1
921	and	$t2,$a0
922	$SRL	$at,$a1,$t1
923	.set	noreorder
924	beqz	$t2,.+12
925	nop
926	break	6		# signal overflow
927	.set	reorder
928	$SLL	$a0,$t9
929	$SLL	$a1,$t9
930	or	$a0,$at
931___
932$QT=$ta0;
933$HH=$ta1;
934$DH=$v1;
935$code.=<<___;
936.L_bn_div_words_body:
937	$SRL	$DH,$a2,4*$BNSZ	# bits
938	sgeu	$at,$a0,$a2
939	.set	noreorder
940	beqz	$at,.+12
941	nop
942	$SUBU	$a0,$a2
943	.set	reorder
944
945	li	$QT,-1
946	$SRL	$HH,$a0,4*$BNSZ	# bits
947	$SRL	$QT,4*$BNSZ	# q=0xffffffff
948	beq	$DH,$HH,.L_bn_div_words_skip_div1
949	$DIVU	($a0,$DH)
950	mfqt	($QT,$a0,$DH)
951.L_bn_div_words_skip_div1:
952	$MULTU	($a2,$QT)
953	$SLL	$t3,$a0,4*$BNSZ	# bits
954	$SRL	$at,$a1,4*$BNSZ	# bits
955	or	$t3,$at
956	mflo	($t0,$a2,$QT)
957	mfhi	($t1,$a2,$QT)
958.L_bn_div_words_inner_loop1:
959	sltu	$t2,$t3,$t0
960	seq	$t8,$HH,$t1
961	sltu	$at,$HH,$t1
962	and	$t2,$t8
963	sltu	$v0,$t0,$a2
964	or	$at,$t2
965	.set	noreorder
966	beqz	$at,.L_bn_div_words_inner_loop1_done
967	$SUBU	$t1,$v0
968	$SUBU	$t0,$a2
969	b	.L_bn_div_words_inner_loop1
970	$SUBU	$QT,1
971	.set	reorder
972.L_bn_div_words_inner_loop1_done:
973
974	$SLL	$a1,4*$BNSZ	# bits
975	$SUBU	$a0,$t3,$t0
976	$SLL	$v0,$QT,4*$BNSZ	# bits
977
978	li	$QT,-1
979	$SRL	$HH,$a0,4*$BNSZ	# bits
980	$SRL	$QT,4*$BNSZ	# q=0xffffffff
981	beq	$DH,$HH,.L_bn_div_words_skip_div2
982	$DIVU	($a0,$DH)
983	mfqt	($QT,$a0,$DH)
984.L_bn_div_words_skip_div2:
985	$MULTU	($a2,$QT)
986	$SLL	$t3,$a0,4*$BNSZ	# bits
987	$SRL	$at,$a1,4*$BNSZ	# bits
988	or	$t3,$at
989	mflo	($t0,$a2,$QT)
990	mfhi	($t1,$a2,$QT)
991.L_bn_div_words_inner_loop2:
992	sltu	$t2,$t3,$t0
993	seq	$t8,$HH,$t1
994	sltu	$at,$HH,$t1
995	and	$t2,$t8
996	sltu	$v1,$t0,$a2
997	or	$at,$t2
998	.set	noreorder
999	beqz	$at,.L_bn_div_words_inner_loop2_done
1000	$SUBU	$t1,$v1
1001	$SUBU	$t0,$a2
1002	b	.L_bn_div_words_inner_loop2
1003	$SUBU	$QT,1
1004	.set	reorder
1005.L_bn_div_words_inner_loop2_done:
1006
1007	$SUBU	$a0,$t3,$t0
1008	or	$v0,$QT
1009	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
1010	$SRL	$a2,$t9		# restore $a2
1011
1012	.set	noreorder
1013	move	$a1,$v1
1014___
1015$code.=<<___ if ($flavour =~ /nubi/i);
1016	$REG_L	$t3,4*$SZREG($sp)
1017	$REG_L	$t2,3*$SZREG($sp)
1018	$REG_L	$t1,2*$SZREG($sp)
1019	$REG_L	$t0,1*$SZREG($sp)
1020	$REG_L	$gp,0*$SZREG($sp)
1021	$PTR_ADD $sp,6*$SZREG
1022___
1023$code.=<<___;
1024	jr	$ra
1025	move	$a0,$v0
1026.end	bn_div_words_internal
1027___
1028undef $HH; undef $QT; undef $DH;
1029
1030($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
1031($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
1032
1033($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
1034($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
1035
1036($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
1037
1038$code.=<<___;
1039
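	# bn_mul_comba8(r,a,b): 16-word product of two 8-word operands,
	# computed column by column (Comba multiplication). Each column is
	# accumulated into the rotating carry trio c1/c2/c3, and every
	# multiplication is started as early as possible to hide its latency
	# behind the carry bookkeeping.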
1040.align	5
1041.globl	bn_mul_comba8
1042.ent	bn_mul_comba8
1043bn_mul_comba8:
1044	.set	noreorder
1045___
1046$code.=<<___ if ($flavour =~ /nubi/i);
1047	.frame	$sp,12*$SZREG,$ra
1048	.mask	0x803ff008,-$SZREG
1049	$PTR_SUB $sp,12*$SZREG
1050	$REG_S	$ra,11*$SZREG($sp)
1051	$REG_S	$s5,10*$SZREG($sp)
1052	$REG_S	$s4,9*$SZREG($sp)
1053	$REG_S	$s3,8*$SZREG($sp)
1054	$REG_S	$s2,7*$SZREG($sp)
1055	$REG_S	$s1,6*$SZREG($sp)
1056	$REG_S	$s0,5*$SZREG($sp)
1057	$REG_S	$t3,4*$SZREG($sp)
1058	$REG_S	$t2,3*$SZREG($sp)
1059	$REG_S	$t1,2*$SZREG($sp)
1060	$REG_S	$t0,1*$SZREG($sp)
1061	$REG_S	$gp,0*$SZREG($sp)
1062___
1063$code.=<<___ if ($flavour !~ /nubi/i);
1064	.frame	$sp,6*$SZREG,$ra
1065	.mask	0x003f0000,-$SZREG
1066	$PTR_SUB $sp,6*$SZREG
1067	$REG_S	$s5,5*$SZREG($sp)
1068	$REG_S	$s4,4*$SZREG($sp)
1069	$REG_S	$s3,3*$SZREG($sp)
1070	$REG_S	$s2,2*$SZREG($sp)
1071	$REG_S	$s1,1*$SZREG($sp)
1072	$REG_S	$s0,0*$SZREG($sp)
1073___
1074$code.=<<___;
1075
1076	.set	reorder
1077	$LD	$a_0,0($a1)	# If compiled with the -mips3 option
1078				# on an R5000 box, the assembler barks
1079				# on this line with a "should not have
1080				# mult/div as last instruction in bb
1081				# (R10K bug)" warning. If anybody out
1082				# there has a clue about how to
1083				# circumvent this, do send me a note.
1084				#		<appro\@fy.chalmers.se>
1085
1086	$LD	$b_0,0($a2)
1087	$LD	$a_1,$BNSZ($a1)
1088	$LD	$a_2,2*$BNSZ($a1)
1089	$MULTU	($a_0,$b_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
1090	$LD	$a_3,3*$BNSZ($a1)
1091	$LD	$b_1,$BNSZ($a2)
1092	$LD	$b_2,2*$BNSZ($a2)
1093	$LD	$b_3,3*$BNSZ($a2)
1094	mflo	($c_1,$a_0,$b_0)
1095	mfhi	($c_2,$a_0,$b_0)
1096
1097	$LD	$a_4,4*$BNSZ($a1)
1098	$LD	$a_5,5*$BNSZ($a1)
1099	$MULTU	($a_0,$b_1)		# mul_add_c(a[0],b[1],c2,c3,c1);
1100	$LD	$a_6,6*$BNSZ($a1)
1101	$LD	$a_7,7*$BNSZ($a1)
1102	$LD	$b_4,4*$BNSZ($a2)
1103	$LD	$b_5,5*$BNSZ($a2)
1104	mflo	($t_1,$a_0,$b_1)
1105	mfhi	($t_2,$a_0,$b_1)
1106	$ADDU	$c_2,$t_1
1107	sltu	$at,$c_2,$t_1
1108	$MULTU	($a_1,$b_0)		# mul_add_c(a[1],b[0],c2,c3,c1);
1109	$ADDU	$c_3,$t_2,$at
1110	$LD	$b_6,6*$BNSZ($a2)
1111	$LD	$b_7,7*$BNSZ($a2)
1112	$ST	$c_1,0($a0)	# r[0]=c1;
1113	mflo	($t_1,$a_1,$b_0)
1114	mfhi	($t_2,$a_1,$b_0)
1115	$ADDU	$c_2,$t_1
1116	sltu	$at,$c_2,$t_1
1117	 $MULTU	($a_2,$b_0)		# mul_add_c(a[2],b[0],c3,c1,c2);
1118	$ADDU	$t_2,$at
1119	$ADDU	$c_3,$t_2
1120	sltu	$c_1,$c_3,$t_2
1121	$ST	$c_2,$BNSZ($a0)	# r[1]=c2;
1122
1123	mflo	($t_1,$a_2,$b_0)
1124	mfhi	($t_2,$a_2,$b_0)
1125	$ADDU	$c_3,$t_1
1126	sltu	$at,$c_3,$t_1
1127	$MULTU	($a_1,$b_1)		# mul_add_c(a[1],b[1],c3,c1,c2);
1128	$ADDU	$t_2,$at
1129	$ADDU	$c_1,$t_2
1130	mflo	($t_1,$a_1,$b_1)
1131	mfhi	($t_2,$a_1,$b_1)
1132	$ADDU	$c_3,$t_1
1133	sltu	$at,$c_3,$t_1
1134	$MULTU	($a_0,$b_2)		# mul_add_c(a[0],b[2],c3,c1,c2);
1135	$ADDU	$t_2,$at
1136	$ADDU	$c_1,$t_2
1137	sltu	$c_2,$c_1,$t_2
1138	mflo	($t_1,$a_0,$b_2)
1139	mfhi	($t_2,$a_0,$b_2)
1140	$ADDU	$c_3,$t_1
1141	sltu	$at,$c_3,$t_1
1142	 $MULTU	($a_0,$b_3)		# mul_add_c(a[0],b[3],c1,c2,c3);
1143	$ADDU	$t_2,$at
1144	$ADDU	$c_1,$t_2
1145	sltu	$at,$c_1,$t_2
1146	$ADDU	$c_2,$at
1147	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;
1148
1149	mflo	($t_1,$a_0,$b_3)
1150	mfhi	($t_2,$a_0,$b_3)
1151	$ADDU	$c_1,$t_1
1152	sltu	$at,$c_1,$t_1
1153	$MULTU	($a_1,$b_2)		# mul_add_c(a[1],b[2],c1,c2,c3);
1154	$ADDU	$t_2,$at
1155	$ADDU	$c_2,$t_2
1156	sltu	$c_3,$c_2,$t_2
1157	mflo	($t_1,$a_1,$b_2)
1158	mfhi	($t_2,$a_1,$b_2)
1159	$ADDU	$c_1,$t_1
1160	sltu	$at,$c_1,$t_1
1161	$MULTU	($a_2,$b_1)		# mul_add_c(a[2],b[1],c1,c2,c3);
1162	$ADDU	$t_2,$at
1163	$ADDU	$c_2,$t_2
1164	sltu	$at,$c_2,$t_2
1165	$ADDU	$c_3,$at
1166	mflo	($t_1,$a_2,$b_1)
1167	mfhi	($t_2,$a_2,$b_1)
1168	$ADDU	$c_1,$t_1
1169	sltu	$at,$c_1,$t_1
1170	$MULTU	($a_3,$b_0)		# mul_add_c(a[3],b[0],c1,c2,c3);
1171	$ADDU	$t_2,$at
1172	$ADDU	$c_2,$t_2
1173	sltu	$at,$c_2,$t_2
1174	$ADDU	$c_3,$at
1175	mflo	($t_1,$a_3,$b_0)
1176	mfhi	($t_2,$a_3,$b_0)
1177	$ADDU	$c_1,$t_1
1178	sltu	$at,$c_1,$t_1
1179	 $MULTU	($a_4,$b_0)		# mul_add_c(a[4],b[0],c2,c3,c1);
1180	$ADDU	$t_2,$at
1181	$ADDU	$c_2,$t_2
1182	sltu	$at,$c_2,$t_2
1183	$ADDU	$c_3,$at
1184	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;
1185
1186	mflo	($t_1,$a_4,$b_0)
1187	mfhi	($t_2,$a_4,$b_0)
1188	$ADDU	$c_2,$t_1
1189	sltu	$at,$c_2,$t_1
1190	$MULTU	($a_3,$b_1)		# mul_add_c(a[3],b[1],c2,c3,c1);
1191	$ADDU	$t_2,$at
1192	$ADDU	$c_3,$t_2
1193	sltu	$c_1,$c_3,$t_2
1194	mflo	($t_1,$a_3,$b_1)
1195	mfhi	($t_2,$a_3,$b_1)
1196	$ADDU	$c_2,$t_1
1197	sltu	$at,$c_2,$t_1
1198	$MULTU	($a_2,$b_2)		# mul_add_c(a[2],b[2],c2,c3,c1);
1199	$ADDU	$t_2,$at
1200	$ADDU	$c_3,$t_2
1201	sltu	$at,$c_3,$t_2
1202	$ADDU	$c_1,$at
1203	mflo	($t_1,$a_2,$b_2)
1204	mfhi	($t_2,$a_2,$b_2)
1205	$ADDU	$c_2,$t_1
1206	sltu	$at,$c_2,$t_1
1207	$MULTU	($a_1,$b_3)		# mul_add_c(a[1],b[3],c2,c3,c1);
1208	$ADDU	$t_2,$at
1209	$ADDU	$c_3,$t_2
1210	sltu	$at,$c_3,$t_2
1211	$ADDU	$c_1,$at
1212	mflo	($t_1,$a_1,$b_3)
1213	mfhi	($t_2,$a_1,$b_3)
1214	$ADDU	$c_2,$t_1
1215	sltu	$at,$c_2,$t_1
1216	$MULTU	($a_0,$b_4)		# mul_add_c(a[0],b[4],c2,c3,c1);
1217	$ADDU	$t_2,$at
1218	$ADDU	$c_3,$t_2
1219	sltu	$at,$c_3,$t_2
1220	$ADDU	$c_1,$at
1221	mflo	($t_1,$a_0,$b_4)
1222	mfhi	($t_2,$a_0,$b_4)
1223	$ADDU	$c_2,$t_1
1224	sltu	$at,$c_2,$t_1
1225	 $MULTU	($a_0,$b_5)		# mul_add_c(a[0],b[5],c3,c1,c2);
1226	$ADDU	$t_2,$at
1227	$ADDU	$c_3,$t_2
1228	sltu	$at,$c_3,$t_2
1229	$ADDU	$c_1,$at
1230	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;
1231
1232	mflo	($t_1,$a_0,$b_5)
1233	mfhi	($t_2,$a_0,$b_5)
1234	$ADDU	$c_3,$t_1
1235	sltu	$at,$c_3,$t_1
1236	$MULTU	($a_1,$b_4)		# mul_add_c(a[1],b[4],c3,c1,c2);
1237	$ADDU	$t_2,$at
1238	$ADDU	$c_1,$t_2
1239	sltu	$c_2,$c_1,$t_2
1240	mflo	($t_1,$a_1,$b_4)
1241	mfhi	($t_2,$a_1,$b_4)
1242	$ADDU	$c_3,$t_1
1243	sltu	$at,$c_3,$t_1
1244	$MULTU	($a_2,$b_3)		# mul_add_c(a[2],b[3],c3,c1,c2);
1245	$ADDU	$t_2,$at
1246	$ADDU	$c_1,$t_2
1247	sltu	$at,$c_1,$t_2
1248	$ADDU	$c_2,$at
1249	mflo	($t_1,$a_2,$b_3)
1250	mfhi	($t_2,$a_2,$b_3)
1251	$ADDU	$c_3,$t_1
1252	sltu	$at,$c_3,$t_1
1253	$MULTU	($a_3,$b_2)		# mul_add_c(a[3],b[2],c3,c1,c2);
1254	$ADDU	$t_2,$at
1255	$ADDU	$c_1,$t_2
1256	sltu	$at,$c_1,$t_2
1257	$ADDU	$c_2,$at
1258	mflo	($t_1,$a_3,$b_2)
1259	mfhi	($t_2,$a_3,$b_2)
1260	$ADDU	$c_3,$t_1
1261	sltu	$at,$c_3,$t_1
1262	$MULTU	($a_4,$b_1)		# mul_add_c(a[4],b[1],c3,c1,c2);
1263	$ADDU	$t_2,$at
1264	$ADDU	$c_1,$t_2
1265	sltu	$at,$c_1,$t_2
1266	$ADDU	$c_2,$at
1267	mflo	($t_1,$a_4,$b_1)
1268	mfhi	($t_2,$a_4,$b_1)
1269	$ADDU	$c_3,$t_1
1270	sltu	$at,$c_3,$t_1
1271	$MULTU	($a_5,$b_0)		# mul_add_c(a[5],b[0],c3,c1,c2);
1272	$ADDU	$t_2,$at
1273	$ADDU	$c_1,$t_2
1274	sltu	$at,$c_1,$t_2
1275	$ADDU	$c_2,$at
1276	mflo	($t_1,$a_5,$b_0)
1277	mfhi	($t_2,$a_5,$b_0)
1278	$ADDU	$c_3,$t_1
1279	sltu	$at,$c_3,$t_1
1280	 $MULTU	($a_6,$b_0)		# mul_add_c(a[6],b[0],c1,c2,c3);
1281	$ADDU	$t_2,$at
1282	$ADDU	$c_1,$t_2
1283	sltu	$at,$c_1,$t_2
1284	$ADDU	$c_2,$at
1285	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;
1286
1287	mflo	($t_1,$a_6,$b_0)
1288	mfhi	($t_2,$a_6,$b_0)
1289	$ADDU	$c_1,$t_1
1290	sltu	$at,$c_1,$t_1
1291	$MULTU	($a_5,$b_1)		# mul_add_c(a[5],b[1],c1,c2,c3);
1292	$ADDU	$t_2,$at
1293	$ADDU	$c_2,$t_2
1294	sltu	$c_3,$c_2,$t_2
1295	mflo	($t_1,$a_5,$b_1)
1296	mfhi	($t_2,$a_5,$b_1)
1297	$ADDU	$c_1,$t_1
1298	sltu	$at,$c_1,$t_1
1299	$MULTU	($a_4,$b_2)		# mul_add_c(a[4],b[2],c1,c2,c3);
1300	$ADDU	$t_2,$at
1301	$ADDU	$c_2,$t_2
1302	sltu	$at,$c_2,$t_2
1303	$ADDU	$c_3,$at
1304	mflo	($t_1,$a_4,$b_2)
1305	mfhi	($t_2,$a_4,$b_2)
1306	$ADDU	$c_1,$t_1
1307	sltu	$at,$c_1,$t_1
1308	$MULTU	($a_3,$b_3)		# mul_add_c(a[3],b[3],c1,c2,c3);
1309	$ADDU	$t_2,$at
1310	$ADDU	$c_2,$t_2
1311	sltu	$at,$c_2,$t_2
1312	$ADDU	$c_3,$at
1313	mflo	($t_1,$a_3,$b_3)
1314	mfhi	($t_2,$a_3,$b_3)
1315	$ADDU	$c_1,$t_1
1316	sltu	$at,$c_1,$t_1
1317	$MULTU	($a_2,$b_4)		# mul_add_c(a[2],b[4],c1,c2,c3);
1318	$ADDU	$t_2,$at
1319	$ADDU	$c_2,$t_2
1320	sltu	$at,$c_2,$t_2
1321	$ADDU	$c_3,$at
1322	mflo	($t_1,$a_2,$b_4)
1323	mfhi	($t_2,$a_2,$b_4)
1324	$ADDU	$c_1,$t_1
1325	sltu	$at,$c_1,$t_1
1326	$MULTU	($a_1,$b_5)		# mul_add_c(a[1],b[5],c1,c2,c3);
1327	$ADDU	$t_2,$at
1328	$ADDU	$c_2,$t_2
1329	sltu	$at,$c_2,$t_2
1330	$ADDU	$c_3,$at
1331	mflo	($t_1,$a_1,$b_5)
1332	mfhi	($t_2,$a_1,$b_5)
1333	$ADDU	$c_1,$t_1
1334	sltu	$at,$c_1,$t_1
1335	$MULTU	($a_0,$b_6)		# mul_add_c(a[0],b[6],c1,c2,c3);
1336	$ADDU	$t_2,$at
1337	$ADDU	$c_2,$t_2
1338	sltu	$at,$c_2,$t_2
1339	$ADDU	$c_3,$at
1340	mflo	($t_1,$a_0,$b_6)
1341	mfhi	($t_2,$a_0,$b_6)
1342	$ADDU	$c_1,$t_1
1343	sltu	$at,$c_1,$t_1
1344	 $MULTU	($a_0,$b_7)		# mul_add_c(a[0],b[7],c2,c3,c1);
1345	$ADDU	$t_2,$at
1346	$ADDU	$c_2,$t_2
1347	sltu	$at,$c_2,$t_2
1348	$ADDU	$c_3,$at
1349	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;
1350
1351	mflo	($t_1,$a_0,$b_7)
1352	mfhi	($t_2,$a_0,$b_7)
1353	$ADDU	$c_2,$t_1
1354	sltu	$at,$c_2,$t_1
1355	$MULTU	($a_1,$b_6)		# mul_add_c(a[1],b[6],c2,c3,c1);
1356	$ADDU	$t_2,$at
1357	$ADDU	$c_3,$t_2
1358	sltu	$c_1,$c_3,$t_2
1359	mflo	($t_1,$a_1,$b_6)
1360	mfhi	($t_2,$a_1,$b_6)
1361	$ADDU	$c_2,$t_1
1362	sltu	$at,$c_2,$t_1
1363	$MULTU	($a_2,$b_5)		# mul_add_c(a[2],b[5],c2,c3,c1);
1364	$ADDU	$t_2,$at
1365	$ADDU	$c_3,$t_2
1366	sltu	$at,$c_3,$t_2
1367	$ADDU	$c_1,$at
1368	mflo	($t_1,$a_2,$b_5)
1369	mfhi	($t_2,$a_2,$b_5)
1370	$ADDU	$c_2,$t_1
1371	sltu	$at,$c_2,$t_1
1372	$MULTU	($a_3,$b_4)		# mul_add_c(a[3],b[4],c2,c3,c1);
1373	$ADDU	$t_2,$at
1374	$ADDU	$c_3,$t_2
1375	sltu	$at,$c_3,$t_2
1376	$ADDU	$c_1,$at
1377	mflo	($t_1,$a_3,$b_4)
1378	mfhi	($t_2,$a_3,$b_4)
1379	$ADDU	$c_2,$t_1
1380	sltu	$at,$c_2,$t_1
1381	$MULTU	($a_4,$b_3)		# mul_add_c(a[4],b[3],c2,c3,c1);
1382	$ADDU	$t_2,$at
1383	$ADDU	$c_3,$t_2
1384	sltu	$at,$c_3,$t_2
1385	$ADDU	$c_1,$at
1386	mflo	($t_1,$a_4,$b_3)
1387	mfhi	($t_2,$a_4,$b_3)
1388	$ADDU	$c_2,$t_1
1389	sltu	$at,$c_2,$t_1
1390	$MULTU	($a_5,$b_2)		# mul_add_c(a[5],b[2],c2,c3,c1);
1391	$ADDU	$t_2,$at
1392	$ADDU	$c_3,$t_2
1393	sltu	$at,$c_3,$t_2
1394	$ADDU	$c_1,$at
1395	mflo	($t_1,$a_5,$b_2)
1396	mfhi	($t_2,$a_5,$b_2)
1397	$ADDU	$c_2,$t_1
1398	sltu	$at,$c_2,$t_1
1399	$MULTU	($a_6,$b_1)		# mul_add_c(a[6],b[1],c2,c3,c1);
1400	$ADDU	$t_2,$at
1401	$ADDU	$c_3,$t_2
1402	sltu	$at,$c_3,$t_2
1403	$ADDU	$c_1,$at
1404	mflo	($t_1,$a_6,$b_1)
1405	mfhi	($t_2,$a_6,$b_1)
1406	$ADDU	$c_2,$t_1
1407	sltu	$at,$c_2,$t_1
1408	$MULTU	($a_7,$b_0)		# mul_add_c(a[7],b[0],c2,c3,c1);
1409	$ADDU	$t_2,$at
1410	$ADDU	$c_3,$t_2
1411	sltu	$at,$c_3,$t_2
1412	$ADDU	$c_1,$at
1413	mflo	($t_1,$a_7,$b_0)
1414	mfhi	($t_2,$a_7,$b_0)
1415	$ADDU	$c_2,$t_1
1416	sltu	$at,$c_2,$t_1
1417	 $MULTU	($a_7,$b_1)		# mul_add_c(a[7],b[1],c3,c1,c2);
1418	$ADDU	$t_2,$at
1419	$ADDU	$c_3,$t_2
1420	sltu	$at,$c_3,$t_2
1421	$ADDU	$c_1,$at
1422	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;
1423
1424	mflo	($t_1,$a_7,$b_1)
1425	mfhi	($t_2,$a_7,$b_1)
1426	$ADDU	$c_3,$t_1
1427	sltu	$at,$c_3,$t_1
1428	$MULTU	($a_6,$b_2)		# mul_add_c(a[6],b[2],c3,c1,c2);
1429	$ADDU	$t_2,$at
1430	$ADDU	$c_1,$t_2
1431	sltu	$c_2,$c_1,$t_2
1432	mflo	($t_1,$a_6,$b_2)
1433	mfhi	($t_2,$a_6,$b_2)
1434	$ADDU	$c_3,$t_1
1435	sltu	$at,$c_3,$t_1
1436	$MULTU	($a_5,$b_3)		# mul_add_c(a[5],b[3],c3,c1,c2);
1437	$ADDU	$t_2,$at
1438	$ADDU	$c_1,$t_2
1439	sltu	$at,$c_1,$t_2
1440	$ADDU	$c_2,$at
1441	mflo	($t_1,$a_5,$b_3)
1442	mfhi	($t_2,$a_5,$b_3)
1443	$ADDU	$c_3,$t_1
1444	sltu	$at,$c_3,$t_1
1445	$MULTU	($a_4,$b_4)		# mul_add_c(a[4],b[4],c3,c1,c2);
1446	$ADDU	$t_2,$at
1447	$ADDU	$c_1,$t_2
1448	sltu	$at,$c_1,$t_2
1449	$ADDU	$c_2,$at
1450	mflo	($t_1,$a_4,$b_4)
1451	mfhi	($t_2,$a_4,$b_4)
1452	$ADDU	$c_3,$t_1
1453	sltu	$at,$c_3,$t_1
1454	$MULTU	($a_3,$b_5)		# mul_add_c(a[3],b[5],c3,c1,c2);
1455	$ADDU	$t_2,$at
1456	$ADDU	$c_1,$t_2
1457	sltu	$at,$c_1,$t_2
1458	$ADDU	$c_2,$at
1459	mflo	($t_1,$a_3,$b_5)
1460	mfhi	($t_2,$a_3,$b_5)
1461	$ADDU	$c_3,$t_1
1462	sltu	$at,$c_3,$t_1
1463	$MULTU	($a_2,$b_6)		# mul_add_c(a[2],b[6],c3,c1,c2);
1464	$ADDU	$t_2,$at
1465	$ADDU	$c_1,$t_2
1466	sltu	$at,$c_1,$t_2
1467	$ADDU	$c_2,$at
1468	mflo	($t_1,$a_2,$b_6)
1469	mfhi	($t_2,$a_2,$b_6)
1470	$ADDU	$c_3,$t_1
1471	sltu	$at,$c_3,$t_1
1472	$MULTU	($a_1,$b_7)		# mul_add_c(a[1],b[7],c3,c1,c2);
1473	$ADDU	$t_2,$at
1474	$ADDU	$c_1,$t_2
1475	sltu	$at,$c_1,$t_2
1476	$ADDU	$c_2,$at
1477	mflo	($t_1,$a_1,$b_7)
1478	mfhi	($t_2,$a_1,$b_7)
1479	$ADDU	$c_3,$t_1
1480	sltu	$at,$c_3,$t_1
1481	 $MULTU	($a_2,$b_7)		# mul_add_c(a[2],b[7],c1,c2,c3);
1482	$ADDU	$t_2,$at
1483	$ADDU	$c_1,$t_2
1484	sltu	$at,$c_1,$t_2
1485	$ADDU	$c_2,$at
1486	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;
1487
1488	mflo	($t_1,$a_2,$b_7)
1489	mfhi	($t_2,$a_2,$b_7)
1490	$ADDU	$c_1,$t_1
1491	sltu	$at,$c_1,$t_1
1492	$MULTU	($a_3,$b_6)		# mul_add_c(a[3],b[6],c1,c2,c3);
1493	$ADDU	$t_2,$at
1494	$ADDU	$c_2,$t_2
1495	sltu	$c_3,$c_2,$t_2
1496	mflo	($t_1,$a_3,$b_6)
1497	mfhi	($t_2,$a_3,$b_6)
1498	$ADDU	$c_1,$t_1
1499	sltu	$at,$c_1,$t_1
1500	$MULTU	($a_4,$b_5)		# mul_add_c(a[4],b[5],c1,c2,c3);
1501	$ADDU	$t_2,$at
1502	$ADDU	$c_2,$t_2
1503	sltu	$at,$c_2,$t_2
1504	$ADDU	$c_3,$at
1505	mflo	($t_1,$a_4,$b_5)
1506	mfhi	($t_2,$a_4,$b_5)
1507	$ADDU	$c_1,$t_1
1508	sltu	$at,$c_1,$t_1
1509	$MULTU	($a_5,$b_4)		# mul_add_c(a[5],b[4],c1,c2,c3);
1510	$ADDU	$t_2,$at
1511	$ADDU	$c_2,$t_2
1512	sltu	$at,$c_2,$t_2
1513	$ADDU	$c_3,$at
1514	mflo	($t_1,$a_5,$b_4)
1515	mfhi	($t_2,$a_5,$b_4)
1516	$ADDU	$c_1,$t_1
1517	sltu	$at,$c_1,$t_1
1518	$MULTU	($a_6,$b_3)		# mul_add_c(a[6],b[3],c1,c2,c3);
1519	$ADDU	$t_2,$at
1520	$ADDU	$c_2,$t_2
1521	sltu	$at,$c_2,$t_2
1522	$ADDU	$c_3,$at
1523	mflo	($t_1,$a_6,$b_3)
1524	mfhi	($t_2,$a_6,$b_3)
1525	$ADDU	$c_1,$t_1
1526	sltu	$at,$c_1,$t_1
1527	$MULTU	($a_7,$b_2)		# mul_add_c(a[7],b[2],c1,c2,c3);
1528	$ADDU	$t_2,$at
1529	$ADDU	$c_2,$t_2
1530	sltu	$at,$c_2,$t_2
1531	$ADDU	$c_3,$at
1532	mflo	($t_1,$a_7,$b_2)
1533	mfhi	($t_2,$a_7,$b_2)
1534	$ADDU	$c_1,$t_1
1535	sltu	$at,$c_1,$t_1
1536	 $MULTU	($a_7,$b_3)		# mul_add_c(a[7],b[3],c2,c3,c1);
1537	$ADDU	$t_2,$at
1538	$ADDU	$c_2,$t_2
1539	sltu	$at,$c_2,$t_2
1540	$ADDU	$c_3,$at
1541	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;
1542
1543	mflo	($t_1,$a_7,$b_3)
1544	mfhi	($t_2,$a_7,$b_3)
1545	$ADDU	$c_2,$t_1
1546	sltu	$at,$c_2,$t_1
1547	$MULTU	($a_6,$b_4)		# mul_add_c(a[6],b[4],c2,c3,c1);
1548	$ADDU	$t_2,$at
1549	$ADDU	$c_3,$t_2
1550	sltu	$c_1,$c_3,$t_2
1551	mflo	($t_1,$a_6,$b_4)
1552	mfhi	($t_2,$a_6,$b_4)
1553	$ADDU	$c_2,$t_1
1554	sltu	$at,$c_2,$t_1
1555	$MULTU	($a_5,$b_5)		# mul_add_c(a[5],b[5],c2,c3,c1);
1556	$ADDU	$t_2,$at
1557	$ADDU	$c_3,$t_2
1558	sltu	$at,$c_3,$t_2
1559	$ADDU	$c_1,$at
1560	mflo	($t_1,$a_5,$b_5)
1561	mfhi	($t_2,$a_5,$b_5)
1562	$ADDU	$c_2,$t_1
1563	sltu	$at,$c_2,$t_1
1564	$MULTU	($a_4,$b_6)		# mul_add_c(a[4],b[6],c2,c3,c1);
1565	$ADDU	$t_2,$at
1566	$ADDU	$c_3,$t_2
1567	sltu	$at,$c_3,$t_2
1568	$ADDU	$c_1,$at
1569	mflo	($t_1,$a_4,$b_6)
1570	mfhi	($t_2,$a_4,$b_6)
1571	$ADDU	$c_2,$t_1
1572	sltu	$at,$c_2,$t_1
1573	$MULTU	($a_3,$b_7)		# mul_add_c(a[3],b[7],c2,c3,c1);
1574	$ADDU	$t_2,$at
1575	$ADDU	$c_3,$t_2
1576	sltu	$at,$c_3,$t_2
1577	$ADDU	$c_1,$at
1578	mflo	($t_1,$a_3,$b_7)
1579	mfhi	($t_2,$a_3,$b_7)
1580	$ADDU	$c_2,$t_1
1581	sltu	$at,$c_2,$t_1
1582	$MULTU	($a_4,$b_7)		# mul_add_c(a[4],b[7],c3,c1,c2);
1583	$ADDU	$t_2,$at
1584	$ADDU	$c_3,$t_2
1585	sltu	$at,$c_3,$t_2
1586	$ADDU	$c_1,$at
1587	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;
1588
1589	mflo	($t_1,$a_4,$b_7)
1590	mfhi	($t_2,$a_4,$b_7)
1591	$ADDU	$c_3,$t_1
1592	sltu	$at,$c_3,$t_1
1593	$MULTU	($a_5,$b_6)		# mul_add_c(a[5],b[6],c3,c1,c2);
1594	$ADDU	$t_2,$at
1595	$ADDU	$c_1,$t_2
1596	sltu	$c_2,$c_1,$t_2
1597	mflo	($t_1,$a_5,$b_6)
1598	mfhi	($t_2,$a_5,$b_6)
1599	$ADDU	$c_3,$t_1
1600	sltu	$at,$c_3,$t_1
1601	$MULTU	($a_6,$b_5)		# mul_add_c(a[6],b[5],c3,c1,c2);
1602	$ADDU	$t_2,$at
1603	$ADDU	$c_1,$t_2
1604	sltu	$at,$c_1,$t_2
1605	$ADDU	$c_2,$at
1606	mflo	($t_1,$a_6,$b_5)
1607	mfhi	($t_2,$a_6,$b_5)
1608	$ADDU	$c_3,$t_1
1609	sltu	$at,$c_3,$t_1
1610	$MULTU	($a_7,$b_4)		# mul_add_c(a[7],b[4],c3,c1,c2);
1611	$ADDU	$t_2,$at
1612	$ADDU	$c_1,$t_2
1613	sltu	$at,$c_1,$t_2
1614	$ADDU	$c_2,$at
1615	mflo	($t_1,$a_7,$b_4)
1616	mfhi	($t_2,$a_7,$b_4)
1617	$ADDU	$c_3,$t_1
1618	sltu	$at,$c_3,$t_1
1619	 $MULTU	($a_7,$b_5)		# mul_add_c(a[7],b[5],c1,c2,c3);
1620	$ADDU	$t_2,$at
1621	$ADDU	$c_1,$t_2
1622	sltu	$at,$c_1,$t_2
1623	$ADDU	$c_2,$at
1624	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;
1625
1626	mflo	($t_1,$a_7,$b_5)
1627	mfhi	($t_2,$a_7,$b_5)
1628	$ADDU	$c_1,$t_1
1629	sltu	$at,$c_1,$t_1
1630	$MULTU	($a_6,$b_6)		# mul_add_c(a[6],b[6],c1,c2,c3);
1631	$ADDU	$t_2,$at
1632	$ADDU	$c_2,$t_2
1633	sltu	$c_3,$c_2,$t_2
1634	mflo	($t_1,$a_6,$b_6)
1635	mfhi	($t_2,$a_6,$b_6)
1636	$ADDU	$c_1,$t_1
1637	sltu	$at,$c_1,$t_1
1638	$MULTU	($a_5,$b_7)		# mul_add_c(a[5],b[7],c1,c2,c3);
1639	$ADDU	$t_2,$at
1640	$ADDU	$c_2,$t_2
1641	sltu	$at,$c_2,$t_2
1642	$ADDU	$c_3,$at
1643	mflo	($t_1,$a_5,$b_7)
1644	mfhi	($t_2,$a_5,$b_7)
1645	$ADDU	$c_1,$t_1
1646	sltu	$at,$c_1,$t_1
1647	 $MULTU	($a_6,$b_7)		# mul_add_c(a[6],b[7],c2,c3,c1);
1648	$ADDU	$t_2,$at
1649	$ADDU	$c_2,$t_2
1650	sltu	$at,$c_2,$t_2
1651	$ADDU	$c_3,$at
1652	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;
1653
1654	mflo	($t_1,$a_6,$b_7)
1655	mfhi	($t_2,$a_6,$b_7)
1656	$ADDU	$c_2,$t_1
1657	sltu	$at,$c_2,$t_1
1658	$MULTU	($a_7,$b_6)		# mul_add_c(a[7],b[6],c2,c3,c1);
1659	$ADDU	$t_2,$at
1660	$ADDU	$c_3,$t_2
1661	sltu	$c_1,$c_3,$t_2
1662	mflo	($t_1,$a_7,$b_6)
1663	mfhi	($t_2,$a_7,$b_6)
1664	$ADDU	$c_2,$t_1
1665	sltu	$at,$c_2,$t_1
1666	$MULTU	($a_7,$b_7)		# mul_add_c(a[7],b[7],c3,c1,c2);
1667	$ADDU	$t_2,$at
1668	$ADDU	$c_3,$t_2
1669	sltu	$at,$c_3,$t_2
1670	$ADDU	$c_1,$at
1671	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;
1672
1673	mflo	($t_1,$a_7,$b_7)
1674	mfhi	($t_2,$a_7,$b_7)
1675	$ADDU	$c_3,$t_1
1676	sltu	$at,$c_3,$t_1
1677	$ADDU	$t_2,$at
1678	$ADDU	$c_1,$t_2
1679	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
1680	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;
1681
1682	.set	noreorder
1683___
1684$code.=<<___ if ($flavour =~ /nubi/i);
1685	$REG_L	$s5,10*$SZREG($sp)
1686	$REG_L	$s4,9*$SZREG($sp)
1687	$REG_L	$s3,8*$SZREG($sp)
1688	$REG_L	$s2,7*$SZREG($sp)
1689	$REG_L	$s1,6*$SZREG($sp)
1690	$REG_L	$s0,5*$SZREG($sp)
1691	$REG_L	$t3,4*$SZREG($sp)
1692	$REG_L	$t2,3*$SZREG($sp)
1693	$REG_L	$t1,2*$SZREG($sp)
1694	$REG_L	$t0,1*$SZREG($sp)
1695	$REG_L	$gp,0*$SZREG($sp)
1696	jr	$ra
1697	$PTR_ADD $sp,12*$SZREG
1698___
1699$code.=<<___ if ($flavour !~ /nubi/i);
1700	$REG_L	$s5,5*$SZREG($sp)
1701	$REG_L	$s4,4*$SZREG($sp)
1702	$REG_L	$s3,3*$SZREG($sp)
1703	$REG_L	$s2,2*$SZREG($sp)
1704	$REG_L	$s1,1*$SZREG($sp)
1705	$REG_L	$s0,0*$SZREG($sp)
1706	jr	$ra
1707	$PTR_ADD $sp,6*$SZREG
1708___
1709$code.=<<___;
1710.end	bn_mul_comba8
1711
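	# bn_mul_comba4(r,a,b): 4x4-word variant of the Comba multiplication
	# above, producing an 8-word result.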
1712.align	5
1713.globl	bn_mul_comba4
1714.ent	bn_mul_comba4
1715bn_mul_comba4:
1716___
1717$code.=<<___ if ($flavour =~ /nubi/i);
1718	.frame	$sp,6*$SZREG,$ra
1719	.mask	0x8000f008,-$SZREG
1720	.set	noreorder
1721	$PTR_SUB $sp,6*$SZREG
1722	$REG_S	$ra,5*$SZREG($sp)
1723	$REG_S	$t3,4*$SZREG($sp)
1724	$REG_S	$t2,3*$SZREG($sp)
1725	$REG_S	$t1,2*$SZREG($sp)
1726	$REG_S	$t0,1*$SZREG($sp)
1727	$REG_S	$gp,0*$SZREG($sp)
1728___
1729$code.=<<___;
1730	.set	reorder
1731	$LD	$a_0,0($a1)
1732	$LD	$b_0,0($a2)
1733	$LD	$a_1,$BNSZ($a1)
1734	$LD	$a_2,2*$BNSZ($a1)
1735	$MULTU	($a_0,$b_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
1736	$LD	$a_3,3*$BNSZ($a1)
1737	$LD	$b_1,$BNSZ($a2)
1738	$LD	$b_2,2*$BNSZ($a2)
1739	$LD	$b_3,3*$BNSZ($a2)
1740	mflo	($c_1,$a_0,$b_0)
1741	mfhi	($c_2,$a_0,$b_0)
1742	$ST	$c_1,0($a0)
1743
1744	$MULTU	($a_0,$b_1)		# mul_add_c(a[0],b[1],c2,c3,c1);
1745	mflo	($t_1,$a_0,$b_1)
1746	mfhi	($t_2,$a_0,$b_1)
1747	$ADDU	$c_2,$t_1
1748	sltu	$at,$c_2,$t_1
1749	$MULTU	($a_1,$b_0)		# mul_add_c(a[1],b[0],c2,c3,c1);
1750	$ADDU	$c_3,$t_2,$at
1751	mflo	($t_1,$a_1,$b_0)
1752	mfhi	($t_2,$a_1,$b_0)
1753	$ADDU	$c_2,$t_1
1754	sltu	$at,$c_2,$t_1
1755	 $MULTU	($a_2,$b_0)		# mul_add_c(a[2],b[0],c3,c1,c2);
1756	$ADDU	$t_2,$at
1757	$ADDU	$c_3,$t_2
1758	sltu	$c_1,$c_3,$t_2
1759	$ST	$c_2,$BNSZ($a0)
1760
1761	mflo	($t_1,$a_2,$b_0)
1762	mfhi	($t_2,$a_2,$b_0)
1763	$ADDU	$c_3,$t_1
1764	sltu	$at,$c_3,$t_1
1765	$MULTU	($a_1,$b_1)		# mul_add_c(a[1],b[1],c3,c1,c2);
1766	$ADDU	$t_2,$at
1767	$ADDU	$c_1,$t_2
1768	mflo	($t_1,$a_1,$b_1)
1769	mfhi	($t_2,$a_1,$b_1)
1770	$ADDU	$c_3,$t_1
1771	sltu	$at,$c_3,$t_1
1772	$MULTU	($a_0,$b_2)		# mul_add_c(a[0],b[2],c3,c1,c2);
1773	$ADDU	$t_2,$at
1774	$ADDU	$c_1,$t_2
1775	sltu	$c_2,$c_1,$t_2
1776	mflo	($t_1,$a_0,$b_2)
1777	mfhi	($t_2,$a_0,$b_2)
1778	$ADDU	$c_3,$t_1
1779	sltu	$at,$c_3,$t_1
1780	 $MULTU	($a_0,$b_3)		# mul_add_c(a[0],b[3],c1,c2,c3);
1781	$ADDU	$t_2,$at
1782	$ADDU	$c_1,$t_2
1783	sltu	$at,$c_1,$t_2
1784	$ADDU	$c_2,$at
1785	$ST	$c_3,2*$BNSZ($a0)
1786
1787	mflo	($t_1,$a_0,$b_3)
1788	mfhi	($t_2,$a_0,$b_3)
1789	$ADDU	$c_1,$t_1
1790	sltu	$at,$c_1,$t_1
1791	$MULTU	($a_1,$b_2)		# mul_add_c(a[1],b[2],c1,c2,c3);
1792	$ADDU	$t_2,$at
1793	$ADDU	$c_2,$t_2
1794	sltu	$c_3,$c_2,$t_2
1795	mflo	($t_1,$a_1,$b_2)
1796	mfhi	($t_2,$a_1,$b_2)
1797	$ADDU	$c_1,$t_1
1798	sltu	$at,$c_1,$t_1
1799	$MULTU	($a_2,$b_1)		# mul_add_c(a[2],b[1],c1,c2,c3);
1800	$ADDU	$t_2,$at
1801	$ADDU	$c_2,$t_2
1802	sltu	$at,$c_2,$t_2
1803	$ADDU	$c_3,$at
1804	mflo	($t_1,$a_2,$b_1)
1805	mfhi	($t_2,$a_2,$b_1)
1806	$ADDU	$c_1,$t_1
1807	sltu	$at,$c_1,$t_1
1808	$MULTU	($a_3,$b_0)		# mul_add_c(a[3],b[0],c1,c2,c3);
1809	$ADDU	$t_2,$at
1810	$ADDU	$c_2,$t_2
1811	sltu	$at,$c_2,$t_2
1812	$ADDU	$c_3,$at
1813	mflo	($t_1,$a_3,$b_0)
1814	mfhi	($t_2,$a_3,$b_0)
1815	$ADDU	$c_1,$t_1
1816	sltu	$at,$c_1,$t_1
1817	 $MULTU	($a_3,$b_1)		# mul_add_c(a[3],b[1],c2,c3,c1);
1818	$ADDU	$t_2,$at
1819	$ADDU	$c_2,$t_2
1820	sltu	$at,$c_2,$t_2
1821	$ADDU	$c_3,$at
1822	$ST	$c_1,3*$BNSZ($a0)
1823
1824	mflo	($t_1,$a_3,$b_1)
1825	mfhi	($t_2,$a_3,$b_1)
1826	$ADDU	$c_2,$t_1
1827	sltu	$at,$c_2,$t_1
1828	$MULTU	($a_2,$b_2)		# mul_add_c(a[2],b[2],c2,c3,c1);
1829	$ADDU	$t_2,$at
1830	$ADDU	$c_3,$t_2
1831	sltu	$c_1,$c_3,$t_2
1832	mflo	($t_1,$a_2,$b_2)
1833	mfhi	($t_2,$a_2,$b_2)
1834	$ADDU	$c_2,$t_1
1835	sltu	$at,$c_2,$t_1
1836	$MULTU	($a_1,$b_3)		# mul_add_c(a[1],b[3],c2,c3,c1);
1837	$ADDU	$t_2,$at
1838	$ADDU	$c_3,$t_2
1839	sltu	$at,$c_3,$t_2
1840	$ADDU	$c_1,$at
1841	mflo	($t_1,$a_1,$b_3)
1842	mfhi	($t_2,$a_1,$b_3)
1843	$ADDU	$c_2,$t_1
1844	sltu	$at,$c_2,$t_1
1845	 $MULTU	($a_2,$b_3)		# mul_add_c(a[2],b[3],c3,c1,c2);
1846	$ADDU	$t_2,$at
1847	$ADDU	$c_3,$t_2
1848	sltu	$at,$c_3,$t_2
1849	$ADDU	$c_1,$at
1850	$ST	$c_2,4*$BNSZ($a0)
1851
1852	mflo	($t_1,$a_2,$b_3)
1853	mfhi	($t_2,$a_2,$b_3)
1854	$ADDU	$c_3,$t_1
1855	sltu	$at,$c_3,$t_1
1856	$MULTU	($a_3,$b_2)		# mul_add_c(a[3],b[2],c3,c1,c2);
1857	$ADDU	$t_2,$at
1858	$ADDU	$c_1,$t_2
1859	sltu	$c_2,$c_1,$t_2
1860	mflo	($t_1,$a_3,$b_2)
1861	mfhi	($t_2,$a_3,$b_2)
1862	$ADDU	$c_3,$t_1
1863	sltu	$at,$c_3,$t_1
1864	 $MULTU	($a_3,$b_3)		# mul_add_c(a[3],b[3],c1,c2,c3);
1865	$ADDU	$t_2,$at
1866	$ADDU	$c_1,$t_2
1867	sltu	$at,$c_1,$t_2
1868	$ADDU	$c_2,$at
1869	$ST	$c_3,5*$BNSZ($a0)
1870
1871	mflo	($t_1,$a_3,$b_3)
1872	mfhi	($t_2,$a_3,$b_3)
1873	$ADDU	$c_1,$t_1
1874	sltu	$at,$c_1,$t_1
1875	$ADDU	$t_2,$at
1876	$ADDU	$c_2,$t_2
1877	$ST	$c_1,6*$BNSZ($a0)
1878	$ST	$c_2,7*$BNSZ($a0)
1879
1880	.set	noreorder
1881___
1882$code.=<<___ if ($flavour =~ /nubi/i);
1883	$REG_L	$t3,4*$SZREG($sp)
1884	$REG_L	$t2,3*$SZREG($sp)
1885	$REG_L	$t1,2*$SZREG($sp)
1886	$REG_L	$t0,1*$SZREG($sp)
1887	$REG_L	$gp,0*$SZREG($sp)
1888	$PTR_ADD $sp,6*$SZREG
1889___
1890$code.=<<___;
1891	jr	$ra
1892	nop
1893.end	bn_mul_comba4
1894___
1895
1896($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
1897
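# add_c2() emits one mul_add_c2() step for the squaring code below: it
# accumulates twice the product $hi:$lo into the carry trio ($c0,$c1,$c2)
# and kicks off the next ("forward") multiplication early to hide its
# latency.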
1898sub add_c2 () {
1899my ($hi,$lo,$c0,$c1,$c2,
1900    $warm,      # !$warm denotes first call with specific sequence of
1901                # $c_[XYZ] when there is no Z-carry to accumulate yet;
1902    $an,$bn     # these two are the arguments for the multiplication
1903                # whose result is used in the *next* step [which is
1904                # why it's commented as "forward multiplication" below];
1905    )=@_;
1906$code.=<<___;
1907	$ADDU	$c0,$lo
1908	sltu	$at,$c0,$lo
1909	 $MULTU	($an,$bn)		# forward multiplication
1910	$ADDU	$c0,$lo
1911	$ADDU	$at,$hi
1912	sltu	$lo,$c0,$lo
1913	$ADDU	$c1,$at
1914	$ADDU	$hi,$lo
1915___
1916$code.=<<___	if (!$warm);
1917	sltu	$c2,$c1,$at
1918	$ADDU	$c1,$hi
1919___
1920$code.=<<___	if ($warm);
1921	sltu	$at,$c1,$at
1922	$ADDU	$c1,$hi
1923	$ADDU	$c2,$at
1924___
1925$code.=<<___;
1926	sltu	$hi,$c1,$hi
1927	$ADDU	$c2,$hi
1928	mflo	($lo,$an,$bn)
1929	mfhi	($hi,$an,$bn)
1930___
1931}
1932
1933$code.=<<___;
1934
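	# bn_sqr_comba8(r,a): 16-word square of an 8-word operand. Symmetry is
	# exploited: each off-diagonal product a[i]*a[j], i!=j, is added twice
	# (mul_add_c2, via the add_c2() helper), while the diagonal squares
	# a[i]*a[i] are added once.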
1935.align	5
1936.globl	bn_sqr_comba8
1937.ent	bn_sqr_comba8
1938bn_sqr_comba8:
1939___
1940$code.=<<___ if ($flavour =~ /nubi/i);
1941	.frame	$sp,6*$SZREG,$ra
1942	.mask	0x8000f008,-$SZREG
1943	.set	noreorder
1944	$PTR_SUB $sp,6*$SZREG
1945	$REG_S	$ra,5*$SZREG($sp)
1946	$REG_S	$t3,4*$SZREG($sp)
1947	$REG_S	$t2,3*$SZREG($sp)
1948	$REG_S	$t1,2*$SZREG($sp)
1949	$REG_S	$t0,1*$SZREG($sp)
1950	$REG_S	$gp,0*$SZREG($sp)
1951___
1952$code.=<<___;
1953	.set	reorder
1954	$LD	$a_0,0($a1)
1955	$LD	$a_1,$BNSZ($a1)
1956	$LD	$a_2,2*$BNSZ($a1)
1957	$LD	$a_3,3*$BNSZ($a1)
1958
1959	$MULTU	($a_0,$a_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
1960	$LD	$a_4,4*$BNSZ($a1)
1961	$LD	$a_5,5*$BNSZ($a1)
1962	$LD	$a_6,6*$BNSZ($a1)
1963	$LD	$a_7,7*$BNSZ($a1)
1964	mflo	($c_1,$a_0,$a_0)
1965	mfhi	($c_2,$a_0,$a_0)
1966	$ST	$c_1,0($a0)
1967
1968	$MULTU	($a_0,$a_1)		# mul_add_c2(a[0],b[1],c2,c3,c1);
1969	mflo	($t_1,$a_0,$a_1)
1970	mfhi	($t_2,$a_0,$a_1)
1971	slt	$c_1,$t_2,$zero
1972	$SLL	$t_2,1
1973	 $MULTU	($a_2,$a_0)		# mul_add_c2(a[2],b[0],c3,c1,c2);
1974	slt	$a2,$t_1,$zero
1975	$ADDU	$t_2,$a2
1976	$SLL	$t_1,1
1977	$ADDU	$c_2,$t_1
1978	sltu	$at,$c_2,$t_1
1979	$ADDU	$c_3,$t_2,$at
1980	$ST	$c_2,$BNSZ($a0)
1981	mflo	($t_1,$a_2,$a_0)
1982	mfhi	($t_2,$a_2,$a_0)
1983___
1984	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1985		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
1986$code.=<<___;
1987	$ADDU	$c_3,$t_1
1988	sltu	$at,$c_3,$t_1
1989	 $MULTU	($a_0,$a_3)		# mul_add_c2(a[0],b[3],c1,c2,c3);
1990	$ADDU	$t_2,$at
1991	$ADDU	$c_1,$t_2
1992	sltu	$at,$c_1,$t_2
1993	$ADDU	$c_2,$at
1994	$ST	$c_3,2*$BNSZ($a0)
1995	mflo	($t_1,$a_0,$a_3)
1996	mfhi	($t_2,$a_0,$a_3)
1997___
1998	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
1999		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
2000	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2001		$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
2002$code.=<<___;
2003	$ST	$c_1,3*$BNSZ($a0)
2004___
2005	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2006		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
2007	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2008		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
2009$code.=<<___;
2010	$ADDU	$c_2,$t_1
2011	sltu	$at,$c_2,$t_1
2012	 $MULTU	($a_0,$a_5)		# mul_add_c2(a[0],b[5],c3,c1,c2);
2013	$ADDU	$t_2,$at
2014	$ADDU	$c_3,$t_2
2015	sltu	$at,$c_3,$t_2
2016	$ADDU	$c_1,$at
2017	$ST	$c_2,4*$BNSZ($a0)
2018	mflo	($t_1,$a_0,$a_5)
2019	mfhi	($t_2,$a_0,$a_5)
2020___
2021	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2022		$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
2023	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2024		$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
2025	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2026		$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
2027$code.=<<___;
2028	$ST	$c_3,5*$BNSZ($a0)
2029___
2030	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2031		$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
2032	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2033		$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
2034	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2035		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
2036$code.=<<___;
2037	$ADDU	$c_1,$t_1
2038	sltu	$at,$c_1,$t_1
2039	 $MULTU	($a_0,$a_7)		# mul_add_c2(a[0],b[7],c2,c3,c1);
2040	$ADDU	$t_2,$at
2041	$ADDU	$c_2,$t_2
2042	sltu	$at,$c_2,$t_2
2043	$ADDU	$c_3,$at
2044	$ST	$c_1,6*$BNSZ($a0)
2045	mflo	($t_1,$a_0,$a_7)
2046	mfhi	($t_2,$a_0,$a_7)
2047___
2048	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2049		$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
2050	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2051		$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
2052	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2053		$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
2054	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2055		$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
2056$code.=<<___;
2057	$ST	$c_2,7*$BNSZ($a0)
2058___
2059	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2060		$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
2061	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2062		$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
2063	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2064		$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
2065$code.=<<___;
2066	$ADDU	$c_3,$t_1
2067	sltu	$at,$c_3,$t_1
2068	 $MULTU	($a_2,$a_7)		# mul_add_c2(a[2],b[7],c1,c2,c3);
2069	$ADDU	$t_2,$at
2070	$ADDU	$c_1,$t_2
2071	sltu	$at,$c_1,$t_2
2072	$ADDU	$c_2,$at
2073	$ST	$c_3,8*$BNSZ($a0)
2074	mflo	($t_1,$a_2,$a_7)
2075	mfhi	($t_2,$a_2,$a_7)
2076___
2077	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2078		$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
2079	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2080		$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
2081	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2082		$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
2083$code.=<<___;
2084	$ST	$c_1,9*$BNSZ($a0)
2085___
2086	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2087		$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
2088	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2089		$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
2090$code.=<<___;
2091	$ADDU	$c_2,$t_1
2092	sltu	$at,$c_2,$t_1
2093	 $MULTU	($a_4,$a_7)		# mul_add_c2(a[4],b[7],c3,c1,c2);
2094	$ADDU	$t_2,$at
2095	$ADDU	$c_3,$t_2
2096	sltu	$at,$c_3,$t_2
2097	$ADDU	$c_1,$at
2098	$ST	$c_2,10*$BNSZ($a0)
2099	mflo	($t_1,$a_4,$a_7)
2100	mfhi	($t_2,$a_4,$a_7)
2101___
2102	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2103		$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
2104	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2105		$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
2106$code.=<<___;
2107	$ST	$c_3,11*$BNSZ($a0)
2108___
2109	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2110		$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
2111$code.=<<___;
2112	$ADDU	$c_1,$t_1
2113	sltu	$at,$c_1,$t_1
2114	 $MULTU	($a_6,$a_7)		# mul_add_c2(a[6],b[7],c2,c3,c1);
2115	$ADDU	$t_2,$at
2116	$ADDU	$c_2,$t_2
2117	sltu	$at,$c_2,$t_2
2118	$ADDU	$c_3,$at
2119	$ST	$c_1,12*$BNSZ($a0)
2120	mflo	($t_1,$a_6,$a_7)
2121	mfhi	($t_2,$a_6,$a_7)
2122___
2123	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2124		$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
2125$code.=<<___;
2126	$ST	$c_2,13*$BNSZ($a0)
2127
2128	$ADDU	$c_3,$t_1
2129	sltu	$at,$c_3,$t_1
2130	$ADDU	$t_2,$at
2131	$ADDU	$c_1,$t_2
2132	$ST	$c_3,14*$BNSZ($a0)
2133	$ST	$c_1,15*$BNSZ($a0)
2134
2135	.set	noreorder
2136___
2137$code.=<<___ if ($flavour =~ /nubi/i);
2138	$REG_L	$t3,4*$SZREG($sp)
2139	$REG_L	$t2,3*$SZREG($sp)
2140	$REG_L	$t1,2*$SZREG($sp)
2141	$REG_L	$t0,1*$SZREG($sp)
2142	$REG_L	$gp,0*$SZREG($sp)
2143	$PTR_ADD $sp,6*$SZREG
2144___
2145$code.=<<___;
2146	jr	$ra
2147	nop
2148.end	bn_sqr_comba8
2149
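	# bn_sqr_comba4(r,a): 4-word variant of the Comba squaring above,
	# producing an 8-word result.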
2150.align	5
2151.globl	bn_sqr_comba4
2152.ent	bn_sqr_comba4
2153bn_sqr_comba4:
2154___
2155$code.=<<___ if ($flavour =~ /nubi/i);
2156	.frame	$sp,6*$SZREG,$ra
2157	.mask	0x8000f008,-$SZREG
2158	.set	noreorder
2159	$PTR_SUB $sp,6*$SZREG
2160	$REG_S	$ra,5*$SZREG($sp)
2161	$REG_S	$t3,4*$SZREG($sp)
2162	$REG_S	$t2,3*$SZREG($sp)
2163	$REG_S	$t1,2*$SZREG($sp)
2164	$REG_S	$t0,1*$SZREG($sp)
2165	$REG_S	$gp,0*$SZREG($sp)
2166___
2167$code.=<<___;
2168	.set	reorder
2169	$LD	$a_0,0($a1)
2170	$LD	$a_1,$BNSZ($a1)
2171	$MULTU	($a_0,$a_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
2172	$LD	$a_2,2*$BNSZ($a1)
2173	$LD	$a_3,3*$BNSZ($a1)
2174	mflo	($c_1,$a_0,$a_0)
2175	mfhi	($c_2,$a_0,$a_0)
2176	$ST	$c_1,0($a0)
2177
2178	$MULTU	($a_0,$a_1)		# mul_add_c2(a[0],b[1],c2,c3,c1);
2179	mflo	($t_1,$a_0,$a_1)
2180	mfhi	($t_2,$a_0,$a_1)
2181	slt	$c_1,$t_2,$zero
2182	$SLL	$t_2,1
2183	 $MULTU	($a_2,$a_0)		# mul_add_c2(a[2],b[0],c3,c1,c2);
2184	slt	$a2,$t_1,$zero
2185	$ADDU	$t_2,$a2
2186	$SLL	$t_1,1
2187	$ADDU	$c_2,$t_1
2188	sltu	$at,$c_2,$t_1
2189	$ADDU	$c_3,$t_2,$at
2190	$ST	$c_2,$BNSZ($a0)
2191	mflo	($t_1,$a_2,$a_0)
2192	mfhi	($t_2,$a_2,$a_0)
2193___
2194	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2195		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
2196$code.=<<___;
2197	$ADDU	$c_3,$t_1
2198	sltu	$at,$c_3,$t_1
2199	 $MULTU	($a_0,$a_3)		# mul_add_c2(a[0],b[3],c1,c2,c3);
2200	$ADDU	$t_2,$at
2201	$ADDU	$c_1,$t_2
2202	sltu	$at,$c_1,$t_2
2203	$ADDU	$c_2,$at
2204	$ST	$c_3,2*$BNSZ($a0)
2205	mflo	($t_1,$a_0,$a_3)
2206	mfhi	($t_2,$a_0,$a_3)
2207___
2208	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2209		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
2210	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2211		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
2212$code.=<<___;
2213	$ST	$c_1,3*$BNSZ($a0)
2214___
2215	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2216		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
2217$code.=<<___;
2218	$ADDU	$c_2,$t_1
2219	sltu	$at,$c_2,$t_1
2220	 $MULTU	($a_2,$a_3)		# mul_add_c2(a[2],b[3],c3,c1,c2);
2221	$ADDU	$t_2,$at
2222	$ADDU	$c_3,$t_2
2223	sltu	$at,$c_3,$t_2
2224	$ADDU	$c_1,$at
2225	$ST	$c_2,4*$BNSZ($a0)
2226	mflo	($t_1,$a_2,$a_3)
2227	mfhi	($t_2,$a_2,$a_3)
2228___
2229	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2230		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
2231$code.=<<___;
2232	$ST	$c_3,5*$BNSZ($a0)
2233
2234	$ADDU	$c_1,$t_1
2235	sltu	$at,$c_1,$t_1
2236	$ADDU	$t_2,$at
2237	$ADDU	$c_2,$t_2
2238	$ST	$c_1,6*$BNSZ($a0)
2239	$ST	$c_2,7*$BNSZ($a0)
2240
2241	.set	noreorder
2242___
2243$code.=<<___ if ($flavour =~ /nubi/i);
2244	$REG_L	$t3,4*$SZREG($sp)
2245	$REG_L	$t2,3*$SZREG($sp)
2246	$REG_L	$t1,2*$SZREG($sp)
2247	$REG_L	$t0,1*$SZREG($sp)
2248	$REG_L	$gp,0*$SZREG($sp)
2249	$PTR_ADD $sp,6*$SZREG
2250___
2251$code.=<<___;
2252	jr	$ra
2253	nop
2254.end	bn_sqr_comba4
2255___
2256print $code;
2257close STDOUT or die "error closing STDOUT: $!";
2258