1#! /usr/bin/env perl
2# Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# ECP_NISTZ256 module for SPARCv9.
18#
19# February 2015.
20#
21# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22# http://eprint.iacr.org/2013/816. In the process of adaptation
23# original .c module was made 32-bit savvy in order to make this
24# implementation possible.
25#
26#			with/without -DECP_NISTZ256_ASM
27# UltraSPARC III	+12-18%
28# SPARC T4		+99-550% (+66-150% on 32-bit Solaris)
29#
30# Ranges denote minimum and maximum improvement coefficients depending
31# on benchmark. Lower coefficients are for ECDSA sign, server-side
32# operation. Keep in mind that +200% means 3x improvement.
33
34$output = pop and open STDOUT,">$output";
35
36$code.=<<___;
37#ifndef __ASSEMBLER__
38# define __ASSEMBLER__ 1
39#endif
40#include "crypto/sparc_arch.h"
41
42#define LOCALS	(STACK_BIAS+STACK_FRAME)
43#ifdef	__arch64__
44.register	%g2,#scratch
45.register	%g3,#scratch
46# define STACK64_FRAME	STACK_FRAME
47# define LOCALS64	LOCALS
48#else
49# define STACK64_FRAME	(2047+192)
50# define LOCALS64	STACK64_FRAME
51#endif
52
53.section	".text",#alloc,#execinstr
54___
55########################################################################
56# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
57#
58$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
59open TABLE,"<ecp_nistz256_table.c"		or
60open TABLE,"<${dir}../ecp_nistz256_table.c"	or
61die "failed to open ecp_nistz256_table.c:",$!;
62
63use integer;
64
65foreach(<TABLE>) {
66	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
67}
68close TABLE;
69
# See ecp_nistz256_table.c for the explanation of why it's 64*16*37
# (37 windows of 64 points, 16 32-bit words per affine point).
# 64*16*37-1 is because $#arr returns the last valid index of @arr, not
# the number of elements.
73die "insane number of elements" if ($#arr != 64*16*37-1);
74
75$code.=<<___;
76.globl	ecp_nistz256_precomputed
77.align	4096
78ecp_nistz256_precomputed:
79___
80########################################################################
# this conversion scatters each P256_POINT_AFFINE into individual bytes
# at 64-byte intervals, similar to
83#	1111222233334444
84#	1234123412341234
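# In other words, output row i holds byte i of all 64 table entries, so
# the gather_w7 routine below can pick up one entry a byte at a time
# with ldub at a fixed 64-byte stride.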
85for(1..37) {
86	@tbl = splice(@arr,0,64*16);
87	for($i=0;$i<64;$i++) {
88		undef @line;
89		for($j=0;$j<64;$j++) {
90			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
91		}
92		$code.=".byte\t";
93		$code.=join(',',map { sprintf "0x%02x",$_} @line);
94		$code.="\n";
95	}
96}
97
98{{{
99my ($rp,$ap,$bp)=map("%i$_",(0..2));
100my @acc=map("%l$_",(0..7));
101my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7)=(map("%o$_",(0..5)),"%g4","%g5");
102my ($bi,$a0,$mask,$carry)=(map("%i$_",(3..5)),"%g1");
103my ($rp_real,$ap_real)=("%g2","%g3");
104
105$code.=<<___;
106.type	ecp_nistz256_precomputed,#object
107.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
108.align	64
109.LRR:	! 2^512 mod P precomputed for NIST P256 polynomial
110.long	0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
111.long	0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
112.Lone:
113.long	1,0,0,0,0,0,0,0
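! .LRR is the operand used by ecp_nistz256_to_mont: a*RR*2^-256 =
! a*2^256 mod P, i.e. the Montgomery form of a. .Lone is used by
! ecp_nistz256_from_mont to strip the 2^256 factor again.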
114.asciz	"ECP_NISTZ256 for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
115
116! void	ecp_nistz256_to_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
117.globl	ecp_nistz256_to_mont
118.align	64
119ecp_nistz256_to_mont:
120	save	%sp,-STACK_FRAME,%sp
121	nop
1221:	call	.+8
123	add	%o7,.LRR-1b,$bp
124	call	__ecp_nistz256_mul_mont
125	nop
126	ret
127	restore
128.type	ecp_nistz256_to_mont,#function
129.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
130
131! void	ecp_nistz256_from_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
132.globl	ecp_nistz256_from_mont
133.align	32
134ecp_nistz256_from_mont:
135	save	%sp,-STACK_FRAME,%sp
136	nop
1371:	call	.+8
138	add	%o7,.Lone-1b,$bp
139	call	__ecp_nistz256_mul_mont
140	nop
141	ret
142	restore
143.type	ecp_nistz256_from_mont,#function
144.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
145
146! void	ecp_nistz256_mul_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8],
147!					      const BN_ULONG %i2[8]);
148.globl	ecp_nistz256_mul_mont
149.align	32
150ecp_nistz256_mul_mont:
151	save	%sp,-STACK_FRAME,%sp
152	nop
153	call	__ecp_nistz256_mul_mont
154	nop
155	ret
156	restore
157.type	ecp_nistz256_mul_mont,#function
158.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
159
! void	ecp_nistz256_sqr_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
161.globl	ecp_nistz256_sqr_mont
162.align	32
163ecp_nistz256_sqr_mont:
164	save	%sp,-STACK_FRAME,%sp
165	mov	$ap,$bp
166	call	__ecp_nistz256_mul_mont
167	nop
168	ret
169	restore
170.type	ecp_nistz256_sqr_mont,#function
171.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
172___
173
174########################################################################
# A special thing to keep in mind is that $t0-$t7 hold 64-bit values,
# while all others are meant to hold 32-bit ones. "Meant to" means that
# additions to @acc[0-7] do "contaminate" the upper bits, but those are
# cleared before they can affect the outcome (follow the 'and' with
# $mask). Also keep in mind that addition with carry is addition with a
# 32-bit carry, even though the CPU is 64-bit. [Addition with 64-bit
# carry was introduced in T3; see the VIS3 code paths below.]
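# For instance, addccc below consumes and produces the 32-bit %icc
# carry, so it is the carry out of bit 31 that propagates along the
# chain; whatever accumulates in the upper halves is masked off later.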
182
183$code.=<<___;
184.align	32
185__ecp_nistz256_mul_mont:
186	ld	[$bp+0],$bi		! b[0]
187	mov	-1,$mask
188	ld	[$ap+0],$a0
189	srl	$mask,0,$mask		! 0xffffffff
190	ld	[$ap+4],$t1
191	ld	[$ap+8],$t2
192	ld	[$ap+12],$t3
193	ld	[$ap+16],$t4
194	ld	[$ap+20],$t5
195	ld	[$ap+24],$t6
196	ld	[$ap+28],$t7
197	mulx	$a0,$bi,$t0		! a[0-7]*b[0], 64-bit results
198	mulx	$t1,$bi,$t1
199	mulx	$t2,$bi,$t2
200	mulx	$t3,$bi,$t3
201	mulx	$t4,$bi,$t4
202	mulx	$t5,$bi,$t5
203	mulx	$t6,$bi,$t6
204	mulx	$t7,$bi,$t7
205	srlx	$t0,32,@acc[1]		! extract high parts
206	srlx	$t1,32,@acc[2]
207	srlx	$t2,32,@acc[3]
208	srlx	$t3,32,@acc[4]
209	srlx	$t4,32,@acc[5]
210	srlx	$t5,32,@acc[6]
211	srlx	$t6,32,@acc[7]
212	srlx	$t7,32,@acc[0]		! "@acc[8]"
213	mov	0,$carry
214___
215for($i=1;$i<8;$i++) {
216$code.=<<___;
217	addcc	@acc[1],$t1,@acc[1]	! accumulate high parts
218	ld	[$bp+4*$i],$bi		! b[$i]
219	ld	[$ap+4],$t1		! re-load a[1-7]
220	addccc	@acc[2],$t2,@acc[2]
221	addccc	@acc[3],$t3,@acc[3]
222	ld	[$ap+8],$t2
223	ld	[$ap+12],$t3
224	addccc	@acc[4],$t4,@acc[4]
225	addccc	@acc[5],$t5,@acc[5]
226	ld	[$ap+16],$t4
227	ld	[$ap+20],$t5
228	addccc	@acc[6],$t6,@acc[6]
229	addccc	@acc[7],$t7,@acc[7]
230	ld	[$ap+24],$t6
231	ld	[$ap+28],$t7
232	addccc	@acc[0],$carry,@acc[0]	! "@acc[8]"
233	addc	%g0,%g0,$carry
234___
	# A reduction iteration is normally performed by accumulating the
	# result of multiplying the modulus by a "magic" digit [and
	# omitting the least significant word, which is guaranteed to
	# be 0], but thanks to the special form of the modulus, and the
	# "magic" digit being equal to the least significant word, it can
	# be performed with additions and subtractions alone. Indeed:
241	#
242	#        ffff.0001.0000.0000.0000.ffff.ffff.ffff
243	# *                                         abcd
244	# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
245	#
246	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
247	# rewrite above as:
248	#
249	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
250	# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
251	# -      abcd.0000.0000.0000.0000.0000.0000.abcd
252	#
253	# or marking redundant operations:
254	#
255	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
256	# + abcd.0000.abcd.0000.0000.abcd.----.----.----
257	# -      abcd.----.----.----.----.----.----.----
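	#
	# In terms of the instructions below, with d = r[0]:
	#   r[3] += d	(the 2^96 term)
	#   r[6] += d	(the 2^192 term)
	#   r[8] += d	(the 2^256 term)
	#   r[7] -= d	(the 2^224 term)
	# and the trailing -d cancels r[0] itself, which is then simply
	# dropped when the accumulator registers are rotated.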
258
259$code.=<<___;
260	! multiplication-less reduction
261	addcc	@acc[3],$t0,@acc[3]	! r[3]+=r[0]
262	addccc	@acc[4],%g0,@acc[4]	! r[4]+=0
263	 and	@acc[1],$mask,@acc[1]
264	 and	@acc[2],$mask,@acc[2]
265	addccc	@acc[5],%g0,@acc[5]	! r[5]+=0
266	addccc	@acc[6],$t0,@acc[6]	! r[6]+=r[0]
267	 and	@acc[3],$mask,@acc[3]
268	 and	@acc[4],$mask,@acc[4]
269	addccc	@acc[7],%g0,@acc[7]	! r[7]+=0
270	addccc	@acc[0],$t0,@acc[0]	! r[8]+=r[0]	"@acc[8]"
271	 and	@acc[5],$mask,@acc[5]
272	 and	@acc[6],$mask,@acc[6]
273	addc	$carry,%g0,$carry	! top-most carry
274	subcc	@acc[7],$t0,@acc[7]	! r[7]-=r[0]
275	subccc	@acc[0],%g0,@acc[0]	! r[8]-=0	"@acc[8]"
276	subc	$carry,%g0,$carry	! top-most carry
277	 and	@acc[7],$mask,@acc[7]
278	 and	@acc[0],$mask,@acc[0]	! "@acc[8]"
279___
280	push(@acc,shift(@acc));		# rotate registers to "omit" acc[0]
281$code.=<<___;
282	mulx	$a0,$bi,$t0		! a[0-7]*b[$i], 64-bit results
283	mulx	$t1,$bi,$t1
284	mulx	$t2,$bi,$t2
285	mulx	$t3,$bi,$t3
286	mulx	$t4,$bi,$t4
287	mulx	$t5,$bi,$t5
288	mulx	$t6,$bi,$t6
289	mulx	$t7,$bi,$t7
290	add	@acc[0],$t0,$t0		! accumulate low parts, can't overflow
291	add	@acc[1],$t1,$t1
292	srlx	$t0,32,@acc[1]		! extract high parts
293	add	@acc[2],$t2,$t2
294	srlx	$t1,32,@acc[2]
295	add	@acc[3],$t3,$t3
296	srlx	$t2,32,@acc[3]
297	add	@acc[4],$t4,$t4
298	srlx	$t3,32,@acc[4]
299	add	@acc[5],$t5,$t5
300	srlx	$t4,32,@acc[5]
301	add	@acc[6],$t6,$t6
302	srlx	$t5,32,@acc[6]
303	add	@acc[7],$t7,$t7
304	srlx	$t6,32,@acc[7]
305	srlx	$t7,32,@acc[0]		! "@acc[8]"
306___
307}
308$code.=<<___;
309	addcc	@acc[1],$t1,@acc[1]	! accumulate high parts
310	addccc	@acc[2],$t2,@acc[2]
311	addccc	@acc[3],$t3,@acc[3]
312	addccc	@acc[4],$t4,@acc[4]
313	addccc	@acc[5],$t5,@acc[5]
314	addccc	@acc[6],$t6,@acc[6]
315	addccc	@acc[7],$t7,@acc[7]
316	addccc	@acc[0],$carry,@acc[0]	! "@acc[8]"
317	addc	%g0,%g0,$carry
318
319	addcc	@acc[3],$t0,@acc[3]	! multiplication-less reduction
320	addccc	@acc[4],%g0,@acc[4]
321	addccc	@acc[5],%g0,@acc[5]
322	addccc	@acc[6],$t0,@acc[6]
323	addccc	@acc[7],%g0,@acc[7]
324	addccc	@acc[0],$t0,@acc[0]	! "@acc[8]"
325	addc	$carry,%g0,$carry
326	subcc	@acc[7],$t0,@acc[7]
327	subccc	@acc[0],%g0,@acc[0]	! "@acc[8]"
328	subc	$carry,%g0,$carry	! top-most carry
329___
330	push(@acc,shift(@acc));		# rotate registers to omit acc[0]
331$code.=<<___;
	! The final step is "if result > mod, subtract mod", but we do it
	! the "other way around": subtract the modulus from the result
	! and, if that borrowed, add the modulus back.
335
336	subcc	@acc[0],-1,@acc[0]	! subtract modulus
337	subccc	@acc[1],-1,@acc[1]
338	subccc	@acc[2],-1,@acc[2]
339	subccc	@acc[3],0,@acc[3]
340	subccc	@acc[4],0,@acc[4]
341	subccc	@acc[5],0,@acc[5]
342	subccc	@acc[6],1,@acc[6]
343	subccc	@acc[7],-1,@acc[7]
344	subc	$carry,0,$carry		! broadcast borrow bit
345
346	! Note that because mod has special form, i.e. consists of
347	! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
348	! using value of broadcasted borrow and the borrow bit itself.
349	! To minimize dependency chain we first broadcast and then
350	! extract the bit by negating (follow $bi).
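	! Concretely, the modulus words are { -1, -1, -1, 0, 0, 0, 1, -1 },
	! so the broadcast borrow in $carry supplies the 0xffffffff words
	! and $bi = -$carry supplies the single 1; if there was no borrow,
	! both are zero and nothing is added.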
351
352	addcc	@acc[0],$carry,@acc[0]	! add modulus or zero
353	addccc	@acc[1],$carry,@acc[1]
354	neg	$carry,$bi
355	st	@acc[0],[$rp]
356	addccc	@acc[2],$carry,@acc[2]
357	st	@acc[1],[$rp+4]
358	addccc	@acc[3],0,@acc[3]
359	st	@acc[2],[$rp+8]
360	addccc	@acc[4],0,@acc[4]
361	st	@acc[3],[$rp+12]
362	addccc	@acc[5],0,@acc[5]
363	st	@acc[4],[$rp+16]
364	addccc	@acc[6],$bi,@acc[6]
365	st	@acc[5],[$rp+20]
366	addc	@acc[7],$carry,@acc[7]
367	st	@acc[6],[$rp+24]
368	retl
369	st	@acc[7],[$rp+28]
370.type	__ecp_nistz256_mul_mont,#function
371.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
372
373! void	ecp_nistz256_add(BN_ULONG %i0[8],const BN_ULONG %i1[8],
374!					 const BN_ULONG %i2[8]);
375.globl	ecp_nistz256_add
376.align	32
377ecp_nistz256_add:
378	save	%sp,-STACK_FRAME,%sp
379	ld	[$ap],@acc[0]
380	ld	[$ap+4],@acc[1]
381	ld	[$ap+8],@acc[2]
382	ld	[$ap+12],@acc[3]
383	ld	[$ap+16],@acc[4]
384	ld	[$ap+20],@acc[5]
385	ld	[$ap+24],@acc[6]
386	call	__ecp_nistz256_add
387	ld	[$ap+28],@acc[7]
388	ret
389	restore
390.type	ecp_nistz256_add,#function
391.size	ecp_nistz256_add,.-ecp_nistz256_add
392
393.align	32
394__ecp_nistz256_add:
395	ld	[$bp+0],$t0		! b[0]
396	ld	[$bp+4],$t1
397	ld	[$bp+8],$t2
398	ld	[$bp+12],$t3
399	addcc	@acc[0],$t0,@acc[0]
400	ld	[$bp+16],$t4
401	ld	[$bp+20],$t5
402	addccc	@acc[1],$t1,@acc[1]
403	ld	[$bp+24],$t6
404	ld	[$bp+28],$t7
405	addccc	@acc[2],$t2,@acc[2]
406	addccc	@acc[3],$t3,@acc[3]
407	addccc	@acc[4],$t4,@acc[4]
408	addccc	@acc[5],$t5,@acc[5]
409	addccc	@acc[6],$t6,@acc[6]
410	addccc	@acc[7],$t7,@acc[7]
411	addc	%g0,%g0,$carry
412
413.Lreduce_by_sub:
414
415	! if a+b >= modulus, subtract modulus.
416	!
417	! But since comparison implies subtraction, we subtract
418	! modulus and then add it back if subtraction borrowed.
419
420	subcc	@acc[0],-1,@acc[0]
421	subccc	@acc[1],-1,@acc[1]
422	subccc	@acc[2],-1,@acc[2]
423	subccc	@acc[3], 0,@acc[3]
424	subccc	@acc[4], 0,@acc[4]
425	subccc	@acc[5], 0,@acc[5]
426	subccc	@acc[6], 1,@acc[6]
427	subccc	@acc[7],-1,@acc[7]
428	subc	$carry,0,$carry
429
430	! Note that because mod has special form, i.e. consists of
431	! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
432	! using value of borrow and its negative.
433
434	addcc	@acc[0],$carry,@acc[0]	! add synthesized modulus
435	addccc	@acc[1],$carry,@acc[1]
436	neg	$carry,$bi
437	st	@acc[0],[$rp]
438	addccc	@acc[2],$carry,@acc[2]
439	st	@acc[1],[$rp+4]
440	addccc	@acc[3],0,@acc[3]
441	st	@acc[2],[$rp+8]
442	addccc	@acc[4],0,@acc[4]
443	st	@acc[3],[$rp+12]
444	addccc	@acc[5],0,@acc[5]
445	st	@acc[4],[$rp+16]
446	addccc	@acc[6],$bi,@acc[6]
447	st	@acc[5],[$rp+20]
448	addc	@acc[7],$carry,@acc[7]
449	st	@acc[6],[$rp+24]
450	retl
451	st	@acc[7],[$rp+28]
452.type	__ecp_nistz256_add,#function
453.size	__ecp_nistz256_add,.-__ecp_nistz256_add
454
455! void	ecp_nistz256_mul_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
456.globl	ecp_nistz256_mul_by_2
457.align	32
458ecp_nistz256_mul_by_2:
459	save	%sp,-STACK_FRAME,%sp
460	ld	[$ap],@acc[0]
461	ld	[$ap+4],@acc[1]
462	ld	[$ap+8],@acc[2]
463	ld	[$ap+12],@acc[3]
464	ld	[$ap+16],@acc[4]
465	ld	[$ap+20],@acc[5]
466	ld	[$ap+24],@acc[6]
467	call	__ecp_nistz256_mul_by_2
468	ld	[$ap+28],@acc[7]
469	ret
470	restore
471.type	ecp_nistz256_mul_by_2,#function
472.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
473
474.align	32
475__ecp_nistz256_mul_by_2:
476	addcc	@acc[0],@acc[0],@acc[0]	! a+a=2*a
477	addccc	@acc[1],@acc[1],@acc[1]
478	addccc	@acc[2],@acc[2],@acc[2]
479	addccc	@acc[3],@acc[3],@acc[3]
480	addccc	@acc[4],@acc[4],@acc[4]
481	addccc	@acc[5],@acc[5],@acc[5]
482	addccc	@acc[6],@acc[6],@acc[6]
483	addccc	@acc[7],@acc[7],@acc[7]
484	b	.Lreduce_by_sub
485	addc	%g0,%g0,$carry
486.type	__ecp_nistz256_mul_by_2,#function
487.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
488
489! void	ecp_nistz256_mul_by_3(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
490.globl	ecp_nistz256_mul_by_3
491.align	32
492ecp_nistz256_mul_by_3:
493	save	%sp,-STACK_FRAME,%sp
494	ld	[$ap],@acc[0]
495	ld	[$ap+4],@acc[1]
496	ld	[$ap+8],@acc[2]
497	ld	[$ap+12],@acc[3]
498	ld	[$ap+16],@acc[4]
499	ld	[$ap+20],@acc[5]
500	ld	[$ap+24],@acc[6]
501	call	__ecp_nistz256_mul_by_3
502	ld	[$ap+28],@acc[7]
503	ret
504	restore
505.type	ecp_nistz256_mul_by_3,#function
506.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
507
508.align	32
509__ecp_nistz256_mul_by_3:
510	addcc	@acc[0],@acc[0],$t0	! a+a=2*a
511	addccc	@acc[1],@acc[1],$t1
512	addccc	@acc[2],@acc[2],$t2
513	addccc	@acc[3],@acc[3],$t3
514	addccc	@acc[4],@acc[4],$t4
515	addccc	@acc[5],@acc[5],$t5
516	addccc	@acc[6],@acc[6],$t6
517	addccc	@acc[7],@acc[7],$t7
518	addc	%g0,%g0,$carry
519
520	subcc	$t0,-1,$t0		! .Lreduce_by_sub but without stores
521	subccc	$t1,-1,$t1
522	subccc	$t2,-1,$t2
523	subccc	$t3, 0,$t3
524	subccc	$t4, 0,$t4
525	subccc	$t5, 0,$t5
526	subccc	$t6, 1,$t6
527	subccc	$t7,-1,$t7
528	subc	$carry,0,$carry
529
530	addcc	$t0,$carry,$t0		! add synthesized modulus
531	addccc	$t1,$carry,$t1
532	neg	$carry,$bi
533	addccc	$t2,$carry,$t2
534	addccc	$t3,0,$t3
535	addccc	$t4,0,$t4
536	addccc	$t5,0,$t5
537	addccc	$t6,$bi,$t6
538	addc	$t7,$carry,$t7
539
540	addcc	$t0,@acc[0],@acc[0]	! 2*a+a=3*a
541	addccc	$t1,@acc[1],@acc[1]
542	addccc	$t2,@acc[2],@acc[2]
543	addccc	$t3,@acc[3],@acc[3]
544	addccc	$t4,@acc[4],@acc[4]
545	addccc	$t5,@acc[5],@acc[5]
546	addccc	$t6,@acc[6],@acc[6]
547	addccc	$t7,@acc[7],@acc[7]
548	b	.Lreduce_by_sub
549	addc	%g0,%g0,$carry
550.type	__ecp_nistz256_mul_by_3,#function
551.size	__ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
552
553! void	ecp_nistz256_sub(BN_ULONG %i0[8],const BN_ULONG %i1[8],
554!				         const BN_ULONG %i2[8]);
555.globl	ecp_nistz256_sub
556.align	32
557ecp_nistz256_sub:
558	save	%sp,-STACK_FRAME,%sp
559	ld	[$ap],@acc[0]
560	ld	[$ap+4],@acc[1]
561	ld	[$ap+8],@acc[2]
562	ld	[$ap+12],@acc[3]
563	ld	[$ap+16],@acc[4]
564	ld	[$ap+20],@acc[5]
565	ld	[$ap+24],@acc[6]
566	call	__ecp_nistz256_sub_from
567	ld	[$ap+28],@acc[7]
568	ret
569	restore
570.type	ecp_nistz256_sub,#function
571.size	ecp_nistz256_sub,.-ecp_nistz256_sub
572
573! void	ecp_nistz256_neg(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
574.globl	ecp_nistz256_neg
575.align	32
576ecp_nistz256_neg:
577	save	%sp,-STACK_FRAME,%sp
578	mov	$ap,$bp
579	mov	0,@acc[0]
580	mov	0,@acc[1]
581	mov	0,@acc[2]
582	mov	0,@acc[3]
583	mov	0,@acc[4]
584	mov	0,@acc[5]
585	mov	0,@acc[6]
586	call	__ecp_nistz256_sub_from
587	mov	0,@acc[7]
588	ret
589	restore
590.type	ecp_nistz256_neg,#function
591.size	ecp_nistz256_neg,.-ecp_nistz256_neg
592
593.align	32
594__ecp_nistz256_sub_from:
595	ld	[$bp+0],$t0		! b[0]
596	ld	[$bp+4],$t1
597	ld	[$bp+8],$t2
598	ld	[$bp+12],$t3
599	subcc	@acc[0],$t0,@acc[0]
600	ld	[$bp+16],$t4
601	ld	[$bp+20],$t5
602	subccc	@acc[1],$t1,@acc[1]
603	subccc	@acc[2],$t2,@acc[2]
604	ld	[$bp+24],$t6
605	ld	[$bp+28],$t7
606	subccc	@acc[3],$t3,@acc[3]
607	subccc	@acc[4],$t4,@acc[4]
608	subccc	@acc[5],$t5,@acc[5]
609	subccc	@acc[6],$t6,@acc[6]
610	subccc	@acc[7],$t7,@acc[7]
611	subc	%g0,%g0,$carry		! broadcast borrow bit
612
613.Lreduce_by_add:
614
615	! if a-b borrows, add modulus.
616	!
617	! Note that because mod has special form, i.e. consists of
618	! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
619	! using value of broadcasted borrow and the borrow bit itself.
620	! To minimize dependency chain we first broadcast and then
621	! extract the bit by negating (follow $bi).
622
623	addcc	@acc[0],$carry,@acc[0]	! add synthesized modulus
624	addccc	@acc[1],$carry,@acc[1]
625	neg	$carry,$bi
626	st	@acc[0],[$rp]
627	addccc	@acc[2],$carry,@acc[2]
628	st	@acc[1],[$rp+4]
629	addccc	@acc[3],0,@acc[3]
630	st	@acc[2],[$rp+8]
631	addccc	@acc[4],0,@acc[4]
632	st	@acc[3],[$rp+12]
633	addccc	@acc[5],0,@acc[5]
634	st	@acc[4],[$rp+16]
635	addccc	@acc[6],$bi,@acc[6]
636	st	@acc[5],[$rp+20]
637	addc	@acc[7],$carry,@acc[7]
638	st	@acc[6],[$rp+24]
639	retl
640	st	@acc[7],[$rp+28]
641.type	__ecp_nistz256_sub_from,#function
642.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
643
644.align	32
645__ecp_nistz256_sub_morf:
646	ld	[$bp+0],$t0		! b[0]
647	ld	[$bp+4],$t1
648	ld	[$bp+8],$t2
649	ld	[$bp+12],$t3
650	subcc	$t0,@acc[0],@acc[0]
651	ld	[$bp+16],$t4
652	ld	[$bp+20],$t5
653	subccc	$t1,@acc[1],@acc[1]
654	subccc	$t2,@acc[2],@acc[2]
655	ld	[$bp+24],$t6
656	ld	[$bp+28],$t7
657	subccc	$t3,@acc[3],@acc[3]
658	subccc	$t4,@acc[4],@acc[4]
659	subccc	$t5,@acc[5],@acc[5]
660	subccc	$t6,@acc[6],@acc[6]
661	subccc	$t7,@acc[7],@acc[7]
662	b	.Lreduce_by_add
663	subc	%g0,%g0,$carry		! broadcast borrow bit
664.type	__ecp_nistz256_sub_morf,#function
665.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
666
667! void	ecp_nistz256_div_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
668.globl	ecp_nistz256_div_by_2
669.align	32
670ecp_nistz256_div_by_2:
671	save	%sp,-STACK_FRAME,%sp
672	ld	[$ap],@acc[0]
673	ld	[$ap+4],@acc[1]
674	ld	[$ap+8],@acc[2]
675	ld	[$ap+12],@acc[3]
676	ld	[$ap+16],@acc[4]
677	ld	[$ap+20],@acc[5]
678	ld	[$ap+24],@acc[6]
679	call	__ecp_nistz256_div_by_2
680	ld	[$ap+28],@acc[7]
681	ret
682	restore
683.type	ecp_nistz256_div_by_2,#function
684.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
685
686.align	32
687__ecp_nistz256_div_by_2:
688	! ret = (a is odd ? a+mod : a) >> 1
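	! Adding the (odd) modulus to an odd input makes the sum even, so
	! the shift below is an exact halving modulo P; the carry out of
	! the addition becomes the new top bit (see $carry at the end).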
689
690	and	@acc[0],1,$bi
691	neg	$bi,$carry
692	addcc	@acc[0],$carry,@acc[0]
693	addccc	@acc[1],$carry,@acc[1]
694	addccc	@acc[2],$carry,@acc[2]
695	addccc	@acc[3],0,@acc[3]
696	addccc	@acc[4],0,@acc[4]
697	addccc	@acc[5],0,@acc[5]
698	addccc	@acc[6],$bi,@acc[6]
699	addccc	@acc[7],$carry,@acc[7]
700	addc	%g0,%g0,$carry
701
702	! ret >>= 1
703
704	srl	@acc[0],1,@acc[0]
705	sll	@acc[1],31,$t0
706	srl	@acc[1],1,@acc[1]
707	or	@acc[0],$t0,@acc[0]
708	sll	@acc[2],31,$t1
709	srl	@acc[2],1,@acc[2]
710	or	@acc[1],$t1,@acc[1]
711	sll	@acc[3],31,$t2
712	st	@acc[0],[$rp]
713	srl	@acc[3],1,@acc[3]
714	or	@acc[2],$t2,@acc[2]
715	sll	@acc[4],31,$t3
716	st	@acc[1],[$rp+4]
717	srl	@acc[4],1,@acc[4]
718	or	@acc[3],$t3,@acc[3]
719	sll	@acc[5],31,$t4
720	st	@acc[2],[$rp+8]
721	srl	@acc[5],1,@acc[5]
722	or	@acc[4],$t4,@acc[4]
723	sll	@acc[6],31,$t5
724	st	@acc[3],[$rp+12]
725	srl	@acc[6],1,@acc[6]
726	or	@acc[5],$t5,@acc[5]
727	sll	@acc[7],31,$t6
728	st	@acc[4],[$rp+16]
729	srl	@acc[7],1,@acc[7]
730	or	@acc[6],$t6,@acc[6]
731	sll	$carry,31,$t7
732	st	@acc[5],[$rp+20]
733	or	@acc[7],$t7,@acc[7]
734	st	@acc[6],[$rp+24]
735	retl
736	st	@acc[7],[$rp+28]
737.type	__ecp_nistz256_div_by_2,#function
738.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
739___
740
741########################################################################
# The following subroutines are "literal" implementations of those found
# in ecp_nistz256.c
744#
745########################################################################
746# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
747#
748{
749my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
750# above map() describes stack layout with 4 temporary
751# 256-bit vectors on top.
752
753$code.=<<___;
754#ifdef __PIC__
755SPARC_PIC_THUNK(%g1)
756#endif
757
758.globl	ecp_nistz256_point_double
759.align	32
760ecp_nistz256_point_double:
761	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
762	ld	[%g1],%g1		! OPENSSL_sparcv9cap_P[0]
763	and	%g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
764	cmp	%g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
765	be	ecp_nistz256_point_double_vis3
766	nop
767
768	save	%sp,-STACK_FRAME-32*4,%sp
769
770	mov	$rp,$rp_real
771	mov	$ap,$ap_real
772
773.Lpoint_double_shortcut:
774	ld	[$ap+32],@acc[0]
775	ld	[$ap+32+4],@acc[1]
776	ld	[$ap+32+8],@acc[2]
777	ld	[$ap+32+12],@acc[3]
778	ld	[$ap+32+16],@acc[4]
779	ld	[$ap+32+20],@acc[5]
780	ld	[$ap+32+24],@acc[6]
781	ld	[$ap+32+28],@acc[7]
782	call	__ecp_nistz256_mul_by_2	! p256_mul_by_2(S, in_y);
783	add	%sp,LOCALS+$S,$rp
784
785	add	$ap_real,64,$bp
786	add	$ap_real,64,$ap
787	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Zsqr, in_z);
788	add	%sp,LOCALS+$Zsqr,$rp
789
790	add	$ap_real,0,$bp
791	call	__ecp_nistz256_add	! p256_add(M, Zsqr, in_x);
792	add	%sp,LOCALS+$M,$rp
793
794	add	%sp,LOCALS+$S,$bp
795	add	%sp,LOCALS+$S,$ap
796	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(S, S);
797	add	%sp,LOCALS+$S,$rp
798
799	ld	[$ap_real],@acc[0]
800	add	%sp,LOCALS+$Zsqr,$bp
801	ld	[$ap_real+4],@acc[1]
802	ld	[$ap_real+8],@acc[2]
803	ld	[$ap_real+12],@acc[3]
804	ld	[$ap_real+16],@acc[4]
805	ld	[$ap_real+20],@acc[5]
806	ld	[$ap_real+24],@acc[6]
807	ld	[$ap_real+28],@acc[7]
808	call	__ecp_nistz256_sub_from	! p256_sub(Zsqr, in_x, Zsqr);
809	add	%sp,LOCALS+$Zsqr,$rp
810
811	add	$ap_real,32,$bp
812	add	$ap_real,64,$ap
813	call	__ecp_nistz256_mul_mont	! p256_mul_mont(tmp0, in_z, in_y);
814	add	%sp,LOCALS+$tmp0,$rp
815
816	call	__ecp_nistz256_mul_by_2	! p256_mul_by_2(res_z, tmp0);
817	add	$rp_real,64,$rp
818
819	add	%sp,LOCALS+$Zsqr,$bp
820	add	%sp,LOCALS+$M,$ap
821	call	__ecp_nistz256_mul_mont	! p256_mul_mont(M, M, Zsqr);
822	add	%sp,LOCALS+$M,$rp
823
824	call	__ecp_nistz256_mul_by_3	! p256_mul_by_3(M, M);
825	add	%sp,LOCALS+$M,$rp
826
827	add	%sp,LOCALS+$S,$bp
828	add	%sp,LOCALS+$S,$ap
829	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(tmp0, S);
830	add	%sp,LOCALS+$tmp0,$rp
831
832	call	__ecp_nistz256_div_by_2	! p256_div_by_2(res_y, tmp0);
833	add	$rp_real,32,$rp
834
835	add	$ap_real,0,$bp
836	add	%sp,LOCALS+$S,$ap
837	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S, S, in_x);
838	add	%sp,LOCALS+$S,$rp
839
840	call	__ecp_nistz256_mul_by_2	! p256_mul_by_2(tmp0, S);
841	add	%sp,LOCALS+$tmp0,$rp
842
843	add	%sp,LOCALS+$M,$bp
844	add	%sp,LOCALS+$M,$ap
845	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(res_x, M);
846	add	$rp_real,0,$rp
847
848	add	%sp,LOCALS+$tmp0,$bp
849	call	__ecp_nistz256_sub_from	! p256_sub(res_x, res_x, tmp0);
850	add	$rp_real,0,$rp
851
852	add	%sp,LOCALS+$S,$bp
853	call	__ecp_nistz256_sub_morf	! p256_sub(S, S, res_x);
854	add	%sp,LOCALS+$S,$rp
855
856	add	%sp,LOCALS+$M,$bp
857	add	%sp,LOCALS+$S,$ap
858	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S, S, M);
859	add	%sp,LOCALS+$S,$rp
860
861	add	$rp_real,32,$bp
862	call	__ecp_nistz256_sub_from	! p256_sub(res_y, S, res_y);
863	add	$rp_real,32,$rp
864
865	ret
866	restore
867.type	ecp_nistz256_point_double,#function
868.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
869___
870}
871
872########################################################################
873# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
874#			      const P256_POINT *in2);
875{
876my ($res_x,$res_y,$res_z,
877    $H,$Hsqr,$R,$Rsqr,$Hcub,
878    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
879my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
880
881# above map() describes stack layout with 12 temporary
882# 256-bit vectors on top. Then we reserve some space for
883# !in1infty, !in2infty, result of check for zero and return pointer.
884
885my $bp_real=$rp_real;
886
887$code.=<<___;
888.globl	ecp_nistz256_point_add
889.align	32
890ecp_nistz256_point_add:
891	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
892	ld	[%g1],%g1		! OPENSSL_sparcv9cap_P[0]
893	and	%g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
894	cmp	%g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
895	be	ecp_nistz256_point_add_vis3
896	nop
897
898	save	%sp,-STACK_FRAME-32*12-32,%sp
899
900	stx	$rp,[%fp+STACK_BIAS-8]	! off-load $rp
901	mov	$ap,$ap_real
902	mov	$bp,$bp_real
903
904	ld	[$bp+64],$t0		! in2_z
905	ld	[$bp+64+4],$t1
906	ld	[$bp+64+8],$t2
907	ld	[$bp+64+12],$t3
908	ld	[$bp+64+16],$t4
909	ld	[$bp+64+20],$t5
910	ld	[$bp+64+24],$t6
911	ld	[$bp+64+28],$t7
912	or	$t1,$t0,$t0
913	or	$t3,$t2,$t2
914	or	$t5,$t4,$t4
915	or	$t7,$t6,$t6
916	or	$t2,$t0,$t0
917	or	$t6,$t4,$t4
918	or	$t4,$t0,$t0		! !in2infty
919	movrnz	$t0,-1,$t0
920	st	$t0,[%fp+STACK_BIAS-12]
921
922	ld	[$ap+64],$t0		! in1_z
923	ld	[$ap+64+4],$t1
924	ld	[$ap+64+8],$t2
925	ld	[$ap+64+12],$t3
926	ld	[$ap+64+16],$t4
927	ld	[$ap+64+20],$t5
928	ld	[$ap+64+24],$t6
929	ld	[$ap+64+28],$t7
930	or	$t1,$t0,$t0
931	or	$t3,$t2,$t2
932	or	$t5,$t4,$t4
933	or	$t7,$t6,$t6
934	or	$t2,$t0,$t0
935	or	$t6,$t4,$t4
936	or	$t4,$t0,$t0		! !in1infty
937	movrnz	$t0,-1,$t0
938	st	$t0,[%fp+STACK_BIAS-16]
939
940	add	$bp_real,64,$bp
941	add	$bp_real,64,$ap
942	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Z2sqr, in2_z);
943	add	%sp,LOCALS+$Z2sqr,$rp
944
945	add	$ap_real,64,$bp
946	add	$ap_real,64,$ap
947	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Z1sqr, in1_z);
948	add	%sp,LOCALS+$Z1sqr,$rp
949
950	add	$bp_real,64,$bp
951	add	%sp,LOCALS+$Z2sqr,$ap
952	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S1, Z2sqr, in2_z);
953	add	%sp,LOCALS+$S1,$rp
954
955	add	$ap_real,64,$bp
956	add	%sp,LOCALS+$Z1sqr,$ap
957	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S2, Z1sqr, in1_z);
958	add	%sp,LOCALS+$S2,$rp
959
960	add	$ap_real,32,$bp
961	add	%sp,LOCALS+$S1,$ap
962	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S1, S1, in1_y);
963	add	%sp,LOCALS+$S1,$rp
964
965	add	$bp_real,32,$bp
966	add	%sp,LOCALS+$S2,$ap
967	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S2, S2, in2_y);
968	add	%sp,LOCALS+$S2,$rp
969
970	add	%sp,LOCALS+$S1,$bp
971	call	__ecp_nistz256_sub_from	! p256_sub(R, S2, S1);
972	add	%sp,LOCALS+$R,$rp
973
974	or	@acc[1],@acc[0],@acc[0]	! see if result is zero
975	or	@acc[3],@acc[2],@acc[2]
976	or	@acc[5],@acc[4],@acc[4]
977	or	@acc[7],@acc[6],@acc[6]
978	or	@acc[2],@acc[0],@acc[0]
979	or	@acc[6],@acc[4],@acc[4]
980	or	@acc[4],@acc[0],@acc[0]
981	st	@acc[0],[%fp+STACK_BIAS-20]
982
983	add	$ap_real,0,$bp
984	add	%sp,LOCALS+$Z2sqr,$ap
985	call	__ecp_nistz256_mul_mont	! p256_mul_mont(U1, in1_x, Z2sqr);
986	add	%sp,LOCALS+$U1,$rp
987
988	add	$bp_real,0,$bp
989	add	%sp,LOCALS+$Z1sqr,$ap
990	call	__ecp_nistz256_mul_mont	! p256_mul_mont(U2, in2_x, Z1sqr);
991	add	%sp,LOCALS+$U2,$rp
992
993	add	%sp,LOCALS+$U1,$bp
994	call	__ecp_nistz256_sub_from	! p256_sub(H, U2, U1);
995	add	%sp,LOCALS+$H,$rp
996
997	or	@acc[1],@acc[0],@acc[0]	! see if result is zero
998	or	@acc[3],@acc[2],@acc[2]
999	or	@acc[5],@acc[4],@acc[4]
1000	or	@acc[7],@acc[6],@acc[6]
1001	or	@acc[2],@acc[0],@acc[0]
1002	or	@acc[6],@acc[4],@acc[4]
1003	orcc	@acc[4],@acc[0],@acc[0]
1004
1005	bne,pt	%icc,.Ladd_proceed	! is_equal(U1,U2)?
1006	nop
1007
1008	ld	[%fp+STACK_BIAS-12],$t0
1009	ld	[%fp+STACK_BIAS-16],$t1
1010	ld	[%fp+STACK_BIAS-20],$t2
1011	andcc	$t0,$t1,%g0
1012	be,pt	%icc,.Ladd_proceed	! (in1infty || in2infty)?
1013	nop
1014	andcc	$t2,$t2,%g0
1015	be,pt	%icc,.Ladd_double	! is_equal(S1,S2)?
1016	nop
1017
1018	ldx	[%fp+STACK_BIAS-8],$rp
1019	st	%g0,[$rp]
1020	st	%g0,[$rp+4]
1021	st	%g0,[$rp+8]
1022	st	%g0,[$rp+12]
1023	st	%g0,[$rp+16]
1024	st	%g0,[$rp+20]
1025	st	%g0,[$rp+24]
1026	st	%g0,[$rp+28]
1027	st	%g0,[$rp+32]
1028	st	%g0,[$rp+32+4]
1029	st	%g0,[$rp+32+8]
1030	st	%g0,[$rp+32+12]
1031	st	%g0,[$rp+32+16]
1032	st	%g0,[$rp+32+20]
1033	st	%g0,[$rp+32+24]
1034	st	%g0,[$rp+32+28]
1035	st	%g0,[$rp+64]
1036	st	%g0,[$rp+64+4]
1037	st	%g0,[$rp+64+8]
1038	st	%g0,[$rp+64+12]
1039	st	%g0,[$rp+64+16]
1040	st	%g0,[$rp+64+20]
1041	st	%g0,[$rp+64+24]
1042	st	%g0,[$rp+64+28]
1043	b	.Ladd_done
1044	nop
1045
1046.align	16
1047.Ladd_double:
1048	ldx	[%fp+STACK_BIAS-8],$rp_real
1049	mov	$ap_real,$ap
1050	b	.Lpoint_double_shortcut
1051	add	%sp,32*(12-4)+32,%sp	! difference in frame sizes
1052
1053.align	16
1054.Ladd_proceed:
1055	add	%sp,LOCALS+$R,$bp
1056	add	%sp,LOCALS+$R,$ap
1057	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Rsqr, R);
1058	add	%sp,LOCALS+$Rsqr,$rp
1059
1060	add	$ap_real,64,$bp
1061	add	%sp,LOCALS+$H,$ap
1062	call	__ecp_nistz256_mul_mont	! p256_mul_mont(res_z, H, in1_z);
1063	add	%sp,LOCALS+$res_z,$rp
1064
1065	add	%sp,LOCALS+$H,$bp
1066	add	%sp,LOCALS+$H,$ap
1067	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Hsqr, H);
1068	add	%sp,LOCALS+$Hsqr,$rp
1069
1070	add	$bp_real,64,$bp
1071	add	%sp,LOCALS+$res_z,$ap
1072	call	__ecp_nistz256_mul_mont	! p256_mul_mont(res_z, res_z, in2_z);
1073	add	%sp,LOCALS+$res_z,$rp
1074
1075	add	%sp,LOCALS+$H,$bp
1076	add	%sp,LOCALS+$Hsqr,$ap
1077	call	__ecp_nistz256_mul_mont	! p256_mul_mont(Hcub, Hsqr, H);
1078	add	%sp,LOCALS+$Hcub,$rp
1079
1080	add	%sp,LOCALS+$U1,$bp
1081	add	%sp,LOCALS+$Hsqr,$ap
1082	call	__ecp_nistz256_mul_mont	! p256_mul_mont(U2, U1, Hsqr);
1083	add	%sp,LOCALS+$U2,$rp
1084
1085	call	__ecp_nistz256_mul_by_2	! p256_mul_by_2(Hsqr, U2);
1086	add	%sp,LOCALS+$Hsqr,$rp
1087
1088	add	%sp,LOCALS+$Rsqr,$bp
1089	call	__ecp_nistz256_sub_morf	! p256_sub(res_x, Rsqr, Hsqr);
1090	add	%sp,LOCALS+$res_x,$rp
1091
1092	add	%sp,LOCALS+$Hcub,$bp
1093	call	__ecp_nistz256_sub_from	!  p256_sub(res_x, res_x, Hcub);
1094	add	%sp,LOCALS+$res_x,$rp
1095
1096	add	%sp,LOCALS+$U2,$bp
1097	call	__ecp_nistz256_sub_morf	! p256_sub(res_y, U2, res_x);
1098	add	%sp,LOCALS+$res_y,$rp
1099
1100	add	%sp,LOCALS+$Hcub,$bp
1101	add	%sp,LOCALS+$S1,$ap
1102	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S2, S1, Hcub);
1103	add	%sp,LOCALS+$S2,$rp
1104
1105	add	%sp,LOCALS+$R,$bp
1106	add	%sp,LOCALS+$res_y,$ap
1107	call	__ecp_nistz256_mul_mont	! p256_mul_mont(res_y, res_y, R);
1108	add	%sp,LOCALS+$res_y,$rp
1109
1110	add	%sp,LOCALS+$S2,$bp
1111	call	__ecp_nistz256_sub_from	! p256_sub(res_y, res_y, S2);
1112	add	%sp,LOCALS+$res_y,$rp
1113
1114	ld	[%fp+STACK_BIAS-16],$t1	! !in1infty
1115	ld	[%fp+STACK_BIAS-12],$t2	! !in2infty
1116	ldx	[%fp+STACK_BIAS-8],$rp
1117___
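# Branchless selection of the result: if !in1infty is zero (in1 was the
# point at infinity) take in2, if !in2infty is zero take in1, otherwise
# keep the freshly computed point.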
1118for($i=0;$i<96;$i+=8) {			# conditional moves
1119$code.=<<___;
1120	ld	[%sp+LOCALS+$i],@acc[0]		! res
1121	ld	[%sp+LOCALS+$i+4],@acc[1]
1122	ld	[$bp_real+$i],@acc[2]		! in2
1123	ld	[$bp_real+$i+4],@acc[3]
1124	ld	[$ap_real+$i],@acc[4]		! in1
1125	ld	[$ap_real+$i+4],@acc[5]
1126	movrz	$t1,@acc[2],@acc[0]
1127	movrz	$t1,@acc[3],@acc[1]
1128	movrz	$t2,@acc[4],@acc[0]
1129	movrz	$t2,@acc[5],@acc[1]
1130	st	@acc[0],[$rp+$i]
1131	st	@acc[1],[$rp+$i+4]
1132___
1133}
1134$code.=<<___;
1135.Ladd_done:
1136	ret
1137	restore
1138.type	ecp_nistz256_point_add,#function
1139.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
1140___
1141}
1142
1143########################################################################
1144# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1145#				     const P256_POINT_AFFINE *in2);
1146{
1147my ($res_x,$res_y,$res_z,
1148    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
1149my $Z1sqr = $S2;
1150# above map() describes stack layout with 10 temporary
1151# 256-bit vectors on top. Then we reserve some space for
1152# !in1infty, !in2infty, result of check for zero and return pointer.
1153
1154my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
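# @ONE_mont is 1 in Montgomery representation, i.e. 2^256 mod P, given
# as 32-bit words { 1, 0, 0, -1, -1, -1, -2, 0 }; it supplies in2_z when
# the affine in2 has to be copied to the output.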
1155my $bp_real=$rp_real;
1156
1157$code.=<<___;
1158.globl	ecp_nistz256_point_add_affine
1159.align	32
1160ecp_nistz256_point_add_affine:
1161	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
1162	ld	[%g1],%g1		! OPENSSL_sparcv9cap_P[0]
1163	and	%g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
1164	cmp	%g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
1165	be	ecp_nistz256_point_add_affine_vis3
1166	nop
1167
1168	save	%sp,-STACK_FRAME-32*10-32,%sp
1169
1170	stx	$rp,[%fp+STACK_BIAS-8]	! off-load $rp
1171	mov	$ap,$ap_real
1172	mov	$bp,$bp_real
1173
1174	ld	[$ap+64],$t0		! in1_z
1175	ld	[$ap+64+4],$t1
1176	ld	[$ap+64+8],$t2
1177	ld	[$ap+64+12],$t3
1178	ld	[$ap+64+16],$t4
1179	ld	[$ap+64+20],$t5
1180	ld	[$ap+64+24],$t6
1181	ld	[$ap+64+28],$t7
1182	or	$t1,$t0,$t0
1183	or	$t3,$t2,$t2
1184	or	$t5,$t4,$t4
1185	or	$t7,$t6,$t6
1186	or	$t2,$t0,$t0
1187	or	$t6,$t4,$t4
1188	or	$t4,$t0,$t0		! !in1infty
1189	movrnz	$t0,-1,$t0
1190	st	$t0,[%fp+STACK_BIAS-16]
1191
1192	ld	[$bp],@acc[0]		! in2_x
1193	ld	[$bp+4],@acc[1]
1194	ld	[$bp+8],@acc[2]
1195	ld	[$bp+12],@acc[3]
1196	ld	[$bp+16],@acc[4]
1197	ld	[$bp+20],@acc[5]
1198	ld	[$bp+24],@acc[6]
1199	ld	[$bp+28],@acc[7]
1200	ld	[$bp+32],$t0		! in2_y
1201	ld	[$bp+32+4],$t1
1202	ld	[$bp+32+8],$t2
1203	ld	[$bp+32+12],$t3
1204	ld	[$bp+32+16],$t4
1205	ld	[$bp+32+20],$t5
1206	ld	[$bp+32+24],$t6
1207	ld	[$bp+32+28],$t7
1208	or	@acc[1],@acc[0],@acc[0]
1209	or	@acc[3],@acc[2],@acc[2]
1210	or	@acc[5],@acc[4],@acc[4]
1211	or	@acc[7],@acc[6],@acc[6]
1212	or	@acc[2],@acc[0],@acc[0]
1213	or	@acc[6],@acc[4],@acc[4]
1214	or	@acc[4],@acc[0],@acc[0]
1215	or	$t1,$t0,$t0
1216	or	$t3,$t2,$t2
1217	or	$t5,$t4,$t4
1218	or	$t7,$t6,$t6
1219	or	$t2,$t0,$t0
1220	or	$t6,$t4,$t4
1221	or	$t4,$t0,$t0
1222	or	@acc[0],$t0,$t0		! !in2infty
1223	movrnz	$t0,-1,$t0
1224	st	$t0,[%fp+STACK_BIAS-12]
1225
1226	add	$ap_real,64,$bp
1227	add	$ap_real,64,$ap
1228	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Z1sqr, in1_z);
1229	add	%sp,LOCALS+$Z1sqr,$rp
1230
1231	add	$bp_real,0,$bp
1232	add	%sp,LOCALS+$Z1sqr,$ap
1233	call	__ecp_nistz256_mul_mont	! p256_mul_mont(U2, Z1sqr, in2_x);
1234	add	%sp,LOCALS+$U2,$rp
1235
1236	add	$ap_real,0,$bp
1237	call	__ecp_nistz256_sub_from	! p256_sub(H, U2, in1_x);
1238	add	%sp,LOCALS+$H,$rp
1239
1240	add	$ap_real,64,$bp
1241	add	%sp,LOCALS+$Z1sqr,$ap
1242	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S2, Z1sqr, in1_z);
1243	add	%sp,LOCALS+$S2,$rp
1244
1245	add	$ap_real,64,$bp
1246	add	%sp,LOCALS+$H,$ap
1247	call	__ecp_nistz256_mul_mont	! p256_mul_mont(res_z, H, in1_z);
1248	add	%sp,LOCALS+$res_z,$rp
1249
1250	add	$bp_real,32,$bp
1251	add	%sp,LOCALS+$S2,$ap
1252	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S2, S2, in2_y);
1253	add	%sp,LOCALS+$S2,$rp
1254
1255	add	$ap_real,32,$bp
1256	call	__ecp_nistz256_sub_from	! p256_sub(R, S2, in1_y);
1257	add	%sp,LOCALS+$R,$rp
1258
1259	add	%sp,LOCALS+$H,$bp
1260	add	%sp,LOCALS+$H,$ap
1261	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Hsqr, H);
1262	add	%sp,LOCALS+$Hsqr,$rp
1263
1264	add	%sp,LOCALS+$R,$bp
1265	add	%sp,LOCALS+$R,$ap
1266	call	__ecp_nistz256_mul_mont	! p256_sqr_mont(Rsqr, R);
1267	add	%sp,LOCALS+$Rsqr,$rp
1268
1269	add	%sp,LOCALS+$H,$bp
1270	add	%sp,LOCALS+$Hsqr,$ap
1271	call	__ecp_nistz256_mul_mont	! p256_mul_mont(Hcub, Hsqr, H);
1272	add	%sp,LOCALS+$Hcub,$rp
1273
1274	add	$ap_real,0,$bp
1275	add	%sp,LOCALS+$Hsqr,$ap
1276	call	__ecp_nistz256_mul_mont	! p256_mul_mont(U2, in1_x, Hsqr);
1277	add	%sp,LOCALS+$U2,$rp
1278
1279	call	__ecp_nistz256_mul_by_2	! p256_mul_by_2(Hsqr, U2);
1280	add	%sp,LOCALS+$Hsqr,$rp
1281
1282	add	%sp,LOCALS+$Rsqr,$bp
1283	call	__ecp_nistz256_sub_morf	! p256_sub(res_x, Rsqr, Hsqr);
1284	add	%sp,LOCALS+$res_x,$rp
1285
1286	add	%sp,LOCALS+$Hcub,$bp
1287	call	__ecp_nistz256_sub_from	!  p256_sub(res_x, res_x, Hcub);
1288	add	%sp,LOCALS+$res_x,$rp
1289
1290	add	%sp,LOCALS+$U2,$bp
1291	call	__ecp_nistz256_sub_morf	! p256_sub(res_y, U2, res_x);
1292	add	%sp,LOCALS+$res_y,$rp
1293
1294	add	$ap_real,32,$bp
1295	add	%sp,LOCALS+$Hcub,$ap
1296	call	__ecp_nistz256_mul_mont	! p256_mul_mont(S2, in1_y, Hcub);
1297	add	%sp,LOCALS+$S2,$rp
1298
1299	add	%sp,LOCALS+$R,$bp
1300	add	%sp,LOCALS+$res_y,$ap
1301	call	__ecp_nistz256_mul_mont	! p256_mul_mont(res_y, res_y, R);
1302	add	%sp,LOCALS+$res_y,$rp
1303
1304	add	%sp,LOCALS+$S2,$bp
1305	call	__ecp_nistz256_sub_from	! p256_sub(res_y, res_y, S2);
1306	add	%sp,LOCALS+$res_y,$rp
1307
1308	ld	[%fp+STACK_BIAS-16],$t1	! !in1infty
1309	ld	[%fp+STACK_BIAS-12],$t2	! !in2infty
1310	ldx	[%fp+STACK_BIAS-8],$rp
1311___
1312for($i=0;$i<64;$i+=8) {			# conditional moves
1313$code.=<<___;
1314	ld	[%sp+LOCALS+$i],@acc[0]		! res
1315	ld	[%sp+LOCALS+$i+4],@acc[1]
1316	ld	[$bp_real+$i],@acc[2]		! in2
1317	ld	[$bp_real+$i+4],@acc[3]
1318	ld	[$ap_real+$i],@acc[4]		! in1
1319	ld	[$ap_real+$i+4],@acc[5]
1320	movrz	$t1,@acc[2],@acc[0]
1321	movrz	$t1,@acc[3],@acc[1]
1322	movrz	$t2,@acc[4],@acc[0]
1323	movrz	$t2,@acc[5],@acc[1]
1324	st	@acc[0],[$rp+$i]
1325	st	@acc[1],[$rp+$i+4]
1326___
1327}
1328for(;$i<96;$i+=8) {
1329my $j=($i-64)/4;
1330$code.=<<___;
1331	ld	[%sp+LOCALS+$i],@acc[0]		! res
1332	ld	[%sp+LOCALS+$i+4],@acc[1]
1333	ld	[$ap_real+$i],@acc[4]		! in1
1334	ld	[$ap_real+$i+4],@acc[5]
1335	movrz	$t1,@ONE_mont[$j],@acc[0]
1336	movrz	$t1,@ONE_mont[$j+1],@acc[1]
1337	movrz	$t2,@acc[4],@acc[0]
1338	movrz	$t2,@acc[5],@acc[1]
1339	st	@acc[0],[$rp+$i]
1340	st	@acc[1],[$rp+$i+4]
1341___
1342}
1343$code.=<<___;
1344	ret
1345	restore
1346.type	ecp_nistz256_point_add_affine,#function
1347.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1348___
1349}								}}}
1350{{{
1351my ($out,$inp,$index)=map("%i$_",(0..2));
1352my $mask="%o0";
1353
1354$code.=<<___;
1355! void	ecp_nistz256_scatter_w5(void *%i0,const P256_POINT *%i1,
1356!					  int %i2);
1357.globl	ecp_nistz256_scatter_w5
1358.align	32
1359ecp_nistz256_scatter_w5:
1360	save	%sp,-STACK_FRAME,%sp
1361
1362	sll	$index,2,$index
1363	add	$out,$index,$out
1364
1365	ld	[$inp],%l0		! X
1366	ld	[$inp+4],%l1
1367	ld	[$inp+8],%l2
1368	ld	[$inp+12],%l3
1369	ld	[$inp+16],%l4
1370	ld	[$inp+20],%l5
1371	ld	[$inp+24],%l6
1372	ld	[$inp+28],%l7
1373	add	$inp,32,$inp
1374	st	%l0,[$out+64*0-4]
1375	st	%l1,[$out+64*1-4]
1376	st	%l2,[$out+64*2-4]
1377	st	%l3,[$out+64*3-4]
1378	st	%l4,[$out+64*4-4]
1379	st	%l5,[$out+64*5-4]
1380	st	%l6,[$out+64*6-4]
1381	st	%l7,[$out+64*7-4]
1382	add	$out,64*8,$out
1383
1384	ld	[$inp],%l0		! Y
1385	ld	[$inp+4],%l1
1386	ld	[$inp+8],%l2
1387	ld	[$inp+12],%l3
1388	ld	[$inp+16],%l4
1389	ld	[$inp+20],%l5
1390	ld	[$inp+24],%l6
1391	ld	[$inp+28],%l7
1392	add	$inp,32,$inp
1393	st	%l0,[$out+64*0-4]
1394	st	%l1,[$out+64*1-4]
1395	st	%l2,[$out+64*2-4]
1396	st	%l3,[$out+64*3-4]
1397	st	%l4,[$out+64*4-4]
1398	st	%l5,[$out+64*5-4]
1399	st	%l6,[$out+64*6-4]
1400	st	%l7,[$out+64*7-4]
1401	add	$out,64*8,$out
1402
1403	ld	[$inp],%l0		! Z
1404	ld	[$inp+4],%l1
1405	ld	[$inp+8],%l2
1406	ld	[$inp+12],%l3
1407	ld	[$inp+16],%l4
1408	ld	[$inp+20],%l5
1409	ld	[$inp+24],%l6
1410	ld	[$inp+28],%l7
1411	st	%l0,[$out+64*0-4]
1412	st	%l1,[$out+64*1-4]
1413	st	%l2,[$out+64*2-4]
1414	st	%l3,[$out+64*3-4]
1415	st	%l4,[$out+64*4-4]
1416	st	%l5,[$out+64*5-4]
1417	st	%l6,[$out+64*6-4]
1418	st	%l7,[$out+64*7-4]
1419
1420	ret
1421	restore
1422.type	ecp_nistz256_scatter_w5,#function
1423.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
1424
1425! void	ecp_nistz256_gather_w5(P256_POINT *%i0,const void *%i1,
1426!					       int %i2);
1427.globl	ecp_nistz256_gather_w5
1428.align	32
1429ecp_nistz256_gather_w5:
1430	save	%sp,-STACK_FRAME,%sp
1431
1432	neg	$index,$mask
1433	srax	$mask,63,$mask
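	! An all-ones mask is generated for any non-zero $index; $index==0
	! therefore comes back as an all-zero point (the point at
	! infinity), while $index>0 fetches entry $index-1 below.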
1434
1435	add	$index,$mask,$index
1436	sll	$index,2,$index
1437	add	$inp,$index,$inp
1438
1439	ld	[$inp+64*0],%l0
1440	ld	[$inp+64*1],%l1
1441	ld	[$inp+64*2],%l2
1442	ld	[$inp+64*3],%l3
1443	ld	[$inp+64*4],%l4
1444	ld	[$inp+64*5],%l5
1445	ld	[$inp+64*6],%l6
1446	ld	[$inp+64*7],%l7
1447	add	$inp,64*8,$inp
1448	and	%l0,$mask,%l0
1449	and	%l1,$mask,%l1
1450	st	%l0,[$out]		! X
1451	and	%l2,$mask,%l2
1452	st	%l1,[$out+4]
1453	and	%l3,$mask,%l3
1454	st	%l2,[$out+8]
1455	and	%l4,$mask,%l4
1456	st	%l3,[$out+12]
1457	and	%l5,$mask,%l5
1458	st	%l4,[$out+16]
1459	and	%l6,$mask,%l6
1460	st	%l5,[$out+20]
1461	and	%l7,$mask,%l7
1462	st	%l6,[$out+24]
1463	st	%l7,[$out+28]
1464	add	$out,32,$out
1465
1466	ld	[$inp+64*0],%l0
1467	ld	[$inp+64*1],%l1
1468	ld	[$inp+64*2],%l2
1469	ld	[$inp+64*3],%l3
1470	ld	[$inp+64*4],%l4
1471	ld	[$inp+64*5],%l5
1472	ld	[$inp+64*6],%l6
1473	ld	[$inp+64*7],%l7
1474	add	$inp,64*8,$inp
1475	and	%l0,$mask,%l0
1476	and	%l1,$mask,%l1
1477	st	%l0,[$out]		! Y
1478	and	%l2,$mask,%l2
1479	st	%l1,[$out+4]
1480	and	%l3,$mask,%l3
1481	st	%l2,[$out+8]
1482	and	%l4,$mask,%l4
1483	st	%l3,[$out+12]
1484	and	%l5,$mask,%l5
1485	st	%l4,[$out+16]
1486	and	%l6,$mask,%l6
1487	st	%l5,[$out+20]
1488	and	%l7,$mask,%l7
1489	st	%l6,[$out+24]
1490	st	%l7,[$out+28]
1491	add	$out,32,$out
1492
1493	ld	[$inp+64*0],%l0
1494	ld	[$inp+64*1],%l1
1495	ld	[$inp+64*2],%l2
1496	ld	[$inp+64*3],%l3
1497	ld	[$inp+64*4],%l4
1498	ld	[$inp+64*5],%l5
1499	ld	[$inp+64*6],%l6
1500	ld	[$inp+64*7],%l7
1501	and	%l0,$mask,%l0
1502	and	%l1,$mask,%l1
1503	st	%l0,[$out]		! Z
1504	and	%l2,$mask,%l2
1505	st	%l1,[$out+4]
1506	and	%l3,$mask,%l3
1507	st	%l2,[$out+8]
1508	and	%l4,$mask,%l4
1509	st	%l3,[$out+12]
1510	and	%l5,$mask,%l5
1511	st	%l4,[$out+16]
1512	and	%l6,$mask,%l6
1513	st	%l5,[$out+20]
1514	and	%l7,$mask,%l7
1515	st	%l6,[$out+24]
1516	st	%l7,[$out+28]
1517
1518	ret
1519	restore
1520.type	ecp_nistz256_gather_w5,#function
1521.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
1522
1523! void	ecp_nistz256_scatter_w7(void *%i0,const P256_POINT_AFFINE *%i1,
1524!					  int %i2);
1525.globl	ecp_nistz256_scatter_w7
1526.align	32
1527ecp_nistz256_scatter_w7:
1528	save	%sp,-STACK_FRAME,%sp
1529	nop
1530	add	$out,$index,$out
1531	mov	64/4,$index
1532.Loop_scatter_w7:
1533	ld	[$inp],%l0
1534	add	$inp,4,$inp
1535	subcc	$index,1,$index
1536	stb	%l0,[$out+64*0]
1537	srl	%l0,8,%l1
1538	stb	%l1,[$out+64*1]
1539	srl	%l0,16,%l2
1540	stb	%l2,[$out+64*2]
1541	srl	%l0,24,%l3
1542	stb	%l3,[$out+64*3]
1543	bne	.Loop_scatter_w7
1544	add	$out,64*4,$out
1545
1546	ret
1547	restore
1548.type	ecp_nistz256_scatter_w7,#function
1549.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
1550
1551! void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *%i0,const void *%i1,
1552!						      int %i2);
1553.globl	ecp_nistz256_gather_w7
1554.align	32
1555ecp_nistz256_gather_w7:
1556	save	%sp,-STACK_FRAME,%sp
1557
1558	neg	$index,$mask
1559	srax	$mask,63,$mask
1560
1561	add	$index,$mask,$index
1562	add	$inp,$index,$inp
1563	mov	64/4,$index
1564
1565.Loop_gather_w7:
1566	ldub	[$inp+64*0],%l0
1567	prefetch [$inp+3840+64*0],1
1568	subcc	$index,1,$index
1569	ldub	[$inp+64*1],%l1
1570	prefetch [$inp+3840+64*1],1
1571	ldub	[$inp+64*2],%l2
1572	prefetch [$inp+3840+64*2],1
1573	ldub	[$inp+64*3],%l3
1574	prefetch [$inp+3840+64*3],1
1575	add	$inp,64*4,$inp
1576	sll	%l1,8,%l1
1577	sll	%l2,16,%l2
1578	or	%l0,%l1,%l0
1579	sll	%l3,24,%l3
1580	or	%l0,%l2,%l0
1581	or	%l0,%l3,%l0
1582	and	%l0,$mask,%l0
1583	st	%l0,[$out]
1584	bne	.Loop_gather_w7
1585	add	$out,4,$out
1586
1587	ret
1588	restore
1589.type	ecp_nistz256_gather_w7,#function
1590.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
1591___
1592}}}
1593{{{
1594########################################################################
# The following subroutines are VIS3 counterparts of those above,
# implementing the same operations found in ecp_nistz256.c. The key
# difference is that they use 128-bit multiplication and addition with
# 64-bit carry, and in order to do that they convert from uint32_t[8] to
# uint64_t[4] on entry and back on return.
1600#
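# On entry each 64-bit limb is assembled as w[2*i] | (w[2*i+1] << 32)
# (see the ld/sllx/or sequences below); the stores on the way out
# perform the inverse split.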
1601my ($rp,$ap,$bp)=map("%i$_",(0..2));
1602my ($t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("%l$_",(0..7));
1603my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5)=map("%o$_",(0..5));
1604my ($bi,$poly1,$poly3,$minus1)=(map("%i$_",(3..5)),"%g1");
1605my ($rp_real,$ap_real)=("%g2","%g3");
1606my ($acc6,$acc7)=($bp,$bi);	# used in squaring
1607
1608$code.=<<___;
1609.align	32
1610__ecp_nistz256_mul_by_2_vis3:
1611	addcc	$acc0,$acc0,$acc0
1612	addxccc	$acc1,$acc1,$acc1
1613	addxccc	$acc2,$acc2,$acc2
1614	addxccc	$acc3,$acc3,$acc3
1615	b	.Lreduce_by_sub_vis3
1616	addxc	%g0,%g0,$acc4		! did it carry?
1617.type	__ecp_nistz256_mul_by_2_vis3,#function
1618.size	__ecp_nistz256_mul_by_2_vis3,.-__ecp_nistz256_mul_by_2_vis3
1619
1620.align	32
1621__ecp_nistz256_add_vis3:
1622	ldx	[$bp+0],$t0
1623	ldx	[$bp+8],$t1
1624	ldx	[$bp+16],$t2
1625	ldx	[$bp+24],$t3
1626
1627__ecp_nistz256_add_noload_vis3:
1628
1629	addcc	$t0,$acc0,$acc0
1630	addxccc	$t1,$acc1,$acc1
1631	addxccc	$t2,$acc2,$acc2
1632	addxccc	$t3,$acc3,$acc3
1633	addxc	%g0,%g0,$acc4		! did it carry?
1634
1635.Lreduce_by_sub_vis3:
1636
1637	addcc	$acc0,1,$t0		! add -modulus, i.e. subtract
1638	addxccc	$acc1,$poly1,$t1
1639	addxccc	$acc2,$minus1,$t2
1640	addxccc	$acc3,$poly3,$t3
1641	addxc	$acc4,$minus1,$acc4
1642
1643	movrz	$acc4,$t0,$acc0		! ret = borrow ? ret : ret-modulus
1644	movrz	$acc4,$t1,$acc1
1645	stx	$acc0,[$rp]
1646	movrz	$acc4,$t2,$acc2
1647	stx	$acc1,[$rp+8]
1648	movrz	$acc4,$t3,$acc3
1649	stx	$acc2,[$rp+16]
1650	retl
1651	stx	$acc3,[$rp+24]
1652.type	__ecp_nistz256_add_vis3,#function
1653.size	__ecp_nistz256_add_vis3,.-__ecp_nistz256_add_vis3
1654
! The trouble with subtraction is that there is no subtraction with a
! 64-bit borrow, only with a 32-bit one. For this reason we "decompose"
! the 64-bit $acc0-$acc3 into 32-bit values and pick up b[4] in 32-bit
! pieces. But recall that SPARC is big-endian, which is why you'll
! observe that b[4] is accessed as 4-0-12-8-20-16-28-24. And prior to
! reduction we "collect" the result back into 64-bit $acc0-$acc3.
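! (On big-endian SPARC each uint64_t limb of b[] keeps its high 32 bits
! at the lower address, so the low half of b[0] lives at offset 4 and
! its high half at offset 0, and so on.)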
1661.align	32
1662__ecp_nistz256_sub_from_vis3:
1663	ld	[$bp+4],$t0
1664	ld	[$bp+0],$t1
1665	ld	[$bp+12],$t2
1666	ld	[$bp+8],$t3
1667
1668	srlx	$acc0,32,$acc4
1669	not	$poly1,$poly1
1670	srlx	$acc1,32,$acc5
1671	subcc	$acc0,$t0,$acc0
1672	ld	[$bp+20],$t0
1673	subccc	$acc4,$t1,$acc4
1674	ld	[$bp+16],$t1
1675	subccc	$acc1,$t2,$acc1
1676	ld	[$bp+28],$t2
1677	and	$acc0,$poly1,$acc0
1678	subccc	$acc5,$t3,$acc5
1679	ld	[$bp+24],$t3
1680	sllx	$acc4,32,$acc4
1681	and	$acc1,$poly1,$acc1
1682	sllx	$acc5,32,$acc5
1683	or	$acc0,$acc4,$acc0
1684	srlx	$acc2,32,$acc4
1685	or	$acc1,$acc5,$acc1
1686	srlx	$acc3,32,$acc5
1687	subccc	$acc2,$t0,$acc2
1688	subccc	$acc4,$t1,$acc4
1689	subccc	$acc3,$t2,$acc3
1690	and	$acc2,$poly1,$acc2
1691	subccc	$acc5,$t3,$acc5
1692	sllx	$acc4,32,$acc4
1693	and	$acc3,$poly1,$acc3
1694	sllx	$acc5,32,$acc5
1695	or	$acc2,$acc4,$acc2
1696	subc	%g0,%g0,$acc4		! did it borrow?
1697	b	.Lreduce_by_add_vis3
1698	or	$acc3,$acc5,$acc3
1699.type	__ecp_nistz256_sub_from_vis3,#function
1700.size	__ecp_nistz256_sub_from_vis3,.-__ecp_nistz256_sub_from_vis3
1701
1702.align	32
1703__ecp_nistz256_sub_morf_vis3:
1704	ld	[$bp+4],$t0
1705	ld	[$bp+0],$t1
1706	ld	[$bp+12],$t2
1707	ld	[$bp+8],$t3
1708
1709	srlx	$acc0,32,$acc4
1710	not	$poly1,$poly1
1711	srlx	$acc1,32,$acc5
1712	subcc	$t0,$acc0,$acc0
1713	ld	[$bp+20],$t0
1714	subccc	$t1,$acc4,$acc4
1715	ld	[$bp+16],$t1
1716	subccc	$t2,$acc1,$acc1
1717	ld	[$bp+28],$t2
1718	and	$acc0,$poly1,$acc0
1719	subccc	$t3,$acc5,$acc5
1720	ld	[$bp+24],$t3
1721	sllx	$acc4,32,$acc4
1722	and	$acc1,$poly1,$acc1
1723	sllx	$acc5,32,$acc5
1724	or	$acc0,$acc4,$acc0
1725	srlx	$acc2,32,$acc4
1726	or	$acc1,$acc5,$acc1
1727	srlx	$acc3,32,$acc5
1728	subccc	$t0,$acc2,$acc2
1729	subccc	$t1,$acc4,$acc4
1730	subccc	$t2,$acc3,$acc3
1731	and	$acc2,$poly1,$acc2
1732	subccc	$t3,$acc5,$acc5
1733	sllx	$acc4,32,$acc4
1734	and	$acc3,$poly1,$acc3
1735	sllx	$acc5,32,$acc5
1736	or	$acc2,$acc4,$acc2
1737	subc	%g0,%g0,$acc4		! did it borrow?
1738	or	$acc3,$acc5,$acc3
1739
1740.Lreduce_by_add_vis3:
1741
1742	addcc	$acc0,-1,$t0		! add modulus
1743	not	$poly3,$t3
1744	addxccc	$acc1,$poly1,$t1
1745	not	$poly1,$poly1		! restore $poly1
1746	addxccc	$acc2,%g0,$t2
1747	addxc	$acc3,$t3,$t3
1748
1749	movrnz	$acc4,$t0,$acc0		! if a-b borrowed, ret = ret+mod
1750	movrnz	$acc4,$t1,$acc1
1751	stx	$acc0,[$rp]
1752	movrnz	$acc4,$t2,$acc2
1753	stx	$acc1,[$rp+8]
1754	movrnz	$acc4,$t3,$acc3
1755	stx	$acc2,[$rp+16]
1756	retl
1757	stx	$acc3,[$rp+24]
1758.type	__ecp_nistz256_sub_morf_vis3,#function
1759.size	__ecp_nistz256_sub_morf_vis3,.-__ecp_nistz256_sub_morf_vis3
1760
1761.align	32
1762__ecp_nistz256_div_by_2_vis3:
1763	! ret = (a is odd ? a+mod : a) >> 1
1764
1765	not	$poly1,$t1
1766	not	$poly3,$t3
1767	and	$acc0,1,$acc5
1768	addcc	$acc0,-1,$t0		! add modulus
1769	addxccc	$acc1,$t1,$t1
1770	addxccc	$acc2,%g0,$t2
1771	addxccc	$acc3,$t3,$t3
1772	addxc	%g0,%g0,$acc4		! carry bit
1773
1774	movrnz	$acc5,$t0,$acc0
1775	movrnz	$acc5,$t1,$acc1
1776	movrnz	$acc5,$t2,$acc2
1777	movrnz	$acc5,$t3,$acc3
1778	movrz	$acc5,%g0,$acc4
1779
1780	! ret >>= 1
1781
1782	srlx	$acc0,1,$acc0
1783	sllx	$acc1,63,$t0
1784	srlx	$acc1,1,$acc1
1785	or	$acc0,$t0,$acc0
1786	sllx	$acc2,63,$t1
1787	srlx	$acc2,1,$acc2
1788	or	$acc1,$t1,$acc1
1789	sllx	$acc3,63,$t2
1790	stx	$acc0,[$rp]
1791	srlx	$acc3,1,$acc3
1792	or	$acc2,$t2,$acc2
1793	sllx	$acc4,63,$t3		! don't forget carry bit
1794	stx	$acc1,[$rp+8]
1795	or	$acc3,$t3,$acc3
1796	stx	$acc2,[$rp+16]
1797	retl
1798	stx	$acc3,[$rp+24]
1799.type	__ecp_nistz256_div_by_2_vis3,#function
1800.size	__ecp_nistz256_div_by_2_vis3,.-__ecp_nistz256_div_by_2_vis3
1801
1802! compared to __ecp_nistz256_mul_mont it's almost 4x smaller and
1803! 4x faster [on T4]...
1804.align	32
1805__ecp_nistz256_mul_mont_vis3:
1806	mulx	$a0,$bi,$acc0
1807	not	$poly3,$poly3		! 0xFFFFFFFF00000001
1808	umulxhi	$a0,$bi,$t0
1809	mulx	$a1,$bi,$acc1
1810	umulxhi	$a1,$bi,$t1
1811	mulx	$a2,$bi,$acc2
1812	umulxhi	$a2,$bi,$t2
1813	mulx	$a3,$bi,$acc3
1814	umulxhi	$a3,$bi,$t3
1815	ldx	[$bp+8],$bi		! b[1]
1816
1817	addcc	$acc1,$t0,$acc1		! accumulate high parts of multiplication
1818	 sllx	$acc0,32,$t0
1819	addxccc	$acc2,$t1,$acc2
1820	 srlx	$acc0,32,$t1
1821	addxccc	$acc3,$t2,$acc3
1822	addxc	%g0,$t3,$acc4
1823	mov	0,$acc5
1824___
1825for($i=1;$i<4;$i++) {
	# A reduction iteration is normally performed by accumulating the
	# result of multiplying the modulus by a "magic" digit [and
	# omitting the least significant word, which is guaranteed to
	# be 0], but thanks to the special form of the modulus, and the
	# "magic" digit being equal to the least significant word, it can
	# be performed with additions and subtractions alone. Indeed:
1832	#
1833	#            ffff0001.00000000.0000ffff.ffffffff
1834	# *                                     abcdefgh
1835	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1836	#
1837	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1838	# rewrite above as:
1839	#
1840	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1841	# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
1842	# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
1843	#
1844	# or marking redundant operations:
1845	#
1846	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
1847	# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
1848	# - 0000abcd.efgh0000.--------.--------.--------
1849	#   ^^^^^^^^ but this word is calculated with umulxhi, because
1850	#            there is no subtract with 64-bit borrow:-(
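	#
	# In terms of the code below, with d = $acc0 and 64-bit limbs:
	# d*2^96 is added via the sllx/srlx pair (into limbs 1 and 2),
	# d*0xFFFFFFFF00000001 is added at 2^192 via the sub/umulxhi pair
	# (into limbs 3 and 4), and the implicit -d cancels limb 0, which
	# is then dropped.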
1851
1852$code.=<<___;
1853	sub	$acc0,$t0,$t2		! acc0*0xFFFFFFFF00000001, low part
1854	umulxhi	$acc0,$poly3,$t3	! acc0*0xFFFFFFFF00000001, high part
1855	addcc	$acc1,$t0,$acc0		! +=acc[0]<<96 and omit acc[0]
1856	mulx	$a0,$bi,$t0
1857	addxccc	$acc2,$t1,$acc1
1858	mulx	$a1,$bi,$t1
1859	addxccc	$acc3,$t2,$acc2		! +=acc[0]*0xFFFFFFFF00000001
1860	mulx	$a2,$bi,$t2
1861	addxccc	$acc4,$t3,$acc3
1862	mulx	$a3,$bi,$t3
1863	addxc	$acc5,%g0,$acc4
1864
1865	addcc	$acc0,$t0,$acc0		! accumulate low parts of multiplication
1866	umulxhi	$a0,$bi,$t0
1867	addxccc	$acc1,$t1,$acc1
1868	umulxhi	$a1,$bi,$t1
1869	addxccc	$acc2,$t2,$acc2
1870	umulxhi	$a2,$bi,$t2
1871	addxccc	$acc3,$t3,$acc3
1872	umulxhi	$a3,$bi,$t3
1873	addxc	$acc4,%g0,$acc4
1874___
1875$code.=<<___	if ($i<3);
1876	ldx	[$bp+8*($i+1)],$bi	! bp[$i+1]
1877___
1878$code.=<<___;
1879	addcc	$acc1,$t0,$acc1		! accumulate high parts of multiplication
1880	 sllx	$acc0,32,$t0
1881	addxccc	$acc2,$t1,$acc2
1882	 srlx	$acc0,32,$t1
1883	addxccc	$acc3,$t2,$acc3
1884	addxccc	$acc4,$t3,$acc4
1885	addxc	%g0,%g0,$acc5
1886___
1887}
1888$code.=<<___;
1889	sub	$acc0,$t0,$t2		! acc0*0xFFFFFFFF00000001, low part
1890	umulxhi	$acc0,$poly3,$t3	! acc0*0xFFFFFFFF00000001, high part
1891	addcc	$acc1,$t0,$acc0		! +=acc[0]<<96 and omit acc[0]
1892	addxccc	$acc2,$t1,$acc1
1893	addxccc	$acc3,$t2,$acc2		! +=acc[0]*0xFFFFFFFF00000001
1894	addxccc	$acc4,$t3,$acc3
1895	b	.Lmul_final_vis3	! see below
1896	addxc	$acc5,%g0,$acc4
1897.type	__ecp_nistz256_mul_mont_vis3,#function
1898.size	__ecp_nistz256_mul_mont_vis3,.-__ecp_nistz256_mul_mont_vis3
1899
! compared to __ecp_nistz256_mul_mont_vis3 above it's 21% fewer
! instructions, but only 14% faster [on T4]...
1902.align	32
1903__ecp_nistz256_sqr_mont_vis3:
1904	!  |  |  |  |  |  |a1*a0|  |
1905	!  |  |  |  |  |a2*a0|  |  |
1906	!  |  |a3*a2|a3*a0|  |  |  |
1907	!  |  |  |  |a2*a1|  |  |  |
1908	!  |  |  |a3*a1|  |  |  |  |
1909	! *|  |  |  |  |  |  |  | 2|
1910	! +|a3*a3|a2*a2|a1*a1|a0*a0|
1911	!  |--+--+--+--+--+--+--+--|
1912	!  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
1913	!
	!  The "can't overflow" remarks below mark carries into the high
	!  part of a multiplication result, which cannot overflow because
	!  that high part can never be all ones.
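	!
	!  The cross products a[i]*a[j] (i>j) are accumulated first, then
	!  doubled (the acc[1-6]*=2 block), and finally the diagonal
	!  squares a[i]*a[i] are added on top.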
1917
1918	mulx	$a1,$a0,$acc1		! a[1]*a[0]
1919	umulxhi	$a1,$a0,$t1
1920	mulx	$a2,$a0,$acc2		! a[2]*a[0]
1921	umulxhi	$a2,$a0,$t2
1922	mulx	$a3,$a0,$acc3		! a[3]*a[0]
1923	umulxhi	$a3,$a0,$acc4
1924
1925	addcc	$acc2,$t1,$acc2		! accumulate high parts of multiplication
1926	mulx	$a2,$a1,$t0		! a[2]*a[1]
1927	umulxhi	$a2,$a1,$t1
1928	addxccc	$acc3,$t2,$acc3
1929	mulx	$a3,$a1,$t2		! a[3]*a[1]
1930	umulxhi	$a3,$a1,$t3
1931	addxc	$acc4,%g0,$acc4		! can't overflow
1932
1933	mulx	$a3,$a2,$acc5		! a[3]*a[2]
1934	not	$poly3,$poly3		! 0xFFFFFFFF00000001
1935	umulxhi	$a3,$a2,$acc6
1936
1937	addcc	$t2,$t1,$t1		! accumulate high parts of multiplication
1938	mulx	$a0,$a0,$acc0		! a[0]*a[0]
1939	addxc	$t3,%g0,$t2		! can't overflow
1940
1941	addcc	$acc3,$t0,$acc3		! accumulate low parts of multiplication
1942	umulxhi	$a0,$a0,$a0
1943	addxccc	$acc4,$t1,$acc4
1944	mulx	$a1,$a1,$t1		! a[1]*a[1]
1945	addxccc	$acc5,$t2,$acc5
1946	umulxhi	$a1,$a1,$a1
1947	addxc	$acc6,%g0,$acc6		! can't overflow
1948
1949	addcc	$acc1,$acc1,$acc1	! acc[1-6]*=2
1950	mulx	$a2,$a2,$t2		! a[2]*a[2]
1951	addxccc	$acc2,$acc2,$acc2
1952	umulxhi	$a2,$a2,$a2
1953	addxccc	$acc3,$acc3,$acc3
1954	mulx	$a3,$a3,$t3		! a[3]*a[3]
1955	addxccc	$acc4,$acc4,$acc4
1956	umulxhi	$a3,$a3,$a3
1957	addxccc	$acc5,$acc5,$acc5
1958	addxccc	$acc6,$acc6,$acc6
1959	addxc	%g0,%g0,$acc7
1960
1961	addcc	$acc1,$a0,$acc1		! +a[i]*a[i]
1962	addxccc	$acc2,$t1,$acc2
1963	addxccc	$acc3,$a1,$acc3
1964	addxccc	$acc4,$t2,$acc4
1965	 sllx	$acc0,32,$t0
1966	addxccc	$acc5,$a2,$acc5
1967	 srlx	$acc0,32,$t1
1968	addxccc	$acc6,$t3,$acc6
1969	 sub	$acc0,$t0,$t2		! acc0*0xFFFFFFFF00000001, low part
1970	addxc	$acc7,$a3,$acc7
1971___
1972for($i=0;$i<3;$i++) {			# reductions, see commentary
1973					# in multiplication for details
1974$code.=<<___;
1975	umulxhi	$acc0,$poly3,$t3	! acc0*0xFFFFFFFF00000001, high part
1976	addcc	$acc1,$t0,$acc0		! +=acc[0]<<96 and omit acc[0]
1977	 sllx	$acc0,32,$t0
1978	addxccc	$acc2,$t1,$acc1
1979	 srlx	$acc0,32,$t1
1980	addxccc	$acc3,$t2,$acc2		! +=acc[0]*0xFFFFFFFF00000001
1981	 sub	$acc0,$t0,$t2		! acc0*0xFFFFFFFF00000001, low part
1982	addxc	%g0,$t3,$acc3		! can't overflow
1983___
1984}
1985$code.=<<___;
1986	umulxhi	$acc0,$poly3,$t3	! acc0*0xFFFFFFFF00000001, high part
1987	addcc	$acc1,$t0,$acc0		! +=acc[0]<<96 and omit acc[0]
1988	addxccc	$acc2,$t1,$acc1
1989	addxccc	$acc3,$t2,$acc2		! +=acc[0]*0xFFFFFFFF00000001
1990	addxc	%g0,$t3,$acc3		! can't overflow
1991
1992	addcc	$acc0,$acc4,$acc0	! accumulate upper half
1993	addxccc	$acc1,$acc5,$acc1
1994	addxccc	$acc2,$acc6,$acc2
1995	addxccc	$acc3,$acc7,$acc3
1996	addxc	%g0,%g0,$acc4
1997
1998.Lmul_final_vis3:
1999
	! Final step is "if result >= mod, subtract mod", but since the
	! comparison implies a subtraction, we do the subtraction and keep
	! its outcome if it didn't borrow.  Note that because subtraction
	! is replaced with addition of the negated modulus, the carry/borrow
	! logic is inverted: carry set means no borrow occurred.
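	! The negated modulus added below is
	!   -P = 2^256-P = { 1, 0xFFFFFFFF00000000,
	!                    0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFE }
	! (least significant limb first), hence the immediate 1, $poly1,
	! $minus1 and $poly3, in that order.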
2005
2006	addcc	$acc0,1,$t0		! add -modulus, i.e. subtract
2007	not	$poly3,$poly3		! restore 0x00000000FFFFFFFE
2008	addxccc	$acc1,$poly1,$t1
2009	addxccc	$acc2,$minus1,$t2
2010	addxccc	$acc3,$poly3,$t3
2011	addxccc	$acc4,$minus1,%g0	! did it carry?
2012
2013	movcs	%xcc,$t0,$acc0
2014	movcs	%xcc,$t1,$acc1
2015	stx	$acc0,[$rp]
2016	movcs	%xcc,$t2,$acc2
2017	stx	$acc1,[$rp+8]
2018	movcs	%xcc,$t3,$acc3
2019	stx	$acc2,[$rp+16]
2020	retl
2021	stx	$acc3,[$rp+24]
2022.type	__ecp_nistz256_sqr_mont_vis3,#function
2023.size	__ecp_nistz256_sqr_mont_vis3,.-__ecp_nistz256_sqr_mont_vis3
2024___
2025
2026########################################################################
2027# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
2028#
2029{
2030my ($res_x,$res_y,$res_z,
2031    $in_x,$in_y,$in_z,
2032    $S,$M,$Zsqr,$tmp0)=map(32*$_,(0..9));
2033# above map() describes stack layout with 10 temporary
2034# 256-bit vectors on top.
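# In terms of the p256_* annotations on the calls below, the sequence
# implements the usual Jacobian doubling
#	Zsqr  = in_z^2
#	M     = 3*(in_x+Zsqr)*(in_x-Zsqr)
#	S     = 4*in_x*in_y^2
#	res_x = M^2 - 2*S
#	res_y = M*(S-res_x) - 8*in_y^4
#	res_z = 2*in_y*in_z
# with all arithmetic performed in the Montgomery domain modulo P.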
2035
2036$code.=<<___;
2037.align	32
2038ecp_nistz256_point_double_vis3:
2039	save	%sp,-STACK64_FRAME-32*10,%sp
2040
2041	mov	$rp,$rp_real
2042.Ldouble_shortcut_vis3:
2043	mov	-1,$minus1
2044	mov	-2,$poly3
2045	sllx	$minus1,32,$poly1		! 0xFFFFFFFF00000000
2046	srl	$poly3,0,$poly3			! 0x00000000FFFFFFFE
2047
2048	! convert input to uint64_t[4]
2049	ld	[$ap],$a0			! in_x
2050	ld	[$ap+4],$t0
2051	ld	[$ap+8],$a1
2052	ld	[$ap+12],$t1
2053	ld	[$ap+16],$a2
2054	ld	[$ap+20],$t2
2055	ld	[$ap+24],$a3
2056	ld	[$ap+28],$t3
2057	sllx	$t0,32,$t0
2058	sllx	$t1,32,$t1
2059	ld	[$ap+32],$acc0			! in_y
2060	or	$a0,$t0,$a0
2061	ld	[$ap+32+4],$t0
2062	sllx	$t2,32,$t2
2063	ld	[$ap+32+8],$acc1
2064	or	$a1,$t1,$a1
2065	ld	[$ap+32+12],$t1
2066	sllx	$t3,32,$t3
2067	ld	[$ap+32+16],$acc2
2068	or	$a2,$t2,$a2
2069	ld	[$ap+32+20],$t2
2070	or	$a3,$t3,$a3
2071	ld	[$ap+32+24],$acc3
2072	sllx	$t0,32,$t0
2073	ld	[$ap+32+28],$t3
2074	sllx	$t1,32,$t1
2075	stx	$a0,[%sp+LOCALS64+$in_x]
2076	sllx	$t2,32,$t2
2077	stx	$a1,[%sp+LOCALS64+$in_x+8]
2078	sllx	$t3,32,$t3
2079	stx	$a2,[%sp+LOCALS64+$in_x+16]
2080	or	$acc0,$t0,$acc0
2081	stx	$a3,[%sp+LOCALS64+$in_x+24]
2082	or	$acc1,$t1,$acc1
2083	stx	$acc0,[%sp+LOCALS64+$in_y]
2084	or	$acc2,$t2,$acc2
2085	stx	$acc1,[%sp+LOCALS64+$in_y+8]
2086	or	$acc3,$t3,$acc3
2087	stx	$acc2,[%sp+LOCALS64+$in_y+16]
2088	stx	$acc3,[%sp+LOCALS64+$in_y+24]
2089
2090	ld	[$ap+64],$a0			! in_z
2091	ld	[$ap+64+4],$t0
2092	ld	[$ap+64+8],$a1
2093	ld	[$ap+64+12],$t1
2094	ld	[$ap+64+16],$a2
2095	ld	[$ap+64+20],$t2
2096	ld	[$ap+64+24],$a3
2097	ld	[$ap+64+28],$t3
	sllx	$t0,32,$t0
	sllx	$t1,32,$t1
	or	$a0,$t0,$a0
	sllx	$t2,32,$t2
	or	$a1,$t1,$a1
	sllx	$t3,32,$t3
	stx	$a0,[%sp+LOCALS64+$in_z]
	or	$a2,$t2,$a2
	stx	$a1,[%sp+LOCALS64+$in_z+8]
	or	$a3,$t3,$a3
	stx	$a2,[%sp+LOCALS64+$in_z+16]
	stx	$a3,[%sp+LOCALS64+$in_z+24]
2114
2115	! in_y is still in $acc0-$acc3
2116	call	__ecp_nistz256_mul_by_2_vis3	! p256_mul_by_2(S, in_y);
2117	add	%sp,LOCALS64+$S,$rp
2118
2119	! in_z is still in $a0-$a3
2120	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Zsqr, in_z);
2121	add	%sp,LOCALS64+$Zsqr,$rp
2122
2123	mov	$acc0,$a0			! put Zsqr aside
2124	mov	$acc1,$a1
2125	mov	$acc2,$a2
2126	mov	$acc3,$a3
2127
2128	add	%sp,LOCALS64+$in_x,$bp
2129	call	__ecp_nistz256_add_vis3		! p256_add(M, Zsqr, in_x);
2130	add	%sp,LOCALS64+$M,$rp
2131
2132	mov	$a0,$acc0			! restore Zsqr
2133	ldx	[%sp+LOCALS64+$S],$a0		! forward load
2134	mov	$a1,$acc1
2135	ldx	[%sp+LOCALS64+$S+8],$a1
2136	mov	$a2,$acc2
2137	ldx	[%sp+LOCALS64+$S+16],$a2
2138	mov	$a3,$acc3
2139	ldx	[%sp+LOCALS64+$S+24],$a3
2140
2141	add	%sp,LOCALS64+$in_x,$bp
2142	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(Zsqr, in_x, Zsqr);
2143	add	%sp,LOCALS64+$Zsqr,$rp
2144
2145	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(S, S);
2146	add	%sp,LOCALS64+$S,$rp
2147
2148	ldx	[%sp+LOCALS64+$in_z],$bi
2149	ldx	[%sp+LOCALS64+$in_y],$a0
2150	ldx	[%sp+LOCALS64+$in_y+8],$a1
2151	ldx	[%sp+LOCALS64+$in_y+16],$a2
2152	ldx	[%sp+LOCALS64+$in_y+24],$a3
2153	add	%sp,LOCALS64+$in_z,$bp
2154	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(tmp0, in_z, in_y);
2155	add	%sp,LOCALS64+$tmp0,$rp
2156
2157	ldx	[%sp+LOCALS64+$M],$bi		! forward load
2158	ldx	[%sp+LOCALS64+$Zsqr],$a0
2159	ldx	[%sp+LOCALS64+$Zsqr+8],$a1
2160	ldx	[%sp+LOCALS64+$Zsqr+16],$a2
2161	ldx	[%sp+LOCALS64+$Zsqr+24],$a3
2162
2163	call	__ecp_nistz256_mul_by_2_vis3	! p256_mul_by_2(res_z, tmp0);
2164	add	%sp,LOCALS64+$res_z,$rp
2165
2166	add	%sp,LOCALS64+$M,$bp
2167	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(M, M, Zsqr);
2168	add	%sp,LOCALS64+$M,$rp
2169
2170	mov	$acc0,$a0			! put aside M
2171	mov	$acc1,$a1
2172	mov	$acc2,$a2
2173	mov	$acc3,$a3
2174	call	__ecp_nistz256_mul_by_2_vis3
2175	add	%sp,LOCALS64+$M,$rp
2176	mov	$a0,$t0				! copy M
2177	ldx	[%sp+LOCALS64+$S],$a0		! forward load
2178	mov	$a1,$t1
2179	ldx	[%sp+LOCALS64+$S+8],$a1
2180	mov	$a2,$t2
2181	ldx	[%sp+LOCALS64+$S+16],$a2
2182	mov	$a3,$t3
2183	ldx	[%sp+LOCALS64+$S+24],$a3
2184	call	__ecp_nistz256_add_noload_vis3	! p256_mul_by_3(M, M);
2185	add	%sp,LOCALS64+$M,$rp
2186
2187	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(tmp0, S);
2188	add	%sp,LOCALS64+$tmp0,$rp
2189
2190	ldx	[%sp+LOCALS64+$S],$bi		! forward load
2191	ldx	[%sp+LOCALS64+$in_x],$a0
2192	ldx	[%sp+LOCALS64+$in_x+8],$a1
2193	ldx	[%sp+LOCALS64+$in_x+16],$a2
2194	ldx	[%sp+LOCALS64+$in_x+24],$a3
2195
2196	call	__ecp_nistz256_div_by_2_vis3	! p256_div_by_2(res_y, tmp0);
2197	add	%sp,LOCALS64+$res_y,$rp
2198
2199	add	%sp,LOCALS64+$S,$bp
2200	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S, S, in_x);
2201	add	%sp,LOCALS64+$S,$rp
2202
2203	ldx	[%sp+LOCALS64+$M],$a0		! forward load
2204	ldx	[%sp+LOCALS64+$M+8],$a1
2205	ldx	[%sp+LOCALS64+$M+16],$a2
2206	ldx	[%sp+LOCALS64+$M+24],$a3
2207
2208	call	__ecp_nistz256_mul_by_2_vis3	! p256_mul_by_2(tmp0, S);
2209	add	%sp,LOCALS64+$tmp0,$rp
2210
2211	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(res_x, M);
2212	add	%sp,LOCALS64+$res_x,$rp
2213
2214	add	%sp,LOCALS64+$tmp0,$bp
2215	call	__ecp_nistz256_sub_from_vis3	! p256_sub(res_x, res_x, tmp0);
2216	add	%sp,LOCALS64+$res_x,$rp
2217
2218	ldx	[%sp+LOCALS64+$M],$a0		! forward load
2219	ldx	[%sp+LOCALS64+$M+8],$a1
2220	ldx	[%sp+LOCALS64+$M+16],$a2
2221	ldx	[%sp+LOCALS64+$M+24],$a3
2222
2223	add	%sp,LOCALS64+$S,$bp
2224	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(S, S, res_x);
2225	add	%sp,LOCALS64+$S,$rp
2226
2227	mov	$acc0,$bi
2228	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S, S, M);
2229	add	%sp,LOCALS64+$S,$rp
2230
2231	ldx	[%sp+LOCALS64+$res_x],$a0	! forward load
2232	ldx	[%sp+LOCALS64+$res_x+8],$a1
2233	ldx	[%sp+LOCALS64+$res_x+16],$a2
2234	ldx	[%sp+LOCALS64+$res_x+24],$a3
2235
2236	add	%sp,LOCALS64+$res_y,$bp
2237	call	__ecp_nistz256_sub_from_vis3	! p256_sub(res_y, S, res_y);
	add	%sp,LOCALS64+$res_y,$rp
2239
	! convert output to uint32_t[8]
2241	srlx	$a0,32,$t0
2242	srlx	$a1,32,$t1
2243	st	$a0,[$rp_real]			! res_x
2244	srlx	$a2,32,$t2
2245	st	$t0,[$rp_real+4]
2246	srlx	$a3,32,$t3
2247	st	$a1,[$rp_real+8]
2248	st	$t1,[$rp_real+12]
2249	st	$a2,[$rp_real+16]
2250	st	$t2,[$rp_real+20]
2251	st	$a3,[$rp_real+24]
2252	st	$t3,[$rp_real+28]
2253
2254	ldx	[%sp+LOCALS64+$res_z],$a0	! forward load
2255	srlx	$acc0,32,$t0
2256	ldx	[%sp+LOCALS64+$res_z+8],$a1
2257	srlx	$acc1,32,$t1
2258	ldx	[%sp+LOCALS64+$res_z+16],$a2
2259	srlx	$acc2,32,$t2
2260	ldx	[%sp+LOCALS64+$res_z+24],$a3
2261	srlx	$acc3,32,$t3
2262	st	$acc0,[$rp_real+32]		! res_y
2263	st	$t0,  [$rp_real+32+4]
2264	st	$acc1,[$rp_real+32+8]
2265	st	$t1,  [$rp_real+32+12]
2266	st	$acc2,[$rp_real+32+16]
2267	st	$t2,  [$rp_real+32+20]
2268	st	$acc3,[$rp_real+32+24]
2269	st	$t3,  [$rp_real+32+28]
2270
2271	srlx	$a0,32,$t0
2272	srlx	$a1,32,$t1
2273	st	$a0,[$rp_real+64]		! res_z
2274	srlx	$a2,32,$t2
2275	st	$t0,[$rp_real+64+4]
2276	srlx	$a3,32,$t3
2277	st	$a1,[$rp_real+64+8]
2278	st	$t1,[$rp_real+64+12]
2279	st	$a2,[$rp_real+64+16]
2280	st	$t2,[$rp_real+64+20]
2281	st	$a3,[$rp_real+64+24]
2282	st	$t3,[$rp_real+64+28]
2283
2284	ret
2285	restore
2286.type	ecp_nistz256_point_double_vis3,#function
2287.size	ecp_nistz256_point_double_vis3,.-ecp_nistz256_point_double_vis3
2288___
2289}
2290########################################################################
2291# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
2292#			      const P256_POINT *in2);
2293{
2294my ($res_x,$res_y,$res_z,
2295    $in1_x,$in1_y,$in1_z,
2296    $in2_x,$in2_y,$in2_z,
2297    $H,$Hsqr,$R,$Rsqr,$Hcub,
2298    $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
2299my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
2300
2301# above map() describes stack layout with 18 temporary
2302# 256-bit vectors on top. Then we reserve some space for
2303# !in1infty, !in2infty and result of check for zero.
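# In terms of the p256_* annotations on the calls below, the sequence
# implements the usual Jacobian addition: with
#	U1 = in1_x*in2_z^2,  S1 = in1_y*in2_z^3,
#	U2 = in2_x*in1_z^2,  S2 = in2_y*in1_z^3,
#	H  = U2 - U1,        R  = S2 - S1,
# the result is
#	res_x = R^2 - H^3 - 2*U1*H^2
#	res_y = R*(U1*H^2 - res_x) - S1*H^3
#	res_z = in1_z*in2_z*H
# If either input is the point at infinity, the other input is selected
# by the conditional moves at the end.  If neither is infinity and H is
# zero, the code branches to the doubling path when R is zero as well
# (same point), and otherwise stores an all-zero result (P plus -P).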
2304
2305$code.=<<___;
2306.align	32
2307ecp_nistz256_point_add_vis3:
2308	save	%sp,-STACK64_FRAME-32*18-32,%sp
2309
2310	mov	$rp,$rp_real
2311	mov	-1,$minus1
2312	mov	-2,$poly3
2313	sllx	$minus1,32,$poly1		! 0xFFFFFFFF00000000
2314	srl	$poly3,0,$poly3			! 0x00000000FFFFFFFE
2315
2316	! convert input to uint64_t[4]
2317	ld	[$bp],$a0			! in2_x
2318	ld	[$bp+4],$t0
2319	ld	[$bp+8],$a1
2320	ld	[$bp+12],$t1
2321	ld	[$bp+16],$a2
2322	ld	[$bp+20],$t2
2323	ld	[$bp+24],$a3
2324	ld	[$bp+28],$t3
2325	sllx	$t0,32,$t0
2326	sllx	$t1,32,$t1
2327	ld	[$bp+32],$acc0			! in2_y
2328	or	$a0,$t0,$a0
2329	ld	[$bp+32+4],$t0
2330	sllx	$t2,32,$t2
2331	ld	[$bp+32+8],$acc1
2332	or	$a1,$t1,$a1
2333	ld	[$bp+32+12],$t1
2334	sllx	$t3,32,$t3
2335	ld	[$bp+32+16],$acc2
2336	or	$a2,$t2,$a2
2337	ld	[$bp+32+20],$t2
2338	or	$a3,$t3,$a3
2339	ld	[$bp+32+24],$acc3
2340	sllx	$t0,32,$t0
2341	ld	[$bp+32+28],$t3
2342	sllx	$t1,32,$t1
2343	stx	$a0,[%sp+LOCALS64+$in2_x]
2344	sllx	$t2,32,$t2
2345	stx	$a1,[%sp+LOCALS64+$in2_x+8]
2346	sllx	$t3,32,$t3
2347	stx	$a2,[%sp+LOCALS64+$in2_x+16]
2348	or	$acc0,$t0,$acc0
2349	stx	$a3,[%sp+LOCALS64+$in2_x+24]
2350	or	$acc1,$t1,$acc1
2351	stx	$acc0,[%sp+LOCALS64+$in2_y]
2352	or	$acc2,$t2,$acc2
2353	stx	$acc1,[%sp+LOCALS64+$in2_y+8]
2354	or	$acc3,$t3,$acc3
2355	stx	$acc2,[%sp+LOCALS64+$in2_y+16]
2356	stx	$acc3,[%sp+LOCALS64+$in2_y+24]
2357
2358	ld	[$bp+64],$acc0			! in2_z
2359	ld	[$bp+64+4],$t0
2360	ld	[$bp+64+8],$acc1
2361	ld	[$bp+64+12],$t1
2362	ld	[$bp+64+16],$acc2
2363	ld	[$bp+64+20],$t2
2364	ld	[$bp+64+24],$acc3
2365	ld	[$bp+64+28],$t3
2366	sllx	$t0,32,$t0
2367	sllx	$t1,32,$t1
2368	ld	[$ap],$a0			! in1_x
2369	or	$acc0,$t0,$acc0
2370	ld	[$ap+4],$t0
2371	sllx	$t2,32,$t2
2372	ld	[$ap+8],$a1
2373	or	$acc1,$t1,$acc1
2374	ld	[$ap+12],$t1
2375	sllx	$t3,32,$t3
2376	ld	[$ap+16],$a2
2377	or	$acc2,$t2,$acc2
2378	ld	[$ap+20],$t2
2379	or	$acc3,$t3,$acc3
2380	ld	[$ap+24],$a3
2381	sllx	$t0,32,$t0
2382	ld	[$ap+28],$t3
2383	sllx	$t1,32,$t1
2384	stx	$acc0,[%sp+LOCALS64+$in2_z]
2385	sllx	$t2,32,$t2
2386	stx	$acc1,[%sp+LOCALS64+$in2_z+8]
2387	sllx	$t3,32,$t3
2388	stx	$acc2,[%sp+LOCALS64+$in2_z+16]
2389	stx	$acc3,[%sp+LOCALS64+$in2_z+24]
2390
2391	or	$acc1,$acc0,$acc0
2392	or	$acc3,$acc2,$acc2
2393	or	$acc2,$acc0,$acc0
2394	movrnz	$acc0,-1,$acc0			! !in2infty
2395	stx	$acc0,[%fp+STACK_BIAS-8]
2396
2397	or	$a0,$t0,$a0
2398	ld	[$ap+32],$acc0			! in1_y
2399	or	$a1,$t1,$a1
2400	ld	[$ap+32+4],$t0
2401	or	$a2,$t2,$a2
2402	ld	[$ap+32+8],$acc1
2403	or	$a3,$t3,$a3
2404	ld	[$ap+32+12],$t1
2405	ld	[$ap+32+16],$acc2
2406	ld	[$ap+32+20],$t2
2407	ld	[$ap+32+24],$acc3
2408	sllx	$t0,32,$t0
2409	ld	[$ap+32+28],$t3
2410	sllx	$t1,32,$t1
2411	stx	$a0,[%sp+LOCALS64+$in1_x]
2412	sllx	$t2,32,$t2
2413	stx	$a1,[%sp+LOCALS64+$in1_x+8]
2414	sllx	$t3,32,$t3
2415	stx	$a2,[%sp+LOCALS64+$in1_x+16]
2416	or	$acc0,$t0,$acc0
2417	stx	$a3,[%sp+LOCALS64+$in1_x+24]
2418	or	$acc1,$t1,$acc1
2419	stx	$acc0,[%sp+LOCALS64+$in1_y]
2420	or	$acc2,$t2,$acc2
2421	stx	$acc1,[%sp+LOCALS64+$in1_y+8]
2422	or	$acc3,$t3,$acc3
2423	stx	$acc2,[%sp+LOCALS64+$in1_y+16]
2424	stx	$acc3,[%sp+LOCALS64+$in1_y+24]
2425
2426	ldx	[%sp+LOCALS64+$in2_z],$a0	! forward load
2427	ldx	[%sp+LOCALS64+$in2_z+8],$a1
2428	ldx	[%sp+LOCALS64+$in2_z+16],$a2
2429	ldx	[%sp+LOCALS64+$in2_z+24],$a3
2430
2431	ld	[$ap+64],$acc0			! in1_z
2432	ld	[$ap+64+4],$t0
2433	ld	[$ap+64+8],$acc1
2434	ld	[$ap+64+12],$t1
2435	ld	[$ap+64+16],$acc2
2436	ld	[$ap+64+20],$t2
2437	ld	[$ap+64+24],$acc3
2438	ld	[$ap+64+28],$t3
2439	sllx	$t0,32,$t0
2440	sllx	$t1,32,$t1
2441	or	$acc0,$t0,$acc0
2442	sllx	$t2,32,$t2
2443	or	$acc1,$t1,$acc1
2444	sllx	$t3,32,$t3
2445	stx	$acc0,[%sp+LOCALS64+$in1_z]
2446	or	$acc2,$t2,$acc2
2447	stx	$acc1,[%sp+LOCALS64+$in1_z+8]
2448	or	$acc3,$t3,$acc3
2449	stx	$acc2,[%sp+LOCALS64+$in1_z+16]
2450	stx	$acc3,[%sp+LOCALS64+$in1_z+24]
2451
2452	or	$acc1,$acc0,$acc0
2453	or	$acc3,$acc2,$acc2
2454	or	$acc2,$acc0,$acc0
2455	movrnz	$acc0,-1,$acc0			! !in1infty
2456	stx	$acc0,[%fp+STACK_BIAS-16]
2457
2458	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Z2sqr, in2_z);
2459	add	%sp,LOCALS64+$Z2sqr,$rp
2460
2461	ldx	[%sp+LOCALS64+$in1_z],$a0
2462	ldx	[%sp+LOCALS64+$in1_z+8],$a1
2463	ldx	[%sp+LOCALS64+$in1_z+16],$a2
2464	ldx	[%sp+LOCALS64+$in1_z+24],$a3
2465	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Z1sqr, in1_z);
2466	add	%sp,LOCALS64+$Z1sqr,$rp
2467
2468	ldx	[%sp+LOCALS64+$Z2sqr],$bi
2469	ldx	[%sp+LOCALS64+$in2_z],$a0
2470	ldx	[%sp+LOCALS64+$in2_z+8],$a1
2471	ldx	[%sp+LOCALS64+$in2_z+16],$a2
2472	ldx	[%sp+LOCALS64+$in2_z+24],$a3
2473	add	%sp,LOCALS64+$Z2sqr,$bp
2474	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S1, Z2sqr, in2_z);
2475	add	%sp,LOCALS64+$S1,$rp
2476
2477	ldx	[%sp+LOCALS64+$Z1sqr],$bi
2478	ldx	[%sp+LOCALS64+$in1_z],$a0
2479	ldx	[%sp+LOCALS64+$in1_z+8],$a1
2480	ldx	[%sp+LOCALS64+$in1_z+16],$a2
2481	ldx	[%sp+LOCALS64+$in1_z+24],$a3
2482	add	%sp,LOCALS64+$Z1sqr,$bp
2483	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, Z1sqr, in1_z);
2484	add	%sp,LOCALS64+$S2,$rp
2485
2486	ldx	[%sp+LOCALS64+$S1],$bi
2487	ldx	[%sp+LOCALS64+$in1_y],$a0
2488	ldx	[%sp+LOCALS64+$in1_y+8],$a1
2489	ldx	[%sp+LOCALS64+$in1_y+16],$a2
2490	ldx	[%sp+LOCALS64+$in1_y+24],$a3
2491	add	%sp,LOCALS64+$S1,$bp
2492	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S1, S1, in1_y);
2493	add	%sp,LOCALS64+$S1,$rp
2494
2495	ldx	[%sp+LOCALS64+$S2],$bi
2496	ldx	[%sp+LOCALS64+$in2_y],$a0
2497	ldx	[%sp+LOCALS64+$in2_y+8],$a1
2498	ldx	[%sp+LOCALS64+$in2_y+16],$a2
2499	ldx	[%sp+LOCALS64+$in2_y+24],$a3
2500	add	%sp,LOCALS64+$S2,$bp
2501	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, S2, in2_y);
2502	add	%sp,LOCALS64+$S2,$rp
2503
2504	ldx	[%sp+LOCALS64+$Z2sqr],$bi	! forward load
2505	ldx	[%sp+LOCALS64+$in1_x],$a0
2506	ldx	[%sp+LOCALS64+$in1_x+8],$a1
2507	ldx	[%sp+LOCALS64+$in1_x+16],$a2
2508	ldx	[%sp+LOCALS64+$in1_x+24],$a3
2509
2510	add	%sp,LOCALS64+$S1,$bp
2511	call	__ecp_nistz256_sub_from_vis3	! p256_sub(R, S2, S1);
2512	add	%sp,LOCALS64+$R,$rp
2513
2514	or	$acc1,$acc0,$acc0		! see if result is zero
2515	or	$acc3,$acc2,$acc2
2516	or	$acc2,$acc0,$acc0
2517	stx	$acc0,[%fp+STACK_BIAS-24]
2518
2519	add	%sp,LOCALS64+$Z2sqr,$bp
2520	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(U1, in1_x, Z2sqr);
2521	add	%sp,LOCALS64+$U1,$rp
2522
2523	ldx	[%sp+LOCALS64+$Z1sqr],$bi
2524	ldx	[%sp+LOCALS64+$in2_x],$a0
2525	ldx	[%sp+LOCALS64+$in2_x+8],$a1
2526	ldx	[%sp+LOCALS64+$in2_x+16],$a2
2527	ldx	[%sp+LOCALS64+$in2_x+24],$a3
2528	add	%sp,LOCALS64+$Z1sqr,$bp
2529	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(U2, in2_x, Z1sqr);
2530	add	%sp,LOCALS64+$U2,$rp
2531
2532	ldx	[%sp+LOCALS64+$R],$a0		! forward load
2533	ldx	[%sp+LOCALS64+$R+8],$a1
2534	ldx	[%sp+LOCALS64+$R+16],$a2
2535	ldx	[%sp+LOCALS64+$R+24],$a3
2536
2537	add	%sp,LOCALS64+$U1,$bp
2538	call	__ecp_nistz256_sub_from_vis3	! p256_sub(H, U2, U1);
2539	add	%sp,LOCALS64+$H,$rp
2540
2541	or	$acc1,$acc0,$acc0		! see if result is zero
2542	or	$acc3,$acc2,$acc2
2543	orcc	$acc2,$acc0,$acc0
2544
2545	bne,pt	%xcc,.Ladd_proceed_vis3		! is_equal(U1,U2)?
2546	nop
2547
2548	ldx	[%fp+STACK_BIAS-8],$t0
2549	ldx	[%fp+STACK_BIAS-16],$t1
2550	ldx	[%fp+STACK_BIAS-24],$t2
2551	andcc	$t0,$t1,%g0
2552	be,pt	%xcc,.Ladd_proceed_vis3		! (in1infty || in2infty)?
2553	nop
2554	andcc	$t2,$t2,%g0
2555	be,a,pt	%xcc,.Ldouble_shortcut_vis3	! is_equal(S1,S2)?
	add	%sp,32*(18-10)+32,%sp		! difference in frame sizes
2557
2558	st	%g0,[$rp_real]
2559	st	%g0,[$rp_real+4]
2560	st	%g0,[$rp_real+8]
2561	st	%g0,[$rp_real+12]
2562	st	%g0,[$rp_real+16]
2563	st	%g0,[$rp_real+20]
2564	st	%g0,[$rp_real+24]
2565	st	%g0,[$rp_real+28]
2566	st	%g0,[$rp_real+32]
2567	st	%g0,[$rp_real+32+4]
2568	st	%g0,[$rp_real+32+8]
2569	st	%g0,[$rp_real+32+12]
2570	st	%g0,[$rp_real+32+16]
2571	st	%g0,[$rp_real+32+20]
2572	st	%g0,[$rp_real+32+24]
2573	st	%g0,[$rp_real+32+28]
2574	st	%g0,[$rp_real+64]
2575	st	%g0,[$rp_real+64+4]
2576	st	%g0,[$rp_real+64+8]
2577	st	%g0,[$rp_real+64+12]
2578	st	%g0,[$rp_real+64+16]
2579	st	%g0,[$rp_real+64+20]
2580	st	%g0,[$rp_real+64+24]
2581	st	%g0,[$rp_real+64+28]
2582	b	.Ladd_done_vis3
2583	nop
2584
2585.align	16
2586.Ladd_proceed_vis3:
2587	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Rsqr, R);
2588	add	%sp,LOCALS64+$Rsqr,$rp
2589
2590	ldx	[%sp+LOCALS64+$H],$bi
2591	ldx	[%sp+LOCALS64+$in1_z],$a0
2592	ldx	[%sp+LOCALS64+$in1_z+8],$a1
2593	ldx	[%sp+LOCALS64+$in1_z+16],$a2
2594	ldx	[%sp+LOCALS64+$in1_z+24],$a3
2595	add	%sp,LOCALS64+$H,$bp
2596	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(res_z, H, in1_z);
2597	add	%sp,LOCALS64+$res_z,$rp
2598
2599	ldx	[%sp+LOCALS64+$H],$a0
2600	ldx	[%sp+LOCALS64+$H+8],$a1
2601	ldx	[%sp+LOCALS64+$H+16],$a2
2602	ldx	[%sp+LOCALS64+$H+24],$a3
2603	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Hsqr, H);
2604	add	%sp,LOCALS64+$Hsqr,$rp
2605
2606	ldx	[%sp+LOCALS64+$res_z],$bi
2607	ldx	[%sp+LOCALS64+$in2_z],$a0
2608	ldx	[%sp+LOCALS64+$in2_z+8],$a1
2609	ldx	[%sp+LOCALS64+$in2_z+16],$a2
2610	ldx	[%sp+LOCALS64+$in2_z+24],$a3
2611	add	%sp,LOCALS64+$res_z,$bp
2612	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(res_z, res_z, in2_z);
2613	add	%sp,LOCALS64+$res_z,$rp
2614
2615	ldx	[%sp+LOCALS64+$H],$bi
2616	ldx	[%sp+LOCALS64+$Hsqr],$a0
2617	ldx	[%sp+LOCALS64+$Hsqr+8],$a1
2618	ldx	[%sp+LOCALS64+$Hsqr+16],$a2
2619	ldx	[%sp+LOCALS64+$Hsqr+24],$a3
2620	add	%sp,LOCALS64+$H,$bp
2621	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(Hcub, Hsqr, H);
2622	add	%sp,LOCALS64+$Hcub,$rp
2623
2624	ldx	[%sp+LOCALS64+$U1],$bi
2625	ldx	[%sp+LOCALS64+$Hsqr],$a0
2626	ldx	[%sp+LOCALS64+$Hsqr+8],$a1
2627	ldx	[%sp+LOCALS64+$Hsqr+16],$a2
2628	ldx	[%sp+LOCALS64+$Hsqr+24],$a3
2629	add	%sp,LOCALS64+$U1,$bp
2630	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(U2, U1, Hsqr);
2631	add	%sp,LOCALS64+$U2,$rp
2632
2633	call	__ecp_nistz256_mul_by_2_vis3	! p256_mul_by_2(Hsqr, U2);
2634	add	%sp,LOCALS64+$Hsqr,$rp
2635
2636	add	%sp,LOCALS64+$Rsqr,$bp
2637	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(res_x, Rsqr, Hsqr);
2638	add	%sp,LOCALS64+$res_x,$rp
2639
2640	add	%sp,LOCALS64+$Hcub,$bp
2641	call	__ecp_nistz256_sub_from_vis3	!  p256_sub(res_x, res_x, Hcub);
2642	add	%sp,LOCALS64+$res_x,$rp
2643
2644	ldx	[%sp+LOCALS64+$S1],$bi		! forward load
2645	ldx	[%sp+LOCALS64+$Hcub],$a0
2646	ldx	[%sp+LOCALS64+$Hcub+8],$a1
2647	ldx	[%sp+LOCALS64+$Hcub+16],$a2
2648	ldx	[%sp+LOCALS64+$Hcub+24],$a3
2649
2650	add	%sp,LOCALS64+$U2,$bp
2651	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(res_y, U2, res_x);
2652	add	%sp,LOCALS64+$res_y,$rp
2653
2654	add	%sp,LOCALS64+$S1,$bp
2655	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, S1, Hcub);
2656	add	%sp,LOCALS64+$S2,$rp
2657
2658	ldx	[%sp+LOCALS64+$R],$bi
2659	ldx	[%sp+LOCALS64+$res_y],$a0
2660	ldx	[%sp+LOCALS64+$res_y+8],$a1
2661	ldx	[%sp+LOCALS64+$res_y+16],$a2
2662	ldx	[%sp+LOCALS64+$res_y+24],$a3
2663	add	%sp,LOCALS64+$R,$bp
2664	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(res_y, res_y, R);
2665	add	%sp,LOCALS64+$res_y,$rp
2666
2667	add	%sp,LOCALS64+$S2,$bp
2668	call	__ecp_nistz256_sub_from_vis3	! p256_sub(res_y, res_y, S2);
2669	add	%sp,LOCALS64+$res_y,$rp
2670
2671	ldx	[%fp+STACK_BIAS-16],$t1		! !in1infty
2672	ldx	[%fp+STACK_BIAS-8],$t2		! !in2infty
2673___
2674for($i=0;$i<96;$i+=16) {			# conditional moves
2675$code.=<<___;
2676	ldx	[%sp+LOCALS64+$res_x+$i],$acc0	! res
2677	ldx	[%sp+LOCALS64+$res_x+$i+8],$acc1
2678	ldx	[%sp+LOCALS64+$in2_x+$i],$acc2	! in2
2679	ldx	[%sp+LOCALS64+$in2_x+$i+8],$acc3
2680	ldx	[%sp+LOCALS64+$in1_x+$i],$acc4	! in1
2681	ldx	[%sp+LOCALS64+$in1_x+$i+8],$acc5
2682	movrz	$t1,$acc2,$acc0
2683	movrz	$t1,$acc3,$acc1
2684	movrz	$t2,$acc4,$acc0
2685	movrz	$t2,$acc5,$acc1
2686	srlx	$acc0,32,$acc2
2687	srlx	$acc1,32,$acc3
2688	st	$acc0,[$rp_real+$i]
2689	st	$acc2,[$rp_real+$i+4]
2690	st	$acc1,[$rp_real+$i+8]
2691	st	$acc3,[$rp_real+$i+12]
2692___
2693}
2694$code.=<<___;
2695.Ladd_done_vis3:
2696	ret
2697	restore
2698.type	ecp_nistz256_point_add_vis3,#function
2699.size	ecp_nistz256_point_add_vis3,.-ecp_nistz256_point_add_vis3
2700___
2701}
2702########################################################################
2703# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
2704#				     const P256_POINT_AFFINE *in2);
2705{
2706my ($res_x,$res_y,$res_z,
2707    $in1_x,$in1_y,$in1_z,
2708    $in2_x,$in2_y,
2709    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
2710my $Z1sqr = $S2;
2711# above map() describes stack layout with 15 temporary
2712# 256-bit vectors on top. Then we reserve some space for
2713# !in1infty and !in2infty.
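# Since in2 is affine, in2_z == 1 and U1/S1 degenerate to in1_x/in1_y,
# so compared to ecp_nistz256_point_add above only
#	U2 = in2_x*in1_z^2,  S2 = in2_y*in1_z^3,
#	H  = U2 - in1_x,     R  = S2 - in1_y
# have to be computed.  There is no doubling shortcut in this path, and
# when the conditional moves at the end have to return in2, the missing
# in2_z is taken from .Lone_mont_vis3, i.e. 1 in Montgomery form.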
2714
2715$code.=<<___;
2716.align	32
2717ecp_nistz256_point_add_affine_vis3:
2718	save	%sp,-STACK64_FRAME-32*15-32,%sp
2719
2720	mov	$rp,$rp_real
2721	mov	-1,$minus1
2722	mov	-2,$poly3
2723	sllx	$minus1,32,$poly1		! 0xFFFFFFFF00000000
2724	srl	$poly3,0,$poly3			! 0x00000000FFFFFFFE
2725
2726	! convert input to uint64_t[4]
2727	ld	[$bp],$a0			! in2_x
2728	ld	[$bp+4],$t0
2729	ld	[$bp+8],$a1
2730	ld	[$bp+12],$t1
2731	ld	[$bp+16],$a2
2732	ld	[$bp+20],$t2
2733	ld	[$bp+24],$a3
2734	ld	[$bp+28],$t3
2735	sllx	$t0,32,$t0
2736	sllx	$t1,32,$t1
2737	ld	[$bp+32],$acc0			! in2_y
2738	or	$a0,$t0,$a0
2739	ld	[$bp+32+4],$t0
2740	sllx	$t2,32,$t2
2741	ld	[$bp+32+8],$acc1
2742	or	$a1,$t1,$a1
2743	ld	[$bp+32+12],$t1
2744	sllx	$t3,32,$t3
2745	ld	[$bp+32+16],$acc2
2746	or	$a2,$t2,$a2
2747	ld	[$bp+32+20],$t2
2748	or	$a3,$t3,$a3
2749	ld	[$bp+32+24],$acc3
2750	sllx	$t0,32,$t0
2751	ld	[$bp+32+28],$t3
2752	sllx	$t1,32,$t1
2753	stx	$a0,[%sp+LOCALS64+$in2_x]
2754	sllx	$t2,32,$t2
2755	stx	$a1,[%sp+LOCALS64+$in2_x+8]
2756	sllx	$t3,32,$t3
2757	stx	$a2,[%sp+LOCALS64+$in2_x+16]
2758	or	$acc0,$t0,$acc0
2759	stx	$a3,[%sp+LOCALS64+$in2_x+24]
2760	or	$acc1,$t1,$acc1
2761	stx	$acc0,[%sp+LOCALS64+$in2_y]
2762	or	$acc2,$t2,$acc2
2763	stx	$acc1,[%sp+LOCALS64+$in2_y+8]
2764	or	$acc3,$t3,$acc3
2765	stx	$acc2,[%sp+LOCALS64+$in2_y+16]
2766	stx	$acc3,[%sp+LOCALS64+$in2_y+24]
2767
2768	or	$a1,$a0,$a0
2769	or	$a3,$a2,$a2
2770	or	$acc1,$acc0,$acc0
2771	or	$acc3,$acc2,$acc2
2772	or	$a2,$a0,$a0
2773	or	$acc2,$acc0,$acc0
2774	or	$acc0,$a0,$a0
2775	movrnz	$a0,-1,$a0			! !in2infty
2776	stx	$a0,[%fp+STACK_BIAS-8]
2777
2778	ld	[$ap],$a0			! in1_x
2779	ld	[$ap+4],$t0
2780	ld	[$ap+8],$a1
2781	ld	[$ap+12],$t1
2782	ld	[$ap+16],$a2
2783	ld	[$ap+20],$t2
2784	ld	[$ap+24],$a3
2785	ld	[$ap+28],$t3
2786	sllx	$t0,32,$t0
2787	sllx	$t1,32,$t1
2788	ld	[$ap+32],$acc0			! in1_y
2789	or	$a0,$t0,$a0
2790	ld	[$ap+32+4],$t0
2791	sllx	$t2,32,$t2
2792	ld	[$ap+32+8],$acc1
2793	or	$a1,$t1,$a1
2794	ld	[$ap+32+12],$t1
2795	sllx	$t3,32,$t3
2796	ld	[$ap+32+16],$acc2
2797	or	$a2,$t2,$a2
2798	ld	[$ap+32+20],$t2
2799	or	$a3,$t3,$a3
2800	ld	[$ap+32+24],$acc3
2801	sllx	$t0,32,$t0
2802	ld	[$ap+32+28],$t3
2803	sllx	$t1,32,$t1
2804	stx	$a0,[%sp+LOCALS64+$in1_x]
2805	sllx	$t2,32,$t2
2806	stx	$a1,[%sp+LOCALS64+$in1_x+8]
2807	sllx	$t3,32,$t3
2808	stx	$a2,[%sp+LOCALS64+$in1_x+16]
2809	or	$acc0,$t0,$acc0
2810	stx	$a3,[%sp+LOCALS64+$in1_x+24]
2811	or	$acc1,$t1,$acc1
2812	stx	$acc0,[%sp+LOCALS64+$in1_y]
2813	or	$acc2,$t2,$acc2
2814	stx	$acc1,[%sp+LOCALS64+$in1_y+8]
2815	or	$acc3,$t3,$acc3
2816	stx	$acc2,[%sp+LOCALS64+$in1_y+16]
2817	stx	$acc3,[%sp+LOCALS64+$in1_y+24]
2818
2819	ld	[$ap+64],$a0			! in1_z
2820	ld	[$ap+64+4],$t0
2821	ld	[$ap+64+8],$a1
2822	ld	[$ap+64+12],$t1
2823	ld	[$ap+64+16],$a2
2824	ld	[$ap+64+20],$t2
2825	ld	[$ap+64+24],$a3
2826	ld	[$ap+64+28],$t3
2827	sllx	$t0,32,$t0
2828	sllx	$t1,32,$t1
2829	or	$a0,$t0,$a0
2830	sllx	$t2,32,$t2
2831	or	$a1,$t1,$a1
2832	sllx	$t3,32,$t3
2833	stx	$a0,[%sp+LOCALS64+$in1_z]
2834	or	$a2,$t2,$a2
2835	stx	$a1,[%sp+LOCALS64+$in1_z+8]
2836	or	$a3,$t3,$a3
2837	stx	$a2,[%sp+LOCALS64+$in1_z+16]
2838	stx	$a3,[%sp+LOCALS64+$in1_z+24]
2839
2840	or	$a1,$a0,$t0
2841	or	$a3,$a2,$t2
2842	or	$t2,$t0,$t0
2843	movrnz	$t0,-1,$t0			! !in1infty
2844	stx	$t0,[%fp+STACK_BIAS-16]
2845
2846	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Z1sqr, in1_z);
2847	add	%sp,LOCALS64+$Z1sqr,$rp
2848
2849	ldx	[%sp+LOCALS64+$in2_x],$bi
2850	mov	$acc0,$a0
2851	mov	$acc1,$a1
2852	mov	$acc2,$a2
2853	mov	$acc3,$a3
2854	add	%sp,LOCALS64+$in2_x,$bp
2855	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(U2, Z1sqr, in2_x);
2856	add	%sp,LOCALS64+$U2,$rp
2857
2858	ldx	[%sp+LOCALS64+$Z1sqr],$bi	! forward load
2859	ldx	[%sp+LOCALS64+$in1_z],$a0
2860	ldx	[%sp+LOCALS64+$in1_z+8],$a1
2861	ldx	[%sp+LOCALS64+$in1_z+16],$a2
2862	ldx	[%sp+LOCALS64+$in1_z+24],$a3
2863
2864	add	%sp,LOCALS64+$in1_x,$bp
2865	call	__ecp_nistz256_sub_from_vis3	! p256_sub(H, U2, in1_x);
2866	add	%sp,LOCALS64+$H,$rp
2867
2868	add	%sp,LOCALS64+$Z1sqr,$bp
2869	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, Z1sqr, in1_z);
2870	add	%sp,LOCALS64+$S2,$rp
2871
2872	ldx	[%sp+LOCALS64+$H],$bi
2873	ldx	[%sp+LOCALS64+$in1_z],$a0
2874	ldx	[%sp+LOCALS64+$in1_z+8],$a1
2875	ldx	[%sp+LOCALS64+$in1_z+16],$a2
2876	ldx	[%sp+LOCALS64+$in1_z+24],$a3
2877	add	%sp,LOCALS64+$H,$bp
2878	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(res_z, H, in1_z);
2879	add	%sp,LOCALS64+$res_z,$rp
2880
2881	ldx	[%sp+LOCALS64+$S2],$bi
2882	ldx	[%sp+LOCALS64+$in2_y],$a0
2883	ldx	[%sp+LOCALS64+$in2_y+8],$a1
2884	ldx	[%sp+LOCALS64+$in2_y+16],$a2
2885	ldx	[%sp+LOCALS64+$in2_y+24],$a3
2886	add	%sp,LOCALS64+$S2,$bp
2887	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, S2, in2_y);
2888	add	%sp,LOCALS64+$S2,$rp
2889
2890	ldx	[%sp+LOCALS64+$H],$a0		! forward load
2891	ldx	[%sp+LOCALS64+$H+8],$a1
2892	ldx	[%sp+LOCALS64+$H+16],$a2
2893	ldx	[%sp+LOCALS64+$H+24],$a3
2894
2895	add	%sp,LOCALS64+$in1_y,$bp
2896	call	__ecp_nistz256_sub_from_vis3	! p256_sub(R, S2, in1_y);
2897	add	%sp,LOCALS64+$R,$rp
2898
2899	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Hsqr, H);
2900	add	%sp,LOCALS64+$Hsqr,$rp
2901
2902	ldx	[%sp+LOCALS64+$R],$a0
2903	ldx	[%sp+LOCALS64+$R+8],$a1
2904	ldx	[%sp+LOCALS64+$R+16],$a2
2905	ldx	[%sp+LOCALS64+$R+24],$a3
2906	call	__ecp_nistz256_sqr_mont_vis3	! p256_sqr_mont(Rsqr, R);
2907	add	%sp,LOCALS64+$Rsqr,$rp
2908
2909	ldx	[%sp+LOCALS64+$H],$bi
2910	ldx	[%sp+LOCALS64+$Hsqr],$a0
2911	ldx	[%sp+LOCALS64+$Hsqr+8],$a1
2912	ldx	[%sp+LOCALS64+$Hsqr+16],$a2
2913	ldx	[%sp+LOCALS64+$Hsqr+24],$a3
2914	add	%sp,LOCALS64+$H,$bp
2915	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(Hcub, Hsqr, H);
2916	add	%sp,LOCALS64+$Hcub,$rp
2917
2918	ldx	[%sp+LOCALS64+$Hsqr],$bi
2919	ldx	[%sp+LOCALS64+$in1_x],$a0
2920	ldx	[%sp+LOCALS64+$in1_x+8],$a1
2921	ldx	[%sp+LOCALS64+$in1_x+16],$a2
2922	ldx	[%sp+LOCALS64+$in1_x+24],$a3
2923	add	%sp,LOCALS64+$Hsqr,$bp
2924	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(U2, in1_x, Hsqr);
2925	add	%sp,LOCALS64+$U2,$rp
2926
2927	call	__ecp_nistz256_mul_by_2_vis3	! p256_mul_by_2(Hsqr, U2);
2928	add	%sp,LOCALS64+$Hsqr,$rp
2929
2930	add	%sp,LOCALS64+$Rsqr,$bp
2931	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(res_x, Rsqr, Hsqr);
2932	add	%sp,LOCALS64+$res_x,$rp
2933
2934	add	%sp,LOCALS64+$Hcub,$bp
2935	call	__ecp_nistz256_sub_from_vis3	!  p256_sub(res_x, res_x, Hcub);
2936	add	%sp,LOCALS64+$res_x,$rp
2937
2938	ldx	[%sp+LOCALS64+$Hcub],$bi	! forward load
2939	ldx	[%sp+LOCALS64+$in1_y],$a0
2940	ldx	[%sp+LOCALS64+$in1_y+8],$a1
2941	ldx	[%sp+LOCALS64+$in1_y+16],$a2
2942	ldx	[%sp+LOCALS64+$in1_y+24],$a3
2943
2944	add	%sp,LOCALS64+$U2,$bp
2945	call	__ecp_nistz256_sub_morf_vis3	! p256_sub(res_y, U2, res_x);
2946	add	%sp,LOCALS64+$res_y,$rp
2947
2948	add	%sp,LOCALS64+$Hcub,$bp
2949	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(S2, in1_y, Hcub);
2950	add	%sp,LOCALS64+$S2,$rp
2951
2952	ldx	[%sp+LOCALS64+$R],$bi
2953	ldx	[%sp+LOCALS64+$res_y],$a0
2954	ldx	[%sp+LOCALS64+$res_y+8],$a1
2955	ldx	[%sp+LOCALS64+$res_y+16],$a2
2956	ldx	[%sp+LOCALS64+$res_y+24],$a3
2957	add	%sp,LOCALS64+$R,$bp
2958	call	__ecp_nistz256_mul_mont_vis3	! p256_mul_mont(res_y, res_y, R);
2959	add	%sp,LOCALS64+$res_y,$rp
2960
2961	add	%sp,LOCALS64+$S2,$bp
2962	call	__ecp_nistz256_sub_from_vis3	! p256_sub(res_y, res_y, S2);
2963	add	%sp,LOCALS64+$res_y,$rp
2964
2965	ldx	[%fp+STACK_BIAS-16],$t1		! !in1infty
2966	ldx	[%fp+STACK_BIAS-8],$t2		! !in2infty
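	! load the address of .Lone_mont_vis3 PC-relatively: the call below
	! deposits its own address (label 1) in %o7, and the assembly-time
	! displacement .Lone_mont_vis3-1b turns it into a pointer in $bp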
29671:	call	.+8
2968	add	%o7,.Lone_mont_vis3-1b,$bp
2969___
2970for($i=0;$i<64;$i+=16) {			# conditional moves
2971$code.=<<___;
2972	ldx	[%sp+LOCALS64+$res_x+$i],$acc0	! res
2973	ldx	[%sp+LOCALS64+$res_x+$i+8],$acc1
2974	ldx	[%sp+LOCALS64+$in2_x+$i],$acc2	! in2
2975	ldx	[%sp+LOCALS64+$in2_x+$i+8],$acc3
2976	ldx	[%sp+LOCALS64+$in1_x+$i],$acc4	! in1
2977	ldx	[%sp+LOCALS64+$in1_x+$i+8],$acc5
2978	movrz	$t1,$acc2,$acc0
2979	movrz	$t1,$acc3,$acc1
2980	movrz	$t2,$acc4,$acc0
2981	movrz	$t2,$acc5,$acc1
2982	srlx	$acc0,32,$acc2
2983	srlx	$acc1,32,$acc3
2984	st	$acc0,[$rp_real+$i]
2985	st	$acc2,[$rp_real+$i+4]
2986	st	$acc1,[$rp_real+$i+8]
2987	st	$acc3,[$rp_real+$i+12]
2988___
2989}
2990for(;$i<96;$i+=16) {
2991$code.=<<___;
2992	ldx	[%sp+LOCALS64+$res_x+$i],$acc0	! res
2993	ldx	[%sp+LOCALS64+$res_x+$i+8],$acc1
2994	ldx	[$bp+$i-64],$acc2		! "in2"
2995	ldx	[$bp+$i-64+8],$acc3
2996	ldx	[%sp+LOCALS64+$in1_x+$i],$acc4	! in1
2997	ldx	[%sp+LOCALS64+$in1_x+$i+8],$acc5
2998	movrz	$t1,$acc2,$acc0
2999	movrz	$t1,$acc3,$acc1
3000	movrz	$t2,$acc4,$acc0
3001	movrz	$t2,$acc5,$acc1
3002	srlx	$acc0,32,$acc2
3003	srlx	$acc1,32,$acc3
3004	st	$acc0,[$rp_real+$i]
3005	st	$acc2,[$rp_real+$i+4]
3006	st	$acc1,[$rp_real+$i+8]
3007	st	$acc3,[$rp_real+$i+12]
3008___
3009}
3010$code.=<<___;
3011	ret
3012	restore
3013.type	ecp_nistz256_point_add_affine_vis3,#function
3014.size	ecp_nistz256_point_add_affine_vis3,.-ecp_nistz256_point_add_affine_vis3
3015.align	64
3016.Lone_mont_vis3:
3017.long	0x00000000,0x00000001, 0xffffffff,0x00000000
3018.long	0xffffffff,0xffffffff, 0x00000000,0xfffffffe
3019.align	64
3020___
3021}								}}}
3022
# The purpose of this subroutine is to encode VIS3 instructions
# explicitly, so that the module can be assembled without specifying
# VIS extensions on the compiler command line, e.g. -xarch=v9 vs.
# -xarch=v9a.  The idea is to preserve the option of producing a
# "universal" binary in which the program detects at run-time whether
# the current CPU is VIS3-capable.
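# For example, given the opcode table below,
#	unvis3("addxccc","%o0","%o1","%o2")
# should return (modulo whitespace)
#	.word	0x95b20269 !addxccc %o0,%o1,%o2
# i.e. the VIS3 instruction is emitted as raw data that any assembler
# accepts, with the original mnemonic preserved as a comment.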
3028sub unvis3 {
3029my ($mnemonic,$rs1,$rs2,$rd)=@_;
3030my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
3031my ($ref,$opf);
3032my %visopf = (	"addxc"		=> 0x011,
3033		"addxccc"	=> 0x013,
3034		"umulxhi"	=> 0x016	);
3035
3036    $ref = "$mnemonic\t$rs1,$rs2,$rd";
3037
3038    if ($opf=$visopf{$mnemonic}) {
3039	foreach ($rs1,$rs2,$rd) {
3040	    return $ref if (!/%([goli])([0-9])/);
3041	    $_=$bias{$1}+$2;
3042	}
3043
3044	return	sprintf ".word\t0x%08x !%s",
3045			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
3046			$ref;
3047    } else {
3048	return $ref;
3049    }
3050}
3051
3052foreach (split("\n",$code)) {
3053	s/\`([^\`]*)\`/eval $1/ge;
3054
3055	s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
3056		&unvis3($1,$2,$3,$4)
3057	 /ge;
3058
3059	print $_,"\n";
3060}
3061
3062close STDOUT or die "error closing STDOUT: $!";
3063