#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for PPC64.
#
# August 2016.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# POWER7		+260-530%
# POWER8		+220-340%

$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;

my $sp="r1";

{
my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
    $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
    map("r$_",(3..12,22..31));

my ($acc6,$acc7)=($bp,$bi);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
.machine	"any"
.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
# 64*16*37-1 is because $#arr returns the last valid index of @arr, not
# the number of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.type	ecp_nistz256_precomputed,\@object
.globl	ecp_nistz256_precomputed
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE into individual bytes stored
# at 64-byte intervals, similar to
#	1111222233334444
#	1234123412341234
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
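
# A minimal sketch of the byte transposition above, on a toy table of four
# 4-byte "points" (an illustration only; ECP_NISTZ256_DEMO is a
# hypothetical knob, so nothing runs during a normal build):
if ($ENV{ECP_NISTZ256_DEMO}) {
	my @points = ('1111','2222','3333','4444');
	my $demo = '';
	for my $i (0..3) {				# byte position within a point
		$demo .= substr($_,$i,1) for @points;	# that byte from every point
	}
	print STDERR "$demo\n";	# prints "1234123412341234", cf. comment above
}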

$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.asciz	"ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"

# void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
#					     const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.align	5
ecp_nistz256_mul_mont:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r22,48($sp)
	std	r23,56($sp)
	std	r24,64($sp)
	std	r25,72($sp)
	std	r26,80($sp)
	std	r27,88($sp)
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$a0,0($ap)
	ld	$bi,0($bp)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_mul_mont

	mtlr	r0
	ld	r22,48($sp)
	ld	r23,56($sp)
	ld	r24,64($sp)
	ld	r25,72($sp)
	ld	r26,80($sp)
	ld	r27,88($sp)
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,3,0
	.long	0
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

# void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.align	4
ecp_nistz256_sqr_mont:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r22,48($sp)
	std	r23,56($sp)
	std	r24,64($sp)
	std	r25,72($sp)
	std	r26,80($sp)
	std	r27,88($sp)
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sqr_mont

	mtlr	r0
	ld	r22,48($sp)
	ld	r23,56($sp)
	ld	r24,64($sp)
	ld	r25,72($sp)
	ld	r26,80($sp)
	ld	r27,88($sp)
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,2,0
	.long	0
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

# void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
#					const BN_ULONG x2[4]);
.globl	ecp_nistz256_add
.align	4
ecp_nistz256_add:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$t0,  0($bp)
	ld	$acc1,8($ap)
	ld	$t1,  8($bp)
	ld	$acc2,16($ap)
	ld	$t2,  16($bp)
	ld	$acc3,24($ap)
	ld	$t3,  24($bp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_add,.-ecp_nistz256_add

# void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2
.align	4
ecp_nistz256_div_by_2:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_div_by_2

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

# void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2
.align	4
ecp_nistz256_mul_by_2:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add	# ret = a+a	// 2*a

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

# void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3
.align	4
ecp_nistz256_mul_by_3:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	mr	$t0,$acc0
	std	$acc0,64($sp)
	mr	$t1,$acc1
	std	$acc1,72($sp)
	mr	$t2,$acc2
	std	$acc2,80($sp)
	mr	$t3,$acc3
	std	$acc3,88($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add	# ret = a+a	// 2*a

	ld	$t0,64($sp)
	ld	$t1,72($sp)
	ld	$t2,80($sp)
	ld	$t3,88($sp)

	bl	__ecp_nistz256_add	# ret += a	// 2*a+a=3*a

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

# void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
#				        const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub
.align	4
ecp_nistz256_sub:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sub_from

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

# void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.align	4
ecp_nistz256_neg:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	mr	$bp,$ap
	li	$acc0,0
	li	$acc1,0
	li	$acc2,0
	li	$acc3,0

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sub_from

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

# note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
# to $a0-$a3 and b[0] to $bi
.type	__ecp_nistz256_mul_mont,\@function
.align	4
__ecp_nistz256_mul_mont:
	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi

	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi

	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi

	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$t3,$a3,$bi
	ld	$bi,8($bp)		# b[1]

	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	 sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	 srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$t2
	addze	$acc4,$t3
	li	$acc5,0
___
for($i=1;$i<4;$i++) {
	################################################################
	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#            ffff0001.00000000.0000ffff.ffffffff
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or marking redundant operations:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	# - 0000abcd.efgh0000.--------.--------.--------

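	# Optional self-check of the identity above: a sketch, not part of
	# the generated code (ECP_NISTZ256_SELFCHECK is a hypothetical
	# knob). With P the modulus, it verifies for a 64-bit digit x that
	# x*(P+1) == (x<<96) + ((x*0xffffffff00000001)<<192), which is
	# exactly what the add/sub sequence emitted below computes.
	if ($ENV{ECP_NISTZ256_SELFCHECK}) {
		require Math::BigInt;
		my $P = Math::BigInt->from_hex("ffffffff00000001".
		                               "0000000000000000".
		                               "00000000ffffffff".
		                               "ffffffffffffffff");
		my $x = Math::BigInt->from_hex("0123456789abcdef");	# any digit
		my $hi = Math::BigInt->from_hex("ffffffff00000001");
		die "reduction identity broken"
		    if $x*($P+1) != ($x<<96) + (($x*$hi)<<192);
	}
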
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	adde	$acc3,$acc4,$t3
	addze	$acc4,$acc5

	mulld	$t0,$a0,$bi		# lo(a[0]*b[i])
	mulld	$t1,$a1,$bi		# lo(a[1]*b[i])
	mulld	$t2,$a2,$bi		# lo(a[2]*b[i])
	mulld	$t3,$a3,$bi		# lo(a[3]*b[i])
	addc	$acc0,$acc0,$t0		# accumulate low parts of multiplication
	 mulhdu	$t0,$a0,$bi		# hi(a[0]*b[i])
	adde	$acc1,$acc1,$t1
	 mulhdu	$t1,$a1,$bi		# hi(a[1]*b[i])
	adde	$acc2,$acc2,$t2
	 mulhdu	$t2,$a2,$bi		# hi(a[2]*b[i])
	adde	$acc3,$acc3,$t3
	 mulhdu	$t3,$a3,$bi		# hi(a[3]*b[i])
	addze	$acc4,$acc4
___
$code.=<<___	if ($i<3);
	ld	$bi,8*($i+1)($bp)	# b[$i+1]
___
$code.=<<___;
	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	 sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	 srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	li	$acc5,0
	addze	$acc5,$acc5
___
}
$code.=<<___;
	# last reduction
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	adde	$acc3,$acc4,$t3
	addze	$acc4,$acc5

	li	$t2,0
	addic	$acc0,$acc0,1		# ret -= modulus
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$acc4,$t2,$acc4

	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
	and	$t1,$poly1,$acc4
	and	$t3,$poly3,$acc4
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
# note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
# to $a0-$a3
.type	__ecp_nistz256_sqr_mont,\@function
.align	4
__ecp_nistz256_sqr_mont:
	################################################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
574	#  "can't overflow" below mark carrying into high part of
575	#  multiplication result, which can't overflow, because it
576	#  can never be all ones.
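	#
	#  In formula form, with B = 2^64 (added for reference):
	#
	#    (a3*B^3 + a2*B^2 + a1*B + a0)^2 =
	#          a0^2 + a1^2*B^2 + a2^2*B^4 + a3^2*B^6
	#        + 2*(a1*a0*B + a2*a0*B^2 + (a3*a0+a2*a1)*B^3
	#             + a3*a1*B^4 + a3*a2*B^5)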

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	 mulld	$t0,$a2,$a1		# a[2]*a[1]
	 mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	 mulld	$t2,$a3,$a1		# a[3]*a[1]
	 mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	adde	$acc4,$acc4,$t1
	adde	$acc5,$acc5,$t2
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	adde	$acc2,$acc2,$acc2
	adde	$acc3,$acc3,$acc3
	adde	$acc4,$acc4,$acc4
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	li	$acc7,0
	addze	$acc7,$acc7

	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	mulhdu	$a0,$a0,$a0
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	mulhdu	$a1,$a1,$a1
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	mulhdu	$a2,$a2,$a2
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	mulhdu	$a3,$a3,$a3
	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	 sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	 srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {			# reductions, see commentary in
					# multiplication for details
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	 sldi	$t0,$acc0,32
	adde	$acc1,$acc2,$t1
	 srdi	$t1,$acc0,32
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	addze	$acc3,$t3		# can't overflow
___
}
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	addze	$acc3,$t3		# can't overflow

	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	li	$t2,0
	addze	$acc4,$t2

	addic	$acc0,$acc0,1		# ret -= modulus
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$acc4,$t2,$acc4

	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
	and	$t1,$poly1,$acc4
	and	$t3,$poly3,$acc4
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

# Note that __ecp_nistz256_add expects both input vectors pre-loaded to
# $a0-$a3 and $t0-$t3. This is done because it's used in multiple
# contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add,\@function
.align	4
__ecp_nistz256_add:
	addc	$acc0,$acc0,$t0		# ret = a+b
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$t2
	li	$t2,0
	adde	$acc3,$acc3,$t3
	addze	$t0,$t2

	# if a+b >= modulus, subtract modulus
	#
	# But since comparison implies subtraction, we subtract
	# modulus and then add it back if subtraction borrowed.

	subic	$acc0,$acc0,-1
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$t0,$t2,$t0

	addc	$acc0,$acc0,$t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,\@function
.align	4
__ecp_nistz256_sub_from:
	ld	$t0,0($bp)
	ld	$t1,8($bp)
	ld	$t2,16($bp)
	ld	$t3,24($bp)
	subfc	$acc0,$t0,$acc0		# ret = a-b
	subfe	$acc1,$t1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$t3,$acc3
	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0

	# if a-b borrowed, add modulus

	addc	$acc0,$acc0,$t0		# ret += modulus & t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,\@function
.align	4
__ecp_nistz256_sub_morf:
	ld	$t0,0($bp)
	ld	$t1,8($bp)
	ld	$t2,16($bp)
	ld	$t3,24($bp)
	subfc	$acc0,$acc0,$t0		# ret = b-a
	subfe	$acc1,$acc1,$t1
	subfe	$acc2,$acc2,$t2
	subfe	$acc3,$acc3,$t3
	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0

	# if b-a borrowed, add modulus

	addc	$acc0,$acc0,$t0		# ret += modulus & t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_div_by_2,\@function
.align	4
__ecp_nistz256_div_by_2:
	andi.	$t0,$acc0,1
	addic	$acc0,$acc0,-1		# a += modulus
	 neg	$t0,$t0
	adde	$acc1,$acc1,$poly1
	 not	$t0,$t0
	addze	$acc2,$acc2
	 li	$t2,0
	adde	$acc3,$acc3,$poly3
	 and	$t1,$poly1,$t0
	addze	$ap,$t2			# ap = carry
	 and	$t3,$poly3,$t0

	subfc	$acc0,$t0,$acc0		# a -= modulus if a was even
	subfe	$acc1,$t1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$t3,$acc3
	subfe	$ap,  $t2,$ap

	srdi	$acc0,$acc0,1
	sldi	$t0,$acc1,63
	srdi	$acc1,$acc1,1
	sldi	$t1,$acc2,63
	srdi	$acc2,$acc2,1
	sldi	$t2,$acc3,63
	srdi	$acc3,$acc3,1
	sldi	$t3,$ap,63
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# The following subroutines are "literal" implementations of those
# found in ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
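# For reference, the code below evaluates the standard Jacobian doubling
#	S = 4*X*Y^2, M = 3*(X+Z^2)*(X-Z^2),
#	X' = M^2 - 2*S, Y' = M*(S-X') - 8*Y^4, Z' = 2*Y*Z
# keeping the intermediate values in the S, M, Zsqr and tmp0 stack slots.
#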
if (1) {
my $FRAME=64+32*4+12*8;
my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("r$_",(20,21));

$code.=<<___;
.globl	ecp_nistz256_point_double
.align	5
ecp_nistz256_point_double:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
.Ldouble_shortcut:
	ld	$acc0,32($ap)
	ld	$acc1,40($ap)
	ld	$acc2,48($ap)
	ld	$acc3,56($ap)
	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	 ld	$a0,64($ap)		# forward load for p256_sqr_mont
	 ld	$a1,72($ap)
	 ld	$a2,80($ap)
	 ld	$a3,88($ap)
	 mr	$rp_real,$rp
	 mr	$ap_real,$ap
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_add	# p256_mul_by_2(S, in_y);

	addi	$rp,$sp,$Zsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Zsqr, in_z);

	ld	$t0,0($ap_real)
	ld	$t1,8($ap_real)
	ld	$t2,16($ap_real)
	ld	$t3,24($ap_real)
	mr	$a0,$acc0		# put Zsqr aside for p256_sub
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_add	# p256_add(M, Zsqr, in_x);

	addi	$bp,$ap_real,0
	mr	$acc0,$a0		# restore Zsqr
	mr	$acc1,$a1
	mr	$acc2,$a2
	mr	$acc3,$a3
	 ld	$a0,$S+0($sp)		# forward load for p256_sqr_mont
	 ld	$a1,$S+8($sp)
	 ld	$a2,$S+16($sp)
	 ld	$a3,$S+24($sp)
	addi	$rp,$sp,$Zsqr
	bl	__ecp_nistz256_sub_morf	# p256_sub(Zsqr, in_x, Zsqr);

	addi	$rp,$sp,$S
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(S, S);

	ld	$bi,32($ap_real)
	ld	$a0,64($ap_real)
	ld	$a1,72($ap_real)
	ld	$a2,80($ap_real)
	ld	$a3,88($ap_real)
	addi	$bp,$ap_real,32
	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(tmp0, in_z, in_y);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	 ld	$a0,$S+0($sp)		# forward load for p256_sqr_mont
	 ld	$a1,$S+8($sp)
	 ld	$a2,$S+16($sp)
	 ld	$a3,$S+24($sp)
	addi	$rp,$rp_real,64
	bl	__ecp_nistz256_add	# p256_mul_by_2(res_z, tmp0);

	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(tmp0, S);

	 ld	$bi,$Zsqr($sp)		# forward load for p256_mul_mont
	 ld	$a0,$M+0($sp)
	 ld	$a1,$M+8($sp)
	 ld	$a2,$M+16($sp)
	 ld	$a3,$M+24($sp)
	addi	$rp,$rp_real,32
	bl	__ecp_nistz256_div_by_2	# p256_div_by_2(res_y, tmp0);

	addi	$bp,$sp,$Zsqr
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(M, M, Zsqr);

	mr	$t0,$acc0		# duplicate M
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	mr	$a0,$acc0		# put M aside
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_add
	mr	$t0,$a0			# restore M
	mr	$t1,$a1
	mr	$t2,$a2
	mr	$t3,$a3
	 ld	$bi,0($ap_real)		# forward load for p256_mul_mont
	 ld	$a0,$S+0($sp)
	 ld	$a1,$S+8($sp)
	 ld	$a2,$S+16($sp)
	 ld	$a3,$S+24($sp)
	bl	__ecp_nistz256_add	# p256_mul_by_3(M, M);

	addi	$bp,$ap_real,0
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S, S, in_x);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	 ld	$a0,$M+0($sp)		# forward load for p256_sqr_mont
	 ld	$a1,$M+8($sp)
	 ld	$a2,$M+16($sp)
	 ld	$a3,$M+24($sp)
	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_add	# p256_mul_by_2(tmp0, S);

	addi	$rp,$rp_real,0
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(res_x, M);

	addi	$bp,$sp,$tmp0
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, tmp0);

	addi	$bp,$sp,$S
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_sub_morf	# p256_sub(S, S, res_x);

	ld	$bi,$M($sp)
	mr	$a0,$acc0		# copy S
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$bp,$sp,$M
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S, S, M);

	addi	$bp,$rp_real,32
	addi	$rp,$rp_real,32
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, S, res_y);

	mtlr	r0
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,2,0
	.long	0
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
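# For reference, the code below evaluates the standard Jacobian addition
#	U1 = X1*Z2^2, U2 = X2*Z1^2, S1 = Y1*Z2^3, S2 = Y2*Z1^3,
#	H = U2-U1, R = S2-S1, X3 = R^2 - H^3 - 2*U1*H^2,
#	Y3 = R*(U1*H^2 - X3) - S1*H^3, Z3 = Z1*Z2*H
# with infinity inputs handled by masked moves at the end and the
# doubling case diverted to .Ldouble_shortcut.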
if (1) {
my $FRAME = 64 + 32*12 + 16*8;
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

$code.=<<___;
.globl	ecp_nistz256_point_add
.align	5
ecp_nistz256_point_add:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r16,$FRAME-8*16($sp)
	std	r17,$FRAME-8*15($sp)
	std	r18,$FRAME-8*14($sp)
	std	r19,$FRAME-8*13($sp)
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	ld	$a0,64($bp)		# in2_z
	ld	$a1,72($bp)
	ld	$a2,80($bp)
	ld	$a3,88($bp)
	 mr	$rp_real,$rp
	 mr	$ap_real,$ap
	 mr	$bp_real,$bp
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in2infty,$t0,$t2
	neg	$t0,$in2infty
	or	$in2infty,$in2infty,$t0
	sradi	$in2infty,$in2infty,63	# !in2infty
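	# note that $in2infty is all-ones if in2_z is non-zero and all-zero
	# otherwise, i.e. despite its name it is really a "not infinity"
	# mask, consumed by the conditional moves at the end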
	addi	$rp,$sp,$Z2sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z2sqr, in2_z);

	ld	$a0,64($ap_real)	# in1_z
	ld	$a1,72($ap_real)
	ld	$a2,80($ap_real)
	ld	$a3,88($ap_real)
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in1infty,$t0,$t2
	neg	$t0,$in1infty
	or	$in1infty,$in1infty,$t0
	sradi	$in1infty,$in1infty,63	# !in1infty
	addi	$rp,$sp,$Z1sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);

	ld	$bi,64($bp_real)
	ld	$a0,$Z2sqr+0($sp)
	ld	$a1,$Z2sqr+8($sp)
	ld	$a2,$Z2sqr+16($sp)
	ld	$a3,$Z2sqr+24($sp)
	addi	$bp,$bp_real,64
	addi	$rp,$sp,$S1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, Z2sqr, in2_z);

	ld	$bi,64($ap_real)
	ld	$a0,$Z1sqr+0($sp)
	ld	$a1,$Z1sqr+8($sp)
	ld	$a2,$Z1sqr+16($sp)
	ld	$a3,$Z1sqr+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);

	ld	$bi,32($ap_real)
	ld	$a0,$S1+0($sp)
	ld	$a1,$S1+8($sp)
	ld	$a2,$S1+16($sp)
	ld	$a3,$S1+24($sp)
	addi	$bp,$ap_real,32
	addi	$rp,$sp,$S1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, S1, in1_y);

	ld	$bi,32($bp_real)
	ld	$a0,$S2+0($sp)
	ld	$a1,$S2+8($sp)
	ld	$a2,$S2+16($sp)
	ld	$a3,$S2+24($sp)
	addi	$bp,$bp_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);

	addi	$bp,$sp,$S1
	 ld	$bi,$Z2sqr($sp)		# forward load for p256_mul_mont
	 ld	$a0,0($ap_real)
	 ld	$a1,8($ap_real)
	 ld	$a2,16($ap_real)
	 ld	$a3,24($ap_real)
	addi	$rp,$sp,$R
	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, S1);

	or	$acc0,$acc0,$acc1	# see if result is zero
	or	$acc2,$acc2,$acc3
	or	$temp,$acc0,$acc2

	addi	$bp,$sp,$Z2sqr
	addi	$rp,$sp,$U1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U1, in1_x, Z2sqr);

	ld	$bi,$Z1sqr($sp)
	ld	$a0,0($bp_real)
	ld	$a1,8($bp_real)
	ld	$a2,16($bp_real)
	ld	$a3,24($bp_real)
	addi	$bp,$sp,$Z1sqr
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in2_x, Z1sqr);

	addi	$bp,$sp,$U1
	 ld	$a0,$R+0($sp)		# forward load for p256_sqr_mont
	 ld	$a1,$R+8($sp)
	 ld	$a2,$R+16($sp)
	 ld	$a3,$R+24($sp)
	addi	$rp,$sp,$H
	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, U1);

	or	$acc0,$acc0,$acc1	# see if result is zero
	or	$acc2,$acc2,$acc3
	or.	$acc0,$acc0,$acc2
	bne	.Ladd_proceed		# is_equal(U1,U2)?

	and.	$t0,$in1infty,$in2infty
	beq	.Ladd_proceed		# (in1infty || in2infty)?

	cmpldi	$temp,0
	beq	.Ladd_double		# is_equal(S1,S2)?

	xor	$a0,$a0,$a0
	std	$a0,0($rp_real)
	std	$a0,8($rp_real)
	std	$a0,16($rp_real)
	std	$a0,24($rp_real)
	std	$a0,32($rp_real)
	std	$a0,40($rp_real)
	std	$a0,48($rp_real)
	std	$a0,56($rp_real)
	std	$a0,64($rp_real)
	std	$a0,72($rp_real)
	std	$a0,80($rp_real)
	std	$a0,88($rp_real)
	b	.Ladd_done

.align	4
.Ladd_double:
	ld	$bp,0($sp)		# back-link
	mr	$ap,$ap_real
	mr	$rp,$rp_real
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	stdu	$bp,$FRAME-288($sp)	# difference in stack frame sizes
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	addi	$rp,$sp,$Rsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);

	ld	$bi,64($ap_real)
	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);

	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);

	ld	$bi,64($bp_real)
	ld	$a0,$res_z+0($sp)
	ld	$a1,$res_z+8($sp)
	ld	$a2,$res_z+16($sp)
	ld	$a3,$res_z+24($sp)
	addi	$bp,$bp_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, res_z, in2_z);

	ld	$bi,$H($sp)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$sp,$H
	addi	$rp,$sp,$Hcub
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);

	ld	$bi,$Hsqr($sp)
	ld	$a0,$U1+0($sp)
	ld	$a1,$U1+8($sp)
	ld	$a2,$U1+16($sp)
	ld	$a3,$U1+24($sp)
	addi	$bp,$sp,$Hsqr
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, U1, Hsqr);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);

	addi	$bp,$sp,$Rsqr
	addi	$rp,$sp,$res_x
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);

	addi	$bp,$sp,$Hcub
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, Hcub);

	addi	$bp,$sp,$U2
	 ld	$bi,$Hcub($sp)		# forward load for p256_mul_mont
	 ld	$a0,$S1+0($sp)
	 ld	$a1,$S1+8($sp)
	 ld	$a2,$S1+16($sp)
	 ld	$a3,$S1+24($sp)
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);

	addi	$bp,$sp,$Hcub
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S1, Hcub);

	ld	$bi,$R($sp)
	ld	$a0,$res_y+0($sp)
	ld	$a1,$res_y+8($sp)
	ld	$a2,$res_y+16($sp)
	ld	$a3,$res_y+24($sp)
	addi	$bp,$sp,$R
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);

	addi	$bp,$sp,$S2
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);

	ld	$t0,0($bp_real)		# in2
	ld	$t1,8($bp_real)
	ld	$t2,16($bp_real)
	ld	$t3,24($bp_real)
	ld	$a0,$res_x+0($sp)	# res
	ld	$a1,$res_x+8($sp)
	ld	$a2,$res_x+16($sp)
	ld	$a3,$res_x+24($sp)
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3

	ld	$t0,$i+32($bp_real)	# in2
	ld	$t1,$i+40($bp_real)
	ld	$t2,$i+48($bp_real)
	ld	$t3,$i+56($bp_real)
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)

.Ladd_done:
	mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
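# The flow mirrors ecp_nistz256_point_add, but in2 is affine, i.e. its
# Z coordinate is implicitly one: U1, S1 and Z2sqr degenerate to in1_x,
# in1_y and 1, and the Montgomery form of one is substituted for the
# missing in2_z when in1 is the point at infinity.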
if (1) {
my $FRAME = 64 + 32*10 + 16*8;
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.align	5
ecp_nistz256_point_add_affine:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r16,$FRAME-8*16($sp)
	std	r17,$FRAME-8*15($sp)
	std	r18,$FRAME-8*14($sp)
	std	r19,$FRAME-8*13($sp)
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	mr	$rp_real,$rp
	mr	$ap_real,$ap
	mr	$bp_real,$bp

	ld	$a0,64($ap)		# in1_z
	ld	$a1,72($ap)
	ld	$a2,80($ap)
	ld	$a3,88($ap)
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in1infty,$t0,$t2
	neg	$t0,$in1infty
	or	$in1infty,$in1infty,$t0
	sradi	$in1infty,$in1infty,63	# !in1infty

	ld	$acc0,0($bp)		# in2_x
	ld	$acc1,8($bp)
	ld	$acc2,16($bp)
	ld	$acc3,24($bp)
	ld	$t0,32($bp)		# in2_y
	ld	$t1,40($bp)
	ld	$t2,48($bp)
	ld	$t3,56($bp)
	or	$acc0,$acc0,$acc1
	or	$acc2,$acc2,$acc3
	or	$acc0,$acc0,$acc2
	or	$t0,$t0,$t1
	or	$t2,$t2,$t3
	or	$t0,$t0,$t2
	or	$in2infty,$acc0,$t0
	neg	$t0,$in2infty
	or	$in2infty,$in2infty,$t0
	sradi	$in2infty,$in2infty,63	# !in2infty

	addi	$rp,$sp,$Z1sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);

	mr	$a0,$acc0
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	ld	$bi,0($bp_real)
	addi	$bp,$bp_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, Z1sqr, in2_x);

	addi	$bp,$ap_real,0
	 ld	$bi,64($ap_real)	# forward load for p256_mul_mont
	 ld	$a0,$Z1sqr+0($sp)
	 ld	$a1,$Z1sqr+8($sp)
	 ld	$a2,$Z1sqr+16($sp)
	 ld	$a3,$Z1sqr+24($sp)
	addi	$rp,$sp,$H
	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, in1_x);

	addi	$bp,$ap_real,64
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);

	ld	$bi,64($ap_real)
	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);

	ld	$bi,32($bp_real)
	ld	$a0,$S2+0($sp)
	ld	$a1,$S2+8($sp)
	ld	$a2,$S2+16($sp)
	ld	$a3,$S2+24($sp)
	addi	$bp,$bp_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);

	addi	$bp,$ap_real,32
	 ld	$a0,$H+0($sp)		# forward load for p256_sqr_mont
	 ld	$a1,$H+8($sp)
	 ld	$a2,$H+16($sp)
	 ld	$a3,$H+24($sp)
	addi	$rp,$sp,$R
	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, in1_y);

	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);

	ld	$a0,$R+0($sp)
	ld	$a1,$R+8($sp)
	ld	$a2,$R+16($sp)
	ld	$a3,$R+24($sp)
	addi	$rp,$sp,$Rsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);

	ld	$bi,$H($sp)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$sp,$H
	addi	$rp,$sp,$Hcub
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);

	ld	$bi,0($ap_real)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$ap_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in1_x, Hsqr);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);

	addi	$bp,$sp,$Rsqr
	addi	$rp,$sp,$res_x
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);

	addi	$bp,$sp,$Hcub
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, Hcub);

	addi	$bp,$sp,$U2
	 ld	$bi,32($ap_real)	# forward load for p256_mul_mont
	 ld	$a0,$Hcub+0($sp)
	 ld	$a1,$Hcub+8($sp)
	 ld	$a2,$Hcub+16($sp)
	 ld	$a3,$Hcub+24($sp)
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);

	addi	$bp,$ap_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, in1_y, Hcub);

	ld	$bi,$R($sp)
	ld	$a0,$res_y+0($sp)
	ld	$a1,$res_y+8($sp)
	ld	$a2,$res_y+16($sp)
	ld	$a3,$res_y+24($sp)
	addi	$bp,$sp,$R
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);

	addi	$bp,$sp,$S2
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);

	ld	$t0,0($bp_real)		# in2
	ld	$t1,8($bp_real)
	ld	$t2,16($bp_real)
	ld	$t3,24($bp_real)
	ld	$a0,$res_x+0($sp)	# res
	ld	$a1,$res_x+8($sp)
	ld	$a2,$res_x+16($sp)
	ld	$a3,$res_x+24($sp)
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
___
$code.=<<___	if ($i==0);
	ld	$t0,32($bp_real)	# in2
	ld	$t1,40($bp_real)
	ld	$t2,48($bp_real)
	ld	$t3,56($bp_real)
___
$code.=<<___	if ($i==32);
	li	$t0,1			# Lone_mont
	not	$t1,$poly1
	li	$t2,-1
	not	$t3,$poly3
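	# ($t0,$t1,$t2,$t3) = 2^256 mod P, i.e. one in Montgomery
	# form, standing in for the affine point's implicit Z == 1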
___
$code.=<<___;
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)

	mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");

$code.=<<___;
########################################################################
# void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
#                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.align	5
ecp_nistz256_ord_mul_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	ld	$a0,0($ap)
	ld	$bi,0($bp)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0

	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi

	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi

	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi

	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$acc4,$a3,$bi

	mulld	$t4,$acc0,$ordk

	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	addze	$acc4,$acc4
	li	$acc5,0
___
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
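	#
	# Optional self-check of the "magic" digit multiplier used here
	# (a sketch; ECP_NISTZ256_SELFCHECK is a hypothetical knob): ordk
	# is -1/ord mod 2^64, so ord*ordk == -1 (mod 2^64) must hold.
	if ($ENV{ECP_NISTZ256_SELFCHECK}) {
		require Math::BigInt;
		my $ord = Math::BigInt->from_hex("ffffffff00000000".
		                                 "ffffffffffffffff".
		                                 "bce6faada7179e84".
		                                 "f3b9cac2fc632551");
		my $k = Math::BigInt->from_hex("ccd1c8aaee00bc4f");
		my $B = Math::BigInt->new(1)->blsft(64);
		die "ordk is not -1/ord mod 2^64" if ($ord*$k) % $B != $B-1;
	}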
$code.=<<___;
	ld	$bi,8*$i($bp)		# b[i]

	sldi	$t0,$t4,32
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	 mulld	$t0,$a0,$bi
	addze	$t3,$t3
	 mulld	$t1,$a1,$bi

	addc	$acc0,$acc1,$t2
	 mulld	$t2,$a2,$bi
	adde	$acc1,$acc2,$t3
	 mulld	$t3,$a3,$bi
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	addc	$acc0,$acc0,$t0		# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$a1,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$a2,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$a3,$bi
	addze	$acc4,$acc4
	mulld	$t4,$acc0,$ordk
	addc	$acc1,$acc1,$t0		# accumulate high parts
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	addze	$acc5,$zr
___
}
$code.=<<___;
	sldi	$t0,$t4,32		# last reduction
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$acc0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$acc4
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

################################################################################
# void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
#                                int rep);
.globl	ecp_nistz256_ord_sqr_mont
.align	5
ecp_nistz256_ord_sqr_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	mtctr	$bp

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0
	b	.Loop_ord_sqr

.align	5
.Loop_ord_sqr:
	################################################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
1982	#  "can't overflow" below mark carrying into high part of
1983	#  multiplication result, which can't overflow, because it
1984	#  can never be all ones.

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	 mulld	$t0,$a2,$a1		# a[2]*a[1]
	 mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	 mulld	$t2,$a3,$a1		# a[3]*a[1]
	 mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	 mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	 mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	 mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	 mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	 mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	 mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	 mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	 mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zr

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	 mulld	$t4,$acc0,$ordk
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$zr,$t4		# can't overflow
___
$code.=<<___	if ($i<3);
	mulld	$t3,$acc0,$ordk
___
$code.=<<___;
	sldi	$t0,$t4,32
	subfc	$acc1,$t4,$acc1
	srdi	$t1,$t4,32
	subfe	$acc2,$t0,$acc2
	subfe	$acc3,$t1,$acc3		# can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	addze	$acc4,$zr

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$a0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$a1,$acc1,$t1
	adde	$a2,$acc2,$acc4
	adde	$a3,$acc3,$t3

	bdnz	.Loop_ord_sqr

	std	$a0,0($rp)
	std	$a1,8($rp)
	std	$a2,16($rp)
	std	$a3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
}	}

########################################################################
# scatter-gather subroutines
{
my ($out,$inp,$index,$mask)=map("r$_",(3..7));
$code.=<<___;
########################################################################
# void	ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
#				int index);
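#
# The w5 table is stored transposed: each 256-bit coordinate is written
# out as eight 32-bit words at a 64-byte stride, so the k-th word of
# every table entry lands in the same 64-byte block (hence the index*4
# scaling below).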
.globl	ecp_nistz256_scatter_w5
.align	4
ecp_nistz256_scatter_w5:
	slwi	$index,$index,2
	add	$out,$out,$index

	ld	r8, 0($inp)		# X
	ld	r9, 8($inp)
	ld	r10,16($inp)
	ld	r11,24($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 32($inp)		# Y
	ld	r9, 40($inp)
	ld	r10,48($inp)
	ld	r11,56($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 64($inp)		# Z
	ld	r9, 72($inp)
	ld	r10,80($inp)
	ld	r11,88($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

########################################################################
# void	ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
#				int index);
.globl	ecp_nistz256_gather_w5
.align	4
ecp_nistz256_gather_w5:
	neg	r0,$index
	sradi	r0,r0,63
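	# r0 = index ? -1 : 0; index zero denotes the point at infinity,
	# in which case the masking below returns an all-zero point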

	add	$index,$index,r0
	slwi	$index,$index,2
	add	$inp,$inp,$index

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,0($out)		# X
	std	r6,8($out)
	std	r7,16($out)
	std	r8,24($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,32($out)		# Y
	std	r6,40($out)
	std	r7,48($out)
	std	r8,56($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,64($out)		# Z
	std	r6,72($out)
	std	r7,80($out)
	std	r8,88($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

########################################################################
# void	ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
#				int index);
.globl	ecp_nistz256_scatter_w7
.align	4
ecp_nistz256_scatter_w7:
	li	r0,8
	mtctr	r0
	add	$out,$out,$index
	subi	$inp,$inp,8

.Loop_scatter_w7:
	ldu	r0,8($inp)
	stb	r0,64*0($out)
	srdi	r0,r0,8
	stb	r0,64*1($out)
	srdi	r0,r0,8
	stb	r0,64*2($out)
	srdi	r0,r0,8
	stb	r0,64*3($out)
	srdi	r0,r0,8
	stb	r0,64*4($out)
	srdi	r0,r0,8
	stb	r0,64*5($out)
	srdi	r0,r0,8
	stb	r0,64*6($out)
	srdi	r0,r0,8
	stb	r0,64*7($out)
	addi	$out,$out,64*8
	bdnz	.Loop_scatter_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

########################################################################
# void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
#				int index);
.globl	ecp_nistz256_gather_w7
.align	4
ecp_nistz256_gather_w7:
	li	r0,8
	mtctr	r0
	neg	r0,$index
	sradi	r0,r0,63
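	# r0 = index ? -1 : 0, as in gather_w5: a zero index yields an
	# all-zero (infinity) result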

	add	$index,$index,r0
	add	$inp,$inp,$index
	subi	$out,$out,8

.Loop_gather_w7:
	lbz	r5, 64*0($inp)
	lbz	r6, 64*1($inp)
	lbz	r7, 64*2($inp)
	lbz	r8, 64*3($inp)
	lbz	r9, 64*4($inp)
	lbz	r10,64*5($inp)
	lbz	r11,64*6($inp)
	lbz	r12,64*7($inp)
	addi	$inp,$inp,64*8

	sldi	r6, r6, 8
	sldi	r7, r7, 16
	sldi	r8, r8, 24
	sldi	r9, r9, 32
	sldi	r10,r10,40
	sldi	r11,r11,48
	sldi	r12,r12,56

	or	r5,r5,r6
	or	r7,r7,r8
	or	r9,r9,r10
	or	r11,r11,r12
	or	r5,r5,r7
	or	r9,r9,r11
	or	r5,r5,r9
	and	r5,r5,r0
	stdu	r5,8($out)
	bdnz	.Loop_gather_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush
