#! /usr/bin/env perl
# Copyright 2018-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# X25519 lower-level primitives for PPC64.
#
# July 2018.
#
# Base 2^64 is faster than base 2^51 on pre-POWER8, most notably ~15%
# faster on PPC970/G5. POWER8, on the other hand, seems to trip on its
# own shoelaces when handling longer carry chains. As base 2^51 has
# only single-carry pairs, it's 25% faster there than base 2^64. Since
# PPC970 is pretty old, the base 2^64 implementation is not engaged.
# Comparison to compiler-generated code is complicated by the fact
# that not all compilers support 128-bit integers. When the compiler
# doesn't, as with xlc, this module delivers more than a 2x
# improvement, and when it does, a 12% to 30% improvement was measured...
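#
# For orientation, both code paths work in GF(2^255-19) and lean on the
# identity 2^255 = 19 (mod 2^255-19), hence 2^256 = 38. The limb
# layouts are:
#
#	base 2^64: a = a[0] + a[1]*2^64 + a[2]*2^128 + a[3]*2^192
#	base 2^51: a = a[0] + a[1]*2^51 + a[2]*2^102 + a[3]*2^153
#	                                             + a[4]*2^204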

$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
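
# Conventional perlasm invocation (an assumption based on sibling
# modules): x25519-ppc64.pl <flavour> <output>, with flavour being
# e.g. "linux64" or "linux64le".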

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my $sp = "r1";
my ($rp,$ap,$bp) = map("r$_",3..5);

####################################################### base 2^64
if (0) {
my ($bi,$a0,$a1,$a2,$a3,$t0,$t1, $t2,$t3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) =
    map("r$_",(6..12,22..31));
my $zero = "r0";
my $FRAME = 16*8;

$code.=<<___;
.text

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function
.align	5
x25519_fe64_mul:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi
	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi
	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi
	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$t3,$a3,$bi
___
for(my @acc=($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7),
    my $i=1; $i<4; shift(@acc), $i++) {
my $acc4 = $i==1? $zero : @acc[4];

$code.=<<___;
	ld	$bi,`8*$i`($bp)
	addc	@acc[1],@acc[1],$t0	# accumulate high parts
	mulld	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulld	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulld	$t2,$a2,$bi
	adde	@acc[4],$acc4,$t3
	mulld	$t3,$a3,$bi
	addze	@acc[5],$zero		# capture carry of the high-part chain
	addc	@acc[1],@acc[1],$t0	# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulhdu	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulhdu	$t2,$a2,$bi
	adde	@acc[4],@acc[4],$t3
	mulhdu	$t3,$a3,$bi
	addze	@acc[5],@acc[5]
___
}
$code.=<<___;
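	################################################################
	# The eight accumulator limbs now hold the full 512-bit product.
	# Fold the upper half back in using 2^256 = 38 (mod 2^255-19):
	# multiply the upper four limbs by 38 and accumulate them onto
	# the lower four.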
	li	$bi,38
	addc	$acc4,$acc4,$t0
	mulld	$t0,$acc4,$bi
	adde	$acc5,$acc5,$t1
	mulld	$t1,$acc5,$bi
	adde	$acc6,$acc6,$t2
	mulld	$t2,$acc6,$bi
	adde	$acc7,$acc7,$t3
	mulld	$t3,$acc7,$bi

	addc	$acc0,$acc0,$t0
	mulhdu	$t0,$acc4,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$acc5,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$acc6,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$acc7,$bi
	addze	$acc4,$zero

	addc	$acc1,$acc1,$t0
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3

	mulld	$acc4,$acc4,$bi

	addc	$acc0,$acc0,$acc4
	addze	$acc1,$acc1
	addze	$acc2,$acc2
	addze	$acc3,$acc3

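	################################################################
	# A carry out of the chain above stands for 2^256, i.e. another
	# 38: subfe turns the carry flag into an inverted mask, andc
	# yields 38 exactly when the carry was set, and a plain add
	# completes the (lazy) reduction to below 2^256.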
	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
	std	$acc1,8($rp)
	andc	$acc4,$bi,$acc4
	std	$acc2,16($rp)
	add	$acc0,$acc0,$acc4
	std	$acc3,24($rp)
	std	$acc0,0($rp)

	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,3,0
	.long	0
.size	x25519_fe64_mul,.-x25519_fe64_mul

.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function
.align	5
x25519_fe64_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	#  "can't overflow" below marks carrying into the high part of a
	#  multiplication result, which can't overflow, because the high
	#  part can never be all ones.

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	 mulld	$t0,$a2,$a1		# a[2]*a[1]
	 mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	 mulld	$t2,$a3,$a1		# a[3]*a[1]
	 mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	 mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	 mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	 mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	 mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	 mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	 mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	 mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	 mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zero

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	 li	$bi,38
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3

	mulld	$t0,$acc4,$bi
	mulld	$t1,$acc5,$bi
	mulld	$t2,$acc6,$bi
	mulld	$t3,$acc7,$bi

	addc	$acc0,$acc0,$t0
	mulhdu	$t0,$acc4,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$acc5,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$acc6,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$acc7,$bi
	addze	$acc4,$zero

	addc	$acc1,$acc1,$t0
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3

	mulld	$acc4,$acc4,$bi

	addc	$acc0,$acc0,$acc4
	addze	$acc1,$acc1
	addze	$acc2,$acc2
	addze	$acc3,$acc3

	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
	std	$acc1,8($rp)
	andc	$acc4,$bi,$acc4
	std	$acc2,16($rp)
	add	$acc0,$acc0,$acc4
	std	$acc3,24($rp)
	std	$acc0,0($rp)

	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,2,0
	.long	0
.size	x25519_fe64_sqr,.-x25519_fe64_sqr

.globl	x25519_fe64_mul121666
.type	x25519_fe64_mul121666,\@function
.align	5
x25519_fe64_mul121666:
	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`
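	# 121666 = 0x1db42 = 65536 + 56130, which is what the lis/ori
	# pair above assembles one halfword at a time.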

	ld	$t0,0($ap)
	ld	$t1,8($ap)
	ld	$bp,16($ap)
	ld	$ap,24($ap)

	mulld	$a0,$t0,$bi
	mulhdu	$t0,$t0,$bi
	mulld	$a1,$t1,$bi
	mulhdu	$t1,$t1,$bi
	mulld	$a2,$bp,$bi
	mulhdu	$bp,$bp,$bi
	mulld	$a3,$ap,$bi
	mulhdu	$ap,$ap,$bi

	addc	$a1,$a1,$t0
	adde	$a2,$a2,$t1
	adde	$a3,$a3,$bp
	addze	$ap,$ap

	mulli	$ap,$ap,38

	addc	$a0,$a0,$ap
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	li	$t0,38			# reload 38; $t0 was clobbered above
	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666

.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function
.align	5
x25519_fe64_add:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)

	addc	$a0,$a0,$t0
	adde	$a1,$a1,$t1
	adde	$a2,$a2,$bi
	adde	$a3,$a3,$bp

	li	$t0,38
	subfe	$t1,$t1,$t1		# carry -> ~mask
	andc	$t1,$t0,$t1

	addc	$a0,$a0,$t1
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_add,.-x25519_fe64_add

.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function
.align	5
x25519_fe64_sub:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)

	subfc	$a0,$t0,$a0
	subfe	$a1,$t1,$a1
	subfe	$a2,$bi,$a2
	subfe	$a3,$bp,$a3

	li	$t0,38
	subfe	$t1,$t1,$t1		# borrow -> mask
	xor	$zero,$zero,$zero
	and	$t1,$t0,$t1

	subfc	$a0,$t1,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3

	subfe	$t1,$t1,$t1		# borrow -> mask
	std	$a1,8($rp)
	and	$t0,$t0,$t1
	std	$a2,16($rp)
	subf	$a0,$t0,$a0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_sub,.-x25519_fe64_sub

.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function
.align	5
x25519_fe64_tobytes:
	ld	$a3,24($ap)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)

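	################################################################
	# Fully reduce modulo p = 2^255-19 before serializing. The +19
	# fold for a set bit 255 (2^255 = 19 mod p) and a further +19
	# probe are added in one pass; if the probe carries into bit
	# 255, the value was still >= p, so keeping the +19 and clearing
	# the bit subtracts p; otherwise the 19 is taken back.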
	sradi	$t0,$a3,63		# most significant bit -> mask
	li	$t1,19
	and	$t0,$t0,$t1
	sldi	$a3,$a3,1
	add	$t0,$t0,$t1		# compare to modulus in the same go
	srdi	$a3,$a3,1		# most significant bit cleared

	addc	$a0,$a0,$t0
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	xor	$zero,$zero,$zero
	sradi	$t0,$a3,63		# most significant bit -> mask
	sldi	$a3,$a3,1
	andc	$t0,$t1,$t0
	srdi	$a3,$a3,1		# most significant bit cleared

	subi	$rp,$rp,1
	subfc	$a0,$t0,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3

___
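# Store the four limbs in little-endian byte order one byte at a time,
# so the 32-byte output is endian-neutral; rp was pre-decremented above
# so that stbu's update addressing can walk the output buffer.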
for (my @a=($a0,$a1,$a2,$a3), my $i=0; $i<4; shift(@a), $i++) {
$code.=<<___;
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	stbu	$t0,1($rp)
___
}
$code.=<<___;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
}
####################################################### base 2^51
{
my ($bi,$a0,$a1,$a2,$a3,$a4,$t0, $t1,
    $h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,$h4lo,$h4hi) =
    map("r$_",(6..12,21..31));
my $mask = "r0";
my $FRAME = 18*8;

$code.=<<___;
.text

.globl	x25519_fe51_mul
.type	x25519_fe51_mul,\@function
.align	5
x25519_fe51_mul:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	mulld	$h0lo,$a0,$bi		# a[0]*b[0]
	mulhdu	$h0hi,$a0,$bi

	mulld	$h1lo,$a1,$bi		# a[1]*b[0]
	mulhdu	$h1hi,$a1,$bi

	 mulld	$h4lo,$a4,$bi		# a[4]*b[0]
	 mulhdu	$h4hi,$a4,$bi
	 ld	$ap,8($bp)
	 mulli	$a4,$a4,19

	mulld	$h2lo,$a2,$bi		# a[2]*b[0]
	mulhdu	$h2hi,$a2,$bi

	mulld	$h3lo,$a3,$bi		# a[3]*b[0]
	mulhdu	$h3hi,$a3,$bi
___
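# In base 2^51, product limb h[k] collects a[i]*b[j] with i+j = k, and
# terms with i+j >= 5 wrap around with an extra factor of 19, since
# 2^255 = 19 (mod 2^255-19). Each iteration below therefore rotates the
# limb array and pre-scales the limb that wraps next by 19, while the
# following b[] word is fetched into the spare of the ap/bi pair.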
for(my @a=($a0,$a1,$a2,$a3,$a4),
    my $i=1; $i<4; $i++) {
	($ap,$bi) = ($bi,$ap);
$code.=<<___;
	mulld	$t0,@a[4],$bi
	mulhdu	$t1,@a[4],$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1

	mulld	$t0,@a[0],$bi
	mulhdu	$t1,@a[0],$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	 mulld	$t0,@a[3],$bi
	 mulhdu	$t1,@a[3],$bi
	 ld	$ap,`8*($i+1)`($bp)
	 mulli	@a[3],@a[3],19
	 addc	$h4lo,$h4lo,$t0
	 adde	$h4hi,$h4hi,$t1

	mulld	$t0,@a[1],$bi
	mulhdu	$t1,@a[1],$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,@a[2],$bi
	mulhdu	$t1,@a[2],$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
___
	unshift(@a,pop(@a));
}
	($ap,$bi) = ($bi,$ap);
$code.=<<___;
	mulld	$t0,$a1,$bi
	mulhdu	$t1,$a1,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1

	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1

	mulld	$t0,$a0,$bi
	mulhdu	$t1,$a0,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1

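	################################################################
	# Each 128-bit column sum hXhi:hXlo is now split at bit 51: the
	# part above bit 51 is carried into the next column, and the
	# carry out of the top column re-enters at the bottom multiplied
	# by 19, again because 2^255 = 19 (mod 2^255-19).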
.Lfe51_reduce:
	li	$mask,-1
	srdi	$mask,$mask,13		# 0x7ffffffffffff

	srdi	$t0,$h2lo,51
	and	$a2,$h2lo,$mask
	insrdi	$t0,$h2hi,51,0		# h2>>51
	 srdi	$t1,$h0lo,51
	 and	$a0,$h0lo,$mask
	 insrdi	$t1,$h0hi,51,0		# h0>>51
	addc	$h3lo,$h3lo,$t0
	addze	$h3hi,$h3hi
	 addc	$h1lo,$h1lo,$t1
	 addze	$h1hi,$h1hi

	srdi	$t0,$h3lo,51
	and	$a3,$h3lo,$mask
	insrdi	$t0,$h3hi,51,0		# h3>>51
	 srdi	$t1,$h1lo,51
	 and	$a1,$h1lo,$mask
	 insrdi	$t1,$h1hi,51,0		# h1>>51
	addc	$h4lo,$h4lo,$t0
	addze	$h4hi,$h4hi
	 add	$a2,$a2,$t1

	srdi	$t0,$h4lo,51
	and	$a4,$h4lo,$mask
	insrdi	$t0,$h4hi,51,0		# h4>>51
	mulli	$t0,$t0,19		# (h4>>51)*19

	add	$a0,$a0,$t0

	srdi	$t1,$a2,51
	and	$a2,$a2,$mask
	add	$a3,$a3,$t1

	srdi	$t0,$a0,51
	and	$a0,$a0,$mask
	add	$a1,$a1,$t0

	std	$a2,16($rp)
	std	$a3,24($rp)
	std	$a4,32($rp)
	std	$a0,0($rp)
	std	$a1,8($rp)

	ld	r21,`$FRAME-8*11`($sp)
	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,11,3,0
	.long	0
.size	x25519_fe51_mul,.-x25519_fe51_mul
___
{
my ($a0,$a1,$a2,$a3,$a4,$t0,$t1) = ($a0,$a1,$a2,$a3,$a4,$t0,$t1);
$code.=<<___;
.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function
.align	5
x25519_fe51_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	add	$bi,$a0,$a0		# a[0]*2
	mulli	$t1,$a4,19		# a[4]*19

	mulld	$h0lo,$a0,$a0
	mulhdu	$h0hi,$a0,$a0
	mulld	$h1lo,$a1,$bi
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi
	mulhdu	$h4hi,$a4,$bi
	add	$bi,$a1,$a1		# a[1]*2
___
	($a4,$t1) = ($t1,$a4);
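	# After this swap a4 names the register holding 19*a[4] and t1
	# the original a[4], so t1*a4 = 19*a[4]^2, which belongs in h3
	# (a[4]^2 sits at 2^408 = 19*2^153 mod 2^255-19).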
$code.=<<___;
	mulld	$t0,$t1,$a4
	mulhdu	$t1,$t1,$a4
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1

	mulli	$bp,$a3,19		# a[3]*19

	mulld	$t0,$a1,$a1
	mulhdu	$t1,$a1,$a1
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1
	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1
	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	add	$bi,$a3,$a3		# a[3]*2
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
___
	($a3,$t1) = ($bp,$a3);
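	# Similarly, a3 now names the register holding 19*a[3] (built in
	# bp above) and t1 the original a[3]; t1*a3 = 19*a[3]^2 belongs
	# in h1 (a[3]^2 sits at 2^306 = 19*2^51 mod 2^255-19).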
$code.=<<___;
	mulld	$t0,$t1,$a3
	mulhdu	$t1,$t1,$a3
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1
	mulld	$t0,$bi,$a4
	mulhdu	$t1,$bi,$a4
	add	$bi,$a2,$a2		# a[2]*2
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,$a2,$a2
	mulhdu	$t1,$a2,$a2
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1
	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_sqr,.-x25519_fe51_sqr
___
}
$code.=<<___;
.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function
.align	5
x25519_fe51_mul121666:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	mulld	$h0lo,$a0,$bi		# a[0]*121666
	mulhdu	$h0hi,$a0,$bi
	mulld	$h1lo,$a1,$bi		# a[1]*121666
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi		# a[2]*121666
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi		# a[3]*121666
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi		# a[4]*121666
	mulhdu	$h4hi,$a4,$bi

	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;