1#! /usr/bin/env perl
2# Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Amitay Isaacs <amitay@ozlabs.org> and Martin Schwenke
11# <martin@meltin.net> for the OpenSSL project.
12# ====================================================================
13#
14# p521 lower-level primitives for PPC64 using vector instructions.
15#
16
17use strict;
18use warnings;
19
# Command line: the first argument is the flavour handed on to ppc-xlate.pl.
# Of the remaining arguments, the first one that looks like "name.ext" is
# taken as the output file; if there is none, "-" is passed on instead.
my $flavour = shift;
my $output = "";
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
if (!$output) {
	$output = "-";
}

# Look for ppc-xlate.pl next to this script, then in ../../perlasm/ relative
# to it.  When $0 has no directory component the pattern does not match, so
# use an empty prefix instead of reading an unset $1 (which would be undef
# and trigger an "uninitialized" warning in the interpolations below).
my $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
my $xlate;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through ppc-xlate.pl.
# Check the pipe open so that a failure to start the translator is reported
# here with its real cause instead of showing up later as a write/close error.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Generated assembly is accumulated here and printed at the very end.
my $code = "";

# General purpose registers referenced by the generators below.
my ($sp, $outp, $savelr, $savesp) = ("r1", "r3", "r10", "r12");

# Vector register that the generators set to zero with vspltisw.
my $vzero = "v32";
41
# startproc(NAME)
#
# Appends the prologue text for a routine called NAME to $code: a .globl
# directive, a ".align 5" directive, the NAME: label and one empty line.
sub startproc($)
{
    my $name = shift;

    $code .= join("\n",
        "    .globl ${name}",
        "    .align 5",
        "${name}:",
        "",
        "");
}
53
# endproc(NAME)
#
# Appends the epilogue text for the routine called NAME to $code: the
# returning "blr", a .size directive computed from the NAME label, and one
# empty line.
sub endproc($)
{
    my $name = shift;

    $code .= "\tblr\n"
           . "\t    .size\t${name},.-${name}\n"
           . "\n";
}
64
65
# push_vrs(MIN, MAX)
#
# Appends code that saves registers MIN..MAX (inclusive) on the stack:
#   - $savesp receives a copy of the current $sp;
#   - stdu moves $sp down by 16 * (number of registers + 1) bytes.  The
#     "+1" is left as a backtick expression in the text, to be evaluated
#     when the whole of $code is post-processed;
#   - each register is stored with stxv at a negative offset from $savesp,
#     MAX at -16, MAX-1 at -32, and so on down to MIN.
sub push_vrs($$)
{
	my ($first, $last) = @_;
	my $nregs = $last - $first + 1;

	$code .= "\tmr\t\t$savesp,$sp\n";
	$code .= "\tstdu\t\t$sp,-16*`$nregs+1`($sp)\n\n";

	foreach my $reg ($first .. $last) {
		# Distance, in 16-byte slots, below the saved stack pointer.
		my $slot = $last - $reg + 1;
		$code .= "\tstxv\t\t$reg,-16*$slot($savesp)\n";
	}

	$code .= "\n";
}
89
# pop_vrs(MIN, MAX)
#
# Appends code that undoes push_vrs(MIN, MAX): $savesp is reloaded from
# 0($sp), registers MIN..MAX are read back with lxv from the same negative
# offsets relative to $savesp that push_vrs used, and finally $sp is set to
# $savesp.
sub pop_vrs($$)
{
	my ($first, $last) = @_;

	$code .= "\tld\t\t$savesp,0($sp)\n";

	$code .= join("", map {
		my $slot = $last - $_ + 1;
		"\tlxv\t\t$_,-16*$slot($savesp)\n";
	} $first .. $last);

	$code .= "\tmr\t\t$sp,$savesp\n\n";
}
109
# load_vrs(POINTER, REG_LIST)
#
# Appends nine lxsd instructions to $code, followed by an empty line.
# Entry i (0..8) of the array referenced by REG_LIST is loaded from byte
# offset 8*i relative to the register named by POINTER.
sub load_vrs($$)
{
	my ($base, $regs) = @_;

	foreach my $limb (0 .. 8) {
		$code .= sprintf("\tlxsd\t\t%s,%d(%s)\n",
				 $regs->[$limb], 8 * $limb, $base);
	}

	$code .= "\n";
}
125
# store_vrs(POINTER, REG_LIST)
#
# Appends nine stxv instructions to $code, followed by an empty line.
# Entry i (0..8) of the array referenced by REG_LIST is stored at byte
# offset 16*i relative to the register named by POINTER.
sub store_vrs($$)
{
	my ($base, $regs) = @_;

	foreach my $limb (0 .. 8) {
		$code .= sprintf("\tstxv\t\t%s,%d(%s)\n",
				 $regs->[$limb], 16 * $limb, $base);
	}

	$code .= "\n";
}
141
# Assembler preamble: machine selection and section, then an empty line.
$code .= ".machine\t\"any\"\n"
       . ".text\n"
       . "\n";
147
148{
149	# mul/square common
150	my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v44", "v54");
151	my ($zero, $one) = ("r8", "r9");
152	my @out = map("v$_",(55..63));
153
154	{
155		#
156		# p521_felem_mul
157		#
158
159		my ($in1p, $in2p) = ("r4", "r5");
160		my @in1 = map("v$_",(45..53));
161		my @in2 = map("v$_",(35..43));
162
163		startproc("p521_felem_mul");
164
165		push_vrs(52, 63);
166
167		$code.=<<___;
168	vspltisw	$vzero,0
169
170___
171
172		load_vrs($in1p, \@in1);
173		load_vrs($in2p, \@in2);
174
175		$code.=<<___;
176	vmsumudm	$out[0],$in1[0],$in2[0],$vzero
177
178	xxpermdi	$t1,$in1[0],$in1[1],0b00
179	xxpermdi	$t2,$in2[1],$in2[0],0b00
180	vmsumudm	$out[1],$t1,$t2,$vzero
181
182	xxpermdi	$t2,$in2[2],$in2[1],0b00
183	vmsumudm	$out[2],$t1,$t2,$vzero
184	vmsumudm	$out[2],$in1[2],$in2[0],$out[2]
185
186	xxpermdi	$t2,$in2[3],$in2[2],0b00
187	vmsumudm	$out[3],$t1,$t2,$vzero
188	xxpermdi	$t3,$in1[2],$in1[3],0b00
189	xxpermdi	$t4,$in2[1],$in2[0],0b00
190	vmsumudm	$out[3],$t3,$t4,$out[3]
191
192	xxpermdi	$t2,$in2[4],$in2[3],0b00
193	vmsumudm	$out[4],$t1,$t2,$vzero
194	xxpermdi	$t4,$in2[2],$in2[1],0b00
195	vmsumudm	$out[4],$t3,$t4,$out[4]
196	vmsumudm	$out[4],$in1[4],$in2[0],$out[4]
197
198	xxpermdi	$t2,$in2[5],$in2[4],0b00
199	vmsumudm	$out[5],$t1,$t2,$vzero
200	xxpermdi	$t4,$in2[3],$in2[2],0b00
201	vmsumudm	$out[5],$t3,$t4,$out[5]
202
203	xxpermdi	$t2,$in2[6],$in2[5],0b00
204	vmsumudm	$out[6],$t1,$t2,$vzero
205	xxpermdi	$t4,$in2[4],$in2[3],0b00
206	vmsumudm	$out[6],$t3,$t4,$out[6]
207
208	xxpermdi	$t2,$in2[7],$in2[6],0b00
209	vmsumudm	$out[7],$t1,$t2,$vzero
210	xxpermdi	$t4,$in2[5],$in2[4],0b00
211	vmsumudm	$out[7],$t3,$t4,$out[7]
212
213	xxpermdi	$t2,$in2[8],$in2[7],0b00
214	vmsumudm	$out[8],$t1,$t2,$vzero
215	xxpermdi	$t4,$in2[6],$in2[5],0b00
216	vmsumudm	$out[8],$t3,$t4,$out[8]
217
218	xxpermdi	$t1,$in1[4],$in1[5],0b00
219	xxpermdi	$t2,$in2[1],$in2[0],0b00
220	vmsumudm	$out[5],$t1,$t2,$out[5]
221
222	xxpermdi	$t2,$in2[2],$in2[1],0b00
223	vmsumudm	$out[6],$t1,$t2,$out[6]
224	vmsumudm	$out[6],$in1[6],$in2[0],$out[6]
225
226	xxpermdi	$t2,$in2[3],$in2[2],0b00
227	vmsumudm	$out[7],$t1,$t2,$out[7]
228	xxpermdi	$t3,$in1[6],$in1[7],0b00
229	xxpermdi	$t4,$in2[1],$in2[0],0b00
230	vmsumudm	$out[7],$t3,$t4,$out[7]
231
232	xxpermdi	$t2,$in2[4],$in2[3],0b00
233	vmsumudm	$out[8],$t1,$t2,$out[8]
234	xxpermdi	$t4,$in2[2],$in2[1],0b00
235	vmsumudm	$out[8],$t3,$t4,$out[8]
236	vmsumudm	$out[8],$in1[8],$in2[0],$out[8]
237
238	li		$zero,0
239	li		$one,1
240	mtvsrdd		$t1,$one,$zero
241___
242
243		for (my $i = 0; $i <= 8; $i++) {
244			$code.=<<___;
245	vsld		$in2[$i],$in2[$i],$t1
246___
247		}
248
249		$code.=<<___;
250
251	vmsumudm	$out[7],$in1[8],$in2[8],$out[7]
252
253	xxpermdi	$t2,$in2[8],$in2[7],0b00
254	xxpermdi	$t1,$in1[7],$in1[8],0b00
255	vmsumudm	$out[6],$t1,$t2,$out[6]
256
257	xxpermdi	$t1,$in1[6],$in1[7],0b00
258	vmsumudm	$out[5],$t1,$t2,$out[5]
259	vmsumudm	$out[5],$in1[8],$in2[6],$out[5]
260
261	xxpermdi	$t1,$in1[5],$in1[6],0b00
262	vmsumudm	$out[4],$t1,$t2,$out[4]
263	xxpermdi	$t4,$in2[6],$in2[5],0b00
264	xxpermdi	$t3,$in1[7],$in1[8],0b00
265	vmsumudm	$out[4],$t3,$t4,$out[4]
266
267	xxpermdi	$t1,$in1[4],$in1[5],0b00
268	vmsumudm	$out[3],$t1,$t2,$out[3]
269	xxpermdi	$t3,$in1[6],$in1[7],0b00
270	vmsumudm	$out[3],$t3,$t4,$out[3]
271	vmsumudm	$out[3],$in1[8],$in2[4],$out[3]
272
273	xxpermdi	$t1,$in1[3],$in1[4],0b00
274	vmsumudm	$out[2],$t1,$t2,$out[2]
275	xxpermdi	$t3,$in1[5],$in1[6],0b00
276	vmsumudm	$out[2],$t3,$t4,$out[2]
277
278	xxpermdi	$t1,$in1[2],$in1[3],0b00
279	vmsumudm	$out[1],$t1,$t2,$out[1]
280	xxpermdi	$t3,$in1[4],$in1[5],0b00
281	vmsumudm	$out[1],$t3,$t4,$out[1]
282
283	xxpermdi	$t1,$in1[1],$in1[2],0b00
284	vmsumudm	$out[0],$t1,$t2,$out[0]
285	xxpermdi	$t3,$in1[3],$in1[4],0b00
286	vmsumudm	$out[0],$t3,$t4,$out[0]
287
288	xxpermdi	$t2,$in2[4],$in2[3],0b00
289	xxpermdi	$t1,$in1[7],$in1[8],0b00
290	vmsumudm	$out[2],$t1,$t2,$out[2]
291
292	xxpermdi	$t1,$in1[6],$in1[7],0b00
293	vmsumudm	$out[1],$t1,$t2,$out[1]
294	vmsumudm	$out[1],$in1[8],$in2[2],$out[1]
295
296	xxpermdi	$t1,$in1[5],$in1[6],0b00
297	vmsumudm	$out[0],$t1,$t2,$out[0]
298	xxpermdi	$t4,$in2[2],$in2[1],0b00
299	xxpermdi	$t3,$in1[7],$in1[8],0b00
300	vmsumudm	$out[0],$t3,$t4,$out[0]
301
302___
303
304		store_vrs($outp, \@out);
305
306		pop_vrs(52, 63);
307
308		endproc("p521_felem_mul");
309	}
310
311	{
312		#
313		# p51_felem_square
314		#
315
316		my ($inp) = ("r4");
317		my @in = map("v$_",(45..53));
318		my @inx2 = map("v$_",(35..43));
319
320		startproc("p521_felem_square");
321
322		push_vrs(52, 63);
323
324		$code.=<<___;
325	vspltisw	$vzero,0
326
327___
328
329		load_vrs($inp, \@in);
330
331		$code.=<<___;
332	li		$zero,0
333	li		$one,1
334	mtvsrdd		$t1,$one,$zero
335___
336
337		for (my $i = 0; $i <= 8; $i++) {
338			$code.=<<___;
339	vsld		$inx2[$i],$in[$i],$t1
340___
341		}
342
343		$code.=<<___;
344	vmsumudm	$out[0],$in[0],$in[0],$vzero
345
346	vmsumudm	$out[1],$in[0],$inx2[1],$vzero
347
348	xxpermdi	$t1,$in[0],$in[1],0b00
349	xxpermdi	$t2,$inx2[2],$in[1],0b00
350	vmsumudm	$out[2],$t1,$t2,$vzero
351
352	xxpermdi	$t2,$inx2[3],$inx2[2],0b00
353	vmsumudm	$out[3],$t1,$t2,$vzero
354
355	xxpermdi	$t2,$inx2[4],$inx2[3],0b00
356	vmsumudm	$out[4],$t1,$t2,$vzero
357	vmsumudm	$out[4],$in[2],$in[2],$out[4]
358
359	xxpermdi	$t2,$inx2[5],$inx2[4],0b00
360	vmsumudm	$out[5],$t1,$t2,$vzero
361	vmsumudm	$out[5],$in[2],$inx2[3],$out[5]
362
363	xxpermdi	$t2,$inx2[6],$inx2[5],0b00
364	vmsumudm	$out[6],$t1,$t2,$vzero
365	xxpermdi	$t3,$in[2],$in[3],0b00
366	xxpermdi	$t4,$inx2[4],$in[3],0b00
367	vmsumudm	$out[6],$t3,$t4,$out[6]
368
369	xxpermdi	$t2,$inx2[7],$inx2[6],0b00
370	vmsumudm	$out[7],$t1,$t2,$vzero
371	xxpermdi	$t4,$inx2[5],$inx2[4],0b00
372	vmsumudm	$out[7],$t3,$t4,$out[7]
373
374	xxpermdi	$t2,$inx2[8],$inx2[7],0b00
375	vmsumudm	$out[8],$t1,$t2,$vzero
376	xxpermdi	$t4,$inx2[6],$inx2[5],0b00
377	vmsumudm	$out[8],$t3,$t4,$out[8]
378	vmsumudm	$out[8],$in[4],$in[4],$out[8]
379
380	vmsumudm	$out[1],$in[5],$inx2[5],$out[1]
381
382	vmsumudm	$out[3],$in[6],$inx2[6],$out[3]
383
384	vmsumudm	$out[5],$in[7],$inx2[7],$out[5]
385
386	vmsumudm	$out[7],$in[8],$inx2[8],$out[7]
387
388	mtvsrdd		$t1,$one,$zero
389___
390
391		for (my $i = 5; $i <= 8; $i++) {
392			$code.=<<___;
393	vsld		$inx2[$i],$inx2[$i],$t1
394___
395		}
396
397		$code.=<<___;
398
399	vmsumudm	$out[6],$in[7],$inx2[8],$out[6]
400
401	vmsumudm	$out[5],$in[6],$inx2[8],$out[5]
402
403	xxpermdi	$t2,$inx2[8],$inx2[7],0b00
404	xxpermdi	$t1,$in[5],$in[6],0b00
405	vmsumudm	$out[4],$t1,$t2,$out[4]
406
407	xxpermdi	$t1,$in[4],$in[5],0b00
408	vmsumudm	$out[3],$t1,$t2,$out[3]
409
410	xxpermdi	$t1,$in[3],$in[4],0b00
411	vmsumudm	$out[2],$t1,$t2,$out[2]
412	vmsumudm	$out[2],$in[5],$inx2[6],$out[2]
413
414	xxpermdi	$t1,$in[2],$in[3],0b00
415	vmsumudm	$out[1],$t1,$t2,$out[1]
416	vmsumudm	$out[1],$in[4],$inx2[6],$out[1]
417
418	xxpermdi	$t1,$in[1],$in[2],0b00
419	vmsumudm	$out[0],$t1,$t2,$out[0]
420	xxpermdi	$t2,$inx2[6],$inx2[5],0b00
421	xxpermdi	$t1,$in[3],$in[4],0b00
422	vmsumudm	$out[0],$t1,$t2,$out[0]
423
424___
425
426		store_vrs($outp, \@out);
427
428		pop_vrs(52, 63);
429
430		endproc("p521_felem_square");
431	}
432}
433
# Replace every backtick-quoted span in the generated text with the result
# of evaluating its contents as Perl, then write the text to STDOUT and make
# sure the handle closes cleanly.
$code =~ s{`([^`]*)`}{eval $1}gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
437