#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for s390x.
#
# June 2015
#
# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated
# code. With older compilers the improvement is >3x, because then the
# base 2^64 and base 2^32 implementations are being compared.
#
# On a side note, z13 enables a vector base 2^26 implementation...

#
# January 2019
#
# Add vx code path (base 2^26).
#
# Copyright IBM Corp. 2019
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>

#
# January 2019
#
# Add vector base 2^26 implementation. It's problematic to measure
# performance accurately, because the reference system is hardly idle.
# But it's sub-cycle, i.e. less than 1 cycle per processed byte, and
# >=20% faster than IBM's submission on long inputs, and much faster on
# short ones, because calculation of the key powers is postponed until
# we know that the input is long enough to justify the additional
# overhead.
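#
# In rough outline (the standard 4-way Poly1305 vectorization, not the
# literal code below): the 130-bit accumulator is kept as five 26-bit
# limbs and four 16-byte blocks are folded in per iteration,
#
#	h = ((h + m[i])*r^4 + m[i+1]*r^3 + m[i+2]*r^2 + m[i+3]*r) mod (2^130-5)
#
# which is why the powers r^1..r^4 and their 5*r^i multiples are
# precomputed below.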

use strict;
use FindBin qw($Bin);
use lib "$Bin/../..";
use perlasm::s390x qw(:DEFAULT :GE :EI :MI1 :VX AUTOLOAD LABEL INCLUDE);

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
	$z=0;	# S/390 ABI
	$SIZE_T=4;
} else {
	$z=1;	# zSeries ABI
	$SIZE_T=8;
}

my $stdframe=16*$SIZE_T+4*8;
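# ($stdframe is the size of the ABI-defined register save area: 16
# pointer-sized GPR slots plus 4 FPR slots, i.e. 160 bytes for the 64-bit
# ABI and 96 bytes for the 31-bit one.)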
my $sp="%r15";

my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));

PERLASM_BEGIN($output);

INCLUDE	("s390x_arch.h");
TEXT	();

################
# static void poly1305_init(void *ctx, const unsigned char key[16])
{
GLOBL	("poly1305_init");
TYPE	("poly1305_init","\@function");
ALIGN	(16);
LABEL	("poly1305_init");
	lghi	("%r0",0);
	lghi	("%r1",-1);
	stg	("%r0","0($ctx)");		# zero hash value
	stg	("%r0","8($ctx)");
	stg	("%r0","16($ctx)");
	st	("%r0","24($ctx)");		# clear is_base2_26
	lgr	("%r5",$ctx);			# reassign $ctx
	lghi	("%r2",0);

&{$z?	\&clgr:\&clr}	($inp,"%r0");
	je	(".Lno_key");

	lrvg	("%r2","0($inp)");		# load little-endian key
	lrvg	("%r3","8($inp)");

	nihl	("%r1",0xffc0);			# 0xffffffc0ffffffff
	srlg	("%r0","%r1",4);		# 0x0ffffffc0fffffff
	srlg	("%r1","%r1",4);
	nill	("%r1",0xfffc);			# 0x0ffffffc0ffffffc

	ngr	("%r2","%r0");
	ngr	("%r3","%r1");
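	# (this is the standard Poly1305 clamping of r, i.e.
	# r &= 0x0ffffffc0ffffffc0ffffffc0fffffff)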

	stmg	("%r2","%r3","32(%r5)");

	larl	("%r1","OPENSSL_s390xcap_P");
	lg	("%r0","16(%r1)");
	srlg	("%r0","%r0",62);
	nill	("%r0",1);			# extract vx bit
	lcgr	("%r0","%r0");
	larl	("%r1",".Lpoly1305_blocks");
	larl	("%r2",".Lpoly1305_blocks_vx");
	larl	("%r3",".Lpoly1305_emit");
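	# branchless select: %r2 = vx ? .Lpoly1305_blocks_vx : .Lpoly1305_blocks,
	# %r0 being an all-ones mask when the vector facility is available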
&{$z?	\&xgr:\&xr}	("%r2","%r1");		# select between scalar and vector
&{$z?	\&ngr:\&nr}	("%r2","%r0");
&{$z?	\&xgr:\&xr}	("%r2","%r1");
&{$z?	\&stmg:\&stm}	("%r2","%r3","0(%r4)");
	lghi	("%r2",1);
LABEL	(".Lno_key");
	br	("%r14");
SIZE	("poly1305_init",".-poly1305_init");
}

################
# static void poly1305_blocks(void *ctx, const unsigned char *inp,
#                             size_t len, u32 padbit)
{
my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
my ($r0,$r1,$s1) = map("%r$_",(0..2));

GLOBL	("poly1305_blocks");
TYPE	("poly1305_blocks","\@function");
ALIGN	(16);
LABEL	("poly1305_blocks");
LABEL	(".Lpoly1305_blocks");
&{$z?	\&ltgr:\&ltr}	("%r0",$len);
	jz	(".Lno_data");

&{$z?	\&stmg:\&stm}	("%r6","%r14","6*$SIZE_T($sp)");

	lg	($h0,"0($ctx)");		# load hash value
	lg	($h1,"8($ctx)");
	lg	($h2,"16($ctx)");

LABEL	(".Lpoly1305_blocks_entry");
if ($z) {
	srlg	($len,$len,4);
} else {
	srl	($len,4);
}
	llgfr   ($padbit,$padbit);		# clear upper half, much needed with
						# non-64-bit ABI
	lg	($r0,"32($ctx)");		# load key
	lg	($r1,"40($ctx)");

&{$z?	\&stg:\&st}	($ctx,"2*$SIZE_T($sp)");	# off-load $ctx
	srlg	($s1,$r1,2);
	algr	($s1,$r1);			# s1 = r1 + r1>>2
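	# (s1 = 5*r1/4, exact since the clamped r1 is a multiple of 4; it is
	# used for the h1 product so that its 2^128 weight folds back into the
	# low end via 2^130 = 5 mod p)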
	j	(".Loop");

ALIGN	(16);
LABEL	(".Loop");
	lrvg	($d0lo,"0($inp)");		# load little-endian input
	lrvg	($d1lo,"8($inp)");
	la	($inp,"16($inp)");

	algr	($d0lo,$h0);			# accumulate input
	alcgr	($d1lo,$h1);
	alcgr	($h2,$padbit);

	lgr	($h0,$d0lo);
	mlgr	($d0hi,$r0);			# h0*r0	  -> $d0hi:$d0lo
	lgr	($h1,$d1lo);
	mlgr	($d1hi,$s1);			# h1*5*r1 -> $d1hi:$d1lo

	mlgr	($t0,$r1);			# h0*r1   -> $t0:$h0
	mlgr	($t1,$r0);			# h1*r0   -> $t1:$h1

	algr	($d0lo,$d1lo);
	lgr	($d1lo,$h2);
	alcgr	($d0hi,$d1hi);
	lghi	($d1hi,0);

	algr	($h1,$h0);
	alcgr	($t1,$t0);

	msgr	($d1lo,$s1);			# h2*s1
	msgr	($h2,$r0);			# h2*r0

	algr	($h1,$d1lo);
	alcgr	($t1,$d1hi);			# $d1hi is zero

	algr	($h1,$d0hi);
	alcgr	($h2,$t1);

	lghi	($h0,-4);			# final reduction step
	ngr	($h0,$h2);
	srlg	($t0,$h2,2);
	algr	($h0,$t0);
	lghi	($t1,3);
	ngr	($h2,$t1);
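	# (i.e. h0 = 5*(h2>>2) and h2 &= 3: the bits of h2 at and above 2^130
	# are folded back in using 2^130 = 5 mod p)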

	algr	($h0,$d0lo);
	alcgr	($h1,$d1hi);			# $d1hi is still zero
	alcgr	($h2,$d1hi);			# $d1hi is still zero

&{$z?	\&brctg:\&brct}	($len,".Loop");

&{$z?	\&lg:\&l}	($ctx,"2*$SIZE_T($sp)");# restore $ctx

	stg	($h0,"0($ctx)");		# store hash value
	stg	($h1,"8($ctx)");
	stg	($h2,"16($ctx)");

&{$z?	\&lmg:\&lm}	("%r6","%r14","6*$SIZE_T($sp)");
LABEL	(".Lno_data");
	br	("%r14");
SIZE	("poly1305_blocks",".-poly1305_blocks");
}

################
# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
#                                size_t len, u32 padbit)
{
my ($H0, $H1, $H2, $H3, $H4) = map("%v$_",(0..4));
my ($I0, $I1, $I2, $I3, $I4) = map("%v$_",(5..9));
my ($R0, $R1, $S1, $R2, $S2) = map("%v$_",(10..14));
my      ($R3, $S3, $R4, $S4) = map("%v$_",(15..18));
my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("%v$_",(19..23));
my      ($T1, $T2, $T3, $T4) = map("%v$_",(24..27));
my ($mask26,$bswaplo,$bswaphi,$bswapmi) = map("%v$_",(28..31));

my ($d2,$d0,$h0,$d1,$h1,$h2)=map("%r$_",(9..14));

TYPE	("poly1305_blocks_vx","\@function");
ALIGN	(16);
LABEL	("poly1305_blocks_vx");
LABEL	(".Lpoly1305_blocks_vx");
&{$z?	\&clgfi:\&clfi} ($len,128);
	jhe	("__poly1305_blocks_vx");

&{$z?	\&stmg:\&stm}	("%r6","%r14","6*$SIZE_T($sp)");

	lg	($d0,"0($ctx)");
	lg	($d1,"8($ctx)");
	lg	($d2,"16($ctx)");

	llgfr	("%r0",$d0);			# base 2^26 -> base 2^64
	srlg	($h0,$d0,32);
	llgfr	("%r1",$d1);
	srlg	($h1,$d1,32);
	srlg	($h2,$d2,32);

	sllg	("%r0","%r0",26);
	algr	($h0,"%r0");
	sllg	("%r0",$h1,52);
	srlg	($h1,$h1,12);
	sllg	("%r1","%r1",14);
	algr	($h0,"%r0");
	alcgr	($h1,"%r1");
	sllg	("%r0",$h2,40);
	srlg	($h2,$h2,24);
	lghi	("%r1",0);
	algr	($h1,"%r0");
	alcgr	($h2,"%r1");
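	# ($h2:$h1:$h0 now hold the hash value recomputed in base 2^64)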

	llgf	("%r0","24($ctx)");		# is_base2_26
	lcgr	("%r0","%r0");

	xgr	($h0,$d0);			# choose between radixes
	xgr	($h1,$d1);
	xgr	($h2,$d2);
	ngr	($h0,"%r0");
	ngr	($h1,"%r0");
	ngr	($h2,"%r0");
	xgr	($h0,$d0);
	xgr	($h1,$d1);
	xgr	($h2,$d2);
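	# (constant-time select: keep the base 2^64 conversion if the stored
	# hash really was in base 2^26, i.e. %r0 = -is_base2_26, otherwise
	# keep the value as loaded)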

	lhi	("%r0",0);
	st	("%r0","24($ctx)");		# clear is_base2_26

	j	(".Lpoly1305_blocks_entry");
SIZE	("poly1305_blocks_vx",".-poly1305_blocks_vx");

TYPE	("__poly1305_mul","\@function");
ALIGN	(16);
LABEL	("__poly1305_mul");
	vmlof		($ACC0,$H0,$R0);
	vmlof		($ACC1,$H0,$R1);
	vmlof		($ACC2,$H0,$R2);
	vmlof		($ACC3,$H0,$R3);
	vmlof		($ACC4,$H0,$R4);

	vmalof		($ACC0,$H1,$S4,$ACC0);
	vmalof		($ACC1,$H1,$R0,$ACC1);
	vmalof		($ACC2,$H1,$R1,$ACC2);
	vmalof		($ACC3,$H1,$R2,$ACC3);
	vmalof		($ACC4,$H1,$R3,$ACC4);

	vmalof		($ACC0,$H2,$S3,$ACC0);
	vmalof		($ACC1,$H2,$S4,$ACC1);
	vmalof		($ACC2,$H2,$R0,$ACC2);
	vmalof		($ACC3,$H2,$R1,$ACC3);
	vmalof		($ACC4,$H2,$R2,$ACC4);

	vmalof		($ACC0,$H3,$S2,$ACC0);
	vmalof		($ACC1,$H3,$S3,$ACC1);
	vmalof		($ACC2,$H3,$S4,$ACC2);
	vmalof		($ACC3,$H3,$R0,$ACC3);
	vmalof		($ACC4,$H3,$R1,$ACC4);

	vmalof		($ACC0,$H4,$S1,$ACC0);
	vmalof		($ACC1,$H4,$S2,$ACC1);
	vmalof		($ACC2,$H4,$S3,$ACC2);
	vmalof		($ACC3,$H4,$S4,$ACC3);
	vmalof		($ACC4,$H4,$R0,$ACC4);

	################################################################
	# lazy reduction

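	# (carries propagate h0->h1->h2->h3->h4 and h3->h4->h0->h1; the h4->h0
	# carry is added once as-is and once shifted left by 2, i.e. times 5,
	# since 2^130 = 5 mod p)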
	vesrlg		($H4,$ACC3,26);
	vesrlg		($H1,$ACC0,26);
	vn		($H3,$ACC3,$mask26);
	vn		($H0,$ACC0,$mask26);
	vag		($H4,$H4,$ACC4);	# h3 -> h4
	vag		($H1,$H1,$ACC1);	# h0 -> h1

	vesrlg		($ACC4,$H4,26);
	vesrlg		($ACC1,$H1,26);
	vn		($H4,$H4,$mask26);
	vn		($H1,$H1,$mask26);
	vag		($H0,$H0,$ACC4);
	vag		($H2,$ACC2,$ACC1);	# h1 -> h2

	veslg		($ACC4,$ACC4,2);	# <<2
	vesrlg		($ACC2,$H2,26);
	vn		($H2,$H2,$mask26);
	vag		($H0,$H0,$ACC4);	# h4 -> h0
	vag		($H3,$H3,$ACC2);	# h2 -> h3

	vesrlg		($ACC0,$H0,26);
	vesrlg		($ACC3,$H3,26);
	vn		($H0,$H0,$mask26);
	vn		($H3,$H3,$mask26);
	vag		($H1,$H1,$ACC0);	# h0 -> h1
	vag		($H4,$H4,$ACC3);	# h3 -> h4
	br		("%r14");
SIZE	("__poly1305_mul",".-__poly1305_mul");

TYPE	("__poly1305_blocks_vx","\@function");
ALIGN	(16);
LABEL	("__poly1305_blocks_vx");
&{$z?	\&lgr:\&lr}	("%r0",$sp);
&{$z?	\&stmg:\&stm}	("%r10","%r15","10*$SIZE_T($sp)");
if (!$z) {
	std	("%f4","16*$SIZE_T+2*8($sp)");
	std	("%f6","16*$SIZE_T+3*8($sp)");
	ahi	($sp,-$stdframe);
	st	("%r0","0($sp)");		# back-chain

	llgfr	($len,$len);			# so that srlg works on $len
} else {
	aghi	($sp,"-($stdframe+8*8)");
	stg	("%r0","0($sp)");		# back-chain

	std	("%f8","$stdframe+0*8($sp)");
	std	("%f9","$stdframe+1*8($sp)");
	std	("%f10","$stdframe+2*8($sp)");
	std	("%f11","$stdframe+3*8($sp)");
	std	("%f12","$stdframe+4*8($sp)");
	std	("%f13","$stdframe+5*8($sp)");
	std	("%f14","$stdframe+6*8($sp)");
	std	("%f15","$stdframe+7*8($sp)");
}
	larl	("%r1",".Lconst");
	vgmg	($mask26,38,63);
	vlm	($bswaplo,$bswapmi,"16(%r1)");

	&lt	("%r0","24($ctx)");		# is_base2_26?
	jnz	(".Lskip_init");

	lg	($h0,"32($ctx)");		# load key base 2^64
	lg	($h1,"40($ctx)");

	risbg	($d0,$h0,38,0x80+63,38);	# base 2^64 -> 2^26
	srlg	($d1,$h0,52);
	risbg	($h0,$h0,38,0x80+63,0);
	vlvgg	($R0,$h0,0);
	risbg	($d1,$h1,38,51,12);
	vlvgg	($R1,$d0,0);
	risbg	($d0,$h1,38,63,50);
	vlvgg	($R2,$d1,0);
	srlg	($d1,$h1,40);
	vlvgg	($R3,$d0,0);
	vlvgg	($R4,$d1,0);

	veslg	($S1,$R1,2);
	veslg	($S2,$R2,2);
	veslg	($S3,$R3,2);
	veslg	($S4,$R4,2);
	vlr	($H0,$R0);
	vlr	($H1,$R1);
	vlr	($H2,$R2);
	vlr	($H3,$R3);
	vlr	($H4,$R4);
	vag	($S1,$S1,$R1);			# * 5
	vag	($S2,$S2,$R2);
	vag	($S3,$S3,$R3);
	vag	($S4,$S4,$R4);

	brasl	("%r14","__poly1305_mul");	# r^1:- * r^1:-

	vpdi	($R0,$H0,$R0,0);		# r^2:r^1
	vpdi	($R1,$H1,$R1,0);
	vpdi	($R2,$H2,$R2,0);
	vpdi	($R3,$H3,$R3,0);
	vpdi	($R4,$H4,$R4,0);
	vpdi	($H0,$H0,$H0,0);		# r^2:r^2
	vpdi	($H1,$H1,$H1,0);
	vpdi	($H2,$H2,$H2,0);
	vpdi	($H3,$H3,$H3,0);
	vpdi	($H4,$H4,$H4,0);
	veslg	($S1,$R1,2);
	veslg	($S2,$R2,2);
	veslg	($S3,$R3,2);
	veslg	($S4,$R4,2);
	vag	($S1,$S1,$R1);			# * 5
	vag	($S2,$S2,$R2);
	vag	($S3,$S3,$R3);
	vag	($S4,$S4,$R4);

	brasl	("%r14","__poly1305_mul");	# r^2:r^2 * r^2:r^1

	vl	($I0,"0(%r1)");			# borrow $I0
	vperm	($R0,$R0,$H0,$I0);		# r^2:r^4:r^1:r^3
	vperm	($R1,$R1,$H1,$I0);
	vperm	($R2,$R2,$H2,$I0);
	vperm	($R3,$R3,$H3,$I0);
	vperm	($R4,$R4,$H4,$I0);
	veslf	($S1,$R1,2);
	veslf	($S2,$R2,2);
	veslf	($S3,$R3,2);
	veslf	($S4,$R4,2);
	vaf	($S1,$S1,$R1);			# * 5
	vaf	($S2,$S2,$R2);
	vaf	($S3,$S3,$R3);
	vaf	($S4,$S4,$R4);

	lg	($h0,"0($ctx)");		# load hash base 2^64
	lg	($h1,"8($ctx)");
	lg	($h2,"16($ctx)");

	vzero	($H0);
	vzero	($H1);
	vzero	($H2);
	vzero	($H3);
	vzero	($H4);

	risbg	($d0,$h0,38,0x80+63,38);	# base 2^64 -> 2^26
	srlg	($d1,$h0,52);
	risbg	($h0,$h0,38,0x80+63,0);
	vlvgg	($H0,$h0,0);
	risbg	($d1,$h1,38,51,12);
	vlvgg	($H1,$d0,0);
	risbg	($d0,$h1,38,63,50);
	vlvgg	($H2,$d1,0);
	srlg	($d1,$h1,40);
	vlvgg	($H3,$d0,0);
	risbg	($d1,$h2,37,39,24);
	vlvgg	($H4,$d1,0);

	lhi	("%r0",1);
	st	("%r0","24($ctx)");		# set is_base2_26

	vstm	($R0,$S4,"48($ctx)");		# save key schedule base 2^26
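	# (9 vectors, the r^i powers interleaved with their 5*r^i multiples, at
	# offsets 0x30-0xb0; the .Lskip_init path below reloads them with vlrepg)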

	vpdi	($R0,$R0,$R0,0);		# broadcast r^2:r^4
	vpdi	($R1,$R1,$R1,0);
	vpdi	($S1,$S1,$S1,0);
	vpdi	($R2,$R2,$R2,0);
	vpdi	($S2,$S2,$S2,0);
	vpdi	($R3,$R3,$R3,0);
	vpdi	($S3,$S3,$S3,0);
	vpdi	($R4,$R4,$R4,0);
	vpdi	($S4,$S4,$S4,0);

	j	(".Loaded_hash");

ALIGN	(16);
LABEL	(".Lskip_init");
	vllezf	($H0,"0($ctx)");		# load hash base 2^26
	vllezf	($H1,"4($ctx)");
	vllezf	($H2,"8($ctx)");
	vllezf	($H3,"12($ctx)");
	vllezf	($H4,"16($ctx)");

	vlrepg	($R0,"0x30($ctx)");		# broadcast r^2:r^4
	vlrepg	($R1,"0x40($ctx)");
	vlrepg	($S1,"0x50($ctx)");
	vlrepg	($R2,"0x60($ctx)");
	vlrepg	($S2,"0x70($ctx)");
	vlrepg	($R3,"0x80($ctx)");
	vlrepg	($S3,"0x90($ctx)");
	vlrepg	($R4,"0xa0($ctx)");
	vlrepg	($S4,"0xb0($ctx)");

LABEL	(".Loaded_hash");
	vzero	($I1);
	vzero	($I3);

	vlm	($T1,$T4,"0x00($inp)");		# load first input block
	la	($inp,"0x40($inp)");
	vgmg	($mask26,6,31);
	vgmf	($I4,5,5);			# padbit<<2

	vperm	($I0,$T3,$T4,$bswaplo);
	vperm	($I2,$T3,$T4,$bswapmi);
	vperm	($T3,$T3,$T4,$bswaphi);

	verimg	($I1,$I0,$mask26,6);		# >>26
	veslg	($I0,$I0,32);
	veslg	($I2,$I2,28);			# >>4
	verimg	($I3,$T3,$mask26,18);		# >>14
	verimg	($I4,$T3,$mask26,58);		# >>38
	vn	($I0,$I0,$mask26);
	vn	($I2,$I2,$mask26);
	vesrlf	($I4,$I4,2);			# >>2

	vgmg	($mask26,38,63);
	vperm	($T3,$T1,$T2,$bswaplo);
	vperm	($T4,$T1,$T2,$bswaphi);
	vperm	($T2,$T1,$T2,$bswapmi);

	verimg	($I0,$T3,$mask26,0);
	verimg	($I1,$T3,$mask26,38);		# >>26
	verimg	($I2,$T2,$mask26,60);		# >>4
	verimg	($I3,$T4,$mask26,50);		# >>14
	vesrlg	($T4,$T4,40);
	vo	($I4,$I4,$T4);
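	# ($I0..$I4 now hold, in effect, the 26-bit limbs 0..4 of the four
	# byte-swapped input blocks, with the padbit supplying bit 128 of
	# each block)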

	srlg	("%r0",$len,6);
&{$z?	\&aghi:\&ahi}	("%r0",-1);
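	# (%r0 = number of full 64-byte blocks minus one; the block just
	# loaded and any 16/32/48-byte remainder are handled past the loop)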

ALIGN	(16);
LABEL	(".Loop_vx");
	vmlef		($ACC0,$I0,$R0);
	vmlef		($ACC1,$I0,$R1);
	vmlef		($ACC2,$I0,$R2);
	vmlef		($ACC3,$I0,$R3);
	vmlef		($ACC4,$I0,$R4);

	vmalef		($ACC0,$I1,$S4,$ACC0);
	vmalef		($ACC1,$I1,$R0,$ACC1);
	vmalef		($ACC2,$I1,$R1,$ACC2);
	vmalef		($ACC3,$I1,$R2,$ACC3);
	vmalef		($ACC4,$I1,$R3,$ACC4);

	 vaf		($H2,$H2,$I2);
	 vaf		($H0,$H0,$I0);
	 vaf		($H3,$H3,$I3);
	 vaf		($H1,$H1,$I1);
	 vaf		($H4,$H4,$I4);

	vmalef		($ACC0,$I2,$S3,$ACC0);
	vmalef		($ACC1,$I2,$S4,$ACC1);
	vmalef		($ACC2,$I2,$R0,$ACC2);
	vmalef		($ACC3,$I2,$R1,$ACC3);
	vmalef		($ACC4,$I2,$R2,$ACC4);

	 vlm		($T1,$T4,"0x00($inp)");	# load next input block
	 la		($inp,"0x40($inp)");
	 vgmg		($mask26,6,31);

	vmalef		($ACC0,$I3,$S2,$ACC0);
	vmalef		($ACC1,$I3,$S3,$ACC1);
	vmalef		($ACC2,$I3,$S4,$ACC2);
	vmalef		($ACC3,$I3,$R0,$ACC3);
	vmalef		($ACC4,$I3,$R1,$ACC4);

	 vperm		($I0,$T3,$T4,$bswaplo);
	 vperm		($I2,$T3,$T4,$bswapmi);
	 vperm		($T3,$T3,$T4,$bswaphi);

	vmalef		($ACC0,$I4,$S1,$ACC0);
	vmalef		($ACC1,$I4,$S2,$ACC1);
	vmalef		($ACC2,$I4,$S3,$ACC2);
	vmalef		($ACC3,$I4,$S4,$ACC3);
	vmalef		($ACC4,$I4,$R0,$ACC4);

	 verimg		($I1,$I0,$mask26,6);	# >>26
	 veslg		($I0,$I0,32);
	 veslg		($I2,$I2,28);		# >>4
	 verimg		($I3,$T3,$mask26,18);	# >>14

	vmalof		($ACC0,$H0,$R0,$ACC0);
	vmalof		($ACC1,$H0,$R1,$ACC1);
	vmalof		($ACC2,$H0,$R2,$ACC2);
	vmalof		($ACC3,$H0,$R3,$ACC3);
	vmalof		($ACC4,$H0,$R4,$ACC4);

	 vgmf		($I4,5,5);		# padbit<<2
	 verimg		($I4,$T3,$mask26,58);	# >>38
	 vn		($I0,$I0,$mask26);
	 vn		($I2,$I2,$mask26);
	 vesrlf		($I4,$I4,2);		# >>2

	vmalof		($ACC0,$H1,$S4,$ACC0);
	vmalof		($ACC1,$H1,$R0,$ACC1);
	vmalof		($ACC2,$H1,$R1,$ACC2);
	vmalof		($ACC3,$H1,$R2,$ACC3);
	vmalof		($ACC4,$H1,$R3,$ACC4);

	 vgmg		($mask26,38,63);
	 vperm		($T3,$T1,$T2,$bswaplo);
	 vperm		($T4,$T1,$T2,$bswaphi);
	 vperm		($T2,$T1,$T2,$bswapmi);

	vmalof		($ACC0,$H2,$S3,$ACC0);
	vmalof		($ACC1,$H2,$S4,$ACC1);
	vmalof		($ACC2,$H2,$R0,$ACC2);
	vmalof		($ACC3,$H2,$R1,$ACC3);
	vmalof		($ACC4,$H2,$R2,$ACC4);

	 verimg		($I0,$T3,$mask26,0);
	 verimg		($I1,$T3,$mask26,38);	# >>26
	 verimg		($I2,$T2,$mask26,60);	# >>4

	vmalof		($ACC0,$H3,$S2,$ACC0);
	vmalof		($ACC1,$H3,$S3,$ACC1);
	vmalof		($ACC2,$H3,$S4,$ACC2);
	vmalof		($ACC3,$H3,$R0,$ACC3);
	vmalof		($ACC4,$H3,$R1,$ACC4);

	 verimg		($I3,$T4,$mask26,50);	# >>14
	 vesrlg		($T4,$T4,40);
	 vo		($I4,$I4,$T4);

	vmalof		($ACC0,$H4,$S1,$ACC0);
	vmalof		($ACC1,$H4,$S2,$ACC1);
	vmalof		($ACC2,$H4,$S3,$ACC2);
	vmalof		($ACC3,$H4,$S4,$ACC3);
	vmalof		($ACC4,$H4,$R0,$ACC4);

	################################################################
	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	# and P. Schwabe

	vesrlg		($H4,$ACC3,26);
	vesrlg		($H1,$ACC0,26);
	vn		($H3,$ACC3,$mask26);
	vn		($H0,$ACC0,$mask26);
	vag		($H4,$H4,$ACC4);	# h3 -> h4
	vag		($H1,$H1,$ACC1);	# h0 -> h1

	vesrlg		($ACC4,$H4,26);
	vesrlg		($ACC1,$H1,26);
	vn		($H4,$H4,$mask26);
	vn		($H1,$H1,$mask26);
	vag		($H0,$H0,$ACC4);
	vag		($H2,$ACC2,$ACC1);	# h1 -> h2

	veslg		($ACC4,$ACC4,2);	# <<2
	vesrlg		($ACC2,$H2,26);
	vn		($H2,$H2,$mask26);
	vag		($H0,$H0,$ACC4);	# h4 -> h0
	vag		($H3,$H3,$ACC2);	# h2 -> h3

	vesrlg		($ACC0,$H0,26);
	vesrlg		($ACC3,$H3,26);
	vn		($H0,$H0,$mask26);
	vn		($H3,$H3,$mask26);
	vag		($H1,$H1,$ACC0);	# h0 -> h1
	vag		($H4,$H4,$ACC3);	# h3 -> h4

&{$z?	\&brctg:\&brct}	("%r0",".Loop_vx");

	vlm	($R0,$S4,"48($ctx)");		# load all powers

	lghi	("%r0",0x30);
&{$z?	\&lcgr:\&lcr}	($len,$len);
&{$z?	\&ngr:\&nr}	($len,"%r0");
&{$z?	\&slgr:\&slr}	($inp,$len);
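	# ($len is now 64 minus the leftover byte count, i.e. 0x10/0x20/0x30 or
	# 0, and doubles as index into the magic tail masks at .Lconst; $inp is
	# backed up so that the final 64-byte load ends exactly at the end of
	# the input)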

LABEL	(".Last");
	vmlef	($ACC0,$I0,$R0);
	vmlef	($ACC1,$I0,$R1);
	vmlef	($ACC2,$I0,$R2);
	vmlef	($ACC3,$I0,$R3);
	vmlef	($ACC4,$I0,$R4);

	vmalef	($ACC0,$I1,$S4,$ACC0);
	vmalef	($ACC1,$I1,$R0,$ACC1);
	vmalef	($ACC2,$I1,$R1,$ACC2);
	vmalef	($ACC3,$I1,$R2,$ACC3);
	vmalef	($ACC4,$I1,$R3,$ACC4);

	 vaf	($H0,$H0,$I0);
	 vaf	($H1,$H1,$I1);
	 vaf	($H2,$H2,$I2);
	 vaf	($H3,$H3,$I3);
	 vaf	($H4,$H4,$I4);

	vmalef	($ACC0,$I2,$S3,$ACC0);
	vmalef	($ACC1,$I2,$S4,$ACC1);
	vmalef	($ACC2,$I2,$R0,$ACC2);
	vmalef	($ACC3,$I2,$R1,$ACC3);
	vmalef	($ACC4,$I2,$R2,$ACC4);

	vmalef	($ACC0,$I3,$S2,$ACC0);
	vmalef	($ACC1,$I3,$S3,$ACC1);
	vmalef	($ACC2,$I3,$S4,$ACC2);
	vmalef	($ACC3,$I3,$R0,$ACC3);
	vmalef	($ACC4,$I3,$R1,$ACC4);

	vmalef	($ACC0,$I4,$S1,$ACC0);
	vmalef	($ACC1,$I4,$S2,$ACC1);
	vmalef	($ACC2,$I4,$S3,$ACC2);
	vmalef	($ACC3,$I4,$S4,$ACC3);
	vmalef	($ACC4,$I4,$R0,$ACC4);

	vmalof	($ACC0,$H0,$R0,$ACC0);
	vmalof	($ACC1,$H0,$R1,$ACC1);
	vmalof	($ACC2,$H0,$R2,$ACC2);
	vmalof	($ACC3,$H0,$R3,$ACC3);
	vmalof	($ACC4,$H0,$R4,$ACC4);

	vmalof	($ACC0,$H1,$S4,$ACC0);
	vmalof	($ACC1,$H1,$R0,$ACC1);
	vmalof	($ACC2,$H1,$R1,$ACC2);
	vmalof	($ACC3,$H1,$R2,$ACC3);
	vmalof	($ACC4,$H1,$R3,$ACC4);

	vmalof	($ACC0,$H2,$S3,$ACC0);
	vmalof	($ACC1,$H2,$S4,$ACC1);
	vmalof	($ACC2,$H2,$R0,$ACC2);
	vmalof	($ACC3,$H2,$R1,$ACC3);
	vmalof	($ACC4,$H2,$R2,$ACC4);

	vmalof	($ACC0,$H3,$S2,$ACC0);
	vmalof	($ACC1,$H3,$S3,$ACC1);
	vmalof	($ACC2,$H3,$S4,$ACC2);
	vmalof	($ACC3,$H3,$R0,$ACC3);
	vmalof	($ACC4,$H3,$R1,$ACC4);

	vmalof	($ACC0,$H4,$S1,$ACC0);
	vmalof	($ACC1,$H4,$S2,$ACC1);
	vmalof	($ACC2,$H4,$S3,$ACC2);
	vmalof	($ACC3,$H4,$S4,$ACC3);
	vmalof	($ACC4,$H4,$R0,$ACC4);

	################################################################
	# horizontal addition

	vzero	($H0);
	vsumqg	($ACC0,$ACC0,$H0);
	vsumqg	($ACC1,$ACC1,$H0);
	vsumqg	($ACC2,$ACC2,$H0);
	vsumqg	($ACC3,$ACC3,$H0);
	vsumqg	($ACC4,$ACC4,$H0);
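	# (with $H0 zeroed, vsumqg reduces each accumulator to a single 128-bit
	# sum of its doubleword halves, collapsing the parallel lanes)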

	################################################################
	# lazy reduction

	vesrlg	($H4,$ACC3,26);
	vesrlg	($H1,$ACC0,26);
	vn	($H3,$ACC3,$mask26);
	vn	($H0,$ACC0,$mask26);
	vag	($H4,$H4,$ACC4);		# h3 -> h4
	vag	($H1,$H1,$ACC1);		# h0 -> h1

	vesrlg	($ACC4,$H4,26);
	vesrlg	($ACC1,$H1,26);
	vn	($H4,$H4,$mask26);
	vn	($H1,$H1,$mask26);
	vag	($H0,$H0,$ACC4);
	vag	($H2,$ACC2,$ACC1);		# h1 -> h2

	veslg	($ACC4,$ACC4,2);		# <<2
	vesrlg	($ACC2,$H2,26);
	vn	($H2,$H2,$mask26);
	vag	($H0,$H0,$ACC4);		# h4 -> h0
	vag	($H3,$H3,$ACC2);		# h2 -> h3

	vesrlg	($ACC0,$H0,26);
	vesrlg	($ACC3,$H3,26);
	vn	($H0,$H0,$mask26);
	vn	($H3,$H3,$mask26);
	vag	($H1,$H1,$ACC0);		# h0 -> h1
	vag	($H4,$H4,$ACC3);		# h3 -> h4

&{$z?	\&clgfi:\&clfi} ($len,0);
	je	(".Ldone");

	vlm	($T1,$T4,"0x00($inp)");		# load last partial block
	vgmg	($mask26,6,31);
	vgmf	($I4,5,5);			# padbit<<2

	vperm	($I0,$T3,$T4,$bswaplo);
	vperm	($I2,$T3,$T4,$bswapmi);
	vperm	($T3,$T3,$T4,$bswaphi);

	vl	($ACC0,"0x30($len,%r1)");	# borrow $ACC0,1
	vl	($ACC1,"0x60($len,%r1)");

	verimg	($I1,$I0,$mask26,6);		# >>26
	veslg	($I0,$I0,32);
	veslg	($I2,$I2,28);			# >>4
	verimg	($I3,$T3,$mask26,18);		# >>14
	verimg	($I4,$T3,$mask26,58);		# >>38
	vn	($I0,$I0,$mask26);
	vn	($I2,$I2,$mask26);
	vesrlf	($I4,$I4,2);			# >>2

	vgmg	($mask26,38,63);
	vperm	($T3,$T1,$T2,$bswaplo);
	vperm	($T4,$T1,$T2,$bswaphi);
	vperm	($T2,$T1,$T2,$bswapmi);

	verimg	($I0,$T3,$mask26,0);
	verimg	($I1,$T3,$mask26,38);		# >>26
	verimg	($I2,$T2,$mask26,60);		# >>4
	verimg	($I3,$T4,$mask26,50);		# >>14
	vesrlg	($T4,$T4,40);
	vo	($I4,$I4,$T4);

	vperm	($H0,$H0,$H0,$ACC0);		# move hash to right lane
	vn	($I0,$I0,$ACC1);		# mask redundant lane[s]
	vperm	($H1,$H1,$H1,$ACC0);
	vn	($I1,$I1,$ACC1);
	vperm	($H2,$H2,$H2,$ACC0);
	vn	($I2,$I2,$ACC1);
	vperm	($H3,$H3,$H3,$ACC0);
	vn	($I3,$I3,$ACC1);
	vperm	($H4,$H4,$H4,$ACC0);
	vn	($I4,$I4,$ACC1);

	vaf	($I0,$I0,$H0);			# accumulate hash
	vzero	($H0);				# wipe hash value
	vaf	($I1,$I1,$H1);
	vzero	($H1);
	vaf	($I2,$I2,$H2);
	vzero	($H2);
	vaf	($I3,$I3,$H3);
	vzero	($H3);
	vaf	($I4,$I4,$H4);
	vzero	($H4);

&{$z?	\&lghi:\&lhi}	($len,0);
	j	(".Last");
	# I don't bother to tell apart cases when only one multiplication
	# pass is sufficient, because I argue that mispredicted branch
	# penalties are comparable to overhead of sometimes redundant
	# multiplication pass...

LABEL	(".Ldone");
	vstef	($H0,"0($ctx)",3);		# store hash base 2^26
	vstef	($H1,"4($ctx)",3);
	vstef	($H2,"8($ctx)",3);
	vstef	($H3,"12($ctx)",3);
	vstef	($H4,"16($ctx)",3);

if ($z) {
	ld	("%f8","$stdframe+0*8($sp)");
	ld	("%f9","$stdframe+1*8($sp)");
	ld	("%f10","$stdframe+2*8($sp)");
	ld	("%f11","$stdframe+3*8($sp)");
	ld	("%f12","$stdframe+4*8($sp)");
	ld	("%f13","$stdframe+5*8($sp)");
	ld	("%f14","$stdframe+6*8($sp)");
	ld	("%f15","$stdframe+7*8($sp)");
&{$z?	\&lmg:\&lm}	("%r10","%r15","$stdframe+8*8+10*$SIZE_T($sp)");
} else {
	ld	("%f4","$stdframe+16*$SIZE_T+2*8($sp)");
	ld	("%f6","$stdframe+16*$SIZE_T+3*8($sp)");
&{$z?	\&lmg:\&lm}	("%r10","%r15","$stdframe+10*$SIZE_T($sp)");
}
	br	("%r14");
SIZE	("__poly1305_blocks_vx",".-__poly1305_blocks_vx");
}

################
# static void poly1305_emit(void *ctx, unsigned char mac[16],
#                           const u32 nonce[4])
{
my ($mac,$nonce)=($inp,$len);
my ($h0,$h1,$h2,$d0,$d1,$d2)=map("%r$_",(5..10));

GLOBL	("poly1305_emit");
TYPE	("poly1305_emit","\@function");
ALIGN	(16);
LABEL	("poly1305_emit");
LABEL	(".Lpoly1305_emit");
&{$z?	\&stmg:\&stm}	("%r6","%r10","6*$SIZE_T($sp)");

	lg	($d0,"0($ctx)");
	lg	($d1,"8($ctx)");
	lg	($d2,"16($ctx)");

	llgfr	("%r0",$d0);			# base 2^26 -> base 2^64
	srlg	($h0,$d0,32);
	llgfr	("%r1",$d1);
	srlg	($h1,$d1,32);
	srlg	($h2,$d2,32);

	sllg	("%r0","%r0",26);
	algr	($h0,"%r0");
	sllg	("%r0",$h1,52);
	srlg	($h1,$h1,12);
	sllg	("%r1","%r1",14);
	algr	($h0,"%r0");
	alcgr	($h1,"%r1");
	sllg	("%r0",$h2,40);
	srlg	($h2,$h2,24);
	lghi	("%r1",0);
	algr	($h1,"%r0");
	alcgr	($h2,"%r1");

	llgf	("%r0","24($ctx)");		# is_base2_26
	lcgr	("%r0","%r0");

	xgr	($h0,$d0);			# choose between radixes
	xgr	($h1,$d1);
	xgr	($h2,$d2);
	ngr	($h0,"%r0");
	ngr	($h1,"%r0");
	ngr	($h2,"%r0");
	xgr	($h0,$d0);
	xgr	($h1,$d1);
	xgr	($h2,$d2);

	lghi	("%r0",5);
	lgr	($d0,$h0);
	lgr	($d1,$h1);

	algr	($h0,"%r0");			# compare to modulus
	alcgr	($h1,"%r1");
	alcgr	($h2,"%r1");

	srlg	($h2,$h2,2);			# did it borrow/carry?
	slgr	("%r1",$h2);			# 0-$h2>>2
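	# (%r1 is now an all-ones mask if h >= 2^130-5; the masked xors below
	# then pick the reduced value h+5 mod 2^128 rather than h)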
	lg	($d2,"0($nonce)");		# load nonce
	lg	($ctx,"8($nonce)");

	xgr	($h0,$d0);
	xgr	($h1,$d1);
	ngr	($h0,"%r1");
	ngr	($h1,"%r1");
	xgr	($h0,$d0);
	rllg	($d0,$d2,32);			# flip nonce words
	xgr	($h1,$d1);
	rllg	($d1,$ctx,32);

	algr	($h0,$d0);			# accumulate nonce
	alcgr	($h1,$d1);

	strvg	($h0,"0($mac)");		# write little-endian result
	strvg	($h1,"8($mac)");

&{$z?	\&lmg:\&lm}	("%r6","%r10","6*$SIZE_T($sp)");
	br	("%r14");
SIZE	("poly1305_emit",".-poly1305_emit");
}

################

ALIGN	(16);
LABEL	(".Lconst");
LONG	(0x04050607,0x14151617,0x0c0d0e0f,0x1c1d1e1f);	# merge odd
LONG	(0x07060504,0x03020100,0x17161514,0x13121110);	# byte swap masks
LONG	(0x0f0e0d0c,0x0b0a0908,0x1f1e1d1c,0x1b1a1918);
LONG	(0x00000000,0x09080706,0x00000000,0x19181716);

LONG	(0x00000000,0x00000000,0x00000000,0x0c0d0e0f);	# magic tail masks
LONG	(0x0c0d0e0f,0x00000000,0x00000000,0x00000000);
LONG	(0x00000000,0x00000000,0x0c0d0e0f,0x00000000);

LONG	(0xffffffff,0x00000000,0xffffffff,0xffffffff);
LONG	(0xffffffff,0x00000000,0xffffffff,0x00000000);
LONG	(0x00000000,0x00000000,0xffffffff,0x00000000);

STRING	("\"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");

PERLASM_END();