1#! /usr/bin/env perl
2# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# March 2010
18#
19# The module implements "4-bit" GCM GHASH function and underlying
20# single multiplication operation in GF(2^128). "4-bit" means that it
21# uses 256 bytes per-key table [+128 bytes shared table]. Performance
22# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
23# and are expressed in cycles per processed byte, less is better:
24#
25#		gcc 3.3.x	cc 5.2		this assembler
26#
27# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
28# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
29#
30# Here is data collected on UltraSPARC T1 system running Linux:
31#
32#		gcc 4.4.1			this assembler
33#
34# 32-bit build	566				50	(+1000%)
35# 64-bit build	56				50	(+12%)
36#
37# I don't quite understand why difference between 32-bit and 64-bit
38# compiler-generated code is so big. Compilers *were* instructed to
39# generate code for UltraSPARC and should have used 64-bit registers
40# for Z vector (see C code) even in 32-bit build... Oh well, it only
41# means more impressive improvement coefficients for this assembler
42# module;-) Loops are aggressively modulo-scheduled in respect to
43# references to input data and Z.hi updates to achieve 12 cycles
44# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
45# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
46#
47# October 2012
48#
49# Add VIS3 lookup-table-free implementation using polynomial
50# multiplication xmulx[hi] and extended addition addxc[cc]
51# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
52# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
53# saturates at ~15.5x single-process result on 8-core processor,
54# or ~20.5GBps per 2.85GHz socket.
55
# Output file is the last command-line argument (perlasm convention);
# everything this script prints goes there.  Use three-arg open and
# check the result instead of the unchecked two-arg form.
$output=pop;
open STDOUT,">",$output or die "can't open $output: $!";

# Stack-layout macros; expanded at assembly time by sparc_arch.h
# (included at the top of the generated code).
$frame="STACK_FRAME";
$bias="STACK_BIAS";

# Register map for the 4-bit table-driven routines below.
$Zhi="%o0";	# 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0";	# small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";
# gcm_ghash_4bit(Xi, Htable, inp, len): streamed 4-bit table-driven GHASH.
# Processes `len` bytes from `inp` and folds them into the 16-byte block at
# Xi (stored back via the two stx at .Louter/.Ldone).  The inner loop is
# modulo-scheduled (see header comment): byte loads for index $cnt overlap
# the reduction of the previous nibble pair, targeting the 12 cycles/byte
# quoted above.  rem_4bit holds the per-nibble reduction constants.
$code.=<<___;
#include "sparc_arch.h"

#ifdef  __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	SIZE_T_CC,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___
238
# gcm_gmult_4bit below takes only (Xi, Htable); drop the $inp/$len
# register aliases so they cannot be referenced by mistake past this point.
undef $inp;
undef $len;
241
# gcm_gmult_4bit(Xi, Htable): single GF(2^128) multiplication of the
# 16-byte block at Xi by H, using the same 4-bit tables and the same
# modulo-scheduled inner-loop structure as gcm_ghash_4bit above, minus
# the input-stream handling (bytes come from Xi itself, not $inp).
$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___
346
347{{{
348# Straightforward 128x128-bit multiplication using Karatsuba algorithm
349# followed by pair of 64-bit reductions [with a shortcut in first one,
350# which allowed to break dependency between reductions and remove one
351# multiplication from critical path]. While it might be suboptimal
352# with regard to sheer number of multiplications, other methods [such
353# as aggregate reduction] would require more 64-bit registers, which
354# we don't have in 32-bit application context.
355
356($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
357
358($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
359	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));
360
361($shl,$shr)=map("%l$_",(0..7));
362
363# For details regarding "twisted H" see ghash-x86.pl.
364$code.=<<___;
365.globl	gcm_init_vis3
366.align	32
367gcm_init_vis3:
368	save	%sp,-$frame,%sp
369
370	ldx	[%i1+0],$Hhi
371	ldx	[%i1+8],$Hlo
372	mov	0xE1,$Xhi
373	mov	1,$Xlo
374	sllx	$Xhi,57,$Xhi
375	srax	$Hhi,63,$C0		! broadcast carry
376	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
377	addxc	$Hhi,$Hhi,$Hhi
378	and	$C0,$Xlo,$Xlo
379	and	$C0,$Xhi,$Xhi
380	xor	$Xlo,$Hlo,$Hlo
381	xor	$Xhi,$Hhi,$Hhi
382	stx	$Hlo,[%i0+8]		! save twisted H
383	stx	$Hhi,[%i0+0]
384
385	sethi	%hi(0xA0406080),$V
386	sethi	%hi(0x20C0E000),%l0
387	or	$V,%lo(0xA0406080),$V
388	or	%l0,%lo(0x20C0E000),%l0
389	sllx	$V,32,$V
390	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
391	stx	$V,[%i0+16]
392
393	ret
394	restore
395.type	gcm_init_vis3,#function
396.size	gcm_init_vis3,.-gcm_init_vis3
397
398.globl	gcm_gmult_vis3
399.align	32
400gcm_gmult_vis3:
401	save	%sp,-$frame,%sp
402
403	ldx	[$Xip+8],$Xlo		! load Xi
404	ldx	[$Xip+0],$Xhi
405	ldx	[$Htable+8],$Hlo	! load twisted H
406	ldx	[$Htable+0],$Hhi
407
408	mov	0xE1,%l7
409	sllx	%l7,57,$xE1		! 57 is not a typo
410	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000
411
412	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
413	xmulx	$Xlo,$Hlo,$C0
414	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
415	xmulx	$C2,$Hhl,$C1
416	xmulxhi	$Xlo,$Hlo,$Xlo
417	xmulxhi	$C2,$Hhl,$C2
418	xmulxhi	$Xhi,$Hhi,$C3
419	xmulx	$Xhi,$Hhi,$Xhi
420
421	sll	$C0,3,$sqr
422	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
423	xor	$C0,$sqr,$sqr
424	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]
425
426	xor	$C0,$C1,$C1		! Karatsuba post-processing
427	xor	$Xlo,$C2,$C2
428	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1
429	xor	$C3,$C2,$C2
430	xor	$Xlo,$C1,$C1
431	xor	$Xhi,$C2,$C2
432	xor	$Xhi,$C1,$C1
433
434	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
435	 xor	$C0,$C2,$C2
436	xmulx	$C1,$xE1,$C0
437	 xor	$C1,$C3,$C3
438	xmulxhi	$C1,$xE1,$C1
439
440	xor	$Xlo,$C2,$C2
441	xor	$C0,$C2,$C2
442	xor	$C1,$C3,$C3
443
444	stx	$C2,[$Xip+8]		! save Xi
445	stx	$C3,[$Xip+0]
446
447	ret
448	restore
449.type	gcm_gmult_vis3,#function
450.size	gcm_gmult_vis3,.-gcm_gmult_vis3
451
452.globl	gcm_ghash_vis3
453.align	32
454gcm_ghash_vis3:
455	save	%sp,-$frame,%sp
456	nop
457	srln	$len,0,$len		! needed on v8+, "nop" on v9
458
459	ldx	[$Xip+8],$C2		! load Xi
460	ldx	[$Xip+0],$C3
461	ldx	[$Htable+8],$Hlo	! load twisted H
462	ldx	[$Htable+0],$Hhi
463
464	mov	0xE1,%l7
465	sllx	%l7,57,$xE1		! 57 is not a typo
466	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000
467
468	and	$inp,7,$shl
469	andn	$inp,7,$inp
470	sll	$shl,3,$shl
471	prefetch [$inp+63], 20
472	sub	%g0,$shl,$shr
473
474	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
475.Loop:
476	ldx	[$inp+8],$Xlo
477	brz,pt	$shl,1f
478	ldx	[$inp+0],$Xhi
479
480	ldx	[$inp+16],$C1		! align data
481	srlx	$Xlo,$shr,$C0
482	sllx	$Xlo,$shl,$Xlo
483	sllx	$Xhi,$shl,$Xhi
484	srlx	$C1,$shr,$C1
485	or	$C0,$Xhi,$Xhi
486	or	$C1,$Xlo,$Xlo
4871:
488	add	$inp,16,$inp
489	sub	$len,16,$len
490	xor	$C2,$Xlo,$Xlo
491	xor	$C3,$Xhi,$Xhi
492	prefetch [$inp+63], 20
493
494	xmulx	$Xlo,$Hlo,$C0
495	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
496	xmulx	$C2,$Hhl,$C1
497	xmulxhi	$Xlo,$Hlo,$Xlo
498	xmulxhi	$C2,$Hhl,$C2
499	xmulxhi	$Xhi,$Hhi,$C3
500	xmulx	$Xhi,$Hhi,$Xhi
501
502	sll	$C0,3,$sqr
503	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
504	xor	$C0,$sqr,$sqr
505	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]
506
507	xor	$C0,$C1,$C1		! Karatsuba post-processing
508	xor	$Xlo,$C2,$C2
509	 xor	$sqr,$Xlo,$Xlo		! real destination is $C1
510	xor	$C3,$C2,$C2
511	xor	$Xlo,$C1,$C1
512	xor	$Xhi,$C2,$C2
513	xor	$Xhi,$C1,$C1
514
515	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
516	 xor	$C0,$C2,$C2
517	xmulx	$C1,$xE1,$C0
518	 xor	$C1,$C3,$C3
519	xmulxhi	$C1,$xE1,$C1
520
521	xor	$Xlo,$C2,$C2
522	xor	$C0,$C2,$C2
523	brnz,pt	$len,.Loop
524	xor	$C1,$C3,$C3
525
526	stx	$C2,[$Xip+8]		! save Xi
527	stx	$C3,[$Xip+0]
528
529	ret
530	restore
531.type	gcm_ghash_vis3,#function
532.size	gcm_ghash_vis3,.-gcm_ghash_vis3
533___
534}}}
# Identification string appended after all code (note escaped @ so Perl
# does not interpolate it inside the heredoc).
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
539
540
# The purpose of this subroutine is to explicitly encode VIS instructions,
# so that the module can be compiled without specifying VIS extensions on
# the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.  The idea is to
# keep the option of producing a "universal" binary open and let the
# programmer detect at run-time whether the current CPU is VIS-capable.
# Hand-encode one VIS3 instruction as a raw .word directive.
# Arguments: mnemonic and its three operands ($rs1, $rs2, $rd).
# Unknown mnemonics, or operands that are not integer registers
# (%g/%o/%l/%i + digit), are returned as the original assembly text.
sub unvis3 {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;
    my %visopf = (
	addxc	=> 0x011,
	addxccc	=> 0x013,
	xmulx	=> 0x115,
	xmulxhi	=> 0x116,
    );
    # Register-window bias: %gN -> N, %oN -> 8+N, %lN -> 16+N, %iN -> 24+N.
    my %bias = (g => 0, o => 8, l => 16, i => 24);

    my $asm = "$mnemonic\t$rs1,$rs2,$rd";
    my $opf = $visopf{$mnemonic} or return $asm;

    my @regs;
    for my $operand ($rs1, $rs2, $rd) {
	return $asm unless $operand =~ /%([goli])([0-9])/;
	push @regs, $bias{$1} + $2;
    }
    my ($r1, $r2, $r3) = @regs;

    # IMPL-dep format: op=2, op3=0x36, opf selects the VIS3 operation.
    return sprintf ".word\t0x%08x !%s",
		   0x81b00000 | $r3<<25 | $r1<<14 | $opf<<5 | $r2,
		   $asm;
}
570
# Post-process and emit the accumulated code line by line:
#  1) evaluate `...` constant expressions (the rem_4bit table entries);
#  2) rewrite VIS3 mnemonics (xmulx/xmulxhi/addxc/addxccc) with three
#     integer-register operands into .word encodings via unvis3(), so no
#     VIS3-aware assembler is required.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

# STDOUT was reopened onto the output file; a failed close means a
# buffered write failed, so treat it as fatal.
close STDOUT or die "error closing STDOUT: $!";
582