#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [plus a shared 128-byte
# table]. Performance results are for the streamed GHASH subroutine
# on an UltraSPARC pre-Tx CPU and are expressed in cycles per
# processed byte, less is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on an UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1			this assembler
#
# 32-bit build	566				50	(+1000%)
# 64-bit build	56				50	(+12%)
#
# I don't quite understand why the difference between 32-bit and
# 64-bit compiler-generated code is so big. The compilers *were*
# instructed to generate code for UltraSPARC and should have used
# 64-bit registers for the Z vector (see C code) even in the 32-bit
# build... Oh well, it only means more impressive improvement
# coefficients for this assembler module;-) Loops are aggressively
# modulo-scheduled with respect to references to input data and Z.hi
# updates to achieve 12-cycle timing. For comparison, sha1-sparcv9.pl
# spends 11.6 cycles to process one byte on an UltraSPARC pre-Tx CPU
# and ~24 on T1.

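# For reference, the "4-bit" (Shoup) multiplication that both routines
# below implement looks roughly like the following C sketch. It is a
# simplified 64-bit-only illustration in the spirit of the C code in
# crypto/modes/gcm128.c, not the actual OpenSSL source: Htable holds
# the product of H and i for every 4-bit index i, and rem_4bit folds
# the four bits shifted out of Z back in modulo the GHASH polynomial.
# Bytes are consumed from the end of the block, low nibble before
# high nibble, exactly as the assembly below does.
#
#	/* needs <stdint.h>; names are illustrative only */
#	typedef struct { uint64_t hi, lo; } u128;
#
#	static void gmult_4bit_sketch(uint8_t Xi[16],
#				const u128 Htable[16],
#				const uint64_t rem_4bit[16])
#	{
#		int b = Xi[15], cnt = 15;
#		u128 Z = Htable[b & 0xf];	/* low nibble of last byte */
#		uint64_t rem;
#
#		for (;;) {
#			rem  = Z.lo & 0xf;	/* bits about to fall off */
#			Z.lo = (Z.hi << 60) | (Z.lo >> 4);
#			Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
#			Z.hi ^= Htable[b >> 4].hi;	/* high nibble */
#			Z.lo ^= Htable[b >> 4].lo;
#
#			if (--cnt < 0) break;
#			b = Xi[cnt];
#
#			rem  = Z.lo & 0xf;
#			Z.lo = (Z.hi << 60) | (Z.lo >> 4);
#			Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
#			Z.hi ^= Htable[b & 0xf].hi;	/* low nibble */
#			Z.lo ^= Htable[b & 0xf].lo;
#		}
#		/* Z is the new Xi; the real code stores it back */
#	}
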
$bits=32;
for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64)  { $bias=2047; $frame=192; }	# 2047 is the SPARC V9 ABI stack bias
else            { $bias=0;    $frame=112; }

$output=shift;
open STDOUT,">$output" or die "can't open $output: $!";

$Zhi="%o0";	# 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0";	# small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";	# Htbl+8, the low halves of the 16-byte table entries
$cnt="%l7";

$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";

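# The rem_4bit table emitted below is the shared 128-byte table from
# the note above: sixteen 8-byte entries, one per value of the four
# bits shifted out of Z. Entry i is the carry-less (XOR-only) multiple
# of 0x1C20 by i, kept in the top 16 bits of the 64-bit word, 0x1C20
# being the pre-shifted image of the GHASH reduction polynomial. A
# throwaway generator sketch, shown only to document the constants:
#
#	uint64_t rem_4bit_entry(int i)	/* i = 0..15 */
#	{
#		uint32_t v = 0;
#		for (int b = 0; b < 4; b++)
#			if (i & (1 << b))
#				v ^= 0x1C20 << b;
#		return (uint64_t)v << 48;	/* top 16 bits of the ldx */
#	}
#
# Htbl itself is the 256-byte per-key table: sixteen 16-byte entries,
# high half at offset 0 and low half at offset 8, which is why the
# code keeps the second pointer Htblo = Htbl+8.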
$code.=<<___;
.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len		! len now points at the end of input
	add	$Htbl,8,$Htblo

1:	call	.+8			! call lands on next insn, %o7 gets 1:
	add	%o7,rem_4bit-1b,$rem_4bit	! PIC-safe address of rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo		! input byte xor Xi byte
	and	$nlo,0xf0,$nhi		! high nibble, already scaled by 16
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo		! scale to 16-byte Htbl entries
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi		! scale to 8-byte rem_4bit entries
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo		! executed in the branch delay slot
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo	! modulo-scheduled load of next byte
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt		! byte index runs from 13 down to 0
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner		! loop while cnt is still >= 0
	and	$Zlo,0xf,$remi		! delay slot

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo		! modulo-scheduled load for next block
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0		! pre-split bytes 15 and 14 of new Xi
	ba	.Louter
	and	$xi1,0xff,$xi1		! delay slot
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

undef $inp;		# gcm_gmult_4bit takes no inp/len arguments
undef $len;

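# gcm_gmult_4bit below is the single-block variant: it multiplies Xi
# by H in place, hence the undefs above. The C-side prototypes these
# two routines are expected to match (as declared in OpenSSL's
# crypto/modes/gcm128.c, with u64/u128/u8 the types used there) are
# along these lines:
#
#	void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
#	void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
#				const u8 *inp, size_t len);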
$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
.asciz	"GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;	# evaluate backticked expressions
print $code;
close STDOUT;