#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses a 256-byte per-key table [plus a 128-byte shared table]. Even though
# the loops are aggressively modulo-scheduled with respect to references
# to Htbl and updates of Z.hi, targeting 8 cycles per byte, measured
# performance is ~12 cycles per processed byte on a 21264 CPU. This seems
# to be a dynamic scheduling "glitch," because uprofile(1) indicates a
# uniform sample distribution, as if all instruction bundles executed in
# 1.5 cycles each. In other words it could have been even faster; still,
# 12 cycles is ~60% better than gcc-generated code and ~80% better than
# code generated by the vendor compiler.

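# For reference, the nibble-at-a-time step that the assembly below
# implements can be modelled in a few lines of Perl. This is only an
# illustrative sketch and is never called by the module: the names
# ghash_step_model and @rem_4bit_model are made up for this sketch, and
# a perl with 64-bit integers is assumed. Each step shifts Z right by
# four bits, folds the four bits that fall off Z.lo back in through the
# shared rem_4bit reduction table, and XORs in the Htbl entry selected
# by the current nibble of Xi.

my @rem_4bit_model = map { $_ << 48 }	# same constants as rem_4bit below
	(0x0000,0x1C20,0x3840,0x2460,0x7080,0x6CA0,0x48C0,0x54E0,
	 0xE100,0xFD20,0xD940,0xC560,0x9180,0x8DA0,0xA9C0,0xB5E0);

sub ghash_step_model {			# illustrative only, never called
	my ($Zhi,$Zlo,$Thi,$Tlo) = @_;	# current Z and selected Htbl entry
	my $rem = $Zlo & 0xf;		# the four bits about to fall off Z.lo
	$Zlo = ($Zlo >> 4) | (($Zhi & 0xf) << 60);
	$Zhi = ($Zhi >> 4) ^ $rem_4bit_model[$rem];
	return ($Zhi ^ $Thi, $Zlo ^ $Tlo);
}
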
$cnt="v0";	# $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3";	# $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7";	# $8
#################
$Xi="a0";	# $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4";	# $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10";	# $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT";	# $28

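# loop() emits the body of one GF(2^128) multiplication, Z = Xi * H,
# four bits at a time. It starts with the top byte of the quadword
# loaded from 8($Xi) (i.e. the last byte of Xi), runs .Looplo over the
# rest of that quadword and .Loophi over the quadword from 0($Xi),
# handling the low nibble and then the high nibble of each byte. $N
# keeps the emitted local labels unique across the two call sites.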
{ my $N;
  sub loop() {

	$N++;
$code.=<<___;
.align	4
	# Z is seeded directly from the Htbl entry for the low nibble of
	# the last byte of Xi, so no initial XOR is needed
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

	# modulo-scheduled loop over the remaining bytes of the quadword
	# from 8($Xi), two nibbles per iteration
.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N

	# bridge to the quadword loaded from 0($Xi)
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop

	# modulo-scheduled loop over the remaining bytes of the quadword
	# from 0($Xi)
.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N

	# epilogue: fold in the last table entries and finish the reduction
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}

$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
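	# void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
	#
	# Multiply the hash value Xi in place by H, represented by the
	# 16-entry per-key table Htable (prototype as declared by
	# OpenSSL's GCM code).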
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	bsr	$t0,picmeup	# load the address of rem_4bit into AT
	nop
___

	&loop();

$code.=<<___;
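	# Alpha has no byte-swap instruction, so the conversion of Z back
	# to the big-endian byte order of Xi is synthesized below: srl/sll
	# with zapnot masks reverse the bytes within each 32-bit half, and
	# the final srl/sll by 32 exchanges the halves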
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___

$inhi="s0";
$inlo="s1";

$code.=<<___;
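	# void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
	#                     const u8 *inp,size_t len);
	#
	# Fold len bytes at inp into the hash value Xi, 16 bytes at a
	# time; len is assumed to be a multiple of 16 (prototype as
	# declared by OpenSSL's GCM code).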
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32	# ra, s0 and s1 are saved
	.frame	sp,32,ra
	.prologue 0

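	# inp may be unaligned: each half of the block is fetched with an
	# ldq_u pair here and stitched together with extql/extqh at the
	# top of .Louter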
	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	bsr	$t0,picmeup	# load the address of rem_4bit into AT
	nop

	# each iteration folds one 16-byte block into Xi and multiplies
	# the result by H
.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

	&loop();

$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/	# ra was never clobbered, no need to reload it
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

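	# picmeup returns the address of the rem_4bit table in the AT
	# register: the br deposits the address of .Lpic, and the lda then
	# advances it past the 12 bytes of code between .Lpic and rem_4bit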
.align	4
.ent	picmeup
picmeup:
	.frame	sp,0,$t0
	.prologue 0
	br	$rem_4bit,.Lpic
.Lpic:	lda	$rem_4bit,12($rem_4bit)
	ret	($t0)
.end	picmeup
	nop
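	# rem_4bit[i] is the precomputed reduction, modulo the GHASH
	# polynomial, of the four bits that each step shifts out of Z.lo;
	# the constants live in the top 16 bits of their quadwords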
rem_4bit:
	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
$output=pop and open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";