1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
15# it processes one byte in 19.6 cycles, which is more than twice as
16# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
17# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
18# processed byte. This is ~2.2x faster than 64-bit code generated by
19# vendor compiler (which used to be very hard to beat:-).
20#
21# Special thanks to polarhome.com for providing HP-UX account.
22
# Command line: <flavour> [<output file>].
# A flavour containing "64" selects the 64-bit ABI parameters further down;
# anything else (including no argument at all) selects the 32-bit ones.
$flavour = shift;
$output = shift;

# Redirect STDOUT to the requested file.  The open is attempted only when a
# name was actually given, so running the script without an output argument
# keeps printing to the inherited STDOUT as before.  The three-argument form
# takes the name literally (no mode characters are parsed out of it), and a
# failure to create the file is now fatal instead of being silently ignored.
if (defined($output) && length($output)) {
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}
26
# ABI-dependent parameters, selected by the flavour argument:
#   $LEVEL         argument of the .LEVEL directive
#   $SIZE_T        size of a saved register slot in bytes
#   $FRAME_MARKER  size of the frame marker area
#   $SAVED_RP      offset (below %sp) of the return-pointer slot
#   $PUSH/$POP     plain store/load mnemonics
#   $PUSHMA/$POPMB store/load mnemonics that also modify the base register
#   $NREGS         value used for ENTRY_GR in .CALLINFO
# The 32-bit level is plain "1.0"; an alternative with an appended
# "\n\t.ALLOW\t2.0" directive is intentionally not used.
($LEVEL, $SIZE_T, $FRAME_MARKER, $SAVED_RP,
 $PUSH, $PUSHMA, $POP, $POPMB, $NREGS) =
    ($flavour =~ /64/)
	? ("2.0W", 8, 80, 16, "std", "std,ma", "ldd", "ldd,mb", 6)
	: ("1.0",  4, 48, 20, "stw", "stwm",   "ldw", "ldwm",   11);

# Ten register slots plus the frame marker [+ argument transfer].
$FRAME = $FRAME_MARKER + 10*$SIZE_T;

################# volatile registers
# argument block
($Xi, $Htbl, $inp, $len) = ("%r26", "%r25", "%r24", "%r23");
# variables; $Hhh deliberately shares the register of $Htbl
$Hhh = $Htbl;
($Hll, $Zhh, $Zll, $cnt) = ("%r22", "%r21", "%r20", "%r19");
($rem_4bit, $rem, $mask0xf0) = ("%r28", "%r29", "%r31");

################# preserved registers
($Thh, $Tll, $nlo, $nhi, $byte) = map("%r$_", 1 .. 5);
# The 32-bit flavour needs six more registers for the extra 32-bit halves.
($Zhl, $Zlh, $Hhl, $Hlh, $Thl, $Tlh) = map("%r$_", 6 .. 11) if ($SIZE_T==4);
# Used in PA-RISC 2.0 code; shares %r6 with $Zhl of the 32-bit register set.
$rem2 = "%r6";
81
# gcm_gmult_4bit, part 1: directives, prologue and common setup.
#
# Four template fragments are appended to $code here:
#  1. .LEVEL/.text, the export record (two GR arguments) and the prologue
#     proper: %r2 is stored at -$SAVED_RP(%sp), %r3 is pushed with $PUSHMA
#     using $FRAME as displacement, and %r4-%r6 go into the following
#     $SIZE_T-sized slots addressed relative to the new %sp.  The `...`
#     spans are left as text here; the output loop at the end of the script
#     evaluates them.
#  2. 32-bit flavour only: %r7-%r11 are saved as well (these are the
#     registers assigned to $Zhl..$Tlh when $SIZE_T==4).
#  3. $rem_4bit is pointed at the L$rem_4bit table (addil LT'/ldw RT' when
#     the generated file is preprocessed with __PIC__, ldil L'/ldo R'
#     otherwise) and $mask0xf0 is loaded with 0xf0.
#     NOTE(review): the addl forms $inp+$len in $len, but this entry point
#     exports only two arguments and none of the gmult fragments reads $len
#     or $inp afterwards -- looks like a harmless leftover of the
#     gcm_ghash_4bit setup; confirm.
#  4. 32-bit flavour only, wrapped in #ifndef __OpenBSD__ (the matching
#     #endif is emitted right after the L$parisc1_gmult label): a probe
#     sequence followed by a branch to L$parisc1_gmult.  Per the inline
#     comment the extrd "executes on PA-RISC 1.0"; presumably its ,*=
#     condition nullifies the branch on 2.0 CPUs so that they fall through
#     into the 64-bit code -- TODO confirm against the ISA manual.
82$code.=<<___;
83	.LEVEL	$LEVEL
84	.text
85
86	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
87	.ALIGN	64
88gcm_gmult_4bit
89	.PROC
90	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
91	.ENTRY
92	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
93	$PUSHMA	%r3,$FRAME(%sp)
94	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
95	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
96	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
97___
98$code.=<<___ if ($SIZE_T==4);
99	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
100	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
101	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
102	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
103	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
104___
105$code.=<<___;
106	addl	$inp,$len,$len
107#ifdef __PIC__
108	addil	LT'L\$rem_4bit, %r19
109	ldw	RT'L\$rem_4bit(%r1), $rem_4bit
110#else
111	ldil	L'L\$rem_4bit, %t1
112	ldo	R'L\$rem_4bit(%t1), $rem_4bit
113#endif
114	ldi	0xf0,$mask0xf0
115___
116$code.=<<___ if ($SIZE_T==4);
117#ifndef __OpenBSD__
118	ldi	31,$rem
119	mtctl	$rem,%cr11
120	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
121	b	L\$parisc1_gmult
122	nop
123___
124
# gcm_gmult_4bit, part 2: the code path built from 64-bit instructions
# (ldd/std/depd/shrpd/extrd; the labels call it "pa2").
#
# Register roles as used below: $Zhh:$Zll is the 128-bit accumulator,
# $Thh:$Tll receives the Htbl entry being XORed in, and $Hhh/$Hll address
# the two 64-bit halves of an Htbl entry (offsets 0 and 8).  $rem first
# receives four bits taken from $Zll (depd,z) and is then replaced by the
# L$rem_4bit entry loaded at that offset, which is XORed into $Zhh after
# the accumulator has been shifted right by four bits (shrpd/extrd,u).
# Each Xi byte is split into $nhi (byte & 0xf0) and $nlo (depd,z of the
# byte; presumably low nibble * 16 -- TODO confirm the bit numbering), and
# both are used as offsets into Htbl.
# Bytes 15 and 14 of Xi are fetched explicitly; the loop fetches the
# remaining ones with ldbx $cnt($Xi) while $cnt counts down from 13, and the
# code after the loop finishes the last byte and writes the result back to
# Xi with two std instructions.
# The statement order is the hand-made instruction schedule (the file
# header quotes 8 cycles per byte for this loop); keep it as is.
125$code.=<<___;
126	ldb	15($Xi),$nlo
127	ldo	8($Htbl),$Hll
128
129	and	$mask0xf0,$nlo,$nhi
130	depd,z	$nlo,59,4,$nlo
131
132	ldd	$nlo($Hll),$Zll
133	ldd	$nlo($Hhh),$Zhh
134
135	depd,z	$Zll,60,4,$rem
136	shrpd	$Zhh,$Zll,4,$Zll
137	extrd,u	$Zhh,59,60,$Zhh
138	ldb	14($Xi),$nlo
139
140	ldd	$nhi($Hll),$Tll
141	ldd	$nhi($Hhh),$Thh
142	and	$mask0xf0,$nlo,$nhi
143	depd,z	$nlo,59,4,$nlo
144
145	xor	$Tll,$Zll,$Zll
146	xor	$Thh,$Zhh,$Zhh
147	ldd	$rem($rem_4bit),$rem
148	b	L\$oop_gmult_pa2
149	ldi	13,$cnt
150
151	.ALIGN	8
152L\$oop_gmult_pa2
153	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
154	depd,z	$Zll,60,4,$rem
155
156	shrpd	$Zhh,$Zll,4,$Zll
157	extrd,u	$Zhh,59,60,$Zhh
158	ldd	$nlo($Hll),$Tll
159	ldd	$nlo($Hhh),$Thh
160
161	xor	$Tll,$Zll,$Zll
162	xor	$Thh,$Zhh,$Zhh
163	ldd	$rem($rem_4bit),$rem
164
165	xor	$rem,$Zhh,$Zhh
166	depd,z	$Zll,60,4,$rem
167	ldbx	$cnt($Xi),$nlo
168
169	shrpd	$Zhh,$Zll,4,$Zll
170	extrd,u	$Zhh,59,60,$Zhh
171	ldd	$nhi($Hll),$Tll
172	ldd	$nhi($Hhh),$Thh
173
174	and	$mask0xf0,$nlo,$nhi
175	depd,z	$nlo,59,4,$nlo
176	ldd	$rem($rem_4bit),$rem
177
178	xor	$Tll,$Zll,$Zll
179	addib,uv -1,$cnt,L\$oop_gmult_pa2
180	xor	$Thh,$Zhh,$Zhh
181
182	xor	$rem,$Zhh,$Zhh
183	depd,z	$Zll,60,4,$rem
184
185	shrpd	$Zhh,$Zll,4,$Zll
186	extrd,u	$Zhh,59,60,$Zhh
187	ldd	$nlo($Hll),$Tll
188	ldd	$nlo($Hhh),$Thh
189
190	xor	$Tll,$Zll,$Zll
191	xor	$Thh,$Zhh,$Zhh
192	ldd	$rem($rem_4bit),$rem
193
194	xor	$rem,$Zhh,$Zhh
195	depd,z	$Zll,60,4,$rem
196
197	shrpd	$Zhh,$Zll,4,$Zll
198	extrd,u	$Zhh,59,60,$Zhh
199	ldd	$nhi($Hll),$Tll
200	ldd	$nhi($Hhh),$Thh
201
202	xor	$Tll,$Zll,$Zll
203	xor	$Thh,$Zhh,$Zhh
204	ldd	$rem($rem_4bit),$rem
205
206	xor	$rem,$Zhh,$Zhh
207	std	$Zll,8($Xi)
208	std	$Zhh,0($Xi)
209___
210
# gcm_gmult_4bit, part 3 (32-bit flavour only): end of the 64-bit path and
# the path built from 32-bit instructions only (the labels call it "pa1").
#
# The fragment first emits the branch to L$done_gmult that ends the 64-bit
# path, then the L$parisc1_gmult label and the #endif closing the
# #ifndef __OpenBSD__ opened by the setup fragment.
# In this path the 128-bit accumulator lives in four 32-bit registers
# ($Zhh,$Zhl,$Zlh,$Zll) with $Thh,$Thl,$Tlh,$Tll as the matching
# temporaries, and $Hhh/$Hhl/$Hlh/$Hll address the four words of an Htbl
# entry (offsets 0/4/8/12).  The nibble handling follows the same pattern
# as the 64-bit path with zdep/shrpw/extru/ldwx in place of
# depd/shrpd/extrd/ldd.  Loads and ALU operations are interleaved by hand,
# so the statement order must be kept.
# The result is written back to Xi with four stw instructions.
211$code.=<<___ if ($SIZE_T==4);
212	b	L\$done_gmult
213	nop
214
215L\$parisc1_gmult
216#endif
217	ldb	15($Xi),$nlo
218	ldo	12($Htbl),$Hll
219	ldo	8($Htbl),$Hlh
220	ldo	4($Htbl),$Hhl
221
222	and	$mask0xf0,$nlo,$nhi
223	zdep	$nlo,27,4,$nlo
224
225	ldwx	$nlo($Hll),$Zll
226	ldwx	$nlo($Hlh),$Zlh
227	ldwx	$nlo($Hhl),$Zhl
228	ldwx	$nlo($Hhh),$Zhh
229	zdep	$Zll,28,4,$rem
230	ldb	14($Xi),$nlo
231	ldwx	$rem($rem_4bit),$rem
232	shrpw	$Zlh,$Zll,4,$Zll
233	ldwx	$nhi($Hll),$Tll
234	shrpw	$Zhl,$Zlh,4,$Zlh
235	ldwx	$nhi($Hlh),$Tlh
236	shrpw	$Zhh,$Zhl,4,$Zhl
237	ldwx	$nhi($Hhl),$Thl
238	extru	$Zhh,27,28,$Zhh
239	ldwx	$nhi($Hhh),$Thh
240	xor	$rem,$Zhh,$Zhh
241	and	$mask0xf0,$nlo,$nhi
242	zdep	$nlo,27,4,$nlo
243
244	xor	$Tll,$Zll,$Zll
245	ldwx	$nlo($Hll),$Tll
246	xor	$Tlh,$Zlh,$Zlh
247	ldwx	$nlo($Hlh),$Tlh
248	xor	$Thl,$Zhl,$Zhl
249	b	L\$oop_gmult_pa1
250	ldi	13,$cnt
251
252	.ALIGN	8
253L\$oop_gmult_pa1
254	zdep	$Zll,28,4,$rem
255	ldwx	$nlo($Hhl),$Thl
256	xor	$Thh,$Zhh,$Zhh
257	ldwx	$rem($rem_4bit),$rem
258	shrpw	$Zlh,$Zll,4,$Zll
259	ldwx	$nlo($Hhh),$Thh
260	shrpw	$Zhl,$Zlh,4,$Zlh
261	ldbx	$cnt($Xi),$nlo
262	xor	$Tll,$Zll,$Zll
263	ldwx	$nhi($Hll),$Tll
264	shrpw	$Zhh,$Zhl,4,$Zhl
265	xor	$Tlh,$Zlh,$Zlh
266	ldwx	$nhi($Hlh),$Tlh
267	extru	$Zhh,27,28,$Zhh
268	xor	$Thl,$Zhl,$Zhl
269	ldwx	$nhi($Hhl),$Thl
270	xor	$rem,$Zhh,$Zhh
271	zdep	$Zll,28,4,$rem
272	xor	$Thh,$Zhh,$Zhh
273	ldwx	$nhi($Hhh),$Thh
274	shrpw	$Zlh,$Zll,4,$Zll
275	ldwx	$rem($rem_4bit),$rem
276	shrpw	$Zhl,$Zlh,4,$Zlh
277	shrpw	$Zhh,$Zhl,4,$Zhl
278	and	$mask0xf0,$nlo,$nhi
279	extru	$Zhh,27,28,$Zhh
280	zdep	$nlo,27,4,$nlo
281	xor	$Tll,$Zll,$Zll
282	ldwx	$nlo($Hll),$Tll
283	xor	$Tlh,$Zlh,$Zlh
284	ldwx	$nlo($Hlh),$Tlh
285	xor	$rem,$Zhh,$Zhh
286	addib,uv -1,$cnt,L\$oop_gmult_pa1
287	xor	$Thl,$Zhl,$Zhl
288
289	zdep	$Zll,28,4,$rem
290	ldwx	$nlo($Hhl),$Thl
291	xor	$Thh,$Zhh,$Zhh
292	ldwx	$rem($rem_4bit),$rem
293	shrpw	$Zlh,$Zll,4,$Zll
294	ldwx	$nlo($Hhh),$Thh
295	shrpw	$Zhl,$Zlh,4,$Zlh
296	xor	$Tll,$Zll,$Zll
297	ldwx	$nhi($Hll),$Tll
298	shrpw	$Zhh,$Zhl,4,$Zhl
299	xor	$Tlh,$Zlh,$Zlh
300	ldwx	$nhi($Hlh),$Tlh
301	extru	$Zhh,27,28,$Zhh
302	xor	$rem,$Zhh,$Zhh
303	xor	$Thl,$Zhl,$Zhl
304	ldwx	$nhi($Hhl),$Thl
305	xor	$Thh,$Zhh,$Zhh
306	ldwx	$nhi($Hhh),$Thh
307	zdep	$Zll,28,4,$rem
308	ldwx	$rem($rem_4bit),$rem
309	shrpw	$Zlh,$Zll,4,$Zll
310	shrpw	$Zhl,$Zlh,4,$Zlh
311	shrpw	$Zhh,$Zhl,4,$Zhl
312	extru	$Zhh,27,28,$Zhh
313	xor	$Tll,$Zll,$Zll
314	xor	$Tlh,$Zlh,$Zlh
315	xor	$rem,$Zhh,$Zhh
316	stw	$Zll,12($Xi)
317	xor	$Thl,$Zhl,$Zhl
318	stw	$Zlh,8($Xi)
319	xor	$Thh,$Zhh,$Zhh
320	stw	$Zhl,4($Xi)
321	stw	$Zhh,0($Xi)
322___
# gcm_gmult_4bit, part 4: common exit label and register restore.
# Reloads %r2 from `-$FRAME-$SAVED_RP`(%sp) -- the slot written by the first
# prologue store, now addressed relative to the moved %sp -- and %r4-%r6
# from the slots the prologue stored them to; the 32-bit flavour also
# reloads %r7-%r11.  %r3 is not restored here: the fragment that follows
# does it with $POPMB.
323$code.=<<___;
324L\$done_gmult
325	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
326	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
327	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
328	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
329___
330$code.=<<___ if ($SIZE_T==4);
331	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
332	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
333	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
334	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
335	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
336___
# End of gcm_gmult_4bit followed by the entry of gcm_ghash_4bit.
#
# gmult returns through %r2; the $POPMB that follows the bv reloads %r3 and
# moves %sp back by $FRAME, undoing the $PUSHMA of the prologue.
#
# gcm_ghash_4bit is exported with four GR arguments ($Xi, $Htbl, $inp, $len
# in the register map) and uses the same prologue as gcm_gmult_4bit: %r2 at
# -$SAVED_RP(%sp), %r3 pushed with $PUSHMA, %r4-%r6 into the next slots.
#
# ENTRY_GR now uses $NREGS exactly like the gcm_gmult_4bit entry.  It used to
# be hard-coded to 11, which is right for the 32-bit flavour ($NREGS is 11
# there, so that output is unchanged) but overstated the save area for the
# 64-bit flavour, where only %r3-%r6 are stored by this prologue.
$code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
gcm_ghash_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
# gcm_ghash_4bit, part 2: remaining register saves and common setup.
#
#  1. 32-bit flavour only: %r7-%r11 are saved next to %r4-%r6.
#  2. $len is turned into $inp+$len; both outer loops further down compare
#     $inp against it to decide when to stop.  $rem_4bit is pointed at the
#     L$rem_4bit table (addil LT'/ldw RT' under __PIC__, ldil L'/ldo R'
#     otherwise) and $mask0xf0 is loaded with 0xf0.
#  3. 32-bit flavour only, wrapped in #ifndef __OpenBSD__ (the matching
#     #endif is emitted right after the L$parisc1_ghash label): the same
#     probe sequence as in gcm_gmult_4bit, followed by a branch to
#     L$parisc1_ghash.  Presumably the ,*= condition of the extrd nullifies
#     that branch on 2.0 CPUs -- TODO confirm against the ISA manual.
355$code.=<<___ if ($SIZE_T==4);
356	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
357	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
358	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
359	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
360	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
361___
362$code.=<<___;
363	addl	$inp,$len,$len
364#ifdef __PIC__
365	addil	LT'L\$rem_4bit, %r19
366	ldw	RT'L\$rem_4bit(%r1), $rem_4bit
367#else
368	ldil	L'L\$rem_4bit, %t1
369	ldo	R'L\$rem_4bit(%t1), $rem_4bit
370#endif
371	ldi	0xf0,$mask0xf0
372___
373$code.=<<___ if ($SIZE_T==4);
374#ifndef __OpenBSD__
375	ldi	31,$rem
376	mtctl	$rem,%cr11
377	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
378	b	L\$parisc1_ghash
379	nop
380___
381
# gcm_ghash_4bit, part 3: the code path built from 64-bit instructions.
#
# Structure: an outer loop (L$outer_ghash_pa2) that handles 16 input bytes
# per pass, wrapped around the same nibble loop as in gcm_gmult_4bit.  The
# differences visible below:
#  - every Xi byte is XORed with the input byte at the same offset before
#    it is split into $nlo/$nhi ($byte, or $nhi for byte 15, is the scratch
#    register for the input byte);
#  - a second remainder register, $rem2, is used next to $rem so that two
#    L$rem_4bit lookups can be in flight;
#  - after the two std stores $inp is advanced by 16 and compared with $len
#    (the end pointer formed during setup); the instruction after the
#    cmpb copies $Zll into $nlo, which replaces the "ldb 15($Xi)" done once
#    before the loop.
# NOTE(review): the loop test sits at the bottom and is an inequality
# compare, so this presumably expects a non-zero length that is a multiple
# of 16 -- confirm against the callers.
# The statement order is the hand-made instruction schedule; keep it as is.
382$code.=<<___;
383	ldb	15($Xi),$nlo
384	ldo	8($Htbl),$Hll
385
386L\$outer_ghash_pa2
387	ldb	15($inp),$nhi
388	xor	$nhi,$nlo,$nlo
389	and	$mask0xf0,$nlo,$nhi
390	depd,z	$nlo,59,4,$nlo
391
392	ldd	$nlo($Hll),$Zll
393	ldd	$nlo($Hhh),$Zhh
394
395	depd,z	$Zll,60,4,$rem
396	shrpd	$Zhh,$Zll,4,$Zll
397	extrd,u	$Zhh,59,60,$Zhh
398	ldb	14($Xi),$nlo
399	ldb	14($inp),$byte
400
401	ldd	$nhi($Hll),$Tll
402	ldd	$nhi($Hhh),$Thh
403	xor	$byte,$nlo,$nlo
404	and	$mask0xf0,$nlo,$nhi
405	depd,z	$nlo,59,4,$nlo
406
407	xor	$Tll,$Zll,$Zll
408	xor	$Thh,$Zhh,$Zhh
409	ldd	$rem($rem_4bit),$rem
410	b	L\$oop_ghash_pa2
411	ldi	13,$cnt
412
413	.ALIGN	8
414L\$oop_ghash_pa2
415	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
416	depd,z	$Zll,60,4,$rem2
417
418	shrpd	$Zhh,$Zll,4,$Zll
419	extrd,u	$Zhh,59,60,$Zhh
420	ldd	$nlo($Hll),$Tll
421	ldd	$nlo($Hhh),$Thh
422
423	xor	$Tll,$Zll,$Zll
424	xor	$Thh,$Zhh,$Zhh
425	ldbx	$cnt($Xi),$nlo
426	ldbx	$cnt($inp),$byte
427
428	depd,z	$Zll,60,4,$rem
429	shrpd	$Zhh,$Zll,4,$Zll
430	ldd	$rem2($rem_4bit),$rem2
431
432	xor	$rem2,$Zhh,$Zhh
433	xor	$byte,$nlo,$nlo
434	ldd	$nhi($Hll),$Tll
435	ldd	$nhi($Hhh),$Thh
436
437	and	$mask0xf0,$nlo,$nhi
438	depd,z	$nlo,59,4,$nlo
439
440	extrd,u	$Zhh,59,60,$Zhh
441	xor	$Tll,$Zll,$Zll
442
443	ldd	$rem($rem_4bit),$rem
444	addib,uv -1,$cnt,L\$oop_ghash_pa2
445	xor	$Thh,$Zhh,$Zhh
446
447	xor	$rem,$Zhh,$Zhh
448	depd,z	$Zll,60,4,$rem2
449
450	shrpd	$Zhh,$Zll,4,$Zll
451	extrd,u	$Zhh,59,60,$Zhh
452	ldd	$nlo($Hll),$Tll
453	ldd	$nlo($Hhh),$Thh
454
455	xor	$Tll,$Zll,$Zll
456	xor	$Thh,$Zhh,$Zhh
457
458	depd,z	$Zll,60,4,$rem
459	shrpd	$Zhh,$Zll,4,$Zll
460	ldd	$rem2($rem_4bit),$rem2
461
462	xor	$rem2,$Zhh,$Zhh
463	ldd	$nhi($Hll),$Tll
464	ldd	$nhi($Hhh),$Thh
465
466	extrd,u	$Zhh,59,60,$Zhh
467	xor	$Tll,$Zll,$Zll
468	xor	$Thh,$Zhh,$Zhh
469	ldd	$rem($rem_4bit),$rem
470
471	xor	$rem,$Zhh,$Zhh
472	std	$Zll,8($Xi)
473	ldo	16($inp),$inp
474	std	$Zhh,0($Xi)
475	cmpb,*<> $inp,$len,L\$outer_ghash_pa2
476	copy	$Zll,$nlo
477___
478
# gcm_ghash_4bit, part 4 (32-bit flavour only): end of the 64-bit path and
# the path built from 32-bit instructions only.
#
# The fragment first emits the branch to L$done_ghash that ends the 64-bit
# path, then the L$parisc1_ghash label and the #endif closing the
# #ifndef __OpenBSD__ opened by the setup fragment.
# The body follows gcm_gmult_4bit's 32-bit path (accumulator in
# $Zhh,$Zhl,$Zlh,$Zll; Htbl words addressed through $Hhh/$Hhl/$Hlh/$Hll at
# offsets 0/4/8/12) with two additions: every Xi byte is XORed with the
# input byte at the same offset ($byte) before being split into nibbles,
# and an outer loop (L$outer_ghash_pa1) advances $inp by 16 after the four
# stw stores and repeats until $inp equals $len, the end pointer formed
# during setup.  The instruction after the comb copies $Zll into $nlo for
# the next pass.
# NOTE(review): as the test sits at the bottom and is an inequality
# compare, this presumably expects a non-zero length that is a multiple of
# 16 -- confirm against the callers.
# Loads and ALU operations are interleaved by hand; keep the order.
479$code.=<<___ if ($SIZE_T==4);
480	b	L\$done_ghash
481	nop
482
483L\$parisc1_ghash
484#endif
485	ldb	15($Xi),$nlo
486	ldo	12($Htbl),$Hll
487	ldo	8($Htbl),$Hlh
488	ldo	4($Htbl),$Hhl
489
490L\$outer_ghash_pa1
491	ldb	15($inp),$byte
492	xor	$byte,$nlo,$nlo
493	and	$mask0xf0,$nlo,$nhi
494	zdep	$nlo,27,4,$nlo
495
496	ldwx	$nlo($Hll),$Zll
497	ldwx	$nlo($Hlh),$Zlh
498	ldwx	$nlo($Hhl),$Zhl
499	ldwx	$nlo($Hhh),$Zhh
500	zdep	$Zll,28,4,$rem
501	ldb	14($Xi),$nlo
502	ldb	14($inp),$byte
503	ldwx	$rem($rem_4bit),$rem
504	shrpw	$Zlh,$Zll,4,$Zll
505	ldwx	$nhi($Hll),$Tll
506	shrpw	$Zhl,$Zlh,4,$Zlh
507	ldwx	$nhi($Hlh),$Tlh
508	shrpw	$Zhh,$Zhl,4,$Zhl
509	ldwx	$nhi($Hhl),$Thl
510	extru	$Zhh,27,28,$Zhh
511	ldwx	$nhi($Hhh),$Thh
512	xor	$byte,$nlo,$nlo
513	xor	$rem,$Zhh,$Zhh
514	and	$mask0xf0,$nlo,$nhi
515	zdep	$nlo,27,4,$nlo
516
517	xor	$Tll,$Zll,$Zll
518	ldwx	$nlo($Hll),$Tll
519	xor	$Tlh,$Zlh,$Zlh
520	ldwx	$nlo($Hlh),$Tlh
521	xor	$Thl,$Zhl,$Zhl
522	b	L\$oop_ghash_pa1
523	ldi	13,$cnt
524
525	.ALIGN	8
526L\$oop_ghash_pa1
527	zdep	$Zll,28,4,$rem
528	ldwx	$nlo($Hhl),$Thl
529	xor	$Thh,$Zhh,$Zhh
530	ldwx	$rem($rem_4bit),$rem
531	shrpw	$Zlh,$Zll,4,$Zll
532	ldwx	$nlo($Hhh),$Thh
533	shrpw	$Zhl,$Zlh,4,$Zlh
534	ldbx	$cnt($Xi),$nlo
535	xor	$Tll,$Zll,$Zll
536	ldwx	$nhi($Hll),$Tll
537	shrpw	$Zhh,$Zhl,4,$Zhl
538	ldbx	$cnt($inp),$byte
539	xor	$Tlh,$Zlh,$Zlh
540	ldwx	$nhi($Hlh),$Tlh
541	extru	$Zhh,27,28,$Zhh
542	xor	$Thl,$Zhl,$Zhl
543	ldwx	$nhi($Hhl),$Thl
544	xor	$rem,$Zhh,$Zhh
545	zdep	$Zll,28,4,$rem
546	xor	$Thh,$Zhh,$Zhh
547	ldwx	$nhi($Hhh),$Thh
548	shrpw	$Zlh,$Zll,4,$Zll
549	ldwx	$rem($rem_4bit),$rem
550	shrpw	$Zhl,$Zlh,4,$Zlh
551	xor	$byte,$nlo,$nlo
552	shrpw	$Zhh,$Zhl,4,$Zhl
553	and	$mask0xf0,$nlo,$nhi
554	extru	$Zhh,27,28,$Zhh
555	zdep	$nlo,27,4,$nlo
556	xor	$Tll,$Zll,$Zll
557	ldwx	$nlo($Hll),$Tll
558	xor	$Tlh,$Zlh,$Zlh
559	ldwx	$nlo($Hlh),$Tlh
560	xor	$rem,$Zhh,$Zhh
561	addib,uv -1,$cnt,L\$oop_ghash_pa1
562	xor	$Thl,$Zhl,$Zhl
563
564	zdep	$Zll,28,4,$rem
565	ldwx	$nlo($Hhl),$Thl
566	xor	$Thh,$Zhh,$Zhh
567	ldwx	$rem($rem_4bit),$rem
568	shrpw	$Zlh,$Zll,4,$Zll
569	ldwx	$nlo($Hhh),$Thh
570	shrpw	$Zhl,$Zlh,4,$Zlh
571	xor	$Tll,$Zll,$Zll
572	ldwx	$nhi($Hll),$Tll
573	shrpw	$Zhh,$Zhl,4,$Zhl
574	xor	$Tlh,$Zlh,$Zlh
575	ldwx	$nhi($Hlh),$Tlh
576	extru	$Zhh,27,28,$Zhh
577	xor	$rem,$Zhh,$Zhh
578	xor	$Thl,$Zhl,$Zhl
579	ldwx	$nhi($Hhl),$Thl
580	xor	$Thh,$Zhh,$Zhh
581	ldwx	$nhi($Hhh),$Thh
582	zdep	$Zll,28,4,$rem
583	ldwx	$rem($rem_4bit),$rem
584	shrpw	$Zlh,$Zll,4,$Zll
585	shrpw	$Zhl,$Zlh,4,$Zlh
586	shrpw	$Zhh,$Zhl,4,$Zhl
587	extru	$Zhh,27,28,$Zhh
588	xor	$Tll,$Zll,$Zll
589	xor	$Tlh,$Zlh,$Zlh
590	xor	$rem,$Zhh,$Zhh
591	stw	$Zll,12($Xi)
592	xor	$Thl,$Zhl,$Zhl
593	stw	$Zlh,8($Xi)
594	xor	$Thh,$Zhh,$Zhh
595	stw	$Zhl,4($Xi)
596	ldo	16($inp),$inp
597	stw	$Zhh,0($Xi)
598	comb,<>	$inp,$len,L\$outer_ghash_pa1
599	copy	$Zll,$nlo
600___
# gcm_ghash_4bit, part 5: epilogue, plus the shared L$rem_4bit table.
#
# The epilogue has the same shape as the one of gcm_gmult_4bit: %r2 is
# reloaded from `-$FRAME-$SAVED_RP`(%sp), %r4-%r6 (and %r7-%r11 in the
# 32-bit flavour) from the slots the prologue stored them to, and %r3 by
# the $POPMB that follows the bv, which also moves %sp back by $FRAME.
#
# L$rem_4bit is emitted into .rodata with 64-byte alignment.  It has 16
# entries of two .WORDs each: a 16-bit constant shifted left by 16 (the
# shift is evaluated by the output loop at the end of the script) followed
# by a zero word.  That is 128 bytes in total, matching the "128 bytes
# shared table" mentioned in the file header.  The 64-bit paths read an
# entry with ldd, the 32-bit paths read its first word with ldwx.
601$code.=<<___;
602L\$done_ghash
603	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
604	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
605	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
606	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
607___
608$code.=<<___ if ($SIZE_T==4);
609	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
610	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
611	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
612	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
613	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
614___
615$code.=<<___;
616	bv	(%r2)
617	.EXIT
618	$POPMB	-$FRAME(%sp),%r3
619	.PROCEND
620
621	.section .rodata
622	.ALIGN	64
623L\$rem_4bit
624	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
625	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
626	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
627	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
628	.previous
629
630	.ALIGN	64
631___
632
633# Explicitly encode PA-RISC 2.0 instructions used in this module, so
634# that it can be compiled with .LEVEL 1.0. It should be noted that I
635# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
636# directive...
637
# Encoder for the 64-bit "ldd" load.
#   $mod   completer text that followed the mnemonic ("" or e.g. ",mb")
#   $args  operand text
# Returns a ".WORD 0x........" line carrying the original instruction as a
# trailing comment when the operands use numbered registers, and the
# instruction text unchanged (with a leading tab) otherwise.
my $ldd = sub {
    my ($mod, $args) = @_;
    my $text = "ldd$mod\t$args";

    # format 4: indexed form, "%rX(%rB),%rT"
    if (my ($index, $base, $target) =
	    $args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
	my $word = (0x03<<26) | ($base<<21) | ($index<<16) | (3<<6) | $target;
	return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
    }

    # format 5: short displacement form, "disp(%rB),%rT"
    if (my ($disp, $base, $target) =
	    $args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
	my $word = (0x03<<26) | ($base<<21) | (1<<12) | (3<<6) | $target;
	# encode offset: low four bits, then bit 4 of the displacement
	$word |= ($disp & 0xF) << 17;
	$word |= ($disp & 0x10) << 12;
	# any ",m..." completer sets bit 5, ",mb" additionally bit 13
	$word |= (1<<5)  if ($mod =~ /^,m/);
	$word |= (1<<13) if ($mod =~ /^,mb/);
	return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
    }

    # anything else is left for the assembler
    return "\t".$text;
};
655
# Encoder for the 64-bit "std" store.
#   $mod   completer text that followed the mnemonic (only echoed in the
#          trailing comment, it does not influence the encoding)
#   $args  operand text, "%rS,disp(%rB)"
# Returns a ".WORD" line when the operands use numbered registers, the
# instruction text unchanged (with a leading tab) otherwise.
my $std = sub {
    my ($mod, $args) = @_;
    my $text = "std$mod\t$args";

    my ($source, $disp, $base) =
	$args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/
	or return "\t".$text;

    # format 3 suffices
    my $word = (0x1c<<26) | ($base<<21) | ($source<<16);
    $word |= ($disp & 0x1FF8) << 1;
    $word |= ($disp >> 13) & 1;
    return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
};
666
# Encoder for the 64-bit "extrd" extract.
#   $mod   completer text that followed the mnemonic
#   $args  operand text
# Only the ",u" completer is in use, so it is encoded implicitly.  Returns
# a ".WORD" line for the two operand shapes handled below, the instruction
# text unchanged (with a leading tab) otherwise.
my $extrd = sub {
    my ($mod, $args) = @_;
    my $text = "extrd$mod\t$args";

    # format 15: fixed position, "%rS,pos,len,%rT"
    if (my ($source, $pos, $width, $target) =
	    $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {
	my $word = (0x36<<26) | ($source<<21) | ($target<<16);
	my $clen = 32 - $width;
	$word |= (($pos & 0x20) << 6) | (($pos & 0x1f) << 5);	# encode pos
	$word |= (($clen & 0x20) << 7) | ($clen & 0x1f);	# encode len
	return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
    }

    # format 12: position taken from %sar, "%rS,%sar,len,%rT"
    if (my ($source, $width, $target) =
	    $args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {
	my $word = (0x34<<26) | ($source<<21) | ($target<<16) | (2<<11) | (1<<9);
	my $clen = 32 - $width;
	$word |= (($clen & 0x20) << 3) | ($clen & 0x1f);	# encode len
	# a "=" condition (optionally written "*=") sets bit 13
	$word |= (1<<13) if ($mod =~ /,\**=/);
	return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
    }

    return "\t".$text;
};
688
# Encoder for the 64-bit "shrpd" shift-right-pair.
#   $mod   completer text that followed the mnemonic (only echoed)
#   $args  operand text
# Returns a ".WORD" line for the immediate and the %sar shift-amount
# shapes, the instruction text unchanged (with a leading tab) otherwise.
my $shrpd = sub {
    my ($mod, $args) = @_;
    my $text = "shrpd$mod\t$args";

    # format 14: immediate shift amount, "%rA,%rB,sa,%rT"
    if (my ($first, $second, $amount, $target) =
	    $args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {
	my $word = (0x34<<26) | ($second<<21) | ($first<<16) | (1<<10) | $target;
	my $cpos = 63 - $amount;
	$word |= (($cpos & 0x20) << 6) | (($cpos & 0x1f) << 5);	# encode sa
	return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
    }

    # format 11: shift amount taken from %sar, "%rA,%rB,%sar,%rT"
    if (my ($first, $second, $target) =
	    $args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) {
	my $word = (0x34<<26) | ($second<<21) | ($first<<16) | (1<<9) | $target;
	return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
    }

    return "\t".$text;
};
705
# Encoder for the 64-bit "depd" deposit.
#   $mod   completer text that followed the mnemonic
#   $args  operand text, "%rS,pos,len,%rT"
# Only the ",z" completer is in use, so it is encoded implicitly.  Returns
# a ".WORD" line when the operands match, the instruction text unchanged
# (with a leading tab) otherwise.
my $depd = sub {
    my ($mod, $args) = @_;
    my $text = "depd$mod\t$args";

    # format 16
    my ($source, $pos, $width, $target) =
	$args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/
	or return "\t".$text;

    my $word = (0x3c<<26) | ($target<<21) | ($source<<16);
    my $cpos = 63 - $pos;
    my $clen = 32 - $width;
    $word |= (($cpos & 0x20) << 6) | (($cpos & 0x1f) << 5);	# encode pos
    $word |= (($clen & 0x20) << 7) | ($clen & 0x1f);		# encode len
    return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
};
721
# Translate one instruction for the 32-bit flavour.
#   $mnemonic  instruction name (lower-case letters only, as matched by the
#              caller's pattern)
#   $mod       completer text glued to the mnemonic, possibly empty
#   $args      operand text
# If a variable named after the mnemonic is in scope and holds a code
# reference (the $ldd, $std, ... encoders), that encoder produces the
# output line; otherwise the instruction is re-emitted as text.
sub assemble {
    my ($mnemonic, $mod, $args) = @_;

    # String eval is needed to reach the lexical encoder variables by name.
    my $encoder = eval("\$$mnemonic");

    return $encoder->($mod, $args) if (ref($encoder) eq 'CODE');
    return "\t$mnemonic$mod\t$args";
}
728
# Post-process the accumulated template line by line and print it.
#  - Every `...` span is replaced by the value of the Perl expression it
#    contains (frame offsets, table constants).
#  - 32-bit flavour: the instruction at the start of the line is passed
#    through assemble(), which hand-encodes the 64-bit instructions as
#    .WORD data; "cmpb,*" is then rewritten to "comb," and the first
#    remaining ",*" on the line is reduced to ",".
#  - 64-bit flavour: "bv" becomes "bve".
foreach my $line (split("\n",$code)) {
	$line =~ s/\`([^\`]*)\`/eval $1/ge;
	if ($SIZE_T==4) {
		$line =~ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e;
		$line =~ s/cmpb,\*/comb,/;
		$line =~ s/,\*/,/;
	}
	$line =~ s/\bbv\b/bve/	if ($SIZE_T==8);
	print $line,"\n";
}

# Output is buffered, so a write error (full disk, closed pipe) may only
# show up here; fail loudly instead of leaving a truncated assembly file.
close(STDOUT) or die "error closing STDOUT: $!";
741