1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
15# it processes one byte in 19.6 cycles, which is more than twice as
16# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
17# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
18# processed byte. This is ~2.2x faster than 64-bit code generated by
19# vendor compiler (which used to be very hard to beat:-).
20#
21# Special thanks to polarhome.com for providing HP-UX account.
22
# Command line: <flavour> [<output-file>].
# $flavour selects the 32- or 64-bit code path further down (any value
# containing "64" means 64-bit).  The generated assembly is printed to
# STDOUT, which is redirected to <output-file> when one is supplied;
# without it the text simply goes to the inherited standard output.
$flavour = shift;
$output = shift;
if (defined($output) && length($output)) {
	# Three-argument form so the file name can never be taken for a
	# mode, and fail loudly instead of silently producing no file.
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}
26
# Per-ABI parameters, picked by whether the flavour string contains "64":
#   $LEVEL         argument of the .LEVEL directive
#   $SIZE_T        size of a saved register slot in bytes
#   $FRAME_MARKER  size of the ABI frame marker
#   $SAVED_RP      distance below %sp at which the return pointer is stored
#   $PUSH/$PUSHMA  store / store-and-modify-base mnemonics
#   $POP/$POPMB    load / load-and-modify-base mnemonics
#   $NREGS         value passed as ENTRY_GR in .CALLINFO
# (The 32-bit level used to be followed by "\n\t.ALLOW\t2.0".)
($LEVEL, $SIZE_T, $FRAME_MARKER, $SAVED_RP,
 $PUSH,  $PUSHMA, $POP,          $POPMB,    $NREGS) =
    ($flavour =~ /64/)
	? ("2.0W", 8, 80, 16, "std", "std,ma", "ldd", "ldd,mb", 6)
	: ("1.0",  4, 48, 20, "stw", "stwm",   "ldw", "ldwm",   11);
48
# Stack frame: ten register-sized save slots on top of the frame marker
# (NREGS saved regs + frame marker [+ argument transfer]).
$FRAME = $FRAME_MARKER + 10*$SIZE_T;

################# volatile registers
($Xi, $Htbl, $inp, $len) = map { "%r$_" } (26, 25, 24, 23);	# argument block
$Hhh = $Htbl;							# variables
($Hll, $Zhh, $Zll, $cnt) = map { "%r$_" } (22, 21, 20, 19);
($rem_4bit, $rem, $mask0xf0) = map { "%r$_" } (28, 29, 31);

################# preserved registers
($Thh, $Tll, $nlo, $nhi, $byte) = map { "%r$_" } (1 .. 5);
# The 32-bit code keeps Z, H and T in four words each and therefore
# needs six more registers for the middle words.
($Zhl, $Zlh, $Hhl, $Hlh, $Thl, $Tlh) = map { "%r$_" } (6 .. 11)
	if ($SIZE_T == 4);
$rem2 = "%r6";	# used in PA-RISC 2.0 code
81
# Start of the generated assembly: .LEVEL directive, text section
# selection and the entry of gcm_gmult_4bit.  The prologue stores %r2 at
# -$SAVED_RP(%sp), saves %r3 while advancing %sp by $FRAME, and then
# saves %r4..%r6 into the new frame.  Back-quoted expressions are
# evaluated by the post-processing loop at the end of this script.
$code.=<<___;
	.LEVEL	$LEVEL
#if 0
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
#else
	.text
#endif

	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
	.ALIGN	64
gcm_gmult_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
# The 32-bit flavour additionally saves %r7..%r11, which hold the middle
# words of Z, H and T in that code path.
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
# Position-independent address of the L$rem_4bit table: blr deposits a
# code address in $rem_4bit, andcm clears its two low bits (mask 3 in
# $rem) and ldo adds the assemble-time distance from L$pic_gmult to the
# table.  Also loads the 0xf0 nibble mask.
# NOTE(review): "addl $inp,$len,$len" mirrors gcm_ghash_4bit; $len is not
# referenced again in the gmult code emitted by this script.
$code.=<<___;
	blr	%r0,$rem_4bit
	ldi	3,$rem
L\$pic_gmult
	andcm	$rem_4bit,$rem,$rem_4bit
	addl	$inp,$len,$len
	ldo	L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
	ldi	0xf0,$mask0xf0
___
# 32-bit flavour only: sets %cr11 to 31 and issues a 2.0-only extrd with
# a nullifying "*=" condition ahead of the branch to the 1.x code path.
# NOTE(review): this looks like a run-time probe that skips the branch on
# PA-RISC 2.0 and takes it on 1.x (per the inline comment) - confirm
# against the architecture manual.
$code.=<<___ if ($SIZE_T==4);
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_gmult
	nop
___
126
# gcm_gmult_4bit, PA-RISC 2.0 path (64-bit registers).  Z is kept in
# $Zhh:$Zll.  Each byte of Xi, starting with byte 15, is split into a low
# nibble (shifted left by 4 with depd,z) and a high nibble (masked with
# 0xf0); both are used as byte offsets into the table at $Htbl, whose
# second doubleword is addressed through $Hll = $Htbl+8.  For every
# nibble Z is shifted right by four bits (shrpd/extrd,u) and the four
# bits shifted out, scaled by depd,z, index L$rem_4bit; the loaded value
# is XORed into $Zhh.  Bytes 15 and 14 are handled before the loop, which
# starts with $cnt = 13 and fetches byte $cnt of Xi on each pass.  The
# result is stored back to Xi as two doublewords.
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_gmult_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem
	ldbx	$cnt($Xi),$nlo

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo
	ldd	$rem($rem_4bit),$rem

	xor	$Tll,$Zll,$Zll
	addib,uv -1,$cnt,L\$oop_gmult_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	std	$Zhh,0($Xi)
___
212
# 32-bit flavour only.  First emits the jump over the 1.x code that ends
# the 2.0 path, then the PA-RISC 1.x path of gcm_gmult_4bit.  Here Z, H
# and T are held as four 32-bit words each ($Z/$H/$T suffixed hh, hl, lh,
# ll); $Hhl, $Hlh and $Hll point 4, 8 and 12 bytes into the table at
# $Htbl.  The nibble handling follows the 2.0 path: zdep scales the low
# nibble, 0xf0 masks the high one, the shrpw chain plus extru shift Z
# right by four bits, and the bits shifted out of $Zll index L$rem_4bit.
# Table loads are interleaved with the arithmetic of the previous step.
# The loop starts with $cnt = 13 and the result is stored to Xi as four
# words.
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_gmult
	nop

L\$parisc1_gmult
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_gmult_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv -1,$cnt,L\$oop_gmult_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	stw	$Zhh,0($Xi)
___
# Common exit of gcm_gmult_4bit: reload the return pointer and the
# registers saved by the prologue from their frame slots.  %r3 is
# restored by the frame-releasing load emitted with the return branch.
$code.=<<___;
L\$done_gmult
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
# The 32-bit flavour restores the five extra registers it saved.
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
# Return from gcm_gmult_4bit (the load that restores %r3 and releases the
# frame follows the branch), then the entry of gcm_ghash_4bit.  The
# ghash prologue is the same as the gmult one: it saves %r2, %r3 and
# %r4..%r6 here, and %r7..%r11 only in the 32-bit flavour.  ENTRY_GR is
# therefore taken from $NREGS, exactly as for gcm_gmult_4bit, so that
# the 64-bit flavour does not describe registers it never saves; the
# 32-bit output is unchanged because $NREGS is 11 there.
$code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
gcm_ghash_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
# Remainder of the gcm_ghash_4bit prologue: the 32-bit flavour also saves
# %r7..%r11.
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
# Position-independent address of the L$rem_4bit table (blr result with
# its two low bits cleared plus the distance from L$pic_ghash), the 0xf0
# nibble mask, and $len turned into the end pointer $inp+$len that the
# outer loops compare $inp against.
$code.=<<___;
	blr	%r0,$rem_4bit
	ldi	3,$rem
L\$pic_ghash
	andcm	$rem_4bit,$rem,$rem_4bit
	addl	$inp,$len,$len
	ldo	L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
	ldi	0xf0,$mask0xf0
___
# 32-bit flavour only: same sequence as in gcm_gmult_4bit, ending in a
# branch to the 1.x code path.
# NOTE(review): presumably a run-time PA-RISC 2.0 probe in which the
# extrd nullifies the branch on 2.0 hardware - confirm.
$code.=<<___ if ($SIZE_T==4);
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_ghash
	nop
___
379
# gcm_ghash_4bit, PA-RISC 2.0 path.  The outer loop handles one 16-byte
# input block per pass: every byte of Xi is XORed with the byte of $inp
# at the same index before being split into nibbles, otherwise the table
# lookups and four-bit shifts follow the gmult 2.0 path.  Two remainder
# registers ($rem and $rem2) are used so that consecutive L$rem_4bit
# lookups can overlap.  After storing Z to Xi, $inp advances by 16 and
# the loop repeats until it equals the end pointer in $len; the
# delay-slot copy leaves $Zll in $nlo, whose low byte supplies Xi[15]
# for the next block.
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

L\$outer_ghash_pa2
	ldb	15($inp),$nhi
	xor	$nhi,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_ghash_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldbx	$cnt($Xi),$nlo
	ldbx	$cnt($inp),$byte

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	xor	$byte,$nlo,$nlo
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll

	ldd	$rem($rem_4bit),$rem
	addib,uv -1,$cnt,L\$oop_ghash_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	ldo	16($inp),$inp
	std	$Zhh,0($Xi)
	cmpb,*<> $inp,$len,L\$outer_ghash_pa2
	copy	$Zll,$nlo
___
476
# 32-bit flavour only.  First emits the jump over the 1.x code that ends
# the 2.0 path, then the PA-RISC 1.x path of gcm_ghash_4bit.  It combines
# the four-word arithmetic of the gmult 1.x path with the per-block outer
# loop of the ghash 2.0 path: each Xi byte is XORed with the input byte
# at the same index (held in $byte) before the nibble split, Z is stored
# to Xi as four words, $inp advances by 16 and the outer loop repeats
# until $inp equals the end pointer in $len.  The delay-slot copy of
# $Zll into $nlo carries Xi[15] over to the next block.
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_ghash
	nop

L\$parisc1_ghash
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

L\$outer_ghash_pa1
	ldb	15($inp),$byte
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_ghash_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldbx	$cnt($inp),$byte
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$byte,$nlo,$nlo
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv -1,$cnt,L\$oop_ghash_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	ldo	16($inp),$inp
	stw	$Zhh,0($Xi)
	comb,<>	$inp,$len,L\$outer_ghash_pa1
	copy	$Zll,$nlo
___
# Common exit of gcm_ghash_4bit: reload the return pointer and the
# registers saved by the prologue from their frame slots.  %r3 is
# restored by the frame-releasing load emitted with the return branch.
$code.=<<___;
L\$done_ghash
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
# The 32-bit flavour restores the five extra registers it saved.
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
# Return from gcm_ghash_4bit (the load after the branch restores %r3 and
# releases the frame), followed by the shared L$rem_4bit table and an
# identification string in the data section.  The table has sixteen
# entries of two words each: a 16-bit constant shifted into the upper
# half of the first word, and a zero second word.  The back-quoted
# shifts are evaluated by the post-processing loop at the end of this
# script.
$code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.ALIGN	64
L\$rem_4bit
	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0

	.data
	.STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
	.ALIGN	64
___
629
630# Explicitly encode PA-RISC 2.0 instructions used in this module, so
631# that it can be compiled with .LEVEL 1.0. It should be noted that I
632# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
633# directive...
634
# Encoder for the PA-RISC 2.0 "ldd" instruction.
#   $mod   completer text following the mnemonic ("" or e.g. ",mb")
#   $args  operand text
# Returns a ".WORD 0x........" line carrying the original instruction as
# a trailing comment when the operands are recognised, otherwise the
# instruction text unchanged (tab-indented).
my $ldd = sub {
  my ($mod,$args) = @_;
  my $orig = "ldd$mod\t$args";
  my ($opcode,@f);

    if (@f = $args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {		# format 4
	# register-indexed form: index(base),target
	my ($index,$base,$target) = @f;
	$opcode = (0x03<<26)|($base<<21)|($index<<16)|(3<<6)|$target;
    }
    elsif (@f = $args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {	# format 5
	# short-displacement form: disp(base),target
	my ($disp,$base,$target) = @f;
	$opcode  = (0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$target;
	$opcode |= (($disp&0xF)<<17)|(($disp&0x10)<<12);		# encode offset
	$opcode |= (1<<5)  if ($mod =~ /^,m/);
	$opcode |= (1<<13) if ($mod =~ /^,mb/);
    }
    else {
	return "\t".$orig;
    }

    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
652
# Encoder for the PA-RISC 2.0 "std" instruction.
#   $mod   completer text following the mnemonic
#   $args  operand text, expected as "%rS,disp(%rB)"
# Returns a ".WORD" line with the original instruction as a comment, or
# the instruction text unchanged when the operands are not recognised.
# The completer does not influence the encoding.
my $std = sub {
  my ($mod,$args) = @_;
  my $orig = "std$mod\t$args";

    my @f = $args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/ # format 3 suffices
	or return "\t".$orig;

    my ($src,$disp,$base) = @f;
    my $opcode = (0x1c<<26)|($base<<21)|($src<<16);
    $opcode |= ($disp&0x1FF8)<<1;	# displacement bits 3..12
    $opcode |= ($disp>>13)&1;		# bit 13 of the displacement goes last

    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
663
# Encoder for the PA-RISC 2.0 "extrd" instruction.
#   $mod   completer text following the mnemonic
#   $args  operand text: "%rS,pos,len,%rT" or "%rS,%sar,len,%rT"
# Returns a ".WORD" line with the original instruction as a comment, or
# the instruction text unchanged when the operands are not recognised.
my $extrd = sub {
  my ($mod,$args) = @_;
  my $orig = "extrd$mod\t$args";
  my $opcode;

    # I only have ",u" completer, it's implicitly encoded...
    if (my @fixed = $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {	# format 15
	my ($src,$pos,$width,$dst) = @fixed;
	my $len = 32-$width;
	$opcode  = (0x36<<26)|($src<<21)|($dst<<16);
	$opcode |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);		# encode pos
	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
    }
    elsif (my @variable = $args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {	# format 12
	my ($src,$width,$dst) = @variable;
	my $len = 32-$width;
	$opcode  = (0x34<<26)|($src<<21)|($dst<<16)|(2<<11)|(1<<9);
	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
	$opcode |= (1<<13) if ($mod =~ /,\**=/);
    }
    else {
	return "\t".$orig;
    }

    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
685
# Encoder for the PA-RISC 2.0 "shrpd" instruction.
#   $mod   completer text following the mnemonic
#   $args  operand text: "%rA,%rB,sa,%rT" or "%rA,%rB,%sar,%rT"
# Returns a ".WORD" line with the original instruction as a comment, or
# the instruction text unchanged when the operands are not recognised.
my $shrpd = sub {
  my ($mod,$args) = @_;
  my $orig = "shrpd$mod\t$args";
  my $opcode;

    if (my @fixed = $args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {	# format 14
	my ($hi,$lo,$sa,$dst) = @fixed;
	my $cpos = 63-$sa;
	$opcode  = (0x34<<26)|($lo<<21)|($hi<<16)|(1<<10)|$dst;
	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
    }
    elsif (my @variable = $args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) {	# format 11
	my ($hi,$lo,$dst) = @variable;
	$opcode = (0x34<<26)|($lo<<21)|($hi<<16)|(1<<9)|$dst;
    }
    else {
	return "\t".$orig;
    }

    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
702
# Encoder for the PA-RISC 2.0 "depd" instruction.
#   $mod   completer text following the mnemonic
#   $args  operand text, expected as "%rS,pos,len,%rT"
# Returns a ".WORD" line with the original instruction as a comment, or
# the instruction text unchanged when the operands are not recognised.
my $depd = sub {
  my ($mod,$args) = @_;
  my $orig = "depd$mod\t$args";

    # I only have ",z" completer, it's implicitly encoded...
    my @f = $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/	# format 16
	or return "\t".$orig;

    my ($src,$pos,$width,$dst) = @f;
    my $cpos = 63-$pos;
    my $len  = 32-$width;
    my $opcode = (0x3c<<26)|($dst<<21)|($src<<16);
    $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode pos
    $opcode |= (($len&0x20)<<7)|($len&0x1f);			# encode len

    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
718
# Translate one instruction for the 32-bit flavour.
#   $mnemonic  bare instruction name
#   $mod       completer text that directly followed the mnemonic
#   $args      operand text
# If a scalar variable named after the mnemonic holds a code reference,
# that encoder is called with ($mod,$args) and its result is returned;
# any other instruction is returned as plain text.
sub assemble {
  my ($mnemonic,$mod,$args)=@_;
  my $encoder = eval("\$$mnemonic");	# look up the variable by name

    return $encoder->($mod,$args) if (ref($encoder) eq 'CODE');
    return "\t$mnemonic$mod\t$args";
}
725
# Post-process the accumulated assembly line by line and print it:
#  - back-quoted expressions are replaced by their evaluated value;
#  - for the 32-bit flavour every indented instruction is passed through
#    assemble(), so that instructions with an encoder above come out as
#    .WORD data, "cmpb,*" becomes "comb," and the first remaining ",*"
#    on the line becomes ",";
#  - for the 64-bit flavour "bv" becomes "bve".
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;
	if ($SIZE_T==4) {
		s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e;
		s/cmpb,\*/comb,/;
		s/,\*/,/;
	}
	s/\bbv\b/bve/	if ($SIZE_T==8);
	print $_,"\n";
}

# Output is buffered, so a failed write (e.g. a full disk) may only show
# up here; report it instead of leaving a truncated file behind.
close STDOUT or die "error closing STDOUT: $!";
738