1*1dcdf01fSchristos#! /usr/bin/env perl
2*1dcdf01fSchristos# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
3*1dcdf01fSchristos#
4*1dcdf01fSchristos# Licensed under the OpenSSL license (the "License").  You may not use
5*1dcdf01fSchristos# this file except in compliance with the License.  You can obtain a copy
6*1dcdf01fSchristos# in the file LICENSE in the source distribution or at
7*1dcdf01fSchristos# https://www.openssl.org/source/license.html
8*1dcdf01fSchristos
9*1dcdf01fSchristos#
10*1dcdf01fSchristos# ====================================================================
11*1dcdf01fSchristos# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12*1dcdf01fSchristos# project. The module is, however, dual licensed under OpenSSL and
13*1dcdf01fSchristos# CRYPTOGAMS licenses depending on where you obtain it. For further
14*1dcdf01fSchristos# details see http://www.openssl.org/~appro/cryptogams/.
15*1dcdf01fSchristos# ====================================================================
16*1dcdf01fSchristos#
17*1dcdf01fSchristos# December 2011
18*1dcdf01fSchristos#
19*1dcdf01fSchristos# The module implements GCM GHASH function and underlying single
20*1dcdf01fSchristos# multiplication operation in GF(2^128). Even though subroutines
21*1dcdf01fSchristos# have _4bit suffix, they are not using any tables, but rely on
22*1dcdf01fSchristos# hardware Galois Field Multiply support. Streamed GHASH processes
23*1dcdf01fSchristos# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
24*1dcdf01fSchristos# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
25*1dcdf01fSchristos# comparing apples vs. oranges, but compiler surely could have done
26*1dcdf01fSchristos# better, because theoretical [though not necessarily achievable]
27*1dcdf01fSchristos# estimate for "4-bit" table-driven implementation is ~12 cycles.
28*1dcdf01fSchristos
# Command-line handling: consume arguments until one looks like a file
# name (a word character, then word characters or dashes, a dot and an
# extension) or until the argument list is exhausted.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to that file.  The three-argument form keeps the file
# name from being parsed together with the open mode, and a failed open
# is now fatal instead of silently leaving the generated assembly on the
# original STDOUT.  When no file name was supplied STDOUT is left alone,
# so the script keeps printing to standard output as it did before.
if (defined($output) && $output ne "") {
    open(STDOUT, ">", $output) or die "can't open $output: $!";
}
31*1dcdf01fSchristos
# Register allocation.  Each scalar holds the name of a C64x+ register
# and is interpolated into the assembly templates appended to $code.

# Incoming arguments.
($Xip, $Htable, $inp, $len) = qw(A4 B4 A6 B6);

# A16..A27: accumulator Z, hash key words H and the A-side products.
($Z0,  $Z1,  $Z2,  $Z3,
 $H0,  $H1,  $H2,  $H3,
 $H0x, $H1x, $H2x, $H3x) = map { "A$_" } (16 .. 27);

# B16..B27: upper-byte copies of H and the B-side products.
($H01u, $H01y, $H2u, $H3u,
 $H0y,  $H1y,  $H2y, $H3y,
 $H0z,  $H1z,  $H2z, $H3z) = map { "B$_" } (16 .. 27);

# Constants: upper byte mask and the pre-shifted reduction polynomial.
($FF000000, $E10000) = qw(B30 B31);

# Xi pointer and byte temporaries.  $xip is B6, the same register as
# $len, so $len must be consumed before $xip is written.
($xip, $x0, $x1, $xib) = map { "B$_" } (6 .. 9);
$xia = "A9";

# Reduction temporaries.  $rem is B4, the same register as $Htable.
($rem, $res) = qw(B4 B5);
42*1dcdf01fSchristos
# First assembly template: file prologue, both entry points and the head
# of the shared loop.  The heredoc is appended to $code after Perl has
# interpolated the register-name variables assigned above.
#
# Prologue
#   * Assemblers reporting .ASSEMBLER_VERSION below 7000000 get
#     __TI_EABI__ defined as 0.
#   * When __TI_EABI__ is non-zero the underscore-prefixed symbols used
#     in this file are .asg-substituted by their unprefixed spellings.
#   * RA is a substitution symbol for B3, which the epilogue branches to
#     on return.
#   * The _gcm_gmult_1bit entry is wrapped in ".if 0" and therefore not
#     assembled.
#
# _gcm_gmult_4bit
#   Loads H as two doublewords located just below $Htable ([-1] is
#   H.lo, [-2] is H.hi), copies the Xi pointer into $xip, fetches
#   Xi[15] and Xi[14], builds the 0xE1<<16 reduction constant and the
#   0xFF<<24 byte mask, extracts the upper bytes of the H words, zeroes
#   the Z accumulator and branches to ghash_loop? with B0=1 so that the
#   loop is taken a single time.
#
# _gcm_ghash_4bit
#   Sets B0 to len>>4 before $xip (which shares B6 with $len) is
#   written, then performs the same H/constant setup.  When B0 is
#   non-zero it additionally loads 16 input bytes with two LDNDW, XORs
#   them into the Xi value read from memory, extracts Xi[15] and Xi[14]
#   (the extraction differs between the .LITTLE_ENDIAN and the other
#   branch), zeroes Z and advances $xip by 14.  Xi is stored back with
#   STDW unconditionally.  Execution then falls through into
#   ghash_loop?.
#
# ghash_loop?
#   Opens the SPLOOPD with an iteration interval of 6, loads ILC from B1
#   (set to 15 by both entry points), decrements B0 when it is non-zero,
#   clears A0 and pre-shifts the first Xi byte into $xia and $xib.
43*1dcdf01fSchristos$code.=<<___;
44*1dcdf01fSchristos	.text
45*1dcdf01fSchristos
46*1dcdf01fSchristos	.if	.ASSEMBLER_VERSION<7000000
47*1dcdf01fSchristos	.asg	0,__TI_EABI__
48*1dcdf01fSchristos	.endif
49*1dcdf01fSchristos	.if	__TI_EABI__
50*1dcdf01fSchristos	.asg	gcm_gmult_1bit,_gcm_gmult_1bit
51*1dcdf01fSchristos	.asg	gcm_gmult_4bit,_gcm_gmult_4bit
52*1dcdf01fSchristos	.asg	gcm_ghash_4bit,_gcm_ghash_4bit
53*1dcdf01fSchristos	.endif
54*1dcdf01fSchristos
55*1dcdf01fSchristos	.asg	B3,RA
56*1dcdf01fSchristos
57*1dcdf01fSchristos	.if	0
58*1dcdf01fSchristos	.global	_gcm_gmult_1bit
59*1dcdf01fSchristos_gcm_gmult_1bit:
60*1dcdf01fSchristos	ADDAD	$Htable,2,$Htable
61*1dcdf01fSchristos	.endif
62*1dcdf01fSchristos	.global	_gcm_gmult_4bit
63*1dcdf01fSchristos_gcm_gmult_4bit:
64*1dcdf01fSchristos	.asmfunc
65*1dcdf01fSchristos	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
66*1dcdf01fSchristos	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
67*1dcdf01fSchristos||	MV	$Xip,${xip}		; reassign Xi
68*1dcdf01fSchristos||	MVK	15,B1			; SPLOOPD constant
69*1dcdf01fSchristos
70*1dcdf01fSchristos	MVK	0xE1,$E10000
71*1dcdf01fSchristos||	LDBU	*++${xip}[15],$x1	; Xi[15]
72*1dcdf01fSchristos	MVK	0xFF,$FF000000
73*1dcdf01fSchristos||	LDBU	*--${xip},$x0		; Xi[14]
74*1dcdf01fSchristos	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
75*1dcdf01fSchristos	SHL	$FF000000,24,$FF000000	; upper byte mask
76*1dcdf01fSchristos||	BNOP	ghash_loop?
77*1dcdf01fSchristos||	MVK	1,B0			; take a single spin
78*1dcdf01fSchristos
79*1dcdf01fSchristos	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
80*1dcdf01fSchristos	AND	$H2,$FF000000,$H2u	; H2's upper byte
81*1dcdf01fSchristos	AND	$H3,$FF000000,$H3u	; H3's upper byte
82*1dcdf01fSchristos||	SHRU	$H2u,8,$H2u
83*1dcdf01fSchristos	SHRU	$H3u,8,$H3u
84*1dcdf01fSchristos||	ZERO	$Z1:$Z0
85*1dcdf01fSchristos	SHRU2	$xia,8,$H01u
86*1dcdf01fSchristos||	ZERO	$Z3:$Z2
87*1dcdf01fSchristos	.endasmfunc
88*1dcdf01fSchristos
89*1dcdf01fSchristos	.global	_gcm_ghash_4bit
90*1dcdf01fSchristos_gcm_ghash_4bit:
91*1dcdf01fSchristos	.asmfunc
92*1dcdf01fSchristos	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
93*1dcdf01fSchristos||	SHRU	$len,4,B0		; reassign len
94*1dcdf01fSchristos	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
95*1dcdf01fSchristos||	MV	$Xip,${xip}		; reassign Xi
96*1dcdf01fSchristos||	MVK	15,B1			; SPLOOPD constant
97*1dcdf01fSchristos
98*1dcdf01fSchristos	MVK	0xE1,$E10000
99*1dcdf01fSchristos|| [B0]	LDNDW	*${inp}[1],$H1x:$H0x
100*1dcdf01fSchristos	MVK	0xFF,$FF000000
101*1dcdf01fSchristos|| [B0]	LDNDW	*${inp}++[2],$H3x:$H2x
102*1dcdf01fSchristos	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
103*1dcdf01fSchristos||	LDDW	*${xip}[1],$Z1:$Z0
104*1dcdf01fSchristos	SHL	$FF000000,24,$FF000000	; upper byte mask
105*1dcdf01fSchristos||	LDDW	*${xip}[0],$Z3:$Z2
106*1dcdf01fSchristos
107*1dcdf01fSchristos	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
108*1dcdf01fSchristos	AND	$H2,$FF000000,$H2u	; H2's upper byte
109*1dcdf01fSchristos	AND	$H3,$FF000000,$H3u	; H3's upper byte
110*1dcdf01fSchristos||	SHRU	$H2u,8,$H2u
111*1dcdf01fSchristos	SHRU	$H3u,8,$H3u
112*1dcdf01fSchristos	SHRU2	$xia,8,$H01u
113*1dcdf01fSchristos
114*1dcdf01fSchristos|| [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
115*1dcdf01fSchristos|| [B0]	XOR	$H1x,$Z1,$Z1
116*1dcdf01fSchristos	.if	.LITTLE_ENDIAN
117*1dcdf01fSchristos   [B0]	XOR	$H2x,$Z2,$Z2
118*1dcdf01fSchristos|| [B0]	XOR	$H3x,$Z3,$Z3
119*1dcdf01fSchristos|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
120*1dcdf01fSchristos	STDW	$Z1:$Z0,*${xip}[1]
121*1dcdf01fSchristos|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
122*1dcdf01fSchristos|| [B0]	ZERO	$Z1:$Z0
123*1dcdf01fSchristos	.else
124*1dcdf01fSchristos   [B0]	XOR	$H2x,$Z2,$Z2
125*1dcdf01fSchristos|| [B0]	XOR	$H3x,$Z3,$Z3
126*1dcdf01fSchristos|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
127*1dcdf01fSchristos	STDW	$Z1:$Z0,*${xip}[1]
128*1dcdf01fSchristos|| [B0] SHRU	$Z0,8,$x0		; Xi[14]
129*1dcdf01fSchristos|| [B0]	ZERO	$Z1:$Z0
130*1dcdf01fSchristos	.endif
131*1dcdf01fSchristos	STDW	$Z3:$Z2,*${xip}[0]
132*1dcdf01fSchristos|| [B0]	ZERO	$Z3:$Z2
133*1dcdf01fSchristos|| [B0]	MV	$xia,$x1
134*1dcdf01fSchristos   [B0]	ADDK	14,${xip}
135*1dcdf01fSchristos
136*1dcdf01fSchristosghash_loop?:
137*1dcdf01fSchristos	SPLOOPD	6			; 6*16+7
138*1dcdf01fSchristos||	MVC	B1,ILC
139*1dcdf01fSchristos|| [B0]	SUB	B0,1,B0
140*1dcdf01fSchristos||	ZERO	A0
141*1dcdf01fSchristos||	ADD	$x1,$x1,$xib		; SHL	$x1,1,$xib
142*1dcdf01fSchristos||	SHL	$x1,1,$xia
143*1dcdf01fSchristos___
144*1dcdf01fSchristos
145*1dcdf01fSchristos########____________________________
146*1dcdf01fSchristos#  0    D2.     M1          M2      |
147*1dcdf01fSchristos#  1            M1                  |
148*1dcdf01fSchristos#  2            M1          M2      |
149*1dcdf01fSchristos#  3        D1. M1          M2      |
150*1dcdf01fSchristos#  4        S1. L1                  |
151*1dcdf01fSchristos#  5    S2  S1x L1          D2  L2  |____________________________
152*1dcdf01fSchristos#  6/0          L1  S1      L2  S2x |D2.     M1          M2      |
153*1dcdf01fSchristos#  7/1          L1  S1  D1x S2  M2  |        M1                  |
154*1dcdf01fSchristos#  8/2              S1  L1x S2      |        M1          M2      |
155*1dcdf01fSchristos#  9/3              S1  L1x         |    D1. M1          M2      |
156*1dcdf01fSchristos# 10/4                  D1x         |    S1. L1                  |
157*1dcdf01fSchristos# 11/5                              |S2  S1x L1          D2  L2  |____________
158*1dcdf01fSchristos# 12/6/0                D1x       __|        L1  S1      L2  S2x |D2.     ....
159*1dcdf01fSchristos#    7/1                                     L1  S1  D1x S2  M2  |        ....
160*1dcdf01fSchristos#    8/2                                         S1  L1x S2      |        ....
161*1dcdf01fSchristos#####...                                         ................|............
# Second assembly template: the software-pipelined loop body (everything
# up to SPKERNEL), the per-block epilogue and the identification string.
# The numbers in the trailing assembly comments ("; 0", "; 6/0",
# "; 12/6/0", "; 8/-" ...) are the cycle positions shown in the
# scheduling diagram above.
#
# Loop body, one Xi byte per iteration:
#   * XORMPY multiplies the H words and their separately kept upper
#     bytes by the current Xi byte shifted left by one ($xia / $xib).
#   * The products are XORed into Z0..Z3; the upper-byte products are
#     masked with $FF000000 before being merged.
#   * Z is shifted right by 8 bits with SHRMB/SHRU, and the bits shifted
#     out ($rem = Z0<<1) are multiplied by the pre-shifted polynomial in
#     $E10000 to give $res, which is XORed into Z3.
#   * The next Xi byte is fetched with "LDBU *--xip" while A0 is
#     non-zero; A0 is reloaded with 15 when it is zero and decremented
#     otherwise.
#
# Epilogue, after SPKERNEL:
#   * If B0 is non-zero the next 16 input bytes are pre-fetched.
#   * In the .LITTLE_ENDIAN branch Z is rearranged with SWAP2/SWAP4
#     before it is used.
#   * "[!B0] BNOP RA" returns when no blocks remain; otherwise
#     "[B0] BNOP ghash_loop?" starts the next block after the input has
#     been XORed into Z and the Xi[15]/Xi[14] bytes have been extracted.
#   * Z is written back to Xi with STDW unconditionally.
#
# The template ends by placing a NUL-terminated identification string in
# the .const section.
162*1dcdf01fSchristos$code.=<<___;
163*1dcdf01fSchristos	XORMPY	$H0,$xia,$H0x		; 0	; H·(Xi[i]<<1)
164*1dcdf01fSchristos||	XORMPY	$H01u,$xib,$H01y
165*1dcdf01fSchristos|| [A0]	LDBU	*--${xip},$x0
166*1dcdf01fSchristos	XORMPY	$H1,$xia,$H1x		; 1
167*1dcdf01fSchristos	XORMPY	$H2,$xia,$H2x		; 2
168*1dcdf01fSchristos||	XORMPY	$H2u,$xib,$H2y
169*1dcdf01fSchristos	XORMPY	$H3,$xia,$H3x		; 3
170*1dcdf01fSchristos||	XORMPY	$H3u,$xib,$H3y
171*1dcdf01fSchristos||[!A0]	MVK.D	15,A0				; *--${xip} counter
172*1dcdf01fSchristos	XOR.L	$H0x,$Z0,$Z0		; 4	; Z^=H·(Xi[i]<<1)
173*1dcdf01fSchristos|| [A0]	SUB.S	A0,1,A0
174*1dcdf01fSchristos	XOR.L	$H1x,$Z1,$Z1		; 5
175*1dcdf01fSchristos||	AND.D	$H01y,$FF000000,$H0z
176*1dcdf01fSchristos||	SWAP2.L	$H01y,$H1y		;	; SHL	$H01y,16,$H1y
177*1dcdf01fSchristos||	SHL	$x0,1,$xib
178*1dcdf01fSchristos||	SHL	$x0,1,$xia
179*1dcdf01fSchristos
180*1dcdf01fSchristos	XOR.L	$H2x,$Z2,$Z2		; 6/0	; [0,0] in epilogue
181*1dcdf01fSchristos||	SHL	$Z0,1,$rem		;	; rem=Z<<1
182*1dcdf01fSchristos||	SHRMB.S	$Z1,$Z0,$Z0		;	; Z>>=8
183*1dcdf01fSchristos||	AND.L	$H1y,$FF000000,$H1z
184*1dcdf01fSchristos	XOR.L	$H3x,$Z3,$Z3		; 7/1
185*1dcdf01fSchristos||	SHRMB.S	$Z2,$Z1,$Z1
186*1dcdf01fSchristos||	XOR.D	$H0z,$Z0,$Z0			; merge upper byte products
187*1dcdf01fSchristos||	AND.S	$H2y,$FF000000,$H2z
188*1dcdf01fSchristos||	XORMPY	$E10000,$rem,$res	;	; implicit rem&0x1FE
189*1dcdf01fSchristos	XOR.L	$H1z,$Z1,$Z1		; 8/2
190*1dcdf01fSchristos||	SHRMB.S	$Z3,$Z2,$Z2
191*1dcdf01fSchristos||	AND.S	$H3y,$FF000000,$H3z
192*1dcdf01fSchristos	XOR.L	$H2z,$Z2,$Z2		; 9/3
193*1dcdf01fSchristos||	SHRU	$Z3,8,$Z3
194*1dcdf01fSchristos	XOR.D	$H3z,$Z3,$Z3		; 10/4
195*1dcdf01fSchristos	NOP				; 11/5
196*1dcdf01fSchristos
197*1dcdf01fSchristos	SPKERNEL 0,2
198*1dcdf01fSchristos||	XOR.D	$res,$Z3,$Z3		; 12/6/0; Z^=res
199*1dcdf01fSchristos
200*1dcdf01fSchristos	; input pre-fetch is possible where D1 slot is available...
201*1dcdf01fSchristos   [B0]	LDNDW	*${inp}[1],$H1x:$H0x	; 8/-
202*1dcdf01fSchristos   [B0]	LDNDW	*${inp}++[2],$H3x:$H2x	; 9/-
203*1dcdf01fSchristos	NOP				; 10/-
204*1dcdf01fSchristos	.if	.LITTLE_ENDIAN
205*1dcdf01fSchristos	SWAP2	$Z0,$Z1			; 11/-
206*1dcdf01fSchristos||	SWAP4	$Z1,$Z0
207*1dcdf01fSchristos	SWAP4	$Z1,$Z1			; 12/-
208*1dcdf01fSchristos||	SWAP2	$Z0,$Z0
209*1dcdf01fSchristos	SWAP2	$Z2,$Z3
210*1dcdf01fSchristos||	SWAP4	$Z3,$Z2
211*1dcdf01fSchristos||[!B0]	BNOP	RA
212*1dcdf01fSchristos	SWAP4	$Z3,$Z3
213*1dcdf01fSchristos||	SWAP2	$Z2,$Z2
214*1dcdf01fSchristos|| [B0]	BNOP	ghash_loop?
215*1dcdf01fSchristos   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
216*1dcdf01fSchristos|| [B0]	XOR	$H1x,$Z1,$Z1
217*1dcdf01fSchristos   [B0]	XOR	$H2x,$Z2,$Z2
218*1dcdf01fSchristos|| [B0]	XOR	$H3x,$Z3,$Z3
219*1dcdf01fSchristos|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
220*1dcdf01fSchristos	STDW	$Z1:$Z0,*${xip}[1]
221*1dcdf01fSchristos|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
222*1dcdf01fSchristos|| [B0]	ZERO	$Z1:$Z0
223*1dcdf01fSchristos	.else
224*1dcdf01fSchristos  [!B0]	BNOP	RA			; 11/-
225*1dcdf01fSchristos   [B0]	BNOP	ghash_loop?		; 12/-
226*1dcdf01fSchristos   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
227*1dcdf01fSchristos|| [B0]	XOR	$H1x,$Z1,$Z1
228*1dcdf01fSchristos   [B0]	XOR	$H2x,$Z2,$Z2
229*1dcdf01fSchristos|| [B0]	XOR	$H3x,$Z3,$Z3
230*1dcdf01fSchristos|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
231*1dcdf01fSchristos	STDW	$Z1:$Z0,*${xip}[1]
232*1dcdf01fSchristos|| [B0] SHRU	$Z0,8,$x0		; Xi[14]
233*1dcdf01fSchristos|| [B0]	ZERO	$Z1:$Z0
234*1dcdf01fSchristos	.endif
235*1dcdf01fSchristos	STDW	$Z3:$Z2,*${xip}[0]
236*1dcdf01fSchristos|| [B0]	ZERO	$Z3:$Z2
237*1dcdf01fSchristos|| [B0]	MV	$xia,$x1
238*1dcdf01fSchristos   [B0]	ADDK	14,${xip}
239*1dcdf01fSchristos	.endasmfunc
240*1dcdf01fSchristos
241*1dcdf01fSchristos	.sect	.const
242*1dcdf01fSchristos	.cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
243*1dcdf01fSchristos	.align	4
244*1dcdf01fSchristos___
245*1dcdf01fSchristos
# Write out the assembly accumulated in $code.  Buffered write errors
# only surface when the handle is closed, hence the check on close.
print $code;
close(STDOUT) || die "error closing STDOUT: $!";
248