#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2011
#
# The module implements GCM GHASH function and underlying single
# multiplication operation in GF(2^128). Even though subroutines
# have _4bit suffix, they are not using any tables, but rely on
# hardware Galois Field Multiply support. Streamed GHASH processes
# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
# comparing apples vs. oranges, but compiler surely could have done
# better, because theoretical [though not necessarily achievable]
# estimate for "4-bit" table-driven implementation is ~12 cycles.

use strict;
use warnings;

# Command line handling: arguments are consumed from the front until one
# looks like a file name ("name.ext"); that argument becomes the output
# file.  Arguments preceding it are discarded.
my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output name was actually found.  A failed
# open on STDOUT would otherwise leave the handle closed and the generated
# code would be silently lost, so failure is fatal and the no-argument
# case simply keeps writing to the inherited STDOUT.
if (defined($output) && $output ne '') {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Symbolic register names.  Each variable holds the name of a machine
# register that is interpolated into the assembly text below.
my ($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6");	# arguments

my ($Z0,$Z1,$Z2,$Z3,	$H0, $H1, $H2, $H3,
			$H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
my ($H01u,$H01y,$H2u,$H3u,	$H0y,$H1y,$H2y,$H3y,
			$H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
my ($FF000000,$E10000)=("B30","B31");
my ($xip,$x0,$x1,$xib)=map("B$_",(6..9));	# $xip zaps $len
my  $xia="A9";
my ($rem,$res)=("B4","B5");		# $rem zaps $Htable

# Accumulates the generated assembly; printed in one go at the end.
my $code;

# Prologue, symbol aliases, gcm_gmult_4bit entry (falls into ghash_loop?
# for a single iteration), gcm_ghash_4bit entry and the loop header.
$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	gcm_gmult_1bit,_gcm_gmult_1bit
	.asg	gcm_gmult_4bit,_gcm_gmult_4bit
	.asg	gcm_ghash_4bit,_gcm_ghash_4bit
	.endif

	.asg	B3,RA

	.if	0
	.global	_gcm_gmult_1bit
_gcm_gmult_1bit:
	ADDAD	$Htable,2,$Htable
	.endif
	.global	_gcm_gmult_4bit
_gcm_gmult_4bit:
	.asmfunc
	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
||	MV	$Xip,${xip}		; reassign Xi
||	MVK	15,B1			; SPLOOPD constant

	MVK	0xE1,$E10000
||	LDBU	*++${xip}[15],$x1	; Xi[15]
	MVK	0xFF,$FF000000
||	LDBU	*--${xip},$x0		; Xi[14]
	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
	SHL	$FF000000,24,$FF000000	; upper byte mask
||	BNOP	ghash_loop?
||	MVK	1,B0			; take a single spin

	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
	AND	$H2,$FF000000,$H2u	; H2's upper byte
	AND	$H3,$FF000000,$H3u	; H3's upper byte
||	SHRU	$H2u,8,$H2u
	SHRU	$H3u,8,$H3u
||	ZERO	$Z1:$Z0
	SHRU2	$xia,8,$H01u
||	ZERO	$Z3:$Z2
	.endasmfunc

	.global	_gcm_ghash_4bit
_gcm_ghash_4bit:
	.asmfunc
	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
||	SHRU	$len,4,B0		; reassign len
	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
||	MV	$Xip,${xip}		; reassign Xi
||	MVK	15,B1			; SPLOOPD constant

	MVK	0xE1,$E10000
|| [B0]	LDNDW	*${inp}[1],$H1x:$H0x
	MVK	0xFF,$FF000000
|| [B0]	LDNDW	*${inp}++[2],$H3x:$H2x
	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
||	LDDW	*${xip}[1],$Z1:$Z0
	SHL	$FF000000,24,$FF000000	; upper byte mask
||	LDDW	*${xip}[0],$Z3:$Z2

	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
	AND	$H2,$FF000000,$H2u	; H2's upper byte
	AND	$H3,$FF000000,$H3u	; H3's upper byte
||	SHRU	$H2u,8,$H2u
	SHRU	$H3u,8,$H3u
	SHRU2	$xia,8,$H01u

|| [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
|| [B0]	XOR	$H1x,$Z1,$Z1
	.if	.LITTLE_ENDIAN
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.else
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0] SHRU	$Z0,8,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.endif
	STDW	$Z3:$Z2,*${xip}[0]
|| [B0]	ZERO	$Z3:$Z2
|| [B0]	MV	$xia,$x1
   [B0]	ADDK	14,${xip}

ghash_loop?:
	SPLOOPD	6			; 6*16+7
||	MVC	B1,ILC
|| [B0]	SUB	B0,1,B0
||	ZERO	A0
||	ADD	$x1,$x1,$xib		; SHL	$x1,1,$xib
||	SHL	$x1,1,$xia
___

# Scheduling chart for the software-pipelined loop body that follows.
# The leftmost column matches the cycle tags ("; 0", "; 6/0", "; 12/6/0",
# ...) in the instruction comments below.
########____________________________
#  0    D2.     M1          M2      |
#  1            M1                  |
#  2            M1          M2      |
#  3        D1. M1          M2      |
#  4        S1.     L1              |
#  5    S2  S1x     L1  D2  L2      |____________________________
#  6/0          L1  S1      L2  S2x |D2.     M1          M2      |
#  7/1          L1  S1  D1x S2  M2  |        M1                  |
#  8/2              S1  L1x S2      |        M1          M2      |
#  9/3              S1  L1x         |    D1. M1          M2      |
# 10/4                  D1x         |    S1.     L1              |
# 11/5                              |S2  S1x     L1  D2  L2      |____________
# 12/6/0                D1x       __|        L1  S1      L2  S2x |D2.     ....
#    7/1                                     L1  S1  D1x S2  M2  |        ....
#    8/2                                         S1  L1x S2      |        ....
#####...                                         ................|............
# Loop kernel, epilogue (byte-order fix-up, next-block pre-fetch, store of
# Xi, branch back or return) and the identification string.
$code.=<<___;
	XORMPY	$H0,$xia,$H0x		; 0	; H·(Xi[i]<<1)
||	XORMPY	$H01u,$xib,$H01y
|| [A0]	LDBU	*--${xip},$x0
	XORMPY	$H1,$xia,$H1x		; 1
	XORMPY	$H2,$xia,$H2x		; 2
||	XORMPY	$H2u,$xib,$H2y
	XORMPY	$H3,$xia,$H3x		; 3
||	XORMPY	$H3u,$xib,$H3y
||[!A0]	MVK.D	15,A0				; *--${xip} counter
	XOR.L	$H0x,$Z0,$Z0		; 4	; Z^=H·(Xi[i]<<1)
|| [A0]	SUB.S	A0,1,A0
	XOR.L	$H1x,$Z1,$Z1		; 5
||	AND.D	$H01y,$FF000000,$H0z
||	SWAP2.L	$H01y,$H1y		;	; SHL	$H01y,16,$H1y
||	SHL	$x0,1,$xib
||	SHL	$x0,1,$xia

	XOR.L	$H2x,$Z2,$Z2		; 6/0	; [0,0] in epilogue
||	SHL	$Z0,1,$rem		;	; rem=Z<<1
||	SHRMB.S	$Z1,$Z0,$Z0		;	; Z>>=8
||	AND.L	$H1y,$FF000000,$H1z
	XOR.L	$H3x,$Z3,$Z3		; 7/1
||	SHRMB.S	$Z2,$Z1,$Z1
||	XOR.D	$H0z,$Z0,$Z0			; merge upper byte products
||	AND.S	$H2y,$FF000000,$H2z
||	XORMPY	$E10000,$rem,$res	;	; implicit rem&0x1FE
	XOR.L	$H1z,$Z1,$Z1		; 8/2
||	SHRMB.S	$Z3,$Z2,$Z2
||	AND.S	$H3y,$FF000000,$H3z
	XOR.L	$H2z,$Z2,$Z2		; 9/3
||	SHRU	$Z3,8,$Z3
	XOR.D	$H3z,$Z3,$Z3		; 10/4
	NOP				; 11/5

	SPKERNEL 0,2
||	XOR.D	$res,$Z3,$Z3		; 12/6/0; Z^=res

	; input pre-fetch is possible where D1 slot is available...
   [B0]	LDNDW	*${inp}[1],$H1x:$H0x	; 8/-
   [B0]	LDNDW	*${inp}++[2],$H3x:$H2x	; 9/-
	NOP				; 10/-
	.if	.LITTLE_ENDIAN
	SWAP2	$Z0,$Z1			; 11/-
||	SWAP4	$Z1,$Z0
	SWAP4	$Z1,$Z1			; 12/-
||	SWAP2	$Z0,$Z0
	SWAP2	$Z2,$Z3
||	SWAP4	$Z3,$Z2
||[!B0]	BNOP	RA
	SWAP4	$Z3,$Z3
||	SWAP2	$Z2,$Z2
|| [B0]	BNOP	ghash_loop?
   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
|| [B0]	XOR	$H1x,$Z1,$Z1
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.else
  [!B0]	BNOP	RA			; 11/-
   [B0]	BNOP	ghash_loop?		; 12/-
   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
|| [B0]	XOR	$H1x,$Z1,$Z1
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0] SHRU	$Z0,8,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.endif
	STDW	$Z3:$Z2,*${xip}[0]
|| [B0]	ZERO	$Z3:$Z2
|| [B0]	MV	$xia,$x1
   [B0]	ADDK	14,${xip}
	.endasmfunc

	.sect	.const
	.cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

# Emit the generated assembly; buffered write errors surface at close.
print $code;
close STDOUT or die "error closing STDOUT: $!";