#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2011
#
# The module implements GCM GHASH function and underlying single
# multiplication operation in GF(2^128). Even though subroutines
# have _4bit suffix, they are not using any tables, but rely on
# hardware Galois Field Multiply support. Streamed GHASH processes
# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
# comparing apples vs. oranges, but compiler surely could have done
# better, because theoretical [though not necessarily achievable]
# estimate for "4-bit" table-driven implementation is ~12 cycles.
use strict;
use warnings;

# Generator script: builds the C64x+ assembly text for gcm_gmult_4bit and
# gcm_ghash_4bit in $code and prints it.
#
# Command line: arguments are consumed from the left until one looks like
# a file name ("name.ext"); that argument becomes the output file.  When
# no such argument is found, the assembly is written to the inherited
# STDOUT.

my $output;
while (($output = shift) && ($output !~ /\w[\w\-]*\.\w+$/)) {}

# Three-argument open: the file name is taken literally and a failure to
# create the output file aborts the run instead of being ignored.
if (defined $output && length $output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}

# Register assignments.  Each Perl variable holds the name of the machine
# register that is interpolated into the assembly templates below.

my ($Xip, $Htable, $inp, $len) = ("A4", "B4", "A6", "B6");	# arguments

# A16..A27
my ($Z0,  $Z1,  $Z2,  $Z3,
    $H0,  $H1,  $H2,  $H3,
    $H0x, $H1x, $H2x, $H3x) = map("A$_", (16..27));

# B16..B27.  $H0y is never referenced by the templates; it is kept in the
# list so that the names after it stay on B21..B27.
my ($H01u, $H01y, $H2u, $H3u,
    $H0y,  $H1y,  $H2y, $H3y,
    $H0z,  $H1z,  $H2z, $H3z) = map("B$_", (16..27));

my ($FF000000, $E10000) = ("B30", "B31");

my ($xip, $x0, $x1, $xib) = map("B$_", (6..9));	# $xip zaps $len (both B6)
my $xia = "A9";
my ($rem, $res) = ("B4", "B5");			# $rem zaps $Htable (both B4)

my $code = '';

# Part 1: symbol aliases, the gcm_gmult_4bit entry point (which branches
# into ghash_loop? with B0=1, i.e. a single pass), the gcm_ghash_4bit
# prologue and the head of the SPLOOPD loop.
$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	gcm_gmult_1bit,_gcm_gmult_1bit
	.asg	gcm_gmult_4bit,_gcm_gmult_4bit
	.asg	gcm_ghash_4bit,_gcm_ghash_4bit
	.endif

	.asg	B3,RA

	.if	0
	.global	_gcm_gmult_1bit
_gcm_gmult_1bit:
	ADDAD	$Htable,2,$Htable
	.endif
	.global	_gcm_gmult_4bit
_gcm_gmult_4bit:
	.asmfunc
	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
||	MV	$Xip,${xip}		; reassign Xi
||	MVK	15,B1			; SPLOOPD constant

	MVK	0xE1,$E10000
||	LDBU	*++${xip}[15],$x1	; Xi[15]
	MVK	0xFF,$FF000000
||	LDBU	*--${xip},$x0		; Xi[14]
	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
	SHL	$FF000000,24,$FF000000	; upper byte mask
||	BNOP	ghash_loop?
||	MVK	1,B0			; take a single spin

	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
	AND	$H2,$FF000000,$H2u	; H2's upper byte
	AND	$H3,$FF000000,$H3u	; H3's upper byte
||	SHRU	$H2u,8,$H2u
	SHRU	$H3u,8,$H3u
||	ZERO	$Z1:$Z0
	SHRU2	$xia,8,$H01u
||	ZERO	$Z3:$Z2
	.endasmfunc

	.global	_gcm_ghash_4bit
_gcm_ghash_4bit:
	.asmfunc
	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
||	SHRU	$len,4,B0		; reassign len
	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
||	MV	$Xip,${xip}		; reassign Xi
||	MVK	15,B1			; SPLOOPD constant

	MVK	0xE1,$E10000
|| [B0]	LDNDW	*${inp}[1],$H1x:$H0x
	MVK	0xFF,$FF000000
|| [B0]	LDNDW	*${inp}++[2],$H3x:$H2x
	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
||	LDDW	*${xip}[1],$Z1:$Z0
	SHL	$FF000000,24,$FF000000	; upper byte mask
||	LDDW	*${xip}[0],$Z3:$Z2

	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
	AND	$H2,$FF000000,$H2u	; H2's upper byte
	AND	$H3,$FF000000,$H3u	; H3's upper byte
||	SHRU	$H2u,8,$H2u
	SHRU	$H3u,8,$H3u
	SHRU2	$xia,8,$H01u

|| [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
|| [B0]	XOR	$H1x,$Z1,$Z1
	.if	.LITTLE_ENDIAN
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.else
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0]	SHRU	$Z0,8,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.endif
	STDW	$Z3:$Z2,*${xip}[0]
|| [B0]	ZERO	$Z3:$Z2
|| [B0]	MV	$xia,$x1
   [B0]	ADDK	14,${xip}

ghash_loop?:
	SPLOOPD	6			; 6*16+7
||	MVC	B1,ILC
|| [B0]	SUB	B0,1,B0
||	ZERO	A0
||	ADD	$x1,$x1,$xib		; SHL	$x1,1,$xib
||	SHL	$x1,1,$xia
___

# Schedule of the loop body below.  The leading numbers match the cycle
# tags in the assembly comments ("; 0", "; 6/0", "; 12/6/0", ...); the
# remaining tokens appear to name the functional units busy in that cycle,
# with '|' separating overlapped iterations.
# NOTE(review): the column alignment of this table was not preserved in
# this copy of the file -- restore it from the upstream source.
########____________________________
# 0 D2. M1 M2 |
# 1 M1 |
# 2 M1 M2 |
# 3 D1. M1 M2 |
# 4 S1. L1 |
# 5 S2 S1x L1 D2 L2 |____________________________
# 6/0 L1 S1 L2 S2x |D2. M1 M2 |
# 7/1 L1 S1 D1x S2 M2 | M1 |
# 8/2 S1 L1x S2 | M1 M2 |
# 9/3 S1 L1x | D1. M1 M2 |
# 10/4 D1x | S1. L1 |
# 11/5 |S2 S1x L1 D2 L2 |____________
# 12/6/0 D1x __| L1 S1 L2 S2x |D2. ....
# 7/1 L1 S1 D1x S2 M2 | ....
# 8/2 S1 L1x S2 | ....
#####... ................|............

# Part 2: the software-pipelined loop body, then the per-block tail that
# byte-swaps Z on little-endian targets, stores Xi and either returns
# (B0 == 0) or folds in the next input block and branches back to
# ghash_loop?.
$code.=<<___;
	XORMPY	$H0,$xia,$H0x		; 0	; H·(Xi[i]<<1)
||	XORMPY	$H01u,$xib,$H01y
|| [A0]	LDBU	*--${xip},$x0
	XORMPY	$H1,$xia,$H1x		; 1
	XORMPY	$H2,$xia,$H2x		; 2
||	XORMPY	$H2u,$xib,$H2y
	XORMPY	$H3,$xia,$H3x		; 3
||	XORMPY	$H3u,$xib,$H3y
||[!A0]	MVK.D	15,A0				; *--${xip} counter
	XOR.L	$H0x,$Z0,$Z0		; 4	; Z^=H·(Xi[i]<<1)
|| [A0]	SUB.S	A0,1,A0
	XOR.L	$H1x,$Z1,$Z1		; 5
||	AND.D	$H01y,$FF000000,$H0z
||	SWAP2.L	$H01y,$H1y		;	; SHL	$H01y,16,$H1y
||	SHL	$x0,1,$xib
||	SHL	$x0,1,$xia

	XOR.L	$H2x,$Z2,$Z2		; 6/0	; [0,0] in epilogue
||	SHL	$Z0,1,$rem		;	; rem=Z<<1
||	SHRMB.S	$Z1,$Z0,$Z0		;	; Z>>=8
||	AND.L	$H1y,$FF000000,$H1z
	XOR.L	$H3x,$Z3,$Z3		; 7/1
||	SHRMB.S	$Z2,$Z1,$Z1
||	XOR.D	$H0z,$Z0,$Z0			; merge upper byte products
||	AND.S	$H2y,$FF000000,$H2z
||	XORMPY	$E10000,$rem,$res	;	; implicit rem&0x1FE
	XOR.L	$H1z,$Z1,$Z1		; 8/2
||	SHRMB.S	$Z3,$Z2,$Z2
||	AND.S	$H3y,$FF000000,$H3z
	XOR.L	$H2z,$Z2,$Z2		; 9/3
||	SHRU	$Z3,8,$Z3
	XOR.D	$H3z,$Z3,$Z3		; 10/4
	NOP				; 11/5

	SPKERNEL 0,2
||	XOR.D	$res,$Z3,$Z3		; 12/6/0; Z^=res

	; input pre-fetch is possible where D1 slot is available...
   [B0]	LDNDW	*${inp}[1],$H1x:$H0x	; 8/-
   [B0]	LDNDW	*${inp}++[2],$H3x:$H2x	; 9/-
	NOP				; 10/-
	.if	.LITTLE_ENDIAN
	SWAP2	$Z0,$Z1			; 11/-
||	SWAP4	$Z1,$Z0
	SWAP4	$Z1,$Z1			; 12/-
||	SWAP2	$Z0,$Z0
	SWAP2	$Z2,$Z3
||	SWAP4	$Z3,$Z2
||[!B0]	BNOP	RA
	SWAP4	$Z3,$Z3
||	SWAP2	$Z2,$Z2
|| [B0]	BNOP	ghash_loop?
   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
|| [B0]	XOR	$H1x,$Z1,$Z1
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.else
  [!B0]	BNOP	RA			; 11/-
   [B0]	BNOP	ghash_loop?		; 12/-
   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
|| [B0]	XOR	$H1x,$Z1,$Z1
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0]	SHRU	$Z0,8,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.endif
	STDW	$Z3:$Z2,*${xip}[0]
|| [B0]	ZERO	$Z3:$Z2
|| [B0]	MV	$xia,$x1
   [B0]	ADDK	14,${xip}
	.endasmfunc

	.sect	.const
	.cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

print $code;
# Buffered write errors only surface at close, so the result is checked.
close STDOUT or die "error closing STDOUT: $!";