1#! /usr/bin/env perl 2# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16 17# SHA1 block procedure for s390x. 18 19# April 2007. 20# 21# Performance is >30% better than gcc 3.3 generated code. But the real 22# twist is that SHA1 hardware support is detected and utilized. In 23# which case performance can reach further >4.5x for larger chunks. 24 25# January 2009. 26# 27# Optimize Xupdate for amount of memory references and reschedule 28# instructions to favour dual-issue z10 pipeline. On z10 hardware is 29# "only" ~2.3x faster than software. 30 31# November 2010. 32# 33# Adapt for -m31 build. If kernel supports what's called "highgprs" 34# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit 35# instructions and achieve "64-bit" performance even in 31-bit legacy 36# application context. The feature is not specific to any particular 37# processor, as long as it's "z-CPU". Latter implies that the code 38# remains z/Architecture specific. On z990 it was measured to perform 39# 23% better than code generated by gcc 4.3. 40 41$kimdfunc=1; # magic function code for kimd instruction 42 43$flavour = shift; 44 45if ($flavour =~ /3[12]/) { 46 $SIZE_T=4; 47 $g=""; 48} else { 49 $SIZE_T=8; 50 $g="g"; 51} 52 53while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} 54open STDOUT,">$output"; 55 56$K_00_39="%r0"; $K=$K_00_39; 57$K_40_79="%r1"; 58$ctx="%r2"; $prefetch="%r2"; 59$inp="%r3"; 60$len="%r4"; 61 62$A="%r5"; 63$B="%r6"; 64$C="%r7"; 65$D="%r8"; 66$E="%r9"; @V=($A,$B,$C,$D,$E); 67$t0="%r10"; 68$t1="%r11"; 69@X=("%r12","%r13","%r14"); 70$sp="%r15"; 71 72$stdframe=16*$SIZE_T+4*8; 73$frame=$stdframe+16*4; 74 75sub Xupdate { 76my $i=shift; 77 78$code.=<<___ if ($i==15); 79 lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up 80 lr $X[0],$X[2] 81___ 82return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle 83$code.=<<___ if ($i<16); 84 lg $X[0],`$i*4`($inp) ### Xload($i) 85 rllg $X[1],$X[0],32 86___ 87$code.=<<___ if ($i>=16); 88 xgr $X[0],$prefetch ### Xupdate($i) 89 lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp) 90 xg $X[0],`$stdframe+4*(($i+8)%16)`($sp) 91 xgr $X[0],$prefetch 92 rll $X[0],$X[0],1 93 rllg $X[1],$X[0],32 94 rll $X[1],$X[1],1 95 rllg $X[0],$X[1],32 96 lr $X[2],$X[1] # feedback 97___ 98$code.=<<___ if ($i<=70); 99 stg $X[0],`$stdframe+4*($i%16)`($sp) 100___ 101unshift(@X,pop(@X)); 102} 103 104sub BODY_00_19 { 105my ($i,$a,$b,$c,$d,$e)=@_; 106my $xi=$X[1]; 107 108 &Xupdate($i); 109$code.=<<___; 110 alr $e,$K ### $i 111 rll $t1,$a,5 112 lr $t0,$d 113 xr $t0,$c 114 alr $e,$t1 115 nr $t0,$b 116 alr $e,$xi 117 xr $t0,$d 118 rll $b,$b,30 119 alr $e,$t0 120___ 121} 122 123sub BODY_20_39 { 124my ($i,$a,$b,$c,$d,$e)=@_; 125my $xi=$X[1]; 126 127 &Xupdate($i); 128$code.=<<___; 129 alr $e,$K ### $i 130 rll $t1,$a,5 131 lr $t0,$b 132 alr $e,$t1 133 xr $t0,$c 134 alr $e,$xi 135 xr $t0,$d 136 rll $b,$b,30 137 alr $e,$t0 138___ 139} 140 141sub BODY_40_59 { 142my ($i,$a,$b,$c,$d,$e)=@_; 143my $xi=$X[1]; 144 145 &Xupdate($i); 146$code.=<<___; 147 alr $e,$K ### $i 148 rll $t1,$a,5 149 lr $t0,$b 150 alr $e,$t1 151 or $t0,$c 152 lr $t1,$b 153 nr $t0,$d 154 nr $t1,$c 155 alr $e,$xi 156 or $t0,$t1 157 rll $b,$b,30 158 alr $e,$t0 159___ 160} 161 162$code.=<<___; 163#include "s390x_arch.h" 164 165.text 166.align 64 167.type Ktable,\@object 168Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6 169 .skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0 170.size Ktable,.-Ktable 171.globl sha1_block_data_order 172.type sha1_block_data_order,\@function 173sha1_block_data_order: 174___ 175$code.=<<___ if ($kimdfunc); 176 larl %r1,OPENSSL_s390xcap_P 177 lg %r0,S390X_KIMD(%r1) # check kimd capabilities 178 tmhh %r0,`0x8000>>$kimdfunc` 179 jz .Lsoftware 180 lghi %r0,$kimdfunc 181 lgr %r1,$ctx 182 lgr %r2,$inp 183 sllg %r3,$len,6 184 .long 0xb93e0002 # kimd %r0,%r2 185 brc 1,.-4 # pay attention to "partial completion" 186 br %r14 187.align 16 188.Lsoftware: 189___ 190$code.=<<___; 191 lghi %r1,-$frame 192 st${g} $ctx,`2*$SIZE_T`($sp) 193 stm${g} %r6,%r15,`6*$SIZE_T`($sp) 194 lgr %r0,$sp 195 la $sp,0(%r1,$sp) 196 st${g} %r0,0($sp) 197 198 larl $t0,Ktable 199 llgf $A,0($ctx) 200 llgf $B,4($ctx) 201 llgf $C,8($ctx) 202 llgf $D,12($ctx) 203 llgf $E,16($ctx) 204 205 lg $K_00_39,0($t0) 206 lg $K_40_79,8($t0) 207 208.Lloop: 209 rllg $K_00_39,$K_00_39,32 210___ 211for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } 212$code.=<<___; 213 rllg $K_00_39,$K_00_39,32 214___ 215for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 216$code.=<<___; $K=$K_40_79; 217 rllg $K_40_79,$K_40_79,32 218___ 219for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } 220$code.=<<___; 221 rllg $K_40_79,$K_40_79,32 222___ 223for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 224$code.=<<___; 225 226 l${g} $ctx,`$frame+2*$SIZE_T`($sp) 227 la $inp,64($inp) 228 al $A,0($ctx) 229 al $B,4($ctx) 230 al $C,8($ctx) 231 al $D,12($ctx) 232 al $E,16($ctx) 233 st $A,0($ctx) 234 st $B,4($ctx) 235 st $C,8($ctx) 236 st $D,12($ctx) 237 st $E,16($ctx) 238 brct${g} $len,.Lloop 239 240 lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) 241 br %r14 242.size sha1_block_data_order,.-sha1_block_data_order 243.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>" 244___ 245 246$code =~ s/\`([^\`]*)\`/eval $1/gem; 247 248print $code; 249close STDOUT or die "error closing STDOUT: $!"; 250