1#! /usr/bin/env perl 2# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# December 2015 18# 19# ChaCha20 for s390x. 20# 21# 3 times faster than compiler-generated code. 22 23$flavour = shift; 24 25if ($flavour =~ /3[12]/) { 26 $SIZE_T=4; 27 $g=""; 28} else { 29 $SIZE_T=8; 30 $g="g"; 31} 32 33while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} 34open STDOUT,">$output"; 35 36sub AUTOLOAD() # thunk [simplified] x86-style perlasm 37{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 38 $code .= "\t$opcode\t".join(',',@_)."\n"; 39} 40 41my $sp="%r15"; 42 43my $stdframe=16*$SIZE_T+4*8; 44my $frame=$stdframe+4*20; 45 46my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6)); 47 48my @x=map("%r$_",(0..7,"x","x","x","x",(10..13))); 49my @t=map("%r$_",(8,9)); 50 51sub ROUND { 52my ($a0,$b0,$c0,$d0)=@_; 53my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); 54my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); 55my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); 56my ($xc,$xc_)=map("\"$_\"",@t); 57my @x=map("\"$_\"",@x); 58 59 # Consider order in which variables are addressed by their 60 # index: 61 # 62 # a b c d 63 # 64 # 0 4 8 12 < even round 65 # 1 5 9 13 66 # 2 6 10 14 67 # 3 7 11 15 68 # 0 5 10 15 < odd round 69 # 1 6 11 12 70 # 2 7 8 13 71 # 3 4 9 14 72 # 73 # 'a', 'b' and 'd's are permanently allocated in registers, 74 # @x[0..7,12..15], while 'c's are maintained in memory. If 75 # you observe 'c' column, you'll notice that pair of 'c's is 76 # invariant between rounds. This means that we have to reload 77 # them once per round, in the middle. This is why you'll see 78 # 'c' stores and loads in the middle, but none in the beginning 79 # or end. 80 81 ( 82 "&alr (@x[$a0],@x[$b0])", # Q1 83 "&alr (@x[$a1],@x[$b1])", # Q2 84 "&xr (@x[$d0],@x[$a0])", 85 "&xr (@x[$d1],@x[$a1])", 86 "&rll (@x[$d0],@x[$d0],16)", 87 "&rll (@x[$d1],@x[$d1],16)", 88 89 "&alr ($xc,@x[$d0])", 90 "&alr ($xc_,@x[$d1])", 91 "&xr (@x[$b0],$xc)", 92 "&xr (@x[$b1],$xc_)", 93 "&rll (@x[$b0],@x[$b0],12)", 94 "&rll (@x[$b1],@x[$b1],12)", 95 96 "&alr (@x[$a0],@x[$b0])", 97 "&alr (@x[$a1],@x[$b1])", 98 "&xr (@x[$d0],@x[$a0])", 99 "&xr (@x[$d1],@x[$a1])", 100 "&rll (@x[$d0],@x[$d0],8)", 101 "&rll (@x[$d1],@x[$d1],8)", 102 103 "&alr ($xc,@x[$d0])", 104 "&alr ($xc_,@x[$d1])", 105 "&xr (@x[$b0],$xc)", 106 "&xr (@x[$b1],$xc_)", 107 "&rll (@x[$b0],@x[$b0],7)", 108 "&rll (@x[$b1],@x[$b1],7)", 109 110 "&stm ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's 111 "&lm ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')", 112 113 "&alr (@x[$a2],@x[$b2])", # Q3 114 "&alr (@x[$a3],@x[$b3])", # Q4 115 "&xr (@x[$d2],@x[$a2])", 116 "&xr (@x[$d3],@x[$a3])", 117 "&rll (@x[$d2],@x[$d2],16)", 118 "&rll (@x[$d3],@x[$d3],16)", 119 120 "&alr ($xc,@x[$d2])", 121 "&alr ($xc_,@x[$d3])", 122 "&xr (@x[$b2],$xc)", 123 "&xr (@x[$b3],$xc_)", 124 "&rll (@x[$b2],@x[$b2],12)", 125 "&rll (@x[$b3],@x[$b3],12)", 126 127 "&alr (@x[$a2],@x[$b2])", 128 "&alr (@x[$a3],@x[$b3])", 129 "&xr (@x[$d2],@x[$a2])", 130 "&xr (@x[$d3],@x[$a3])", 131 "&rll (@x[$d2],@x[$d2],8)", 132 "&rll (@x[$d3],@x[$d3],8)", 133 134 "&alr ($xc,@x[$d2])", 135 "&alr ($xc_,@x[$d3])", 136 "&xr (@x[$b2],$xc)", 137 "&xr (@x[$b3],$xc_)", 138 "&rll (@x[$b2],@x[$b2],7)", 139 "&rll (@x[$b3],@x[$b3],7)" 140 ); 141} 142 143$code.=<<___; 144.text 145 146.globl ChaCha20_ctr32 147.type ChaCha20_ctr32,\@function 148.align 32 149ChaCha20_ctr32: 150 lt${g}r $len,$len # $len==0? 151 bzr %r14 152 a${g}hi $len,-64 153 l${g}hi %r1,-$frame 154 stm${g} %r6,%r15,`6*$SIZE_T`($sp) 155 sl${g}r $out,$inp # difference 156 la $len,0($inp,$len) # end of input minus 64 157 larl %r7,.Lsigma 158 lgr %r0,$sp 159 la $sp,0(%r1,$sp) 160 st${g} %r0,0($sp) 161 162 lmg %r8,%r11,0($key) # load key 163 lmg %r12,%r13,0($counter) # load counter 164 lmg %r6,%r7,0(%r7) # load sigma constant 165 166 la %r14,0($inp) 167 st${g} $out,$frame+3*$SIZE_T($sp) 168 st${g} $len,$frame+4*$SIZE_T($sp) 169 stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack 170 srlg @x[12],%r12,32 # 32-bit counter value 171 j .Loop_outer 172 173.align 16 174.Loop_outer: 175 lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7] 176 lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11] 177 lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15] 178 stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11] 179 lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9] 180 st @x[12],$stdframe+4*12($sp) # save counter 181 st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer 182 lhi %r14,10 183 j .Loop 184 185.align 4 186.Loop: 187___ 188 foreach (&ROUND(0, 4, 8,12)) { eval; } 189 foreach (&ROUND(0, 5,10,15)) { eval; } 190$code.=<<___; 191 brct %r14,.Loop 192 193 l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer 194 stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9] 195 lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp) 196 197 al @x[0],$stdframe+4*0($sp) # accumulate key schedule 198 al @x[1],$stdframe+4*1($sp) 199 al @x[2],$stdframe+4*2($sp) 200 al @x[3],$stdframe+4*3($sp) 201 al @x[4],$stdframe+4*4($sp) 202 al @x[5],$stdframe+4*5($sp) 203 al @x[6],$stdframe+4*6($sp) 204 al @x[7],$stdframe+4*7($sp) 205 lrvr @x[0],@x[0] 206 lrvr @x[1],@x[1] 207 lrvr @x[2],@x[2] 208 lrvr @x[3],@x[3] 209 lrvr @x[4],@x[4] 210 lrvr @x[5],@x[5] 211 lrvr @x[6],@x[6] 212 lrvr @x[7],@x[7] 213 al @x[12],$stdframe+4*12($sp) 214 al @x[13],$stdframe+4*13($sp) 215 al @x[14],$stdframe+4*14($sp) 216 al @x[15],$stdframe+4*15($sp) 217 lrvr @x[12],@x[12] 218 lrvr @x[13],@x[13] 219 lrvr @x[14],@x[14] 220 lrvr @x[15],@x[15] 221 222 la @t[0],0(@t[0],%r14) # reconstruct output pointer 223 cl${g}r %r14,@t[1] 224 jh .Ltail 225 226 x @x[0],4*0(%r14) # xor with input 227 x @x[1],4*1(%r14) 228 st @x[0],4*0(@t[0]) # store output 229 x @x[2],4*2(%r14) 230 st @x[1],4*1(@t[0]) 231 x @x[3],4*3(%r14) 232 st @x[2],4*2(@t[0]) 233 x @x[4],4*4(%r14) 234 st @x[3],4*3(@t[0]) 235 lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11] 236 x @x[5],4*5(%r14) 237 st @x[4],4*4(@t[0]) 238 x @x[6],4*6(%r14) 239 al @x[0],$stdframe+4*8($sp) 240 st @x[5],4*5(@t[0]) 241 x @x[7],4*7(%r14) 242 al @x[1],$stdframe+4*9($sp) 243 st @x[6],4*6(@t[0]) 244 x @x[12],4*12(%r14) 245 al @x[2],$stdframe+4*10($sp) 246 st @x[7],4*7(@t[0]) 247 x @x[13],4*13(%r14) 248 al @x[3],$stdframe+4*11($sp) 249 st @x[12],4*12(@t[0]) 250 x @x[14],4*14(%r14) 251 st @x[13],4*13(@t[0]) 252 x @x[15],4*15(%r14) 253 st @x[14],4*14(@t[0]) 254 lrvr @x[0],@x[0] 255 st @x[15],4*15(@t[0]) 256 lrvr @x[1],@x[1] 257 lrvr @x[2],@x[2] 258 lrvr @x[3],@x[3] 259 lhi @x[12],1 260 x @x[0],4*8(%r14) 261 al @x[12],$stdframe+4*12($sp) # increment counter 262 x @x[1],4*9(%r14) 263 st @x[0],4*8(@t[0]) 264 x @x[2],4*10(%r14) 265 st @x[1],4*9(@t[0]) 266 x @x[3],4*11(%r14) 267 st @x[2],4*10(@t[0]) 268 st @x[3],4*11(@t[0]) 269 270 cl${g}r %r14,@t[1] # done yet? 271 la %r14,64(%r14) 272 jl .Loop_outer 273 274.Ldone: 275 xgr %r0,%r0 276 xgr %r1,%r1 277 xgr %r2,%r2 278 xgr %r3,%r3 279 stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy 280 stmg %r0,%r3,$stdframe+4*12($sp) 281 282 lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) 283 br %r14 284 285.align 16 286.Ltail: 287 la @t[1],64($t[1]) 288 stm @x[0],@x[7],$stdframe+4*0($sp) 289 sl${g}r @t[1],%r14 290 lm @x[0],@x[3],$stdframe+4*8+4*8($sp) 291 l${g}hi @x[6],0 292 stm @x[12],@x[15],$stdframe+4*12($sp) 293 al @x[0],$stdframe+4*8($sp) 294 al @x[1],$stdframe+4*9($sp) 295 al @x[2],$stdframe+4*10($sp) 296 al @x[3],$stdframe+4*11($sp) 297 lrvr @x[0],@x[0] 298 lrvr @x[1],@x[1] 299 lrvr @x[2],@x[2] 300 lrvr @x[3],@x[3] 301 stm @x[0],@x[3],$stdframe+4*8($sp) 302 303.Loop_tail: 304 llgc @x[4],0(@x[6],%r14) 305 llgc @x[5],$stdframe(@x[6],$sp) 306 xr @x[5],@x[4] 307 stc @x[5],0(@x[6],@t[0]) 308 la @x[6],1(@x[6]) 309 brct @t[1],.Loop_tail 310 311 j .Ldone 312.size ChaCha20_ctr32,.-ChaCha20_ctr32 313 314.align 32 315.Lsigma: 316.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral 317.asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>" 318.align 4 319___ 320 321foreach (split("\n",$code)) { 322 s/\`([^\`]*)\`/eval $1/ge; 323 324 print $_,"\n"; 325} 326close STDOUT or die "error closing STDOUT: $!"; 327