#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Poly1305 hash for MIPS64.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc
# R1x000	5.64/+120%	(big-endian)
# Octeon II	3.80/+280%	(little-endian)
#
# Usage: perl poly1305-mips.pl [flavour [output-file]]
#
# The script accumulates assembly source (to be run through the C
# preprocessor, it includes "mips_arch.h") in $code and prints it to
# the output file, or to STDOUT when no output file is given.

use strict;
use warnings;

######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
my ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
my ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
#   excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
# <appro@openssl.org>
#
######################################################################

# Only flavours matching /64|n32/ pass the check below, i.e. n32, 64 and
# nubi64.  The default therefore has to be one of those; a default of
# "o32" would make the script die whenever it is run without arguments.
my $flavour = shift || "64";

die "MIPS64 only" unless ($flavour =~ /64|n32/i);

# Register that carries the return value for the selected flavour.
my $v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
# Bit mask for the .mask directive: bits 12-17 ($s0-$s5) for NUBI,
# bits 16-17 ($s4-$s5) otherwise, matching the prologue stores below.
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";

# Argument registers of poly1305_init/poly1305_blocks and scratch registers.
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);

my $code;

# poly1305_init: $ctx = context, $inp = key (may be NULL).
# Zeroes the three hash words at 0/8/16($ctx).  If a key is given it is
# loaded as two little-endian 64-bit words, masked, and stored as r0 at
# 24($ctx), r1 at 32($ctx) and s1 = r1 + (r1 >> 2) at 40($ctx).
# Always returns 0.
$code.=<<___;
#include "mips_arch.h"

#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif

.text
.set	noat
.set	noreorder

.align	5
.globl	poly1305_init
.ent	poly1305_init
poly1305_init:
	.frame	$sp,0,$ra
	.set	reorder

	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)
	ld	$in1,8($inp)
#else
	ldl	$in0,0+MSB($inp)
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	li	$tmp0,1
	dsll	$tmp0,32
	daddiu	$tmp0,-63
	dsll	$tmp0,28
	daddiu	$tmp0,-1		# 0ffffffc0fffffff

	and	$in0,$tmp0
	daddiu	$tmp0,-3		# 0ffffffc0ffffffc
	and	$in1,$tmp0

	sd	$in0,24($ctx)
	dsrl	$tmp0,$in1,2
	sd	$in1,32($ctx)
	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)

.Lno_key:
	li	$v0,0			# return 0
	jr	$ra
.end	poly1305_init
___
{
# Working registers for the block function.  The pre-multiplied key word
# is deliberately called $rs1 and NOT $s1: a lexical named $s1 would hide
# the $s1 register alias for the rest of this scope, and the NUBI
# prologue/epilogue below would then save/restore $s5 twice and never
# save $s1 (which holds h1).
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);

# poly1305_blocks: $ctx = context, $inp = input, $len = length in bytes,
# $padbit = value added to h2 for every block.  Only complete 16-byte
# blocks are processed ($len >> 4); with none the function returns at once.
$code.=<<___;
.align	5
.globl	poly1305_blocks
.ent	poly1305_blocks
poly1305_blocks:
	.set	noreorder
	dsrl	$len,4			# number of complete blocks
	bnez	$len,poly1305_blocks_internal
	nop
	jr	$ra
	nop
.end	poly1305_blocks

.align	5
.ent	poly1305_blocks_internal
poly1305_blocks_internal:
	.frame	$sp,6*8,$ra
	.mask	$SAVED_REGS_MASK,-8
	.set	noreorder
	dsubu	$sp,6*8
	sd	$s5,40($sp)
	sd	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
	sd	$s3,24($sp)
	sd	$s2,16($sp)
	sd	$s1,8($sp)
	sd	$s0,0($sp)
___
$code.=<<___;
	.set	reorder

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)		# load key
	ld	$r1,32($ctx)
	ld	$rs1,40($ctx)

.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
	ld	$in0,0($inp)		# load input
	ld	$in1,8($inp)
#else
	ldl	$in0,0+MSB($inp)	# load input
	ldl	$in1,8+MSB($inp)
	ldr	$in0,0+LSB($inp)
	ldr	$in1,8+LSB($inp)
#endif
	daddiu	$len,-1
	daddiu	$inp,16
#ifdef	MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
	dsbh	$in0,$in0		# byte swap
	dsbh	$in1,$in1
	dshd	$in0,$in0
	dshd	$in1,$in1
# else
	ori	$tmp0,$zero,0xFF
	dsll	$tmp2,$tmp0,32
	or	$tmp0,$tmp2		# 0x000000FF000000FF

	and	$tmp1,$in0,$tmp0	# byte swap
	and	$tmp3,$in1,$tmp0
	dsrl	$tmp2,$in0,24
	dsrl	$tmp4,$in1,24
	dsll	$tmp1,24
	dsll	$tmp3,24
	and	$tmp2,$tmp0
	and	$tmp4,$tmp0
	dsll	$tmp0,8			# 0x0000FF000000FF00
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	and	$tmp2,$in0,$tmp0
	and	$tmp4,$in1,$tmp0
	dsrl	$in0,8
	dsrl	$in1,8
	dsll	$tmp2,8
	dsll	$tmp4,8
	and	$in0,$tmp0
	and	$in1,$tmp0
	or	$tmp1,$tmp2
	or	$tmp3,$tmp4
	or	$in0,$tmp1
	or	$in1,$tmp3
	dsrl	$tmp1,$in0,32
	dsrl	$tmp3,$in1,32
	dsll	$in0,32
	dsll	$in1,32
	or	$in0,$tmp1
	or	$in1,$tmp3
# endif
#endif
	daddu	$h0,$in0		# accumulate input
	daddu	$h1,$in1
	sltu	$tmp0,$h0,$in0
	sltu	$tmp1,$h1,$in1
	daddu	$h1,$tmp0

	dmultu	($r0,$h0)		# h0*r0
	daddu	$h2,$padbit
	sltu	$tmp0,$h1,$tmp0
	mflo	($d0,$r0,$h0)
	mfhi	($d1,$r0,$h0)

	dmultu	($rs1,$h1)		# h1*5*r1
	daddu	$tmp0,$tmp1
	daddu	$h2,$tmp0
	mflo	($tmp0,$rs1,$h1)
	mfhi	($tmp1,$rs1,$h1)

	dmultu	($r1,$h0)		# h0*r1
	daddu	$d0,$tmp0
	daddu	$d1,$tmp1
	mflo	($tmp2,$r1,$h0)
	mfhi	($d2,$r1,$h0)
	sltu	$tmp0,$d0,$tmp0
	daddu	$d1,$tmp0

	dmultu	($r0,$h1)		# h1*r0
	daddu	$d1,$tmp2
	sltu	$tmp2,$d1,$tmp2
	mflo	($tmp0,$r0,$h1)
	mfhi	($tmp1,$r0,$h1)
	daddu	$d2,$tmp2

	dmultu	($rs1,$h2)		# h2*5*r1
	daddu	$d1,$tmp0
	daddu	$d2,$tmp1
	mflo	($tmp2,$rs1,$h2)

	dmultu	($r0,$h2)		# h2*r0
	sltu	$tmp0,$d1,$tmp0
	daddu	$d2,$tmp0
	mflo	($tmp3,$r0,$h2)

	daddu	$d1,$tmp2
	daddu	$d2,$tmp3
	sltu	$tmp2,$d1,$tmp2
	daddu	$d2,$tmp2

	li	$tmp0,-4		# final reduction
	and	$tmp0,$d2
	dsrl	$tmp1,$d2,2
	andi	$h2,$d2,3
	daddu	$tmp0,$tmp1
	daddu	$h0,$d0,$tmp0
	sltu	$tmp0,$h0,$tmp0
	daddu	$h1,$d1,$tmp0
	sltu	$tmp0,$h1,$tmp0
	daddu	$h2,$h2,$tmp0

	bnez	$len,.Loop

	sd	$h0,0($ctx)		# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	.set	noreorder
	ld	$s5,40($sp)		# epilogue
	ld	$s4,32($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
	ld	$s3,24($sp)
	ld	$s2,16($sp)
	ld	$s1,8($sp)
	ld	$s0,0($sp)
___
$code.=<<___;
	jr	$ra
	daddu	$sp,6*8
.end	poly1305_blocks_internal
___
}
{
# poly1305_emit: $ctx = context, $mac = 16-byte output, $nonce = four
# 32-bit words.  Computes h+5 and selects it over h when the addition
# carries out of h2's low two bits, adds the nonce to the low 128 bits
# and stores the result byte by byte, least significant byte first.
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

$code.=<<___;
.align	5
.globl	poly1305_emit
.ent	poly1305_emit
poly1305_emit:
	.frame	$sp,0,$ra
	.set	reorder

	ld	$tmp0,0($ctx)
	ld	$tmp1,8($ctx)
	ld	$tmp2,16($ctx)

	daddiu	$in0,$tmp0,5		# compare to modulus
	sltiu	$tmp3,$in0,5
	daddu	$in1,$tmp1,$tmp3
	sltu	$tmp3,$in1,$tmp3
	daddu	$tmp2,$tmp2,$tmp3

	dsrl	$tmp2,2			# see if it carried/borrowed
	dsubu	$tmp2,$zero,$tmp2
	nor	$tmp3,$zero,$tmp2

	and	$in0,$tmp2
	and	$tmp0,$tmp3
	and	$in1,$tmp2
	and	$tmp1,$tmp3
	or	$in0,$tmp0
	or	$in1,$tmp1

	lwu	$tmp0,0($nonce)		# load nonce
	lwu	$tmp1,4($nonce)
	lwu	$tmp2,8($nonce)
	lwu	$tmp3,12($nonce)
	dsll	$tmp1,32
	dsll	$tmp3,32
	or	$tmp0,$tmp1
	or	$tmp2,$tmp3

	daddu	$in0,$tmp0		# accumulate nonce
	daddu	$in1,$tmp2
	sltu	$tmp0,$in0,$tmp0
	daddu	$in1,$tmp0

	dsrl	$tmp0,$in0,8		# write mac value
	dsrl	$tmp1,$in0,16
	dsrl	$tmp2,$in0,24
	sb	$in0,0($mac)
	dsrl	$tmp3,$in0,32
	sb	$tmp0,1($mac)
	dsrl	$tmp0,$in0,40
	sb	$tmp1,2($mac)
	dsrl	$tmp1,$in0,48
	sb	$tmp2,3($mac)
	dsrl	$tmp2,$in0,56
	sb	$tmp3,4($mac)
	dsrl	$tmp3,$in1,8
	sb	$tmp0,5($mac)
	dsrl	$tmp0,$in1,16
	sb	$tmp1,6($mac)
	dsrl	$tmp1,$in1,24
	sb	$tmp2,7($mac)

	sb	$in1,8($mac)
	dsrl	$tmp2,$in1,32
	sb	$tmp3,9($mac)
	dsrl	$tmp3,$in1,40
	sb	$tmp0,10($mac)
	dsrl	$tmp0,$in1,48
	sb	$tmp1,11($mac)
	dsrl	$tmp1,$in1,56
	sb	$tmp2,12($mac)
	sb	$tmp3,13($mac)
	sb	$tmp0,14($mac)
	sb	$tmp1,15($mac)

	jr	$ra
.end	poly1305_emit
.rdata
.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
}

# The last remaining argument, if any, names the output file; otherwise
# the generated code goes to the inherited STDOUT.  Three-argument open
# keeps the file name from being interpreted as a mode, and a failure to
# open is reported instead of silently writing somewhere else.
my $output = pop;
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";