#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+ a 128-byte shared
# table]. Even though the loops are aggressively modulo-scheduled,
# with respect to Htbl references and Z.hi updates, for 8 cycles per
# byte, measured performance is ~12 cycles per processed byte on a
# 21264 CPU. It appears to be a dynamic scheduling "glitch": uprofile(1)
# indicates uniform sample distribution, as if all instruction bundles
# execute in 1.5 cycles each, meaning it could have been even faster.
# Still, 12 cycles is ~60% better than gcc-generated code and ~80%
# better than code generated by the vendor compiler.

$cnt="v0";	# $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3";	# $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7";	# $8
#################
$Xi="a0";	# $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4";	# $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10";	# $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT";	# $28
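# What follows is an editorial reference model, not part of the
# original CRYPTOGAMS module and never called by the generator: a
# plain-Perl transcription of the "4-bit" (Shoup) GHASH multiplication
# that the loop() sub below emits Alpha code for. It assumes a 64-bit
# perl; the [hi,lo] layout of the 16-entry $Htbl (nibble multiples of
# H, as prepared by the C-side gcm_init_4bit) is an assumption, and
# the _ref names are hypothetical. @rem_4bit_ref mirrors the rem_4bit
# table at the end of this file.

my @rem_4bit_ref = map { $_<<48 }
	(0x0000,0x1C20,0x3840,0x2460,0x7080,0x6CA0,0x48C0,0x54E0,
	 0xE100,0xFD20,0xD940,0xC560,0x9180,0x8DA0,0xA9C0,0xB5E0);

sub gcm_gmult_4bit_ref {
	my ($Xi,$Htbl) = @_;	# $Xi: 16-byte string, $Htbl: 16 [hi,lo] pairs
	my @x = unpack("C16",$Xi);
	my $nlo = $x[15]&0xf;	# low nibble of the last byte seeds Z
	my $nhi = $x[15]>>4;
	my ($Zhi,$Zlo) = @{$Htbl->[$nlo]};
	my $cnt = 15;
	while (1) {
		my $rem = $Zlo&0xf;			# nibble shifted out of Z.lo
		$Zlo = ($Zhi<<60)|($Zlo>>4);		# Z >>= 4
		$Zhi = ($Zhi>>4)^$rem_4bit_ref[$rem];	# fold remainder back in
		$Zhi ^= $Htbl->[$nhi][0];
		$Zlo ^= $Htbl->[$nhi][1];
		last if --$cnt<0;
		($nlo,$nhi) = ($x[$cnt]&0xf,$x[$cnt]>>4);
		$rem = $Zlo&0xf;
		$Zlo = ($Zhi<<60)|($Zlo>>4);
		$Zhi = ($Zhi>>4)^$rem_4bit_ref[$rem];
		$Zhi ^= $Htbl->[$nlo][0];
		$Zlo ^= $Htbl->[$nlo][1];
	}
	pack("Q>Q>",$Zhi,$Zlo);	# store back big-endian, like the asm byte swap
}

# Each step retires one nibble of Xi: shift Z right by 4 (rem_4bit
# supplies the polynomial-reduction term for the bits shifted out) and
# accumulate the table entry for that nibble. This is exactly the
# dataflow the modulo-scheduled .Looplo/.Loophi loops below interleave.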
{ my $N;
  sub loop() {

	$N++;
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,7,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}

$code=<<___;
#include <machine/asm.h>

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	bsr	$t0,picmeup
	nop
___

&loop();

$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___

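# Another editorial sketch (hypothetical _ref helper, not used by the
# generator): bulk hashing is simply "XOR in the next 16-byte block,
# then one multiplication by H" per block, which is what the .Louter
# loop below implements, with ldq_u/extql/extqh folding in support for
# unaligned input. $inp is assumed to be a multiple of 16 bytes long,
# as gcm_ghash_4bit requires.

sub gcm_ghash_4bit_ref {
	my ($Xi,$Htbl,$inp) = @_;	# $Xi, $inp: byte strings
	for (my $off=0; $off<length($inp); $off+=16) {
		$Xi ^= substr($inp,$off,16);		# Xi ^= block
		$Xi = gcm_gmult_4bit_ref($Xi,$Htbl);	# Xi = (Xi^block)*H
	}
	$Xi;
}
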
$inhi="s0";
$inlo="s1";

$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	bsr	$t0,picmeup
	nop

.Louter:
	extql	$inhi,$inp,$inhi	# assemble possibly unaligned input
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo		# Xi ^= input block
	xor	$Xhi,$inhi,$Xhi
___

&loop();

$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
.ent	picmeup
picmeup:
	.frame	sp,0,$t0
	.prologue 0
	br	$rem_4bit,.Lpic
.Lpic:	lda	$rem_4bit,12($rem_4bit)	# rem_4bit is 12 bytes past .Lpic
	ret	($t0)
.end	picmeup
	nop
rem_4bit:	# carry-less multiples of 0x1C20, in the top 16 bits of each quadword
	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;