#! /usr/bin/env perl
# Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. Compared to original engine
# version relying on inline assembler and compiled with gcc 3.4.6 it
# was measured to provide ~100% improvement on misaligned data in ECB
# mode and ~75% in CBC mode. For aligned data improvement can be
# observed for short inputs only, e.g. 45% for 64-byte messages in
# ECB mode, 20% in CBC. Difference in performance for aligned vs.
# misaligned data depends on misalignment and is either ~1.8x or 2.9x.
# These are approximately same factors as for hardware support, so
# there is little reason to rely on the latter. On the contrary, it
# might actually hurt performance in mixture of aligned and misaligned
# buffers, because a) if you choose to flip 'align' flag in control
# word on per-buffer basis, then you'd have to reload key context,
# which incurs penalty; b) if you choose to set 'align' flag
# permanently, it limits performance even for aligned data to ~1/2.
# All above mentioned results were collected on 1.5GHz C7. Nano on the
# other hand handles unaligned data more gracefully. Depending on
# algorithm and how unaligned data is, hardware can be up to 70% more
# efficient than below software alignment procedures, nor does 'align'
# flag have effect on aligned performance [if has any meaning at all].
# Therefore suggestion is to unconditionally set 'align' flag on Nano
# for optimal performance.
# Locate the perlasm helpers relative to this script and load the x86
# back end; every &mnemonic(...) call below is provided by x86asm.pl.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../crypto/perlasm");
require "x86asm.pl";

# The last command-line argument is the output file name; all generated
# assembler is written to STDOUT, so fail loudly if it cannot be opened.
$output=pop;
open STDOUT,">$output" or die "can't open $output: $!";

&asm_init($ARGV[0]);

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 larger than 16

# Register assignment shared by the routines below.
$ctx="edx";
$out="edi";
$inp="esi";
$len="ecx";
$chunk="ebx";

# padlock_capability: takes no arguments, result in eax.
#
# 1. Toggles bit 21 of EFLAGS to find out whether CPUID is usable.
# 2. Executes CPUID with eax=0 and accepts only two vendor strings
#    (ebx,edx,ecx): "CentaurHauls" or "  Shanghai  " (Zhaoxin).
# 3. Requires CPUID 0xC0000000 to report a maximum leaf >= 0xC0000001.
# 4. Returns edx of CPUID 0xC0000001 with bit#4 replaced by a "Nano"
#    flag, which is set when CPUID 1 eax matches 0x6f? in its low 12
#    bits (stepping ignored).
# Every failure path jumps to "noluck" with eax already zeroed, so the
# function returns 0 when PadLock is not available.
&function_begin_B("padlock_capability");
	&push	("ebx");
	&pushf	();
	&pop	("eax");
	&mov	("ecx","eax");
	&xor	("eax",1<<21);
	&push	("eax");
	&popf	();
	&pushf	();
	&pop	("eax");
	&xor	("ecx","eax");
	&xor	("eax","eax");
	&bt	("ecx",21);
	&jnc	(&label("noluck"));
	&cpuid	();
	&xor	("eax","eax");
	# Vendor string pieces are written byte-reversed because the
	# registers hold them little-endian: "Cent" "aurH" "auls".
	&cmp	("ebx","0x".unpack("H*",'tneC'));
	&jne	(&label("zhaoxin"));
	&cmp	("edx","0x".unpack("H*",'Hrua'));
	&jne	(&label("noluck"));
	&cmp	("ecx","0x".unpack("H*",'slua'));
	&jne	(&label("noluck"));
	&jmp	(&label("zhaoxinEnd"));
&set_label("zhaoxin");
	# "  Sh" "angh" "ai  " - each literal must be exactly 4 bytes,
	# including the two padding spaces, to form a 32-bit immediate.
	&cmp	("ebx","0x".unpack("H*",'hS  '));
	&jne	(&label("noluck"));
	&cmp	("edx","0x".unpack("H*",'hgna'));
	&jne	(&label("noluck"));
	&cmp	("ecx","0x".unpack("H*",'  ia'));
	&jne	(&label("noluck"));
&set_label("zhaoxinEnd");
	&mov	("eax",0xC0000000);
	&cpuid	();
	&mov	("edx","eax");
	&xor	("eax","eax");
	&cmp	("edx",0xC0000001);
	&jb	(&label("noluck"));
	&mov	("eax",1);
	&cpuid	();
	&or	("eax",0x0f);
	&xor	("ebx","ebx");
	&and	("eax",0x0fff);
	&cmp	("eax",0x06ff);		# check for Nano
	&sete	("bl");
	&mov	("eax",0xC0000001);
	&push	("ebx");		# cpuid clobbers ebx, keep Nano flag
	&cpuid	();
	&pop	("ebx");
	&mov	("eax","edx");
	&shl	("ebx",4);		# bit#4 denotes Nano
	&and	("eax",0xffffffef);
	&or	("eax","ebx");
&set_label("noluck");
	&pop	("ebx");
	&ret	();
&function_end_B("padlock_capability");

# padlock_key_bswap(ptr): byte-swaps 32-bit words in place.
#
# Reads a dword from offset 240 of the structure passed as the first
# argument (presumably the AES round count - confirm against caller),
# and byte-swaps (that value + 1) * 4 consecutive dwords starting at
# offset 0 of the same structure.
&function_begin_B("padlock_key_bswap");
	&mov	("edx",&wparam(0));
	&mov	("ecx",&DWP(240,"edx"));
	&inc	("ecx");
	&shl	("ecx",2);
&set_label("bswap_loop");
	&mov	("eax",&DWP(0,"edx"));
	&bswap	("eax");
	&mov	(&DWP(0,"edx"),"eax");
	&lea	("edx",&DWP(4,"edx"));
	&sub	("ecx",1);
	&jnz	(&label("bswap_loop"));
	&ret	();
&function_end_B("padlock_key_bswap");

# This is heuristic key context tracing.
# At first one believes that one should use atomic swap instructions,
# but it's not actually necessary. Point is that if
# padlock_saved_context was changed by another thread
# after we've read it and before we compare it with ctx,
# our key *shall* be reloaded upon thread context switch
# and we are therefore set in either case...
&static_label("padlock_saved_context");

# padlock_verify_context(ctx): public wrapper around _padlock_verify_ctx.
#
# Loads ctx into $ctx and the address of padlock_saved_context into eax
# (absolute on win32/coff, otherwise relative to verify_pic_point so the
# callee can rebase it with its return address), pushes EFLAGS for the
# callee to inspect, and drops that word from the stack on return.
&function_begin_B("padlock_verify_context");
	&mov	($ctx,&wparam(0));
	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
		&DWP(&label("padlock_saved_context")."-".&label("verify_pic_point")));
	&pushf	();
	&call	("_padlock_verify_ctx");
&set_label("verify_pic_point");
	&lea	("esp",&DWP(4,"esp"));
	&ret	();
&function_end_B("padlock_verify_context");

# _padlock_verify_ctx: internal helper with a register calling convention.
#
# In:  $ctx = context pointer, eax = (possibly PIC-relative) address of
#      padlock_saved_context, 4(%esp) = EFLAGS image pushed by caller.
# If bit 30 of the saved EFLAGS is set and the saved context differs
# from $ctx, executes pushf/popf (the same sequence padlock_reload_key
# uses). In every case $ctx is then recorded as the saved context.
&function_begin_B("_padlock_verify_ctx");
	&add	("eax",&DWP(0,"esp")) if(!($::win32 or $::coff));# &padlock_saved_context
	&bt	(&DWP(4,"esp"),30);		# eflags
	&jnc	(&label("verified"));
	&cmp	($ctx,&DWP(0,"eax"));
	&je	(&label("verified"));
	&pushf	();
	&popf	();
&set_label("verified");
	&mov	(&DWP(0,"eax"),$ctx);
	&ret	();
&function_end_B("_padlock_verify_ctx");

# padlock_reload_key(): emits nothing but pushf/popf followed by ret.
&function_begin_B("padlock_reload_key");
	&pushf	();
	&popf	();
	&ret	();
&function_end_B("padlock_reload_key");

# padlock_aes_block(out, inp, ctx): processes exactly one block with
# `rep xcryptecb` ($len is set to 1). The control word is taken from
# ctx+16 and the key from ctx+32.
&function_begin_B("padlock_aes_block");
	&push	("edi");
	&push	("esi");
	&push	("ebx");
	&mov	($out,&wparam(0));		# must be 16-byte aligned
	&mov	($inp,&wparam(1));		# must be 16-byte aligned
	&mov	($ctx,&wparam(2));
	&mov	($len,1);
	&lea	("ebx",&DWP(32,$ctx));		# key
	&lea	($ctx,&DWP(16,$ctx));		# control word
	&data_byte(0xf3,0x0f,0xa7,0xc8);	# rep xcryptecb
	&pop	("ebx");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_aes_block");

# generate_mode($mode, $opcode): emits padlock_${mode}_encrypt.
#
#   $mode   - name fragment used for the function and its labels; also
#             selects the "ctr32" code path and, through
#             %PADLOCK_PREFETCH, the prefetch-errata handling.
#   $opcode - last byte of the `rep xcrypt*` instruction encoding.
#
# The generated function jumps straight to the epilogue when ctx or len
# is not a multiple of 16, and returns 1 in eax after processing.
# NOTE(review): eax is not explicitly set on the abort path - confirm
# what callers expect there.
# Misaligned input/output is staged through a 16-byte aligned stack
# buffer of at most $PADLOCK_CHUNK bytes, which is zeroed before return.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
&function_begin("padlock_${mode}_encrypt");
	&mov	($out,&wparam(0));
	&mov	($inp,&wparam(1));
	&mov	($ctx,&wparam(2));
	&mov	($len,&wparam(3));
	&test	($ctx,15);
	&jnz	(&label("${mode}_abort"));
	&test	($len,15);
	&jnz	(&label("${mode}_abort"));
	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
		&DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point")));
	&pushf	();
	&cld	();
	&call	("_padlock_verify_ctx");
&set_label("${mode}_pic_point");
	&lea	($ctx,&DWP(16,$ctx));	# control word
	&xor	("eax","eax");
					if ($mode eq "ctr32") {
	&movq	("mm0",&QWP(-16,$ctx));	# load [upper part of] counter
					} else {
	&xor	("ebx","ebx");
	&test	(&DWP(0,$ctx),1<<5);	# align bit in control word
	&jnz	(&label("${mode}_aligned"));
	&test	($out,0x0f);
	&setz	("al");			# !out_misaligned
	&test	($inp,0x0f);
	&setz	("bl");			# !inp_misaligned
	&test	("eax","ebx");
	&jnz	(&label("${mode}_aligned"));
	&neg	("eax");
					}
	&mov	($chunk,$PADLOCK_CHUNK);
	&not	("eax");		# out_misaligned?-1:0
	&lea	("ebp",&DWP(-24,"esp"));
	&cmp	($len,$chunk);
	&cmovc	($chunk,$len);		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	&and	("eax",$chunk);		# out_misaligned?chunk:0
	&mov	($chunk,$len);
	&neg	("eax");
	&and	($chunk,$PADLOCK_CHUNK-1);	# chunk=len%PADLOCK_CHUNK
	&lea	("esp",&DWP(0,"eax","ebp"));	# alloca
	&mov	("eax",$PADLOCK_CHUNK);
	&cmovz	($chunk,"eax");		# chunk=chunk?:PADLOCK_CHUNK
	&mov	("eax","ebp");
	&and	("ebp",-16);
	&and	("esp",-16);
	&mov	(&DWP(16,"ebp"),"eax");
    if ($PADLOCK_PREFETCH{$mode}) {
	&cmp	($len,$chunk);
	&ja	(&label("${mode}_loop"));
	&mov	("eax",$inp);		# check if prefetch crosses page
	&cmp	("ebp","esp");
	&cmove	("eax",$out);
	&add	("eax",$len);
	&neg	("eax");
	&and	("eax",0xfff);		# distance to page boundary
	&cmp	("eax",$PADLOCK_PREFETCH{$mode});
	&mov	("eax",-$PADLOCK_PREFETCH{$mode});
	&cmovae	("eax",$chunk);		# mask=distance<prefetch?-prefetch:-1
	&and	($chunk,"eax");
	&jz	(&label("${mode}_unaligned_tail"));
    }
	&jmp	(&label("${mode}_loop"));

&set_label("${mode}_loop",16);
	&mov	(&DWP(0,"ebp"),$out);		# save parameters
	&mov	(&DWP(4,"ebp"),$inp);
	&mov	(&DWP(8,"ebp"),$len);
	&mov	($len,$chunk);
	&mov	(&DWP(12,"ebp"),$chunk);	# chunk
						if ($mode eq "ctr32") {
	&mov	("ecx",&DWP(-4,$ctx));
	&xor	($out,$out);
	&mov	("eax",&DWP(-8,$ctx));		# borrow $len
&set_label("${mode}_prepare");
	&mov	(&DWP(12,"esp",$out),"ecx");
	&bswap	("ecx");
	&movq	(&QWP(0,"esp",$out),"mm0");
	&inc	("ecx");
	&mov	(&DWP(8,"esp",$out),"eax");
	&bswap	("ecx");
	&lea	($out,&DWP(16,$out));
	&cmp	($out,$chunk);
	&jb	(&label("${mode}_prepare"));

	&mov	(&DWP(-4,$ctx),"ecx");
	&lea	($inp,&DWP(0,"esp"));
	&lea	($out,&DWP(0,"esp"));
	&mov	($len,$chunk);
						} else {
	&test	($out,0x0f);			# out_misaligned
	&cmovnz	($out,"esp");
	&test	($inp,0x0f);			# inp_misaligned
	&jz	(&label("${mode}_inp_aligned"));
	&shr	($len,2);
	&data_byte(0xf3,0xa5);			# rep movsl
	&sub	($out,$chunk);
	&mov	($len,$chunk);
	&mov	($inp,$out);
&set_label("${mode}_inp_aligned");
						}
	&lea	("eax",&DWP(-16,$ctx));		# ivp
	&lea	("ebx",&DWP(16,$ctx));		# key
	&shr	($len,4);			# len/=AES_BLOCK_SIZE
	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
						if ($mode !~ /ecb|ctr/) {
	&movaps	("xmm0",&QWP(0,"eax"));
	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
						}
	&mov	($out,&DWP(0,"ebp"));		# restore parameters
	&mov	($chunk,&DWP(12,"ebp"));
						if ($mode eq "ctr32") {
	&mov	($inp,&DWP(4,"ebp"));
	&xor	($len,$len);
&set_label("${mode}_xor");
	&movups	("xmm1",&QWP(0,$inp,$len));
	&lea	($len,&DWP(16,$len));
	&pxor	("xmm1",&QWP(-16,"esp",$len));
	&movups	(&QWP(-16,$out,$len),"xmm1");
	&cmp	($len,$chunk);
	&jb	(&label("${mode}_xor"));
						} else {
	&test	($out,0x0f);
	&jz	(&label("${mode}_out_aligned"));
	&mov	($len,$chunk);
	&lea	($inp,&DWP(0,"esp"));
	&shr	($len,2);
	&data_byte(0xf3,0xa5);			# rep movsl
	&sub	($out,$chunk);
&set_label("${mode}_out_aligned");
	&mov	($inp,&DWP(4,"ebp"));
						}
	&mov	($len,&DWP(8,"ebp"));
	&add	($out,$chunk);
	&add	($inp,$chunk);
	&sub	($len,$chunk);
	&mov	($chunk,$PADLOCK_CHUNK);
    if (!$PADLOCK_PREFETCH{$mode}) {
	&jnz	(&label("${mode}_loop"));
    } else {
	&jz	(&label("${mode}_break"));
	&cmp	($len,$chunk);
	&jae	(&label("${mode}_loop"));

&set_label("${mode}_unaligned_tail");
	&xor	("eax","eax");
	&cmp	("esp","ebp");
	&cmove	("eax",$len);
	&sub	("esp","eax");			# alloca
	&mov	("eax", $out);			# save parameters
	&mov	($chunk,$len);
	&shr	($len,2);
	&lea	($out,&DWP(0,"esp"));
	&data_byte(0xf3,0xa5);			# rep movsl
	&mov	($inp,"esp");
	&mov	($out,"eax");			# restore parameters
	&mov	($len,$chunk);
	&jmp	(&label("${mode}_loop"));

&set_label("${mode}_break",16);
    }
						if ($mode ne "ctr32") {
	&cmp	("esp","ebp");
	&je	(&label("${mode}_done"));
						}
	# Wipe the stack staging buffer between esp and ebp.
	&pxor	("xmm0","xmm0");
	&lea	("eax",&DWP(0,"esp"));
&set_label("${mode}_bzero");
	&movaps	(&QWP(0,"eax"),"xmm0");
	&lea	("eax",&DWP(16,"eax"));
	&cmp	("ebp","eax");
	&ja	(&label("${mode}_bzero"));

&set_label("${mode}_done");
	&mov	("ebp",&DWP(16,"ebp"));
	&lea	("esp",&DWP(24,"ebp"));
						if ($mode ne "ctr32") {
	&jmp	(&label("${mode}_exit"));

&set_label("${mode}_aligned",16);
    if ($PADLOCK_PREFETCH{$mode}) {
	&lea	("ebp",&DWP(0,$inp,$len));
	&neg	("ebp");
	&and	("ebp",0xfff);			# distance to page boundary
	&xor	("eax","eax");
	&cmp	("ebp",$PADLOCK_PREFETCH{$mode});
	&mov	("ebp",$PADLOCK_PREFETCH{$mode}-1);
	&cmovae	("ebp","eax");
	&and	("ebp",$len);			# remainder
	&sub	($len,"ebp");
	&jz	(&label("${mode}_aligned_tail"));
    }
	&lea	("eax",&DWP(-16,$ctx));		# ivp
	&lea	("ebx",&DWP(16,$ctx));		# key
	&shr	($len,4);			# len/=AES_BLOCK_SIZE
	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
						if ($mode ne "ecb") {
	&movaps	("xmm0",&QWP(0,"eax"));
	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
						}
    if ($PADLOCK_PREFETCH{$mode}) {
	&test	("ebp","ebp");
	&jz	(&label("${mode}_exit"));

&set_label("${mode}_aligned_tail");
	&mov	($len,"ebp");
	&lea	("ebp",&DWP(-24,"esp"));
	&mov	("esp","ebp");
	&mov	("eax","ebp");
	&sub	("esp",$len);
	&and	("ebp",-16);
	&and	("esp",-16);
	&mov	(&DWP(16,"ebp"),"eax");
	&mov	("eax", $out);			# save parameters
	&mov	($chunk,$len);
	&shr	($len,2);
	&lea	($out,&DWP(0,"esp"));
	&data_byte(0xf3,0xa5);			# rep movsl
	&mov	($inp,"esp");
	&mov	($out,"eax");			# restore parameters
	&mov	($len,$chunk);
	&jmp	(&label("${mode}_loop"));
    }
&set_label("${mode}_exit");			}
	&mov	("eax",1);
	&lea	("esp",&DWP(4,"esp"));		# popf
	&emms	()				if ($mode eq "ctr32");
&set_label("${mode}_abort");
&function_end("padlock_${mode}_encrypt");
}

&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xc8);	# yes, it implements own CTR with ECB opcode,
				# because hardware CTR was introduced later
				# and even has errata on certain C7 stepping.
				# own implementation *always* works, though
				# ~15% slower than dedicated hardware...

# padlock_xstore(addr, edx_value): executes `xstore` with edi set to the
# first argument and edx set to the second; eax is left as the
# instruction produced it.
&function_begin_B("padlock_xstore");
	&push	("edi");
	&mov	("edi",&wparam(0));
	&mov	("edx",&wparam(1));
	&data_byte(0x0f,0xa7,0xc0);		# xstore
	&pop	("edi");
	&ret	();
&function_end_B("padlock_xstore");

# _win32_segv_handler: exception handler installed by the *_oneshot
# routines on win32/coff builds. For STATUS_ACCESS_VIOLATION it advances
# the dword at offset 184 of the context record by 4 (the length of the
# `rep xsha*` encoding) and returns 0; any other exception code returns 1.
&function_begin_B("_win32_segv_handler");
	&mov	("eax",1);			# ExceptionContinueSearch
	&mov	("edx",&wparam(0));		# *ExceptionRecord
	&mov	("ecx",&wparam(2));		# *ContextRecord
	&cmp	(&DWP(0,"edx"),0xC0000005);	# ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION
	&jne	(&label("ret"));
	&add	(&DWP(184,"ecx"),4);		# skip over rep sha*
	&mov	("eax",0);			# ExceptionContinueExecution
&set_label("ret");
	&ret	();
&function_end_B("_win32_segv_handler");
&safeseh("_win32_segv_handler")			if ($::win32);

# padlock_sha1_oneshot(ctx, inp, len): copies the 20-byte state at ctx
# into a 16-byte aligned stack buffer, runs `rep xsha1` with eax=0,
# esi=inp, ecx=len, and copies the 20 bytes back to ctx. On win32/coff
# the instruction is bracketed by _win32_segv_handler registration.
&function_begin_B("padlock_sha1_oneshot");
	&push	("edi");
	&push	("esi");
	&xor	("eax","eax");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
    if ($::win32 or $::coff) {
	&push	(&::islabel("_win32_segv_handler"));
	&data_byte(0x64,0xff,0x30);		# push	%fs:(%eax)
	&data_byte(0x64,0x89,0x20);		# mov	%esp,%fs:(%eax)
    }
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);			# 32 is enough but spec says 128
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&mov	("eax",&DWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&mov	(&DWP(16,"esp"),"eax");
	&xor	("eax","eax");
	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
	&movaps	("xmm0",&QWP(0,"esp"));
	&mov	("eax",&DWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
    if ($::win32 or $::coff) {
	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop	%fs:0
	&lea	("esp",&DWP(4,"esp"));
    }
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&mov	(&DWP(16,"edi"),"eax");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha1_oneshot");

# padlock_sha1_blocks(ctx, inp, len): same staging as the oneshot
# variant, but `rep xsha1` is issued with eax=-1 and no exception
# handler is installed.
&function_begin_B("padlock_sha1_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("edx","esp");			# put aside %esp
	&mov	("ecx",&wparam(2));
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&mov	("eax",&DWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&mov	(&DWP(16,"esp"),"eax");
	&mov	("eax",-1);
	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
	&movaps	("xmm0",&QWP(0,"esp"));
	&mov	("eax",&DWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&mov	(&DWP(16,"edi"),"eax");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha1_blocks");

# padlock_sha256_oneshot(ctx, inp, len): as padlock_sha1_oneshot, with
# a 32-byte state and `rep xsha256`.
&function_begin_B("padlock_sha256_oneshot");
	&push	("edi");
	&push	("esi");
	&xor	("eax","eax");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
    if ($::win32 or $::coff) {
	&push	(&::islabel("_win32_segv_handler"));
	&data_byte(0x64,0xff,0x30);		# push	%fs:(%eax)
	&data_byte(0x64,0x89,0x20);		# mov	%esp,%fs:(%eax)
    }
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&xor	("eax","eax");
	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
    if ($::win32 or $::coff) {
	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop	%fs:0
	&lea	("esp",&DWP(4,"esp"));
    }
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha256_oneshot");

# padlock_sha256_blocks(ctx, inp, len): as padlock_sha1_blocks, with a
# 32-byte state and `rep xsha256` (eax=-1).
&function_begin_B("padlock_sha256_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&mov	("eax",-1);
	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha256_blocks");

# padlock_sha512_blocks(ctx, inp, len): stages a 64-byte state through
# the aligned stack buffer around `rep xsha512`. Unlike the other
# *_blocks routines, eax is not loaded before the instruction.
&function_begin_B("padlock_sha512_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movups	("xmm2",&QWP(32,"edi"));
	&movups	("xmm3",&QWP(48,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&movaps	(&QWP(32,"esp"),"xmm2");
	&movaps	(&QWP(48,"esp"),"xmm3");
	&data_byte(0xf3,0x0f,0xa6,0xe0);	# rep xsha512
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&movaps	("xmm2",&QWP(32,"esp"));
	&movaps	("xmm3",&QWP(48,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&movups	(&QWP(32,"edi"),"xmm2");
	&movups	(&QWP(48,"edi"),"xmm3");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha512_blocks");

&asciz	("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
&align	(16);

&dataseg();
# Essentially this variable belongs in thread local storage.
# Having this variable global on the other hand can only cause
# few bogus key reloads [if any at all on single-CPU system],
# so we accept the penalty...
# Storage for the most recently used key context pointer: one
# zero-initialised data word behind a label requested with alignment 4.
&set_label("padlock_saved_context",4);
&data_word(0);

&asm_finish();

# Buffered write errors only surface at close time; do not let a
# truncated assembler file pass silently.
close STDOUT or die "error closing STDOUT: $!";