1#! /usr/bin/env perl 2# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# ECP_NISTZ256 module for x86/SSE2. 18# 19# October 2014. 20# 21# Original ECP_NISTZ256 submission targeting x86_64 is detailed in 22# http://eprint.iacr.org/2013/816. In the process of adaptation 23# original .c module was made 32-bit savvy in order to make this 24# implementation possible. 25# 26# with/without -DECP_NISTZ256_ASM 27# Pentium +66-163% 28# PIII +72-172% 29# P4 +65-132% 30# Core2 +90-215% 31# Sandy Bridge +105-265% (contemporary i[57]-* are all close to this) 32# Atom +65-155% 33# Opteron +54-110% 34# Bulldozer +99-240% 35# VIA Nano +93-290% 36# 37# Ranges denote minimum and maximum improvement coefficients depending 38# on benchmark. Lower coefficients are for ECDSA sign, server-side 39# operation. Keep in mind that +200% means 3x improvement. 40 41$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 42push(@INC,"${dir}","${dir}../../perlasm"); 43require "x86asm.pl"; 44 45$output=pop; 46open STDOUT,">$output"; 47 48&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); 49 50$sse2=0; 51for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } 52 53&external_label("OPENSSL_ia32cap_P") if ($sse2); 54 55 56######################################################################## 57# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 58# 59open TABLE,"<ecp_nistz256_table.c" or 60open TABLE,"<${dir}../ecp_nistz256_table.c" or 61die "failed to open ecp_nistz256_table.c:",$!; 62 63use integer; 64 65foreach(<TABLE>) { 66 s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; 67} 68close TABLE; 69 70# See ecp_nistz256_table.c for explanation for why it's 64*16*37. 71# 64*16*37-1 is because $#arr returns last valid index or @arr, not 72# amount of elements. 73die "insane number of elements" if ($#arr != 64*16*37-1); 74 75&public_label("ecp_nistz256_precomputed"); 76&align(4096); 77&set_label("ecp_nistz256_precomputed"); 78 79######################################################################## 80# this conversion smashes P256_POINT_AFFINE by individual bytes with 81# 64 byte interval, similar to 82# 1111222233334444 83# 1234123412341234 84for(1..37) { 85 @tbl = splice(@arr,0,64*16); 86 for($i=0;$i<64;$i++) { 87 undef @line; 88 for($j=0;$j<64;$j++) { 89 push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; 90 } 91 &data_byte(join(',',map { sprintf "0x%02x",$_} @line)); 92 } 93} 94 95######################################################################## 96# Keep in mind that constants are stored least to most significant word 97&static_label("RR"); 98&set_label("RR",64); 99&data_word(3,0,-1,-5,-2,-1,-3,4); # 2^512 mod P-256 100 101&static_label("ONE_mont"); 102&set_label("ONE_mont"); 103&data_word(1,0,0,-1,-1,-1,-2,0); 104 105&static_label("ONE"); 106&set_label("ONE"); 107&data_word(1,0,0,0,0,0,0,0); 108&asciz("ECP_NISZ256 for x86/SSE2, CRYPTOGAMS by <appro\@openssl.org>"); 109&align(64); 110 111######################################################################## 112# void ecp_nistz256_mul_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]); 113&function_begin("ecp_nistz256_mul_by_2"); 114 &mov ("esi",&wparam(1)); 115 &mov ("edi",&wparam(0)); 116 &mov ("ebp","esi"); 117######################################################################## 118# common pattern for internal functions is that %edi is result pointer, 119# %esi and %ebp are input ones, %ebp being optional. %edi is preserved. 120 &call ("_ecp_nistz256_add"); 121&function_end("ecp_nistz256_mul_by_2"); 122 123######################################################################## 124# void ecp_nistz256_mul_by_3(BN_ULONG edi[8],const BN_ULONG esi[8]); 125&function_begin("ecp_nistz256_mul_by_3"); 126 &mov ("esi",&wparam(1)); 127 # multiplication by 3 is performed 128 # as 2*n+n, but we can't use output 129 # to store 2*n, because if output 130 # pointer equals to input, then 131 # we'll get 2*n+2*n. 132 &stack_push(8); # therefore we need to allocate 133 # 256-bit intermediate buffer. 134 &mov ("edi","esp"); 135 &mov ("ebp","esi"); 136 &call ("_ecp_nistz256_add"); 137 &lea ("esi",&DWP(0,"edi")); 138 &mov ("ebp",&wparam(1)); 139 &mov ("edi",&wparam(0)); 140 &call ("_ecp_nistz256_add"); 141 &stack_pop(8); 142&function_end("ecp_nistz256_mul_by_3"); 143 144######################################################################## 145# void ecp_nistz256_div_by_2(BN_ULONG edi[8],const BN_ULONG esi[8]); 146&function_begin("ecp_nistz256_div_by_2"); 147 &mov ("esi",&wparam(1)); 148 &mov ("edi",&wparam(0)); 149 &call ("_ecp_nistz256_div_by_2"); 150&function_end("ecp_nistz256_div_by_2"); 151 152&function_begin_B("_ecp_nistz256_div_by_2"); 153 # tmp = a is odd ? a+mod : a 154 # 155 # note that because mod has special form, i.e. consists of 156 # 0xffffffff, 1 and 0s, we can conditionally synthesize it by 157 # assigning least significant bit of input to one register, 158 # %ebp, and its negative to another, %edx. 159 160 &mov ("ebp",&DWP(0,"esi")); 161 &xor ("edx","edx"); 162 &mov ("ebx",&DWP(4,"esi")); 163 &mov ("eax","ebp"); 164 &and ("ebp",1); 165 &mov ("ecx",&DWP(8,"esi")); 166 &sub ("edx","ebp"); 167 168 &add ("eax","edx"); 169 &adc ("ebx","edx"); 170 &mov (&DWP(0,"edi"),"eax"); 171 &adc ("ecx","edx"); 172 &mov (&DWP(4,"edi"),"ebx"); 173 &mov (&DWP(8,"edi"),"ecx"); 174 175 &mov ("eax",&DWP(12,"esi")); 176 &mov ("ebx",&DWP(16,"esi")); 177 &adc ("eax",0); 178 &mov ("ecx",&DWP(20,"esi")); 179 &adc ("ebx",0); 180 &mov (&DWP(12,"edi"),"eax"); 181 &adc ("ecx",0); 182 &mov (&DWP(16,"edi"),"ebx"); 183 &mov (&DWP(20,"edi"),"ecx"); 184 185 &mov ("eax",&DWP(24,"esi")); 186 &mov ("ebx",&DWP(28,"esi")); 187 &adc ("eax","ebp"); 188 &adc ("ebx","edx"); 189 &mov (&DWP(24,"edi"),"eax"); 190 &sbb ("esi","esi"); # broadcast carry bit 191 &mov (&DWP(28,"edi"),"ebx"); 192 193 # ret = tmp >> 1 194 195 &mov ("eax",&DWP(0,"edi")); 196 &mov ("ebx",&DWP(4,"edi")); 197 &mov ("ecx",&DWP(8,"edi")); 198 &mov ("edx",&DWP(12,"edi")); 199 200 &shr ("eax",1); 201 &mov ("ebp","ebx"); 202 &shl ("ebx",31); 203 &or ("eax","ebx"); 204 205 &shr ("ebp",1); 206 &mov ("ebx","ecx"); 207 &shl ("ecx",31); 208 &mov (&DWP(0,"edi"),"eax"); 209 &or ("ebp","ecx"); 210 &mov ("eax",&DWP(16,"edi")); 211 212 &shr ("ebx",1); 213 &mov ("ecx","edx"); 214 &shl ("edx",31); 215 &mov (&DWP(4,"edi"),"ebp"); 216 &or ("ebx","edx"); 217 &mov ("ebp",&DWP(20,"edi")); 218 219 &shr ("ecx",1); 220 &mov ("edx","eax"); 221 &shl ("eax",31); 222 &mov (&DWP(8,"edi"),"ebx"); 223 &or ("ecx","eax"); 224 &mov ("ebx",&DWP(24,"edi")); 225 226 &shr ("edx",1); 227 &mov ("eax","ebp"); 228 &shl ("ebp",31); 229 &mov (&DWP(12,"edi"),"ecx"); 230 &or ("edx","ebp"); 231 &mov ("ecx",&DWP(28,"edi")); 232 233 &shr ("eax",1); 234 &mov ("ebp","ebx"); 235 &shl ("ebx",31); 236 &mov (&DWP(16,"edi"),"edx"); 237 &or ("eax","ebx"); 238 239 &shr ("ebp",1); 240 &mov ("ebx","ecx"); 241 &shl ("ecx",31); 242 &mov (&DWP(20,"edi"),"eax"); 243 &or ("ebp","ecx"); 244 245 &shr ("ebx",1); 246 &shl ("esi",31); 247 &mov (&DWP(24,"edi"),"ebp"); 248 &or ("ebx","esi"); # handle top-most carry bit 249 &mov (&DWP(28,"edi"),"ebx"); 250 251 &ret (); 252&function_end_B("_ecp_nistz256_div_by_2"); 253 254######################################################################## 255# void ecp_nistz256_add(BN_ULONG edi[8],const BN_ULONG esi[8], 256# const BN_ULONG ebp[8]); 257&function_begin("ecp_nistz256_add"); 258 &mov ("esi",&wparam(1)); 259 &mov ("ebp",&wparam(2)); 260 &mov ("edi",&wparam(0)); 261 &call ("_ecp_nistz256_add"); 262&function_end("ecp_nistz256_add"); 263 264&function_begin_B("_ecp_nistz256_add"); 265 &mov ("eax",&DWP(0,"esi")); 266 &mov ("ebx",&DWP(4,"esi")); 267 &mov ("ecx",&DWP(8,"esi")); 268 &add ("eax",&DWP(0,"ebp")); 269 &mov ("edx",&DWP(12,"esi")); 270 &adc ("ebx",&DWP(4,"ebp")); 271 &mov (&DWP(0,"edi"),"eax"); 272 &adc ("ecx",&DWP(8,"ebp")); 273 &mov (&DWP(4,"edi"),"ebx"); 274 &adc ("edx",&DWP(12,"ebp")); 275 &mov (&DWP(8,"edi"),"ecx"); 276 &mov (&DWP(12,"edi"),"edx"); 277 278 &mov ("eax",&DWP(16,"esi")); 279 &mov ("ebx",&DWP(20,"esi")); 280 &mov ("ecx",&DWP(24,"esi")); 281 &adc ("eax",&DWP(16,"ebp")); 282 &mov ("edx",&DWP(28,"esi")); 283 &adc ("ebx",&DWP(20,"ebp")); 284 &mov (&DWP(16,"edi"),"eax"); 285 &adc ("ecx",&DWP(24,"ebp")); 286 &mov (&DWP(20,"edi"),"ebx"); 287 &mov ("esi",0); 288 &adc ("edx",&DWP(28,"ebp")); 289 &mov (&DWP(24,"edi"),"ecx"); 290 &adc ("esi",0); 291 &mov (&DWP(28,"edi"),"edx"); 292 293 # if a+b >= modulus, subtract modulus. 294 # 295 # But since comparison implies subtraction, we subtract modulus 296 # to see if it borrows, and then subtract it for real if 297 # subtraction didn't borrow. 298 299 &mov ("eax",&DWP(0,"edi")); 300 &mov ("ebx",&DWP(4,"edi")); 301 &mov ("ecx",&DWP(8,"edi")); 302 &sub ("eax",-1); 303 &mov ("edx",&DWP(12,"edi")); 304 &sbb ("ebx",-1); 305 &mov ("eax",&DWP(16,"edi")); 306 &sbb ("ecx",-1); 307 &mov ("ebx",&DWP(20,"edi")); 308 &sbb ("edx",0); 309 &mov ("ecx",&DWP(24,"edi")); 310 &sbb ("eax",0); 311 &mov ("edx",&DWP(28,"edi")); 312 &sbb ("ebx",0); 313 &sbb ("ecx",1); 314 &sbb ("edx",-1); 315 &sbb ("esi",0); 316 317 # Note that because mod has special form, i.e. consists of 318 # 0xffffffff, 1 and 0s, we can conditionally synthesize it by 319 # by using borrow. 320 321 ¬ ("esi"); 322 &mov ("eax",&DWP(0,"edi")); 323 &mov ("ebp","esi"); 324 &mov ("ebx",&DWP(4,"edi")); 325 &shr ("ebp",31); 326 &mov ("ecx",&DWP(8,"edi")); 327 &sub ("eax","esi"); 328 &mov ("edx",&DWP(12,"edi")); 329 &sbb ("ebx","esi"); 330 &mov (&DWP(0,"edi"),"eax"); 331 &sbb ("ecx","esi"); 332 &mov (&DWP(4,"edi"),"ebx"); 333 &sbb ("edx",0); 334 &mov (&DWP(8,"edi"),"ecx"); 335 &mov (&DWP(12,"edi"),"edx"); 336 337 &mov ("eax",&DWP(16,"edi")); 338 &mov ("ebx",&DWP(20,"edi")); 339 &mov ("ecx",&DWP(24,"edi")); 340 &sbb ("eax",0); 341 &mov ("edx",&DWP(28,"edi")); 342 &sbb ("ebx",0); 343 &mov (&DWP(16,"edi"),"eax"); 344 &sbb ("ecx","ebp"); 345 &mov (&DWP(20,"edi"),"ebx"); 346 &sbb ("edx","esi"); 347 &mov (&DWP(24,"edi"),"ecx"); 348 &mov (&DWP(28,"edi"),"edx"); 349 350 &ret (); 351&function_end_B("_ecp_nistz256_add"); 352 353######################################################################## 354# void ecp_nistz256_sub(BN_ULONG edi[8],const BN_ULONG esi[8], 355# const BN_ULONG ebp[8]); 356&function_begin("ecp_nistz256_sub"); 357 &mov ("esi",&wparam(1)); 358 &mov ("ebp",&wparam(2)); 359 &mov ("edi",&wparam(0)); 360 &call ("_ecp_nistz256_sub"); 361&function_end("ecp_nistz256_sub"); 362 363&function_begin_B("_ecp_nistz256_sub"); 364 &mov ("eax",&DWP(0,"esi")); 365 &mov ("ebx",&DWP(4,"esi")); 366 &mov ("ecx",&DWP(8,"esi")); 367 &sub ("eax",&DWP(0,"ebp")); 368 &mov ("edx",&DWP(12,"esi")); 369 &sbb ("ebx",&DWP(4,"ebp")); 370 &mov (&DWP(0,"edi"),"eax"); 371 &sbb ("ecx",&DWP(8,"ebp")); 372 &mov (&DWP(4,"edi"),"ebx"); 373 &sbb ("edx",&DWP(12,"ebp")); 374 &mov (&DWP(8,"edi"),"ecx"); 375 &mov (&DWP(12,"edi"),"edx"); 376 377 &mov ("eax",&DWP(16,"esi")); 378 &mov ("ebx",&DWP(20,"esi")); 379 &mov ("ecx",&DWP(24,"esi")); 380 &sbb ("eax",&DWP(16,"ebp")); 381 &mov ("edx",&DWP(28,"esi")); 382 &sbb ("ebx",&DWP(20,"ebp")); 383 &sbb ("ecx",&DWP(24,"ebp")); 384 &mov (&DWP(16,"edi"),"eax"); 385 &sbb ("edx",&DWP(28,"ebp")); 386 &mov (&DWP(20,"edi"),"ebx"); 387 &sbb ("esi","esi"); # broadcast borrow bit 388 &mov (&DWP(24,"edi"),"ecx"); 389 &mov (&DWP(28,"edi"),"edx"); 390 391 # if a-b borrows, add modulus. 392 # 393 # Note that because mod has special form, i.e. consists of 394 # 0xffffffff, 1 and 0s, we can conditionally synthesize it by 395 # assigning borrow bit to one register, %ebp, and its negative 396 # to another, %esi. But we started by calculating %esi... 397 398 &mov ("eax",&DWP(0,"edi")); 399 &mov ("ebp","esi"); 400 &mov ("ebx",&DWP(4,"edi")); 401 &shr ("ebp",31); 402 &mov ("ecx",&DWP(8,"edi")); 403 &add ("eax","esi"); 404 &mov ("edx",&DWP(12,"edi")); 405 &adc ("ebx","esi"); 406 &mov (&DWP(0,"edi"),"eax"); 407 &adc ("ecx","esi"); 408 &mov (&DWP(4,"edi"),"ebx"); 409 &adc ("edx",0); 410 &mov (&DWP(8,"edi"),"ecx"); 411 &mov (&DWP(12,"edi"),"edx"); 412 413 &mov ("eax",&DWP(16,"edi")); 414 &mov ("ebx",&DWP(20,"edi")); 415 &mov ("ecx",&DWP(24,"edi")); 416 &adc ("eax",0); 417 &mov ("edx",&DWP(28,"edi")); 418 &adc ("ebx",0); 419 &mov (&DWP(16,"edi"),"eax"); 420 &adc ("ecx","ebp"); 421 &mov (&DWP(20,"edi"),"ebx"); 422 &adc ("edx","esi"); 423 &mov (&DWP(24,"edi"),"ecx"); 424 &mov (&DWP(28,"edi"),"edx"); 425 426 &ret (); 427&function_end_B("_ecp_nistz256_sub"); 428 429######################################################################## 430# void ecp_nistz256_neg(BN_ULONG edi[8],const BN_ULONG esi[8]); 431&function_begin("ecp_nistz256_neg"); 432 &mov ("ebp",&wparam(1)); 433 &mov ("edi",&wparam(0)); 434 435 &xor ("eax","eax"); 436 &stack_push(8); 437 &mov (&DWP(0,"esp"),"eax"); 438 &mov ("esi","esp"); 439 &mov (&DWP(4,"esp"),"eax"); 440 &mov (&DWP(8,"esp"),"eax"); 441 &mov (&DWP(12,"esp"),"eax"); 442 &mov (&DWP(16,"esp"),"eax"); 443 &mov (&DWP(20,"esp"),"eax"); 444 &mov (&DWP(24,"esp"),"eax"); 445 &mov (&DWP(28,"esp"),"eax"); 446 447 &call ("_ecp_nistz256_sub"); 448 449 &stack_pop(8); 450&function_end("ecp_nistz256_neg"); 451 452&function_begin_B("_picup_eax"); 453 &mov ("eax",&DWP(0,"esp")); 454 &ret (); 455&function_end_B("_picup_eax"); 456 457######################################################################## 458# void ecp_nistz256_to_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); 459&function_begin("ecp_nistz256_to_mont"); 460 &mov ("esi",&wparam(1)); 461 &call ("_picup_eax"); 462 &set_label("pic"); 463 &lea ("ebp",&DWP(&label("RR")."-".&label("pic"),"eax")); 464 if ($sse2) { 465 &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); 466 &mov ("eax",&DWP(0,"eax")); } 467 &mov ("edi",&wparam(0)); 468 &call ("_ecp_nistz256_mul_mont"); 469&function_end("ecp_nistz256_to_mont"); 470 471######################################################################## 472# void ecp_nistz256_from_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); 473&function_begin("ecp_nistz256_from_mont"); 474 &mov ("esi",&wparam(1)); 475 &call ("_picup_eax"); 476 &set_label("pic"); 477 &lea ("ebp",&DWP(&label("ONE")."-".&label("pic"),"eax")); 478 if ($sse2) { 479 &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); 480 &mov ("eax",&DWP(0,"eax")); } 481 &mov ("edi",&wparam(0)); 482 &call ("_ecp_nistz256_mul_mont"); 483&function_end("ecp_nistz256_from_mont"); 484 485######################################################################## 486# void ecp_nistz256_mul_mont(BN_ULONG edi[8],const BN_ULONG esi[8], 487# const BN_ULONG ebp[8]); 488&function_begin("ecp_nistz256_mul_mont"); 489 &mov ("esi",&wparam(1)); 490 &mov ("ebp",&wparam(2)); 491 if ($sse2) { 492 &call ("_picup_eax"); 493 &set_label("pic"); 494 &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); 495 &mov ("eax",&DWP(0,"eax")); } 496 &mov ("edi",&wparam(0)); 497 &call ("_ecp_nistz256_mul_mont"); 498&function_end("ecp_nistz256_mul_mont"); 499 500######################################################################## 501# void ecp_nistz256_sqr_mont(BN_ULONG edi[8],const BN_ULONG esi[8]); 502&function_begin("ecp_nistz256_sqr_mont"); 503 &mov ("esi",&wparam(1)); 504 if ($sse2) { 505 &call ("_picup_eax"); 506 &set_label("pic"); 507 &picmeup("eax","OPENSSL_ia32cap_P","eax",&label("pic")); 508 &mov ("eax",&DWP(0,"eax")); } 509 &mov ("edi",&wparam(0)); 510 &mov ("ebp","esi"); 511 &call ("_ecp_nistz256_mul_mont"); 512&function_end("ecp_nistz256_sqr_mont"); 513 514&function_begin_B("_ecp_nistz256_mul_mont"); 515 if ($sse2) { 516 &and ("eax",1<<24|1<<26); 517 &cmp ("eax",1<<24|1<<26); # see if XMM+SSE2 is on 518 &jne (&label("mul_mont_ialu")); 519 520 ######################################## 521 # SSE2 code path featuring 32x16-bit 522 # multiplications is ~2x faster than 523 # IALU counterpart (except on Atom)... 524 ######################################## 525 # stack layout: 526 # +------------------------------------+< %esp 527 # | 7 16-byte temporary XMM words, | 528 # | "sliding" toward lower address | 529 # . . 530 # +------------------------------------+ 531 # | unused XMM word | 532 # +------------------------------------+< +128,%ebx 533 # | 8 16-byte XMM words holding copies | 534 # | of a[i]<<64|a[i] | 535 # . . 536 # . . 537 # +------------------------------------+< +256 538 &mov ("edx","esp"); 539 &sub ("esp",0x100); 540 541 &movd ("xmm7",&DWP(0,"ebp")); # b[0] -> 0000.00xy 542 &lea ("ebp",&DWP(4,"ebp")); 543 &pcmpeqd("xmm6","xmm6"); 544 &psrlq ("xmm6",48); # compose 0xffff<<64|0xffff 545 546 &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y 547 &and ("esp",-64); 548 &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y 549 &lea ("ebx",&DWP(0x80,"esp")); 550 551 &movd ("xmm0",&DWP(4*0,"esi")); # a[0] -> 0000.00xy 552 &pshufd ("xmm0","xmm0",0b11001100); # 0000.00xy -> 00xy.00xy 553 &movd ("xmm1",&DWP(4*1,"esi")); # a[1] -> ... 554 &movdqa (&QWP(0x00,"ebx"),"xmm0"); # offload converted a[0] 555 &pmuludq("xmm0","xmm7"); # a[0]*b[0] 556 557 &movd ("xmm2",&DWP(4*2,"esi")); 558 &pshufd ("xmm1","xmm1",0b11001100); 559 &movdqa (&QWP(0x10,"ebx"),"xmm1"); 560 &pmuludq("xmm1","xmm7"); # a[1]*b[0] 561 562 &movq ("xmm4","xmm0"); # clear upper 64 bits 563 &pslldq("xmm4",6); 564 &paddq ("xmm4","xmm0"); 565 &movdqa("xmm5","xmm4"); 566 &psrldq("xmm4",10); # upper 32 bits of a[0]*b[0] 567 &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[0] 568 569 # Upper half of a[0]*b[i] is carried into next multiplication 570 # iteration, while lower one "participates" in actual reduction. 571 # Normally latter is done by accumulating result of multiplication 572 # of modulus by "magic" digit, but thanks to special form of modulus 573 # and "magic" digit it can be performed only with additions and 574 # subtractions (see note in IALU section below). Note that we are 575 # not bothered with carry bits, they are accumulated in "flatten" 576 # phase after all multiplications and reductions. 577 578 &movd ("xmm3",&DWP(4*3,"esi")); 579 &pshufd ("xmm2","xmm2",0b11001100); 580 &movdqa (&QWP(0x20,"ebx"),"xmm2"); 581 &pmuludq("xmm2","xmm7"); # a[2]*b[0] 582 &paddq ("xmm1","xmm4"); # a[1]*b[0]+hw(a[0]*b[0]), carry 583 &movdqa (&QWP(0x00,"esp"),"xmm1"); # t[0] 584 585 &movd ("xmm0",&DWP(4*4,"esi")); 586 &pshufd ("xmm3","xmm3",0b11001100); 587 &movdqa (&QWP(0x30,"ebx"),"xmm3"); 588 &pmuludq("xmm3","xmm7"); # a[3]*b[0] 589 &movdqa (&QWP(0x10,"esp"),"xmm2"); 590 591 &movd ("xmm1",&DWP(4*5,"esi")); 592 &pshufd ("xmm0","xmm0",0b11001100); 593 &movdqa (&QWP(0x40,"ebx"),"xmm0"); 594 &pmuludq("xmm0","xmm7"); # a[4]*b[0] 595 &paddq ("xmm3","xmm5"); # a[3]*b[0]+lw(a[0]*b[0]), reduction step 596 &movdqa (&QWP(0x20,"esp"),"xmm3"); 597 598 &movd ("xmm2",&DWP(4*6,"esi")); 599 &pshufd ("xmm1","xmm1",0b11001100); 600 &movdqa (&QWP(0x50,"ebx"),"xmm1"); 601 &pmuludq("xmm1","xmm7"); # a[5]*b[0] 602 &movdqa (&QWP(0x30,"esp"),"xmm0"); 603 &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step 604 605 &movd ("xmm3",&DWP(4*7,"esi")); 606 &pshufd ("xmm2","xmm2",0b11001100); 607 &movdqa (&QWP(0x60,"ebx"),"xmm2"); 608 &pmuludq("xmm2","xmm7"); # a[6]*b[0] 609 &movdqa (&QWP(0x40,"esp"),"xmm1"); 610 &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step 611 612 &movd ("xmm0",&DWP(0,"ebp")); # b[1] -> 0000.00xy 613 &pshufd ("xmm3","xmm3",0b11001100); 614 &movdqa (&QWP(0x70,"ebx"),"xmm3"); 615 &pmuludq("xmm3","xmm7"); # a[7]*b[0] 616 617 &pshuflw("xmm7","xmm0",0b11011100); # 0000.00xy -> 0000.0x0y 618 &movdqa ("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] 619 &pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y 620 621 &mov ("ecx",6); 622 &lea ("ebp",&DWP(4,"ebp")); 623 &jmp (&label("madd_sse2")); 624 625&set_label("madd_sse2",16); 626 &paddq ("xmm2","xmm5"); # a[6]*b[i-1]+lw(a[0]*b[i-1]), reduction step [modulo-scheduled] 627 &paddq ("xmm3","xmm4"); # a[7]*b[i-1]+lw(a[0]*b[i-1])*0xffffffff, reduction step [modulo-scheduled] 628 &movdqa ("xmm1",&QWP(0x10,"ebx")); 629 &pmuludq("xmm0","xmm7"); # a[0]*b[i] 630 &movdqa(&QWP(0x50,"esp"),"xmm2"); 631 632 &movdqa ("xmm2",&QWP(0x20,"ebx")); 633 &pmuludq("xmm1","xmm7"); # a[1]*b[i] 634 &movdqa(&QWP(0x60,"esp"),"xmm3"); 635 &paddq ("xmm0",&QWP(0x00,"esp")); 636 637 &movdqa ("xmm3",&QWP(0x30,"ebx")); 638 &pmuludq("xmm2","xmm7"); # a[2]*b[i] 639 &movq ("xmm4","xmm0"); # clear upper 64 bits 640 &pslldq("xmm4",6); 641 &paddq ("xmm1",&QWP(0x10,"esp")); 642 &paddq ("xmm4","xmm0"); 643 &movdqa("xmm5","xmm4"); 644 &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] 645 646 &movdqa ("xmm0",&QWP(0x40,"ebx")); 647 &pmuludq("xmm3","xmm7"); # a[3]*b[i] 648 &paddq ("xmm1","xmm4"); # a[1]*b[i]+hw(a[0]*b[i]), carry 649 &paddq ("xmm2",&QWP(0x20,"esp")); 650 &movdqa (&QWP(0x00,"esp"),"xmm1"); 651 652 &movdqa ("xmm1",&QWP(0x50,"ebx")); 653 &pmuludq("xmm0","xmm7"); # a[4]*b[i] 654 &paddq ("xmm3",&QWP(0x30,"esp")); 655 &movdqa (&QWP(0x10,"esp"),"xmm2"); 656 &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] 657 658 &movdqa ("xmm2",&QWP(0x60,"ebx")); 659 &pmuludq("xmm1","xmm7"); # a[5]*b[i] 660 &paddq ("xmm3","xmm5"); # a[3]*b[i]+lw(a[0]*b[i]), reduction step 661 &paddq ("xmm0",&QWP(0x40,"esp")); 662 &movdqa (&QWP(0x20,"esp"),"xmm3"); 663 &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step 664 665 &movdqa ("xmm3","xmm7"); 666 &pmuludq("xmm2","xmm7"); # a[6]*b[i] 667 &movd ("xmm7",&DWP(0,"ebp")); # b[i++] -> 0000.00xy 668 &lea ("ebp",&DWP(4,"ebp")); 669 &paddq ("xmm1",&QWP(0x50,"esp")); 670 &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step 671 &movdqa (&QWP(0x30,"esp"),"xmm0"); 672 &pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y 673 674 &pmuludq("xmm3",&QWP(0x70,"ebx")); # a[7]*b[i] 675 &pshufd("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y 676 &movdqa("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0] 677 &movdqa (&QWP(0x40,"esp"),"xmm1"); 678 &paddq ("xmm2",&QWP(0x60,"esp")); 679 680 &dec ("ecx"); 681 &jnz (&label("madd_sse2")); 682 683 &paddq ("xmm2","xmm5"); # a[6]*b[6]+lw(a[0]*b[6]), reduction step [modulo-scheduled] 684 &paddq ("xmm3","xmm4"); # a[7]*b[6]+lw(a[0]*b[6])*0xffffffff, reduction step [modulo-scheduled] 685 &movdqa ("xmm1",&QWP(0x10,"ebx")); 686 &pmuludq("xmm0","xmm7"); # a[0]*b[7] 687 &movdqa(&QWP(0x50,"esp"),"xmm2"); 688 689 &movdqa ("xmm2",&QWP(0x20,"ebx")); 690 &pmuludq("xmm1","xmm7"); # a[1]*b[7] 691 &movdqa(&QWP(0x60,"esp"),"xmm3"); 692 &paddq ("xmm0",&QWP(0x00,"esp")); 693 694 &movdqa ("xmm3",&QWP(0x30,"ebx")); 695 &pmuludq("xmm2","xmm7"); # a[2]*b[7] 696 &movq ("xmm4","xmm0"); # clear upper 64 bits 697 &pslldq("xmm4",6); 698 &paddq ("xmm1",&QWP(0x10,"esp")); 699 &paddq ("xmm4","xmm0"); 700 &movdqa("xmm5","xmm4"); 701 &psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0] 702 703 &movdqa ("xmm0",&QWP(0x40,"ebx")); 704 &pmuludq("xmm3","xmm7"); # a[3]*b[7] 705 &paddq ("xmm1","xmm4"); # a[1]*b[7]+hw(a[0]*b[7]), carry 706 &paddq ("xmm2",&QWP(0x20,"esp")); 707 &movdqa (&QWP(0x00,"esp"),"xmm1"); 708 709 &movdqa ("xmm1",&QWP(0x50,"ebx")); 710 &pmuludq("xmm0","xmm7"); # a[4]*b[7] 711 &paddq ("xmm3",&QWP(0x30,"esp")); 712 &movdqa (&QWP(0x10,"esp"),"xmm2"); 713 &pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i] 714 715 &movdqa ("xmm2",&QWP(0x60,"ebx")); 716 &pmuludq("xmm1","xmm7"); # a[5]*b[7] 717 &paddq ("xmm3","xmm5"); # reduction step 718 &paddq ("xmm0",&QWP(0x40,"esp")); 719 &movdqa (&QWP(0x20,"esp"),"xmm3"); 720 &pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step 721 722 &movdqa ("xmm3",&QWP(0x70,"ebx")); 723 &pmuludq("xmm2","xmm7"); # a[6]*b[7] 724 &paddq ("xmm1",&QWP(0x50,"esp")); 725 &psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step 726 &movdqa (&QWP(0x30,"esp"),"xmm0"); 727 728 &pmuludq("xmm3","xmm7"); # a[7]*b[7] 729 &pcmpeqd("xmm7","xmm7"); 730 &movdqa ("xmm0",&QWP(0x00,"esp")); 731 &pslldq ("xmm7",8); 732 &movdqa (&QWP(0x40,"esp"),"xmm1"); 733 &paddq ("xmm2",&QWP(0x60,"esp")); 734 735 &paddq ("xmm2","xmm5"); # a[6]*b[7]+lw(a[0]*b[7]), reduction step 736 &paddq ("xmm3","xmm4"); # a[6]*b[7]+lw(a[0]*b[7])*0xffffffff, reduction step 737 &movdqa(&QWP(0x50,"esp"),"xmm2"); 738 &movdqa(&QWP(0x60,"esp"),"xmm3"); 739 740 &movdqa ("xmm1",&QWP(0x10,"esp")); 741 &movdqa ("xmm2",&QWP(0x20,"esp")); 742 &movdqa ("xmm3",&QWP(0x30,"esp")); 743 744 &movq ("xmm4","xmm0"); # "flatten" 745 &pand ("xmm0","xmm7"); 746 &xor ("ebp","ebp"); 747 &pslldq ("xmm4",6); 748 &movq ("xmm5","xmm1"); 749 &paddq ("xmm0","xmm4"); 750 &pand ("xmm1","xmm7"); 751 &psrldq ("xmm0",6); 752 &movd ("eax","xmm0"); 753 &psrldq ("xmm0",4); 754 755 &paddq ("xmm5","xmm0"); 756 &movdqa ("xmm0",&QWP(0x40,"esp")); 757 &sub ("eax",-1); # start subtracting modulus, 758 # this is used to determine 759 # if result is larger/smaller 760 # than modulus (see below) 761 &pslldq ("xmm5",6); 762 &movq ("xmm4","xmm2"); 763 &paddq ("xmm1","xmm5"); 764 &pand ("xmm2","xmm7"); 765 &psrldq ("xmm1",6); 766 &mov (&DWP(4*0,"edi"),"eax"); 767 &movd ("eax","xmm1"); 768 &psrldq ("xmm1",4); 769 770 &paddq ("xmm4","xmm1"); 771 &movdqa ("xmm1",&QWP(0x50,"esp")); 772 &sbb ("eax",-1); 773 &pslldq ("xmm4",6); 774 &movq ("xmm5","xmm3"); 775 &paddq ("xmm2","xmm4"); 776 &pand ("xmm3","xmm7"); 777 &psrldq ("xmm2",6); 778 &mov (&DWP(4*1,"edi"),"eax"); 779 &movd ("eax","xmm2"); 780 &psrldq ("xmm2",4); 781 782 &paddq ("xmm5","xmm2"); 783 &movdqa ("xmm2",&QWP(0x60,"esp")); 784 &sbb ("eax",-1); 785 &pslldq ("xmm5",6); 786 &movq ("xmm4","xmm0"); 787 &paddq ("xmm3","xmm5"); 788 &pand ("xmm0","xmm7"); 789 &psrldq ("xmm3",6); 790 &mov (&DWP(4*2,"edi"),"eax"); 791 &movd ("eax","xmm3"); 792 &psrldq ("xmm3",4); 793 794 &paddq ("xmm4","xmm3"); 795 &sbb ("eax",0); 796 &pslldq ("xmm4",6); 797 &movq ("xmm5","xmm1"); 798 &paddq ("xmm0","xmm4"); 799 &pand ("xmm1","xmm7"); 800 &psrldq ("xmm0",6); 801 &mov (&DWP(4*3,"edi"),"eax"); 802 &movd ("eax","xmm0"); 803 &psrldq ("xmm0",4); 804 805 &paddq ("xmm5","xmm0"); 806 &sbb ("eax",0); 807 &pslldq ("xmm5",6); 808 &movq ("xmm4","xmm2"); 809 &paddq ("xmm1","xmm5"); 810 &pand ("xmm2","xmm7"); 811 &psrldq ("xmm1",6); 812 &movd ("ebx","xmm1"); 813 &psrldq ("xmm1",4); 814 &mov ("esp","edx"); 815 816 &paddq ("xmm4","xmm1"); 817 &pslldq ("xmm4",6); 818 &paddq ("xmm2","xmm4"); 819 &psrldq ("xmm2",6); 820 &movd ("ecx","xmm2"); 821 &psrldq ("xmm2",4); 822 &sbb ("ebx",0); 823 &movd ("edx","xmm2"); 824 &pextrw ("esi","xmm2",2); # top-most overflow bit 825 &sbb ("ecx",1); 826 &sbb ("edx",-1); 827 &sbb ("esi",0); # borrow from subtraction 828 829 # Final step is "if result > mod, subtract mod", and at this point 830 # we have result - mod written to output buffer, as well as borrow 831 # bit from this subtraction, and if borrow bit is set, we add 832 # modulus back. 833 # 834 # Note that because mod has special form, i.e. consists of 835 # 0xffffffff, 1 and 0s, we can conditionally synthesize it by 836 # assigning borrow bit to one register, %ebp, and its negative 837 # to another, %esi. But we started by calculating %esi... 838 839 &sub ("ebp","esi"); 840 &add (&DWP(4*0,"edi"),"esi"); # add modulus or zero 841 &adc (&DWP(4*1,"edi"),"esi"); 842 &adc (&DWP(4*2,"edi"),"esi"); 843 &adc (&DWP(4*3,"edi"),0); 844 &adc ("eax",0); 845 &adc ("ebx",0); 846 &mov (&DWP(4*4,"edi"),"eax"); 847 &adc ("ecx","ebp"); 848 &mov (&DWP(4*5,"edi"),"ebx"); 849 &adc ("edx","esi"); 850 &mov (&DWP(4*6,"edi"),"ecx"); 851 &mov (&DWP(4*7,"edi"),"edx"); 852 853 &ret (); 854 855&set_label("mul_mont_ialu",16); } 856 857 ######################################## 858 # IALU code path suitable for all CPUs. 859 ######################################## 860 # stack layout: 861 # +------------------------------------+< %esp 862 # | 8 32-bit temporary words, accessed | 863 # | as circular buffer | 864 # . . 865 # . . 866 # +------------------------------------+< +32 867 # | offloaded destination pointer | 868 # +------------------------------------+ 869 # | unused | 870 # +------------------------------------+< +40 871 &sub ("esp",10*4); 872 873 &mov ("eax",&DWP(0*4,"esi")); # a[0] 874 &mov ("ebx",&DWP(0*4,"ebp")); # b[0] 875 &mov (&DWP(8*4,"esp"),"edi"); # off-load dst ptr 876 877 &mul ("ebx"); # a[0]*b[0] 878 &mov (&DWP(0*4,"esp"),"eax"); # t[0] 879 &mov ("eax",&DWP(1*4,"esi")); 880 &mov ("ecx","edx") 881 882 &mul ("ebx"); # a[1]*b[0] 883 &add ("ecx","eax"); 884 &mov ("eax",&DWP(2*4,"esi")); 885 &adc ("edx",0); 886 &mov (&DWP(1*4,"esp"),"ecx"); # t[1] 887 &mov ("ecx","edx"); 888 889 &mul ("ebx"); # a[2]*b[0] 890 &add ("ecx","eax"); 891 &mov ("eax",&DWP(3*4,"esi")); 892 &adc ("edx",0); 893 &mov (&DWP(2*4,"esp"),"ecx"); # t[2] 894 &mov ("ecx","edx"); 895 896 &mul ("ebx"); # a[3]*b[0] 897 &add ("ecx","eax"); 898 &mov ("eax",&DWP(4*4,"esi")); 899 &adc ("edx",0); 900 &mov (&DWP(3*4,"esp"),"ecx"); # t[3] 901 &mov ("ecx","edx"); 902 903 &mul ("ebx"); # a[4]*b[0] 904 &add ("ecx","eax"); 905 &mov ("eax",&DWP(5*4,"esi")); 906 &adc ("edx",0); 907 &mov (&DWP(4*4,"esp"),"ecx"); # t[4] 908 &mov ("ecx","edx"); 909 910 &mul ("ebx"); # a[5]*b[0] 911 &add ("ecx","eax"); 912 &mov ("eax",&DWP(6*4,"esi")); 913 &adc ("edx",0); 914 &mov (&DWP(5*4,"esp"),"ecx"); # t[5] 915 &mov ("ecx","edx"); 916 917 &mul ("ebx"); # a[6]*b[0] 918 &add ("ecx","eax"); 919 &mov ("eax",&DWP(7*4,"esi")); 920 &adc ("edx",0); 921 &mov (&DWP(6*4,"esp"),"ecx"); # t[6] 922 &mov ("ecx","edx"); 923 924 &xor ("edi","edi"); # initial top-most carry 925 &mul ("ebx"); # a[7]*b[0] 926 &add ("ecx","eax"); # t[7] 927 &mov ("eax",&DWP(0*4,"esp")); # t[0] 928 &adc ("edx",0); # t[8] 929 930for ($i=0;$i<7;$i++) { 931 my $j=$i+1; 932 933 # Reduction iteration is normally performed by accumulating 934 # result of multiplication of modulus by "magic" digit [and 935 # omitting least significant word, which is guaranteed to 936 # be 0], but thanks to special form of modulus and "magic" 937 # digit being equal to least significant word, it can be 938 # performed with additions and subtractions alone. Indeed: 939 # 940 # ffff.0001.0000.0000.0000.ffff.ffff.ffff 941 # * abcd 942 # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd 943 # 944 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we 945 # rewrite above as: 946 # 947 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd 948 # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 949 # - abcd.0000.0000.0000.0000.0000.0000.abcd 950 # 951 # or marking redundant operations: 952 # 953 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- 954 # + abcd.0000.abcd.0000.0000.abcd.----.----.---- 955 # - abcd.----.----.----.----.----.----.---- 956 957 &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] 958 &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 959 &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 960 &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] 961 &adc ("ecx",0); # t[7]+=0 962 &adc ("edx","eax"); # t[8]+=t[0] 963 &adc ("edi",0); # top-most carry 964 &mov ("ebx",&DWP($j*4,"ebp")); # b[i] 965 &sub ("ecx","eax"); # t[7]-=t[0] 966 &mov ("eax",&DWP(0*4,"esi")); # a[0] 967 &sbb ("edx",0); # t[8]-=0 968 &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); 969 &sbb ("edi",0); # top-most carry, 970 # keep in mind that 971 # netto result is 972 # *addition* of value 973 # with (abcd<<32)-abcd 974 # on top, so that 975 # underflow is 976 # impossible, because 977 # (abcd<<32)-abcd 978 # doesn't underflow 979 &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); 980 981 &mul ("ebx"); # a[0]*b[i] 982 &add ("eax",&DWP((($j+0)%8)*4,"esp")); 983 &adc ("edx",0); 984 &mov (&DWP((($j+0)%8)*4,"esp"),"eax"); 985 &mov ("eax",&DWP(1*4,"esi")); 986 &mov ("ecx","edx") 987 988 &mul ("ebx"); # a[1]*b[i] 989 &add ("ecx",&DWP((($j+1)%8)*4,"esp")); 990 &adc ("edx",0); 991 &add ("ecx","eax"); 992 &adc ("edx",0); 993 &mov ("eax",&DWP(2*4,"esi")); 994 &mov (&DWP((($j+1)%8)*4,"esp"),"ecx"); 995 &mov ("ecx","edx"); 996 997 &mul ("ebx"); # a[2]*b[i] 998 &add ("ecx",&DWP((($j+2)%8)*4,"esp")); 999 &adc ("edx",0); 1000 &add ("ecx","eax"); 1001 &adc ("edx",0); 1002 &mov ("eax",&DWP(3*4,"esi")); 1003 &mov (&DWP((($j+2)%8)*4,"esp"),"ecx"); 1004 &mov ("ecx","edx"); 1005 1006 &mul ("ebx"); # a[3]*b[i] 1007 &add ("ecx",&DWP((($j+3)%8)*4,"esp")); 1008 &adc ("edx",0); 1009 &add ("ecx","eax"); 1010 &adc ("edx",0); 1011 &mov ("eax",&DWP(4*4,"esi")); 1012 &mov (&DWP((($j+3)%8)*4,"esp"),"ecx"); 1013 &mov ("ecx","edx"); 1014 1015 &mul ("ebx"); # a[4]*b[i] 1016 &add ("ecx",&DWP((($j+4)%8)*4,"esp")); 1017 &adc ("edx",0); 1018 &add ("ecx","eax"); 1019 &adc ("edx",0); 1020 &mov ("eax",&DWP(5*4,"esi")); 1021 &mov (&DWP((($j+4)%8)*4,"esp"),"ecx"); 1022 &mov ("ecx","edx"); 1023 1024 &mul ("ebx"); # a[5]*b[i] 1025 &add ("ecx",&DWP((($j+5)%8)*4,"esp")); 1026 &adc ("edx",0); 1027 &add ("ecx","eax"); 1028 &adc ("edx",0); 1029 &mov ("eax",&DWP(6*4,"esi")); 1030 &mov (&DWP((($j+5)%8)*4,"esp"),"ecx"); 1031 &mov ("ecx","edx"); 1032 1033 &mul ("ebx"); # a[6]*b[i] 1034 &add ("ecx",&DWP((($j+6)%8)*4,"esp")); 1035 &adc ("edx",0); 1036 &add ("ecx","eax"); 1037 &adc ("edx",0); 1038 &mov ("eax",&DWP(7*4,"esi")); 1039 &mov (&DWP((($j+6)%8)*4,"esp"),"ecx"); 1040 &mov ("ecx","edx"); 1041 1042 &mul ("ebx"); # a[7]*b[i] 1043 &add ("ecx",&DWP((($j+7)%8)*4,"esp")); 1044 &adc ("edx",0); 1045 &add ("ecx","eax"); # t[7] 1046 &mov ("eax",&DWP((($j+0)%8)*4,"esp")); # t[0] 1047 &adc ("edx","edi"); # t[8] 1048 &mov ("edi",0); 1049 &adc ("edi",0); # top-most carry 1050} 1051 &mov ("ebp",&DWP(8*4,"esp")); # restore dst ptr 1052 &xor ("esi","esi"); 1053 my $j=$i+1; 1054 1055 # last multiplication-less reduction 1056 &add (&DWP((($i+3)%8)*4,"esp"),"eax"); # t[3]+=t[0] 1057 &adc (&DWP((($i+4)%8)*4,"esp"),0); # t[4]+=0 1058 &adc (&DWP((($i+5)%8)*4,"esp"),0); # t[5]+=0 1059 &adc (&DWP((($i+6)%8)*4,"esp"),"eax"); # t[6]+=t[0] 1060 &adc ("ecx",0); # t[7]+=0 1061 &adc ("edx","eax"); # t[8]+=t[0] 1062 &adc ("edi",0); # top-most carry 1063 &mov ("ebx",&DWP((($j+1)%8)*4,"esp")); 1064 &sub ("ecx","eax"); # t[7]-=t[0] 1065 &mov ("eax",&DWP((($j+0)%8)*4,"esp")); 1066 &sbb ("edx",0); # t[8]-=0 1067 &mov (&DWP((($i+7)%8)*4,"esp"),"ecx"); 1068 &sbb ("edi",0); # top-most carry 1069 &mov (&DWP((($i+8)%8)*4,"esp"),"edx"); 1070 1071 # Final step is "if result > mod, subtract mod", but we do it 1072 # "other way around", namely write result - mod to output buffer 1073 # and if subtraction borrowed, add modulus back. 1074 1075 &mov ("ecx",&DWP((($j+2)%8)*4,"esp")); 1076 &sub ("eax",-1); 1077 &mov ("edx",&DWP((($j+3)%8)*4,"esp")); 1078 &sbb ("ebx",-1); 1079 &mov (&DWP(0*4,"ebp"),"eax"); 1080 &sbb ("ecx",-1); 1081 &mov (&DWP(1*4,"ebp"),"ebx"); 1082 &sbb ("edx",0); 1083 &mov (&DWP(2*4,"ebp"),"ecx"); 1084 &mov (&DWP(3*4,"ebp"),"edx"); 1085 1086 &mov ("eax",&DWP((($j+4)%8)*4,"esp")); 1087 &mov ("ebx",&DWP((($j+5)%8)*4,"esp")); 1088 &mov ("ecx",&DWP((($j+6)%8)*4,"esp")); 1089 &sbb ("eax",0); 1090 &mov ("edx",&DWP((($j+7)%8)*4,"esp")); 1091 &sbb ("ebx",0); 1092 &sbb ("ecx",1); 1093 &sbb ("edx",-1); 1094 &sbb ("edi",0); 1095 1096 # Note that because mod has special form, i.e. consists of 1097 # 0xffffffff, 1 and 0s, we can conditionally synthesize it by 1098 # assigning borrow bit to one register, %ebp, and its negative 1099 # to another, %esi. But we started by calculating %esi... 1100 1101 &sub ("esi","edi"); 1102 &add (&DWP(0*4,"ebp"),"edi"); # add modulus or zero 1103 &adc (&DWP(1*4,"ebp"),"edi"); 1104 &adc (&DWP(2*4,"ebp"),"edi"); 1105 &adc (&DWP(3*4,"ebp"),0); 1106 &adc ("eax",0); 1107 &adc ("ebx",0); 1108 &mov (&DWP(4*4,"ebp"),"eax"); 1109 &adc ("ecx","esi"); 1110 &mov (&DWP(5*4,"ebp"),"ebx"); 1111 &adc ("edx","edi"); 1112 &mov (&DWP(6*4,"ebp"),"ecx"); 1113 &mov ("edi","ebp"); # fulfill contract 1114 &mov (&DWP(7*4,"ebp"),"edx"); 1115 1116 &add ("esp",10*4); 1117 &ret (); 1118&function_end_B("_ecp_nistz256_mul_mont"); 1119 1120######################################################################## 1121# void ecp_nistz256_scatter_w5(void *edi,const P256_POINT *esi, 1122# int ebp); 1123&function_begin("ecp_nistz256_scatter_w5"); 1124 &mov ("edi",&wparam(0)); 1125 &mov ("esi",&wparam(1)); 1126 &mov ("ebp",&wparam(2)); 1127 1128 &lea ("edi",&DWP(128-4,"edi","ebp",4)); 1129 &mov ("ebp",96/16); 1130&set_label("scatter_w5_loop"); 1131 &mov ("eax",&DWP(0,"esi")); 1132 &mov ("ebx",&DWP(4,"esi")); 1133 &mov ("ecx",&DWP(8,"esi")); 1134 &mov ("edx",&DWP(12,"esi")); 1135 &lea ("esi",&DWP(16,"esi")); 1136 &mov (&DWP(64*0-128,"edi"),"eax"); 1137 &mov (&DWP(64*1-128,"edi"),"ebx"); 1138 &mov (&DWP(64*2-128,"edi"),"ecx"); 1139 &mov (&DWP(64*3-128,"edi"),"edx"); 1140 &lea ("edi",&DWP(64*4,"edi")); 1141 &dec ("ebp"); 1142 &jnz (&label("scatter_w5_loop")); 1143&function_end("ecp_nistz256_scatter_w5"); 1144 1145######################################################################## 1146# void ecp_nistz256_gather_w5(P256_POINT *edi,const void *esi, 1147# int ebp); 1148&function_begin("ecp_nistz256_gather_w5"); 1149 &mov ("esi",&wparam(1)); 1150 &mov ("ebp",&wparam(2)); 1151 1152 &lea ("esi",&DWP(0,"esi","ebp",4)); 1153 &neg ("ebp"); 1154 &sar ("ebp",31); 1155 &mov ("edi",&wparam(0)); 1156 &lea ("esi",&DWP(0,"esi","ebp",4)); 1157 1158 for($i=0;$i<24;$i+=4) { 1159 &mov ("eax",&DWP(64*($i+0),"esi")); 1160 &mov ("ebx",&DWP(64*($i+1),"esi")); 1161 &mov ("ecx",&DWP(64*($i+2),"esi")); 1162 &mov ("edx",&DWP(64*($i+3),"esi")); 1163 &and ("eax","ebp"); 1164 &and ("ebx","ebp"); 1165 &and ("ecx","ebp"); 1166 &and ("edx","ebp"); 1167 &mov (&DWP(4*($i+0),"edi"),"eax"); 1168 &mov (&DWP(4*($i+1),"edi"),"ebx"); 1169 &mov (&DWP(4*($i+2),"edi"),"ecx"); 1170 &mov (&DWP(4*($i+3),"edi"),"edx"); 1171 } 1172&function_end("ecp_nistz256_gather_w5"); 1173 1174######################################################################## 1175# void ecp_nistz256_scatter_w7(void *edi,const P256_POINT_AFFINE *esi, 1176# int ebp); 1177&function_begin("ecp_nistz256_scatter_w7"); 1178 &mov ("edi",&wparam(0)); 1179 &mov ("esi",&wparam(1)); 1180 &mov ("ebp",&wparam(2)); 1181 1182 &lea ("edi",&DWP(0,"edi","ebp")); 1183 &mov ("ebp",64/4); 1184&set_label("scatter_w7_loop"); 1185 &mov ("eax",&DWP(0,"esi")); 1186 &lea ("esi",&DWP(4,"esi")); 1187 &mov (&BP(64*0,"edi"),"al"); 1188 &mov (&BP(64*1,"edi"),"ah"); 1189 &shr ("eax",16); 1190 &mov (&BP(64*2,"edi"),"al"); 1191 &mov (&BP(64*3,"edi"),"ah"); 1192 &lea ("edi",&DWP(64*4,"edi")); 1193 &dec ("ebp"); 1194 &jnz (&label("scatter_w7_loop")); 1195&function_end("ecp_nistz256_scatter_w7"); 1196 1197######################################################################## 1198# void ecp_nistz256_gather_w7(P256_POINT_AFFINE *edi,const void *esi, 1199# int ebp); 1200&function_begin("ecp_nistz256_gather_w7"); 1201 &mov ("esi",&wparam(1)); 1202 &mov ("ebp",&wparam(2)); 1203 1204 &add ("esi","ebp"); 1205 &neg ("ebp"), 1206 &sar ("ebp",31); 1207 &mov ("edi",&wparam(0)); 1208 &lea ("esi",&DWP(0,"esi","ebp")); 1209 1210 for($i=0;$i<64;$i+=4) { 1211 &movz ("eax",&BP(64*($i+0),"esi")); 1212 &movz ("ebx",&BP(64*($i+1),"esi")); 1213 &movz ("ecx",&BP(64*($i+2),"esi")); 1214 &and ("eax","ebp"); 1215 &movz ("edx",&BP(64*($i+3),"esi")); 1216 &and ("ebx","ebp"); 1217 &mov (&BP($i+0,"edi"),"al"); 1218 &and ("ecx","ebp"); 1219 &mov (&BP($i+1,"edi"),"bl"); 1220 &and ("edx","ebp"); 1221 &mov (&BP($i+2,"edi"),"cl"); 1222 &mov (&BP($i+3,"edi"),"dl"); 1223 } 1224&function_end("ecp_nistz256_gather_w7"); 1225 1226######################################################################## 1227# following subroutines are "literal" implementation of those found in 1228# ecp_nistz256.c 1229# 1230######################################################################## 1231# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); 1232# 1233&static_label("point_double_shortcut"); 1234&function_begin("ecp_nistz256_point_double"); 1235{ my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); 1236 1237 &mov ("esi",&wparam(1)); 1238 1239 # above map() describes stack layout with 5 temporary 1240 # 256-bit vectors on top, then we take extra word for 1241 # OPENSSL_ia32cap_P copy. 1242 &stack_push(8*5+1); 1243 if ($sse2) { 1244 &call ("_picup_eax"); 1245 &set_label("pic"); 1246 &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); 1247 &mov ("ebp",&DWP(0,"edx")); } 1248 1249&set_label("point_double_shortcut"); 1250 &mov ("eax",&DWP(0,"esi")); # copy in_x 1251 &mov ("ebx",&DWP(4,"esi")); 1252 &mov ("ecx",&DWP(8,"esi")); 1253 &mov ("edx",&DWP(12,"esi")); 1254 &mov (&DWP($in_x+0,"esp"),"eax"); 1255 &mov (&DWP($in_x+4,"esp"),"ebx"); 1256 &mov (&DWP($in_x+8,"esp"),"ecx"); 1257 &mov (&DWP($in_x+12,"esp"),"edx"); 1258 &mov ("eax",&DWP(16,"esi")); 1259 &mov ("ebx",&DWP(20,"esi")); 1260 &mov ("ecx",&DWP(24,"esi")); 1261 &mov ("edx",&DWP(28,"esi")); 1262 &mov (&DWP($in_x+16,"esp"),"eax"); 1263 &mov (&DWP($in_x+20,"esp"),"ebx"); 1264 &mov (&DWP($in_x+24,"esp"),"ecx"); 1265 &mov (&DWP($in_x+28,"esp"),"edx"); 1266 &mov (&DWP(32*5,"esp"),"ebp"); # OPENSSL_ia32cap_P copy 1267 1268 &lea ("ebp",&DWP(32,"esi")); 1269 &lea ("esi",&DWP(32,"esi")); 1270 &lea ("edi",&DWP($S,"esp")); 1271 &call ("_ecp_nistz256_add"); # p256_mul_by_2(S, in_y); 1272 1273 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1274 &mov ("esi",64); 1275 &add ("esi",&wparam(1)); 1276 &lea ("edi",&DWP($Zsqr,"esp")); 1277 &mov ("ebp","esi"); 1278 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Zsqr, in_z); 1279 1280 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1281 &lea ("esi",&DWP($S,"esp")); 1282 &lea ("ebp",&DWP($S,"esp")); 1283 &lea ("edi",&DWP($S,"esp")); 1284 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(S, S); 1285 1286 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1287 &mov ("ebp",&wparam(1)); 1288 &lea ("esi",&DWP(32,"ebp")); 1289 &lea ("ebp",&DWP(64,"ebp")); 1290 &lea ("edi",&DWP($tmp0,"esp")); 1291 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(tmp0, in_z, in_y); 1292 1293 &lea ("esi",&DWP($in_x,"esp")); 1294 &lea ("ebp",&DWP($Zsqr,"esp")); 1295 &lea ("edi",&DWP($M,"esp")); 1296 &call ("_ecp_nistz256_add"); # p256_add(M, in_x, Zsqr); 1297 1298 &mov ("edi",64); 1299 &lea ("esi",&DWP($tmp0,"esp")); 1300 &lea ("ebp",&DWP($tmp0,"esp")); 1301 &add ("edi",&wparam(0)); 1302 &call ("_ecp_nistz256_add"); # p256_mul_by_2(res_z, tmp0); 1303 1304 &lea ("esi",&DWP($in_x,"esp")); 1305 &lea ("ebp",&DWP($Zsqr,"esp")); 1306 &lea ("edi",&DWP($Zsqr,"esp")); 1307 &call ("_ecp_nistz256_sub"); # p256_sub(Zsqr, in_x, Zsqr); 1308 1309 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1310 &lea ("esi",&DWP($S,"esp")); 1311 &lea ("ebp",&DWP($S,"esp")); 1312 &lea ("edi",&DWP($tmp0,"esp")); 1313 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(tmp0, S); 1314 1315 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1316 &lea ("esi",&DWP($M,"esp")); 1317 &lea ("ebp",&DWP($Zsqr,"esp")); 1318 &lea ("edi",&DWP($M,"esp")); 1319 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(M, M, Zsqr); 1320 1321 &mov ("edi",32); 1322 &lea ("esi",&DWP($tmp0,"esp")); 1323 &add ("edi",&wparam(0)); 1324 &call ("_ecp_nistz256_div_by_2"); # p256_div_by_2(res_y, tmp0); 1325 1326 &lea ("esi",&DWP($M,"esp")); 1327 &lea ("ebp",&DWP($M,"esp")); 1328 &lea ("edi",&DWP($tmp0,"esp")); 1329 &call ("_ecp_nistz256_add"); # 1/2 p256_mul_by_3(M, M); 1330 1331 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1332 &lea ("esi",&DWP($in_x,"esp")); 1333 &lea ("ebp",&DWP($S,"esp")); 1334 &lea ("edi",&DWP($S,"esp")); 1335 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, in_x); 1336 1337 &lea ("esi",&DWP($tmp0,"esp")); 1338 &lea ("ebp",&DWP($M,"esp")); 1339 &lea ("edi",&DWP($M,"esp")); 1340 &call ("_ecp_nistz256_add"); # 2/2 p256_mul_by_3(M, M); 1341 1342 &lea ("esi",&DWP($S,"esp")); 1343 &lea ("ebp",&DWP($S,"esp")); 1344 &lea ("edi",&DWP($tmp0,"esp")); 1345 &call ("_ecp_nistz256_add"); # p256_mul_by_2(tmp0, S); 1346 1347 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1348 &lea ("esi",&DWP($M,"esp")); 1349 &lea ("ebp",&DWP($M,"esp")); 1350 &mov ("edi",&wparam(0)); 1351 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(res_x, M); 1352 1353 &mov ("esi","edi"); # %edi is still res_x here 1354 &lea ("ebp",&DWP($tmp0,"esp")); 1355 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, tmp0); 1356 1357 &lea ("esi",&DWP($S,"esp")); 1358 &mov ("ebp","edi"); # %edi is still res_x 1359 &lea ("edi",&DWP($S,"esp")); 1360 &call ("_ecp_nistz256_sub"); # p256_sub(S, S, res_x); 1361 1362 &mov ("eax",&DWP(32*5,"esp")); # OPENSSL_ia32cap_P copy 1363 &mov ("esi","edi"); # %edi is still &S 1364 &lea ("ebp",&DWP($M,"esp")); 1365 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S, S, M); 1366 1367 &mov ("ebp",32); 1368 &lea ("esi",&DWP($S,"esp")); 1369 &add ("ebp",&wparam(0)); 1370 &mov ("edi","ebp"); 1371 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, S, res_y); 1372 1373 &stack_pop(8*5+1); 1374} &function_end("ecp_nistz256_point_double"); 1375 1376######################################################################## 1377# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, 1378# const P256_POINT *in2); 1379&function_begin("ecp_nistz256_point_add"); 1380{ my ($res_x,$res_y,$res_z, 1381 $in1_x,$in1_y,$in1_z, 1382 $in2_x,$in2_y,$in2_z, 1383 $H,$Hsqr,$R,$Rsqr,$Hcub, 1384 $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); 1385 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 1386 1387 &mov ("esi",&wparam(2)); 1388 1389 # above map() describes stack layout with 18 temporary 1390 # 256-bit vectors on top, then we take extra words for 1391 # ~in1infty, ~in2infty, result of check for zero and 1392 # OPENSSL_ia32cap_P copy. [one unused word for padding] 1393 &stack_push(8*18+5); 1394 if ($sse2) { 1395 &call ("_picup_eax"); 1396 &set_label("pic"); 1397 &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); 1398 &mov ("ebp",&DWP(0,"edx")); } 1399 1400 &lea ("edi",&DWP($in2_x,"esp")); 1401 for($i=0;$i<96;$i+=16) { 1402 &mov ("eax",&DWP($i+0,"esi")); # copy in2 1403 &mov ("ebx",&DWP($i+4,"esi")); 1404 &mov ("ecx",&DWP($i+8,"esi")); 1405 &mov ("edx",&DWP($i+12,"esi")); 1406 &mov (&DWP($i+0,"edi"),"eax"); 1407 &mov (&DWP(32*18+12,"esp"),"ebp") if ($i==0); 1408 &mov ("ebp","eax") if ($i==64); 1409 &or ("ebp","eax") if ($i>64); 1410 &mov (&DWP($i+4,"edi"),"ebx"); 1411 &or ("ebp","ebx") if ($i>=64); 1412 &mov (&DWP($i+8,"edi"),"ecx"); 1413 &or ("ebp","ecx") if ($i>=64); 1414 &mov (&DWP($i+12,"edi"),"edx"); 1415 &or ("ebp","edx") if ($i>=64); 1416 } 1417 &xor ("eax","eax"); 1418 &mov ("esi",&wparam(1)); 1419 &sub ("eax","ebp"); 1420 &or ("ebp","eax"); 1421 &sar ("ebp",31); 1422 &mov (&DWP(32*18+4,"esp"),"ebp"); # ~in2infty 1423 1424 &lea ("edi",&DWP($in1_x,"esp")); 1425 for($i=0;$i<96;$i+=16) { 1426 &mov ("eax",&DWP($i+0,"esi")); # copy in1 1427 &mov ("ebx",&DWP($i+4,"esi")); 1428 &mov ("ecx",&DWP($i+8,"esi")); 1429 &mov ("edx",&DWP($i+12,"esi")); 1430 &mov (&DWP($i+0,"edi"),"eax"); 1431 &mov ("ebp","eax") if ($i==64); 1432 &or ("ebp","eax") if ($i>64); 1433 &mov (&DWP($i+4,"edi"),"ebx"); 1434 &or ("ebp","ebx") if ($i>=64); 1435 &mov (&DWP($i+8,"edi"),"ecx"); 1436 &or ("ebp","ecx") if ($i>=64); 1437 &mov (&DWP($i+12,"edi"),"edx"); 1438 &or ("ebp","edx") if ($i>=64); 1439 } 1440 &xor ("eax","eax"); 1441 &sub ("eax","ebp"); 1442 &or ("ebp","eax"); 1443 &sar ("ebp",31); 1444 &mov (&DWP(32*18+0,"esp"),"ebp"); # ~in1infty 1445 1446 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1447 &lea ("esi",&DWP($in2_z,"esp")); 1448 &lea ("ebp",&DWP($in2_z,"esp")); 1449 &lea ("edi",&DWP($Z2sqr,"esp")); 1450 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z2sqr, in2_z); 1451 1452 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1453 &lea ("esi",&DWP($in1_z,"esp")); 1454 &lea ("ebp",&DWP($in1_z,"esp")); 1455 &lea ("edi",&DWP($Z1sqr,"esp")); 1456 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z); 1457 1458 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1459 &lea ("esi",&DWP($Z2sqr,"esp")); 1460 &lea ("ebp",&DWP($in2_z,"esp")); 1461 &lea ("edi",&DWP($S1,"esp")); 1462 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S1, Z2sqr, in2_z); 1463 1464 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1465 &lea ("esi",&DWP($Z1sqr,"esp")); 1466 &lea ("ebp",&DWP($in1_z,"esp")); 1467 &lea ("edi",&DWP($S2,"esp")); 1468 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z); 1469 1470 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1471 &lea ("esi",&DWP($in1_y,"esp")); 1472 &lea ("ebp",&DWP($S1,"esp")); 1473 &lea ("edi",&DWP($S1,"esp")); 1474 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S1, S1, in1_y); 1475 1476 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1477 &lea ("esi",&DWP($in2_y,"esp")); 1478 &lea ("ebp",&DWP($S2,"esp")); 1479 &lea ("edi",&DWP($S2,"esp")); 1480 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y); 1481 1482 &lea ("esi",&DWP($S2,"esp")); 1483 &lea ("ebp",&DWP($S1,"esp")); 1484 &lea ("edi",&DWP($R,"esp")); 1485 &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, S1); 1486 1487 &or ("ebx","eax"); # see if result is zero 1488 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1489 &or ("ebx","ecx"); 1490 &or ("ebx","edx"); 1491 &or ("ebx",&DWP(0,"edi")); 1492 &or ("ebx",&DWP(4,"edi")); 1493 &lea ("esi",&DWP($in1_x,"esp")); 1494 &or ("ebx",&DWP(8,"edi")); 1495 &lea ("ebp",&DWP($Z2sqr,"esp")); 1496 &or ("ebx",&DWP(12,"edi")); 1497 &lea ("edi",&DWP($U1,"esp")); 1498 &mov (&DWP(32*18+8,"esp"),"ebx"); 1499 1500 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U1, in1_x, Z2sqr); 1501 1502 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1503 &lea ("esi",&DWP($in2_x,"esp")); 1504 &lea ("ebp",&DWP($Z1sqr,"esp")); 1505 &lea ("edi",&DWP($U2,"esp")); 1506 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in2_x, Z1sqr); 1507 1508 &lea ("esi",&DWP($U2,"esp")); 1509 &lea ("ebp",&DWP($U1,"esp")); 1510 &lea ("edi",&DWP($H,"esp")); 1511 &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, U1); 1512 1513 &or ("eax","ebx"); # see if result is zero 1514 &or ("eax","ecx"); 1515 &or ("eax","edx"); 1516 &or ("eax",&DWP(0,"edi")); 1517 &or ("eax",&DWP(4,"edi")); 1518 &or ("eax",&DWP(8,"edi")); 1519 &or ("eax",&DWP(12,"edi")); # ~is_equal(U1,U2) 1520 1521 &mov ("ebx",&DWP(32*18+0,"esp")); # ~in1infty 1522 ¬ ("ebx"); # -1/0 -> 0/-1 1523 &or ("eax","ebx"); 1524 &mov ("ebx",&DWP(32*18+4,"esp")); # ~in2infty 1525 ¬ ("ebx"); # -1/0 -> 0/-1 1526 &or ("eax","ebx"); 1527 &or ("eax",&DWP(32*18+8,"esp")); # ~is_equal(S1,S2) 1528 1529 # if (~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) 1530 &data_byte(0x3e); # predict taken 1531 &jnz (&label("add_proceed")); 1532 1533&set_label("add_double",16); 1534 &mov ("esi",&wparam(1)); 1535 &mov ("ebp",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1536 &add ("esp",4*((8*18+5)-(8*5+1))); # difference in frame sizes 1537 &jmp (&label("point_double_shortcut")); 1538 1539&set_label("add_proceed",16); 1540 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1541 &lea ("esi",&DWP($R,"esp")); 1542 &lea ("ebp",&DWP($R,"esp")); 1543 &lea ("edi",&DWP($Rsqr,"esp")); 1544 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R); 1545 1546 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1547 &lea ("esi",&DWP($H,"esp")); 1548 &lea ("ebp",&DWP($in1_z,"esp")); 1549 &lea ("edi",&DWP($res_z,"esp")); 1550 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z); 1551 1552 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1553 &lea ("esi",&DWP($H,"esp")); 1554 &lea ("ebp",&DWP($H,"esp")); 1555 &lea ("edi",&DWP($Hsqr,"esp")); 1556 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H); 1557 1558 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1559 &lea ("esi",&DWP($in2_z,"esp")); 1560 &lea ("ebp",&DWP($res_z,"esp")); 1561 &lea ("edi",&DWP($res_z,"esp")); 1562 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, res_z, in2_z); 1563 1564 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1565 &lea ("esi",&DWP($Hsqr,"esp")); 1566 &lea ("ebp",&DWP($U1,"esp")); 1567 &lea ("edi",&DWP($U2,"esp")); 1568 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, U1, Hsqr); 1569 1570 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1571 &lea ("esi",&DWP($H,"esp")); 1572 &lea ("ebp",&DWP($Hsqr,"esp")); 1573 &lea ("edi",&DWP($Hcub,"esp")); 1574 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H); 1575 1576 &lea ("esi",&DWP($U2,"esp")); 1577 &lea ("ebp",&DWP($U2,"esp")); 1578 &lea ("edi",&DWP($Hsqr,"esp")); 1579 &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2); 1580 1581 &lea ("esi",&DWP($Rsqr,"esp")); 1582 &lea ("ebp",&DWP($Hsqr,"esp")); 1583 &lea ("edi",&DWP($res_x,"esp")); 1584 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr); 1585 1586 &lea ("esi",&DWP($res_x,"esp")); 1587 &lea ("ebp",&DWP($Hcub,"esp")); 1588 &lea ("edi",&DWP($res_x,"esp")); 1589 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub); 1590 1591 &lea ("esi",&DWP($U2,"esp")); 1592 &lea ("ebp",&DWP($res_x,"esp")); 1593 &lea ("edi",&DWP($res_y,"esp")); 1594 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x); 1595 1596 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1597 &lea ("esi",&DWP($Hcub,"esp")); 1598 &lea ("ebp",&DWP($S1,"esp")); 1599 &lea ("edi",&DWP($S2,"esp")); 1600 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S1, Hcub); 1601 1602 &mov ("eax",&DWP(32*18+12,"esp")); # OPENSSL_ia32cap_P copy 1603 &lea ("esi",&DWP($R,"esp")); 1604 &lea ("ebp",&DWP($res_y,"esp")); 1605 &lea ("edi",&DWP($res_y,"esp")); 1606 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, R, res_y); 1607 1608 &lea ("esi",&DWP($res_y,"esp")); 1609 &lea ("ebp",&DWP($S2,"esp")); 1610 &lea ("edi",&DWP($res_y,"esp")); 1611 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, res_y, S2); 1612 1613 &mov ("ebp",&DWP(32*18+0,"esp")); # ~in1infty 1614 &mov ("esi",&DWP(32*18+4,"esp")); # ~in2infty 1615 &mov ("edi",&wparam(0)); 1616 &mov ("edx","ebp"); 1617 ¬ ("ebp"); 1618 &and ("edx","esi"); # ~in1infty & ~in2infty 1619 &and ("ebp","esi"); # in1infty & ~in2infty 1620 ¬ ("esi"); # in2infty 1621 1622 ######################################## 1623 # conditional moves 1624 for($i=64;$i<96;$i+=4) { 1625 &mov ("eax","edx"); # ~in1infty & ~in2infty 1626 &and ("eax",&DWP($res_x+$i,"esp")); 1627 &mov ("ebx","ebp"); # in1infty & ~in2infty 1628 &and ("ebx",&DWP($in2_x+$i,"esp")); 1629 &mov ("ecx","esi"); # in2infty 1630 &and ("ecx",&DWP($in1_x+$i,"esp")); 1631 &or ("eax","ebx"); 1632 &or ("eax","ecx"); 1633 &mov (&DWP($i,"edi"),"eax"); 1634 } 1635 for($i=0;$i<64;$i+=4) { 1636 &mov ("eax","edx"); # ~in1infty & ~in2infty 1637 &and ("eax",&DWP($res_x+$i,"esp")); 1638 &mov ("ebx","ebp"); # in1infty & ~in2infty 1639 &and ("ebx",&DWP($in2_x+$i,"esp")); 1640 &mov ("ecx","esi"); # in2infty 1641 &and ("ecx",&DWP($in1_x+$i,"esp")); 1642 &or ("eax","ebx"); 1643 &or ("eax","ecx"); 1644 &mov (&DWP($i,"edi"),"eax"); 1645 } 1646 &set_label("add_done"); 1647 &stack_pop(8*18+5); 1648} &function_end("ecp_nistz256_point_add"); 1649 1650######################################################################## 1651# void ecp_nistz256_point_add_affine(P256_POINT *out, 1652# const P256_POINT *in1, 1653# const P256_POINT_AFFINE *in2); 1654&function_begin("ecp_nistz256_point_add_affine"); 1655{ 1656 my ($res_x,$res_y,$res_z, 1657 $in1_x,$in1_y,$in1_z, 1658 $in2_x,$in2_y, 1659 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); 1660 my $Z1sqr = $S2; 1661 my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); 1662 1663 &mov ("esi",&wparam(1)); 1664 1665 # above map() describes stack layout with 15 temporary 1666 # 256-bit vectors on top, then we take extra words for 1667 # ~in1infty, ~in2infty, and OPENSSL_ia32cap_P copy. 1668 &stack_push(8*15+3); 1669 if ($sse2) { 1670 &call ("_picup_eax"); 1671 &set_label("pic"); 1672 &picmeup("edx","OPENSSL_ia32cap_P","eax",&label("pic")); 1673 &mov ("ebp",&DWP(0,"edx")); } 1674 1675 &lea ("edi",&DWP($in1_x,"esp")); 1676 for($i=0;$i<96;$i+=16) { 1677 &mov ("eax",&DWP($i+0,"esi")); # copy in1 1678 &mov ("ebx",&DWP($i+4,"esi")); 1679 &mov ("ecx",&DWP($i+8,"esi")); 1680 &mov ("edx",&DWP($i+12,"esi")); 1681 &mov (&DWP($i+0,"edi"),"eax"); 1682 &mov (&DWP(32*15+8,"esp"),"ebp") if ($i==0); 1683 &mov ("ebp","eax") if ($i==64); 1684 &or ("ebp","eax") if ($i>64); 1685 &mov (&DWP($i+4,"edi"),"ebx"); 1686 &or ("ebp","ebx") if ($i>=64); 1687 &mov (&DWP($i+8,"edi"),"ecx"); 1688 &or ("ebp","ecx") if ($i>=64); 1689 &mov (&DWP($i+12,"edi"),"edx"); 1690 &or ("ebp","edx") if ($i>=64); 1691 } 1692 &xor ("eax","eax"); 1693 &mov ("esi",&wparam(2)); 1694 &sub ("eax","ebp"); 1695 &or ("ebp","eax"); 1696 &sar ("ebp",31); 1697 &mov (&DWP(32*15+0,"esp"),"ebp"); # ~in1infty 1698 1699 &lea ("edi",&DWP($in2_x,"esp")); 1700 for($i=0;$i<64;$i+=16) { 1701 &mov ("eax",&DWP($i+0,"esi")); # copy in2 1702 &mov ("ebx",&DWP($i+4,"esi")); 1703 &mov ("ecx",&DWP($i+8,"esi")); 1704 &mov ("edx",&DWP($i+12,"esi")); 1705 &mov (&DWP($i+0,"edi"),"eax"); 1706 &mov ("ebp","eax") if ($i==0); 1707 &or ("ebp","eax") if ($i!=0); 1708 &mov (&DWP($i+4,"edi"),"ebx"); 1709 &or ("ebp","ebx"); 1710 &mov (&DWP($i+8,"edi"),"ecx"); 1711 &or ("ebp","ecx"); 1712 &mov (&DWP($i+12,"edi"),"edx"); 1713 &or ("ebp","edx"); 1714 } 1715 &xor ("ebx","ebx"); 1716 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1717 &sub ("ebx","ebp"); 1718 &lea ("esi",&DWP($in1_z,"esp")); 1719 &or ("ebx","ebp"); 1720 &lea ("ebp",&DWP($in1_z,"esp")); 1721 &sar ("ebx",31); 1722 &lea ("edi",&DWP($Z1sqr,"esp")); 1723 &mov (&DWP(32*15+4,"esp"),"ebx"); # ~in2infty 1724 1725 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z); 1726 1727 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1728 &lea ("esi",&DWP($in2_x,"esp")); 1729 &mov ("ebp","edi"); # %esi is stull &Z1sqr 1730 &lea ("edi",&DWP($U2,"esp")); 1731 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, Z1sqr, in2_x); 1732 1733 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1734 &lea ("esi",&DWP($in1_z,"esp")); 1735 &lea ("ebp",&DWP($Z1sqr,"esp")); 1736 &lea ("edi",&DWP($S2,"esp")); 1737 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z); 1738 1739 &lea ("esi",&DWP($U2,"esp")); 1740 &lea ("ebp",&DWP($in1_x,"esp")); 1741 &lea ("edi",&DWP($H,"esp")); 1742 &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, in1_x); 1743 1744 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1745 &lea ("esi",&DWP($in2_y,"esp")); 1746 &lea ("ebp",&DWP($S2,"esp")); 1747 &lea ("edi",&DWP($S2,"esp")); 1748 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y); 1749 1750 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1751 &lea ("esi",&DWP($in1_z,"esp")); 1752 &lea ("ebp",&DWP($H,"esp")); 1753 &lea ("edi",&DWP($res_z,"esp")); 1754 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z); 1755 1756 &lea ("esi",&DWP($S2,"esp")); 1757 &lea ("ebp",&DWP($in1_y,"esp")); 1758 &lea ("edi",&DWP($R,"esp")); 1759 &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, in1_y); 1760 1761 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1762 &lea ("esi",&DWP($H,"esp")); 1763 &lea ("ebp",&DWP($H,"esp")); 1764 &lea ("edi",&DWP($Hsqr,"esp")); 1765 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H); 1766 1767 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1768 &lea ("esi",&DWP($R,"esp")); 1769 &lea ("ebp",&DWP($R,"esp")); 1770 &lea ("edi",&DWP($Rsqr,"esp")); 1771 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R); 1772 1773 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1774 &lea ("esi",&DWP($in1_x,"esp")); 1775 &lea ("ebp",&DWP($Hsqr,"esp")); 1776 &lea ("edi",&DWP($U2,"esp")); 1777 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in1_x, Hsqr); 1778 1779 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1780 &lea ("esi",&DWP($H,"esp")); 1781 &lea ("ebp",&DWP($Hsqr,"esp")); 1782 &lea ("edi",&DWP($Hcub,"esp")); 1783 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H); 1784 1785 &lea ("esi",&DWP($U2,"esp")); 1786 &lea ("ebp",&DWP($U2,"esp")); 1787 &lea ("edi",&DWP($Hsqr,"esp")); 1788 &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2); 1789 1790 &lea ("esi",&DWP($Rsqr,"esp")); 1791 &lea ("ebp",&DWP($Hsqr,"esp")); 1792 &lea ("edi",&DWP($res_x,"esp")); 1793 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr); 1794 1795 &lea ("esi",&DWP($res_x,"esp")); 1796 &lea ("ebp",&DWP($Hcub,"esp")); 1797 &lea ("edi",&DWP($res_x,"esp")); 1798 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub); 1799 1800 &lea ("esi",&DWP($U2,"esp")); 1801 &lea ("ebp",&DWP($res_x,"esp")); 1802 &lea ("edi",&DWP($res_y,"esp")); 1803 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x); 1804 1805 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1806 &lea ("esi",&DWP($Hcub,"esp")); 1807 &lea ("ebp",&DWP($in1_y,"esp")); 1808 &lea ("edi",&DWP($S2,"esp")); 1809 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Hcub, in1_y); 1810 1811 &mov ("eax",&DWP(32*15+8,"esp")); # OPENSSL_ia32cap_P copy 1812 &lea ("esi",&DWP($R,"esp")); 1813 &lea ("ebp",&DWP($res_y,"esp")); 1814 &lea ("edi",&DWP($res_y,"esp")); 1815 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, res_y, R); 1816 1817 &lea ("esi",&DWP($res_y,"esp")); 1818 &lea ("ebp",&DWP($S2,"esp")); 1819 &lea ("edi",&DWP($res_y,"esp")); 1820 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, res_y, S2); 1821 1822 &mov ("ebp",&DWP(32*15+0,"esp")); # ~in1infty 1823 &mov ("esi",&DWP(32*15+4,"esp")); # ~in2infty 1824 &mov ("edi",&wparam(0)); 1825 &mov ("edx","ebp"); 1826 ¬ ("ebp"); 1827 &and ("edx","esi"); # ~in1infty & ~in2infty 1828 &and ("ebp","esi"); # in1infty & ~in2infty 1829 ¬ ("esi"); # in2infty 1830 1831 ######################################## 1832 # conditional moves 1833 for($i=64;$i<96;$i+=4) { 1834 my $one=@ONE_mont[($i-64)/4]; 1835 1836 &mov ("eax","edx"); 1837 &and ("eax",&DWP($res_x+$i,"esp")); 1838 &mov ("ebx","ebp") if ($one && $one!=-1); 1839 &and ("ebx",$one) if ($one && $one!=-1); 1840 &mov ("ecx","esi"); 1841 &and ("ecx",&DWP($in1_x+$i,"esp")); 1842 &or ("eax",$one==-1?"ebp":"ebx") if ($one); 1843 &or ("eax","ecx"); 1844 &mov (&DWP($i,"edi"),"eax"); 1845 } 1846 for($i=0;$i<64;$i+=4) { 1847 &mov ("eax","edx"); # ~in1infty & ~in2infty 1848 &and ("eax",&DWP($res_x+$i,"esp")); 1849 &mov ("ebx","ebp"); # in1infty & ~in2infty 1850 &and ("ebx",&DWP($in2_x+$i,"esp")); 1851 &mov ("ecx","esi"); # in2infty 1852 &and ("ecx",&DWP($in1_x+$i,"esp")); 1853 &or ("eax","ebx"); 1854 &or ("eax","ecx"); 1855 &mov (&DWP($i,"edi"),"eax"); 1856 } 1857 &stack_pop(8*15+3); 1858} &function_end("ecp_nistz256_point_add_affine"); 1859 1860&asm_finish(); 1861 1862close STDOUT or die "error closing STDOUT: $!"; 1863