.section	".text",#alloc,#execinstr

!----------------------------------------------------------------------
! bn_mul_mont_fpu
!
! Montgomery multiplication that does the 64x64-bit partial products on
! the FPU: every 64-bit word is split into 16-bit (b, m) and 32-bit
! (a, n) pieces, converted to doubles, multiplied/added with
! fmuld/faddd, converted back with fdtox and recombined on the integer
! unit.
!
! Incoming registers, as used by the code below.  The names a/b/n/n0/
! num/tp are the ones the original comments use; presumably the C
! prototype is OpenSSL's bn_mul_mont(rp,ap,bp,np,n0,num) -- confirm
! against the caller.
!   %i0  result pointer (written by .Lsub/.Lcopy); also return value
!   %i1  a[]  ("load a[j]" loads are based on it)
!   %i2  b[]  ("transfer b[i]" loads are based on it)
!   %i3  n[]  (modulus; "load n[j]" loads are based on it)
!   %i4  pointer to n0, read as two 32-bit words; the register is
!        later reused to hold the top-most carry word between passes
!   %i5  num; must be >= 4 and even.  It is halved and multiplied by 8
!        right away, so from then on %i5 is a byte count.
!
! All multi-word operands are accessed as pairs of 32-bit words with
! the word at offset +0 being the less significant half (see how the
! word at +4 is shifted left by 32 before being or-ed in).
!
! Returns (in %i0, i.e. caller's %o0 after restore):
!   0  num < 4 or num odd ("unsupported input value")
!   1  otherwise
!
! Working registers:
!   %g4  n0 as a 64-bit value (reused as select mask in .Lcopy)
!   %g1  running carry out of each recombined 64-bit word
!   %l0  tp (temporary result vector), walks upwards
!   %l1/%l2  ends of the low/high 32-bit halves of a[] kept as doubles
!   %l3/%l4  same for n[]
!   %l5  i, outer index, runs from -num*8 up to 0
!   %l6  j, inner index, runs from -num*8 up to 0
!   %l7  0xffff mask
!
! FP registers:
!   %f0,%f2,%f4,%f6      b[i], 16-bit pieces, least significant first
!   %f8,%f10,%f12,%f14   m = (a[0]*b[i]+tp[0])*n0, 16-bit pieces loaded
!                        from offsets +6,+4,+2,+0 of the stored value
!   %f16/%f18            low/high 32-bit word of a[j]
!   %f20/%f22            low/high 32-bit word of n[j]
!   %f32-%f46            a-pieces times b-pieces
!   %f48-%f62            n-pieces times m-pieces (then column sums)
!   %f24,%f26            the two topmost column sums, carried over and
!                        added to %f48,%f50 of the next word
!
! Scratch area at %sp+2047+192 (2047 is the bias this code adds to
! every %sp-relative address):
!   +0..+31   four 64-bit column sums handed from FPU to integer unit
!             (+0 is also used to hand m to the FPU)
!   +32,+40   the two carried-over column sums at the end of a pass
!   +48       saved %asi
!   +64       tp[], followed by the four double-format vectors
!             (5*num*8 bytes in total, see the alloca below)
!----------------------------------------------------------------------
.global	bn_mul_mont_fpu
.align	32
bn_mul_mont_fpu:
	save	%sp,-192-64,%sp

	! Reject unsupported sizes.  Both branches are annulled, so the
	! "clr %i0" in the delay slot only executes when the branch is taken.
	cmp	%i5,4
	bl,a,pn %icc,.Lret
	clr	%i0
	andcc	%i5,1,%g0		! %i5 has to be even...
	bnz,a,pn %icc,.Lret
	clr	%i0			! signal "unsupported input value"

	srl	%i5,1,%i5		! num/=2
	sethi	%hi(0xffff),%l7
	ld	[%i4+0],%g4		! %g4 reassigned, remember?
	or	%l7,%lo(0xffff),%l7	! %l7=0xffff, 16-bit mask
	ld	[%i4+4],%o0
	sllx	%o0,32,%o0
	or	%o0,%g4,%g4		! %g4=n0[1].n0[0]

	sll	%i5,3,%i5		! num*=8

	! Carve 5*num*8 bytes below the current frame, rounded down to a
	! 2048-byte boundary.
	add	%sp,2047,%o0		! real top of stack
	sll	%i5,2,%o1
	add	%o1,%i5,%o1		! %o1=num*5
	sub	%o0,%o1,%o0
	and	%o0,-2048,%o0		! optimize TLB utilization
	sub	%o0,2047,%sp		! alloca(5*num*8)

	rd	%asi,%o7		! save %asi
	add	%sp,2047+192+64,%l0	! %l0=tp
	add	%l0,%i5,%l1
	add	%l1,%i5,%l1	! [an]p_[lh] point at the vectors' ends !
	add	%l1,%i5,%l2
	add	%l2,%i5,%l3
	add	%l3,%i5,%l4

	! ASI 210 (0xd2); presumably ASI_FL16_P of the VIS short
	! floating-point loads -- confirm against the CPU manual.
	wr	%g0,210,%asi	! setup %asi for 16-bit FP loads

	add	%i0,%i5,%i0		! readjust input pointers to point
	add	%i1,%i5,%i1		! at the ends too...
	add	%i2,%i5,%i2
	add	%i3,%i5,%i3

	stx	%o7,[%sp+2047+192+48]	! save %asi

	sub	%g0,%i5,%l5		! i=-num
	sub	%g0,%i5,%l6		! j=-num

	!---------------------------------------------------------------
	! First pass (i=0): tp = a*b[0] + m*n, where m = a[0]*b[0]*n0.
	! While at it, a[] and n[] are converted to double format and
	! stored for the later passes.
	!---------------------------------------------------------------
	add	%i1,%l6,%o3		! %o3=&a[0]
	add	%i2,%l5,%o4		! %o4=&b[0]

	ld	[%o3+4],%g1		! ap[0], high word
	ld	[%o3+0],%o0		! ap[0], low word
	ld	[%o4+4],%g5		! bp[0], high word
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1		! bp[0], low word
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0		! %o0=ap[0]
	or	%g5,%o1,%o1		! %o1=bp[0]

	add	%i3,%l6,%o5		! %o5=&n[0]

	mulx	%o1,%o0,%o0		! ap[0]*bp[0]
	mulx	%g4,%o0,%o0		! ap[0]*bp[0]*n0
	stx	%o0,[%sp+2047+192+0]	! park m for the 16-bit loads below

	! Each 32-bit word is loaded into the odd half of a register pair
	! whose even half is zeroed, so the pair reads as a non-negative
	! 64-bit integer for fxtod.  fzeros is emitted as a raw .word.
	ld	[%o3+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o3+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	! transfer b[i] to FPU as 4x16-bit values
	! (offsets 2,0,6,4: least significant piece first, given the
	! low-word-first layout of the operand)
	ldda	[%o4+2]%asi,%f0
	fxtod	%f16,%f16
	ldda	[%o4+0]%asi,%f2
	fxtod	%f18,%f18
	ldda	[%o4+6]%asi,%f4
	fxtod	%f20,%f20
	ldda	[%o4+4]%asi,%f6
	fxtod	%f22,%f22

	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
	ldda	[%sp+2047+192+6]%asi,%f8
	fxtod	%f0,%f0
	ldda	[%sp+2047+192+4]%asi,%f10
	fxtod	%f2,%f2
	ldda	[%sp+2047+192+2]%asi,%f12
	fxtod	%f4,%f4
	ldda	[%sp+2047+192+0]%asi,%f14
	fxtod	%f6,%f6

	std	%f16,[%l1+%l6]		! save smashed ap[j] in double format
	fxtod	%f8,%f8
	std	%f18,[%l2+%l6]
	fxtod	%f10,%f10
	std	%f20,[%l3+%l6]		! save smashed np[j] in double format
	fxtod	%f12,%f12
	std	%f22,[%l4+%l6]
	fxtod	%f14,%f14

	! 16 products for word j=0: a-pieces times b-pieces into
	! %f32-%f46, n-pieces times m-pieces into %f48-%f62, pairs with
	! the same destination index summed as soon as both are issued.
	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	fmuld	%f16,%f6,%f38
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	fmuld	%f18,%f0,%f40
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	fmuld	%f18,%f2,%f42
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62

	! The two topmost sums are kept in %f24/%f26 and added into the
	! next word's %f48/%f50.
	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	! Fold the high-word products into the columns they line up with.
	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	! Hand the four column sums to the integer unit via the stack and
	! advance to j=1.
	std	%f48,[%sp+2047+192+0]
	add	%l6,8,%l6
	std	%f50,[%sp+2047+192+8]
	add	%i1,%l6,%o4		! %o4=&a[j]
	std	%f52,[%sp+2047+192+16]
	add	%i3,%l6,%o5		! %o5=&n[j]
	std	%f54,[%sp+2047+192+24]

	ld	[%o4+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o4+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	fxtod	%f16,%f16
	fxtod	%f18,%f18
	fxtod	%f20,%f20
	fxtod	%f22,%f22

	! From here on the FPU works on word j while the integer unit
	! recombines the column sums of word j-1.
	ldx	[%sp+2047+192+0],%o0
		fmuld	%f16,%f0,%f32
	ldx	[%sp+2047+192+8],%o1
		fmuld	%f20,%f8,%f48
	ldx	[%sp+2047+192+16],%o2
		fmuld	%f16,%f2,%f34
	ldx	[%sp+2047+192+24],%o3
		fmuld	%f20,%f10,%f50

	! Propagate carries between the 16-bit columns.
	srlx	%o0,16,%o7
	std	%f16,[%l1+%l6]		! save smashed ap[j] in double format
		fmuld	%f16,%f4,%f36
	add	%o7,%o1,%o1
	std	%f18,[%l2+%l6]
		faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
	srlx	%o1,16,%o7
	std	%f20,[%l3+%l6]		! save smashed np[j] in double format
		fmuld	%f16,%f6,%f38
	add	%o7,%o2,%o2
	std	%f22,[%l4+%l6]
		faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
	srlx	%o2,16,%o7
		fmuld	%f18,%f0,%f40
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
		faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
	! The lowest word of the first pass is neither assembled nor
	! stored (the instructions below are disabled); only its carry is
	! kept.  Presumably because that word is zero by construction of
	! m -- confirm.
	!and	%o0,%l7,%o0
	!and	%o1,%l7,%o1
	!and	%o2,%l7,%o2
	!sllx	%o1,16,%o1
	!sllx	%o2,32,%o2
	!sllx	%o3,48,%o7
	!or	%o1,%o0,%o0
	!or	%o2,%o0,%o0
	!or	%o7,%o0,%o0		! 64-bit result
	srlx	%o3,16,%g1		! 34-bit carry
		fmuld	%f18,%f2,%f42

	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62

	faddd	%f24,%f48,%f48		! add sums carried over from word j-1
	faddd	%f26,%f50,%f50
	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+2047+192+0]
	std	%f50,[%sp+2047+192+8]
	addcc	%l6,8,%l6		! j++, sets Z when the last word is reached
	std	%f52,[%sp+2047+192+16]
	bz,pn	%icc,.L1stskip
	std	%f54,[%sp+2047+192+24]	! delay slot, always executed

.align	32			! incidentally already aligned !
.L1st:
	! Steady state of the first pass: multiply word j on the FPU,
	! recombine and store word j-1 (tp[j-1]) on the integer unit.
	add	%i1,%l6,%o4		! %o4=&a[j]
	add	%i3,%l6,%o5		! %o5=&n[j]
	ld	[%o4+0],%f17	! load a[j] as pair of 32-bit words
	.word	0xa1b00c20	! fzeros %f16
	ld	[%o4+4],%f19
	.word	0xa5b00c20	! fzeros %f18
	ld	[%o5+0],%f21	! load n[j] as pair of 32-bit words
	.word	0xa9b00c20	! fzeros %f20
	ld	[%o5+4],%f23
	.word	0xadb00c20	! fzeros %f22

	fxtod	%f16,%f16
	fxtod	%f18,%f18
	fxtod	%f20,%f20
	fxtod	%f22,%f22

	ldx	[%sp+2047+192+0],%o0
		fmuld	%f16,%f0,%f32
	ldx	[%sp+2047+192+8],%o1
		fmuld	%f20,%f8,%f48
	ldx	[%sp+2047+192+16],%o2
		fmuld	%f16,%f2,%f34
	ldx	[%sp+2047+192+24],%o3
		fmuld	%f20,%f10,%f50

	srlx	%o0,16,%o7
	std	%f16,[%l1+%l6]		! save smashed ap[j] in double format
		fmuld	%f16,%f4,%f36
	add	%o7,%o1,%o1
	std	%f18,[%l2+%l6]
		faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
	srlx	%o1,16,%o7
	std	%f20,[%l3+%l6]		! save smashed np[j] in double format
		fmuld	%f16,%f6,%f38
	add	%o7,%o2,%o2
	std	%f22,[%l4+%l6]
		faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
	srlx	%o2,16,%o7
		fmuld	%f18,%f0,%f40
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
		faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
		fmuld	%f18,%f2,%f42
	sllx	%o1,16,%o1
		faddd	%f38,%f54,%f54
		fmuld	%f22,%f10,%f58
	sllx	%o2,32,%o2
		fmuld	%f18,%f4,%f44
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
		faddd	%f40,%f56,%f56
		fmuld	%f22,%f12,%f60
	or	%o2,%o0,%o0
		fmuld	%f18,%f6,%f46
	or	%o7,%o0,%o0		! 64-bit result
		faddd	%f42,%f58,%f58
		fmuld	%f22,%f14,%f62
	addcc	%g1,%o0,%o0		! add carry from previous word
		faddd	%f24,%f48,%f48
	srlx	%o3,16,%g1		! 34-bit carry
		faddd	%f26,%f50,%f50
	! Conditional increment: the branch targets the instruction after
	! its own delay slot and is annulled, so "add" runs only when the
	! 64-bit carry flag from the addcc above is set.
	bcs,a	%xcc,.+8
	add	%g1,1,%g1

	stx	%o0,[%l0]		! tp[j-1]=

	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+2047+192+0]
	std	%f50,[%sp+2047+192+8]
	std	%f52,[%sp+2047+192+16]
	std	%f54,[%sp+2047+192+24]

	addcc	%l6,8,%l6		! j++
	bnz,pt	%icc,.L1st
	add	%l0,8,%l0		! tp++ (delay slot, always executed)

.L1stskip:
	! Drain the pipeline: recombine the last word's four column sums,
	! then the two carried-over sums in %f24/%f26.
	fdtox	%f24,%f24
	fdtox	%f26,%f26

	ldx	[%sp+2047+192+0],%o0
	ldx	[%sp+2047+192+8],%o1
	ldx	[%sp+2047+192+16],%o2
	ldx	[%sp+2047+192+24],%o3

	srlx	%o0,16,%o7
	std	%f24,[%sp+2047+192+32]
	add	%o7,%o1,%o1
	std	%f26,[%sp+2047+192+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%sp+2047+192+32],%o4
	addcc	%g1,%o0,%o0
	ldx	[%sp+2047+192+40],%o5
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8		! %g1++ on carry (annulled otherwise)
	add	%g1,1,%g1

	stx	%o0,[%l0]		! tp[j-1]=
	add	%l0,8,%l0

	! Top word: %o4 + (%o5<<16), built from the two carried-over sums.
	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,%l7,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8		! %g1++ on carry (annulled otherwise)
	add	%g1,1,%g1

	mov	%g1,%i4			! %i4=overflow word above tp[num-1]
	stx	%o4,[%l0]		! tp[num-1]=

	ba	.Louter
	add	%l5,8,%l5		! i++ (delay slot)
.align	32
.Louter:
	!---------------------------------------------------------------
	! Passes i=1..num-1: tp = (tp + a*b[i] + m*n) shifted down by one
	! word, with m = (a[0]*b[i]+tp[0])*n0.  a[] and n[] are read back
	! in the double format stored by the first pass.
	!---------------------------------------------------------------
	sub	%g0,%i5,%l6		! j=-num
	add	%sp,2047+192+64,%l0	! rewind tp

	add	%i1,%l6,%o3		! %o3=&a[0]
	add	%i2,%l5,%o4		! %o4=&b[i]

	ld	[%o3+4],%g1		! ap[0], high word
	ld	[%o3+0],%o0		! ap[0], low word
	ld	[%o4+4],%g5		! bp[i], high word
	sllx	%g1,32,%g1
	ld	[%o4+0],%o1		! bp[i], low word
	sllx	%g5,32,%g5
	or	%g1,%o0,%o0		! %o0=ap[0]
	or	%g5,%o1,%o1		! %o1=bp[i]

	ldx	[%l0],%o2		! tp[0]
	mulx	%o1,%o0,%o0
	addcc	%o2,%o0,%o0
	mulx	%g4,%o0,%o0		! (ap[0]*bp[i]+t[0])*n0
	stx	%o0,[%sp+2047+192+0]	! park m for the 16-bit loads below

	! transfer b[i] to FPU as 4x16-bit values
	ldda	[%o4+2]%asi,%f0
	ldda	[%o4+0]%asi,%f2
	ldda	[%o4+6]%asi,%f4
	ldda	[%o4+4]%asi,%f6

	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
	ldda	[%sp+2047+192+6]%asi,%f8
	fxtod	%f0,%f0
	ldda	[%sp+2047+192+4]%asi,%f10
	fxtod	%f2,%f2
	ldda	[%sp+2047+192+2]%asi,%f12
	fxtod	%f4,%f4
	ldda	[%sp+2047+192+0]%asi,%f14
	fxtod	%f6,%f6
	ldd	[%l1+%l6],%f16		! load a[j] in double format
	fxtod	%f8,%f8
	ldd	[%l2+%l6],%f18
	fxtod	%f10,%f10
	ldd	[%l3+%l6],%f20		! load n[j] in double format
	fxtod	%f12,%f12
	ldd	[%l4+%l6],%f22
	fxtod	%f14,%f14

	! Products for word j=0 (same schedule as in the first pass).
	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	faddd	%f32,%f48,%f48
	fmuld	%f20,%f12,%f52
	fmuld	%f16,%f6,%f38
	faddd	%f34,%f50,%f50
	fmuld	%f20,%f14,%f54
	fmuld	%f18,%f0,%f40
	faddd	%f36,%f52,%f52
	fmuld	%f22,%f8,%f56
	fmuld	%f18,%f2,%f42
	faddd	%f38,%f54,%f54
	fmuld	%f22,%f10,%f58
	fmuld	%f18,%f4,%f44
	faddd	%f40,%f56,%f56
	fmuld	%f22,%f12,%f60
	fmuld	%f18,%f6,%f46
	faddd	%f42,%f58,%f58
	fmuld	%f22,%f14,%f62

	faddd	%f44,%f60,%f24	! %f60
	faddd	%f46,%f62,%f26	! %f62

	faddd	%f52,%f56,%f52
	faddd	%f54,%f58,%f54

	fdtox	%f48,%f48
	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+2047+192+0]
	std	%f50,[%sp+2047+192+8]
	std	%f52,[%sp+2047+192+16]
	add	%l6,8,%l6		! j=1
	std	%f54,[%sp+2047+192+24]

	ldd	[%l1+%l6],%f16		! load a[j] in double format
	ldd	[%l2+%l6],%f18
	ldd	[%l3+%l6],%f20		! load n[j] in double format
	ldd	[%l4+%l6],%f22

	! Multiply word j=1 while recombining word 0.
	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	ldx	[%sp+2047+192+0],%o0
		faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
	ldx	[%sp+2047+192+8],%o1
		fmuld	%f16,%f6,%f38
	ldx	[%sp+2047+192+16],%o2
		faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
	ldx	[%sp+2047+192+24],%o3
		fmuld	%f18,%f0,%f40

	srlx	%o0,16,%o7
		faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
	add	%o7,%o1,%o1
		fmuld	%f18,%f2,%f42
	srlx	%o1,16,%o7
		faddd	%f38,%f54,%f54
		fmuld	%f22,%f10,%f58
	add	%o7,%o2,%o2
		fmuld	%f18,%f4,%f44
	srlx	%o2,16,%o7
		faddd	%f40,%f56,%f56
		fmuld	%f22,%f12,%f60
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	! Unlike in the first pass, word 0 is assembled here: it is added
	! to tp[0] solely to obtain the carry out of that addition.  The
	! sum itself is never stored.
	and	%o0,%l7,%o0
		fmuld	%f18,%f6,%f46
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
		faddd	%f42,%f58,%f58
		fmuld	%f22,%f14,%f62
	sllx	%o1,16,%o1
		faddd	%f24,%f48,%f48
	sllx	%o2,32,%o2
		faddd	%f26,%f50,%f50
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
		faddd	%f44,%f60,%f24	! %f60
	or	%o2,%o0,%o0
		faddd	%f46,%f62,%f26	! %f62
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%l0],%o7		! tp[0]
		faddd	%f52,%f56,%f52
	addcc	%o7,%o0,%o0		! only the carry flag is of interest
	! (end of the carry-only computation for word 0)
		faddd	%f54,%f58,%f54
	srlx	%o3,16,%g1		! 34-bit carry
		fdtox	%f48,%f48
	bcs,a	%xcc,.+8		! %g1++ on carry (annulled otherwise)
	add	%g1,1,%g1

	fdtox	%f50,%f50
	fdtox	%f52,%f52
	fdtox	%f54,%f54

	std	%f48,[%sp+2047+192+0]
	std	%f50,[%sp+2047+192+8]
	addcc	%l6,8,%l6		! j++, sets Z when the last word is reached
	std	%f52,[%sp+2047+192+16]
	bz,pn	%icc,.Linnerskip
	std	%f54,[%sp+2047+192+24]	! delay slot, always executed

	ba	.Linner
	nop
.align	32
.Linner:
	! Steady state of passes i>0: multiply word j on the FPU; on the
	! integer unit recombine word j-1, add the carry and the old
	! tp[j], and store the sum one slot lower, at tp[j-1].
	ldd	[%l1+%l6],%f16		! load a[j] in double format
	ldd	[%l2+%l6],%f18
	ldd	[%l3+%l6],%f20		! load n[j] in double format
	ldd	[%l4+%l6],%f22

	fmuld	%f16,%f0,%f32
	fmuld	%f20,%f8,%f48
	fmuld	%f16,%f2,%f34
	fmuld	%f20,%f10,%f50
	fmuld	%f16,%f4,%f36
	ldx	[%sp+2047+192+0],%o0
		faddd	%f32,%f48,%f48
		fmuld	%f20,%f12,%f52
	ldx	[%sp+2047+192+8],%o1
		fmuld	%f16,%f6,%f38
	ldx	[%sp+2047+192+16],%o2
		faddd	%f34,%f50,%f50
		fmuld	%f20,%f14,%f54
	ldx	[%sp+2047+192+24],%o3
		fmuld	%f18,%f0,%f40

	srlx	%o0,16,%o7
		faddd	%f36,%f52,%f52
		fmuld	%f22,%f8,%f56
	add	%o7,%o1,%o1
		fmuld	%f18,%f2,%f42
	srlx	%o1,16,%o7
		faddd	%f38,%f54,%f54
		fmuld	%f22,%f10,%f58
	add	%o7,%o2,%o2
		fmuld	%f18,%f4,%f44
	srlx	%o2,16,%o7
		faddd	%f40,%f56,%f56
		fmuld	%f22,%f12,%f60
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
		fmuld	%f18,%f6,%f46
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
		faddd	%f42,%f58,%f58
		fmuld	%f22,%f14,%f62
	sllx	%o1,16,%o1
		faddd	%f24,%f48,%f48
	sllx	%o2,32,%o2
		faddd	%f26,%f50,%f50
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
		faddd	%f44,%f60,%f24	! %f60
	or	%o2,%o0,%o0
		faddd	%f46,%f62,%f26	! %f62
	or	%o7,%o0,%o0		! 64-bit result
		faddd	%f52,%f56,%f52
	addcc	%g1,%o0,%o0		! add carry from previous word
	ldx	[%l0+8],%o7		! tp[j]
		faddd	%f54,%f58,%f54
	srlx	%o3,16,%g1		! 34-bit carry
		fdtox	%f48,%f48
	bcs,a	%xcc,.+8		! %g1++ on carry (annulled otherwise)
	add	%g1,1,%g1
		fdtox	%f50,%f50
	addcc	%o7,%o0,%o0		! add old tp[j]
		fdtox	%f52,%f52
	bcs,a	%xcc,.+8		! %g1++ on carry (annulled otherwise)
	add	%g1,1,%g1

	stx	%o0,[%l0]		! tp[j-1]
		fdtox	%f54,%f54

	std	%f48,[%sp+2047+192+0]
	std	%f50,[%sp+2047+192+8]
	std	%f52,[%sp+2047+192+16]
	addcc	%l6,8,%l6		! j++
	std	%f54,[%sp+2047+192+24]
	bnz,pt	%icc,.Linner
	add	%l0,8,%l0		! tp++ (delay slot, always executed)

.Linnerskip:
	! Drain the pipeline for this pass, as in .L1stskip, additionally
	! adding the old tp[j] and the overflow word kept in %i4.
	fdtox	%f24,%f24
	fdtox	%f26,%f26

	ldx	[%sp+2047+192+0],%o0
	ldx	[%sp+2047+192+8],%o1
	ldx	[%sp+2047+192+16],%o2
	ldx	[%sp+2047+192+24],%o3

	srlx	%o0,16,%o7
	std	%f24,[%sp+2047+192+32]
	add	%o7,%o1,%o1
	std	%f26,[%sp+2047+192+40]
	srlx	%o1,16,%o7
	add	%o7,%o2,%o2
	srlx	%o2,16,%o7
	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
	and	%o0,%l7,%o0
	and	%o1,%l7,%o1
	and	%o2,%l7,%o2
	sllx	%o1,16,%o1
	sllx	%o2,32,%o2
	sllx	%o3,48,%o7
	or	%o1,%o0,%o0
	or	%o2,%o0,%o0
	ldx	[%sp+2047+192+32],%o4
	or	%o7,%o0,%o0		! 64-bit result
	ldx	[%sp+2047+192+40],%o5
	addcc	%g1,%o0,%o0
	ldx	[%l0+8],%o7		! tp[j]
	srlx	%o3,16,%g1		! 34-bit carry
	bcs,a	%xcc,.+8		! %g1++ on carry (annulled otherwise)
	add	%g1,1,%g1

	addcc	%o7,%o0,%o0		! add old tp[j]
	bcs,a	%xcc,.+8		! %g1++ on carry (annulled otherwise)
	add	%g1,1,%g1

	stx	%o0,[%l0]		! tp[j-1]
	add	%l0,8,%l0

	! Top word: %o4 + (%o5<<16), built from the two carried-over sums.
	srlx	%o4,16,%o7
	add	%o7,%o5,%o5
	and	%o4,%l7,%o4
	sllx	%o5,16,%o7
	or	%o7,%o4,%o4
	addcc	%g1,%o4,%o4
	srlx	%o5,48,%g1
	bcs,a	%xcc,.+8		! %g1++ on carry (annulled otherwise)
	add	%g1,1,%g1

	addcc	%i4,%o4,%o4		! add overflow word of the previous pass
	stx	%o4,[%l0]		! tp[num-1]
	mov	%g1,%i4			! new overflow word (mov leaves flags alone)
	bcs,a	%xcc,.+8		! %i4++ on carry from the addcc above
	add	%i4,1,%i4

	addcc	%l5,8,%l5		! i++
	bnz	%icc,.Louter
	nop

	!---------------------------------------------------------------
	! Final reduction: rp = tp - n, then select between tp and rp
	! depending on the borrow and the overflow word in %i4.
	!---------------------------------------------------------------
	add	%l0,8,%l0		! adjust tp to point at the end
	orn	%g0,%g0,%g4		! %g4=-1; overwritten by the subc after
					! .Lsub before any use visible here
	sub	%g0,%i5,%o7		! n=-num
	ba	.Lsub
	subcc	%g0,%g0,%g0		! clear %icc.c

.align	32
.Lsub:
	! Subtract 32 bits at a time so that the borrow travels in
	! %icc.c; add/brnz/st in between do not modify condition codes.
	ldx	[%l0+%o7],%o0		! tp[j]
	add	%i3,%o7,%g1
	ld	[%g1+0],%o2		! n[j], low word
	ld	[%g1+4],%o3		! n[j], high word
	srlx	%o0,32,%o1
	subccc	%o0,%o2,%o2
	add	%i0,%o7,%g1
	subccc	%o1,%o3,%o3
	st	%o2,[%g1+0]		! rp[j], low word
	add	%o7,8,%o7
	brnz,pt	%o7,.Lsub
	st	%o3,[%g1+4]		! rp[j], high word (delay slot)
	subc	%i4,0,%g4		! %g4=overflow word-borrow: all ones when
					! tp must be kept, zero when tp-n is kept
	sub	%g0,%i5,%o7		! n=-num
	ba	.Lcopy
	nop

.align	32
.Lcopy:
	! Branch-free select rp[j] = (tp[j] & %g4) | (rp[j] & ~%g4);
	! tp[j] is wiped on the way.
	ldx	[%l0+%o7],%o0
	add	%i0,%o7,%g1
	ld	[%g1+0],%o2
	ld	[%g1+4],%o3
	stx	%g0,[%l0+%o7]		! zap tp[j]
	and	%o0,%g4,%o0
	srlx	%o0,32,%o1
	andn	%o2,%g4,%o2
	andn	%o3,%g4,%o3
	or	%o2,%o0,%o0
	or	%o3,%o1,%o1
	st	%o0,[%g1+0]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lcopy
	st	%o1,[%g1+4]		! delay slot, always executed
	sub	%g0,%i5,%o7		! n=-num

.Lzap:
	! Wipe the double-format copies of a[] and n[] from the stack.
	stx	%g0,[%l1+%o7]
	stx	%g0,[%l2+%o7]
	stx	%g0,[%l3+%o7]
	stx	%g0,[%l4+%o7]
	add	%o7,8,%o7
	brnz,pt	%o7,.Lzap
	nop

	ldx	[%sp+2047+192+48],%o7
	wr	%g0,%o7,%asi		! restore %asi

	mov	1,%i0			! return 1
.Lret:
	ret
	restore
.type   bn_mul_mont_fpu,#function
.size	bn_mul_mont_fpu,(.-bn_mul_mont_fpu)
.asciz	"Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro@openssl.org>"
.align	32