/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

	.ident	"@(#)__vcos.S 1.8 06/01/23 SMI"

	.file	"__vcos.S"

#include "libm.h"

! Read-only constant table, 64-byte aligned.  Each entry is a pair of
! 32-bit words (8 bytes).  Entries are addressed as constants+offset
! using the byte offsets #defined directly after the table; the
! offset and its name are noted beside each entry.

	RO_DATA
	.align	64
constants:
	.word	0x3ec718e3,0xa6972785	! 0x00 p4
	.word	0x3ef9fd39,0x94293940	! 0x08 q4
	.word	0xbf2a019f,0x75ee4be1	! 0x10 p3
	.word	0xbf56c16b,0xba552569	! 0x18 q3
	.word	0x3f811111,0x1108c703	! 0x20 p2
	.word	0x3fa55555,0x554f5b35	! 0x28 q2
	.word	0xbfc55555,0x555554d0	! 0x30 p1
	.word	0xbfdfffff,0xffffff85	! 0x38 q1
	.word	0x3ff00000,0x00000000	! 0x40 one
	.word	0xbfc55555,0x5551fc28	! 0x48 pp1
	.word	0x3f811107,0x62eacc9d	! 0x50 pp2
	.word	0xbfdfffff,0xffff6328	! 0x58 qq1
	.word	0x3fa55551,0x5f7acf0c	! 0x60 qq2
	.word	0x3fe45f30,0x6dc9c883	! 0x68 invpio2
	.word	0x43380000,0x00000000	! 0x70 round
	.word	0x3ff921fb,0x54400000	! 0x78 pio2_1
	.word	0x3dd0b461,0x1a600000	! 0x80 pio2_2
	.word	0x3ba3198a,0x2e000000	! 0x88 pio2_3
	.word	0x397b839a,0x252049c1	! 0x90 pio2_3t
	.word	0x80000000,0x00004000	! 0x98 f30val
! The next three entries are mask (0xa0), thresh (0xa8) and one more
! entry at 0xb0 that has no offset name of its own.
	.word	0xffff8000,0x00000000	! N.B.: low-order words used
	.word	0x3fc90000,0x80000000	! for sign bit hacking; see
	.word	0x3fc40000,0x00000000	! references to "thresh" below

! byte offsets of the entries in the constants table above

#define p4		0x0
#define q4		0x08
#define p3		0x10
#define q3		0x18
#define p2		0x20
#define q2		0x28
#define p1		0x30
#define q1		0x38
#define one		0x40
#define pp1		0x48
#define pp2		0x50
#define qq1		0x58
#define qq2		0x60
#define invpio2		0x68
#define round		0x70
#define pio2_1		0x78
#define pio2_2		0x80
#define pio2_3		0x88
#define pio2_3t		0x90
#define f30val		0x98
#define mask		0xa0
#define thresh		0xa8

! local storage indices
! (negative offsets, used as [%fp+name]; the lowest one, y0_0, sits
! at -tmps, so tmps must grow if another slot is added below it)

#define xsave		STACK_BIAS-0x8
#define ysave		STACK_BIAS-0x10
#define nsave		STACK_BIAS-0x14
#define sxsave		STACK_BIAS-0x18
#define sysave		STACK_BIAS-0x1c
#define biguns		STACK_BIAS-0x20
#define n2		STACK_BIAS-0x24
#define n1		STACK_BIAS-0x28
#define n0		STACK_BIAS-0x2c
#define x2_1		STACK_BIAS-0x40
#define x1_1		STACK_BIAS-0x50
#define x0_1		STACK_BIAS-0x60
#define y2_0		STACK_BIAS-0x70
#define y1_0		STACK_BIAS-0x80
#define y0_0		STACK_BIAS-0x90
! sizeof temp storage - must be a multiple of 16 for V9
#define tmps		0x90

!--------------------------------------------------------------------
! define pipes for easier reading
! (three groups of ten floating point registers: P0 is %f0-%f9,
! P1 is %f10-%f19 and P2 is %f20-%f29)

#define P0_f0		%f0
#define P0_f1		%f1
#define P0_f2		%f2
#define P0_f3		%f3
#define P0_f4		%f4
#define P0_f5		%f5
#define P0_f6		%f6
#define P0_f7		%f7
#define P0_f8		%f8
#define P0_f9		%f9

#define P1_f10		%f10
#define P1_f11		%f11
#define P1_f12		%f12
#define P1_f13		%f13
#define P1_f14		%f14
#define P1_f15		%f15
#define P1_f16		%f16
#define P1_f17		%f17
#define P1_f18		%f18
#define P1_f19		%f19

#define P2_f20		%f20
#define P2_f21		%f21
#define P2_f22		%f22
#define P2_f23		%f23
#define P2_f24		%f24
#define P2_f25		%f25
#define P2_f26		%f26
#define P2_f27		%f27
#define P2_f28		%f28
#define P2_f29		%f29

! define __vlibm_TBL_sincos_hi & lo for easy reading

#define SC_HI		%l3
#define SC_LO		%l4

!
define constants for easy reading 144 145#define C_q1 %f46 146#define C_q2 %f48 147#define C_q3 %f50 148#define C_q4 %f52 149 150! one ( 1 ) uno eins echi un 151#define C_ONE %f54 152#define C_ONE_LO %f55 153 154! masks 155#define MSK_SIGN %i5 156#define MSK_BIT31 %f30 157#define MSK_BIT13 %f31 158#define MSK_BITSHI17 %f44 159 160 161! constants for pp and qq 162#define C_pp1 %f56 163#define C_pp2 %f58 164#define C_qq1 %f60 165#define C_qq2 %f62 166 167! sign mask 168#define C_signM %i5 169 170#define LIM_l5 %l5 171#define LIM_l6 %l6 172! when in pri range, using value as transition from poly to table. 173! for Medium range,change use of %l6 and use to keep track of biguns. 174#define LIM_l7 %l7 175 176!-------------------------------------------------------------------- 177 178 179 ENTRY(__vcos) 180 save %sp,-SA(MINFRAME)-tmps,%sp 181 PIC_SETUP(g5) 182 PIC_SET(g5,__vlibm_TBL_sincos_hi,l3) 183 PIC_SET(g5,__vlibm_TBL_sincos_lo,l4) 184 PIC_SET(g5,constants,o0) 185 mov %o0,%g1 186 wr %g0,0x82,%asi ! set %asi for non-faulting loads 187 188! ========== primary range ========== 189 190! register use 191 192! i0 n 193! i1 x 194! i2 stridex 195! i3 y 196! i4 stridey 197! i5 0x80000000 198 199! l0 hx0 200! l1 hx1 201! l2 hx2 202! l3 __vlibm_TBL_sincos_hi 203! l4 __vlibm_TBL_sincos_lo 204! l5 0x3fc40000 205! l6 0x3e400000 206! l7 0x3fe921fb 207 208! the following are 64-bit registers in both V8+ and V9 209 210! g1 scratch 211! g5 212 213! o0 py0 214! o1 py1 215! o2 py2 216! o3 oy0 217! o4 oy1 218! o5 oy2 219! o7 scratch 220 221! f0 x0 222! f2 223! f4 224! f6 225! f8 scratch for table base 226! f9 signbit0 227! f10 x1 228! f12 229! f14 230! f16 231! f18 scratch for table base 232! f19 signbit1 233! f20 x2 234! f22 235! f24 236! f26 237! f28 scratch for table base 238! f29 signbit2 239! f30 0x80000000 240! f31 0x4000 241! f32 242! f34 243! f36 244! f38 245! f40 246! f42 247! f44 0xffff800000000000 248! f46 p1 249! f48 p2 250! f50 p3 251! f52 p4 252! f54 one 253! f56 pp1 254! 
f58 pp2 255! f60 qq1 256! f62 qq2 257 258#ifdef __sparcv9 259 stx %i1,[%fp+xsave] ! save arguments 260 stx %i3,[%fp+ysave] 261#else 262 st %i1,[%fp+xsave] ! save arguments 263 st %i3,[%fp+ysave] 264#endif 265 266 st %i0,[%fp+nsave] 267 st %i2,[%fp+sxsave] 268 st %i4,[%fp+sysave] 269 sethi %hi(0x80000000),MSK_SIGN ! load/set up constants 270 sethi %hi(0x3fc40000),LIM_l5 271 sethi %hi(0x3e400000),LIM_l6 272 sethi %hi(0x3fe921fb),LIM_l7 273 or LIM_l7,%lo(0x3fe921fb),LIM_l7 274 ldd [%g1+f30val],MSK_BIT31 275 ldd [%g1+mask],MSK_BITSHI17 276 ldd [%g1+q1],C_q1 277 ldd [%g1+q2],C_q2 278 ldd [%g1+q3],C_q3 279 ldd [%g1+q4],C_q4 280 ldd [%g1+one],C_ONE 281 ldd [%g1+pp1],C_pp1 282 ldd [%g1+pp2],C_pp2 283 ldd [%g1+qq1],C_qq1 284 ldd [%g1+qq2],C_qq2 285 sll %i2,3,%i2 ! scale strides 286 sll %i4,3,%i4 287 add %fp,x0_1,%o3 ! precondition loop 288 add %fp,x0_1,%o4 289 add %fp,x0_1,%o5 290 ld [%i1],%l0 ! hx = *x 291 ld [%i1],P0_f0 292 ld [%i1+4],P0_f1 293 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 294 add %i1,%i2,%i1 ! x += stridex 295 296 ba,pt %icc,.loop0 297!delay slot 298 nop 299 300 .align 32 301.loop0: 302 lda [%i1]%asi,%l1 ! preload next argument 303 sub %l0,LIM_l6,%g1 304 sub LIM_l7,%l0,%o7 305 fands P0_f0,MSK_BIT31,P0_f9 ! save signbit 306 307 lda [%i1]%asi,P1_f10 308 orcc %o7,%g1,%g0 309 mov %i3,%o0 ! py0 = y 310 bl,pn %icc,.range0 ! if hx < 0x3e400000 or > 0x3fe921fb 311 312! delay slot 313 lda [%i1+4]%asi,P1_f11 314 addcc %i0,-1,%i0 315 add %i3,%i4,%i3 ! y += stridey 316 ble,pn %icc,.endloop1 317 318! delay slot 319 andn %l1,MSK_SIGN,%l1 320 add %i1,%i2,%i1 ! x += stridex 321 fabsd P0_f0,P0_f0 322 fmuld C_ONE,C_ONE,C_ONE ! one*one; a nop for alignment only 323 324.loop1: 325 lda [%i1]%asi,%l2 ! preload next argument 326 sub %l1,LIM_l6,%g1 327 sub LIM_l7,%l1,%o7 328 fands P1_f10,MSK_BIT31,P1_f19 ! save signbit 329 330 lda [%i1]%asi,P2_f20 331 orcc %o7,%g1,%g0 332 mov %i3,%o1 ! py1 = y 333 bl,pn %icc,.range1 ! if hx < 0x3e400000 or > 0x3fe921fb 334 335! 
delay slot 336 lda [%i1+4]%asi,P2_f21 337 addcc %i0,-1,%i0 338 add %i3,%i4,%i3 ! y += stridey 339 ble,pn %icc,.endloop2 340 341! delay slot 342 andn %l2,MSK_SIGN,%l2 343 add %i1,%i2,%i1 ! x += stridex 344 fabsd P1_f10,P1_f10 345 fmuld C_ONE,C_ONE,C_ONE ! one*one; a nop for alignment only 346 347.loop2: 348 st P0_f6,[%o3] 349 sub %l2,LIM_l6,%g1 350 sub LIM_l7,%l2,%o7 351 fands P2_f20,MSK_BIT31,P2_f29 ! save signbit 352 353 st P0_f7,[%o3+4] 354 orcc %g1,%o7,%g0 355 mov %i3,%o2 ! py2 = y 356 bl,pn %icc,.range2 ! if hx < 0x3e400000 or > 0x3fe921fb 357 358! delay slot 359 add %i3,%i4,%i3 ! y += stridey 360 cmp %l0,LIM_l5 361 fabsd P2_f20,P2_f20 362 bl,pn %icc,.case4 363 364! delay slot 365 st P1_f16,[%o4] 366 cmp %l1,LIM_l5 367 fpadd32s P0_f0,MSK_BIT13,P0_f8 368 bl,pn %icc,.case2 369 370! delay slot 371 st P1_f17,[%o4+4] 372 cmp %l2,LIM_l5 373 fpadd32s P1_f10,MSK_BIT13,P1_f18 374 bl,pn %icc,.case1 375 376! delay slot 377 st P2_f26,[%o5] 378 mov %o0,%o3 379 sethi %hi(0x3fc3c000),%o7 380 fpadd32s P2_f20,MSK_BIT13,P2_f28 381 382 st P2_f27,[%o5+4] 383 fand P0_f8,MSK_BITSHI17,P0_f2 384 mov %o1,%o4 385 386 fand P1_f18,MSK_BITSHI17,P1_f12 387 mov %o2,%o5 388 sub %l0,%o7,%l0 389 390 fand P2_f28,MSK_BITSHI17,P2_f22 391 sub %l1,%o7,%l1 392 sub %l2,%o7,%l2 393 394 fsubd P0_f0,P0_f2,P0_f0 395 srl %l0,10,%l0 396 add SC_HI,8,%g1;add SC_LO,8,%o7 397 398 fsubd P1_f10,P1_f12,P1_f10 399 srl %l1,10,%l1 400 401 fsubd P2_f20,P2_f22,P2_f20 402 srl %l2,10,%l2 403 404 fmuld P0_f0,P0_f0,P0_f2 405 andn %l0,0x1f,%l0 406 407 fmuld P1_f10,P1_f10,P1_f12 408 andn %l1,0x1f,%l1 409 410 fmuld P2_f20,P2_f20,P2_f22 411 andn %l2,0x1f,%l2 412 413 fmuld P0_f2,C_pp2,P0_f6 414 ldd [%g1+%l0],%f32 415 416 fmuld P1_f12,C_pp2,P1_f16 417 ldd [%g1+%l1],%f36 418 419 fmuld P2_f22,C_pp2,P2_f26 420 ldd [%g1+%l2],%f40 421 422 faddd P0_f6,C_pp1,P0_f6 423 fmuld P0_f2,C_qq2,P0_f4 424 ldd [SC_HI+%l0],%f34 425 426 faddd P1_f16,C_pp1,P1_f16 427 fmuld P1_f12,C_qq2,P1_f14 428 ldd [SC_HI+%l1],%f38 429 430 faddd 
P2_f26,C_pp1,P2_f26 431 fmuld P2_f22,C_qq2,P2_f24 432 ldd [SC_HI+%l2],%f42 433 434 fmuld P0_f2,P0_f6,P0_f6 435 faddd P0_f4,C_qq1,P0_f4 436 437 fmuld P1_f12,P1_f16,P1_f16 438 faddd P1_f14,C_qq1,P1_f14 439 440 fmuld P2_f22,P2_f26,P2_f26 441 faddd P2_f24,C_qq1,P2_f24 442 443 faddd P0_f6,C_ONE,P0_f6 444 fmuld P0_f2,P0_f4,P0_f4 445 446 faddd P1_f16,C_ONE,P1_f16 447 fmuld P1_f12,P1_f14,P1_f14 448 449 faddd P2_f26,C_ONE,P2_f26 450 fmuld P2_f22,P2_f24,P2_f24 451 452 fmuld P0_f0,P0_f6,P0_f6 453 ldd [%o7+%l0],P0_f2 454 455 fmuld P1_f10,P1_f16,P1_f16 456 ldd [%o7+%l1],P1_f12 457 458 fmuld P2_f20,P2_f26,P2_f26 459 ldd [%o7+%l2],P2_f22 460 461 fmuld P0_f4,%f32,P0_f4 462 lda [%i1]%asi,%l0 ! preload next argument 463 464 fmuld P1_f14,%f36,P1_f14 465 lda [%i1]%asi,P0_f0 466 467 fmuld P2_f24,%f40,P2_f24 468 lda [%i1+4]%asi,P0_f1 469 470 fmuld P0_f6,%f34,P0_f6 471 add %i1,%i2,%i1 ! x += stridex 472 473 fmuld P1_f16,%f38,P1_f16 474 475 fmuld P2_f26,%f42,P2_f26 476 477 fsubd P0_f6,P0_f4,P0_f6 478 479 fsubd P1_f16,P1_f14,P1_f16 480 481 fsubd P2_f26,P2_f24,P2_f26 482 483 fsubd P0_f2,P0_f6,P0_f6 484 485 fsubd P1_f12,P1_f16,P1_f16 486 487 fsubd P2_f22,P2_f26,P2_f26 488 489 faddd P0_f6,%f32,P0_f6 490 491 faddd P1_f16,%f36,P1_f16 492 493 faddd P2_f26,%f40,P2_f26 494 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 495 496 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 497 addcc %i0,-1,%i0 498 499 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 500 bg,pt %icc,.loop0 501 502! delay slot 503 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 504 505 ba,pt %icc,.endloop0 506! 
delay slot 507 nop 508 509 .align 32 510.case1: 511 st P2_f27,[%o5+4] 512 sethi %hi(0x3fc3c000),%o7 513 fand P0_f8,MSK_BITSHI17,P0_f2 514 515 sub %l0,%o7,%l0 516 sub %l1,%o7,%l1 517 add SC_HI,8,%g1;add SC_LO,8,%o7 518 fand P1_f18,MSK_BITSHI17,P1_f12 519 fmuld P2_f20,P2_f20,P2_f22 520 521 fsubd P0_f0,P0_f2,P0_f0 522 srl %l0,10,%l0 523 mov %o0,%o3 524 525 fsubd P1_f10,P1_f12,P1_f10 526 srl %l1,10,%l1 527 mov %o1,%o4 528 529 fmuld P2_f22,C_q4,P2_f24 530 mov %o2,%o5 531 532 fmuld P0_f0,P0_f0,P0_f2 533 andn %l0,0x1f,%l0 534 535 fmuld P1_f10,P1_f10,P1_f12 536 andn %l1,0x1f,%l1 537 538 faddd P2_f24,C_q3,P2_f24 539 540 fmuld P0_f2,C_pp2,P0_f6 541 ldd [%g1+%l0],%f32 542 543 fmuld P1_f12,C_pp2,P1_f16 544 ldd [%g1+%l1],%f36 545 546 fmuld P2_f22,P2_f24,P2_f24 547 548 faddd P0_f6,C_pp1,P0_f6 549 fmuld P0_f2,C_qq2,P0_f4 550 ldd [SC_HI+%l0],%f34 551 552 faddd P1_f16,C_pp1,P1_f16 553 fmuld P1_f12,C_qq2,P1_f14 554 ldd [SC_HI+%l1],%f38 555 556 faddd P2_f24,C_q2,P2_f24 557 558 fmuld P0_f2,P0_f6,P0_f6 559 faddd P0_f4,C_qq1,P0_f4 560 561 fmuld P1_f12,P1_f16,P1_f16 562 faddd P1_f14,C_qq1,P1_f14 563 564 fmuld P2_f22,P2_f24,P2_f24 565 566 faddd P0_f6,C_ONE,P0_f6 567 fmuld P0_f2,P0_f4,P0_f4 568 569 faddd P1_f16,C_ONE,P1_f16 570 fmuld P1_f12,P1_f14,P1_f14 571 572 faddd P2_f24,C_q1,P2_f24 573 574 fmuld P0_f0,P0_f6,P0_f6 575 ldd [%o7+%l0],P0_f2 576 577 fmuld P1_f10,P1_f16,P1_f16 578 ldd [%o7+%l1],P1_f12 579 580 fmuld P0_f4,%f32,P0_f4 581 lda [%i1]%asi,%l0 ! preload next argument 582 583 fmuld P1_f14,%f36,P1_f14 584 lda [%i1]%asi,P0_f0 585 586 fmuld P0_f6,%f34,P0_f6 587 lda [%i1+4]%asi,P0_f1 588 589 fmuld P1_f16,%f38,P1_f16 590 add %i1,%i2,%i1 ! 
x += stridex 591 592 fmuld P2_f22,P2_f24,P2_f24 593 594 fsubd P0_f6,P0_f4,P0_f6 595 596 fsubd P1_f16,P1_f14,P1_f16 597 598 !!(vsin)fmuld P2_f20,P2_f24,P2_f24 599 600 fsubd P0_f2,P0_f6,P0_f6 601 602 fsubd P1_f12,P1_f16,P1_f16 603 604 faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 605 606 faddd P0_f6,%f32,P0_f6 607 608 faddd P1_f16,%f36,P1_f16 609 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 610 611 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 612 addcc %i0,-1,%i0 613 614 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 615 bg,pt %icc,.loop0 616 617! delay slot 618 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 619 620 ba,pt %icc,.endloop0 621! delay slot 622 nop 623 624 .align 32 625.case2: 626 st P2_f26,[%o5] 627 cmp %l2,LIM_l5 628 fpadd32s P2_f20,MSK_BIT13,P2_f28 629 bl,pn %icc,.case3 630 631! delay slot 632 st P2_f27,[%o5+4] 633 sethi %hi(0x3fc3c000),%o7 634 fand P0_f8,MSK_BITSHI17,P0_f2 635 636 sub %l0,%o7,%l0 637 sub %l2,%o7,%l2 638 add SC_HI,8,%g1;add SC_LO,8,%o7 639 fand P2_f28,MSK_BITSHI17,P2_f22 640 fmuld P1_f10,P1_f10,P1_f12 641 642 fsubd P0_f0,P0_f2,P0_f0 643 srl %l0,10,%l0 644 mov %o0,%o3 645 646 fsubd P2_f20,P2_f22,P2_f20 647 srl %l2,10,%l2 648 mov %o2,%o5 649 650 fmuld P1_f12,C_q4,P1_f14 651 mov %o1,%o4 652 653 fmuld P0_f0,P0_f0,P0_f2 654 andn %l0,0x1f,%l0 655 656 fmuld P2_f20,P2_f20,P2_f22 657 andn %l2,0x1f,%l2 658 659 faddd P1_f14,C_q3,P1_f14 660 661 fmuld P0_f2,C_pp2,P0_f6 662 ldd [%g1+%l0],%f32 663 664 fmuld P2_f22,C_pp2,P2_f26 665 ldd [%g1+%l2],%f40 666 667 fmuld P1_f12,P1_f14,P1_f14 668 669 faddd P0_f6,C_pp1,P0_f6 670 fmuld P0_f2,C_qq2,P0_f4 671 ldd [SC_HI+%l0],%f34 672 673 faddd P2_f26,C_pp1,P2_f26 674 fmuld P2_f22,C_qq2,P2_f24 675 ldd [SC_HI+%l2],%f42 676 677 faddd P1_f14,C_q2,P1_f14 678 679 fmuld P0_f2,P0_f6,P0_f6 680 faddd P0_f4,C_qq1,P0_f4 681 682 fmuld P2_f22,P2_f26,P2_f26 683 faddd P2_f24,C_qq1,P2_f24 684 685 fmuld P1_f12,P1_f14,P1_f14 686 687 faddd P0_f6,C_ONE,P0_f6 688 fmuld P0_f2,P0_f4,P0_f4 689 690 faddd P2_f26,C_ONE,P2_f26 691 fmuld 
P2_f22,P2_f24,P2_f24 692 693 faddd P1_f14,C_q1,P1_f14 694 695 fmuld P0_f0,P0_f6,P0_f6 696 ldd [%o7+%l0],P0_f2 697 698 fmuld P2_f20,P2_f26,P2_f26 699 ldd [%o7+%l2],P2_f22 700 701 fmuld P0_f4,%f32,P0_f4 702 lda [%i1]%asi,%l0 ! preload next argument 703 704 fmuld P2_f24,%f40,P2_f24 705 lda [%i1]%asi,P0_f0 706 707 fmuld P0_f6,%f34,P0_f6 708 lda [%i1+4]%asi,P0_f1 709 710 fmuld P2_f26,%f42,P2_f26 711 add %i1,%i2,%i1 ! x += stridex 712 713 fmuld P1_f12,P1_f14,P1_f14 714 715 fsubd P0_f6,P0_f4,P0_f6 716 717 fsubd P2_f26,P2_f24,P2_f26 718 719 !!(vsin)fmuld P1_f10,P1_f14,P1_f14 720 721 fsubd P0_f2,P0_f6,P0_f6 722 723 fsubd P2_f22,P2_f26,P2_f26 724 725 faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 726 727 faddd P0_f6,%f32,P0_f6 728 729 faddd P2_f26,%f40,P2_f26 730 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 731 732 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 733 addcc %i0,-1,%i0 734 735 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 736 bg,pt %icc,.loop0 737 738! delay slot 739 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 740 741 ba,pt %icc,.endloop0 742! 
delay slot 743 nop 744 745 .align 32 746.case3: 747 sethi %hi(0x3fc3c000),%o7 748 fand P0_f8,MSK_BITSHI17,P0_f2 749 fmuld P1_f10,P1_f10,P1_f12 750 751 sub %l0,%o7,%l0 752 add SC_HI,8,%g1;add SC_LO,8,%o7 753 fmuld P2_f20,P2_f20,P2_f22 754 755 fsubd P0_f0,P0_f2,P0_f0 756 srl %l0,10,%l0 757 mov %o0,%o3 758 759 fmuld P1_f12,C_q4,P1_f14 760 mov %o1,%o4 761 762 fmuld P2_f22,C_q4,P2_f24 763 mov %o2,%o5 764 765 fmuld P0_f0,P0_f0,P0_f2 766 andn %l0,0x1f,%l0 767 768 faddd P1_f14,C_q3,P1_f14 769 770 faddd P2_f24,C_q3,P2_f24 771 772 fmuld P0_f2,C_pp2,P0_f6 773 ldd [%g1+%l0],%f32 774 775 fmuld P1_f12,P1_f14,P1_f14 776 777 fmuld P2_f22,P2_f24,P2_f24 778 779 faddd P0_f6,C_pp1,P0_f6 780 fmuld P0_f2,C_qq2,P0_f4 781 ldd [SC_HI+%l0],%f34 782 783 faddd P1_f14,C_q2,P1_f14 784 785 faddd P2_f24,C_q2,P2_f24 786 787 fmuld P0_f2,P0_f6,P0_f6 788 faddd P0_f4,C_qq1,P0_f4 789 790 fmuld P1_f12,P1_f14,P1_f14 791 792 fmuld P2_f22,P2_f24,P2_f24 793 794 faddd P0_f6,C_ONE,P0_f6 795 fmuld P0_f2,P0_f4,P0_f4 796 797 faddd P1_f14,C_q1,P1_f14 798 799 faddd P2_f24,C_q1,P2_f24 800 801 fmuld P0_f0,P0_f6,P0_f6 802 ldd [%o7+%l0],P0_f2 803 804 fmuld P0_f4,%f32,P0_f4 805 lda [%i1]%asi,%l0 ! preload next argument 806 807 fmuld P1_f12,P1_f14,P1_f14 808 lda [%i1]%asi,P0_f0 809 810 fmuld P0_f6,%f34,P0_f6 811 lda [%i1+4]%asi,P0_f1 812 813 fmuld P2_f22,P2_f24,P2_f24 814 add %i1,%i2,%i1 ! x += stridex 815 816 !!(vsin)fmuld P1_f10,P1_f14,P1_f14 817 818 fsubd P0_f6,P0_f4,P0_f6 819 820 !!(vsin)fmuld P2_f20,P2_f24,P2_f24 821 822 faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 823 824 fsubd P0_f2,P0_f6,P0_f6 825 826 faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 827 828 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 829 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 830 831 faddd P0_f6,%f32,P0_f6 832 addcc %i0,-1,%i0 833 834 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 835 bg,pt %icc,.loop0 836 837! delay slot 838 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 839 840 ba,pt %icc,.endloop0 841! 
delay slot 842 nop 843 844 .align 32 845.case4: 846 st P1_f17,[%o4+4] 847 cmp %l1,LIM_l5 848 fpadd32s P1_f10,MSK_BIT13,P1_f18 849 bl,pn %icc,.case6 850 851! delay slot 852 st P2_f26,[%o5] 853 cmp %l2,LIM_l5 854 fpadd32s P2_f20,MSK_BIT13,P2_f28 855 bl,pn %icc,.case5 856 857! delay slot 858 st P2_f27,[%o5+4] 859 sethi %hi(0x3fc3c000),%o7 860 fand P1_f18,MSK_BITSHI17,P1_f12 861 862 sub %l1,%o7,%l1 863 sub %l2,%o7,%l2 864 add SC_HI,8,%g1;add SC_LO,8,%o7 865 fand P2_f28,MSK_BITSHI17,P2_f22 866 fmuld P0_f0,P0_f0,P0_f2 867 868 fsubd P1_f10,P1_f12,P1_f10 869 srl %l1,10,%l1 870 mov %o1,%o4 871 872 fsubd P2_f20,P2_f22,P2_f20 873 srl %l2,10,%l2 874 mov %o2,%o5 875 876 fmovd P0_f0,P0_f6 !ID for processing 877 fmuld P0_f2,C_q4,P0_f4 878 mov %o0,%o3 879 880 fmuld P1_f10,P1_f10,P1_f12 881 andn %l1,0x1f,%l1 882 883 fmuld P2_f20,P2_f20,P2_f22 884 andn %l2,0x1f,%l2 885 886 faddd P0_f4,C_q3,P0_f4 887 888 fmuld P1_f12,C_pp2,P1_f16 889 ldd [%g1+%l1],%f36 890 891 fmuld P2_f22,C_pp2,P2_f26 892 ldd [%g1+%l2],%f40 893 894 fmuld P0_f2,P0_f4,P0_f4 895 896 faddd P1_f16,C_pp1,P1_f16 897 fmuld P1_f12,C_qq2,P1_f14 898 ldd [SC_HI+%l1],%f38 899 900 faddd P2_f26,C_pp1,P2_f26 901 fmuld P2_f22,C_qq2,P2_f24 902 ldd [SC_HI+%l2],%f42 903 904 faddd P0_f4,C_q2,P0_f4 905 906 fmuld P1_f12,P1_f16,P1_f16 907 faddd P1_f14,C_qq1,P1_f14 908 909 fmuld P2_f22,P2_f26,P2_f26 910 faddd P2_f24,C_qq1,P2_f24 911 912 fmuld P0_f2,P0_f4,P0_f4 913 914 faddd P1_f16,C_ONE,P1_f16 915 fmuld P1_f12,P1_f14,P1_f14 916 917 faddd P2_f26,C_ONE,P2_f26 918 fmuld P2_f22,P2_f24,P2_f24 919 920 faddd P0_f4,C_q1,P0_f4 921 922 fmuld P1_f10,P1_f16,P1_f16 923 ldd [%o7+%l1],P1_f12 924 925 fmuld P2_f20,P2_f26,P2_f26 926 ldd [%o7+%l2],P2_f22 927 928 fmuld P1_f14,%f36,P1_f14 929 lda [%i1]%asi,%l0 ! preload next argument 930 931 fmuld P2_f24,%f40,P2_f24 932 lda [%i1]%asi,P0_f0 933 934 fmuld P1_f16,%f38,P1_f16 935 lda [%i1+4]%asi,P0_f1 936 937 fmuld P2_f26,%f42,P2_f26 938 add %i1,%i2,%i1 ! 
x += stridex 939 940 fmuld P0_f2,P0_f4,P0_f4 941 942 fsubd P1_f16,P1_f14,P1_f16 943 944 fsubd P2_f26,P2_f24,P2_f26 945 946 !!(vsin)fmuld P0_f6,P0_f4,P0_f4 947 948 fsubd P1_f12,P1_f16,P1_f16 949 950 fsubd P2_f22,P2_f26,P2_f26 951 952 faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing 953 954 faddd P1_f16,%f36,P1_f16 955 956 faddd P2_f26,%f40,P2_f26 957 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 958 959 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 960 addcc %i0,-1,%i0 961 962 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 963 bg,pt %icc,.loop0 964 965! delay slot 966 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 967 968 ba,pt %icc,.endloop0 969! delay slot 970 nop 971 972 .align 32 973.case5: 974 sethi %hi(0x3fc3c000),%o7 975 fand P1_f18,MSK_BITSHI17,P1_f12 976 fmuld P0_f0,P0_f0,P0_f2 977 978 sub %l1,%o7,%l1 979 add SC_HI,8,%g1;add SC_LO,8,%o7 980 fmuld P2_f20,P2_f20,P2_f22 981 982 fsubd P1_f10,P1_f12,P1_f10 983 srl %l1,10,%l1 984 mov %o1,%o4 985 986 fmovd P0_f0,P0_f6 !ID for processing 987 fmuld P0_f2,C_q4,P0_f4 988 mov %o0,%o3 989 990 fmuld P2_f22,C_q4,P2_f24 991 mov %o2,%o5 992 993 fmuld P1_f10,P1_f10,P1_f12 994 andn %l1,0x1f,%l1 995 996 faddd P0_f4,C_q3,P0_f4 997 998 faddd P2_f24,C_q3,P2_f24 999 1000 fmuld P1_f12,C_pp2,P1_f16 1001 ldd [%g1+%l1],%f36 1002 1003 fmuld P0_f2,P0_f4,P0_f4 1004 1005 fmuld P2_f22,P2_f24,P2_f24 1006 1007 faddd P1_f16,C_pp1,P1_f16 1008 fmuld P1_f12,C_qq2,P1_f14 1009 ldd [SC_HI+%l1],%f38 1010 1011 faddd P0_f4,C_q2,P0_f4 1012 1013 faddd P2_f24,C_q2,P2_f24 1014 1015 fmuld P1_f12,P1_f16,P1_f16 1016 faddd P1_f14,C_qq1,P1_f14 1017 1018 fmuld P0_f2,P0_f4,P0_f4 1019 1020 fmuld P2_f22,P2_f24,P2_f24 1021 1022 faddd P1_f16,C_ONE,P1_f16 1023 fmuld P1_f12,P1_f14,P1_f14 1024 1025 faddd P0_f4,C_q1,P0_f4 1026 1027 faddd P2_f24,C_q1,P2_f24 1028 1029 fmuld P1_f10,P1_f16,P1_f16 1030 ldd [%o7+%l1],P1_f12 1031 1032 fmuld P1_f14,%f36,P1_f14 1033 lda [%i1]%asi,%l0 ! 
preload next argument 1034 1035 fmuld P0_f2,P0_f4,P0_f4 1036 lda [%i1]%asi,P0_f0 1037 1038 fmuld P1_f16,%f38,P1_f16 1039 lda [%i1+4]%asi,P0_f1 1040 1041 fmuld P2_f22,P2_f24,P2_f24 1042 add %i1,%i2,%i1 ! x += stridex 1043 1044 !!(vsin)fmuld P0_f6,P0_f4,P0_f4 1045 1046 fsubd P1_f16,P1_f14,P1_f16 1047 1048 !!(vsin)fmuld P2_f20,P2_f24,P2_f24 1049 1050 faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing 1051 1052 fsubd P1_f12,P1_f16,P1_f16 1053 1054 faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 1055 1056 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 1057 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 1058 1059 faddd P1_f16,%f36,P1_f16 1060 addcc %i0,-1,%i0 1061 1062 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 1063 bg,pt %icc,.loop0 1064 1065! delay slot 1066 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 1067 1068 ba,pt %icc,.endloop0 1069! delay slot 1070 nop 1071 1072 .align 32 1073.case6: 1074 st P2_f27,[%o5+4] 1075 cmp %l2,LIM_l5 1076 fpadd32s P2_f20,MSK_BIT13,P2_f28 1077 bl,pn %icc,.case7 1078 1079! 
delay slot 1080 sethi %hi(0x3fc3c000),%o7 1081 fand P2_f28,MSK_BITSHI17,P2_f22 1082 fmuld P0_f0,P0_f0,P0_f2 1083 1084 sub %l2,%o7,%l2 1085 add SC_HI,8,%g1;add SC_LO,8,%o7 1086 fmuld P1_f10,P1_f10,P1_f12 1087 1088 fsubd P2_f20,P2_f22,P2_f20 1089 srl %l2,10,%l2 1090 mov %o2,%o5 1091 1092 fmovd P0_f0,P0_f6 !ID for processing 1093 fmuld P0_f2,C_q4,P0_f4 1094 mov %o0,%o3 1095 1096 fmuld P1_f12,C_q4,P1_f14 1097 mov %o1,%o4 1098 1099 fmuld P2_f20,P2_f20,P2_f22 1100 andn %l2,0x1f,%l2 1101 1102 faddd P0_f4,C_q3,P0_f4 1103 1104 faddd P1_f14,C_q3,P1_f14 1105 1106 fmuld P2_f22,C_pp2,P2_f26 1107 ldd [%g1+%l2],%f40 1108 1109 fmuld P0_f2,P0_f4,P0_f4 1110 1111 fmuld P1_f12,P1_f14,P1_f14 1112 1113 faddd P2_f26,C_pp1,P2_f26 1114 fmuld P2_f22,C_qq2,P2_f24 1115 ldd [SC_HI+%l2],%f42 1116 1117 faddd P0_f4,C_q2,P0_f4 1118 1119 faddd P1_f14,C_q2,P1_f14 1120 1121 fmuld P2_f22,P2_f26,P2_f26 1122 faddd P2_f24,C_qq1,P2_f24 1123 1124 fmuld P0_f2,P0_f4,P0_f4 1125 1126 fmuld P1_f12,P1_f14,P1_f14 1127 1128 faddd P2_f26,C_ONE,P2_f26 1129 fmuld P2_f22,P2_f24,P2_f24 1130 1131 faddd P0_f4,C_q1,P0_f4 1132 1133 faddd P1_f14,C_q1,P1_f14 1134 1135 fmuld P2_f20,P2_f26,P2_f26 1136 ldd [%o7+%l2],P2_f22 1137 1138 fmuld P2_f24,%f40,P2_f24 1139 lda [%i1]%asi,%l0 ! preload next argument 1140 1141 fmuld P0_f2,P0_f4,P0_f4 1142 lda [%i1]%asi,P0_f0 1143 1144 fmuld P2_f26,%f42,P2_f26 1145 lda [%i1+4]%asi,P0_f1 1146 1147 fmuld P1_f12,P1_f14,P1_f14 1148 add %i1,%i2,%i1 ! x += stridex 1149 1150 !!(vsin)fmuld P0_f6,P0_f4,P0_f4 1151 1152 fsubd P2_f26,P2_f24,P2_f26 1153 1154 !!(vsin)fmuld P1_f10,P1_f14,P1_f14 1155 1156 faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing 1157 1158 fsubd P2_f22,P2_f26,P2_f26 1159 1160 faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 1161 1162 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 1163 andn %l0,MSK_SIGN,%l0 ! 
hx &= ~0x80000000 1164 1165 faddd P2_f26,%f40,P2_f26 1166 addcc %i0,-1,%i0 1167 1168 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 1169 bg,pt %icc,.loop0 1170 1171! delay slot 1172 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 1173 1174 ba,pt %icc,.endloop0 1175! delay slot 1176 nop 1177 1178 .align 32 1179.case7: 1180 fmuld P0_f0,P0_f0,P0_f2 1181 fmovd P0_f0,P0_f6 !ID for processing 1182 mov %o0,%o3 1183 1184 fmuld P1_f10,P1_f10,P1_f12 1185 mov %o1,%o4 1186 1187 fmuld P2_f20,P2_f20,P2_f22 1188 mov %o2,%o5 1189 1190 fmuld P0_f2,C_q4,P0_f4 1191 lda [%i1]%asi,%l0 ! preload next argument 1192 1193 fmuld P1_f12,C_q4,P1_f14 1194 lda [%i1]%asi,P0_f0 1195 1196 fmuld P2_f22,C_q4,P2_f24 1197 lda [%i1+4]%asi,P0_f1 1198 1199 faddd P0_f4,C_q3,P0_f4 1200 add %i1,%i2,%i1 ! x += stridex 1201 1202 faddd P1_f14,C_q3,P1_f14 1203 1204 faddd P2_f24,C_q3,P2_f24 1205 1206 fmuld P0_f2,P0_f4,P0_f4 1207 1208 fmuld P1_f12,P1_f14,P1_f14 1209 1210 fmuld P2_f22,P2_f24,P2_f24 1211 1212 faddd P0_f4,C_q2,P0_f4 1213 1214 faddd P1_f14,C_q2,P1_f14 1215 1216 faddd P2_f24,C_q2,P2_f24 1217 1218 fmuld P0_f2,P0_f4,P0_f4 1219 1220 fmuld P1_f12,P1_f14,P1_f14 1221 1222 fmuld P2_f22,P2_f24,P2_f24 1223 1224 faddd P0_f4,C_q1,P0_f4 1225 1226 faddd P1_f14,C_q1,P1_f14 1227 1228 faddd P2_f24,C_q1,P2_f24 1229 1230 fmuld P0_f2,P0_f4,P0_f4 1231 1232 fmuld P1_f12,P1_f14,P1_f14 1233 1234 fmuld P2_f22,P2_f24,P2_f24 1235 1236 !!(vsin)fmuld P0_f6,P0_f4,P0_f4 1237 1238 !!(vsin)fmuld P1_f10,P1_f14,P1_f14 1239 1240 !!(vsin)fmuld P2_f20,P2_f24,P2_f24 1241 1242 faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing 1243 1244 faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 1245 1246 faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 1247 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 1248 1249 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 1250 addcc %i0,-1,%i0 1251 1252 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 1253 bg,pt %icc,.loop0 1254 1255! 
delay slot 1256 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 1257 1258 ba,pt %icc,.endloop0 1259! delay slot 1260 nop 1261 1262 1263 .align 32 1264.endloop2: 1265 cmp %l1,LIM_l5 1266 bl,pn %icc,1f 1267! delay slot 1268 fabsd P1_f10,P1_f10 1269 sethi %hi(0x3fc3c000),%o7 1270 fpadd32s P1_f10,MSK_BIT13,P1_f18 1271 fand P1_f18,MSK_BITSHI17,P1_f12 1272 sub %l1,%o7,%l1 1273 add SC_HI,8,%g1;add SC_LO,8,%o7 1274 fsubd P1_f10,P1_f12,P1_f10 1275 srl %l1,10,%l1 1276 fmuld P1_f10,P1_f10,P1_f12 1277 andn %l1,0x1f,%l1 1278 fmuld P1_f12,C_pp2,P2_f20 1279 ldd [%g1+%l1],%f36 1280 faddd P2_f20,C_pp1,P2_f20 1281 fmuld P1_f12,C_qq2,P1_f14 1282 ldd [SC_HI+%l1],%f38 1283 fmuld P1_f12,P2_f20,P2_f20 1284 faddd P1_f14,C_qq1,P1_f14 1285 faddd P2_f20,C_ONE,P2_f20 1286 fmuld P1_f12,P1_f14,P1_f14 1287 fmuld P1_f10,P2_f20,P2_f20 1288 ldd [%o7+%l1],P1_f12 1289 fmuld P1_f14,%f36,P1_f14 1290 fmuld P2_f20,%f38,P2_f20 1291 fsubd P2_f20,P1_f14,P2_f20 1292 fsubd P1_f12,P2_f20,P2_f20 1293 ba,pt %icc,2f 1294! delay slot 1295 faddd P2_f20,%f36,P2_f20 12961: 1297 fmuld P1_f10,P1_f10,P1_f12 1298 fmuld P1_f12,C_q4,P1_f14 1299 faddd P1_f14,C_q3,P1_f14 1300 fmuld P1_f12,P1_f14,P1_f14 1301 faddd P1_f14,C_q2,P1_f14 1302 fmuld P1_f12,P1_f14,P1_f14 1303 faddd P1_f14,C_q1,P1_f14 1304 fmuld P1_f12,P1_f14,P1_f14 1305 !!(vsin)fmuld P1_f10,P1_f14,P1_f14 1306 faddd C_ONE,P1_f14,P2_f20 !!(vsin)faddd P1_f10,P1_f14,P2_f20 13072: 1308 nop !!(vsin) fors P2_f20,P1_f19,P2_f20 1309 st P2_f20,[%o1] 1310 st P2_f21,[%o1+4] 1311 1312.endloop1: 1313 cmp %l0,LIM_l5 1314 bl,pn %icc,1f 1315! 
delay slot 1316 fabsd P0_f0,P0_f0 1317 sethi %hi(0x3fc3c000),%o7 1318 fpadd32s P0_f0,MSK_BIT13,P0_f8 1319 fand P0_f8,MSK_BITSHI17,P0_f2 1320 sub %l0,%o7,%l0 1321 add SC_HI,8,%g1;add SC_LO,8,%o7 1322 fsubd P0_f0,P0_f2,P0_f0 1323 srl %l0,10,%l0 1324 fmuld P0_f0,P0_f0,P0_f2 1325 andn %l0,0x1f,%l0 1326 fmuld P0_f2,C_pp2,P2_f20 1327 ldd [%g1+%l0],%f32 1328 faddd P2_f20,C_pp1,P2_f20 1329 fmuld P0_f2,C_qq2,P0_f4 1330 ldd [SC_HI+%l0],%f34 1331 fmuld P0_f2,P2_f20,P2_f20 1332 faddd P0_f4,C_qq1,P0_f4 1333 faddd P2_f20,C_ONE,P2_f20 1334 fmuld P0_f2,P0_f4,P0_f4 1335 fmuld P0_f0,P2_f20,P2_f20 1336 ldd [%o7+%l0],P0_f2 1337 fmuld P0_f4,%f32,P0_f4 1338 fmuld P2_f20,%f34,P2_f20 1339 fsubd P2_f20,P0_f4,P2_f20 1340 fsubd P0_f2,P2_f20,P2_f20 1341 ba,pt %icc,2f 1342! delay slot 1343 faddd P2_f20,%f32,P2_f20 13441: 1345 fmuld P0_f0,P0_f0,P0_f2 1346 fmuld P0_f2,C_q4,P0_f4 1347 faddd P0_f4,C_q3,P0_f4 1348 fmuld P0_f2,P0_f4,P0_f4 1349 faddd P0_f4,C_q2,P0_f4 1350 fmuld P0_f2,P0_f4,P0_f4 1351 faddd P0_f4,C_q1,P0_f4 1352 fmuld P0_f2,P0_f4,P0_f4 1353 !!(vsin)fmuld P0_f0,P0_f4,P0_f4 1354 faddd C_ONE,P0_f4,P2_f20 !!(vsin)faddd P0_f0,P0_f4,P2_f20 13552: 1356 nop !!(vsin) fors P2_f20,P0_f9,P2_f20 1357 st P2_f20,[%o0] 1358 st P2_f21,[%o0+4] 1359 1360.endloop0: 1361 st P0_f6,[%o3] 1362 st P0_f7,[%o3+4] 1363 st P1_f16,[%o4] 1364 st P1_f17,[%o4+4] 1365 st P2_f26,[%o5] 1366 st P2_f27,[%o5+4] 1367 1368! return. finished off with only primary range arguments 1369 1370 ret 1371 restore 1372 1373 1374 .align 32 1375.range0: 1376 cmp %l0,LIM_l6 1377 bg,a,pt %icc,.MEDIUM ! branch to Medium range on big arg. 1378! delay slot, annulled if branch not taken 1379 mov 0x1,LIM_l6 ! set biguns flag or 1380 fdtoi P0_f0,P0_f2; fmovd C_ONE,P0_f0 ; st P0_f0,[%o0] ! *y = *x with inexact if x nonzero 1381 st P0_f1,[%o0+4] 1382 !nop ! (vsin) fdtoi P0_f0,P0_f2 1383 addcc %i0,-1,%i0 1384 ble,pn %icc,.endloop0 1385! delay slot, harmless if branch taken 1386 add %i3,%i4,%i3 ! y += stridey 1387 andn %l1,MSK_SIGN,%l0 ! 
hx &= ~0x80000000 1388 fmovd P1_f10,P0_f0 1389 ba,pt %icc,.loop0 1390! delay slot 1391 add %i1,%i2,%i1 ! x += stridex 1392 1393 1394 .align 32 1395.range1: 1396 cmp %l1,LIM_l6 1397 bg,a,pt %icc,.MEDIUM ! branch to Medium range on big arg. 1398! delay slot, annulled if branch not taken 1399 mov 0x2,LIM_l6 ! set biguns flag or 1400 fdtoi P1_f10,P1_f12; fmovd C_ONE,P1_f10 ; st P1_f10,[%o1] ! *y = *x with inexact if x nonzero 1401 st P1_f11,[%o1+4] 1402 !nop ! (vsin) fdtoi P1_f10,P1_f12 1403 addcc %i0,-1,%i0 1404 ble,pn %icc,.endloop1 1405! delay slot, harmless if branch taken 1406 add %i3,%i4,%i3 ! y += stridey 1407 andn %l2,MSK_SIGN,%l1 ! hx &= ~0x80000000 1408 fmovd P2_f20,P1_f10 1409 ba,pt %icc,.loop1 1410! delay slot 1411 add %i1,%i2,%i1 ! x += stridex 1412 1413 1414 .align 32 1415.range2: 1416 cmp %l2,LIM_l6 1417 bg,a,pt %icc,.MEDIUM ! brance to Medium range on big arg. 1418! delay slot, annulled if branch not taken 1419 mov 0x3,LIM_l6 ! set biguns flag or 1420 fdtoi P2_f20,P2_f22; fmovd C_ONE,P2_f20 ; st P2_f20,[%o2] ! *y = *x with inexact if x nonzero 1421 st P2_f21,[%o2+4] 1422 nop ! (vsin) fdtoi P2_f20,P2_f22 14231: 1424 addcc %i0,-1,%i0 1425 ble,pn %icc,.endloop2 1426! delay slot 1427 nop 1428 ld [%i1],%l2 1429 ld [%i1],P2_f20 1430 ld [%i1+4],P2_f21 1431 andn %l2,MSK_SIGN,%l2 ! hx &= ~0x80000000 1432 ba,pt %icc,.loop2 1433! delay slot 1434 add %i1,%i2,%i1 ! x += stridex 1435 1436 1437 .align 32 1438.MEDIUM: 1439 1440! ========== medium range ========== 1441 1442! register use 1443 1444! i0 n 1445! i1 x 1446! i2 stridex 1447! i3 y 1448! i4 stridey 1449! i5 0x80000000 1450 1451! l0 hx0 1452! l1 hx1 1453! l2 hx2 1454! l3 __vlibm_TBL_sincos_hi 1455! l4 __vlibm_TBL_sincos_lo 1456! l5 constants 1457! l6 biguns stored here : still called LIM_l6 1458! l7 0x413921fb 1459 1460! the following are 64-bit registers in both V8+ and V9 1461 1462! g1 scratch 1463! g5 1464 1465! o0 py0 1466! o1 py1 1467! o2 py2 1468! o3 n0 1469! o4 n1 1470! o5 n2 1471! o7 scratch 1472 1473! 
f0 x0 1474! f2 n0,y0 1475! f4 1476! f6 1477! f8 scratch for table base 1478! f9 signbit0 1479! f10 x1 1480! f12 n1,y1 1481! f14 1482! f16 1483! f18 scratch for table base 1484! f19 signbit1 1485! f20 x2 1486! f22 n2,y2 1487! f24 1488! f26 1489! f28 scratch for table base 1490! f29 signbit2 1491! f30 0x80000000 1492! f31 0x4000 1493! f32 1494! f34 1495! f36 1496! f38 1497! f40 invpio2 1498! f42 round 1499! f44 0xffff800000000000 1500! f46 pio2_1 1501! f48 pio2_2 1502! f50 pio2_3 1503! f52 pio2_3t 1504! f54 one 1505! f56 pp1 1506! f58 pp2 1507! f60 qq1 1508! f62 qq2 1509 1510 1511 PIC_SET(g5,constants,l5) 1512 1513 ! %o3,%o4,%o5 need to be stored 1514 st P0_f6,[%o3] 1515 sethi %hi(0x413921fb),%l7 1516 st P0_f7,[%o3+4] 1517 or %l7,%lo(0x413921fb),%l7 1518 st P1_f16,[%o4] 1519 st P1_f17,[%o4+4] 1520 st P2_f26,[%o5] 1521 st P2_f27,[%o5+4] 1522 ldd [%l5+invpio2],%f40 1523 ldd [%l5+round],%f42 1524 ldd [%l5+pio2_1],%f46 1525 ldd [%l5+pio2_2],%f48 1526 ldd [%l5+pio2_3],%f50 1527 ldd [%l5+pio2_3t],%f52 1528 std %f54,[%fp+x0_1+8] ! set up stack data 1529 std %f54,[%fp+x1_1+8] 1530 std %f54,[%fp+x2_1+8] 1531 stx %g0,[%fp+y0_0+8] 1532 stx %g0,[%fp+y1_0+8] 1533 stx %g0,[%fp+y2_0+8] 1534 1535! branched here in the middle of the array. Need to adjust 1536! for the members of the triple that were selected in the primary 1537! loop. 1538 1539! no adjustment since all three selected here 1540 subcc LIM_l6,0x1,%g0 ! continue in LOOP0? 1541 bz,a %icc,.LOOP0 1542 mov 0x0,LIM_l6 ! delay slot set biguns=0 1543 1544! adjust 1st triple since 2nd and 3rd done here 1545 subcc LIM_l6,0x2,%g0 ! continue in LOOP1? 1546 fmuld %f0,%f40,%f2 ! adj LOOP0 1547 bz,a %icc,.LOOP1 1548 mov 0x0,LIM_l6 ! delay slot set biguns=0 1549 1550! adjust 1st and 2nd triple since 3rd done here 1551 subcc LIM_l6,0x3,%g0 ! continue in LOOP2? 1552 !done fmuld %f0,%f40,%f2 ! adj LOOP0 1553 sub %i3,%i4,%i3 ! adjust to not double increment 1554 fmuld %f10,%f40,%f12 ! adj LOOP1 1555 faddd %f2,%f42,%f2 !
adj LOOP1 1556 bz,a %icc,.LOOP2 1557 mov 0x0,LIM_l6 ! delay slot set biguns=0 1558 1559 ba .LOOP0 1560 nop 1561 1562! -- 16 byte aligned 1563 1564 .align 32 1565.LOOP0: 1566 lda [%i1]%asi,%l1 ! preload next argument 1567 mov %i3,%o0 ! py0 = y 1568 1569 lda [%i1]%asi,%f10 1570 cmp %l0,%l7 1571 add %i3,%i4,%i3 ! y += stridey 1572 bg,pn %icc,.BIG0 ! if hx > 0x413921fb 1573 1574! delay slot 1575 lda [%i1+4]%asi,%f11 1576 addcc %i0,-1,%i0 1577 add %i1,%i2,%i1 ! x += stridex 1578 ble,pn %icc,.ENDLOOP1 1579 1580! delay slot 1581 andn %l1,%i5,%l1 1582 nop 1583 fmuld %f0,%f40,%f2 1584 fabsd %f54,%f54 ! a nop for alignment only 1585 1586.LOOP1: 1587 lda [%i1]%asi,%l2 ! preload next argument 1588 mov %i3,%o1 ! py1 = y 1589 1590 lda [%i1]%asi,%f20 1591 cmp %l1,%l7 1592 add %i3,%i4,%i3 ! y += stridey 1593 bg,pn %icc,.BIG1 ! if hx > 0x413921fb 1594 1595! delay slot 1596 lda [%i1+4]%asi,%f21 1597 addcc %i0,-1,%i0 1598 add %i1,%i2,%i1 ! x += stridex 1599 ble,pn %icc,.ENDLOOP2 1600 1601! delay slot 1602 andn %l2,%i5,%l2 1603 nop 1604 fmuld %f10,%f40,%f12 1605 faddd %f2,%f42,%f2 1606 1607.LOOP2: 1608 st %f3,[%fp+n0] 1609 mov %i3,%o2 ! py2 = y 1610 1611 cmp %l2,%l7 1612 add %i3,%i4,%i3 ! y += stridey 1613 fmuld %f20,%f40,%f22 1614 bg,pn %icc,.BIG2 ! if hx > 0x413921fb 1615 1616! delay slot 1617 add %l5,thresh+4,%o7 1618 faddd %f12,%f42,%f12 1619 st %f13,[%fp+n1] 1620 1621! - 1622 1623 add %l5,thresh,%g1 1624 faddd %f22,%f42,%f22 1625 st %f23,[%fp+n2] 1626 1627 fsubd %f2,%f42,%f2 ! n 1628 1629 fsubd %f12,%f42,%f12 ! n 1630 1631 fsubd %f22,%f42,%f22 ! 
n 1632 1633 fmuld %f2,%f46,%f4 1634 1635 fmuld %f12,%f46,%f14 1636 1637 fmuld %f22,%f46,%f24 1638 1639 fsubd %f0,%f4,%f4 1640 fmuld %f2,%f48,%f6 1641 1642 fsubd %f10,%f14,%f14 1643 fmuld %f12,%f48,%f16 1644 1645 fsubd %f20,%f24,%f24 1646 fmuld %f22,%f48,%f26 1647 1648 fsubd %f4,%f6,%f0 1649 ld [%fp+n0],%o3 ; add %o3,1,%o3 1650 1651 fsubd %f14,%f16,%f10 1652 ld [%fp+n1],%o4 ; add %o4,1,%o4 1653 1654 fsubd %f24,%f26,%f20 1655 ld [%fp+n2],%o5 ; add %o5,1,%o5 1656 1657 fsubd %f4,%f0,%f32 1658 and %o3,1,%o3 1659 1660 fsubd %f14,%f10,%f34 1661 and %o4,1,%o4 1662 1663 fsubd %f24,%f20,%f36 1664 and %o5,1,%o5 1665 1666 fsubd %f32,%f6,%f32 1667 fmuld %f2,%f50,%f8 1668 sll %o3,3,%o3 1669 1670 fsubd %f34,%f16,%f34 1671 fmuld %f12,%f50,%f18 1672 sll %o4,3,%o4 1673 1674 fsubd %f36,%f26,%f36 1675 fmuld %f22,%f50,%f28 1676 sll %o5,3,%o5 1677 1678 fsubd %f8,%f32,%f8 1679 ld [%g1+%o3],%f6 1680 1681 fsubd %f18,%f34,%f18 1682 ld [%g1+%o4],%f16 1683 1684 fsubd %f28,%f36,%f28 1685 ld [%g1+%o5],%f26 1686 1687 fsubd %f0,%f8,%f4 1688 1689 fsubd %f10,%f18,%f14 1690 1691 fsubd %f20,%f28,%f24 1692 1693 fsubd %f0,%f4,%f32 1694 1695 fsubd %f10,%f14,%f34 1696 1697 fsubd %f20,%f24,%f36 1698 1699 fsubd %f32,%f8,%f32 1700 fmuld %f2,%f52,%f2 1701 1702 fsubd %f34,%f18,%f34 1703 fmuld %f12,%f52,%f12 1704 1705 fsubd %f36,%f28,%f36 1706 fmuld %f22,%f52,%f22 1707 1708 fsubd %f2,%f32,%f2 1709 ld [%o7+%o3],%f8 1710 1711 fsubd %f12,%f34,%f12 1712 ld [%o7+%o4],%f18 1713 1714 fsubd %f22,%f36,%f22 1715 ld [%o7+%o5],%f28 1716 1717 fsubd %f4,%f2,%f0 ! x 1718 1719 fsubd %f14,%f12,%f10 ! x 1720 1721 fsubd %f24,%f22,%f20 ! x 1722 1723 fsubd %f4,%f0,%f4 1724 1725 fsubd %f14,%f10,%f14 1726 1727 fsubd %f24,%f20,%f24 1728 1729 fands %f0,%f30,%f9 ! save signbit 1730 1731 fands %f10,%f30,%f19 ! save signbit 1732 1733 fands %f20,%f30,%f29 ! 
save signbit 1734 1735 fabsd %f0,%f0 1736 std %f0,[%fp+x0_1] 1737 1738 fabsd %f10,%f10 1739 std %f10,[%fp+x1_1] 1740 1741 fabsd %f20,%f20 1742 std %f20,[%fp+x2_1] 1743 1744 fsubd %f4,%f2,%f2 ! y 1745 1746 fsubd %f14,%f12,%f12 ! y 1747 1748 fsubd %f24,%f22,%f22 ! y 1749 1750 fcmpgt32 %f6,%f0,%l0 1751 1752 fcmpgt32 %f16,%f10,%l1 1753 1754 fcmpgt32 %f26,%f20,%l2 1755 1756! -- 16 byte aligned 1757 fxors %f2,%f9,%f2 1758 1759 fxors %f12,%f19,%f12 1760 1761 fxors %f22,%f29,%f22 1762 1763 fands %f9,%f8,%f9 ! if (n & 1) clear sign bit 1764 andcc %l0,2,%g0 1765 bne,pn %icc,.CASE4 1766 1767! delay slot 1768 fands %f19,%f18,%f19 ! if (n & 1) clear sign bit 1769 andcc %l1,2,%g0 1770 bne,pn %icc,.CASE2 1771 1772! delay slot 1773 fands %f29,%f28,%f29 ! if (n & 1) clear sign bit 1774 andcc %l2,2,%g0 1775 bne,pn %icc,.CASE1 1776 1777! delay slot 1778 fpadd32s %f0,%f31,%f8 1779 sethi %hi(0x3fc3c000),%o7 1780 ld [%fp+x0_1],%l0 1781 1782 fpadd32s %f10,%f31,%f18 1783 add %l3,8,%g1 1784 ld [%fp+x1_1],%l1 1785 1786 fpadd32s %f20,%f31,%f28 1787 ld [%fp+x2_1],%l2 1788 1789 fand %f8,%f44,%f4 1790 sub %l0,%o7,%l0 1791 1792 fand %f18,%f44,%f14 1793 sub %l1,%o7,%l1 1794 1795 fand %f28,%f44,%f24 1796 sub %l2,%o7,%l2 1797 1798 fsubd %f0,%f4,%f0 1799 srl %l0,10,%l0 1800 1801 fsubd %f10,%f14,%f10 1802 srl %l1,10,%l1 1803 1804 fsubd %f20,%f24,%f20 1805 srl %l2,10,%l2 1806 1807 faddd %f0,%f2,%f0 1808 andn %l0,0x1f,%l0 1809 1810 faddd %f10,%f12,%f10 1811 andn %l1,0x1f,%l1 1812 1813 faddd %f20,%f22,%f20 1814 andn %l2,0x1f,%l2 1815 1816 fmuld %f0,%f0,%f2 1817 add %l0,%o3,%l0 1818 1819 fmuld %f10,%f10,%f12 1820 add %l1,%o4,%l1 1821 1822 fmuld %f20,%f20,%f22 1823 add %l2,%o5,%l2 1824 1825 fmuld %f2,%f58,%f6 1826 ldd [%l3+%l0],%f32 1827 1828 fmuld %f12,%f58,%f16 1829 ldd [%l3+%l1],%f34 1830 1831 fmuld %f22,%f58,%f26 1832 ldd [%l3+%l2],%f36 1833 1834 faddd %f6,%f56,%f6 1835 fmuld %f2,%f62,%f4 1836 1837 faddd %f16,%f56,%f16 1838 fmuld %f12,%f62,%f14 1839 1840 faddd %f26,%f56,%f26 1841 fmuld %f22,%f62,%f24 
1842 1843 fmuld %f2,%f6,%f6 1844 faddd %f4,%f60,%f4 1845 1846 fmuld %f12,%f16,%f16 1847 faddd %f14,%f60,%f14 1848 1849 fmuld %f22,%f26,%f26 1850 faddd %f24,%f60,%f24 1851 1852 faddd %f6,%f54,%f6 1853 fmuld %f2,%f4,%f4 1854 1855 faddd %f16,%f54,%f16 1856 fmuld %f12,%f14,%f14 1857 1858 faddd %f26,%f54,%f26 1859 fmuld %f22,%f24,%f24 1860 1861 fmuld %f0,%f6,%f6 1862 ldd [%g1+%l0],%f2 1863 1864 fmuld %f10,%f16,%f16 1865 ldd [%g1+%l1],%f12 1866 1867 fmuld %f20,%f26,%f26 1868 ldd [%g1+%l2],%f22 1869 1870 fmuld %f4,%f32,%f4 1871 ldd [%l4+%l0],%f0 1872 1873 fmuld %f14,%f34,%f14 1874 ldd [%l4+%l1],%f10 1875 1876 fmuld %f24,%f36,%f24 1877 ldd [%l4+%l2],%f20 1878 1879 fmuld %f6,%f2,%f6 1880 1881 fmuld %f16,%f12,%f16 1882 1883 fmuld %f26,%f22,%f26 1884 1885 faddd %f6,%f4,%f6 1886 1887 faddd %f16,%f14,%f16 1888 1889 faddd %f26,%f24,%f26 1890 1891 faddd %f6,%f0,%f6 1892 1893 faddd %f16,%f10,%f16 1894 1895 faddd %f26,%f20,%f26 1896 1897 faddd %f6,%f32,%f6 1898 1899 faddd %f16,%f34,%f16 1900 1901 faddd %f26,%f36,%f26 1902 1903.FIXSIGN: 1904 ld [%fp+n0],%o3 ; add %o3,1,%o3 1905 add %l5,thresh-4,%g1 1906 1907 ld [%fp+n1],%o4 ; add %o4,1,%o4 1908 1909 ld [%fp+n2],%o5 ; add %o5,1,%o5 1910 and %o3,2,%o3 1911 1912 sll %o3,2,%o3 1913 and %o4,2,%o4 1914 lda [%i1]%asi,%l0 ! preload next argument 1915 1916 sll %o4,2,%o4 1917 and %o5,2,%o5 1918 ld [%g1+%o3],%f8 1919 1920 sll %o5,2,%o5 1921 ld [%g1+%o4],%f18 1922 1923 ld [%g1+%o5],%f28 1924 fxors %f9,%f8,%f9 1925 1926 lda [%i1]%asi,%f0 1927 fxors %f29,%f28,%f29 1928 1929 lda [%i1+4]%asi,%f1 1930 fxors %f19,%f18,%f19 1931 1932 fors %f6,%f9,%f6 ! tack on sign 1933 add %i1,%i2,%i1 ! x += stridex 1934 st %f6,[%o0] 1935 1936 fors %f26,%f29,%f26 ! tack on sign 1937 st %f7,[%o0+4] 1938 1939 fors %f16,%f19,%f16 ! tack on sign 1940 st %f26,[%o2] 1941 1942 st %f27,[%o2+4] 1943 addcc %i0,-1,%i0 1944 1945 st %f16,[%o1] 1946 andn %l0,%i5,%l0 ! hx &= ~0x80000000 1947 bg,pt %icc,.LOOP0 1948 1949! 
delay slot 1950 st %f17,[%o1+4] 1951 1952 ba,pt %icc,.ENDLOOP0 1953! delay slot 1954 nop 1955 1956 .align 32 1957.CASE1: 1958 fpadd32s %f10,%f31,%f18 1959 sethi %hi(0x3fc3c000),%o7 1960 ld [%fp+x0_1],%l0 1961 1962 fand %f8,%f44,%f4 1963 add %l3,8,%g1 1964 ld [%fp+x1_1],%l1 1965 1966 fand %f18,%f44,%f14 1967 sub %l0,%o7,%l0 1968 1969 fsubd %f0,%f4,%f0 1970 srl %l0,10,%l0 1971 sub %l1,%o7,%l1 1972 1973 fsubd %f10,%f14,%f10 1974 srl %l1,10,%l1 1975 1976 fmuld %f20,%f20,%f20 1977 ldd [%l5+%o5],%f36 1978 add %l5,%o5,%l2 1979 1980 faddd %f0,%f2,%f0 1981 andn %l0,0x1f,%l0 1982 1983 faddd %f10,%f12,%f10 1984 andn %l1,0x1f,%l1 1985 1986 fmuld %f20,%f36,%f24 1987 ldd [%l2+0x10],%f26 1988 add %fp,%o5,%o5 1989 1990 fmuld %f0,%f0,%f2 1991 add %l0,%o3,%l0 1992 1993 fmuld %f10,%f10,%f12 1994 add %l1,%o4,%l1 1995 1996 faddd %f24,%f26,%f24 1997 ldd [%l2+0x20],%f36 1998 1999 fmuld %f2,%f58,%f6 2000 ldd [%l3+%l0],%f32 2001 2002 fmuld %f12,%f58,%f16 2003 ldd [%l3+%l1],%f34 2004 2005 fmuld %f20,%f24,%f24 2006 ldd [%l2+0x30],%f26 2007 2008 faddd %f6,%f56,%f6 2009 fmuld %f2,%f62,%f4 2010 2011 faddd %f16,%f56,%f16 2012 fmuld %f12,%f62,%f14 2013 2014 faddd %f24,%f36,%f24 2015 ldd [%o5+x2_1],%f36 2016 2017 fmuld %f2,%f6,%f6 2018 faddd %f4,%f60,%f4 2019 2020 fmuld %f12,%f16,%f16 2021 faddd %f14,%f60,%f14 2022 2023 fmuld %f20,%f24,%f24 2024 2025 faddd %f6,%f54,%f6 2026 fmuld %f2,%f4,%f4 2027 ldd [%g1+%l0],%f2 2028 2029 faddd %f16,%f54,%f16 2030 fmuld %f12,%f14,%f14 2031 ldd [%g1+%l1],%f12 2032 2033 faddd %f24,%f26,%f24 2034 2035 fmuld %f0,%f6,%f6 2036 ldd [%l4+%l0],%f0 2037 2038 fmuld %f10,%f16,%f16 2039 ldd [%l4+%l1],%f10 2040 2041 fmuld %f4,%f32,%f4 2042 std %f22,[%fp+y2_0] 2043 2044 fmuld %f14,%f34,%f14 2045 2046 fmuld %f6,%f2,%f6 2047 2048 fmuld %f16,%f12,%f16 2049 2050 fmuld %f20,%f24,%f24 2051 2052 faddd %f6,%f4,%f6 2053 2054 faddd %f16,%f14,%f16 2055 2056 fmuld %f36,%f24,%f24 2057 ldd [%o5+y2_0],%f22 2058 2059 faddd %f6,%f0,%f6 2060 2061 faddd %f16,%f10,%f16 2062 2063 faddd 
%f24,%f22,%f24 2064 2065 faddd %f6,%f32,%f6 2066 2067 faddd %f16,%f34,%f16 2068 ba,pt %icc,.FIXSIGN 2069 2070! delay slot 2071 faddd %f36,%f24,%f26 2072 2073 .align 32 2074.CASE2: 2075 fpadd32s %f0,%f31,%f8 2076 ld [%fp+x0_1],%l0 2077 andcc %l2,2,%g0 2078 bne,pn %icc,.CASE3 2079 2080! delay slot 2081 sethi %hi(0x3fc3c000),%o7 2082 fpadd32s %f20,%f31,%f28 2083 ld [%fp+x2_1],%l2 2084 2085 fand %f8,%f44,%f4 2086 sub %l0,%o7,%l0 2087 add %l3,8,%g1 2088 2089 fand %f28,%f44,%f24 2090 sub %l2,%o7,%l2 2091 2092 fsubd %f0,%f4,%f0 2093 srl %l0,10,%l0 2094 2095 fsubd %f20,%f24,%f20 2096 srl %l2,10,%l2 2097 2098 fmuld %f10,%f10,%f10 2099 ldd [%l5+%o4],%f34 2100 add %l5,%o4,%l1 2101 2102 faddd %f0,%f2,%f0 2103 andn %l0,0x1f,%l0 2104 2105 faddd %f20,%f22,%f20 2106 andn %l2,0x1f,%l2 2107 2108 fmuld %f10,%f34,%f14 2109 ldd [%l1+0x10],%f16 2110 add %fp,%o4,%o4 2111 2112 fmuld %f0,%f0,%f2 2113 add %l0,%o3,%l0 2114 2115 fmuld %f20,%f20,%f22 2116 add %l2,%o5,%l2 2117 2118 faddd %f14,%f16,%f14 2119 ldd [%l1+0x20],%f34 2120 2121 fmuld %f2,%f58,%f6 2122 ldd [%l3+%l0],%f32 2123 2124 fmuld %f22,%f58,%f26 2125 ldd [%l3+%l2],%f36 2126 2127 fmuld %f10,%f14,%f14 2128 ldd [%l1+0x30],%f16 2129 2130 faddd %f6,%f56,%f6 2131 fmuld %f2,%f62,%f4 2132 2133 faddd %f26,%f56,%f26 2134 fmuld %f22,%f62,%f24 2135 2136 faddd %f14,%f34,%f14 2137 ldd [%o4+x1_1],%f34 2138 2139 fmuld %f2,%f6,%f6 2140 faddd %f4,%f60,%f4 2141 2142 fmuld %f22,%f26,%f26 2143 faddd %f24,%f60,%f24 2144 2145 fmuld %f10,%f14,%f14 2146 2147 faddd %f6,%f54,%f6 2148 fmuld %f2,%f4,%f4 2149 ldd [%g1+%l0],%f2 2150 2151 faddd %f26,%f54,%f26 2152 fmuld %f22,%f24,%f24 2153 ldd [%g1+%l2],%f22 2154 2155 faddd %f14,%f16,%f14 2156 2157 fmuld %f0,%f6,%f6 2158 ldd [%l4+%l0],%f0 2159 2160 fmuld %f20,%f26,%f26 2161 ldd [%l4+%l2],%f20 2162 2163 fmuld %f4,%f32,%f4 2164 std %f12,[%fp+y1_0] 2165 2166 fmuld %f24,%f36,%f24 2167 2168 fmuld %f6,%f2,%f6 2169 2170 fmuld %f26,%f22,%f26 2171 2172 fmuld %f10,%f14,%f14 2173 2174 faddd %f6,%f4,%f6 2175 2176 faddd 
%f26,%f24,%f26 2177 2178 fmuld %f34,%f14,%f14 2179 ldd [%o4+y1_0],%f12 2180 2181 faddd %f6,%f0,%f6 2182 2183 faddd %f26,%f20,%f26 2184 2185 faddd %f14,%f12,%f14 2186 2187 faddd %f6,%f32,%f6 2188 2189 faddd %f26,%f36,%f26 2190 ba,pt %icc,.FIXSIGN 2191 2192! delay slot 2193 faddd %f34,%f14,%f16 2194 2195 .align 32 2196.CASE3: 2197 fand %f8,%f44,%f4 2198 add %l3,8,%g1 2199 sub %l0,%o7,%l0 2200 2201 fmuld %f10,%f10,%f10 2202 ldd [%l5+%o4],%f34 2203 add %l5,%o4,%l1 2204 2205 fsubd %f0,%f4,%f0 2206 srl %l0,10,%l0 2207 2208 fmuld %f20,%f20,%f20 2209 ldd [%l5+%o5],%f36 2210 add %l5,%o5,%l2 2211 2212 fmuld %f10,%f34,%f14 2213 ldd [%l1+0x10],%f16 2214 add %fp,%o4,%o4 2215 2216 faddd %f0,%f2,%f0 2217 andn %l0,0x1f,%l0 2218 2219 fmuld %f20,%f36,%f24 2220 ldd [%l2+0x10],%f26 2221 add %fp,%o5,%o5 2222 2223 faddd %f14,%f16,%f14 2224 ldd [%l1+0x20],%f34 2225 2226 fmuld %f0,%f0,%f2 2227 add %l0,%o3,%l0 2228 2229 faddd %f24,%f26,%f24 2230 ldd [%l2+0x20],%f36 2231 2232 fmuld %f10,%f14,%f14 2233 ldd [%l1+0x30],%f16 2234 2235 fmuld %f2,%f58,%f6 2236 ldd [%l3+%l0],%f32 2237 2238 fmuld %f20,%f24,%f24 2239 ldd [%l2+0x30],%f26 2240 2241 faddd %f14,%f34,%f14 2242 ldd [%o4+x1_1],%f34 2243 2244 faddd %f6,%f56,%f6 2245 fmuld %f2,%f62,%f4 2246 2247 faddd %f24,%f36,%f24 2248 ldd [%o5+x2_1],%f36 2249 2250 fmuld %f10,%f14,%f14 2251 std %f12,[%fp+y1_0] 2252 2253 fmuld %f2,%f6,%f6 2254 faddd %f4,%f60,%f4 2255 2256 fmuld %f20,%f24,%f24 2257 std %f22,[%fp+y2_0] 2258 2259 faddd %f14,%f16,%f14 2260 2261 faddd %f6,%f54,%f6 2262 fmuld %f2,%f4,%f4 2263 ldd [%g1+%l0],%f2 2264 2265 faddd %f24,%f26,%f24 2266 2267 fmuld %f10,%f14,%f14 2268 2269 fmuld %f0,%f6,%f6 2270 ldd [%l4+%l0],%f0 2271 2272 fmuld %f4,%f32,%f4 2273 2274 fmuld %f20,%f24,%f24 2275 2276 fmuld %f6,%f2,%f6 2277 2278 fmuld %f34,%f14,%f14 2279 ldd [%o4+y1_0],%f12 2280 2281 fmuld %f36,%f24,%f24 2282 ldd [%o5+y2_0],%f22 2283 2284 faddd %f6,%f4,%f6 2285 2286 faddd %f14,%f12,%f14 2287 2288 faddd %f24,%f22,%f24 2289 2290 faddd %f6,%f0,%f6 2291 2292 
faddd %f34,%f14,%f16 2293 2294 faddd %f36,%f24,%f26 2295 ba,pt %icc,.FIXSIGN 2296 2297! delay slot 2298 faddd %f6,%f32,%f6 2299 2300 .align 32 2301.CASE4: 2302 fands %f29,%f28,%f29 ! if (n & 1) clear sign bit 2303 sethi %hi(0x3fc3c000),%o7 2304 andcc %l1,2,%g0 2305 bne,pn %icc,.CASE6 2306 2307! delay slot 2308 andcc %l2,2,%g0 2309 fpadd32s %f10,%f31,%f18 2310 ld [%fp+x1_1],%l1 2311 bne,pn %icc,.CASE5 2312 2313! delay slot 2314 add %l3,8,%g1 2315 ld [%fp+x2_1],%l2 2316 fpadd32s %f20,%f31,%f28 2317 2318 fand %f18,%f44,%f14 2319 sub %l1,%o7,%l1 2320 2321 fand %f28,%f44,%f24 2322 sub %l2,%o7,%l2 2323 2324 fsubd %f10,%f14,%f10 2325 srl %l1,10,%l1 2326 2327 fsubd %f20,%f24,%f20 2328 srl %l2,10,%l2 2329 2330 fmuld %f0,%f0,%f0 2331 ldd [%l5+%o3],%f32 2332 add %l5,%o3,%l0 2333 2334 faddd %f10,%f12,%f10 2335 andn %l1,0x1f,%l1 2336 2337 faddd %f20,%f22,%f20 2338 andn %l2,0x1f,%l2 2339 2340 fmuld %f0,%f32,%f4 2341 ldd [%l0+0x10],%f6 2342 add %fp,%o3,%o3 2343 2344 fmuld %f10,%f10,%f12 2345 add %l1,%o4,%l1 2346 2347 fmuld %f20,%f20,%f22 2348 add %l2,%o5,%l2 2349 2350 faddd %f4,%f6,%f4 2351 ldd [%l0+0x20],%f32 2352 2353 fmuld %f12,%f58,%f16 2354 ldd [%l3+%l1],%f34 2355 2356 fmuld %f22,%f58,%f26 2357 ldd [%l3+%l2],%f36 2358 2359 fmuld %f0,%f4,%f4 2360 ldd [%l0+0x30],%f6 2361 2362 faddd %f16,%f56,%f16 2363 fmuld %f12,%f62,%f14 2364 2365 faddd %f26,%f56,%f26 2366 fmuld %f22,%f62,%f24 2367 2368 faddd %f4,%f32,%f4 2369 ldd [%o3+x0_1],%f32 2370 2371 fmuld %f12,%f16,%f16 2372 faddd %f14,%f60,%f14 2373 2374 fmuld %f22,%f26,%f26 2375 faddd %f24,%f60,%f24 2376 2377 fmuld %f0,%f4,%f4 2378 2379 faddd %f16,%f54,%f16 2380 fmuld %f12,%f14,%f14 2381 ldd [%g1+%l1],%f12 2382 2383 faddd %f26,%f54,%f26 2384 fmuld %f22,%f24,%f24 2385 ldd [%g1+%l2],%f22 2386 2387 faddd %f4,%f6,%f4 2388 2389 fmuld %f10,%f16,%f16 2390 ldd [%l4+%l1],%f10 2391 2392 fmuld %f20,%f26,%f26 2393 ldd [%l4+%l2],%f20 2394 2395 fmuld %f14,%f34,%f14 2396 std %f2,[%fp+y0_0] 2397 2398 fmuld %f24,%f36,%f24 2399 2400 fmuld %f0,%f4,%f4 
2401 2402 fmuld %f16,%f12,%f16 2403 2404 fmuld %f26,%f22,%f26 2405 2406 fmuld %f32,%f4,%f4 2407 ldd [%o3+y0_0],%f2 2408 2409 faddd %f16,%f14,%f16 2410 2411 faddd %f26,%f24,%f26 2412 2413 faddd %f4,%f2,%f4 2414 2415 faddd %f16,%f10,%f16 2416 2417 faddd %f26,%f20,%f26 2418 2419 faddd %f32,%f4,%f6 2420 2421 faddd %f16,%f34,%f16 2422 ba,pt %icc,.FIXSIGN 2423 2424! delay slot 2425 faddd %f26,%f36,%f26 2426 2427 .align 32 2428.CASE5: 2429 fand %f18,%f44,%f14 2430 sub %l1,%o7,%l1 2431 2432 fmuld %f0,%f0,%f0 2433 ldd [%l5+%o3],%f32 2434 add %l5,%o3,%l0 2435 2436 fsubd %f10,%f14,%f10 2437 srl %l1,10,%l1 2438 2439 fmuld %f20,%f20,%f20 2440 ldd [%l5+%o5],%f36 2441 add %l5,%o5,%l2 2442 2443 fmuld %f0,%f32,%f4 2444 ldd [%l0+0x10],%f6 2445 add %fp,%o3,%o3 2446 2447 faddd %f10,%f12,%f10 2448 andn %l1,0x1f,%l1 2449 2450 fmuld %f20,%f36,%f24 2451 ldd [%l2+0x10],%f26 2452 add %fp,%o5,%o5 2453 2454 faddd %f4,%f6,%f4 2455 ldd [%l0+0x20],%f32 2456 2457 fmuld %f10,%f10,%f12 2458 add %l1,%o4,%l1 2459 2460 faddd %f24,%f26,%f24 2461 ldd [%l2+0x20],%f36 2462 2463 fmuld %f0,%f4,%f4 2464 ldd [%l0+0x30],%f6 2465 2466 fmuld %f12,%f58,%f16 2467 ldd [%l3+%l1],%f34 2468 2469 fmuld %f20,%f24,%f24 2470 ldd [%l2+0x30],%f26 2471 2472 faddd %f4,%f32,%f4 2473 ldd [%o3+x0_1],%f32 2474 2475 faddd %f16,%f56,%f16 2476 fmuld %f12,%f62,%f14 2477 2478 faddd %f24,%f36,%f24 2479 ldd [%o5+x2_1],%f36 2480 2481 fmuld %f0,%f4,%f4 2482 std %f2,[%fp+y0_0] 2483 2484 fmuld %f12,%f16,%f16 2485 faddd %f14,%f60,%f14 2486 2487 fmuld %f20,%f24,%f24 2488 std %f22,[%fp+y2_0] 2489 2490 faddd %f4,%f6,%f4 2491 2492 faddd %f16,%f54,%f16 2493 fmuld %f12,%f14,%f14 2494 ldd [%g1+%l1],%f12 2495 2496 faddd %f24,%f26,%f24 2497 2498 fmuld %f0,%f4,%f4 2499 2500 fmuld %f10,%f16,%f16 2501 ldd [%l4+%l1],%f10 2502 2503 fmuld %f14,%f34,%f14 2504 2505 fmuld %f20,%f24,%f24 2506 2507 fmuld %f16,%f12,%f16 2508 2509 fmuld %f32,%f4,%f4 2510 ldd [%o3+y0_0],%f2 2511 2512 fmuld %f36,%f24,%f24 2513 ldd [%o5+y2_0],%f22 2514 2515 faddd %f16,%f14,%f16 2516 
2517 faddd %f4,%f2,%f4 2518 2519 faddd %f24,%f22,%f24 2520 2521 faddd %f16,%f10,%f16 2522 2523 faddd %f32,%f4,%f6 2524 2525 faddd %f36,%f24,%f26 2526 ba,pt %icc,.FIXSIGN 2527 2528! delay slot 2529 faddd %f16,%f34,%f16 2530 2531 .align 32 2532.CASE6: 2533 ld [%fp+x2_1],%l2 2534 add %l3,8,%g1 2535 bne,pn %icc,.CASE7 2536! delay slot 2537 fpadd32s %f20,%f31,%f28 2538 2539 fand %f28,%f44,%f24 2540 ldd [%l5+%o3],%f32 2541 add %l5,%o3,%l0 2542 2543 fmuld %f0,%f0,%f0 2544 sub %l2,%o7,%l2 2545 2546 fsubd %f20,%f24,%f20 2547 srl %l2,10,%l2 2548 2549 fmuld %f10,%f10,%f10 2550 ldd [%l5+%o4],%f34 2551 add %l5,%o4,%l1 2552 2553 fmuld %f0,%f32,%f4 2554 ldd [%l0+0x10],%f6 2555 add %fp,%o3,%o3 2556 2557 faddd %f20,%f22,%f20 2558 andn %l2,0x1f,%l2 2559 2560 fmuld %f10,%f34,%f14 2561 ldd [%l1+0x10],%f16 2562 add %fp,%o4,%o4 2563 2564 faddd %f4,%f6,%f4 2565 ldd [%l0+0x20],%f32 2566 2567 fmuld %f20,%f20,%f22 2568 add %l2,%o5,%l2 2569 2570 faddd %f14,%f16,%f14 2571 ldd [%l1+0x20],%f34 2572 2573 fmuld %f0,%f4,%f4 2574 ldd [%l0+0x30],%f6 2575 2576 fmuld %f22,%f58,%f26 2577 ldd [%l3+%l2],%f36 2578 2579 fmuld %f10,%f14,%f14 2580 ldd [%l1+0x30],%f16 2581 2582 faddd %f4,%f32,%f4 2583 ldd [%o3+x0_1],%f32 2584 2585 faddd %f26,%f56,%f26 2586 fmuld %f22,%f62,%f24 2587 2588 faddd %f14,%f34,%f14 2589 ldd [%o4+x1_1],%f34 2590 2591 fmuld %f0,%f4,%f4 2592 std %f2,[%fp+y0_0] 2593 2594 fmuld %f22,%f26,%f26 2595 faddd %f24,%f60,%f24 2596 2597 fmuld %f10,%f14,%f14 2598 std %f12,[%fp+y1_0] 2599 2600 faddd %f4,%f6,%f4 2601 2602 faddd %f26,%f54,%f26 2603 fmuld %f22,%f24,%f24 2604 ldd [%g1+%l2],%f22 2605 2606 faddd %f14,%f16,%f14 2607 2608 fmuld %f0,%f4,%f4 2609 2610 fmuld %f20,%f26,%f26 2611 ldd [%l4+%l2],%f20 2612 2613 fmuld %f24,%f36,%f24 2614 2615 fmuld %f10,%f14,%f14 2616 2617 fmuld %f26,%f22,%f26 2618 2619 fmuld %f32,%f4,%f4 2620 ldd [%o3+y0_0],%f2 2621 2622 fmuld %f34,%f14,%f14 2623 ldd [%o4+y1_0],%f12 2624 2625 faddd %f26,%f24,%f26 2626 2627 faddd %f4,%f2,%f4 2628 2629 faddd %f14,%f12,%f14 2630 2631 
faddd %f26,%f20,%f26 2632 2633 faddd %f32,%f4,%f6 2634 2635 faddd %f34,%f14,%f16 2636 ba,pt %icc,.FIXSIGN 2637 2638! delay slot 2639 faddd %f26,%f36,%f26 2640 2641 .align 32 2642.CASE7: 2643 fmuld %f0,%f0,%f0 2644 ldd [%l5+%o3],%f32 2645 add %l5,%o3,%l0 2646 2647 fmuld %f10,%f10,%f10 2648 ldd [%l5+%o4],%f34 2649 add %l5,%o4,%l1 2650 2651 fmuld %f20,%f20,%f20 2652 ldd [%l5+%o5],%f36 2653 add %l5,%o5,%l2 2654 2655 fmuld %f0,%f32,%f4 2656 ldd [%l0+0x10],%f6 2657 add %fp,%o3,%o3 2658 2659 fmuld %f10,%f34,%f14 2660 ldd [%l1+0x10],%f16 2661 add %fp,%o4,%o4 2662 2663 fmuld %f20,%f36,%f24 2664 ldd [%l2+0x10],%f26 2665 add %fp,%o5,%o5 2666 2667 faddd %f4,%f6,%f4 2668 ldd [%l0+0x20],%f32 2669 2670 faddd %f14,%f16,%f14 2671 ldd [%l1+0x20],%f34 2672 2673 faddd %f24,%f26,%f24 2674 ldd [%l2+0x20],%f36 2675 2676 fmuld %f0,%f4,%f4 2677 ldd [%l0+0x30],%f6 2678 2679 fmuld %f10,%f14,%f14 2680 ldd [%l1+0x30],%f16 2681 2682 fmuld %f20,%f24,%f24 2683 ldd [%l2+0x30],%f26 2684 2685 faddd %f4,%f32,%f4 2686 ldd [%o3+x0_1],%f32 2687 2688 faddd %f14,%f34,%f14 2689 ldd [%o4+x1_1],%f34 2690 2691 faddd %f24,%f36,%f24 2692 ldd [%o5+x2_1],%f36 2693 2694 fmuld %f0,%f4,%f4 2695 std %f2,[%fp+y0_0] 2696 2697 fmuld %f10,%f14,%f14 2698 std %f12,[%fp+y1_0] 2699 2700 fmuld %f20,%f24,%f24 2701 std %f22,[%fp+y2_0] 2702 2703 faddd %f4,%f6,%f4 2704 2705 faddd %f14,%f16,%f14 2706 2707 faddd %f24,%f26,%f24 2708 2709 fmuld %f0,%f4,%f4 2710 2711 fmuld %f10,%f14,%f14 2712 2713 fmuld %f20,%f24,%f24 2714 2715 fmuld %f32,%f4,%f4 2716 ldd [%o3+y0_0],%f2 2717 2718 fmuld %f34,%f14,%f14 2719 ldd [%o4+y1_0],%f12 2720 2721 fmuld %f36,%f24,%f24 2722 ldd [%o5+y2_0],%f22 2723 2724 faddd %f4,%f2,%f4 2725 2726 faddd %f14,%f12,%f14 2727 2728 faddd %f24,%f22,%f24 2729 2730 faddd %f32,%f4,%f6 2731 2732 faddd %f34,%f14,%f16 2733 ba,pt %icc,.FIXSIGN 2734 2735! 
delay slot 2736 faddd %f36,%f24,%f26 2737 2738 2739 .align 32 2740.ENDLOOP2: 2741 fmuld %f10,%f40,%f12 2742 add %l5,thresh,%g1 2743 faddd %f12,%f42,%f12 2744 st %f13,[%fp+n1] 2745 fsubd %f12,%f42,%f12 ! n 2746 fmuld %f12,%f46,%f14 2747 fsubd %f10,%f14,%f14 2748 fmuld %f12,%f48,%f16 2749 fsubd %f14,%f16,%f10 2750 ld [%fp+n1],%o4 ; add %o4,1,%o4 2751 fsubd %f14,%f10,%f34 2752 and %o4,1,%o4 2753 fsubd %f34,%f16,%f34 2754 fmuld %f12,%f50,%f18 2755 sll %o4,3,%o4 2756 fsubd %f18,%f34,%f18 2757 ld [%g1+%o4],%f16 2758 fsubd %f10,%f18,%f14 2759 fsubd %f10,%f14,%f34 2760 add %l5,thresh+4,%o7 2761 fsubd %f34,%f18,%f34 2762 fmuld %f12,%f52,%f12 2763 fsubd %f12,%f34,%f12 2764 ld [%o7+%o4],%f18 2765 fsubd %f14,%f12,%f10 ! x 2766 fsubd %f14,%f10,%f14 2767 fands %f10,%f30,%f19 ! save signbit 2768 fabsd %f10,%f10 2769 std %f10,[%fp+x1_1] 2770 fsubd %f14,%f12,%f12 ! y 2771 fcmpgt32 %f16,%f10,%l1 2772 fxors %f12,%f19,%f12 2773 fands %f19,%f18,%f19 ! if (n & 1) clear sign bit 2774 andcc %l1,2,%g0 2775 bne,pn %icc,1f 2776! 
delay slot 2777 nop 2778 fpadd32s %f10,%f31,%f18 2779 ld [%fp+x1_1],%l1 2780 fand %f18,%f44,%f14 2781 sethi %hi(0x3fc3c000),%o7 2782 add %l3,8,%g1 2783 fsubd %f10,%f14,%f10 2784 sub %l1,%o7,%l1 2785 srl %l1,10,%l1 2786 faddd %f10,%f12,%f10 2787 andn %l1,0x1f,%l1 2788 fmuld %f10,%f10,%f12 2789 add %l1,%o4,%l1 2790 fmuld %f12,%f58,%f16 2791 ldd [%l3+%l1],%f34 2792 faddd %f16,%f56,%f16 2793 fmuld %f12,%f62,%f14 2794 fmuld %f12,%f16,%f16 2795 faddd %f14,%f60,%f14 2796 faddd %f16,%f54,%f16 2797 fmuld %f12,%f14,%f14 2798 ldd [%g1+%l1],%f12 2799 fmuld %f10,%f16,%f16 2800 ldd [%l4+%l1],%f10 2801 fmuld %f14,%f34,%f14 2802 fmuld %f16,%f12,%f16 2803 faddd %f16,%f14,%f16 2804 faddd %f16,%f10,%f16 2805 ba,pt %icc,2f 2806 faddd %f16,%f34,%f16 28071: 2808 fmuld %f10,%f10,%f10 2809 ldd [%l5+%o4],%f34 2810 add %l5,%o4,%l1 2811 fmuld %f10,%f34,%f14 2812 ldd [%l1+0x10],%f16 2813 add %fp,%o4,%o4 2814 faddd %f14,%f16,%f14 2815 ldd [%l1+0x20],%f34 2816 fmuld %f10,%f14,%f14 2817 ldd [%l1+0x30],%f16 2818 faddd %f14,%f34,%f14 2819 ldd [%o4+x1_1],%f34 2820 fmuld %f10,%f14,%f14 2821 std %f12,[%fp+y1_0] 2822 faddd %f14,%f16,%f14 2823 fmuld %f10,%f14,%f14 2824 fmuld %f34,%f14,%f14 2825 ldd [%o4+y1_0],%f12 2826 faddd %f14,%f12,%f14 2827 faddd %f34,%f14,%f16 28282: 2829 add %l5,thresh-4,%g1 2830 ld [%fp+n1],%o4 ; add %o4,1,%o4 2831 and %o4,2,%o4 2832 sll %o4,2,%o4 2833 ld [%g1+%o4],%f18 2834 fxors %f19,%f18,%f19 2835 fors %f16,%f19,%f16 ! tack on sign 2836 st %f16,[%o1] 2837 st %f17,[%o1+4] 2838 2839.ENDLOOP1: 2840 fmuld %f0,%f40,%f2 2841 add %l5,thresh,%g1 2842 faddd %f2,%f42,%f2 2843 st %f3,[%fp+n0] 2844 fsubd %f2,%f42,%f2 ! 
n 2845 fmuld %f2,%f46,%f4 2846 fsubd %f0,%f4,%f4 2847 fmuld %f2,%f48,%f6 2848 fsubd %f4,%f6,%f0 2849 ld [%fp+n0],%o3 ; add %o3,1,%o3 2850 fsubd %f4,%f0,%f32 2851 and %o3,1,%o3 2852 fsubd %f32,%f6,%f32 2853 fmuld %f2,%f50,%f8 2854 sll %o3,3,%o3 2855 fsubd %f8,%f32,%f8 2856 ld [%g1+%o3],%f6 2857 fsubd %f0,%f8,%f4 2858 fsubd %f0,%f4,%f32 2859 add %l5,thresh+4,%o7 2860 fsubd %f32,%f8,%f32 2861 fmuld %f2,%f52,%f2 2862 fsubd %f2,%f32,%f2 2863 ld [%o7+%o3],%f8 2864 fsubd %f4,%f2,%f0 ! x 2865 fsubd %f4,%f0,%f4 2866 fands %f0,%f30,%f9 ! save signbit 2867 fabsd %f0,%f0 2868 std %f0,[%fp+x0_1] 2869 fsubd %f4,%f2,%f2 ! y 2870 fcmpgt32 %f6,%f0,%l0 2871 fxors %f2,%f9,%f2 2872 fands %f9,%f8,%f9 ! if (n & 1) clear sign bit 2873 andcc %l0,2,%g0 2874 bne,pn %icc,1f 2875! delay slot 2876 nop 2877 fpadd32s %f0,%f31,%f8 2878 ld [%fp+x0_1],%l0 2879 fand %f8,%f44,%f4 2880 sethi %hi(0x3fc3c000),%o7 2881 add %l3,8,%g1 2882 fsubd %f0,%f4,%f0 2883 sub %l0,%o7,%l0 2884 srl %l0,10,%l0 2885 faddd %f0,%f2,%f0 2886 andn %l0,0x1f,%l0 2887 fmuld %f0,%f0,%f2 2888 add %l0,%o3,%l0 2889 fmuld %f2,%f58,%f6 2890 ldd [%l3+%l0],%f32 2891 faddd %f6,%f56,%f6 2892 fmuld %f2,%f62,%f4 2893 fmuld %f2,%f6,%f6 2894 faddd %f4,%f60,%f4 2895 faddd %f6,%f54,%f6 2896 fmuld %f2,%f4,%f4 2897 ldd [%g1+%l0],%f2 2898 fmuld %f0,%f6,%f6 2899 ldd [%l4+%l0],%f0 2900 fmuld %f4,%f32,%f4 2901 fmuld %f6,%f2,%f6 2902 faddd %f6,%f4,%f6 2903 faddd %f6,%f0,%f6 2904 ba,pt %icc,2f 2905 faddd %f6,%f32,%f6 29061: 2907 fmuld %f0,%f0,%f0 2908 ldd [%l5+%o3],%f32 2909 add %l5,%o3,%l0 2910 fmuld %f0,%f32,%f4 2911 ldd [%l0+0x10],%f6 2912 add %fp,%o3,%o3 2913 faddd %f4,%f6,%f4 2914 ldd [%l0+0x20],%f32 2915 fmuld %f0,%f4,%f4 2916 ldd [%l0+0x30],%f6 2917 faddd %f4,%f32,%f4 2918 ldd [%o3+x0_1],%f32 2919 fmuld %f0,%f4,%f4 2920 std %f2,[%fp+y0_0] 2921 faddd %f4,%f6,%f4 2922 fmuld %f0,%f4,%f4 2923 fmuld %f32,%f4,%f4 2924 ldd [%o3+y0_0],%f2 2925 faddd %f4,%f2,%f4 2926 faddd %f32,%f4,%f6 29272: 2928 add %l5,thresh-4,%g1 2929 ld [%fp+n0],%o3 ; add 
%o3,1,%o3 2930 and %o3,2,%o3 2931 sll %o3,2,%o3 2932 ld [%g1+%o3],%f8 2933 fxors %f9,%f8,%f9 2934 fors %f6,%f9,%f6 ! tack on sign 2935 st %f6,[%o0] 2936 st %f7,[%o0+4] 2937 2938.ENDLOOP0: 2939 2940! check for huge arguments remaining 2941 2942 tst LIM_l6 2943 be,pt %icc,.exit 2944! delay slot 2945 nop 2946 2947! ========== huge range (use C code) ========== 2948 2949#ifdef __sparcv9 2950 ldx [%fp+xsave],%o1 2951 ldx [%fp+ysave],%o3 2952#else 2953 ld [%fp+xsave],%o1 2954 ld [%fp+ysave],%o3 2955#endif 2956 ld [%fp+nsave],%o0 2957 ld [%fp+sxsave],%o2 2958 ld [%fp+sysave],%o4 2959 sra %o2,0,%o2 ! sign-extend for V9 2960 sra %o4,0,%o4 2961 call __vlibm_vcos_big 2962 mov %l7,%o5 ! delay slot 2963 2964.exit: 2965 ret 2966 restore 2967 2968 2969 .align 32 2970.SKIP0: 2971 addcc %i0,-1,%i0 2972 ble,pn %icc,.ENDLOOP0 2973! delay slot, harmless if branch taken 2974 add %i3,%i4,%i3 ! y += stridey 2975 andn %l1,%i5,%l0 ! hx &= ~0x80000000 2976 fmovs %f10,%f0 2977 ld [%i1+4],%f1 2978 ba,pt %icc,.LOOP0 2979! delay slot 2980 add %i1,%i2,%i1 ! x += stridex 2981 2982 2983 .align 32 2984.SKIP1: 2985 addcc %i0,-1,%i0 2986 ble,pn %icc,.ENDLOOP1 2987! delay slot, harmless if branch taken 2988 add %i3,%i4,%i3 ! y += stridey 2989 andn %l2,%i5,%l1 ! hx &= ~0x80000000 2990 fmovs %f20,%f10 2991 ld [%i1+4],%f11 2992 ba,pt %icc,.LOOP1 2993! delay slot 2994 add %i1,%i2,%i1 ! x += stridex 2995 2996 2997 .align 32 2998.SKIP2: 2999 addcc %i0,-1,%i0 3000 ble,pn %icc,.ENDLOOP2 3001! delay slot, harmless if branch taken 3002 add %i3,%i4,%i3 ! y += stridey 3003 ld [%i1],%l2 3004 ld [%i1],%f20 3005 ld [%i1+4],%f21 3006 andn %l2,%i5,%l2 ! hx &= ~0x80000000 3007 ba,pt %icc,.LOOP2 3008! delay slot 3009 add %i1,%i2,%i1 ! x += stridex 3010 3011 3012 .align 32 3013.BIG0: 3014 sethi %hi(0x7ff00000),%o7 3015 cmp %l0,%o7 3016 bl,a,pt %icc,1f ! if hx < 0x7ff00000 3017! delay slot, annulled if branch not taken 3018 mov %l7,LIM_l6 ! set biguns flag or 3019 fsubd %f0,%f0,%f0 ! 
y = x - x 3020 st %f0,[%o0] 3021 st %f1,[%o0+4] 30221: 3023 addcc %i0,-1,%i0 3024 ble,pn %icc,.ENDLOOP0 3025! delay slot, harmless if branch taken 3026 andn %l1,%i5,%l0 ! hx &= ~0x80000000 3027 fmovd %f10,%f0 3028 ba,pt %icc,.LOOP0 3029! delay slot 3030 add %i1,%i2,%i1 ! x += stridex 3031 3032 3033 .align 32 3034.BIG1: 3035 sethi %hi(0x7ff00000),%o7 3036 cmp %l1,%o7 3037 bl,a,pt %icc,1f ! if hx < 0x7ff00000 3038! delay slot, annulled if branch not taken 3039 mov %l7,LIM_l6 ! set biguns flag or 3040 fsubd %f10,%f10,%f10 ! y = x - x 3041 st %f10,[%o1] 3042 st %f11,[%o1+4] 30431: 3044 addcc %i0,-1,%i0 3045 ble,pn %icc,.ENDLOOP1 3046! delay slot, harmless if branch taken 3047 andn %l2,%i5,%l1 ! hx &= ~0x80000000 3048 fmovd %f20,%f10 3049 ba,pt %icc,.LOOP1 3050! delay slot 3051 add %i1,%i2,%i1 ! x += stridex 3052 3053 3054 .align 32 3055.BIG2: 3056 sethi %hi(0x7ff00000),%o7 3057 cmp %l2,%o7 3058 bl,a,pt %icc,1f ! if hx < 0x7ff00000 3059! delay slot, annulled if branch not taken 3060 mov %l7,LIM_l6 ! set biguns flag or 3061 fsubd %f20,%f20,%f20 ! y = x - x 3062 st %f20,[%o2] 3063 st %f21,[%o2+4] 30641: 3065 addcc %i0,-1,%i0 3066 ble,pn %icc,.ENDLOOP2 3067! delay slot 3068 nop 3069 ld [%i1],%l2 3070 ld [%i1],%f20 3071 ld [%i1+4],%f21 3072 andn %l2,%i5,%l2 ! hx &= ~0x80000000 3073 ba,pt %icc,.LOOP2 3074! delay slot 3075 add %i1,%i2,%i1 ! x += stridex 3076 3077 SET_SIZE(__vcos) 3078 3079