/* libgcc routines for the Texas Instruments TMS320C[34]x
   Copyright (C) 1997,98, 1999 Free Software Foundation, Inc.

   Contributed by Michael Hayes (m.hayes@elec.canterbury.ac.nz)
              and Herman Ten Brugge (Haj.Ten.Brugge@net.HCC.nl).


This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

In addition to the permissions in the GNU General Public License, the
Free Software Foundation gives you unlimited permission to link the
compiled version of this file into combinations with other programs,
and to distribute those combinations without any restriction coming
from the use of this file.  (The General Public License restrictions
do apply in other respects; for example, they cover modification of
the file, and distribution when not linked into a combine
executable.)

This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING.  If not, write to
the Free Software Foundation, 51 Franklin Street, Fifth Floor,
Boston, MA 02110-1301, USA.  */

; These routines are called using the standard TI register argument
; passing model.
; The following registers do not have to be saved:
;  r0, r1, r2, r3, ar0, ar1, ar2, ir0, ir1, bk, rs, rc, re, (r9, r10, r11)
;
; Every routine that takes register arguments starts with an
; ".if .REGPARM == 0" prologue which instead fetches the arguments
; from the stack (relative to sp) into those same registers.
;
; Most routines return with a delayed branch: the three instructions
; that follow the branch are executed before control transfers
; (see the "Branch occurs here" markers).
;
; Perform floating point divqf3
;
; This routine performs a reciprocal of the divisor using the method
; described in the C30/C40 user manuals.  It then multiplies that
; result by the dividend.
;
; Let r be the reciprocal of the divisor v and let the ith estimate
; of r be denoted by r[i].  An iterative approach can be used to
; improve the estimate of r, given an initial estimate r[0], where
;
; r[i + 1] = r[i] * (2.0 - v * r[i])
;
; The normalized error e[i] at the ith iteration is
;
; e[i] = (r - r[i]) / r = (1 / v - r[i]) * v = (1 - v * r[i])
;
; Note that
;
; e[i + 1] = (1 - v * r[i + 1]) = 1 - 2 * v * r[i] + v^2 * (r[i])^2
;          = (1 - v * r[i])^2 = (e[i])^2

; r2 dividend, r3 divisor, r0 quotient
; clobbers r1, ar1
#ifdef L_divsf3
	.text
	.global ___divqf3
___divqf3:

#ifdef _TMS320C4x
	.if .REGPARM == 0
	lda	sp,ar0
	ldf	*-ar0(2), r3
	.endif

	pop	ar1		; Pop return address

; r0 = estimate of r, r1 = tmp, r2 = dividend, r3 = divisor
	rcpf	r3, r0		; Compute initial estimate r[0]

	mpyf3	r0, r3, r1	; r1 = r[0] * v
	subrf	2.0, r1		; r1 = 2.0 - r[0] * v
	mpyf	r1, r0		; r0 = r[0] * (2.0 - r[0] * v) = r[1]
; End of 1st iteration (16 bits accuracy)

	mpyf3	r0, r3, r1	; r1 = r[1] * v
	subrf	2.0, r1		; r1 = 2.0 - r[1] * v

	bud	ar1		; Delayed branch
	mpyf	r1, r0		; r0 = r[1] * (2.0 - r[1] * v) = r[2]
; End of 2nd iteration (32 bits accuracy)
	.if .REGPARM == 0
	mpyf	*-ar0(1), r0	; Multiply by the dividend
	.else
	mpyf	r2, r0		; Multiply by the dividend
	.endif
	rnd	r0
	; Branch occurs here
#else
	.if .REGPARM == 0
	ldiu	sp,ar0
	ldf	*-ar0(2), r3
	.endif

	pop	ar1		; Pop return address

; Initial estimate r[0] = 1.0 * 2^(-e - 1)
; where v = m * 2^e

; r0 = estimate of r, r1 = tmp, r2 = dividend, r3 = divisor

; Calculate initial estimate r[0]
	pushf	r3
	pop	r0
	not	r0		; r0 = -e
				; complement exponent = -e -1
				; complement sign (side effect)
				; complement mantissa (almost 3 bit accurate)
	push	r0
	popf	r0		; r0 = 1.0 * 2^(-e - 1) + inverted mantissa
	ldf	-1.0, r1	; undo complement sign bit
	xor	r1, r0

	mpyf3	r0, r3, r1	; r1 = r[0] * v
	subrf	2.0, r1		; r1 = 2.0 - r[0] * v
	mpyf	r1, r0		; r0 = r[0] * (2.0 - r[0] * v) = r[1]
; End of 1st iteration

	mpyf3	r0, r3, r1	; r1 = r[1] * v
	subrf	2.0, r1		; r1 = 2.0 - r[1] * v
	mpyf	r1, r0		; r0 = r[1] * (2.0 - r[1] * v) = r[2]
; End of 2nd iteration

	mpyf3	r0, r3, r1	; r1 = r[2] * v
	subrf	2.0, r1		; r1 = 2.0 - r[2] * v
	mpyf	r1, r0		; r0 = r[2] * (2.0 - r[2] * v) = r[3]
; End of 3rd iteration

	rnd	r0		; Minimize error in r[3]'s LSBs

; Use modified last iteration
; r[4] = (r[3] * (1.0 - (v * r[3]))) + r[3]
	mpyf3	r0, r3, r1	; r1 = r[3] * v
	subrf	1.0, r1		; r1 = 1.0 - r[3] * v
	mpyf	r0, r1		; r1 = r[3] * (1.0 - r[3] * v)
	addf	r1, r0		; r0 = r[3] * (1.0 - r[3] * v) + r[3] = r[4]

	rnd	r0		; Minimize error in r[4]'s LSBs

	bud	ar1		; Delayed branch

	.if .REGPARM == 0
	ldfu	*-ar0(1), r2	; Dividend in mem has only 24 bits significance
	.else
	rnd	r2		; Minimize error in reg dividend's LSBs
				; since this may have 32 bit significance
	.endif

	mpyf	r2, r0		; Multiply by the dividend
	rnd	r0		; Round result to 32 bits

	; Branch occurs here
#endif

#endif
;
; Integer signed division
;
; ar2 dividend, r2 divisor, r0 quotient
; clobbers r1, r3, ar0, ar1, ir0, ir1, rc, rs, re
#ifdef L_divsi3
	.text
	.global ___divqi3
	.ref udivqi3n
___divqi3:
	.if .REGPARM == 0
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	ldi	*-ar0(2), r2
	.endif

	xor3	ar2, r2, r3	; Get the sign
	absi	ar2, r0
	bvd	divq32		; Delayed; the next three insns still execute
	ldi	r0, ar2
	absi	r2, r2
	cmpi	ar2, r2		; Divisor > dividend?

	pop	ir1
	bhid	zero		; If so, return 0

;
; Normalize operands.  Use difference of exponents as shift count
; for divisor, and as repeat count for "subc"
;
	float	ar2, r1		; Normalize dividend
	pushf	r1		; Get as integer
	pop	ar0
	lsh	-24, ar0	; Get exponent

	float	r2, r1		; Normalize divisor
	pushf	r1		; Get as integer
	pop	ir0
	lsh	-24, ir0	; Get exponent

	subi	ir0, ar0	; Get difference of exponents
	lsh	ar0, r2		; Align divisor with dividend

;
; Do count + 1 subtracts and shifts
;
	rpts	ar0
	subc	r2, ar2

;
; Mask off the lower count+1 bits of ar2
;
	subri	31, ar0		; Shift count is (32 - (ar0 + 1))
	lsh	ar0, ar2	; Shift left
	negi	ar0, ar0
	lsh3	ar0, ar2, r0	; Shift right and put result in r0

;
; Check sign and negate result if necessary
;
	bud	ir1		; Delayed return
	negi	r0, r1		; Negate result
	ash	-31, r3		; Check sign
	ldinz	r1, r0		; If set, use negative result
	; Branch occurs here

zero:	bud	ir1		; Delayed branch
	ldi	0, r0
	nop
	nop
	; Branch occurs here
;
; special case where ar2 = abs(ar2) = 0x80000000.  We handle this by
; calling unsigned divide and negating the result if necessary.
; (r2 already holds abs(divisor) from the bvd delay slots.)
;
divq32:
	push	r3		; Save sign
	call	udivqi3n
	pop	r3
	pop	ir1
	bd	ir1
	negi	r0, r1		; Negate result
	ash	-31, r3		; Check sign
	ldinz	r1, r0		; If set, use negative result
	; Branch occurs here
#endif
;
; Integer unsigned division
;
; ar2 dividend, r2 divisor, r0 quotient,
; clobbers r1, r3, ar0, ar1, ir0, ir1, rc, rs, re
;
; udivqi3n is the register-argument entry point used by the other
; routines in this file.
#ifdef L_udivsi3
	.text
	.global ___udivqi3
	.global udivqi3n
___udivqi3:
	.if .REGPARM == 0
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	ldi	*-ar0(2), r2
	.endif

udivqi3n:
	pop	ir1

	cmpi	ar2, r2		; If divisor > dividend
	bhi	qzero		; return zero
	ldi	r2, ar1		; Store divisor in ar1

	tstb	ar2, ar2	; Check top bit, jump if set to special handler
; NOTE(review): "bld" is not among the condition mnemonics I could
; confirm for the C3x/C4x; presumably it means "branch if negative,
; delayed" -- verify against the assembler in use.
	bld	div_32		; Delayed branch

;
; Get divisor exponent
;
	float	ar1, r1		; Normalize the divisor
	pushf	r1		; Get into int register
	pop	rc
	; branch occurs here

	bzd	qzero		; if (float) divisor zero, return zero

	float	ar2, r1		; Normalize the dividend
	pushf	r1		; Get into int register
	pop	ar0
	lsh	-24, ar0	; Get both the exponents
	lsh	-24, rc

	subi	rc, ar0		; Get the difference between the exponents
	lsh	ar0, ar1	; Normalize the divisor with the dividend

;
; Do count + 1 subtracts and shifts
;
	rpts	ar0
	subc	ar1, ar2

;
; mask off the lower count+1 bits
;
	subri	31, ar0		; Shift count is (32 - (ar0 + 1))
	bud	ir1		; Delayed return
	lsh3	ar0, ar2, r0
	negi	ar0, ar0
	lsh	ar0, r0
	; Branch occurs here

;
; Handle a full 32-bit dividend
;
div_32:	tstb	ar1, ar1
	bld	qone		; if divisor high bit is one, the result is one
				; NOTE(review): confirm the "bld" mnemonic
	lsh	-24, rc
	subri	31, rc
	lsh	rc, ar1		; Line up the divisor

;
; Now divisor and dividend are aligned.  Do first SUBC by hand, save
; off the first quotient digit.  Then, shift divisor right rather
; than shifting dividend left.  This leaves a zero in the top bit of
; the dividend
;
	ldi	1, ar0		; Initialize MSB of quotient
	lsh	rc, ar0		; create a mask for MSBs
	subi	1, ar0		; mask is (1 << rc) - 1

	subi3	ar1, ar2, r1
	ldihs	r1, ar2
	ldihs	1, r1
	ldilo	0, r1
	lsh	rc, r1

	lsh	-1, ar1
	subi	1, rc
;
; do the rest of the shifts and subtracts
;
	rpts	rc
	subc	ar1, ar2

	bud	ir1
	and	ar0, ar2
	or3	r1, ar2, r0
	nop

qone:
	bud	ir1
	ldi	1, r0
	nop
	nop

qzero:
	bud	ir1
	ldi	0, r0
	nop
	nop
#endif

;
; Integer unsigned modulo
;
; ar2 dividend, r2 divisor, r0 remainder
;
; umodqi3n is the register-argument entry point used by the other
; routines in this file.
#ifdef L_umodsi3
	.text
	.global ___umodqi3
	.global umodqi3n
___umodqi3:
	.if .REGPARM == 0
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	ldi	*-ar0(2), r2
	.endif

umodqi3n:
	pop	ir1		; return address
	cmpi	ar2, r2		; divisor > dividend ?
	bhi	uzero		; if so, return dividend
	ldi	r2, ar1		; load divisor
;
; If top bit of dividend is set, handle specially.
;
	tstb	ar2, ar2	; check top bit
	bld	umod_32		; get divisor exponent, then jump.
				; NOTE(review): confirm the "bld" mnemonic
;
; Get divisor exponent by converting to float.
;
	float	ar1, r1		; normalize divisor
	pushf	r1		; push as float
	pop	rc		; pop as int to get exponent
	bzd	uzero		; if (float)divisor was zero, return
;
; 31 or less bits in dividend.  Get dividend exponent.
;
	float	ar2, r1		; normalize dividend
	pushf	r1		; push as float
	pop	ar0		; pop as int to get exponent
;
; Use difference in exponents as shift count to line up MSBs.
;
	lsh	-24, rc		; divisor exponent
	lsh	-24, ar0	; dividend exponent
	subi	rc, ar0		; difference
	lsh	ar0, ar1	; shift divisor up
;
; Do COUNT+1 subtract & shifts.
;
	rpts	ar0
	subc	ar1, ar2
;
; Remainder is in upper 31-COUNT bits.
;
	bud	ir1		; delayed branch to return
	addi	1, ar0		; shift count is COUNT+1
	negi	ar0, ar0	; negate for right shift
	lsh3	ar0, ar2, r0	; shift to get result
	; Return occurs here

;
; The following code handles cases of a full 32-bit dividend.  Before
; SUBC can be used, the top bit must be cleared (otherwise SUBC can
; possibly shift a significant 1 out the top of the dividend).  This
; is accomplished by first doing a normal subtraction, then proceeding
; with SUBCs.
;
umod_32:
;
; If the top bit of the divisor is set too, the remainder is simply
; the difference between the dividend and divisor.  Otherwise, shift
; the divisor up to line up the MSBs.
;
	tstb	ar1, ar1	; check divisor
	bld	uone		; if negative, remainder is diff
				; NOTE(review): confirm the "bld" mnemonic

	lsh	-24, rc		; divisor exponent
	subri	31, rc		; shift count = 31 - exp
	negi	rc, ar0		; used later as shift count
	lsh	rc, ar1		; shift up to line up MSBs
;
; Now MSBs are aligned.  Do first SUBC by hand using a plain subtraction.
; Then, shift divisor right rather than shifting dividend left.  This leaves
; a 0 in the top bit of the dividend.
;
	subi3	ar1, ar2, r1	; subtract
	ldihs	r1, ar2		; if positive, replace dividend
	subi	1, rc		; first iteration is done
	lsh	-1, ar1		; shift divisor down
;
; Do EXP subtract & shifts.
;
	rpts	rc
	subc	ar1, ar2
;
; Quotient is in EXP+1 LSBs; shift remainder (in MSBs) down.
;
	bud	ir1
	lsh3	ar0, ar2, r0	; COUNT contains -(EXP+1)
	nop
	nop
;
; Return (dividend - divisor).
;
uone:	bud	ir1
	subi3	r2, ar2, r0
	nop
	nop
;
; Return dividend.
;
uzero:	bud	ir1
	ldi	ar2, r0		; set status from result
	nop
	nop
#endif

;
; Integer signed modulo
;
; ar2 dividend, r2 divisor, r0 remainder (takes the sign of the dividend)
;
#ifdef L_modsi3
	.text
	.global ___modqi3
	.ref umodqi3n
___modqi3:
	.if .REGPARM == 0
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	ldi	*-ar0(2), r2
	.endif

;
; Determine sign of result.  Get absolute value of operands.
;
	ldi	ar2, ar0	; sign of result same as dividend
	absi	ar2, r0		; make dividend positive
	bvd	mod_32		; if still negative, escape
	absi	r2, r1		; make divisor positive
	ldi	r1, ar1		; save in ar1
	cmpi	r0, ar1		; divisor > dividend ?

	pop	ir1		; return address
	bhid	return		; if so, return dividend
;
; Normalize operands.  Use difference in exponents as shift count
; for divisor, and as repeat count for SUBC.
;
	float	r1, r1		; normalize divisor
	pushf	r1		; push as float
	pop	rc		; pop as int
	bzd	return		; if (float)divisor was zero, return

	float	r0, r1		; normalize dividend
	pushf	r1		; push as float
	pop	r1		; pop as int

	lsh	-24, rc		; get divisor exponent
	lsh	-24, r1		; get dividend exponent
	subi	rc, r1		; get difference in exponents
	lsh	r1, ar1		; align divisor with dividend
;
; Do COUNT+1 subtract & shifts.
;
	rpts	r1
	subc	ar1, r0
;
; Remainder is in upper bits of R0
;
	addi	1, r1		; shift count is -(r1+1)
	negi	r1, r1
	lsh	r1, r0		; shift right
;
; Check sign and negate result if necessary.
;
return:
	bud	ir1		; delayed branch to return
	negi	r0, r1		; negate result
	cmpi	0, ar0		; check sign
	ldin	r1, r0		; if set, use negative result
	; Return occurs here
;
; The following code handles cases of a full 32-bit dividend.  This occurs
; when R0 = abs(R0) = 080000000h.  Handle this by calling the unsigned mod
; function, then negating the result if necessary.
;
; The unsigned routine takes its divisor in r2, but the delay slots of
; the "bvd" above only left abs(divisor) in r1/ar1; r2 still holds the
; signed divisor.  Copy the absolute value into r2 first (___divqi3
; gets the same effect with "absi r2, r2"), otherwise a negative
; divisor is treated as a huge unsigned value.
;
mod_32:
	push	ar0		; remember sign
	ldi	r1, r2		; r2 = abs(divisor)
	call	umodqi3n	; do divide

	brd	return		; return
	pop	ar0		; restore sign
	pop	ir1		; return address
	nop
#endif

#ifdef L_unsfltconst
	.section .const
	.global ___unsfltconst
___unsfltconst:	.float 4294967296.0
#endif

#ifdef L_unsfltcompare
	.section .const
	.global ___unsfltcompare
___unsfltcompare:	.float 2147483648.0
#endif

; Integer 32-bit signed multiplication
;
; The TMS320C3x MPYI instruction takes two 24-bit signed integers
; and produces a 48-bit signed result which is truncated to 32-bits.
;
; A 32-bit by 32-bit multiplication thus requires a number of steps.
;
; Consider the product of two 32-bit signed integers,
;
;	z = x * y
;
; where x = (b << 16) + a,  y = (d << 16) + c
;
; This can be expressed as
;
;	z = ((b << 16) + a) * ((d << 16) + c)
;
;	  = ((b * d) << 32) + ((b * c + a * d) << 16) + a * c
;
; Let z = (f << 16) + e where f < (1 << 16).
;
; Since we are only interested in a 32-bit result, we can ignore the
; (b * d) << 32 term, and thus
;
;	f = b * c + a * d,  e = a * c
;
; We can simplify things if we have some a priori knowledge of the
; operands, for example, if -32768 <= y <= 32767, then y = c and d = 0 and thus
;
;	f = b * c,  e = a * c
;
; ar2 multiplier, r2 multiplicand, r0 product
; clobbers r1, r2, r3, ar2, ir1
#ifdef L_mulsi3
	.text
	.global ___mulqi3
___mulqi3:
	.if .REGPARM == 0
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	ldi	*-ar0(2), r2
	.endif

	pop	ir1		; return address
	ldi	ar2, r0		;
	and	0ffffh, r0	; a
	lsh	-16, ar2	; b
	ldi	r2, r3		;
	and	0ffffh, r3	; c
	mpyi	r3, ar2		; c * b
	lsh	-16, r2		; d
	mpyi	r0, r2		; a * d
	addi	ar2, r2		; c * b + a * d
	bd	ir1		; delayed branch to return
	lsh	16, r2		; (c * b + a * d) << 16
	mpyi	r3, r0		; a * c
	addi	r2, r0		; a * c + (c * b + a * d) << 16
; branch occurs here

#endif

;
; Integer 64 by 64 multiply
; long1 and long2 on stack
; result in r0,r1
;
#ifdef L_muldi3
	.text
	.global ___mulhi3
#ifdef _TMS320C4x
___mulhi3:
	pop	ar0
	ldi	sp,ar2
	ldi	*-ar2(1),r2
	ldi	*-ar2(3),r3
	mpyi3	r2,r3,r0
	mpyuhi3	r2,r3,r1
	mpyi	*-ar2(2),r2
	bd	ar0
	mpyi	*-ar2(0),r3
	addi	r2,r1
	addi	r3,r1
#else
___mulhi3:
	ldi	sp,ar2
	ldi	-16,rs
	ldi	*-ar2(2),ar0
	ldi	*-ar2(4),ar1
	ldi	ar0,r2
	and	0ffffh,r2
	ldi	ar1,r3
	and	0ffffh,r3
	lsh	rs,ar0
	lsh	rs,ar1

	mpyi	r2,r3,r0
	mpyi	ar0,ar1,r1
	mpyi	r2,ar1,rc
	lsh	rs,rc,re
	addi	re,r1
	lsh	16,rc
	addi	rc,r0
	addc	0,r1
	mpyi	r3,ar0,rc
	lsh	rs,rc,re
	addi	re,r1
	lsh	16,rc
	addi	rc,r0
	addc	0,r1

	ldi	*-ar2(1),ar0
	ldi	ar0,r2
	and	0ffffh,r2
	lsh	rs,ar0
	mpyi	r2,r3,rc
	addi	rc,r1
	mpyi	r2,ar1,rc
	mpyi	r3,ar0,re
	addi	re,rc
	lsh	16,rc
	addi	rc,r1

	ldi	*-ar2(2),ar0
	ldi	*-ar2(3),ar1
	ldi	ar0,r2
	and	0ffffh,r2
	ldi	ar1,r3
	and	0ffffh,r3
	lsh	rs,ar0
	lsh	rs,ar1
	mpyi	r2,r3,rc
	addi	rc,r1
	mpyi	r2,ar1,rc
	mpyi	r3,ar0,re
	pop	ar0		; return address
	bd	ar0		; delayed return
	addi	re,rc
	lsh	16,rc
	addi	rc,r1
#endif
#endif

;
; Integer 32 by 32 multiply highpart unsigned
; src1 in ar2
; src2 in r2
; result in r0
;
#ifdef L_umuldi3_high
	.text
	.global ___umulhi3_high
___umulhi3_high:
	.if .REGPARM == 0
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	ldi	*-ar0(2), r2
	.endif

	ldi	-16,rs
	ldi	r2,r3
	and	0ffffh,r2
	ldi	ar2,ar1
	and	0ffffh,ar2
	lsh	rs,r3
	lsh	rs,ar1

	mpyi	ar2,r2,r1
	mpyi	ar1,r3,r0
	mpyi	ar2,r3,rc
	lsh	rs,rc,re
	addi	re,r0
	lsh	16,rc
	addi	rc,r1
	addc	0,r0
	mpyi	r2,ar1,rc
	lsh	rs,rc,re
	addi	re,r0
	pop	ar0		; return address
	bd	ar0		; delayed return
	lsh	16,rc
	addi	rc,r1
	addc	0,r0
#endif

;
; Integer 32 by 32 multiply highpart signed
; src1 in ar2
; src2 in r2
; result in r0
;
#ifdef L_smuldi3_high
	.text
	.global ___smulhi3_high
___smulhi3_high:
	.if .REGPARM == 0
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	ldi	*-ar0(2), r2
	.endif

	ldi	-16,rs
	ldi	0,rc
	subi3	ar2,rc,r0
	ldi	r2,r3
	ldilt	r0,rc
	subi3	r2,rc,r0
	ldi	ar2,ar1
	tstb	ar1,ar1
	ldilt	r0,rc
	and	0ffffh,r2
	and	0ffffh,ar2
	lsh	rs,r3
	lsh	rs,ar1

	mpyi	ar2,r2,r1
	mpyi	ar1,r3,r0
	addi	rc,r0
	mpyi	ar2,r3,rc
	lsh	rs,rc,re
	addi	re,r0
	lsh	16,rc
	addi	rc,r1
	addc	0,r0
	mpyi	r2,ar1,rc
	lsh	rs,rc,re
	addi	re,r0
	pop	ar0		; return address
	bd	ar0		; delayed return
	lsh	16,rc
	addi	rc,r1
	addc	0,r0
#endif

;
; Integer 64 by 64 unsigned divide
; long1 and long2 on stack
; divide in r0,r1
; modulo in r2,r3
; routine takes a maximum of 64*8+23=535 cycles = 21.4 us @ 50Mhz
;
; ___udivide and ___umodulo are entered with the operands already in
; r0,r1 and ar0,ar1; they are shared with the signed 64-bit routines.
;
#ifdef L_udivdi3
	.text
	.global ___udivhi3
	.global ___udivide
	.global ___umodulo
	.ref udivqi3n
	.ref umodqi3n
___udivhi3:
	ldi	sp,ar2
	ldi	*-ar2(4),ar0
	ldi	*-ar2(3),ar1
	ldi	*-ar2(2),r0
	ldi	*-ar2(1),r1

___udivide:
	or	r1,ar1,r2
	bne	udiv0
	ldi	ar0,r2
	ldi	r0,ar2
	call	udivqi3n
	ldiu	0,r1
	rets

___umodulo:
	or	r1,ar1,r2
	bne	udiv0
	ldi	ar0,r2
	ldi	r0,ar2
	call	umodqi3n
	ldi	r0,r2
	ldiu	0,r3
	rets

udiv0:
	tstb	ar1,ar1
	bne	udiv1
	tstb	ar0,ar0
	bn	udiv1

	ldiu	63,rc
#ifdef _TMS320C4x
	rptbd	udivend0
	ldiu	0,r2
	addi	r0,r0
	rolc	r1
#else
	ldiu	0,r2
	addi	r0,r0
	rolc	r1
	rptb	udivend0
#endif

	rolc	r2
	subi3	ar0,r2,r3
	ldinc	r3,r2
	rolc	r0
udivend0:
	rolc	r1

	not	r0
	not	r1
	ldiu	0,r3
	rets
udiv1:
	push	r4
	push	r5
	ldiu	63,rc
	ldiu	0,r2
#ifdef _TMS320C4x
	rptbd	udivend1
	ldiu	0,r3
	addi	r0,r0
	rolc	r1
#else
	ldiu	0,r3
	addi	r0,r0
	rolc	r1
	rptb	udivend1
#endif

	rolc	r2
	rolc	r3
	subi3	ar0,r2,r4
	subb3	ar1,r3,r5
	ldinc	r4,r2
	ldinc	r5,r3
	rolc	r0
udivend1:
	rolc	r1

	not	r0
	not	r1
	pop	r5
	pop	r4
	rets
#endif

;
; Integer 64 by 64 unsigned modulo
; long1 and long2 on stack
; result in r0,r1
;
#ifdef L_umoddi3
	.text
	.global ___umodhi3
	.ref ___umodulo
___umodhi3:
	ldi	sp,ar2
	ldi	*-ar2(4),ar0
	ldi	*-ar2(3),ar1
	ldi	*-ar2(2),r0
	ldi	*-ar2(1),r1
	call	___umodulo
	pop	ar0		; return address
	bd	ar0		; delayed return
	ldi	r2,r0
	ldi	r3,r1
	nop
#endif

;
; Integer 64 by 64 signed divide
; long1 and long2 on stack
; result in r0,r1
;
#ifdef L_divdi3
	.text
	.global ___divhi3
	.ref ___udivide
___divhi3:
	ldi	0,ir0		; ir0 is complemented once per negative operand
	ldi	sp,ar2
	ldi	*-ar2(4),r0
	ldi	*-ar2(3),r1
	bge	div1
	not	ir0
	negi	r0
	negb	r1
div1:
	ldi	r0,ar0
	ldi	r1,ar1
	ldi	*-ar2(2),r0
	ldi	*-ar2(1),r1
	bge	div2
	not	ir0
	negi	r0
	negb	r1
div2:
	call	___udivide
	tstb	ir0,ir0
	bge	div3
	negi	r0
	negb	r1
div3:
	rets
#endif

;
; Integer 64 by 64 signed modulo
; long1 and long2 on stack
; result in r0,r1
;
#ifdef L_moddi3
	.text
	.global ___modhi3
	.ref ___umodulo
___modhi3:
	ldi	0,ir0
	ldi	sp,ar2
	ldi	*-ar2(4),r0
	ldi	*-ar2(3),r1
	bge	mod1
	not	ir0
	negi	r0
	negb	r1
mod1:
	ldi	r0,ar0
	ldi	r1,ar1
	ldi	*-ar2(2),r0
	ldi	*-ar2(1),r1
	bge	mod2
	not	ir0
	negi	r0
	negb	r1
mod2:
	call	___umodulo
	ldi	r2,r0
	ldi	r3,r1
	tstb	ir0,ir0
	bge	mod3
	negi	r0
	negb	r1
mod3:
	rets
#endif

;
; double to signed long long conversion
; input in r2
; result in r0,r1
;
#ifdef L_fix_truncsfdi2
	.text
	.global ___fix_truncqfhi2
	.ref ufix_truncqfhi2n
___fix_truncqfhi2:
	.if .REGPARM == 0
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldf	*-ar0(1), r2
	.endif

	cmpf	0.0,r2
	bge	ufix_truncqfhi2n
	negf	r2
	call	ufix_truncqfhi2n
	negi	r0
	negb	r1
	rets
#endif

;
; double to unsigned long long conversion
; input in r2
; result in r0,r1
;
#ifdef L_ufix_truncsfdi2
	.text
	.global ___ufix_truncqfhi2
	.global ufix_truncqfhi2n
___ufix_truncqfhi2:
	.if .REGPARM == 0
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldf	*-ar0(1), r2
	.endif

ufix_truncqfhi2n:
	cmpf	0.0,r2
	ble	ufix1
	pushf	r2
	pop	r3
	ash	-24,r3
	subi	31,r3
	cmpi	32,r3
	bgt	ufix1
	cmpi	-32,r3
	ble	ufix1
	ldi	1,r0
	ash	31,r0
	or3	r0,r2,r0
	ldi	r0,r1
	lsh3	r3,r0,r0
	subi	32,r3
	cmpi	-32,r3
	ldile	0,r1
	lsh3	r3,r1,r1
	rets
ufix1:
	ldi	0,r0
	ldi	0,r1
	rets
#endif

;
; signed long long to double conversion
; input on stack
; result in r0
;
#ifdef L_floatdisf2
	.text
	.global ___floathiqf2
	.ref ufloathiqf2n
___floathiqf2:
	ldi	sp,ar2
	ldi	*-ar2(2),r0
	ldi	*-ar2(1),r1
	bge	ufloathiqf2n
	negi	r0
	negb	r1
	call	ufloathiqf2n
	negf	r0
	rets
#endif

;
; unsigned long long to double conversion
; input on stack
; result in r0
;
#ifdef L_ufloatdisf2
	.text
	.global ___ufloathiqf2
	.global ufloathiqf2n
	.ref ___unsfltconst
___ufloathiqf2:
	ldi	sp,ar2
	ldi	*-ar2(2),r0
	ldi	*-ar2(1),r1
ufloathiqf2n:
	.if .BIGMODEL
#ifdef _TMS320C4x
	ldpk	@___unsfltconst
#else
	ldp	@___unsfltconst
#endif
	.endif
	ldf	@___unsfltconst,r2
	float	r0
	bge	uflt1
	addf	r2,r0
uflt1:
	float	r1
	bge	uflt2
	addf	r2,r1
uflt2:
#ifdef _TMS320C4x
	pop	r3
	bd	r3
	mpyf	r2,r1
	addf	r1,r0
	nop
#else
	ldf	r1,r3
	and	0ffh,r3
	norm	r3,r3
	mpyf	r2,r3
	pop	ar2
	bd	ar2
	addf	r3,r0
	mpyf	r2,r1
	addf	r1,r0
#endif
#endif

;
; long double to signed long long conversion
; input in r2
; result in r0,r1
;
#ifdef L_fix_truncdfdi2
	.text
	.global ___fix_trunchfhi2
	.ref ufix_trunchfhi2n
___fix_trunchfhi2:
	.if .REGPARM == 0
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldf	*-ar0(2), r2
	ldi	*-ar0(1), r2
	.endif

	cmpf	0.0,r2
	bge	ufix_trunchfhi2n
	negf	r2
	call	ufix_trunchfhi2n
	negi	r0
	negb	r1
	rets
#endif

;
; long double to unsigned long long conversion
; input in r2
; result in r0,r1
;
#ifdef L_ufix_truncdfdi2
	.text
	.global ___ufix_trunchfhi2
	.global ufix_trunchfhi2n
___ufix_trunchfhi2:
	.if .REGPARM == 0
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldf	*-ar0(2), r2
	ldi	*-ar0(1), r2
	.endif

ufix_trunchfhi2n:
	cmpf	0.0,r2
	ble	ufixh1
	pushf	r2
	pop	r3
	ash	-24,r3
	subi	31,r3
	cmpi	32,r3
	bgt	ufixh1
	cmpi	-32,r3
	ble	ufixh1
	ldi	1,r0
	ash	31,r0
	or3	r0,r2,r0
	ldi	r0,r1
	lsh3	r3,r0,r0
	subi	32,r3
	cmpi	-32,r3
	ldile	0,r1
	lsh3	r3,r1,r1
	rets
ufixh1:
	ldi	0,r0
	ldi	0,r1
	rets
#endif

;
; signed long long to long double conversion
; input on stack
; result in r0
;
#ifdef L_floatdidf2
	.text
	.global ___floathihf2
	.ref ufloathihf2n
___floathihf2:
	ldi	sp,ar2
	ldi	*-ar2(2),r0
	ldi	*-ar2(1),r1
	bge	ufloathihf2n
	negi	r0
	negb	r1
	call	ufloathihf2n
	negf	r0
	rets
#endif

;
; unsigned long long to long double conversion
; input on stack
; result in r0
;
#ifdef L_ufloatdidf2
	.text
	.global ___ufloathihf2
	.global ufloathihf2n
	.ref ___unsfltconst
___ufloathihf2:
	ldi	sp,ar2
	ldi	*-ar2(2),r0
	ldi	*-ar2(1),r1
ufloathihf2n:
	.if .BIGMODEL
#ifdef _TMS320C4x
	ldpk	@___unsfltconst
#else
	ldp	@___unsfltconst
#endif
	.endif
	ldf	@___unsfltconst,r2
	float	r0
	bge	uflth1
	addf	r2,r0
uflth1:
	float	r1
	bge	uflth2
	addf	r2,r1
uflth2:
#ifdef _TMS320C4x
	pop	r3
	bd	r3
	mpyf	r2,r1
	addf	r1,r0
	nop
#else
	ldf	r1,r3
	and	0ffh,r3
	norm	r3,r3
	mpyf	r2,r3
	pop	ar2
	bd	ar2
	addf	r3,r0
	mpyf	r2,r1
	addf	r1,r0
#endif
#endif

;
; calculate ffs
; input in ar2
; result in r0
;
#ifdef L_ffs
	.global ___ffs
	.ref ___unsfltconst
	.text
___ffs:
	.if .REGPARM == 0
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	.endif

	negi	ar2,r0
	and	ar2,r0		; r0 = ar2 & -ar2 (isolate lowest set bit)
	float	r0,r0
	ldfu	0.0,r1
	.if .BIGMODEL
#ifdef _TMS320C4x
	ldpk	@___unsfltconst
#else
	ldp	@___unsfltconst
#endif
	.endif
	ldflt	@___unsfltconst,r1
	addf	r1,r0
	pushf	r0
	pop	r0
	pop	ar0		; return address
	bd	ar0		; delayed return
	ash	-24,r0
	ldilt	-1,r0
	addi	1,r0
#endif

;
; calculate long double * long double
; input in r2, r3
; output in r0
;
#ifdef L_muldf3
	.global ___mulhf3
	.text
___mulhf3:
	.if .REGPARM == 0
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldf	*-ar0(2), r2
	ldi	*-ar0(1), r2
	ldf	*-ar0(4), r3
	ldi	*-ar0(3), r3
	.endif

	pop	ar2		; return ad
	ldf	r2,r0		; copy lsb0
	ldf	r3,r1		; copy lsb1
	and	0ffh,r0		; mask lsb0
	and	0ffh,r1		; mask lsb1
	norm	r0,r0		; correct lsb0
	norm	r1,r1		; correct lsb1
	mpyf	r2,r1		; arg0*lsb1
	mpyf	r3,r0		; arg1*lsb0
	bd	ar2		; return (delayed)
	addf	r0,r1		; arg0*lsb1 + arg1*lsb0
	mpyf	r2,r3,r0	; msb0*msb1
	addf	r1,r0		; msb0*msb1 + arg0*lsb1 + arg1*lsb0
#endif

;
; calculate long double / long double
; r2 dividend, r3 divisor, r0 quotient
;
#ifdef L_divdf3
	.global ___divhf3
	.text
___divhf3:
	.if .REGPARM == 0
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldf	*-ar0(2), r2
	ldi	*-ar0(1), r2
	ldf	*-ar0(4), r3
	ldi	*-ar0(3), r3
	.endif

#ifdef _TMS320C4x
	pop	ar1		; return address
	rcpf	r3, r0
	mpyf3	r0, r3, r1
	subrf	2.0, r1
	mpyf	r1, r0
	mpyf3	r0, r3, r1
	bud	ar1		; delayed return
	subrf	2.0, r1
	mpyf	r1, r0
	mpyf	r2, r0
#else
	pop	ar1		; return address
	pushf	r3
	pop	r0
	not	r0
	push	r0
	popf	r0
	ldf	-1.0, r1
	xor	r1, r0

	mpyf3	r0, r3, r1	; r1 = r[0] * v
	subrf	2.0, r1		; r1 = 2.0 - r[0] * v
	mpyf	r1, r0		; r0 = r[0] * (2.0 - r[0] * v) = r[1]
; End of 1st iteration

	mpyf3	r0, r3, r1	; r1 = r[1] * v
	subrf	2.0, r1		; r1 = 2.0 - r[1] * v
	mpyf	r1, r0		; r0 = r[1] * (2.0 - r[1] * v) = r[2]
; End of 2nd iteration

	mpyf3	r0, r3, r1	; r1 = r[2] * v
	subrf	2.0, r1		; r1 = 2.0 - r[2] * v
	mpyf	r1, r0		; r0 = r[2] * (2.0 - r[2] * v) = r[3]
; End of 3rd iteration

	or	080h, r0
	rnd	r0

;	mpyf3	r0, r3, r1	; r1 = r[3] * v
	push	r4		; r4 is used as scratch; save both its
	pushf	r4		; integer and float views
	mpyf	r0, r3, r1

	ldf	r0, r4
	and	0ffh, r4
	norm	r4, r4
	mpyf	r3, r4
	addf	r4, r1

	ldf	r3, r4
	and	0ffh, r4
	norm	r4, r4
	mpyf	r0, r4
	addf	r4, r1

	subrf	2.0, r1		; r1 = 2.0 - r[3] * v

	mpyf	r1, r0, r3	; r3 = r[3] * (2.0 - r[3] * v) = r[4]

	ldf	r1, r4
	and	0ffh, r4
	norm	r4, r4
	mpyf	r0, r4
	addf	r4, r3

	ldf	r0, r4
	and	0ffh, r4
	norm	r4, r4
	mpyf	r1, r4
	addf	r4, r3

	mpyf	r2, r3, r0	; Multiply by the dividend

	ldf	r2, r4
	and	0ffh, r4
	norm	r4, r4
	mpyf	r3, r4
	addf	r4, r0

	ldf	r3, r4
	and	0ffh, r4
	norm	r4, r4
	mpyf	r2, r4
	bd	ar1		; delayed return
	addf	r4, r0

	popf	r4		; restore r4
	pop	r4
#endif
#endif