1/* -*- Mode: Asm -*- */ 2;; Copyright (C) 2012-2021 Free Software Foundation, Inc. 3;; Contributed by Sean D'Epagnier (sean@depagnier.com) 4;; Georg-Johann Lay (avr@gjlay.de) 5 6;; This file is free software; you can redistribute it and/or modify it 7;; under the terms of the GNU General Public License as published by the 8;; Free Software Foundation; either version 3, or (at your option) any 9;; later version. 10 11;; In addition to the permissions in the GNU General Public License, the 12;; Free Software Foundation gives you unlimited permission to link the 13;; compiled version of this file into combinations with other programs, 14;; and to distribute those combinations without any restriction coming 15;; from the use of this file. (The General Public License restrictions 16;; do apply in other respects; for example, they cover modification of 17;; the file, and distribution when not linked into a combine 18;; executable.) 19 20;; This file is distributed in the hope that it will be useful, but 21;; WITHOUT ANY WARRANTY; without even the implied warranty of 22;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 23;; General Public License for more details. 24 25;; You should have received a copy of the GNU General Public License 26;; along with this program; see the file COPYING. If not, write to 27;; the Free Software Foundation, 51 Franklin Street, Fifth Floor, 28;; Boston, MA 02110-1301, USA. 
29 30;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 31;; Fixed point library routines for AVR 32;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 33 34#if defined __AVR_TINY__ 35#define __zero_reg__ r17 36#define __tmp_reg__ r16 37#else 38#define __zero_reg__ r1 39#define __tmp_reg__ r0 40#endif 41 42.section .text.libgcc.fixed, "ax", @progbits 43 44#ifndef __AVR_TINY__ 45 46;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 47;; Conversions to float 48;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 49 50#if defined (L_fractqqsf) 51DEFUN __fractqqsf 52 ;; Move in place for SA -> SF conversion 53 clr r22 54 mov r23, r24 55 ;; Sign-extend 56 lsl r24 57 sbc r24, r24 58 mov r25, r24 59 XJMP __fractsasf 60ENDF __fractqqsf 61#endif /* L_fractqqsf */ 62 63#if defined (L_fractuqqsf) 64DEFUN __fractuqqsf 65 ;; Move in place for USA -> SF conversion 66 clr r22 67 mov r23, r24 68 ;; Zero-extend 69 clr r24 70 clr r25 71 XJMP __fractusasf 72ENDF __fractuqqsf 73#endif /* L_fractuqqsf */ 74 75#if defined (L_fracthqsf) 76DEFUN __fracthqsf 77 ;; Move in place for SA -> SF conversion 78 wmov 22, 24 79 ;; Sign-extend 80 lsl r25 81 sbc r24, r24 82 mov r25, r24 83 XJMP __fractsasf 84ENDF __fracthqsf 85#endif /* L_fracthqsf */ 86 87#if defined (L_fractuhqsf) 88DEFUN __fractuhqsf 89 ;; Move in place for USA -> SF conversion 90 wmov 22, 24 91 ;; Zero-extend 92 clr r24 93 clr r25 94 XJMP __fractusasf 95ENDF __fractuhqsf 96#endif /* L_fractuhqsf */ 97 98#if defined (L_fracthasf) 99DEFUN __fracthasf 100 ;; Move in place for SA -> SF conversion 101 clr r22 102 mov r23, r24 103 mov r24, r25 104 ;; Sign-extend 105 lsl r25 106 sbc r25, r25 107 XJMP __fractsasf 108ENDF __fracthasf 109#endif /* L_fracthasf */ 110 111#if defined (L_fractuhasf) 112DEFUN __fractuhasf 113 ;; Move in place for USA -> SF conversion 114 clr r22 115 mov r23, r24 116 mov r24, r25 117 ;; Zero-extend 118 clr r25 119 XJMP __fractusasf 120ENDF __fractuhasf 121#endif /* L_fractuhasf */ 122 123 124#if defined (L_fractsqsf) 125DEFUN 
__fractsqsf 126 XCALL __floatsisf 127 ;; Divide non-zero results by 2^31 to move the 128 ;; decimal point into place 129 tst r25 130 breq 0f 131 subi r24, exp_lo (31) 132 sbci r25, exp_hi (31) 1330: ret 134ENDF __fractsqsf 135#endif /* L_fractsqsf */ 136 137#if defined (L_fractusqsf) 138DEFUN __fractusqsf 139 XCALL __floatunsisf 140 ;; Divide non-zero results by 2^32 to move the 141 ;; decimal point into place 142 cpse r25, __zero_reg__ 143 subi r25, exp_hi (32) 144 ret 145ENDF __fractusqsf 146#endif /* L_fractusqsf */ 147 148#if defined (L_fractsasf) 149DEFUN __fractsasf 150 XCALL __floatsisf 151 ;; Divide non-zero results by 2^15 to move the 152 ;; decimal point into place 153 tst r25 154 breq 0f 155 subi r24, exp_lo (15) 156 sbci r25, exp_hi (15) 1570: ret 158ENDF __fractsasf 159#endif /* L_fractsasf */ 160 161#if defined (L_fractusasf) 162DEFUN __fractusasf 163 XCALL __floatunsisf 164 ;; Divide non-zero results by 2^16 to move the 165 ;; decimal point into place 166 cpse r25, __zero_reg__ 167 subi r25, exp_hi (16) 168 ret 169ENDF __fractusasf 170#endif /* L_fractusasf */ 171 172;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 173;; Conversions from float 174;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 175 176#if defined (L_fractsfqq) 177DEFUN __fractsfqq 178 ;; Multiply with 2^{24+7} to get a QQ result in r25 179 subi r24, exp_lo (-31) 180 sbci r25, exp_hi (-31) 181 XCALL __fixsfsi 182 mov r24, r25 183 ret 184ENDF __fractsfqq 185#endif /* L_fractsfqq */ 186 187#if defined (L_fractsfuqq) 188DEFUN __fractsfuqq 189 ;; Multiply with 2^{24+8} to get a UQQ result in r25 190 subi r25, exp_hi (-32) 191 XCALL __fixunssfsi 192 mov r24, r25 193 ret 194ENDF __fractsfuqq 195#endif /* L_fractsfuqq */ 196 197#if defined (L_fractsfha) 198DEFUN __fractsfha 199 ;; Multiply with 2^{16+7} to get a HA result in r25:r24 200 subi r24, exp_lo (-23) 201 sbci r25, exp_hi (-23) 202 XJMP __fixsfsi 203ENDF __fractsfha 204#endif /* L_fractsfha */ 205 206#if defined (L_fractsfuha) 207DEFUN 
__fractsfuha 208 ;; Multiply with 2^24 to get a UHA result in r25:r24 209 subi r25, exp_hi (-24) 210 XJMP __fixunssfsi 211ENDF __fractsfuha 212#endif /* L_fractsfuha */ 213 214#if defined (L_fractsfhq) 215FALIAS __fractsfsq 216 217DEFUN __fractsfhq 218 ;; Multiply with 2^{16+15} to get a HQ result in r25:r24 219 ;; resp. with 2^31 to get a SQ result in r25:r22 220 subi r24, exp_lo (-31) 221 sbci r25, exp_hi (-31) 222 XJMP __fixsfsi 223ENDF __fractsfhq 224#endif /* L_fractsfhq */ 225 226#if defined (L_fractsfuhq) 227FALIAS __fractsfusq 228 229DEFUN __fractsfuhq 230 ;; Multiply with 2^{16+16} to get a UHQ result in r25:r24 231 ;; resp. with 2^32 to get a USQ result in r25:r22 232 subi r25, exp_hi (-32) 233 XJMP __fixunssfsi 234ENDF __fractsfuhq 235#endif /* L_fractsfuhq */ 236 237#if defined (L_fractsfsa) 238DEFUN __fractsfsa 239 ;; Multiply with 2^15 to get a SA result in r25:r22 240 subi r24, exp_lo (-15) 241 sbci r25, exp_hi (-15) 242 XJMP __fixsfsi 243ENDF __fractsfsa 244#endif /* L_fractsfsa */ 245 246#if defined (L_fractsfusa) 247DEFUN __fractsfusa 248 ;; Multiply with 2^16 to get a USA result in r25:r22 249 subi r25, exp_hi (-16) 250 XJMP __fixunssfsi 251ENDF __fractsfusa 252#endif /* L_fractsfusa */ 253 254 255;; For multiplication the functions here are called directly from 256;; avr-fixed.md instead of using the standard libcall mechanisms. 257;; This can make better code because GCC knows exactly which 258;; of the call-used registers (not all of them) are clobbered. */ 259 260/******************************************************* 261 Fractional Multiplication 8 x 8 without MUL 262*******************************************************/ 263 264#if defined (L_mulqq3) && !defined (__AVR_HAVE_MUL__) 265;;; R23 = R24 * R25 266;;; Clobbers: __tmp_reg__, R22, R24, R25 267;;; Rounding: ??? 268DEFUN __mulqq3 269 XCALL __fmuls 270 ;; TR 18037 requires that (-1) * (-1) does not overflow 271 ;; The only input that can produce -1 is (-1)^2. 
272 dec r23 273 brvs 0f 274 inc r23 2750: ret 276ENDF __mulqq3 277#endif /* L_mulqq3 && ! HAVE_MUL */ 278 279/******************************************************* 280 Fractional Multiply .16 x .16 with and without MUL 281*******************************************************/ 282 283#if defined (L_mulhq3) 284;;; Same code with and without MUL, but the interfaces differ: 285;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25) 286;;; Clobbers: ABI, called by optabs 287;;; MUL: (R25:R24) = (R19:R18) * (R27:R26) 288;;; Clobbers: __tmp_reg__, R22, R23 289;;; Rounding: -0.5 LSB <= error <= 0.5 LSB 290DEFUN __mulhq3 291 XCALL __mulhisi3 292 ;; Shift result into place 293 lsl r23 294 rol r24 295 rol r25 296 brvs 1f 297 ;; Round 298 sbrc r23, 7 299 adiw r24, 1 300 ret 3011: ;; Overflow. TR 18037 requires (-1)^2 not to overflow 302 ldi r24, lo8 (0x7fff) 303 ldi r25, hi8 (0x7fff) 304 ret 305ENDF __mulhq3 306#endif /* defined (L_mulhq3) */ 307 308#if defined (L_muluhq3) 309;;; Same code with and without MUL, but the interfaces differ: 310;;; no MUL: (R25:R24) *= (R23:R22) 311;;; Clobbers: ABI, called by optabs 312;;; MUL: (R25:R24) = (R19:R18) * (R27:R26) 313;;; Clobbers: __tmp_reg__, R22, R23 314;;; Rounding: -0.5 LSB < error <= 0.5 LSB 315DEFUN __muluhq3 316 XCALL __umulhisi3 317 ;; Round 318 sbrc r23, 7 319 adiw r24, 1 320 ret 321ENDF __muluhq3 322#endif /* L_muluhq3 */ 323 324 325/******************************************************* 326 Fixed Multiply 8.8 x 8.8 with and without MUL 327*******************************************************/ 328 329#if defined (L_mulha3) 330;;; Same code with and without MUL, but the interfaces differ: 331;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25) 332;;; Clobbers: ABI, called by optabs 333;;; MUL: (R25:R24) = (R19:R18) * (R27:R26) 334;;; Clobbers: __tmp_reg__, R22, R23 335;;; Rounding: -0.5 LSB <= error <= 0.5 LSB 336DEFUN __mulha3 337 XCALL __mulhisi3 338 lsl r22 339 rol r23 340 rol r24 341 XJMP __muluha3_round 342ENDF __mulha3 
#endif /* L_mulha3 */

#if defined (L_muluha3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL:  (R25:R24) *= (R23:R22)
;;;          Clobbers: ABI, called by optabs
;;; MUL:     (R25:R24) = (R19:R18) * (R27:R26)
;;;          Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB  <  error  <=  0.5 LSB
DEFUN __muluha3
    XCALL   __umulhisi3
    XJMP    __muluha3_round
ENDF __muluha3
#endif /* L_muluha3 */

#if defined (L_muluha3_round)
;; Common tail for __mulha3 / __muluha3: drop the low byte of the
;; 32-bit product and round to nearest on its MSB.
DEFUN __muluha3_round
    ;; Shift result into place
    mov     r25, r24
    mov     r24, r23
    ;; Round
    sbrc    r22, 7
    adiw    r24, 1
    ret
ENDF __muluha3_round
#endif /* L_muluha3_round */


/*******************************************************
    Fixed Multiplication  16.16 x 16.16
*******************************************************/

;; Bits outside the result (below LSB), used in the signed version
#define GUARD __tmp_reg__

#if defined (__AVR_HAVE_MUL__)

;; Multiplier
#define A0  16
#define A1  A0+1
#define A2  A1+1
#define A3  A2+1

;; Multiplicand
#define B0  20
#define B1  B0+1
#define B2  B1+1
#define B3  B2+1

;; Result
#define C0  24
#define C1  C0+1
#define C2  C1+1
#define C3  C2+1

#if defined (L_mulusa3)
;;; (C3:C0) = (A3:A0) * (B3:B0)
DEFUN __mulusa3
    set
    ;; Fallthru
ENDF  __mulusa3

;;; Round for last digit iff T = 1
;;; Return guard bits in GUARD (__tmp_reg__).
;;; Rounding, T = 0:  -1.0 LSB  <  error  <=  0   LSB
;;; Rounding, T = 1:  -0.5 LSB  <  error  <=  0.5 LSB
DEFUN __mulusa3_round
    ;; Some of the MUL instructions have LSBs outside the result.
    ;; Don't ignore these LSBs in order to tame rounding error.
    ;; Use C2/C3 for these LSBs.
    ;; ($ separates two instructions written on one line.)

    clr C0
    clr C1
    mul A0, B0  $  movw C2, r0

    mul A1, B0  $  add C3, r0  $  adc C0, r1
    mul A0, B1  $  add C3, r0  $  adc C0, r1  $  rol C1

    ;; Round if T = 1.  Store guarding bits outside the result for rounding
    ;; and left-shift by the signed version (function below).
    brtc 0f
    sbrc C3, 7
    adiw C0, 1
0:  push C3

    ;; The following MULs don't have LSBs outside the result.
    ;; C2/C3 is the high part.

    mul A0, B2  $  add C0, r0  $  adc C1, r1  $  sbc  C2, C2
    mul A1, B1  $  add C0, r0  $  adc C1, r1  $  sbci C2, 0
    mul A2, B0  $  add C0, r0  $  adc C1, r1  $  sbci C2, 0
    neg C2

    mul A0, B3  $  add C1, r0  $  adc C2, r1  $  sbc  C3, C3
    mul A1, B2  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    mul A2, B1  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    mul A3, B0  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    neg C3

    mul A1, B3  $  add C2, r0  $  adc C3, r1
    mul A2, B2  $  add C2, r0  $  adc C3, r1
    mul A3, B1  $  add C2, r0  $  adc C3, r1

    mul A2, B3  $  add C3, r0
    mul A3, B2  $  add C3, r0

    ;; Guard bits used in the signed version below.
    pop GUARD
    clr __zero_reg__
    ret
ENDF __mulusa3_round
#endif /* L_mulusa3 */

#if defined (L_mulsa3)
;;; (C3:C0) = (A3:A0) * (B3:B0)
;;; Clobbers: __tmp_reg__, T
;;; Rounding:  -0.5 LSB <= error <= 0.5 LSB
DEFUN __mulsa3
    clt
    XCALL   __mulusa3_round
    ;; A posteriori sign extension of the operands
    tst     B3
    brpl 1f
    sub     C2, A0
    sbc     C3, A1
1:  sbrs    A3, 7
    rjmp 2f
    sub     C2, B0
    sbc     C3, B1
2:
    ;;  Shift 1 bit left to adjust for 15 fractional bits
    lsl     GUARD
    rol     C0
    rol     C1
    rol     C2
    rol     C3
    ;; Round last digit
    lsl     GUARD
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    ret
ENDF __mulsa3
#endif /* L_mulsa3 */

#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3

#else /* __AVR_HAVE_MUL__ */

#define A0 18
#define A1 A0+1
#define A2 A0+2
#define A3 A0+3

#define B0 22
#define B1 B0+1
#define B2 B0+2
#define B3 B0+3

#define C0 22
#define C1 C0+1
#define C2 C0+2
#define C3 C0+3

;; __tmp_reg__
#define CC0 0
;; __zero_reg__
#define CC1 1
#define CC2 16
#define CC3 17

#define AA0 26
#define AA1 AA0+1
#define AA2 30
#define AA3 AA2+1

#if defined (L_mulsa3)
;;; (R25:R22) *= (R21:R18)
;;; Clobbers: ABI, called by optabs
;;; Rounding:  -1 LSB <= error <= 1 LSB
;; Shift-and-add version for devices without the MUL instruction:
;; calls the unsigned worker with T = 0 and then sign-corrects.
DEFUN __mulsa3
    push    B0
    push    B1
    push    B3
    clt
    XCALL   __mulusa3_round
    pop     r30
    ;; sign-extend B
    bst     r30, 7
    brtc 1f
    ;; A1, A0 survived in  R27:R26
    sub     C2, AA0
    sbc     C3, AA1
1:
    pop     AA1  ;; B1
    pop     AA0  ;; B0

    ;; sign-extend A.  A3 survived in  R31
    bst     AA3, 7
    brtc 2f
    sub     C2, AA0
    sbc     C3, AA1
2:
    ;;  Shift 1 bit left to adjust for 15 fractional bits
    lsl     GUARD
    rol     C0
    rol     C1
    rol     C2
    rol     C3
    ;; Round last digit
    lsl     GUARD
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    ret
ENDF __mulsa3
#endif /* L_mulsa3 */

#if defined (L_mulusa3)
;;; (R25:R22) *= (R21:R18)
;;; Clobbers: ABI, called by optabs
;;; Rounding:  -1 LSB <= error <= 1 LSB
DEFUN __mulusa3
    set
    ;; Fallthru
ENDF  __mulusa3

;;; A[] survives in 26, 27, 30, 31
;;; Also used by __mulsa3 with T = 0
;;; Round if T = 1
;;; Return Guard bits in GUARD (__tmp_reg__), used by signed version.
DEFUN __mulusa3_round
    push    CC2
    push    CC3
    ; clear result
    clr     __tmp_reg__
    wmov    CC2, CC0
    ; save multiplicand
    wmov    AA0, A0
    wmov    AA2, A2
    rjmp 3f

    ;; Loop the integral part
    ;; ($ separates two instructions written on one line.)

1:  ;; CC += A * 2^n;  n >= 0
    add  CC0,A0  $  adc CC1,A1  $  adc  CC2,A2  $  adc  CC3,A3

2:  ;; A <<= 1
    lsl  A0      $  rol A1      $  rol  A2      $  rol  A3

3:  ;; IBIT(B) >>= 1
    ;; Carry = n-th bit of B;  n >= 0
    lsr     B3
    ror     B2
    brcs 1b
    ;; Loop until B (integral part) is 0
    sbci    B3, 0
    brne 2b

    ;; Loop the fractional part
    ;; B2/B3 is 0 now, use as guard bits for rounding
    ;; Restore multiplicand
    wmov    A0, AA0
    wmov    A2, AA2
    rjmp 5f

4:  ;; CC += A:Guard * 2^n;  n < 0
    add  B3,B2  $  adc  CC0,A0  $  adc  CC1,A1  $  adc  CC2,A2  $  adc  CC3,A3
5:
    ;; A:Guard >>= 1
    lsr  A3  $  ror  A2  $  ror  A1  $  ror A0  $  ror B2

    ;; FBIT(B) <<= 1
    ;; Carry = n-th bit of B;  n < 0
    lsl     B0
    rol     B1
    brcs 4b
    ;; Loop until B (fractional part) is 0
    sbci    B0, 0
    brne 5b

    ;; Save guard bits and set carry for rounding
    push    B3
    lsl     B3
    ;; Move result into place
    wmov    C2, CC2
    wmov    C0, CC0
    clr     __zero_reg__
    brtc 6f
    ;; Round iff T = 1
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
6:
    pop     GUARD
    ;; Epilogue
    pop     CC3
    pop     CC2
    ret
ENDF __mulusa3_round
#endif /* L_mulusa3 */

#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3
#undef AA0
#undef AA1
#undef AA2
#undef AA3
#undef CC0
#undef CC1
#undef CC2
#undef CC3

#endif /* __AVR_HAVE_MUL__ */

#undef GUARD

/***********************************************************
    Fixed  unsigned saturated Multiplication  8.8 x 8.8
***********************************************************/

#define C0  22
#define C1  C0+1
#define C2  C0+2
#define C3  C0+3
#define SS __tmp_reg__

#if defined (L_usmuluha3)
DEFUN __usmuluha3
    ;; Widening multiply
#ifdef __AVR_HAVE_MUL__
    ;; Adjust interface
    movw    R26, R22
    movw    R18, R24
#endif /* HAVE MUL */
    XCALL   __umulhisi3
    tst     C3
    brne .Lmax
    ;; Round, target is in C1..C2
    lsl     C0
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    brcs .Lmax
    ;; Move result into place
    mov     C3, C2
    mov     C2, C1
    ret
.Lmax:
    ;; Saturate
    ldi     C2, 0xff
    ldi     C3, 0xff
    ret
ENDF  __usmuluha3
#endif /* L_usmuluha3 */

/***********************************************************
    Fixed  signed saturated Multiplication  s8.7 x s8.7
***********************************************************/

#if defined (L_ssmulha3)
DEFUN __ssmulha3
    ;; Widening multiply
#ifdef __AVR_HAVE_MUL__
    ;; Adjust interface
    movw    R26, R22
    movw    R18, R24
#endif /* HAVE MUL */
    XCALL   __mulhisi3
    ;; Adjust decimal point
    lsl     C0
    rol     C1
    rol     C2
    brvs .LsatC3.3
    ;; The 9 MSBs must be the same
    rol     C3
    sbc     SS, SS
    cp      C3, SS
    brne .LsatSS
    ;; Round
    lsl     C0
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    brvs .Lmax
    ;; Move result into place
    mov     C3, C2
    mov     C2, C1
    ret
.Lmax:
    ;; Load 0x7fff
    clr     C3
.LsatC3.3:
    ;; C3 <  0 -->  0x8000
    ;; C3 >= 0 -->  0x7fff
    mov     SS, C3
.LsatSS:
    ;; Load min / max value:
    ;; SS = -1  -->  0x8000
    ;; SS =  0  -->  0x7fff
    ldi     C3, 0x7f
    ldi     C2, 0xff
    sbrc    SS, 7
    adiw    C2, 1
    ret
ENDF __ssmulha3
#endif /* L_ssmulha3 */

#undef C0
#undef C1
#undef C2
#undef C3
#undef SS

/***********************************************************
    Fixed  unsigned saturated Multiplication  16.16 x 16.16
***********************************************************/

#define C0  18
#define C1  C0+1
#define C2  C0+2
#define C3  C0+3
#define C4  C0+4
#define C5  C0+5
#define C6  C0+6
#define C7  C0+7
#define SS __tmp_reg__

#if defined (L_usmulusa3)
;; R22[4] = R22[4] *{ssat} R18[4]
;; Ordinary ABI function
DEFUN __usmulusa3
    ;; Widening multiply
    XCALL   __umulsidi3
    ;; Any set bit in the high 16 bits of the 64-bit product
    ;; means the result is out of range.
    or      C7, C6
    brne .Lmax
    ;; Round, target is in C2..C5
    lsl     C1
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    adc     C4, __zero_reg__
    adc     C5, __zero_reg__
    brcs .Lmax
    ;; Move result into place
    wmov    C6, C4
    wmov    C4, C2
    ret
.Lmax:
    ;; Saturate
    ldi     C7, 0xff
    ldi     C6, 0xff
    wmov    C4, C6
    ret
ENDF  __usmulusa3
#endif /* L_usmulusa3 */

/***********************************************************
    Fixed  signed saturated Multiplication  s16.15 x s16.15
***********************************************************/

#if defined (L_ssmulsa3)
;; R22[4] = R22[4] *{ssat} R18[4]
;; Ordinary ABI function
DEFUN __ssmulsa3
    ;; Widening multiply
    XCALL   __mulsidi3
    ;; Adjust decimal point
    lsl     C1
    rol     C2
    rol     C3
    rol     C4
    rol     C5
    brvs .LsatC7.7
    ;; The 17 MSBs must be the same
    rol     C6
    rol     C7
    sbc     SS, SS
    cp      C6, SS
    cpc     C7, SS
    brne .LsatSS
    ;; Round
    lsl     C1
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    adc     C4, __zero_reg__
    adc     C5, __zero_reg__
    brvs .Lmax
    ;; Move result into place
    wmov    C6, C4
    wmov    C4, C2
    ret

.Lmax:
    ;; Load 0x7fffffff
    clr     C7
.LsatC7.7:
    ;; C7 <  0 -->  0x80000000
    ;; C7 >= 0 -->  0x7fffffff
    lsl     C7
    sbc     SS, SS
.LsatSS:
    ;; Load min / max value:
    ;; SS = -1  -->  0x80000000
    ;; SS =  0  -->  0x7fffffff
    com     SS
    mov     C4, SS
    mov     C5, C4
    wmov    C6, C4
    subi    C7, 0x80
    ret
ENDF __ssmulsa3
#endif /* L_ssmulsa3 */

#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#undef SS

/*******************************************************
    Fractional Division 8 / 8
*******************************************************/

#define r_divd  r25     /* dividend */
#define r_quo   r24     /* quotient */
#define r_div   r22     /* divisor */
#define r_sign  __tmp_reg__

#if defined (L_divqq3)
;; Signed Q7 division: make both operands non-negative, use the
;; unsigned helper, then restore the sign from the XOR of the inputs.
DEFUN __divqq3
    mov     r_sign, r_divd
    eor     r_sign, r_div
    sbrc    r_div, 7
    neg     r_div
    sbrc    r_divd, 7
    neg     r_divd
    XCALL   __divqq_helper
    lsr     r_quo
    sbrc    r_sign, 7   ; negate result if needed
    neg     r_quo
    ret
ENDF __divqq3
#endif  /* L_divqq3 */

#if defined (L_udivuqq3)
DEFUN __udivuqq3
    cp      r_divd, r_div
    brsh    0f
    XJMP    __divqq_helper
    ;; Result is out of [0, 1)  ==>  Return 1 - eps.
0:  ldi     r_quo, 0xff
    ret
ENDF __udivuqq3
#endif  /* L_udivuqq3 */


#if defined (L_divqq_helper)
;; Restoring shift-and-subtract division, 8 iterations.
;; Uses __zero_reg__ as a one-bit-per-step loop counter; it is 0 again
;; when the loop ends.
DEFUN __divqq_helper
    clr     r_quo           ; clear quotient
    inc     __zero_reg__    ; init loop counter, used per shift
__udivuqq3_loop:
    lsl     r_divd          ; shift dividend
    brcs    0f              ; dividend overflow
    cp      r_divd,r_div    ; compare dividend & divisor
    brcc    0f              ; dividend >= divisor
    rol     r_quo           ; shift quotient (with CARRY)
    rjmp    __udivuqq3_cont
0:
    sub     r_divd,r_div    ; restore dividend
    lsl     r_quo           ; shift quotient (without CARRY)
__udivuqq3_cont:
    lsl     __zero_reg__    ; shift loop-counter bit
    brne    __udivuqq3_loop
    com     r_quo           ; complement result
                            ; because C flag was complemented in loop
    ret
ENDF __divqq_helper
#endif  /* L_divqq_helper */

#undef  r_divd
#undef  r_quo
#undef  r_div
#undef  r_sign


/*******************************************************
    Fractional Division 16 / 16
*******************************************************/
#define r_divdL 26     /* dividend Low */
#define r_divdH 27     /* dividend Hig */
#define r_quoL  24     /* quotient Low */
#define r_quoH  25     /* quotient High */
#define r_divL  22     /* divisor */
#define r_divH  23     /* divisor */
#define r_cnt   21

#if defined (L_divhq3)
;; Signed HQ division via the unsigned routine; sign of the result is
;; computed into r0 up front from the operands' high bytes.
DEFUN __divhq3
    mov     r0, r_divdH
    eor     r0, r_divH
    sbrs    r_divH, 7
    rjmp    1f
    NEG2    r_divL
1:
    sbrs    r_divdH, 7
    rjmp    2f
    NEG2    r_divdL
2:
    cp      r_divdL, r_divL
    cpc     r_divdH, r_divH
    breq    __divhq3_minus1  ; if equal return -1
    XCALL   __udivuhq3
    lsr     r_quoH
    ror     r_quoL
    brpl    9f
    ;; negate result if needed
    NEG2    r_quoL
9:
    ret
__divhq3_minus1:
    ldi     r_quoH, 0x80
    clr     r_quoL
    ret
ENDF __divhq3
#endif  /* defined (L_divhq3) */

#if defined (L_udivuhq3)
DEFUN __udivuhq3
    sub     r_quoH,r_quoH   ; clear quotient and carry
    ;; FALLTHRU
ENDF __udivuhq3

;; 16-iteration restoring division loop, shared with __udivuha3.
DEFUN __udivuha3_common
    clr     r_quoL          ; clear quotient
    ldi     r_cnt,16        ; init loop counter
__udivuhq3_loop:
    rol     r_divdL         ; shift dividend (with CARRY)
    rol     r_divdH
    brcs    __udivuhq3_ep   ; dividend overflow
    cp      r_divdL,r_divL  ; compare dividend & divisor
    cpc     r_divdH,r_divH
    brcc    __udivuhq3_ep   ; dividend >= divisor
    rol     r_quoL          ; shift quotient (with CARRY)
    rjmp    __udivuhq3_cont
__udivuhq3_ep:
    sub     r_divdL,r_divL  ; restore dividend
    sbc     r_divdH,r_divH
    lsl     r_quoL          ; shift quotient (without CARRY)
__udivuhq3_cont:
    rol     r_quoH          ; shift quotient
    dec     r_cnt           ; decrement loop counter
    brne    __udivuhq3_loop
    com     r_quoL          ; complement result
    com     r_quoH          ; because C flag was complemented in loop
    ret
ENDF __udivuha3_common
#endif  /* defined (L_udivuhq3) */

/*******************************************************
    Fixed Division 8.8 / 8.8
*******************************************************/
#if defined (L_divha3)
DEFUN __divha3
    mov     r0, r_divdH
    eor     r0, r_divH
    sbrs    r_divH, 7
    rjmp    1f
    NEG2    r_divL
1:
    sbrs    r_divdH, 7
    rjmp    2f
    NEG2    r_divdL
2:
    XCALL   __udivuha3
    lsr     r_quoH          ; adjust to 7 fractional bits
    ror     r_quoL
    sbrs    r0, 7           ; negate result if needed
    ret
    NEG2    r_quoL
    ret
ENDF __divha3
#endif  /* defined (L_divha3) */

#if defined (L_udivuha3)
;; Unsigned 8.8 division: rearrange the operand so the fractional
;; 16/16 loop (__udivuha3_common) can be reused.
DEFUN __udivuha3
    mov     r_quoH, r_divdL
    mov     r_divdL, r_divdH
    clr     r_divdH
    lsl     r_quoH          ; shift quotient into carry
    XJMP    __udivuha3_common ; same as fractional after rearrange
ENDF __udivuha3
#endif  /* defined (L_udivuha3) */

#undef  r_divdL
#undef  r_divdH
#undef  r_quoL
#undef  r_quoH
#undef  r_divL
#undef  r_divH
#undef  r_cnt

/*******************************************************
    Fixed Division 16.16 / 16.16
*******************************************************/

#define r_arg1L  24    /* arg1 gets passed already in place */
#define r_arg1H  25
#define r_arg1HL 26
#define r_arg1HH 27
#define r_divdL  26    /* dividend Low */
#define r_divdH  27
#define r_divdHL 30
#define r_divdHH 31    /* dividend High */
#define r_quoL   22    /* quotient Low */
#define r_quoH   23
#define r_quoHL  24
#define r_quoHH  25    /* quotient High */
#define r_divL   18    /* divisor Low */
#define r_divH   19
#define r_divHL  20
#define r_divHH  21    /* divisor High */
#define r_cnt __zero_reg__  /* loop count (0 after the loop!) */

#if defined (L_divsa3)
;; Signed 16.16 division; sign of the result kept in r0.
DEFUN __divsa3
    mov     r0, r_arg1HH
    eor     r0, r_divHH
    sbrs    r_divHH, 7
    rjmp    1f
    NEG4    r_divL
1:
    sbrs    r_arg1HH, 7
    rjmp    2f
    NEG4    r_arg1L
2:
    XCALL   __udivusa3
    lsr     r_quoHH         ; adjust to 15 fractional bits
    ror     r_quoHL
    ror     r_quoH
    ror     r_quoL
    sbrs    r0, 7           ; negate result if needed
    ret
    ;; negate r_quoL
    XJMP    __negsi2
ENDF __divsa3
#endif  /* defined (L_divsa3) */

#if defined (L_udivusa3)
;; 32-iteration restoring division loop for unsigned 16.16.
;; r_cnt is __zero_reg__, which is 0 again after the loop.
DEFUN __udivusa3
    ldi     r_divdHL, 32    ; init loop counter
    mov     r_cnt, r_divdHL
    clr     r_divdHL
    clr     r_divdHH
    wmov    r_quoL, r_divdHL
    lsl     r_quoHL         ; shift quotient into carry
    rol     r_quoHH
__udivusa3_loop:
    rol     r_divdL         ; shift dividend (with CARRY)
    rol     r_divdH
    rol     r_divdHL
    rol     r_divdHH
    brcs    __udivusa3_ep   ; dividend overflow
    cp      r_divdL,r_divL  ; compare dividend & divisor
    cpc     r_divdH,r_divH
    cpc     r_divdHL,r_divHL
    cpc     r_divdHH,r_divHH
    brcc    __udivusa3_ep   ; dividend >= divisor
    rol     r_quoL          ; shift quotient (with CARRY)
    rjmp    __udivusa3_cont
__udivusa3_ep:
    sub     r_divdL,r_divL  ; restore dividend
    sbc     r_divdH,r_divH
    sbc     r_divdHL,r_divHL
    sbc     r_divdHH,r_divHH
    lsl     r_quoL          ; shift quotient (without CARRY)
__udivusa3_cont:
    rol     r_quoH          ; shift quotient
    rol     r_quoHL
    rol     r_quoHH
    dec     r_cnt           ; decrement loop counter
    brne    __udivusa3_loop
    com     r_quoL          ; complement result
    com     r_quoH          ; because C flag was complemented in loop
    com     r_quoHL
    com     r_quoHH
    ret
ENDF __udivusa3
#endif  /* defined (L_udivusa3) */

#undef  r_arg1L
#undef  r_arg1H
#undef  r_arg1HL
#undef  r_arg1HH
#undef  r_divdL
#undef  r_divdH
#undef  r_divdHL
#undef  r_divdHH
#undef  r_quoL
#undef  r_quoH
#undef  r_quoHL
#undef  r_quoHH
#undef  r_divL
#undef  r_divH
#undef  r_divHL
#undef  r_divHH
#undef  r_cnt


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 1 Byte
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0  24

#if defined (L_ssabs_1)
;; Signed saturated absolute value:  |-128| saturates to 127.
DEFUN __ssabs_1
    sbrs    A0, 7
    ret
    neg     A0
    sbrc    A0,7
    dec     A0
    ret
ENDF __ssabs_1
#endif /* L_ssabs_1 */

#undef A0



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 2 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0  24
#define A1  A0+1

#if defined (L_ssneg_2)
;; Signed saturated negate:  -(-32768) saturates to 32767 (V set by NEG2).
DEFUN __ssneg_2
    NEG2    A0
    brvc 0f
    sbiw    A0, 1
0:  ret
ENDF __ssneg_2
#endif /* L_ssneg_2 */

#if defined (L_ssabs_2)
DEFUN __ssabs_2
    sbrs    A1, 7
    ret
    XJMP    __ssneg_2
ENDF __ssabs_2
#endif /* L_ssabs_2 */

#undef A0
#undef A1



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 4 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0  22
#define A1  A0+1
#define A2  A0+2
#define A3  A0+3

#if defined (L_ssneg_4)
;; Signed saturated negate:  INT32_MIN saturates to INT32_MAX.
DEFUN __ssneg_4
    XCALL   __negsi2
    brvc 0f
    ldi     A3, 0x7f
    ldi     A2, 0xff
    ldi     A1, 0xff
    ldi     A0, 0xff
0:  ret
ENDF __ssneg_4
#endif /* L_ssneg_4 */

#if defined (L_ssabs_4)
DEFUN __ssabs_4
    sbrs    A3, 7
    ret
    XJMP    __ssneg_4
ENDF __ssabs_4
#endif /* L_ssabs_4 */

#undef A0
#undef A1
#undef A2
#undef A3



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 8 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0  18
#define A1  A0+1
#define A2  A0+2
#define A3  A0+3
#define A4  A0+4
#define A5  A0+5
#define A6  A0+6
#define A7  A0+7

#if defined (L_clr_8)
FALIAS __usneguta2
FALIAS __usneguda2
FALIAS __usnegudq2

;; Clear Carry and all Bytes
DEFUN __clr_8
    ;; Clear Carry and set Z
    sub     A7, A7
    ;; FALLTHRU
ENDF  __clr_8
;; Propagate Carry to all Bytes, Carry unaltered
DEFUN __sbc_8
    sbc     A7, A7
    sbc     A6, A6
    wmov    A4, A6
    wmov    A2, A6
    wmov    A0, A6
    ret
ENDF __sbc_8
#endif /* L_clr_8 */

#if defined (L_ssneg_8)
FALIAS __ssnegta2
FALIAS __ssnegda2
FALIAS __ssnegdq2

;; Signed saturated 64-bit negate; on overflow load INT64_MAX
;; (0x7fffffff... -- all-ones lower bytes via __sbc_8 with C = 1).
DEFUN __ssneg_8
    XCALL   __negdi2
    brvc 0f
    ;; A[] = 0x7fffffff
    sec
    XCALL   __sbc_8
    ldi     A7, 0x7f
0:  ret
ENDF __ssneg_8
#endif /* L_ssneg_8 */

#if defined (L_ssabs_8)
FALIAS __ssabsta2
FALIAS __ssabsda2
FALIAS __ssabsdq2

DEFUN __ssabs_8
    sbrs    A7, 7
    ret
    XJMP    __ssneg_8
ENDF __ssabs_8
#endif /* L_ssabs_8 */

;; Second Argument
#define B0  10
#define B1  B0+1
#define B2  B0+2
#define B3  B0+3
#define B4  B0+4
#define B5  B0+5
#define B6  B0+6
#define B7  B0+7

#if defined (L_usadd_8)
FALIAS __usadduta3
FALIAS __usadduda3
FALIAS __usaddudq3

;; Unsigned saturated 64-bit add: clamp to all-ones on carry-out.
DEFUN __usadd_8
    XCALL   __adddi3
    brcs 0f
    ret
0:  ;; A[] = 0xffffffff
    XJMP    __sbc_8
ENDF __usadd_8
#endif /* L_usadd_8 */

#if defined (L_ussub_8)
FALIAS __ussubuta3
FALIAS __ussubuda3
FALIAS __ussubudq3

;; Unsigned saturated 64-bit subtract: clamp to zero on borrow.
DEFUN __ussub_8
    XCALL   __subdi3
    brcs 0f
    ret
0:  ;; A[] = 0
    XJMP    __clr_8
ENDF __ussub_8
#endif /* L_ussub_8 */

#if defined (L_ssadd_8)
FALIAS __ssaddta3
FALIAS __ssaddda3
FALIAS __ssadddq3

DEFUN __ssadd_8
    XCALL   __adddi3
    brvc 0f
    ;; A = (B >= 0) ? INT64_MAX : INT64_MIN
    cpi     B7, 0x80
    XCALL   __sbc_8
    subi    A7, 0x80
0:  ret
ENDF __ssadd_8
#endif /* L_ssadd_8 */

#if defined (L_sssub_8)
FALIAS __sssubta3
FALIAS __sssubda3
FALIAS __sssubdq3

DEFUN __sssub_8
    XCALL   __subdi3
    brvc 0f
    ;; A = (B < 0) ? INT64_MAX : INT64_MIN
    ldi     A7, 0x7f
    cp      A7, B7
    XCALL   __sbc_8
    subi    A7, 0x80
0:  ret
ENDF __sssub_8
#endif /* L_sssub_8 */

#undef A0
#undef A1
#undef A2
#undef A3
#undef A4
#undef A5
#undef A6
#undef A7
#undef B0
#undef B1
#undef B2
#undef B3
#undef B4
#undef B5
#undef B6
#undef B7


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding Helpers
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#ifdef L_mask1

#define AA 24
#define CC 25

;; R25 = 1 << (R24 & 7)
;; CC  = 1 << (AA  & 7)
;; Clobbers: None
DEFUN __mask1
    ;; CC = 2 ^ AA.1
    ldi     CC, 1 << 2
    sbrs    AA, 1
    ldi     CC, 1 << 0
    ;; CC *= 2 ^ AA.0
    sbrc    AA, 0
    lsl     CC
    ;; CC *= 2 ^ AA.2
    sbrc    AA, 2
    swap    CC
    ret
ENDF __mask1

#undef AA
#undef CC
#endif /* L_mask1 */

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The rounding point.  Any bits smaller than
;; 2^{-RP} will be cleared.
#define RP R24

;; Fixed-point argument, low byte first.
#define A0 22
#define A1 A0 + 1

;; Result / scratch.
#define C0 24
#define C1 C0 + 1

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 1 Byte
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#ifdef L_roundqq3

;; R24 = round (R22, R24)
;; Round the signed QQ value in R22 to RP fractional bits:  add
;; 2^{-RP-1} with signed saturation, then clear all bits below 2^{-RP}.
;; Clobbers: R22, __tmp_reg__
DEFUN __roundqq3
    ;; Preserve R25 across the helper call.
    mov     __tmp_reg__, C1
    subi    RP, __QQ_FBIT__ - 1
    neg     RP
    ;; R25 = 1 << RP (Total offset is FBIT-1 - RP)
    XCALL   __mask1
    mov     C0, C1
    ;; Add-Saturate 2^{-RP-1}
    add     A0, C0
    brvc 0f
    ;; Signed overflow: saturate to 0x7f.
    ldi     C0, 0x7f
    rjmp 9f
0:  ;; Mask out bits beyond RP:
    ;; C0 = -(2*C0) = -2^{-RP} = ones at and above the rounding point.
    lsl     C0
    neg     C0
    and     C0, A0
9:  mov     C1, __tmp_reg__
    ret
ENDF __roundqq3
#endif /* L_roundqq3 */

#ifdef L_rounduqq3

;; R24 = round (R22, R24)
;; Unsigned variant of __roundqq3:  add 2^{-RP-1} with unsigned
;; saturation, then clear all bits below 2^{-RP}.
;; Clobbers: R22, __tmp_reg__
DEFUN __rounduqq3
    mov     __tmp_reg__, C1
    subi    RP, __UQQ_FBIT__ - 1
    neg     RP
    ;; R25 = 1 << RP (Total offset is FBIT-1 - RP)
    XCALL   __mask1
    mov     C0, C1
    ;; Add-Saturate 2^{-RP-1}
    add     A0, C0
    brcc 0f
    ;; Carry = unsigned overflow: saturate to 0xff.
    ldi     C0, 0xff
    rjmp 9f
0:  ;; Mask out bits beyond RP
    lsl     C0
    neg     C0
    and     C0, A0
9:  mov     C1, __tmp_reg__
    ret
ENDF __rounduqq3
#endif /* L_rounduqq3 */

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 2 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#ifdef L_addmask_2

;; [ R25:R24 = 1 << (R24 & 15)
;;   R23:R22 += 1 << (R24 & 15) ]
;; SREG is set according to the addition
DEFUN __addmask_2
    ;; R25 = 1 << (R24 & 7)
    XCALL   __mask1
    ;; Carry = (RP < 8), i.e. RP.3 is NOT set.
    cpi     RP, 1 << 3
    sbc     C0, C0
    ;; C0 is now 0xff (RP.3 clear) or 0x00 (RP.3 set):  move the mask
    ;; from C1 down to C0 when the bit belongs in the low byte,
    ;; otherwise leave it in C1 with C0 = 0.
    and     C0, C1
    eor     C1, C0
    ;; Finally, add the power-of-two: A[] += C[]
    add     A0, C0
    adc     A1, C1
    ret
ENDF __addmask_2
#endif /* L_addmask_2 */

#ifdef L_round_s2

;; R25:R24 = round (R23:R22, R24)
;; Clobbers: R23, R22
DEFUN __roundhq3
    ;; Adjust RP by the difference in fractional bits, then fall
    ;; through to the HA code.
    subi    RP, __HQ_FBIT__ - __HA_FBIT__
ENDF __roundhq3
DEFUN __roundha3
    subi    RP, __HA_FBIT__ - 1
    neg     RP
    ;; [ R25:R24 = 1 << (FBIT-1 - RP)
    ;;   R23:R22 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_2
    ;; Saturate resp. mask according to the SREG set by that addition.
    XJMP    __round_s2_const
ENDF __roundha3

#endif /* L_round_s2 */

#ifdef L_round_u2

;; R25:R24 = round (R23:R22, R24)
;; Clobbers: R23, R22
DEFUN __rounduhq3
    ;; Adjust RP by the difference in fractional bits, then fall
    ;; through to the UHA code.
    subi    RP, __UHQ_FBIT__ - __UHA_FBIT__
ENDF __rounduhq3
DEFUN __rounduha3
    subi    RP, __UHA_FBIT__ - 1
    neg     RP
    ;; [ R25:R24 = 1 << (FBIT-1 - RP)
    ;;   R23:R22 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_2
    XJMP    __round_u2_const
ENDF __rounduha3

#endif /* L_round_u2 */


#ifdef L_round_2_const

;; Helpers for 2 byte wide rounding:  finish the job started by
;; __addmask_2, relying on the SREG flags of that addition.
;; On entry:  C[] = 2^{-RP-1} (the rounding addend), A[] = the sum.

DEFUN __round_s2_const
    brvc 2f
    ;; Signed overflow: saturate to 0x7fff.
    ldi     C1, 0x7f
    rjmp 1f
    ;; FALLTHRU (Barrier)
ENDF __round_s2_const

DEFUN __round_u2_const
    brcc 2f
    ;; Unsigned overflow: saturate to 0xffff.
    ldi     C1, 0xff
1:
    ldi     C0, 0xff
    rjmp 9f
2:
    ;; No saturation needed; mask now.
    ;; Currently, we have C[] = 2^{-RP-1}
    ;; C[] = 2^{-RP}
    lsl     C0
    rol     C1
    ;; C[] = -C[]:  ones at and above the rounding point.
    NEG2    C0
    ;; Clear the bits beyond the rounding point.
    and     C0, A0
    and     C1, A1
9:  ret
ENDF __round_u2_const

#endif /* L_round_2_const */

#undef A0
#undef A1
#undef C0
#undef C1

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 4 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#define A0 18
#define A1 A0 + 1
#define A2 A0 + 2
#define A3 A0 + 3

#define C0 22
#define C1 C0 + 1
#define C2 C0 + 2
#define C3 C0 + 3

#ifdef L_addmask_4

;; [ R25:R22 = 1 << (R24 & 31)
;;   R21:R18 += 1 << (R24 & 31) ]
;; SREG is set according to the addition
DEFUN __addmask_4
    ;; R25 = 1 << (R24 & 7)
    XCALL   __mask1
    ;; C1:C0 = 0xffff iff RP < 16 (RP.4 not set); used as swap mask below.
    cpi     RP, 1 << 4
    sbc     C0, C0
    sbc     C1, C1
    ;; Swap C2 with C3 if RP.3 is not set
    cpi     RP, 1 << 3
    sbc     C2, C2
    and     C2, C3
    eor     C3, C2
    ;; Swap C3:C2 with C1:C0 if RP.4 is not set
    and     C0, C2 $ eor C2, C0
    and     C1, C3 $ eor C3, C1
    ;; Finally, add the power-of-two: A[] += C[]
    add     A0, C0
    adc     A1, C1
    adc     A2, C2
    adc     A3, C3
    ret
ENDF __addmask_4
#endif /* L_addmask_4 */

#ifdef L_round_s4

;; R25:R22 = round (R21:R18, R24)
;; Clobbers: R18...R21
DEFUN __roundsq3
    ;; Adjust RP by the difference in fractional bits, then fall
    ;; through to the SA code.
    subi    RP, __SQ_FBIT__ - __SA_FBIT__
ENDF __roundsq3
DEFUN __roundsa3
    subi    RP, __SA_FBIT__ - 1
    neg     RP
    ;; [ R25:R22 = 1 << (FBIT-1 - RP)
    ;;   R21:R18 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_4
    ;; Saturate resp. mask according to the SREG set by that addition.
    XJMP    __round_s4_const
ENDF __roundsa3

#endif /* L_round_s4 */

#ifdef L_round_u4

;; R25:R22 = round (R21:R18, R24)
;; Clobbers: R18...R21
DEFUN __roundusq3
    ;; Adjust RP by the difference in fractional bits, then fall
    ;; through to the USA code.
    subi    RP, __USQ_FBIT__ - __USA_FBIT__
ENDF __roundusq3
DEFUN __roundusa3
    subi    RP, __USA_FBIT__ - 1
    neg     RP
    ;; [ R25:R22 = 1 << (FBIT-1 - RP)
    ;;   R21:R18 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_4
    XJMP    __round_u4_const
ENDF __roundusa3

#endif /* L_round_u4 */


#ifdef L_round_4_const

;; Helpers for 4 byte wide rounding:  finish the job started by
;; __addmask_4, relying on the SREG flags of that addition.
;; On entry:  C[] = 2^{-RP-1} (the rounding addend), A[] = the sum.

DEFUN __round_s4_const
    brvc 2f
    ;; Signed overflow: saturate to 0x7fffffff.
    ldi     C3, 0x7f
    rjmp 1f
    ;; FALLTHRU (Barrier)
ENDF __round_s4_const

DEFUN __round_u4_const
    brcc 2f
    ;; Unsigned overflow: saturate to 0xffffffff.
    ldi     C3, 0xff
1:
    ldi     C2, 0xff
    ldi     C1, 0xff
    ldi     C0, 0xff
    rjmp 9f
2:
    ;; No saturation needed; mask now.
    ;; Currently, we have C[] = 2^{-RP-1}
    ;; C[] = 2^{-RP}
    lsl     C0
    rol     C1
    rol     C2
    rol     C3
    ;; C[] = -C[]:  ones at and above the rounding point.
    XCALL   __negsi2
    ;; Clear the bits beyond the rounding point.
    and     C0, A0
    and     C1, A1
    and     C2, A2
    and     C3, A3
9:  ret
ENDF __round_u4_const

#endif /* L_round_4_const */

#undef A0
#undef A1
#undef A2
#undef A3
#undef C0
#undef C1
#undef C2
#undef C3

#undef RP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 8 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; RP arrives in R16; FBITm1 carries FBIT-1 of the mode (loaded by the
;; entry stubs below); the T flag selects unsigned (set) vs. signed
;; (clear) saturation in __round_x8.
#define RP 16
#define FBITm1 31

;; C[] = R25...R18: the 64-bit input and the result.
#define C0 18
#define C1 C0 + 1
#define C2 C0 + 2
#define C3 C0 + 3
#define C4 C0 + 4
#define C5 C0 + 5
#define C6 C0 + 6
#define C7 C0 + 7

;; A[]: working copy of the input, spread over r16/r17 and X/Y/Z.
;; The call-saved ones (r16, r17, r28, r29) are pushed in __round_x8.
#define A0 16
#define A1 17
#define A2 26
#define A3 27
#define A4 28
#define A5 29
#define A6 30
#define A7 31


#ifdef L_rounddq3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN __rounddq3
    ldi     FBITm1, __DQ_FBIT__ - 1
    ;; T = 0: signed saturation in __round_x8.
    clt
    XJMP    __round_x8
ENDF __rounddq3
#endif /* L_rounddq3 */

#ifdef L_roundudq3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN __roundudq3
    ldi     FBITm1, __UDQ_FBIT__ - 1
    ;; T = 1: unsigned saturation in __round_x8.
    set
    XJMP    __round_x8
ENDF __roundudq3
#endif /* L_roundudq3 */

#ifdef L_roundda3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN __roundda3
    ldi     FBITm1, __DA_FBIT__ - 1
    clt
    XJMP    __round_x8
ENDF __roundda3
#endif /* L_roundda3 */

#ifdef L_rounduda3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN __rounduda3
    ldi     FBITm1, __UDA_FBIT__ - 1
    set
    XJMP    __round_x8
ENDF __rounduda3
#endif /* L_rounduda3 */

#ifdef L_roundta3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN __roundta3
    ldi     FBITm1, __TA_FBIT__ - 1
    clt
    XJMP    __round_x8
ENDF __roundta3
#endif /* L_roundta3 */

#ifdef L_rounduta3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN __rounduta3
    ldi     FBITm1, __UTA_FBIT__ - 1
    set
    XJMP    __round_x8
ENDF __rounduta3
#endif /* L_rounduta3 */


#ifdef L_round_x8
;; Common worker for the six __round*3 stubs above.
;; In:   C[] = R25...R18 = value, RP (R16) = rounding point,
;;       FBITm1 = FBIT-1 of the mode, T = 1 for unsigned saturation.
;; Out:  C[] = rounded (and possibly saturated) value.
;; Clobbers: ABI registers; r16, r17, r28, r29 are preserved.
DEFUN __round_x8
    push    r16
    push    r17
    push    r28
    push    r29
    ;; Compute log2 of addend from rounding point
    sub     RP, FBITm1
    neg     RP
    ;; Move input to work register A[].
    ;; A0 = r16 still holds RP (the shift count consumed by __ashldi3
    ;; below), so the low byte goes through the stack instead.
    push    C0
    mov     A1, C1
    wmov    A2, C2
    wmov    A4, C4
    wmov    A6, C6
    ;; C[] = 1 << (FBIT-1 - RP):  clear C[], set bit 0, shift left.
    XCALL   __clr_8
    inc     C0
    XCALL   __ashldi3
    pop     A0
    ;; A[] += C[]
    add     A0, C0
    adc     A1, C1
    adc     A2, C2
    adc     A3, C3
    adc     A4, C4
    adc     A5, C5
    adc     A6, C6
    adc     A7, C7
    ;; T distinguishes the unsigned entries (set) from the signed
    ;; ones (clear), cf. the set / clt in the stubs above.
    brts 1f
    ;; Signed
    brvc 3f
    ;; Signed overflow: A[] = 0x7f...
    brvs 2f
1:  ;; Unsigned
    brcc 3f
    ;; Unsigned overflow: A[] = 0xff...
2:  ldi     C7, 0xff
    ldi     C6, 0xff
    wmov    C0, C6
    wmov    C2, C6
    wmov    C4, C6
    ;; Top bit from T: 0x7f.. for signed, 0xff.. for unsigned.
    bld     C7, 7
    rjmp 9f
3:
    ;; No overflow:  build the mask and round.
    ;; C[] = -C[] - C[] = -2^{-RP}: ones at and above the rounding
    ;; point.  (r16 is reused as the shift count, hence the push/pop.)
    push    A0
    ldi     r16, 1
    XCALL   __ashldi3
    pop     A0
    XCALL   __negdi2
    ;; Clear the bits beyond the rounding point.
    and     C0, A0
    and     C1, A1
    and     C2, A2
    and     C3, A3
    and     C4, A4
    and     C5, A5
    and     C6, A6
    and     C7, A7
9:  ;; Epilogue: restore the call-saved registers.
    pop     r29
    pop     r28
    pop     r17
    pop     r16
    ret
ENDF __round_x8

#endif /* L_round_x8 */

#undef A0
#undef A1
#undef A2
#undef A3
#undef A4
#undef A5
#undef A6
#undef A7

#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7

#undef RP
#undef FBITm1


;; Supply implementations / symbols for the bit-banging functions
;; __builtin_avr_bitsfx and __builtin_avr_fxbits
#ifdef L_ret
;; These casts are no-ops at the register level, so a plain RET suffices.
DEFUN __ret
    ret
ENDF __ret
#endif /* L_ret */

#endif /* if not __AVR_TINY__ */