/*  -*- Mode: Asm -*-  */
;;    Copyright (C) 2012-2021 Free Software Foundation, Inc.
;;    Contributed by Sean D'Epagnier  (sean@depagnier.com)
;;                   Georg-Johann Lay (avr@gjlay.de)

;; This file is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by the
;; Free Software Foundation; either version 3, or (at your option) any
;; later version.

;; In addition to the permissions in the GNU General Public License, the
;; Free Software Foundation gives you unlimited permission to link the
;; compiled version of this file into combinations with other programs,
;; and to distribute those combinations without any restriction coming
;; from the use of this file.  (The General Public License restrictions
;; do apply in other respects; for example, they cover modification of
;; the file, and distribution when not linked into a combine
;; executable.)

;; This file is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program; see the file COPYING.  If not, write to
;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Fixed point library routines for AVR
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#if defined __AVR_TINY__
#define __zero_reg__ r17
#define __tmp_reg__ r16
#else
#define __zero_reg__ r1
#define __tmp_reg__ r0
#endif

.section .text.libgcc.fixed, "ax", @progbits

#ifndef __AVR_TINY__

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Conversions to float
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#if defined (L_fractqqsf)
DEFUN __fractqqsf
    ;; Move in place for SA -> SF conversion
    clr     r22
    mov     r23, r24
    ;; Sign-extend
    lsl     r24
    sbc     r24, r24
    mov     r25, r24
    XJMP    __fractsasf
ENDF __fractqqsf
#endif  /* L_fractqqsf */
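
;; A minimal C model of the conversion above (illustration only, not part
;; of the build; the helper name is made up), assuming QQ is a Q7 value
;; held in an int8_t:
;;
;;      #include <stdint.h>
;;
;;      static float fractqqsf (int8_t q)
;;      {
;;          /* A QQ bit pattern q represents the value q / 2^7.  */
;;          return (float) q / 128.0f;
;;      }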

#if defined (L_fractuqqsf)
DEFUN __fractuqqsf
    ;; Move in place for USA -> SF conversion
    clr     r22
    mov     r23, r24
    ;; Zero-extend
    clr     r24
    clr     r25
    XJMP    __fractusasf
ENDF __fractuqqsf
#endif  /* L_fractuqqsf */

#if defined (L_fracthqsf)
DEFUN __fracthqsf
    ;; Move in place for SA -> SF conversion
    wmov    22, 24
    ;; Sign-extend
    lsl     r25
    sbc     r24, r24
    mov     r25, r24
    XJMP    __fractsasf
ENDF __fracthqsf
#endif  /* L_fracthqsf */

#if defined (L_fractuhqsf)
DEFUN __fractuhqsf
    ;; Move in place for USA -> SF conversion
    wmov    22, 24
    ;; Zero-extend
    clr     r24
    clr     r25
    XJMP    __fractusasf
ENDF __fractuhqsf
#endif  /* L_fractuhqsf */

#if defined (L_fracthasf)
DEFUN __fracthasf
    ;; Move in place for SA -> SF conversion
    clr     r22
    mov     r23, r24
    mov     r24, r25
    ;; Sign-extend
    lsl     r25
    sbc     r25, r25
    XJMP    __fractsasf
ENDF __fracthasf
#endif  /* L_fracthasf */

#if defined (L_fractuhasf)
DEFUN __fractuhasf
    ;; Move in place for USA -> SF conversion
    clr     r22
    mov     r23, r24
    mov     r24, r25
    ;; Zero-extend
    clr     r25
    XJMP    __fractusasf
ENDF __fractuhasf
#endif  /* L_fractuhasf */


#if defined (L_fractsqsf)
DEFUN __fractsqsf
    XCALL   __floatsisf
    ;; Divide non-zero results by 2^31 to move the
    ;; decimal point into place
    tst     r25
    breq    0f
    subi    r24, exp_lo (31)
    sbci    r25, exp_hi (31)
0:  ret
ENDF __fractsqsf
#endif  /* L_fractsqsf */
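
;; The subi/sbci on the exponent bytes above amount to the following C
;; sketch (illustrative; exp_lo/exp_hi address the IEEE-754 exponent field
;; that straddles r24/r25).  Results of __floatsisf are 0 or have a biased
;; exponent >= 127, so the subtraction cannot underflow the field:
;;
;;      #include <stdint.h>
;;      #include <string.h>
;;
;;      static float div_by_2pow31 (float f)
;;      {
;;          uint32_t u;
;;          memcpy (&u, &f, 4);
;;          if (u << 1)                     /* skip +/- zero */
;;              u -= (uint32_t) 31 << 23;   /* exponent -= 31 */
;;          memcpy (&f, &u, 4);
;;          return f;
;;      }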

#if defined (L_fractusqsf)
DEFUN __fractusqsf
    XCALL   __floatunsisf
    ;; Divide non-zero results by 2^32 to move the
    ;; decimal point into place
    cpse    r25, __zero_reg__
    subi    r25, exp_hi (32)
    ret
ENDF __fractusqsf
#endif  /* L_fractusqsf */

#if defined (L_fractsasf)
DEFUN __fractsasf
    XCALL   __floatsisf
    ;; Divide non-zero results by 2^15 to move the
    ;; decimal point into place
    tst     r25
    breq    0f
    subi    r24, exp_lo (15)
    sbci    r25, exp_hi (15)
0:  ret
ENDF __fractsasf
#endif  /* L_fractsasf */

#if defined (L_fractusasf)
DEFUN __fractusasf
    XCALL   __floatunsisf
    ;; Divide non-zero results by 2^16 to move the
    ;; decimal point into place
    cpse    r25, __zero_reg__
    subi    r25, exp_hi (16)
    ret
ENDF __fractusasf
#endif  /* L_fractusasf */

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Conversions from float
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#if defined (L_fractsfqq)
DEFUN __fractsfqq
    ;; Multiply with 2^{24+7} to get a QQ result in r25
    subi    r24, exp_lo (-31)
    sbci    r25, exp_hi (-31)
    XCALL   __fixsfsi
    mov     r24, r25
    ret
ENDF __fractsfqq
#endif  /* L_fractsfqq */
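
;; A C model of __fractsfqq (illustrative; out-of-range inputs are not
;; modeled): bump the exponent by 31, truncate to 32 bits, and keep the
;; high byte, which is the Q7 result that ends up in r25:
;;
;;      #include <stdint.h>
;;
;;      static int8_t fractsfqq (float f)
;;      {
;;          int32_t t = (int32_t) (f * 2147483648.0f);  /* f * 2^31 */
;;          return (int8_t) (t >> 24);                  /* high byte */
;;      }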

#if defined (L_fractsfuqq)
DEFUN __fractsfuqq
    ;; Multiply with 2^{24+8} to get a UQQ result in r25
    subi    r25, exp_hi (-32)
    XCALL   __fixunssfsi
    mov     r24, r25
    ret
ENDF __fractsfuqq
#endif  /* L_fractsfuqq */

#if defined (L_fractsfha)
DEFUN __fractsfha
    ;; Multiply with 2^{16+7} to get a HA result in r25:r24
    subi    r24, exp_lo (-23)
    sbci    r25, exp_hi (-23)
    XJMP    __fixsfsi
ENDF __fractsfha
#endif  /* L_fractsfha */

#if defined (L_fractsfuha)
DEFUN __fractsfuha
    ;; Multiply with 2^24 to get a UHA result in r25:r24
    subi    r25, exp_hi (-24)
    XJMP    __fixunssfsi
ENDF __fractsfuha
#endif  /* L_fractsfuha */

#if defined (L_fractsfhq)
FALIAS __fractsfsq

DEFUN __fractsfhq
    ;; Multiply with 2^{16+15} to get a HQ result in r25:r24
    ;; resp. with 2^31 to get a SQ result in r25:r22
    subi    r24, exp_lo (-31)
    sbci    r25, exp_hi (-31)
    XJMP    __fixsfsi
ENDF __fractsfhq
#endif  /* L_fractsfhq */

#if defined (L_fractsfuhq)
FALIAS __fractsfusq

DEFUN __fractsfuhq
    ;; Multiply with 2^{16+16} to get a UHQ result in r25:r24
    ;; resp. with 2^32 to get a USQ result in r25:r22
    subi    r25, exp_hi (-32)
    XJMP    __fixunssfsi
ENDF __fractsfuhq
#endif  /* L_fractsfuhq */

#if defined (L_fractsfsa)
DEFUN __fractsfsa
    ;; Multiply with 2^15 to get a SA result in r25:r22
    subi    r24, exp_lo (-15)
    sbci    r25, exp_hi (-15)
    XJMP    __fixsfsi
ENDF __fractsfsa
#endif  /* L_fractsfsa */

#if defined (L_fractsfusa)
DEFUN __fractsfusa
    ;; Multiply with 2^16 to get a USA result in r25:r22
    subi    r25, exp_hi (-16)
    XJMP    __fixunssfsi
ENDF __fractsfusa
#endif  /* L_fractsfusa */


;; For multiplication the functions here are called directly from
;; avr-fixed.md instead of using the standard libcall mechanisms.
;; This can make better code because GCC knows exactly which
;; of the call-used registers (not all of them) are clobbered.

/*******************************************************
    Fractional  Multiplication  8 x 8  without MUL
*******************************************************/

#if defined (L_mulqq3) && !defined (__AVR_HAVE_MUL__)
;;; R23 = R24 * R25
;;; Clobbers: __tmp_reg__, R22, R24, R25
;;; Rounding: ???
DEFUN __mulqq3
    XCALL   __fmuls
    ;; TR 18037 requires that  (-1) * (-1)  does not overflow
    ;; The only input that can produce  -1  is  (-1)^2.
    dec     r23
    brvs    0f
    inc     r23
0:  ret
ENDF  __mulqq3
#endif /* L_mulqq3 && ! HAVE_MUL */
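
;; A truncating C model of the Q7 multiply including the TR 18037 fixup
;; (illustrative; the exact rounding of the last bit is not modeled):
;;
;;      #include <stdint.h>
;;
;;      static int8_t mulqq (int8_t a, int8_t b)
;;      {
;;          int16_t p = (int16_t) a * b;    /* Q14 product */
;;          int8_t  r = (int8_t) (p >> 7);  /* back to Q7 */
;;          if (a == -128 && b == -128)     /* (-1) * (-1) == 1 - 2^-7 */
;;              r = 127;
;;          return r;
;;      }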

/*******************************************************
    Fractional Multiply  .16 x .16  with and without MUL
*******************************************************/

#if defined (L_mulhq3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB  <= error  <=  0.5 LSB
DEFUN   __mulhq3
    XCALL   __mulhisi3
    ;; Shift result into place
    lsl     r23
    rol     r24
    rol     r25
    brvs    1f
    ;; Round
    sbrc    r23, 7
    adiw    r24, 1
    ret
1:  ;; Overflow.  TR 18037 requires  (-1)^2  not to overflow
    ldi     r24, lo8 (0x7fff)
    ldi     r25, hi8 (0x7fff)
    ret
ENDF __mulhq3
#endif  /* defined (L_mulhq3) */
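
;; A C model of the Q15 multiply with rounding and saturation of the only
;; overflowing input (illustrative):
;;
;;      #include <stdint.h>
;;
;;      static int16_t mulhq (int16_t a, int16_t b)
;;      {
;;          if (a == -32768 && b == -32768)
;;              return 32767;               /* (-1)^2 must not overflow */
;;          int32_t p = (int32_t) a * b;    /* Q30 product */
;;          p <<= 1;                        /* Q30 -> Q31 */
;;          p += 1L << 15;                  /* round to nearest */
;;          return (int16_t) (p >> 16);
;;      }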

#if defined (L_muluhq3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) *= (R23:R22)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB  <  error  <=  0.5 LSB
DEFUN   __muluhq3
    XCALL   __umulhisi3
    ;; Round
    sbrc    r23, 7
    adiw    r24, 1
    ret
ENDF __muluhq3
#endif  /* L_muluhq3 */


/*******************************************************
    Fixed  Multiply  8.8 x 8.8  with and without MUL
*******************************************************/

#if defined (L_mulha3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB  <=  error  <=  0.5 LSB
DEFUN   __mulha3
    XCALL   __mulhisi3
    lsl     r22
    rol     r23
    rol     r24
    XJMP    __muluha3_round
ENDF __mulha3
#endif  /* L_mulha3 */

#if defined (L_muluha3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) *= (R23:R22)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding:  -0.5 LSB  <  error  <=  0.5 LSB
DEFUN   __muluha3
    XCALL   __umulhisi3
    XJMP    __muluha3_round
ENDF __muluha3
#endif  /* L_muluha3 */

#if defined (L_muluha3_round)
DEFUN   __muluha3_round
    ;; Shift result into place
    mov     r25, r24
    mov     r24, r23
    ;; Round
    sbrc    r22, 7
    adiw    r24, 1
    ret
ENDF __muluha3_round
#endif  /* L_muluha3_round */
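
;; A C model of the unsigned 8.8 multiply with rounding, i.e. __muluha3
;; followed by __muluha3_round (illustrative; overflow wraps, as in the
;; non-saturating asm):
;;
;;      #include <stdint.h>
;;
;;      static uint16_t muluha (uint16_t a, uint16_t b)
;;      {
;;          uint32_t p = (uint32_t) a * b;  /* 16.16 product */
;;          p += 0x80;                      /* round at bit 7 */
;;          return (uint16_t) (p >> 8);     /* back to 8.8 */
;;      }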


/*******************************************************
    Fixed  Multiplication  16.16 x 16.16
*******************************************************/

;; Bits outside the result (below LSB), used in the signed version
#define GUARD __tmp_reg__

#if defined (__AVR_HAVE_MUL__)

;; Multiplier
#define A0  16
#define A1  A0+1
#define A2  A1+1
#define A3  A2+1

;; Multiplicand
#define B0  20
#define B1  B0+1
#define B2  B1+1
#define B3  B2+1

;; Result
#define C0  24
#define C1  C0+1
#define C2  C1+1
#define C3  C2+1

#if defined (L_mulusa3)
;;; (C3:C0) = (A3:A0) * (B3:B0)
DEFUN __mulusa3
    set
    ;; Fallthru
ENDF  __mulusa3

;;; Round for last digit iff T = 1
;;; Return guard bits in GUARD (__tmp_reg__).
;;; Rounding, T = 0:  -1.0 LSB  <  error  <=  0   LSB
;;; Rounding, T = 1:  -0.5 LSB  <  error  <=  0.5 LSB
DEFUN __mulusa3_round
    ;; Some of the MUL instructions have LSBs outside the result.
    ;; Don't ignore these LSBs in order to tame rounding error.
    ;; Use C2/C3 for these LSBs.

    clr C0
    clr C1
    mul A0, B0  $  movw C2, r0

    mul A1, B0  $  add  C3, r0  $  adc C0, r1
    mul A0, B1  $  add  C3, r0  $  adc C0, r1  $  rol C1

    ;; Round if T = 1.  Store guarding bits outside the result for rounding
    ;; and left-shift by the signed version (function below).
    brtc 0f
    sbrc C3, 7
    adiw C0, 1
0:  push C3

    ;; The following MULs don't have LSBs outside the result.
    ;; C2/C3 is the high part.

    mul  A0, B2  $  add C0, r0  $  adc C1, r1  $  sbc  C2, C2
    mul  A1, B1  $  add C0, r0  $  adc C1, r1  $  sbci C2, 0
    mul  A2, B0  $  add C0, r0  $  adc C1, r1  $  sbci C2, 0
    neg  C2

    mul  A0, B3  $  add C1, r0  $  adc C2, r1  $  sbc  C3, C3
    mul  A1, B2  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    mul  A2, B1  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    mul  A3, B0  $  add C1, r0  $  adc C2, r1  $  sbci C3, 0
    neg  C3

    mul  A1, B3  $  add C2, r0  $  adc C3, r1
    mul  A2, B2  $  add C2, r0  $  adc C3, r1
    mul  A3, B1  $  add C2, r0  $  adc C3, r1

    mul  A2, B3  $  add C3, r0
    mul  A3, B2  $  add C3, r0

    ;; Guard bits used in the signed version below.
    pop  GUARD
    clr  __zero_reg__
    ret
ENDF __mulusa3_round
#endif /* L_mulusa3 */

#if defined (L_mulsa3)
;;; (C3:C0) = (A3:A0) * (B3:B0)
;;; Clobbers: __tmp_reg__, T
;;; Rounding:  -0.5 LSB  <=  error  <=  0.5 LSB
DEFUN __mulsa3
    clt
    XCALL   __mulusa3_round
    ;; A posteriori sign extension of the operands
    tst     B3
    brpl 1f
    sub     C2, A0
    sbc     C3, A1
1:  sbrs    A3, 7
    rjmp 2f
    sub     C2, B0
    sbc     C3, B1
2:
    ;;  Shift 1 bit left to adjust for 15 fractional bits
    lsl     GUARD
    rol     C0
    rol     C1
    rol     C2
    rol     C3
    ;; Round last digit
    lsl     GUARD
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    ret
ENDF __mulsa3
#endif /* L_mulsa3 */
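
;; Both variants of the signed 16.15 accum multiply compute, up to the
;; guard-bit bookkeeping, this C model (illustrative):
;;
;;      #include <stdint.h>
;;
;;      static int32_t mulsa (int32_t a, int32_t b)
;;      {
;;          int64_t p = (int64_t) a * b;    /* 30 fraction bits */
;;          p += (int64_t) 1 << 14;         /* round to nearest */
;;          return (int32_t) (p >> 15);     /* back to 15 fraction bits */
;;      }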

#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3

#else /* __AVR_HAVE_MUL__ */

#define A0 18
#define A1 A0+1
#define A2 A0+2
#define A3 A0+3

#define B0 22
#define B1 B0+1
#define B2 B0+2
#define B3 B0+3

#define C0  22
#define C1  C0+1
#define C2  C0+2
#define C3  C0+3

;; __tmp_reg__
#define CC0  0
;; __zero_reg__
#define CC1  1
#define CC2  16
#define CC3  17

#define AA0  26
#define AA1  AA0+1
#define AA2  30
#define AA3  AA2+1

#if defined (L_mulsa3)
;;; (R25:R22)  *=  (R21:R18)
;;; Clobbers: ABI, called by optabs
;;; Rounding:  -1 LSB  <=  error  <=  1 LSB
DEFUN   __mulsa3
    push    B0
    push    B1
    push    B3
    clt
    XCALL   __mulusa3_round
    pop     r30
    ;; sign-extend B
    bst     r30, 7
    brtc 1f
    ;; A1, A0 survived in  R27:R26
    sub     C2, AA0
    sbc     C3, AA1
1:
    pop     AA1  ;; B1
    pop     AA0  ;; B0

    ;; sign-extend A.  A3 survived in  R31
    bst     AA3, 7
    brtc 2f
    sub     C2, AA0
    sbc     C3, AA1
2:
    ;;  Shift 1 bit left to adjust for 15 fractional bits
    lsl     GUARD
    rol     C0
    rol     C1
    rol     C2
    rol     C3
    ;; Round last digit
    lsl     GUARD
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    ret
ENDF __mulsa3
#endif  /* L_mulsa3 */

#if defined (L_mulusa3)
;;; (R25:R22)  *=  (R21:R18)
;;; Clobbers: ABI, called by optabs
;;; Rounding:  -1 LSB  <=  error  <=  1 LSB
DEFUN __mulusa3
    set
    ;; Fallthru
ENDF  __mulusa3

;;; A[] survives in 26, 27, 30, 31
;;; Also used by __mulsa3 with T = 0
;;; Round if T = 1
;;; Return Guard bits in GUARD (__tmp_reg__), used by signed version.
DEFUN __mulusa3_round
    push    CC2
    push    CC3
    ; clear result
    clr     __tmp_reg__
    wmov    CC2, CC0
    ; save multiplicand
    wmov    AA0, A0
    wmov    AA2, A2
    rjmp 3f

    ;; Loop the integral part

1:  ;; CC += A * 2^n;  n >= 0
    add  CC0,A0  $  adc CC1,A1  $  adc  CC2,A2  $  adc  CC3,A3

2:  ;; A <<= 1
    lsl  A0      $  rol A1      $  rol  A2      $  rol  A3

3:  ;; IBIT(B) >>= 1
    ;; Carry = n-th bit of B;  n >= 0
    lsr     B3
    ror     B2
    brcs 1b
    sbci    B3, 0
    brne 2b

    ;; Loop the fractional part
    ;; B2/B3 is 0 now, use as guard bits for rounding
    ;; Restore multiplicand
    wmov    A0, AA0
    wmov    A2, AA2
    rjmp 5f

4:  ;; CC += A:Guard * 2^n;  n < 0
    add  B3,B2 $  adc  CC0,A0  $  adc  CC1,A1  $  adc  CC2,A2  $  adc  CC3,A3
5:
    ;; A:Guard >>= 1
    lsr  A3   $  ror  A2  $  ror  A1  $  ror   A0  $   ror  B2

    ;; FBIT(B) <<= 1
    ;; Carry = n-th bit of B;  n < 0
    lsl     B0
    rol     B1
    brcs 4b
    sbci    B0, 0
    brne 5b

    ;; Save guard bits and set carry for rounding
    push    B3
    lsl     B3
    ;; Move result into place
    wmov    C2, CC2
    wmov    C0, CC0
    clr     __zero_reg__
    brtc 6f
    ;; Round iff T = 1
    adc     C0, __zero_reg__
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
6:
    pop     GUARD
    ;; Epilogue
    pop     CC3
    pop     CC2
    ret
ENDF __mulusa3_round
#endif  /* L_mulusa3 */
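
;; The shift-and-add loops above implement the classic school method: each
;; set bit of B contributes a correspondingly shifted copy of A.  A C
;; model, ignoring the guard-bit rounding (illustrative):
;;
;;      #include <stdint.h>
;;
;;      static uint32_t mulusa (uint32_t a, uint32_t b)  /* both 16.16 */
;;      {
;;          uint64_t acc = 0;
;;          for (int n = 0; n < 32; n++)     /* bit n of B: weight 2^(n-16) */
;;              if (b & ((uint32_t) 1 << n))
;;                  acc += (uint64_t) a << n;
;;          return (uint32_t) (acc >> 16);   /* drop extra fraction bits */
;;      }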

#undef A0
#undef A1
#undef A2
#undef A3
#undef B0
#undef B1
#undef B2
#undef B3
#undef C0
#undef C1
#undef C2
#undef C3
#undef AA0
#undef AA1
#undef AA2
#undef AA3
#undef CC0
#undef CC1
#undef CC2
#undef CC3

#endif /* __AVR_HAVE_MUL__ */

#undef GUARD

/***********************************************************
    Fixed  unsigned saturated Multiplication  8.8 x 8.8
***********************************************************/

#define C0  22
#define C1  C0+1
#define C2  C0+2
#define C3  C0+3
#define SS __tmp_reg__

#if defined (L_usmuluha3)
DEFUN __usmuluha3
    ;; Widening multiply
#ifdef __AVR_HAVE_MUL__
    ;; Adjust interface
    movw    R26, R22
    movw    R18, R24
#endif /* HAVE MUL */
    XCALL   __umulhisi3
    tst     C3
    brne .Lmax
    ;; Round, target is in C1..C2
    lsl     C0
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    brcs .Lmax
    ;; Move result into place
    mov     C3, C2
    mov     C2, C1
    ret
.Lmax:
    ;; Saturate
    ldi     C2, 0xff
    ldi     C3, 0xff
    ret
ENDF  __usmuluha3
#endif /* L_usmuluha3 */
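
;; A C model of the unsigned saturated 8.8 multiply (illustrative):
;;
;;      #include <stdint.h>
;;
;;      static uint16_t usmuluha (uint16_t a, uint16_t b)
;;      {
;;          uint32_t p = (uint32_t) a * b;  /* 16.16 product */
;;          p = (p + 0x80) >> 8;            /* round, back to 8.8 */
;;          return p > 0xffff ? 0xffff : (uint16_t) p;
;;      }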

/***********************************************************
    Fixed signed saturated Multiplication  s8.7 x s8.7
***********************************************************/

#if defined (L_ssmulha3)
DEFUN __ssmulha3
    ;; Widening multiply
#ifdef __AVR_HAVE_MUL__
    ;; Adjust interface
    movw    R26, R22
    movw    R18, R24
#endif /* HAVE MUL */
    XCALL   __mulhisi3
    ;; Adjust decimal point
    lsl     C0
    rol     C1
    rol     C2
    brvs .LsatC3.3
    ;; The 9 MSBs must be the same
    rol     C3
    sbc     SS, SS
    cp      C3, SS
    brne .LsatSS
    ;; Round
    lsl     C0
    adc     C1, __zero_reg__
    adc     C2, __zero_reg__
    brvs .Lmax
    ;; Move result into place
    mov    C3, C2
    mov    C2, C1
    ret
.Lmax:
    ;; Load 0x7fff
    clr     C3
.LsatC3.3:
    ;; C3 <  0 -->  0x8000
    ;; C3 >= 0 -->  0x7fff
    mov     SS, C3
.LsatSS:
    ;; Load min / max value:
    ;; SS = -1  -->  0x8000
    ;; SS =  0  -->  0x7fff
    ldi     C3, 0x7f
    ldi     C2, 0xff
    sbrc    SS, 7
    adiw    C2, 1
    ret
ENDF  __ssmulha3
#endif /* L_ssmulha3 */
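
;; A C model of the signed saturated s8.7 multiply (illustrative; the
;; 64-bit intermediate sidesteps the int32 overflow of (-1)^2 << 1):
;;
;;      #include <stdint.h>
;;
;;      static int16_t ssmulha (int16_t a, int16_t b)
;;      {
;;          int64_t p = ((int64_t) a * b) << 1;  /* 14 -> 15 fraction bits */
;;          p = (p + 0x80) >> 8;                 /* round, back to 16 bits */
;;          if (p >  32767) return  32767;
;;          if (p < -32768) return -32768;
;;          return (int16_t) p;
;;      }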

#undef C0
#undef C1
#undef C2
#undef C3
#undef SS

/***********************************************************
    Fixed  unsigned saturated Multiplication  16.16 x 16.16
***********************************************************/

#define C0  18
#define C1  C0+1
#define C2  C0+2
#define C3  C0+3
#define C4  C0+4
#define C5  C0+5
#define C6  C0+6
#define C7  C0+7
#define SS __tmp_reg__

#if defined (L_usmulusa3)
;; R22[4] = R22[4] *{usat} R18[4]
;; Ordinary ABI function
DEFUN __usmulusa3
    ;; Widening multiply
    XCALL   __umulsidi3
    or      C7, C6
    brne .Lmax
    ;; Round, target is in C2..C5
    lsl     C1
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    adc     C4, __zero_reg__
    adc     C5, __zero_reg__
    brcs .Lmax
    ;; Move result into place
    wmov    C6, C4
    wmov    C4, C2
    ret
.Lmax:
    ;; Saturate
    ldi     C7, 0xff
    ldi     C6, 0xff
    wmov    C4, C6
    ret
ENDF  __usmulusa3
#endif /* L_usmulusa3 */

/***********************************************************
    Fixed signed saturated Multiplication  s16.15 x s16.15
***********************************************************/

#if defined (L_ssmulsa3)
;; R22[4] = R22[4] *{ssat} R18[4]
;; Ordinary ABI function
DEFUN __ssmulsa3
    ;; Widening multiply
    XCALL   __mulsidi3
    ;; Adjust decimal point
    lsl     C1
    rol     C2
    rol     C3
    rol     C4
    rol     C5
    brvs .LsatC7.7
    ;; The 17 MSBs must be the same
    rol     C6
    rol     C7
    sbc     SS, SS
    cp      C6, SS
    cpc     C7, SS
    brne .LsatSS
    ;; Round
    lsl     C1
    adc     C2, __zero_reg__
    adc     C3, __zero_reg__
    adc     C4, __zero_reg__
    adc     C5, __zero_reg__
    brvs .Lmax
    ;; Move result into place
    wmov    C6, C4
    wmov    C4, C2
    ret

.Lmax:
    ;; Load 0x7fffffff
    clr     C7
.LsatC7.7:
    ;; C7 <  0 -->  0x80000000
    ;; C7 >= 0 -->  0x7fffffff
    lsl     C7
    sbc     SS, SS
.LsatSS:
    ;; Load min / max value:
    ;; SS = -1  -->  0x80000000
    ;; SS =  0  -->  0x7fffffff
    com     SS
    mov     C4, SS
    mov     C5, C4
    wmov    C6, C4
    subi    C7, 0x80
    ret
ENDF  __ssmulsa3
#endif /* L_ssmulsa3 */

#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#undef SS

/*******************************************************
      Fractional Division 8 / 8
*******************************************************/

#define r_divd  r25     /* dividend */
#define r_quo   r24     /* quotient */
#define r_div   r22     /* divisor */
#define r_sign  __tmp_reg__

#if defined (L_divqq3)
DEFUN   __divqq3
    mov     r_sign, r_divd
    eor     r_sign, r_div
    sbrc    r_div, 7
    neg     r_div
    sbrc    r_divd, 7
    neg     r_divd
    XCALL   __divqq_helper
    lsr     r_quo
    sbrc    r_sign, 7   ; negate result if needed
    neg     r_quo
    ret
ENDF __divqq3
#endif  /* L_divqq3 */

#if defined (L_udivuqq3)
DEFUN   __udivuqq3
    cp      r_divd, r_div
    brsh    0f
    XJMP __divqq_helper
    ;; Result is out of [0, 1)  ==>  Return 1 - eps.
0:  ldi     r_quo, 0xff
    ret
ENDF __udivuqq3
#endif  /* L_udivuqq3 */


#if defined (L_divqq_helper)
DEFUN   __divqq_helper
    clr     r_quo           ; clear quotient
    inc     __zero_reg__    ; init loop counter, used per shift
__udivuqq3_loop:
    lsl     r_divd          ; shift dividend
    brcs    0f              ; dividend overflow
    cp      r_divd,r_div    ; compare dividend & divisor
    brcc    0f              ; dividend >= divisor
    rol     r_quo           ; shift quotient (with CARRY)
    rjmp    __udivuqq3_cont
0:
    sub     r_divd,r_div    ; restore dividend
    lsl     r_quo           ; shift quotient (without CARRY)
__udivuqq3_cont:
    lsl     __zero_reg__    ; shift loop-counter bit
    brne    __udivuqq3_loop
    com     r_quo           ; complement result
                            ; because C flag was complemented in loop
    ret
ENDF __divqq_helper
#endif  /* L_divqq_helper */
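
;; A C model of the signed Q7 division (illustrative; like the asm it
;; assumes |a| < |b|, so the quotient lies in [0, 1)):
;;
;;      #include <stdint.h>
;;
;;      static int8_t divqq (int8_t a, int8_t b)
;;      {
;;          uint8_t  ua = (uint8_t) (a < 0 ? -a : a);
;;          uint8_t  ub = (uint8_t) (b < 0 ? -b : b);
;;          uint8_t  q  = (uint8_t) (((uint16_t) ua << 8) / ub);  /* Q8 */
;;          int8_t   r  = (int8_t) (q >> 1);                      /* Q7 */
;;          return ((a ^ b) < 0) ? (int8_t) -r : r;
;;      }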

#undef  r_divd
#undef  r_quo
#undef  r_div
#undef  r_sign


/*******************************************************
    Fractional Division 16 / 16
*******************************************************/
#define r_divdL 26     /* dividend Low */
#define r_divdH 27     /* dividend High */
#define r_quoL  24     /* quotient Low */
#define r_quoH  25     /* quotient High */
#define r_divL  22     /* divisor Low */
#define r_divH  23     /* divisor High */
#define r_cnt   21

#if defined (L_divhq3)
DEFUN   __divhq3
    mov     r0, r_divdH
    eor     r0, r_divH
    sbrs    r_divH, 7
    rjmp    1f
    NEG2    r_divL
1:
    sbrs    r_divdH, 7
    rjmp    2f
    NEG2    r_divdL
2:
    cp      r_divdL, r_divL
    cpc     r_divdH, r_divH
    breq    __divhq3_minus1  ; if equal return -1
    XCALL   __udivuhq3
    lsr     r_quoH
    ror     r_quoL
    brpl    9f
    ;; negate result if needed
    NEG2    r_quoL
9:
    ret
__divhq3_minus1:
    ldi     r_quoH, 0x80
    clr     r_quoL
    ret
ENDF __divhq3
#endif  /* defined (L_divhq3) */

#if defined (L_udivuhq3)
DEFUN   __udivuhq3
    sub     r_quoH,r_quoH   ; clear quotient and carry
    ;; FALLTHRU
ENDF __udivuhq3

DEFUN   __udivuha3_common
    clr     r_quoL          ; clear quotient
    ldi     r_cnt,16        ; init loop counter
__udivuhq3_loop:
    rol     r_divdL         ; shift dividend (with CARRY)
    rol     r_divdH
    brcs    __udivuhq3_ep   ; dividend overflow
    cp      r_divdL,r_divL  ; compare dividend & divisor
    cpc     r_divdH,r_divH
    brcc    __udivuhq3_ep   ; dividend >= divisor
    rol     r_quoL          ; shift quotient (with CARRY)
    rjmp    __udivuhq3_cont
__udivuhq3_ep:
    sub     r_divdL,r_divL  ; restore dividend
    sbc     r_divdH,r_divH
    lsl     r_quoL          ; shift quotient (without CARRY)
__udivuhq3_cont:
    rol     r_quoH          ; shift quotient
    dec     r_cnt           ; decrement loop counter
    brne    __udivuhq3_loop
    com     r_quoL          ; complement result
    com     r_quoH          ; because C flag was complemented in loop
    ret
ENDF __udivuha3_common
#endif  /* defined (L_udivuhq3) */
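
;; A C model of the unsigned Q16 fractional division (illustrative; valid
;; for a < b, i.e. a result in [0, 1)):
;;
;;      #include <stdint.h>
;;
;;      static uint16_t udivuhq (uint16_t a, uint16_t b)
;;      {
;;          return (uint16_t) (((uint32_t) a << 16) / b);
;;      }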

/*******************************************************
    Fixed Division 8.8 / 8.8
*******************************************************/
#if defined (L_divha3)
DEFUN   __divha3
    mov     r0, r_divdH
    eor     r0, r_divH
    sbrs    r_divH, 7
    rjmp    1f
    NEG2    r_divL
1:
    sbrs    r_divdH, 7
    rjmp    2f
    NEG2    r_divdL
2:
    XCALL   __udivuha3
    lsr     r_quoH  ; adjust to 7 fractional bits
    ror     r_quoL
    sbrs    r0, 7   ; negate result if needed
    ret
    NEG2    r_quoL
    ret
ENDF __divha3
#endif  /* defined (L_divha3) */

#if defined (L_udivuha3)
DEFUN   __udivuha3
    mov     r_quoH, r_divdL
    mov     r_divdL, r_divdH
    clr     r_divdH
    lsl     r_quoH     ; shift quotient into carry
    XJMP    __udivuha3_common ; same as fractional after rearrange
ENDF __udivuha3
#endif  /* defined (L_udivuha3) */

#undef  r_divdL
#undef  r_divdH
#undef  r_quoL
#undef  r_quoH
#undef  r_divL
#undef  r_divH
#undef  r_cnt

/*******************************************************
    Fixed Division 16.16 / 16.16
*******************************************************/

#define r_arg1L  24    /* arg1 gets passed already in place */
#define r_arg1H  25
#define r_arg1HL 26
#define r_arg1HH 27
#define r_divdL  26    /* dividend Low */
#define r_divdH  27
#define r_divdHL 30
#define r_divdHH 31    /* dividend High */
#define r_quoL   22    /* quotient Low */
#define r_quoH   23
#define r_quoHL  24
#define r_quoHH  25    /* quotient High */
#define r_divL   18    /* divisor Low */
#define r_divH   19
#define r_divHL  20
#define r_divHH  21    /* divisor High */
#define r_cnt  __zero_reg__  /* loop count (0 after the loop!) */

#if defined (L_divsa3)
DEFUN   __divsa3
    mov     r0, r_arg1HH
    eor     r0, r_divHH
    sbrs    r_divHH, 7
    rjmp    1f
    NEG4    r_divL
1:
    sbrs    r_arg1HH, 7
    rjmp    2f
    NEG4    r_arg1L
2:
    XCALL   __udivusa3
    lsr     r_quoHH ; adjust to 15 fractional bits
    ror     r_quoHL
    ror     r_quoH
    ror     r_quoL
    sbrs    r0, 7   ; negate result if needed
    ret
    ;; negate r_quoL
    XJMP    __negsi2
ENDF __divsa3
#endif  /* defined (L_divsa3) */

#if defined (L_udivusa3)
DEFUN   __udivusa3
    ldi     r_divdHL, 32    ; init loop counter
    mov     r_cnt, r_divdHL
    clr     r_divdHL
    clr     r_divdHH
    wmov    r_quoL, r_divdHL
    lsl     r_quoHL         ; shift quotient into carry
    rol     r_quoHH
__udivusa3_loop:
    rol     r_divdL         ; shift dividend (with CARRY)
    rol     r_divdH
    rol     r_divdHL
    rol     r_divdHH
    brcs    __udivusa3_ep   ; dividend overflow
    cp      r_divdL,r_divL  ; compare dividend & divisor
    cpc     r_divdH,r_divH
    cpc     r_divdHL,r_divHL
    cpc     r_divdHH,r_divHH
    brcc    __udivusa3_ep   ; dividend >= divisor
    rol     r_quoL          ; shift quotient (with CARRY)
    rjmp    __udivusa3_cont
__udivusa3_ep:
    sub     r_divdL,r_divL  ; restore dividend
    sbc     r_divdH,r_divH
    sbc     r_divdHL,r_divHL
    sbc     r_divdHH,r_divHH
    lsl     r_quoL          ; shift quotient (without CARRY)
__udivusa3_cont:
    rol     r_quoH          ; shift quotient
    rol     r_quoHL
    rol     r_quoHH
    dec     r_cnt           ; decrement loop counter
    brne    __udivusa3_loop
    com     r_quoL          ; complement result
    com     r_quoH          ; because C flag was complemented in loop
    com     r_quoHL
    com     r_quoHH
    ret
ENDF __udivusa3
#endif  /* defined (L_udivusa3) */

#undef  r_arg1L
#undef  r_arg1H
#undef  r_arg1HL
#undef  r_arg1HH
#undef  r_divdL
#undef  r_divdH
#undef  r_divdHL
#undef  r_divdHH
#undef  r_quoL
#undef  r_quoH
#undef  r_quoHL
#undef  r_quoHH
#undef  r_divL
#undef  r_divH
#undef  r_divHL
#undef  r_divHH
#undef  r_cnt


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 1 Byte
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0  24

#if defined (L_ssabs_1)
DEFUN __ssabs_1
    sbrs    A0, 7
    ret
    neg     A0
    sbrc    A0,7
    dec     A0
    ret
ENDF __ssabs_1
#endif /* L_ssabs_1 */
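
;; A C model of the saturated 8-bit absolute value (illustrative):
;;
;;      #include <stdint.h>
;;
;;      static int8_t ssabs1 (int8_t a)
;;      {
;;          if (a >= 0)
;;              return a;
;;          if (a == -128)          /* -(-128) does not fit, saturate */
;;              return 127;
;;          return (int8_t) -a;
;;      }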

#undef A0



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 2 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0  24
#define A1  A0+1

#if defined (L_ssneg_2)
DEFUN __ssneg_2
    NEG2    A0
    brvc 0f
    sbiw    A0, 1
0:  ret
ENDF __ssneg_2
#endif /* L_ssneg_2 */

#if defined (L_ssabs_2)
DEFUN __ssabs_2
    sbrs    A1, 7
    ret
    XJMP    __ssneg_2
ENDF __ssabs_2
#endif /* L_ssabs_2 */

#undef A0
#undef A1



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 4 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0  22
#define A1  A0+1
#define A2  A0+2
#define A3  A0+3

#if defined (L_ssneg_4)
DEFUN __ssneg_4
    XCALL   __negsi2
    brvc 0f
    ldi     A3, 0x7f
    ldi     A2, 0xff
    ldi     A1, 0xff
    ldi     A0, 0xff
0:  ret
ENDF __ssneg_4
#endif /* L_ssneg_4 */

#if defined (L_ssabs_4)
DEFUN __ssabs_4
    sbrs    A3, 7
    ret
    XJMP    __ssneg_4
ENDF __ssabs_4
#endif /* L_ssabs_4 */

#undef A0
#undef A1
#undef A2
#undef A3



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 8 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#define A0  18
#define A1  A0+1
#define A2  A0+2
#define A3  A0+3
#define A4  A0+4
#define A5  A0+5
#define A6  A0+6
#define A7  A0+7

#if defined (L_clr_8)
FALIAS __usneguta2
FALIAS __usneguda2
FALIAS __usnegudq2

;; Clear Carry and all Bytes
DEFUN __clr_8
    ;; Clear Carry and set Z
    sub     A7, A7
    ;; FALLTHRU
ENDF  __clr_8
;; Propagate Carry to all Bytes, Carry unaltered
DEFUN __sbc_8
    sbc     A7, A7
    sbc     A6, A6
    wmov    A4, A6
    wmov    A2, A6
    wmov    A0, A6
    ret
ENDF __sbc_8
#endif /* L_clr_8 */

#if defined (L_ssneg_8)
FALIAS __ssnegta2
FALIAS __ssnegda2
FALIAS __ssnegdq2

DEFUN __ssneg_8
    XCALL   __negdi2
    brvc 0f
    ;; A[] = 0x7fffffffffffffff
    sec
    XCALL   __sbc_8
    ldi     A7, 0x7f
0:  ret
ENDF __ssneg_8
#endif /* L_ssneg_8 */

#if defined (L_ssabs_8)
FALIAS __ssabsta2
FALIAS __ssabsda2
FALIAS __ssabsdq2

DEFUN __ssabs_8
    sbrs    A7, 7
    ret
    XJMP    __ssneg_8
ENDF __ssabs_8
#endif /* L_ssabs_8 */

;; Second Argument
#define B0  10
#define B1  B0+1
#define B2  B0+2
#define B3  B0+3
#define B4  B0+4
#define B5  B0+5
#define B6  B0+6
#define B7  B0+7

#if defined (L_usadd_8)
FALIAS __usadduta3
FALIAS __usadduda3
FALIAS __usaddudq3

DEFUN __usadd_8
    XCALL   __adddi3
    brcs 0f
    ret
0:  ;; A[] = 0xffffffffffffffff
    XJMP    __sbc_8
ENDF __usadd_8
#endif /* L_usadd_8 */

#if defined (L_ussub_8)
FALIAS __ussubuta3
FALIAS __ussubuda3
FALIAS __ussubudq3

DEFUN __ussub_8
    XCALL   __subdi3
    brcs 0f
    ret
0:  ;; A[] = 0
    XJMP    __clr_8
ENDF __ussub_8
#endif /* L_ussub_8 */

#if defined (L_ssadd_8)
FALIAS __ssaddta3
FALIAS __ssaddda3
FALIAS __ssadddq3

DEFUN __ssadd_8
    XCALL   __adddi3
    brvc 0f
    ;; A = (B >= 0) ? INT64_MAX : INT64_MIN
    cpi     B7, 0x80
    XCALL   __sbc_8
    subi    A7, 0x80
0:  ret
ENDF __ssadd_8
#endif /* L_ssadd_8 */

#if defined (L_sssub_8)
FALIAS __sssubta3
FALIAS __sssubda3
FALIAS __sssubdq3

DEFUN __sssub_8
    XCALL   __subdi3
    brvc 0f
    ;; A = (B < 0) ? INT64_MAX : INT64_MIN
    ldi     A7, 0x7f
    cp      A7, B7
    XCALL   __sbc_8
    subi    A7, 0x80
0:  ret
ENDF __sssub_8
#endif /* L_sssub_8 */

#undef A0
#undef A1
#undef A2
#undef A3
#undef A4
#undef A5
#undef A6
#undef A7
#undef B0
#undef B1
#undef B2
#undef B3
#undef B4
#undef B5
#undef B6
#undef B7


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding Helpers
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#ifdef L_mask1

#define AA 24
#define CC 25

;; R25 = 1 << (R24 & 7)
;; CC  = 1 << (AA  & 7)
;; Clobbers: None
DEFUN __mask1
    ;; CC = 4 ^ AA.1
    ldi     CC, 1 << 2
    sbrs    AA, 1
    ldi     CC, 1 << 0
    ;; CC *= 2 ^ AA.0
    sbrc    AA, 0
    lsl     CC
    ;; CC *= 16 ^ AA.2
    sbrc    AA, 2
    swap    CC
    ret
ENDF __mask1

#undef AA
#undef CC
#endif /* L_mask1 */
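
;; A C model of the skip-based power-of-two construction used by __mask1
;; (illustrative): build 1 << (n & 7) from the three low bits of n.
;;
;;      #include <stdint.h>
;;
;;      static uint8_t mask1 (uint8_t n)
;;      {
;;          uint8_t c = (n & 2) ? 1 << 2 : 1 << 0;       /* 4 ^ n.1  */
;;          if (n & 1)
;;              c <<= 1;                                 /* 2 ^ n.0  */
;;          if (n & 4)
;;              c = (uint8_t) ((c << 4) | (c >> 4));     /* 16 ^ n.2 */
;;          return c;
;;      }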

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The rounding point. Any bits smaller than
;; 2^{-RP} will be cleared.
#define RP R24

#define A0 22
#define A1 A0 + 1

#define C0 24
#define C1 C0 + 1

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 1 Byte
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#ifdef L_roundqq3

;; R24 = round (R22, R24)
;; Clobbers: R22, __tmp_reg__
DEFUN  __roundqq3
    mov     __tmp_reg__, C1
    subi    RP, __QQ_FBIT__ - 1
    neg     RP
    ;; R25 = 1 << RP  (Total offset is FBIT-1 - RP)
    XCALL   __mask1
    mov     C0, C1
    ;; Add-Saturate 2^{-RP-1}
    add     A0, C0
    brvc 0f
    ldi     C0, 0x7f
    rjmp 9f
0:  ;; Mask out bits beyond RP
    lsl     C0
    neg     C0
    and     C0, A0
9:  mov     C1, __tmp_reg__
    ret
ENDF  __roundqq3
#endif /* L_roundqq3 */
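
;; A C model of the signed Q7 rounding above (illustrative; rp in 0..6):
;; add half of the rounding increment with saturation, then clear the
;; bits below 2^{-rp}.
;;
;;      #include <stdint.h>
;;
;;      static int8_t roundqq (int8_t x, uint8_t rp)
;;      {
;;          uint8_t half = (uint8_t) 1 << (7 - 1 - rp);  /* 2^{-rp-1} */
;;          int16_t y    = x + half;
;;          if (y > 127)
;;              return 127;                              /* saturate */
;;          return (int8_t) (y & -(2 * half));           /* mask low bits */
;;      }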

#ifdef L_rounduqq3

;; R24 = round (R22, R24)
;; Clobbers: R22, __tmp_reg__
DEFUN  __rounduqq3
    mov     __tmp_reg__, C1
    subi    RP, __UQQ_FBIT__ - 1
    neg     RP
    ;; R25 = 1 << RP  (Total offset is FBIT-1 - RP)
    XCALL   __mask1
    mov     C0, C1
    ;; Add-Saturate 2^{-RP-1}
    add     A0, C0
    brcc 0f
    ldi     C0, 0xff
    rjmp 9f
0:  ;; Mask out bits beyond RP
    lsl     C0
    neg     C0
    and     C0, A0
9:  mov     C1, __tmp_reg__
    ret
ENDF  __rounduqq3
#endif /* L_rounduqq3 */

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 2 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#ifdef L_addmask_2

;; [ R25:R24 =  1 << (R24 & 15)
;;   R23:R22 += 1 << (R24 & 15) ]
;; SREG is set according to the addition
DEFUN __addmask_2
    ;; R25 = 1 << (R24 & 7)
    XCALL   __mask1
    cpi     RP, 1 << 3
    sbc     C0, C0
    ;; Swap C0 and C1 if RP.3 was set
    and     C0, C1
    eor     C1, C0
    ;; Finally, add the power-of-two:  A[] += C[]
    add     A0, C0
    adc     A1, C1
    ret
ENDF  __addmask_2
#endif /* L_addmask_2 */

#ifdef L_round_s2

;; R25:R24 = round (R23:R22, R24)
;; Clobbers: R23, R22
DEFUN  __roundhq3
    subi    RP, __HQ_FBIT__ - __HA_FBIT__
ENDF   __roundhq3
DEFUN  __roundha3
    subi    RP, __HA_FBIT__ - 1
    neg     RP
    ;; [ R25:R24  = 1 << (FBIT-1 - RP)
    ;;   R23:R22 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_2
    XJMP    __round_s2_const
ENDF  __roundha3

#endif /* L_round_s2 */

#ifdef L_round_u2

;; R25:R24 = round (R23:R22, R24)
;; Clobbers: R23, R22
DEFUN  __rounduhq3
    subi    RP, __UHQ_FBIT__ - __UHA_FBIT__
ENDF   __rounduhq3
DEFUN  __rounduha3
    subi    RP, __UHA_FBIT__ - 1
    neg     RP
    ;; [ R25:R24  = 1 << (FBIT-1 - RP)
    ;;   R23:R22 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_2
    XJMP    __round_u2_const
ENDF  __rounduha3

#endif /* L_round_u2 */


#ifdef L_round_2_const

;; Helpers for 2 byte wide rounding

DEFUN  __round_s2_const
    brvc 2f
    ldi     C1, 0x7f
    rjmp 1f
    ;; FALLTHRU (Barrier)
ENDF  __round_s2_const

DEFUN __round_u2_const
    brcc 2f
    ldi     C1, 0xff
1:
    ldi     C0, 0xff
    rjmp 9f
2:
    ;; Saturation is performed now.
    ;; Currently, we have C[] = 2^{-RP-1}
    ;; C[] = 2^{-RP}
    lsl     C0
    rol     C1
    ;; C[] = -C[]
    NEG2    C0
    ;; Clear the bits beyond the rounding point.
    and     C0, A0
    and     C1, A1
9:  ret
ENDF  __round_u2_const

#endif /* L_round_2_const */

#undef A0
#undef A1
#undef C0
#undef C1

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 4 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#define A0 18
#define A1 A0 + 1
#define A2 A0 + 2
#define A3 A0 + 3

#define C0 22
#define C1 C0 + 1
#define C2 C0 + 2
#define C3 C0 + 3

#ifdef L_addmask_4

;; [ R25:R22 =  1 << (R24 & 31)
;;   R21:R18 += 1 << (R24 & 31) ]
;; SREG is set according to the addition
DEFUN __addmask_4
    ;; R25 = 1 << (R24 & 7)
    XCALL   __mask1
    cpi     RP, 1 << 4
    sbc     C0, C0
    sbc     C1, C1
    ;; Swap C2 with C3 if RP.3 is not set
    cpi     RP, 1 << 3
    sbc     C2, C2
    and     C2, C3
    eor     C3, C2
    ;; Swap C3:C2 with C1:C0 if RP.4 is not set
    and     C0, C2  $  eor     C2, C0
    and     C1, C3  $  eor     C3, C1
    ;; Finally, add the power-of-two:  A[] += C[]
    add     A0, C0
    adc     A1, C1
    adc     A2, C2
    adc     A3, C3
    ret
ENDF  __addmask_4
#endif /* L_addmask_4 */

#ifdef L_round_s4

;; R25:R22 = round (R21:R18, R24)
;; Clobbers: R18...R21
DEFUN  __roundsq3
    subi    RP, __SQ_FBIT__ - __SA_FBIT__
ENDF   __roundsq3
DEFUN  __roundsa3
    subi    RP, __SA_FBIT__ - 1
    neg     RP
    ;; [ R25:R22  = 1 << (FBIT-1 - RP)
    ;;   R21:R18 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_4
    XJMP    __round_s4_const
ENDF  __roundsa3

#endif /* L_round_s4 */

#ifdef L_round_u4

;; R25:R22 = round (R21:R18, R24)
;; Clobbers: R18...R21
DEFUN  __roundusq3
    subi    RP, __USQ_FBIT__ - __USA_FBIT__
ENDF   __roundusq3
DEFUN  __roundusa3
    subi    RP, __USA_FBIT__ - 1
    neg     RP
    ;; [ R25:R22  = 1 << (FBIT-1 - RP)
    ;;   R21:R18 += 1 << (FBIT-1 - RP) ]
    XCALL   __addmask_4
    XJMP    __round_u4_const
ENDF  __roundusa3

#endif /* L_round_u4 */


#ifdef L_round_4_const

;; Helpers for 4 byte wide rounding

DEFUN  __round_s4_const
    brvc 2f
    ldi     C3, 0x7f
    rjmp 1f
    ;; FALLTHRU (Barrier)
ENDF  __round_s4_const

DEFUN __round_u4_const
    brcc 2f
    ldi     C3, 0xff
1:
    ldi     C2, 0xff
    ldi     C1, 0xff
    ldi     C0, 0xff
    rjmp 9f
2:
    ;; Saturation is performed now.
    ;; Currently, we have C[] = 2^{-RP-1}
    ;; C[] = 2^{-RP}
    lsl     C0
    rol     C1
    rol     C2
    rol     C3
    XCALL   __negsi2
    ;; Clear the bits beyond the rounding point.
    and     C0, A0
    and     C1, A1
    and     C2, A2
    and     C3, A3
9:  ret
ENDF  __round_u4_const

#endif /* L_round_4_const */

#undef A0
#undef A1
#undef A2
#undef A3
#undef C0
#undef C1
#undef C2
#undef C3

#undef RP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 8 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#define RP     16
#define FBITm1 31

#define C0 18
#define C1 C0 + 1
#define C2 C0 + 2
#define C3 C0 + 3
#define C4 C0 + 4
#define C5 C0 + 5
#define C6 C0 + 6
#define C7 C0 + 7

#define A0 16
#define A1 17
#define A2 26
#define A3 27
#define A4 28
#define A5 29
#define A6 30
#define A7 31


#ifdef L_rounddq3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN  __rounddq3
    ldi     FBITm1, __DQ_FBIT__ - 1
    clt
    XJMP    __round_x8
ENDF  __rounddq3
#endif /* L_rounddq3 */

#ifdef L_roundudq3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN  __roundudq3
    ldi     FBITm1, __UDQ_FBIT__ - 1
    set
    XJMP    __round_x8
ENDF  __roundudq3
#endif /* L_roundudq3 */

#ifdef L_roundda3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN  __roundda3
    ldi     FBITm1, __DA_FBIT__ - 1
    clt
    XJMP    __round_x8
ENDF  __roundda3
#endif /* L_roundda3 */

#ifdef L_rounduda3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN  __rounduda3
    ldi     FBITm1, __UDA_FBIT__ - 1
    set
    XJMP    __round_x8
ENDF  __rounduda3
#endif /* L_rounduda3 */

#ifdef L_roundta3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN  __roundta3
    ldi     FBITm1, __TA_FBIT__ - 1
    clt
    XJMP    __round_x8
ENDF  __roundta3
#endif /* L_roundta3 */

#ifdef L_rounduta3
;; R25:R18 = round (R25:R18, R16)
;; Clobbers: ABI
DEFUN  __rounduta3
    ldi     FBITm1, __UTA_FBIT__ - 1
    set
    XJMP    __round_x8
ENDF  __rounduta3
#endif /* L_rounduta3 */


#ifdef L_round_x8
DEFUN __round_x8
    push r16
    push r17
    push r28
    push r29
    ;; Compute log2 of addend from rounding point
    sub     RP, FBITm1
    neg     RP
    ;; Move input to work register A[]
    push    C0
    mov     A1, C1
    wmov    A2, C2
    wmov    A4, C4
    wmov    A6, C6
    ;; C[] = 1 << (FBIT-1 - RP)
    XCALL   __clr_8
    inc     C0
    XCALL   __ashldi3
    pop     A0
    ;; A[] += C[]
    add     A0, C0
    adc     A1, C1
    adc     A2, C2
    adc     A3, C3
    adc     A4, C4
    adc     A5, C5
    adc     A6, C6
    adc     A7, C7
    brts    1f
    ;; Signed
    brvc    3f
    ;; Signed overflow: A[] = 0x7f...
    brvs    2f
1:  ;; Unsigned
    brcc    3f
    ;; Unsigned overflow: A[] = 0xff...
2:  ldi     C7, 0xff
    ldi     C6, 0xff
    wmov    C0, C6
    wmov    C2, C6
    wmov    C4, C6
    bld     C7, 7
    rjmp 9f
3:
    ;;  C[] = -C[] - C[]
    push    A0
    ldi     r16, 1
    XCALL   __ashldi3
    pop     A0
    XCALL   __negdi2
    ;; Clear the bits beyond the rounding point.
    and     C0, A0
    and     C1, A1
    and     C2, A2
    and     C3, A3
    and     C4, A4
    and     C5, A5
    and     C6, A6
    and     C7, A7
9:  ;; Epilogue
    pop r29
    pop r28
    pop r17
    pop r16
    ret
ENDF  __round_x8

#endif /* L_round_x8 */

#undef A0
#undef A1
#undef A2
#undef A3
#undef A4
#undef A5
#undef A6
#undef A7

#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7

#undef RP
#undef FBITm1


;; Supply implementations / symbols for the bit-banging functions
;; __builtin_avr_bitsfx and __builtin_avr_fxbits
#ifdef L_ret
DEFUN __ret
    ret
ENDF  __ret
#endif /* L_ret */

#endif /* if not __AVR_TINY__ */