1 /* Copyright (C) 2019-2021 Free Software Foundation, Inc.
2 
3    This file is part of LIBF7, which is part of GCC.
4 
5    GCC is free software; you can redistribute it and/or modify it under
6    the terms of the GNU General Public License as published by the Free
7    Software Foundation; either version 3, or (at your option) any later
8    version.
9 
10    GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11    WARRANTY; without even the implied warranty of MERCHANTABILITY or
12    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13    for more details.
14 
15    Under Section 7 of GPL version 3, you are granted additional
16    permissions described in the GCC Runtime Library Exception, version
17    3.1, as published by the Free Software Foundation.
18 
19    You should have received a copy of the GNU General Public License and
20    a copy of the GCC Runtime Library Exception along with this program;
21    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22    <http://www.gnu.org/licenses/>.  */
23 
24 #include "libf7.h"
25 
26 #ifndef __AVR_TINY__
27 
28 #define ALIAS(X, Y) \
29   F7_WEAK __attribute__((__alias__(F7_STRINGY(X)))) __typeof__(X) Y;
30 
31 #define DALIAS(...) // empty
32 #define LALIAS(...) // empty
33 
34 #ifndef IN_LIBGCC2
35 
36 #include <stdio.h>
37 #include <assert.h>
38 
39 #define in_libgcc false
40 
41 _Static_assert (sizeof (f7_t) == 10 && F7_MANT_BYTES == 7,
42 		"libf7 will only work with 7-byte mantissa.");
43 #else
44 
45 #define in_libgcc true
46 
47 #if __SIZEOF_DOUBLE__ == 8
48 #undef  DALIAS
49 #define DALIAS(X,Y) \
50   F7_WEAK __attribute__((__alias__(F7_STRINGY(X)))) __typeof__(X) Y;
51 #endif
52 
53 #if __SIZEOF_LONG_DOUBLE__ == 8
54 #undef  LALIAS
55 #define LALIAS(X,Y) \
56   F7_WEAK __attribute__((__alias__(F7_STRINGY(X)))) __typeof__(X) Y;
57 #endif
58 
59 #endif // in libgcc
60 
61 static F7_INLINE
f7_assert(bool x)62 void f7_assert (bool x)
63 {
64   if (!in_libgcc && !x)
65     __builtin_abort();
66 }
67 
68 static F7_INLINE
abs_ssat16(int16_t a)69 int16_t abs_ssat16 (int16_t a)
70 {
71   _Sat _Fract sa = __builtin_avr_rbits (a);
72   return __builtin_avr_bitsr (__builtin_avr_absr (sa));
73 }
74 
75 static F7_INLINE
add_ssat16(int16_t a,int16_t b)76 int16_t add_ssat16 (int16_t a, int16_t b)
77 {
78   _Sat _Fract sa = __builtin_avr_rbits (a);
79   _Sat _Fract sb = __builtin_avr_rbits (b);
80   return __builtin_avr_bitsr (sa + sb);
81 }
82 
83 static F7_INLINE
sub_ssat16(int16_t a,int16_t b)84 int16_t sub_ssat16 (int16_t a, int16_t b)
85 {
86   _Sat _Fract sa = __builtin_avr_rbits (a);
87   _Sat _Fract sb = __builtin_avr_rbits (b);
88   return __builtin_avr_bitsr (sa - sb);
89 }
90 
91 static F7_INLINE
ssat8_range(int16_t a,int8_t range)92 int8_t ssat8_range (int16_t a, int8_t range)
93 {
94   if (a >= range)
95     return range;
96   if (a <= -range)
97     return -range;
98   return a;
99 }
100 
101 
102 #define IN_LIBF7_H
103   #define F7_CONST_DEF(NAME, FLAGS, M6, M5, M4, M3, M2, M1, M0, EXPO) \
104     F7_UNUSED static const uint8_t F7_(const_##NAME##_msb)  = M6;     \
105     F7_UNUSED static const int16_t F7_(const_##NAME##_expo) = EXPO;
106   #include "libf7-const.def"
107   #undef F7_CONST_DEF
108 #undef IN_LIBF7_H
109 
110 
111 /*
112   libgcc naming converntions for conversions:
113 
114    __float<fmode><fmode>  : Convert float modes.
115    __floatun<imode><fmode>: Convert unsigned integral to float.
116    __fix<fmode><imode>    : Convert float to signed integral.
117    __fixuns<fmode><imode> : Convert float to unsigned integral.
118 */
119 
120 
121 #ifdef F7MOD_floatundidf_
122 F7_WEAK
__floatundidf(uint64_t x)123 f7_double_t __floatundidf (uint64_t x)
124 {
125   f7_t xx;
126   f7_set_u64 (&xx, x);
127   return f7_get_double (&xx);
128 }
129 #endif // F7MOD_floatundidf_
130 
131 
132 #ifdef F7MOD_floatdidf_
133 F7_WEAK
__floatdidf(int64_t x)134 f7_double_t __floatdidf (int64_t x)
135 {
136   f7_t xx;
137   f7_set_s64 (&xx, x);
138   return f7_get_double (&xx);
139 }
140 #endif // F7MOD_floatdidf_
141 
142 
143 #ifdef F7MOD_init_
f7_init_impl(uint64_t mant,uint8_t flags,f7_t * cc,int16_t expo)144 f7_t* f7_init_impl (uint64_t mant, uint8_t flags, f7_t *cc, int16_t expo)
145 {
146   flags &= F7_FLAGS;
147   if (f7_class_number (flags))
148     {
149       uint8_t msb;
150       while ((__builtin_memcpy (&msb, (uint8_t*) &mant + 7, 1), msb))
151 	{
152 	  mant >>= 1;
153 	  expo = add_ssat16 (expo, 1);
154 	}
155       *(uint64_t*) cc->mant = mant;
156       cc->expo = add_ssat16 (expo, F7_MANT_BITS-1);
157 
158       cc = f7_normalize_asm (cc);
159     }
160 
161   cc->flags = flags;
162 
163   return cc;
164 }
165 #endif // F7MOD_init_
166 
167 
168 #ifdef F7MOD_set_s16_
f7_set_s16_impl(f7_t * cc,int16_t i16)169 f7_t* f7_set_s16_impl (f7_t *cc, int16_t i16)
170 {
171   uint16_t u16 = (uint16_t) i16;
172   uint8_t flags = 0;
173   if (i16 < 0)
174     {
175       u16 = -u16;
176       flags = F7_FLAG_sign;
177     }
178   f7_set_u16_impl (cc, u16);
179   cc->flags = flags;
180   return cc;
181 }
182 #endif // F7MOD_set_s16_
183 
184 
185 #ifdef F7MOD_set_u16_
f7_set_u16_impl(f7_t * cc,uint16_t u16)186 f7_t* f7_set_u16_impl (f7_t *cc, uint16_t u16)
187 {
188   f7_clr (cc);
189   F7_MANT_HI2 (cc) = u16;
190   cc->expo = 15;
191   return f7_normalize_asm (cc);
192 }
193 #endif // F7MOD_set_u16_
194 
195 
196 #ifdef F7MOD_set_s32_
f7_set_s32(f7_t * cc,int32_t i32)197 f7_t* f7_set_s32 (f7_t *cc, int32_t i32)
198 {
199   uint32_t u32 = (uint32_t) i32;
200   uint8_t flags = 0;
201   if (i32 < 0)
202     {
203       u32 = -u32;
204       flags = F7_FLAG_sign;
205     }
206   cc = f7_set_u32 (cc, u32);
207   cc->flags = flags;
208   return cc;
209 }
ALIAS(f7_set_s32,f7_floatsidf)210 ALIAS (f7_set_s32, f7_floatsidf)
211 #endif // F7MOD_set_s32_
212 
213 
214 #ifdef F7MOD_set_u32_
215 f7_t* f7_set_u32 (f7_t *cc, uint32_t u32)
216 {
217   f7_clr (cc);
218   F7_MANT_HI4 (cc) = u32;
219   cc->expo = 31;
220   return f7_normalize_asm (cc);
221 }
ALIAS(f7_set_u32,f7_floatunsidf)222 ALIAS (f7_set_u32, f7_floatunsidf)
223 #endif // F7MOD_set_u32_
224 
225 
226 // IEEE 754 single
227 // float =  s  bbbbbbbb mmmmmmmmmmmmmmmmmmmmmmm
228 //	   31
229 // s = sign
230 // b = biased exponent, bias = 127
231 // m = mantissa
232 
233 // +0	      =	 0 0 0
234 // -0	      =	 1 0 0
235 // Inf	      =	 S B 0	=  S * Inf, B = 0xff
236 // NaN	      =	 S B M,	   B = 0xff, M != 0
237 // Sub-normal =	 S 0 M	=  S * 0.M * 2^{1 - bias}, M != 0
238 // Normal     =  S B M  =  S * 1.M * 2^{B - bias}, B = 1 ... 0xfe
239 
240 #define FLT_DIG_EXP   8
241 #define FLT_DIG_MANT  (31 - FLT_DIG_EXP)
242 #define FLT_MAX_EXP   ((1 << FLT_DIG_EXP) - 1)
243 #define FLT_EXP_BIAS  (FLT_MAX_EXP >> 1)
244 
245 #ifdef F7MOD_set_float_
246 F7_WEAK
247 void f7_set_float (f7_t *cc, float f)
248 {
249   uint32_t val32;
250 
251   _Static_assert (__SIZEOF_FLOAT__ == 4, "");
252   _Static_assert (__FLT_MANT_DIG__ == 24, "");
253   __builtin_memcpy (&val32, &f, __SIZEOF_FLOAT__);
254 
255   uint16_t val16 = val32 >> 16;
256   val16 >>= FLT_DIG_MANT - 16;
257 
258   uint8_t expo_biased = val16 & FLT_MAX_EXP;
259   bool sign = val16 & (1u << FLT_DIG_EXP);
260 
261   f7_clr (cc);
262 
263   uint32_t mant = val32 & ((1ul << FLT_DIG_MANT) -1);
264 
265   if (mant == 0)
266     {
267       if (expo_biased == 0)
268 	return;
269       if (expo_biased >= FLT_MAX_EXP)
270 	return f7_set_inf (cc, sign);
271     }
272 
273   if (expo_biased == 0)
274     expo_biased = 1;   // Sub-normal: biased expo of 1 was encoded as 0.
275   else if (expo_biased < FLT_MAX_EXP)
276     mant |= (1ul << FLT_DIG_MANT);
277   else
278     return f7_set_nan (cc);
279 
280   __builtin_memcpy (& F7_MANT_HI4 (cc), &mant, 4);
281 
282   cc->expo = expo_biased - FLT_EXP_BIAS + 31 - FLT_DIG_MANT;
283   f7_normalize_asm (cc);
284   f7_set_sign (cc, sign);
285 }
ALIAS(f7_set_float,f7_extendsfdf2)286 ALIAS (f7_set_float, f7_extendsfdf2)
287 #endif // F7MOD_set_float_
288 
289 
290 #ifdef F7MOD_get_float_
291 static F7_INLINE
292 float make_float (uint32_t x)
293 {
294   float ff;
295   __builtin_memcpy (&ff, &x, 4);
296   return ff;
297 
298 }
299 
300 F7_WEAK
f7_get_float(const f7_t * aa)301 float f7_get_float (const f7_t *aa)
302 {
303   uint8_t a_class = f7_classify (aa);
304 
305   if (f7_class_nan (a_class))
306     return make_float (0xffc00000 /* NaN: Biased expo = 0xff, mant != 0 */);
307 
308   uint32_t mant;
309   __builtin_memcpy (&mant, &F7_MANT_CONST_HI4 (aa), 4);
310 
311   uint8_t expo8 = 0;
312   uint8_t mant_offset = FLT_DIG_EXP;
313   int16_t c_expo = add_ssat16 (aa->expo, FLT_EXP_BIAS);
314 
315   if (f7_class_zero (a_class) || c_expo <= -FLT_DIG_MANT)
316     {
317       // Zero or tiny.
318       return 0.0f;
319     }
320   else if (c_expo >= FLT_MAX_EXP || f7_class_inf (a_class))
321     {
322       // Inf or overflow.
323       expo8 = FLT_MAX_EXP;
324       mant = 0;
325     }
326   else if (c_expo > 0)
327     {
328       // Normal.
329       expo8 = c_expo;
330     }
331   else
332     {
333       // Sub-normal:  -DIG_MANT < c_expo <= 0.
334       // Encoding of 0 represents a biased exponent of 1.
335       // mant_offset in 9...31.
336       expo8 = 0;
337       mant_offset += 1 - c_expo;
338     }
339 
340   uint16_t expo16 = expo8 << (FLT_DIG_MANT - 16);
341 
342   if (f7_class_sign (a_class))
343     expo16 |= 1u << (FLT_DIG_EXP + FLT_DIG_MANT - 16);
344 
345   mant >>= mant_offset;
346 
347   __asm ("cbr %T0%t2, 1 << (7 & %2)"  "\n\t"
348 	 "or  %C0, %A1"		      "\n\t"
349 	 "or  %D0, %B1"
350 	 : "+d" (mant)
351 	 : "r" (expo16), "n" (FLT_DIG_MANT));
352 
353   return make_float (mant);
354 }
ALIAS(f7_get_float,f7_truncdfsf2)355 F7_PURE ALIAS (f7_get_float, f7_truncdfsf2)
356 #endif // F7MOD_get_float_
357 
358 #define DBL_DIG_EXP   11
359 #define DBL_DIG_MANT  (63 - DBL_DIG_EXP)
360 #define DBL_MAX_EXP   ((1 << DBL_DIG_EXP) - 1)
361 #define DBL_EXP_BIAS  (DBL_MAX_EXP >> 1)
362 
363 
364 #ifdef F7MOD_set_double_
365 void f7_set_double_impl (f7_double_t val64, f7_t *cc)
366 {
367   f7_clr (cc);
368   register uint64_t mant __asm ("r18") = val64 & ((1ull << DBL_DIG_MANT) -1);
369 
370   uint16_t val16 = 3[(uint16_t*) & val64];
371   val16 >>= DBL_DIG_MANT - 48;
372 
373   uint16_t expo_biased = val16 & DBL_MAX_EXP;
374   bool sign = val16 & (1u << DBL_DIG_EXP);
375 
376   if (mant == 0)
377     {
378       if (expo_biased == 0)
379 	return;
380       if (expo_biased >= DBL_MAX_EXP)
381 	return f7_set_inf (cc, sign);
382     }
383   __asm ("" : "+r" (mant));
384 
385   if (expo_biased == 0)
386     expo_biased = 1;   // Sub-normal: biased expo of 1 was encoded as 0.
387   else if (expo_biased < DBL_MAX_EXP)
388     mant |= (1ull << DBL_DIG_MANT);
389   else
390     return f7_set_nan (cc);
391 
392   *(uint64_t*) & cc->mant = mant;
393 
394   cc->expo = expo_biased - DBL_EXP_BIAS + 63 - DBL_DIG_MANT - 8;
395   f7_normalize_asm (cc);
396   f7_set_sign (cc, sign);
397 }
398 #endif // F7MOD_set_double_
399 
400 
401 #ifdef F7MOD_set_pdouble_
f7_set_pdouble(f7_t * cc,const f7_double_t * val64)402 void f7_set_pdouble (f7_t *cc, const f7_double_t *val64)
403 {
404   f7_set_double (cc, *val64);
405 }
406 #endif // F7MOD_set_pdouble_
407 
408 
409 #ifdef F7MOD_get_double_
410 static F7_INLINE
clr_r18(void)411 uint64_t clr_r18 (void)
412 {
413   extern void __clr_8 (void);
414   register uint64_t r18 __asm ("r18");
415   __asm ("%~call %x[f]" : "=r" (r18) : [f] "i" (__clr_8));
416   return r18;
417 }
418 
419 static F7_INLINE
make_double(uint64_t x)420 f7_double_t make_double (uint64_t x)
421 {
422   register f7_double_t r18 __asm ("r18") = x;
423   __asm ("" : "+r" (r18));
424   return r18;
425 }
426 
427 F7_WEAK
f7_get_double(const f7_t * aa)428 f7_double_t f7_get_double (const f7_t *aa)
429 {
430   uint8_t a_class = f7_classify (aa);
431 
432   if (f7_class_nan (a_class))
433     {
434       uint64_t nan = clr_r18() | (0x7fffull << 48);
435       return make_double (nan);
436     }
437 
438   uint64_t mant;
439   __builtin_memcpy (&mant, & aa->mant, 8);
440 
441   mant &= 0x00ffffffffffffff;
442 
443   // FIXME: For subnormals, rounding is premature and should be
444   //	    done *after* the mantissa has been shifted into place
445   //	    (or the round value be shifted left accordingly).
446   // Round.
447   mant += 1u << (F7_MANT_BITS - (1 + DBL_DIG_MANT) - 1);
448 
449   uint8_t dex;
450   register uint64_t r18 __asm ("r18") = mant;
451   // dex = Overflow ? 1 : 0.
452   __asm ("bst %T[mant]%T[bitno]"  "\n\t"
453 	 "clr %0"		  "\n\t"
454 	 "bld %0,0"
455 	 : "=r" (dex), [mant] "+r" (r18)
456 	 : [bitno] "n" (64 - 8));
457 
458   mant = r18 >> dex;
459 
460   uint16_t expo16 = 0;
461   uint16_t mant_offset = DBL_DIG_EXP - 8;
462   int16_t c_expo = add_ssat16 (aa->expo, DBL_EXP_BIAS + dex);
463 
464   if (f7_class_zero (a_class) || c_expo <= -DBL_DIG_MANT)
465     {
466       // Zero or tiny.
467       return make_double (clr_r18());
468     }
469   else if (c_expo >= DBL_MAX_EXP || f7_class_inf (a_class))
470     {
471       // Inf or overflow.
472       expo16 = DBL_MAX_EXP;
473       mant = clr_r18();
474     }
475   else if (c_expo > 0)
476     {
477       // Normal.
478       expo16 = c_expo;
479     }
480   else
481     {
482       // Sub-normal:  -DIG_MANT < c_expo <= 0.
483       // Encoding expo of 0 represents a biased exponent of 1.
484       // mant_offset in 5...55 = 63-8.
485       mant_offset += 1 - c_expo;
486     }
487 
488   expo16 <<= (DBL_DIG_MANT - 48);
489 
490   if (f7_class_sign (a_class))
491     expo16 |= 1u << (DBL_DIG_EXP + DBL_DIG_MANT - 48);
492 
493   // mant >>= mant_offset;
494   mant = f7_lshrdi3 (mant, mant_offset);
495 
496   r18 = mant;
497   __asm ("cbr %T0%t2, 1 << (7 & %2)"  "\n\t"
498 	 "or  %r0+6, %A1"	      "\n\t"
499 	 "or  %r0+7, %B1"
500 	 : "+r" (r18)
501 	 : "r" (expo16), "n" (DBL_DIG_MANT));
502 
503   return make_double (r18);
504 }
505 #endif // F7MOD_get_double_
506 
507 
508 #ifdef F7MOD_fabs_
509 F7_WEAK
f7_fabs(f7_t * cc,const f7_t * aa)510 void f7_fabs (f7_t *cc, const f7_t *aa)
511 {
512   f7_abs (cc, aa);
513 }
514 #endif // F7MOD_fabs_
515 
516 
517 #ifdef F7MOD_neg_
518 F7_WEAK
f7_neg(f7_t * cc,const f7_t * aa)519 f7_t* f7_neg (f7_t *cc, const f7_t *aa)
520 {
521   f7_copy (cc, aa);
522 
523   uint8_t c_class = f7_classify (cc);
524 
525   if (! f7_class_zero (c_class))
526     cc->sign = ! f7_class_sign (c_class);
527 
528   return cc;
529 }
530 #endif // F7MOD_neg_
531 
532 
533 #ifdef F7MOD_frexp_
534 F7_WEAK
f7_frexp(f7_t * cc,const f7_t * aa,int * expo)535 void f7_frexp (f7_t *cc, const f7_t *aa, int *expo)
536 {
537   uint8_t a_class = f7_classify (aa);
538 
539   if (f7_class_nan (a_class))
540     return f7_set_nan (cc);
541 
542   if (f7_class_inf (a_class) || aa->expo == INT16_MAX)
543     return f7_set_inf (cc, f7_class_sign (a_class));
544 
545   if (! f7_msbit (aa))
546     {
547       *expo = 0;
548       return f7_clr (cc);
549     }
550 
551   *expo = 1 + aa->expo;
552   cc->flags = a_class & F7_FLAG_sign;
553   cc->expo = -1;
554   f7_copy_mant (cc, aa);
555 }
556 #endif // F7MOD_frexp_
557 
558 #ifdef F7MOD_get_s16_
559 F7_WEAK
f7_get_s16(const f7_t * aa)560 int16_t f7_get_s16 (const f7_t *aa)
561 {
562   extern int16_t to_s16 (const f7_t*, uint8_t) F7ASM(f7_to_integer_asm);
563   return to_s16 (aa, 0xf);
564 }
565 #endif // F7MOD_get_s16_
566 
567 
568 #ifdef F7MOD_get_s32_
569 F7_WEAK
f7_get_s32(const f7_t * aa)570 int32_t f7_get_s32 (const f7_t *aa)
571 {
572   extern int32_t to_s32 (const f7_t*, uint8_t) F7ASM(f7_to_integer_asm);
573   return to_s32 (aa, 0x1f);
574 }
ALIAS(f7_get_s32,f7_fixdfsi)575 F7_PURE ALIAS (f7_get_s32, f7_fixdfsi)
576 #endif // F7MOD_get_s32_
577 
578 
579 #ifdef F7MOD_get_s64_
580   F7_WEAK
581   int64_t f7_get_s64 (const f7_t *aa)
582 {
583   extern int64_t to_s64 (const f7_t*, uint8_t) F7ASM(f7_to_integer_asm);
584   return to_s64 (aa, 0x3f);
585 }
ALIAS(f7_get_s64,f7_fixdfdi)586 F7_PURE ALIAS (f7_get_s64, f7_fixdfdi)
587 #endif // F7MOD_get_s64_
588 
589 #ifdef F7MOD_get_u16_
590   F7_WEAK
591   uint16_t f7_get_u16 (const f7_t *aa)
592 {
593   extern uint16_t to_u16 (const f7_t*, uint8_t) F7ASM(f7_to_unsigned_asm);
594   return to_u16 (aa, 0xf);
595 }
596 #endif // F7MOD_get_u16_
597 
598 
599 #ifdef F7MOD_get_u32_
600 F7_WEAK
f7_get_u32(const f7_t * aa)601 uint32_t f7_get_u32 (const f7_t *aa)
602 {
603   extern uint32_t to_u32 (const f7_t*, uint8_t) F7ASM(f7_to_unsigned_asm);
604   return to_u32 (aa, 0x1f);
605 }
ALIAS(f7_get_u32,f7_fixunsdfsi)606 F7_PURE ALIAS (f7_get_u32, f7_fixunsdfsi)
607 #endif // F7MOD_get_u32_
608 
609 
610 #ifdef F7MOD_get_u64_
611 F7_WEAK
612 uint64_t f7_get_u64 (const f7_t *aa)
613 {
614   extern int64_t to_u64 (const f7_t*, uint8_t) F7ASM(f7_to_unsigned_asm);
615   return to_u64 (aa, 0x3f);
616 }
617 F7_PURE ALIAS (f7_get_u64, f7_fixunsdfdi)
618 #endif // F7MOD_get_u64_
619 
620 
621 #ifdef F7MOD_cmp_unordered_
622 F7_NOINLINE
623 static int8_t cmp_u8 (uint8_t a_class, uint8_t b_class, bool sign_a);
624 
625 F7_WEAK
f7_cmp_unordered(const f7_t * aa,const f7_t * bb,bool with_sign)626 int8_t f7_cmp_unordered (const f7_t *aa, const f7_t *bb, bool with_sign)
627 {
628   uint8_t a_class = f7_classify (aa);
629   uint8_t b_class = f7_classify (bb);
630 
631   uint8_t a_sign = f7_class_sign (a_class) & with_sign;
632   uint8_t b_sign = f7_class_sign (b_class) & with_sign;
633   uint8_t ab_class = a_class | b_class;
634   ab_class &= with_sign - 2;
635 
636   if (f7_class_nan (ab_class))
637     return INT8_MIN;
638 
639   if (a_sign != b_sign)
640     return b_sign - a_sign;
641 
642   if (f7_class_inf (ab_class))
643     return cmp_u8 (a_class, b_class, a_sign);
644 
645   if (f7_class_zero (ab_class))
646     return cmp_u8 (b_class, a_class, a_sign);
647 
648   if (aa->expo < bb->expo)
649     return a_sign ? 1 : -1;
650 
651   if (aa->expo > bb->expo)
652     return a_sign ? -1 : 1;
653 
654   return cmp_u8 (1 + f7_cmp_mant (aa, bb), 1, a_sign);
655 }
656 
657 
cmp_u8(uint8_t a_class,uint8_t b_class,bool sign_a)658 int8_t cmp_u8 (uint8_t a_class, uint8_t b_class, bool sign_a)
659 {
660   int8_t c;
661   __asm ("sub  %[a], %[b]"    "\n\t"
662 	 "breq 1f"	      "\n\t"
663 	 "sbc  %[c], %[c]"    "\n\t"
664 	 "sbci %[c], -1"      "\n\t"
665 	 "sbrc %[s], 0"	      "\n\t"
666 	 "neg  %[c]"	      "\n\t"
667 	 "1:"
668 	 : [c] "=d" (c)
669 	 : [a] "0" (a_class), [b] "r" (b_class), [s] "r" (sign_a));
670   return c;
671 }
672 #endif // F7MOD_cmp_unordered_
673 
674 
675 #ifdef F7MOD_cmp_abs_
676 F7_WEAK
f7_cmp_abs(const f7_t * aa,const f7_t * bb)677 int8_t f7_cmp_abs (const f7_t *aa, const f7_t *bb)
678 {
679   return f7_cmp_unordered (aa, bb, false /* no signs */);
680 }
681 #endif // F7MOD_cmp_abs_
682 
683 
684 #ifdef F7MOD_cmp_
685 F7_WEAK
f7_cmp(const f7_t * aa,const f7_t * bb)686 int8_t f7_cmp (const f7_t *aa, const f7_t *bb)
687 {
688   return f7_cmp_unordered (aa, bb, true /* with signs */);
689 }
690 #endif // F7MOD_cmp_
691 
692 
693 #ifdef F7MOD_abscmp_msb_ge_
694 // Compare absolute value of Number aa against a f7_t represented
695 // by msb and expo.
696 F7_WEAK
f7_abscmp_msb_ge(const f7_t * aa,uint8_t msb,int16_t expo)697 bool f7_abscmp_msb_ge (const f7_t *aa, uint8_t msb, int16_t expo)
698 {
699   uint8_t a_msb = aa->mant[F7_MANT_BYTES - 1];
700 
701   if (0 == (0x80 & a_msb))
702     // 0 or subnormal.
703     return false;
704 
705   return aa->expo == expo
706     ? a_msb >= msb
707     : aa->expo > expo;
708 }
709 #endif // F7MOD_abscmp_msb_ge_
710 
711 #ifdef F7MOD_lt_
712 F7_WEAK
f7_lt_impl(const f7_t * aa,const f7_t * bb)713 bool f7_lt_impl (const f7_t *aa, const f7_t *bb)
714 {
715   return f7_lt (aa, bb);
716 }
717 #endif // F7MOD_lt_
718 
719 #ifdef F7MOD_le_
720 F7_WEAK
f7_le_impl(const f7_t * aa,const f7_t * bb)721 bool f7_le_impl (const f7_t *aa, const f7_t *bb)
722 {
723   return f7_le (aa, bb);
724 }
725 #endif // F7MOD_le_
726 
727 #ifdef F7MOD_gt_
728 F7_WEAK
f7_gt_impl(const f7_t * aa,const f7_t * bb)729 bool f7_gt_impl (const f7_t *aa, const f7_t *bb)
730 {
731   return f7_gt (aa, bb);
732 }
733 #endif // F7MOD_gt_
734 
735 #ifdef F7MOD_ge_
736 F7_WEAK
f7_ge_impl(const f7_t * aa,const f7_t * bb)737 bool f7_ge_impl (const f7_t *aa, const f7_t *bb)
738 {
739   return f7_ge (aa, bb);
740 }
741 #endif // F7MOD_ge_
742 
743 #ifdef F7MOD_ne_
744 F7_WEAK
f7_ne_impl(const f7_t * aa,const f7_t * bb)745 bool f7_ne_impl (const f7_t *aa, const f7_t *bb)
746 {
747   return f7_ne (aa, bb);
748 }
749 #endif // F7MOD_ne_
750 
751 #ifdef F7MOD_eq_
752 F7_WEAK
f7_eq_impl(const f7_t * aa,const f7_t * bb)753 bool f7_eq_impl (const f7_t *aa, const f7_t *bb)
754 {
755   return f7_eq (aa, bb);
756 }
757 #endif // F7MOD_eq_
758 
759 
760 #ifdef F7MOD_unord_
761 F7_WEAK
f7_unord_impl(const f7_t * aa,const f7_t * bb)762 bool f7_unord_impl (const f7_t *aa, const f7_t *bb)
763 {
764   return f7_unordered (aa, bb);
765 }
766 #endif // F7MOD_unord_
767 
768 
769 #ifdef F7MOD_minmax_
770 F7_WEAK
f7_minmax(f7_t * cc,const f7_t * aa,const f7_t * bb,bool do_min)771 f7_t* f7_minmax (f7_t *cc, const f7_t *aa, const f7_t *bb, bool do_min)
772 {
773   int8_t cmp = f7_cmp_unordered (aa, bb, true /* with signs */);
774 
775   if (cmp == INT8_MIN)
776     return (f7_set_nan (cc), cc);
777 
778   if (do_min)
779     cmp = -cmp;
780 
781   return f7_copy (cc, cmp >= 0 ? aa : bb);
782 }
783 #endif // F7MOD_minmax_
784 
785 
786 #ifdef F7MOD_fmax_
787 F7_WEAK
f7_fmax(f7_t * cc,const f7_t * aa,const f7_t * bb)788 f7_t* f7_fmax (f7_t *cc, const f7_t *aa, const f7_t *bb)
789 {
790   return f7_minmax (cc, aa, bb, false);
791 }
ALIAS(f7_fmax,f7_max)792 ALIAS (f7_fmax, f7_max)
793 #endif // F7MOD_fmax_
794 
795 
796 #ifdef F7MOD_fmin_
797 F7_WEAK
798 f7_t* f7_fmin (f7_t *cc, const f7_t *aa, const f7_t *bb)
799 {
800   return f7_minmax (cc, aa, bb, true);
801 }
ALIAS(f7_fmin,f7_min)802 ALIAS (f7_fmin, f7_min)
803 #endif // F7MOD_fmin_
804 
805 
806 #ifdef F7MOD_mulx_
807 F7_WEAK
808 uint8_t f7_mulx (f7_t *cc, const f7_t *aa, const f7_t *bb, bool no_rounding)
809 {
810   uint8_t a_class = f7_classify (aa);
811   uint8_t b_class = f7_classify (bb);
812   // From this point on, no more access aa->flags or bb->flags
813   // to avoid early-clobber when writing cc->flags.
814 
815   uint8_t ab_class = a_class | b_class;
816   // If either value is NaN, return NaN.
817   if (f7_class_nan (ab_class)
818       // Any combination of Inf and 0.
819       || (f7_class_zero (ab_class) && f7_class_inf (ab_class)))
820     {
821       cc->flags = F7_FLAG_nan;
822       return 0;
823     }
824   // If either value is 0.0, return 0.0.
825   if (f7_class_zero (ab_class))
826     {
827       f7_clr (cc);
828       return 0;
829     }
830   // We have 2 non-zero numbers-or-INF.
831 
832   uint8_t c_sign = (a_class ^ b_class) & F7_FLAG_sign;
833   uint8_t c_inf  = ab_class & F7_FLAG_inf;
834   cc->flags = c_sign | c_inf;
835   if (c_inf)
836     return 0;
837 
838   int16_t expo = add_ssat16 (aa->expo, bb->expo);
839   // Store expo and handle expo = INT16_MIN  and INT16_MAX.
840   if (f7_store_expo (cc, expo))
841     return 0;
842 
843   return f7_mul_mant_asm (cc, aa, bb, no_rounding);
844 }
845 #endif // F7MOD_mulx_
846 
847 
848 #ifdef F7MOD_square_
849 F7_WEAK
f7_square(f7_t * cc,const f7_t * aa)850 void f7_square (f7_t *cc, const f7_t *aa)
851 {
852   f7_mul (cc, aa, aa);
853 }
854 #endif // F7MOD_square_
855 
856 
857 #ifdef F7MOD_mul_
858 F7_WEAK
f7_mul(f7_t * cc,const f7_t * aa,const f7_t * bb)859 void f7_mul (f7_t *cc, const f7_t *aa, const f7_t *bb)
860 {
861   f7_mulx (cc, aa, bb, false);
862 }
863 #endif // F7MOD_mul_
864 
865 
866 #ifdef F7MOD_Iadd_
f7_Iadd(f7_t * cc,const f7_t * aa)867 F7_WEAK void f7_Iadd (f7_t *cc, const f7_t *aa) { f7_add (cc, cc, aa); }
868 #endif
869 
870 #ifdef F7MOD_Isub_
f7_Isub(f7_t * cc,const f7_t * aa)871 F7_WEAK void f7_Isub (f7_t *cc, const f7_t *aa) { f7_sub (cc, cc, aa); }
872 #endif
873 
874 #ifdef F7MOD_Imul_
f7_Imul(f7_t * cc,const f7_t * aa)875 F7_WEAK void f7_Imul (f7_t *cc, const f7_t *aa) { f7_mul (cc, cc, aa); }
876 #endif
877 
878 #ifdef F7MOD_Idiv_
f7_Idiv(f7_t * cc,const f7_t * aa)879 F7_WEAK void f7_Idiv (f7_t *cc, const f7_t *aa) { f7_div (cc, cc, aa); }
880 #endif
881 
882 #ifdef F7MOD_IRsub_
f7_IRsub(f7_t * cc,const f7_t * aa)883 F7_WEAK void f7_IRsub (f7_t *cc, const f7_t *aa) { f7_sub (cc, aa, cc); }
884 #endif
885 
886 #ifdef F7MOD_Ineg_
f7_Ineg(f7_t * cc)887 F7_WEAK void f7_Ineg (f7_t *cc) { f7_neg (cc, cc); }
888 #endif
889 
890 #ifdef F7MOD_Isqrt_
f7_Isqrt(f7_t * cc)891 F7_WEAK void f7_Isqrt (f7_t *cc) { f7_sqrt (cc, cc); }
892 #endif
893 
894 #ifdef F7MOD_Isquare_
f7_Isquare(f7_t * cc)895 F7_WEAK void f7_Isquare (f7_t *cc) { f7_square (cc, cc); }
896 #endif
897 
898 #ifdef F7MOD_Ildexp_
f7_Ildexp(f7_t * cc,int ex)899 F7_WEAK f7_t* f7_Ildexp (f7_t *cc, int ex) { return f7_ldexp (cc, cc, ex); }
900 #endif
901 
902 
903 #ifdef F7MOD_add_
904 F7_WEAK
f7_add(f7_t * cc,const f7_t * aa,const f7_t * bb)905 void f7_add (f7_t *cc, const f7_t *aa, const f7_t *bb)
906 {
907   f7_addsub (cc, aa, bb, false);
908 }
909 #endif // F7MOD_add_
910 
911 
912 #ifdef F7MOD_sub_
913 F7_WEAK
f7_sub(f7_t * cc,const f7_t * aa,const f7_t * bb)914 void f7_sub (f7_t *cc, const f7_t *aa, const f7_t *bb)
915 {
916   f7_addsub (cc, aa, bb, true);
917 }
918 #endif // F7MOD_sub_
919 
920 
921 #ifdef F7MOD_addsub_
return_with_sign(f7_t * cc,const f7_t * aa,int8_t c_sign)922 static void return_with_sign (f7_t *cc, const f7_t *aa, int8_t c_sign)
923 {
924   __asm (";;; return with sign");
925   f7_copy (cc, aa);
926   if (c_sign != -1)
927     f7_set_sign (cc, c_sign);
928 }
929 
930 F7_WEAK
f7_addsub(f7_t * cc,const f7_t * aa,const f7_t * bb,bool neg_b)931 void f7_addsub (f7_t *cc, const f7_t *aa, const f7_t *bb, bool neg_b)
932 {
933   uint8_t a_class = f7_classify (aa);
934   uint8_t b_class = f7_classify (bb);
935   // From this point on, no more access aa->flags or bb->flags
936   // to avoid early-clobber when writing cc->flags.
937 
938   // Hande NaNs.
939   if (f7_class_nan (a_class | b_class))
940     return f7_set_nan (cc);
941 
942   bool a_sign = f7_class_sign (a_class);
943   bool b_sign = f7_class_sign (b_class) ^ neg_b;
944 
945   // Add the mantissae?
946   bool do_add = a_sign == b_sign;
947 
948   // Handle +Infs and -Infs.
949   bool a_inf = f7_class_inf (a_class);
950   bool b_inf = f7_class_inf (b_class);
951 
952   if (a_inf && b_inf)
953     {
954       if (do_add)
955 	return f7_set_inf (cc, a_sign);
956       else
957 	return f7_set_nan (cc);
958     }
959   else if (a_inf)
960     return f7_set_inf (cc, a_sign);
961   else if (b_inf)
962     return f7_set_inf (cc, b_sign);
963 
964   int16_t shift16 = sub_ssat16 (aa->expo, bb->expo);
965 
966   // aa + 0 = aa.
967   // Also check MSBit to get rid of Subnormals and 0.
968   if (shift16 > F7_MANT_BITS || f7_is0 (bb))
969     return return_with_sign (cc, aa, -1);
970 
971   // 0 + bb = bb.
972   // 0 - bb = -bb.
973   // Also check MSBit to get rid of Subnormals and 0.
974   if (shift16 < -F7_MANT_BITS || f7_is0 (aa))
975     return return_with_sign (cc, bb, b_sign);
976 
977   // Now aa and bb are non-zero, non-NaN, non-Inf.
978   // shift > 0 ==> |a| > |b|
979   // shift < 0 ==> |a| < |b|
980   int8_t shift = (int8_t) shift16;
981   bool c_sign = a_sign;
982   if (shift < 0
983       || (shift == 0 && f7_cmp_mant (aa, bb) < 0))
984     {
985       const f7_t *p = aa; aa = bb; bb = p;
986       c_sign = b_sign;
987       shift = -shift;
988     }
989   uint8_t shift2 = (uint8_t) (shift << 1);
990 
991   cc->expo = aa->expo;
992   // From this point on, no more access aa->expo or bb->expo
993   // to avoid early-clobber when writing cc->expo.
994 
995   cc->flags = c_sign;  _Static_assert (F7_FLAGNO_sign == 0, "");
996 
997   // This function uses neither .expo nor .flags from either aa or bb,
998   // hence there is early-clobber for cc->expo and cc->flags.
999   f7_addsub_mant_scaled_asm (cc, aa, bb, shift2 | do_add);
1000 }
1001 #endif // F7MOD_addsub_
1002 
1003 
1004 #ifdef F7MOD_madd_msub_
1005 F7_WEAK
f7_madd_msub(f7_t * cc,const f7_t * aa,const f7_t * bb,const f7_t * dd,bool neg_d)1006 void f7_madd_msub (f7_t *cc, const f7_t *aa, const f7_t *bb, const f7_t *dd,
1007                    bool neg_d)
1008 {
1009   f7_t xx7, *xx = &xx7;
1010   uint8_t x_lsb = f7_mulx (xx, aa, bb, true /* no rounding */);
1011   uint8_t x_sign = f7_signbit (xx);
1012   int16_t x_expo = xx->expo;
1013   f7_addsub (xx, xx, dd, neg_d);
1014   // Now add LSB.  If cancellation occured in the add / sub, then we have the
1015   // chance of extra 8 bits of precision.  Turn LSByte into f7_t.
1016   f7_clr (cc);
1017   cc->expo = sub_ssat16 (x_expo, F7_MANT_BITS);
1018   cc->mant[F7_MANT_BYTES - 1] = x_lsb;
1019   cc = f7_normalize_asm (cc);
1020   cc->flags = x_sign;
1021   f7_Iadd (cc, xx);
1022 }
1023 #endif // F7MOD_madd_msub_
1024 
1025 #ifdef F7MOD_madd_
1026 F7_WEAK
f7_madd(f7_t * cc,const f7_t * aa,const f7_t * bb,const f7_t * dd)1027 void f7_madd (f7_t *cc, const f7_t *aa, const f7_t *bb, const f7_t *dd)
1028 {
1029   f7_madd_msub (cc, aa, bb, dd, false);
1030 }
1031 #endif // F7MOD_madd_
1032 
1033 #ifdef F7MOD_msub_
1034 F7_WEAK
f7_msub(f7_t * cc,const f7_t * aa,const f7_t * bb,const f7_t * dd)1035 void f7_msub (f7_t *cc, const f7_t *aa, const f7_t *bb, const f7_t *dd)
1036 {
1037   f7_madd_msub (cc, aa, bb, dd, true);
1038 }
1039 #endif // F7MOD_msub_
1040 
1041 
1042 #ifdef F7MOD_ldexp_
1043 F7_WEAK
f7_ldexp(f7_t * cc,const f7_t * aa,int delta)1044 f7_t* f7_ldexp (f7_t *cc, const f7_t *aa, int delta)
1045 {
1046   uint8_t a_class = f7_classify (aa);
1047 
1048   cc->flags = a_class;
1049 
1050   // Inf and NaN.
1051   if (! f7_class_number (a_class))
1052     return cc;
1053 
1054   if (f7_msbit (aa) == 0)
1055     return (f7_clr (cc), cc);
1056 
1057   int16_t expo = add_ssat16 (delta, aa->expo);
1058   // Store expo and handle expo = INT16_MIN  and INT16_MAX.
1059   if (! f7_store_expo (cc, expo))
1060     f7_copy_mant (cc, aa);
1061 
1062   return cc;
1063 }
1064 #endif // F7MOD_ldexp_
1065 
1066 
1067 #if USE_LPM
1068 #elif USE_LD
1069 #else
1070 #error need include "asm-defs.h"
1071 #endif // USE_LPM
1072 
1073 /*
1074   Handling constants:
1075 
1076   F7_PCONST (PVAR, X)
1077 
1078       Set  f7_t [const] *PVAR  to an LD address for one
1079       of the  f7_const_X[_P]  constants.
1080       PVAR might be set to point to a local auto that serves
1081       as temporary storage for f7_const_X_P.  PVAR is only
1082       valid in the current block.
1083 
1084   const f7_t* F7_PCONST_U16 (PVAR, <ident> X)       // USE_LD
1085   f7_t*       F7_PCONST_U16 (PVAR, uint16_t X)      // USE_LPM
1086 
1087       Set  f7_t [const] *PVAR  to an LD address for one of the
1088       f7_const_X[_P]  constants.  PVAR might be set to point to a
1089       local auto that serves as temporary storage for X.  PVAR is
1090       only valid in the current block.
1091 
1092   F7_PCONST_VAR (PVAR, VAR)
1093 
1094       VAR is a pointer variable holding the address of some f7_const_X[_P]
1095       constant.  Set  [const] f7_t *PVAR  to a respective LD address.
1096       PVAR might be set to point to a local auto that serves
1097       as temporary storage for f7_const_X_P.  PVAR is only
1098       valid in the current block.
1099 
1100   F7_CONST_ADDR (<ident> CST, f7_t* PTMP)
1101 
1102       Return an LD address to for some f7_const_X[_P] constant.
1103       *PTMP might be needed to hold a copy of f7_const_X_P in RAM.
1104 
1105   f7_t*       F7_U16_ADDR (uint16_t     X, f7_t* PTMP)   // USE_LPM
1106   const f7_t* F7_U16_ADDR (<cst-ident>  X, <unused>)     // USE_LD
1107 
1108       Return an LD address to compile-time constant  uint16_t X  which is
1109       also known as  f7_const_X[_P].  *PTMP might be set to  (f7_t) X.
1110 
1111   f7_t* f7_const (f7_t *PVAR, <cst-ident> X)
1112 
1113       Copy  f7_const_X[_P]  to *PVAR.
1114 
1115   f7_t* f7_copy_flash (f7_t *DST, const f7_t *SRC)
1116 
1117       Copy to *DST with LD (from .rodata in flash) if the address
1118       space is linear, or with  LPM (from .progmem.data) if the
1119       address space is not linear.
1120 
1121   f7_t* f7_copy (f7_t *DST, const f7_t* SRC)
1122 
1123       Copy to RAM using LD.
1124 
1125   f7_t* f7_copy_P (f7_t *DST, const f7_t *SRC)
1126 
1127       Copy to RAM using LPM.
1128 */
1129 
1130 #if USE_LPM
1131   #define F7_RAW_CONST_ADDR(CST) \
1132       & F7_(const_##CST##_P)
1133 
1134   #define F7_PCONST(PVAR, CST)				    \
1135       f7_t _var_for_##CST;				    \
1136       f7_copy_P (& _var_for_##CST, & F7_(const_##CST##_P)); \
1137       PVAR = & _var_for_##CST
1138 
1139   #define F7_PCONST_U16(PVAR, CST)		\
1140       f7_t _var_for_##CST;			\
1141       PVAR = f7_set_u16 (& _var_for_##CST, CST)
1142 
1143   #define F7_PCONST_VAR(PVAR, VAR)		\
1144       f7_t _var_for_##VAR;			\
1145       f7_copy_P (& _var_for_##VAR, VAR);	\
1146       PVAR = & _var_for_##VAR
1147 
1148   #define MAYBE_const // empty
1149 
1150   #define F7_CONST_ADDR(CST, PTMP) \
1151       f7_copy_P ((PTMP), & F7_(const_##CST##_P))
1152 
1153   #define F7_U16_ADDR(CST, PTMP)   \
1154       f7_set_u16 ((PTMP), CST)
1155 
1156 #elif USE_LD
1157   #define F7_RAW_CONST_ADDR(CST)   \
1158       & F7_(const_##CST)
1159 
1160   #define F7_PCONST(PVAR, CST)	   \
1161       PVAR = & F7_(const_##CST)
1162 
1163   #define F7_PCONST_U16(PVAR, CST) \
1164       PVAR = & F7_(const_##CST)
1165 
1166   #define F7_PCONST_VAR(PVAR, VAR) \
1167       PVAR = (VAR)
1168 
1169   #define F7_CONST_ADDR(CST, PTMP) \
1170       (& F7_(const_##CST))
1171 
1172   #define F7_U16_ADDR(CST, PTMP)   \
1173       (& F7_(const_##CST))
1174 
1175   #define MAYBE_const const
1176 #endif
1177 
1178 
1179 
1180 #define DD(str, X)		\
1181   do {				\
1182     LOG_PSTR (PSTR (str));	\
1183     f7_dump (X);		\
1184   } while (0)
1185 
1186 #undef DD
1187 #define DD(...) (void) 0
1188 
1189 
1190 #ifdef F7MOD_sqrt_
sqrt_worker(f7_t * cc,const f7_t * rr)1191 static void sqrt_worker (f7_t *cc, const f7_t *rr)
1192 {
1193   f7_t tmp7, *tmp = &tmp7;
1194   f7_t aa7, *aa = &aa7;
1195 
1196   // aa in  [1/2, 2)  =>  aa->expo in { -1, 0 }.
1197   int16_t a_expo = -(rr->expo & 1);
1198   int16_t c_expo = (rr->expo - a_expo) >> 1;  // FIXME: r_expo = INT_MAX???
1199 
1200   __asm ("" : "+r" (aa));
1201 
1202   f7_copy (aa, rr);
1203   aa->expo = a_expo;
1204 
1205   // No use of rr or *cc past this point:  We may use cc as temporary.
1206   // Approximate square-root of  A  by  X <-- (X + A / X) / 2.
1207 
1208   f7_sqrt_approx_asm (cc, aa);
1209 
1210   // Iterate  X <-- (X + A / X) / 2.
1211   // 3 Iterations with 16, 32, 58 bits of precision for the quotient.
1212 
1213   for (uint8_t prec = 16; (prec & 0x80) == 0; prec <<= 1)
1214     {
1215       f7_divx (tmp, aa, cc, (prec & 64) ? 2 + F7_MANT_BITS : prec);
1216       f7_Iadd (cc, tmp);
1217       // This will never underflow because |c_expo| is small.
1218       cc->expo--;
1219     }
1220 
1221   // Similar: |c_expo| is small, hence no ldexp needed.
1222   cc->expo += c_expo;
1223 }
1224 
1225 F7_WEAK
f7_sqrt(f7_t * cc,const f7_t * aa)1226 void f7_sqrt (f7_t *cc, const f7_t *aa)
1227 {
1228   uint8_t a_class = f7_classify (aa);
1229 
1230   if (f7_class_nan (a_class) || f7_class_sign (a_class))
1231     return f7_set_nan (cc);
1232 
1233   if (f7_class_inf (a_class))
1234     return f7_set_inf (cc, 0);
1235 
1236   if (f7_class_zero (a_class))
1237     return f7_clr (cc);
1238 
1239   sqrt_worker (cc, aa);
1240 }
1241 #endif // F7MOD_sqrt_
1242 
1243 
1244 #ifdef F7MOD_hypot_
1245 F7_WEAK
f7_hypot(f7_t * cc,const f7_t * aa,const f7_t * bb)1246 void f7_hypot (f7_t *cc, const f7_t *aa, const f7_t *bb)
1247 {
1248   f7_t xx7, *xx = &xx7;
1249 
1250   f7_square (xx, aa);
1251   f7_square (cc, bb);
1252   f7_Iadd (cc, xx);
1253   f7_Isqrt (cc);
1254 }
1255 #endif // F7MOD_hypot_
1256 
1257 
1258 #ifdef F7MOD_const_m1_
1259 #include "libf7-constdef.h"
1260 #endif // -1
1261 
1262 #ifdef F7MOD_const_1_2_
1263 #include "libf7-constdef.h"
1264 #endif // 1/2
1265 
1266 #ifdef F7MOD_const_1_3_
1267 #include "libf7-constdef.h"
1268 #endif // 1/3
1269 
1270 #ifdef F7MOD_const_ln2_
1271 #include "libf7-constdef.h"
1272 #endif // ln2
1273 
1274 #ifdef F7MOD_const_1_ln2_
1275 #include "libf7-constdef.h"
1276 #endif // 1_ln2
1277 
1278 #ifdef F7MOD_const_ln10_
1279 #include "libf7-constdef.h"
1280 #endif // ln10
1281 
1282 #ifdef F7MOD_const_1_ln10_
1283 #include "libf7-constdef.h"
1284 #endif // 1_ln10
1285 
1286 #ifdef F7MOD_const_1_
1287 #include "libf7-constdef.h"
1288 #endif // 1
1289 
1290 #ifdef F7MOD_const_sqrt2_
1291 #include "libf7-constdef.h"
1292 #endif // sqrt2
1293 
1294 #ifdef F7MOD_const_2_
1295 #include "libf7-constdef.h"
1296 #endif // 2
1297 
1298 #ifdef F7MOD_const_pi_
1299 #include "libf7-constdef.h"
1300 #endif // pi
1301 
1302 
1303 #ifdef F7MOD_divx_
1304 
1305 // C /= A
1306 extern void f7_div_asm (f7_t*, const f7_t*, uint8_t);
1307 
1308 F7_WEAK
f7_divx(f7_t * cc,const f7_t * aa,const f7_t * bb,uint8_t quot_bits)1309 void f7_divx (f7_t *cc, const f7_t *aa, const f7_t *bb, uint8_t quot_bits)
1310 {
1311   uint8_t a_class = f7_classify (aa);
1312   uint8_t b_class = f7_classify (bb);
1313   // From this point on, no more access aa->flags or bb->flags
1314   // to avoid early-clobber when writing cc->flags.
1315 
1316   // If either value is NaN, return NaN.
1317   if (f7_class_nan (a_class | b_class)
1318       // If both values are Inf or both are 0, return NaN.
1319       || f7_class_zero (a_class & b_class)
1320       || f7_class_inf (a_class & b_class)
1321       // Inf / 0 = NaN.
1322       || (f7_class_inf (a_class) && f7_class_zero (b_class)))
1323     {
1324       return f7_set_nan (cc);
1325     }
1326 
1327   // 0 / B   = 0  for non-zero, non-NaN B.
1328   // A / Inf = 0  for non-zero numbers A.
1329   if (f7_class_zero (a_class) || f7_class_inf (b_class))
1330     return f7_clr (cc);
1331 
1332   uint8_t c_sign = (a_class ^ b_class) & F7_FLAG_sign;
1333 
1334   if (f7_class_inf (a_class) || f7_class_zero (b_class))
1335     return f7_set_inf (cc, c_sign);
1336 
1337   cc->flags = c_sign;     _Static_assert (F7_FLAGNO_sign == 0, "");
1338   int16_t expo = sub_ssat16 (aa->expo, bb->expo);
1339 
1340   // Store expo and handle expo = INT16_MIN  and INT16_MAX.
1341   if (f7_store_expo (cc, expo))
1342     return;
1343 
1344   f7_t ss7, *ss = &ss7;
1345   ss->flags = cc->flags;
1346   ss->expo  = cc->expo;
1347 
1348   f7_copy_mant (ss, aa);
1349   f7_div_asm (ss, bb, quot_bits);
1350   f7_copy (cc, ss);
1351 }
1352 #endif // F7MOD_divx_
1353 
1354 
1355 #ifdef F7MOD_div_
1356 F7_WEAK
f7_div(f7_t * cc,const f7_t * aa,const f7_t * bb)1357 void f7_div (f7_t *cc, const f7_t *aa, const f7_t *bb)
1358 {
1359   /* When f7_divx calls f7_div_asm, dividend and divisor are valid
1360      mantissae, i.e. their MSBit is set.  Therefore, the quotient will
1361      be in  [0x0.ff..., 0x0.40...]  and to adjust it, at most 1 left-shift
1362      is needed.  Compute F7_MANT_BITS + 2 bits of the quotient:
1363      One bit is used for rounding, and one bit might be consumed by the
1364      mentioned left-shift.  */
1365 
1366   f7_divx (cc, aa, bb, 2 + F7_MANT_BITS);
1367 }
1368 #endif // F7MOD_div_
1369 
1370 
1371 #ifdef F7MOD_div1_
1372 F7_WEAK
f7_div1(f7_t * cc,const f7_t * aa)1373 void f7_div1 (f7_t *cc, const f7_t *aa)
1374 {
1375   F7_PCONST_U16 (const f7_t *one, 1);
1376   f7_div (cc, one, aa);
1377 }
1378 #endif // F7MOD_div_
1379 
1380 
1381 #ifdef F7MOD_fmod_
1382 F7_WEAK
f7_fmod(f7_t * cc,const f7_t * aa,const f7_t * bb)1383 void f7_fmod (f7_t *cc, const f7_t *aa, const f7_t *bb)
1384 {
1385   uint8_t a_class = f7_classify (aa);
1386   uint8_t b_class = f7_classify (bb);
1387 
1388   if (! f7_class_number (a_class)
1389       || f7_class_nan (b_class)
1390       || f7_class_zero (b_class))
1391     {
1392       return f7_set_nan (cc);
1393     }
1394 
1395   // A == 0 and B != 0  =>  0.
1396   if (f7_class_zero (a_class))
1397     return f7_clr (cc);
1398 
1399   f7_t zz7, *zz = & zz7;
1400 
1401   f7_div (zz, aa, bb);
1402 
1403   // Z in Z,  |Z| <= |A/B|.
1404   f7_trunc (zz, zz);
1405 
1406   // C = A - Z * B.
1407   f7_msub (cc, zz, bb, aa);
1408   cc->flags ^= F7_FLAG_sign;
1409 }
1410 #endif // F7MOD_fmod_
1411 
1412 
1413 #ifdef F7MOD_truncx_
1414 F7_WEAK
f7_truncx(f7_t * cc,const f7_t * aa,bool do_floor)1415 f7_t* f7_truncx (f7_t *cc, const f7_t *aa, bool do_floor)
1416 {
1417   uint8_t a_class = f7_classify (aa);
1418 
1419   if (! f7_class_nonzero (a_class))
1420     return f7_copy (cc, aa);
1421 
1422   bool sign = f7_class_sign (a_class);
1423 
1424   int16_t a_expo = aa->expo;
1425 
1426   if (a_expo < 0)
1427     {
1428       // |A| < 1.
1429       if (sign & do_floor)
1430 	return f7_set_s16 (cc, -1);
1431 
1432       f7_clr (cc);
1433       return cc;
1434     }
1435   else if (a_expo >= F7_MANT_BITS - 1)
1436     // A == floor (A).
1437     return f7_copy (cc, aa);
1438 
1439   f7_t tmp7, *tmp = &tmp7;
1440 
1441   // Needed if aa === cc.
1442   f7_copy (tmp, aa);
1443 
1444   cc->flags = sign;
1445   cc->expo = a_expo;
1446   f7_clr_mant_lsbs (cc, aa, F7_MANT_BITS - 1 - a_expo);
1447 
1448   if (do_floor && cc->sign && f7_cmp_mant (cc, tmp) != 0)
1449     {
1450       F7_PCONST_U16 (const f7_t *one, 1);
1451       f7_Isub (cc, one);
1452     }
1453 
1454   return cc;
1455 }
1456 #endif // F7MOD_truncx_
1457 
1458 
1459 #ifdef F7MOD_floor_
1460 F7_WEAK
f7_floor(f7_t * cc,const f7_t * aa)1461 f7_t* f7_floor (f7_t *cc, const f7_t *aa)
1462 {
1463   return f7_truncx (cc, aa, true);
1464 }
1465 #endif // F7MOD_floor_
1466 
1467 
1468 #ifdef F7MOD_trunc_
1469 F7_WEAK
f7_trunc(f7_t * cc,const f7_t * aa)1470 f7_t* f7_trunc (f7_t *cc, const f7_t *aa)
1471 {
1472   return f7_truncx (cc, aa, false);
1473 }
1474 #endif // F7MOD_trunc_
1475 
1476 
1477 #ifdef F7MOD_ceil_
1478 F7_WEAK
f7_ceil(f7_t * cc,const f7_t * aa)1479 void f7_ceil (f7_t *cc, const f7_t *aa)
1480 {
1481   cc = f7_copy (cc, aa);
1482   cc->flags ^= F7_FLAG_sign;
1483   cc = f7_floor (cc, cc);
1484   f7_Ineg (cc);
1485 }
1486 #endif // F7MOD_ceil_
1487 
1488 
1489 #ifdef F7MOD_round_
1490 F7_WEAK
f7_round(f7_t * cc,const f7_t * aa)1491 void f7_round (f7_t *cc, const f7_t *aa)
1492 {
1493   f7_t tmp;
1494   (void) tmp;
1495   const f7_t *half = F7_CONST_ADDR (1_2, &tmp);
1496 
1497   f7_addsub   (cc, aa, half, f7_signbit (aa));
1498   f7_trunc (cc, cc);
1499 }
1500 #endif // F7MOD_round_
1501 
1502 
1503 #ifdef F7MOD_horner_
1504 
1505 // Assertion when using this function is that either cc != xx,
1506 // or if cc == xx, then tmp1 must be non-NULL and tmp1 != xx.
1507 // In General, the calling functions have a spare f7_t object available
1508 // and can pass it down to save some stack.
1509 // Moreover, the power series must have degree 1 at least.
1510 
1511 F7_WEAK
f7_horner(f7_t * cc,const f7_t * xx,uint8_t n_coeff,const f7_t * coeff,f7_t * tmp1)1512 void f7_horner (f7_t *cc, const f7_t *xx, uint8_t n_coeff, const f7_t *coeff,
1513                 f7_t *tmp1)
1514 {
1515   f7_assert (n_coeff > 1);
1516 
1517   if (cc != xx)
1518     tmp1 = cc;
1519   else
1520     f7_assert (tmp1 != NULL && tmp1 != xx);
1521 
1522   f7_t *yy = tmp1;
1523   f7_t tmp27, *tmp2 = &tmp27;
1524 
1525   n_coeff--;
1526   const f7_t *pcoeff = coeff + n_coeff;
1527 
1528   f7_copy_flash (yy, pcoeff);
1529 
1530   while (1)
1531     {
1532       --pcoeff;
1533 #if 1
1534       f7_Imul (yy, xx);
1535       const f7_t *cst = USE_LD ? pcoeff : f7_copy_P (tmp2, pcoeff);
1536       if (coeff == pcoeff)
1537 	return f7_add (cc, yy, cst);
1538       f7_Iadd (yy, cst);
1539 #else
1540       const f7_t *cst = USE_LD ? pcoeff : f7_copy_P (tmp2, pcoeff);
1541       f7_madd (yy, yy, xx, cst);
1542       if (coeff == pcoeff)
1543         {
1544 	  f7_copy (cc, yy);
1545 	  return;
1546         }
1547 #endif
1548     }
1549 
1550   __builtin_unreachable();
1551 }
1552 #endif // F7MOD_horner_
1553 
1554 
1555 #ifdef F7MOD_log_
1556 F7_WEAK
f7_log(f7_t * cc,const f7_t * aa)1557 void f7_log (f7_t *cc, const f7_t *aa)
1558 {
1559     f7_logx (cc, aa, NULL);
1560 }
1561 #endif // F7MOD_log_
1562 
1563 
1564 #ifdef F7MOD_log2_
1565 F7_WEAK
f7_log2(f7_t * cc,const f7_t * aa)1566 void f7_log2 (f7_t *cc, const f7_t *aa)
1567 {
1568   f7_logx (cc, aa, F7_RAW_CONST_ADDR (1_ln2));
1569 }
1570 #endif // F7MOD_log2_
1571 
1572 
1573 #ifdef F7MOD_log10_
1574 F7_WEAK
f7_log10(f7_t * cc,const f7_t * aa)1575 void f7_log10 (f7_t *cc, const f7_t *aa)
1576 {
1577   f7_logx (cc, aa, F7_RAW_CONST_ADDR (1_ln10));
1578 }
1579 #endif // F7MOD_log10_
1580 
1581 
1582 #ifdef F7MOD_logx_
1583 
1584 #define ARRAY_NAME coeff_artanh
1585 #include "libf7-array.def"
1586 #undef ARRAY_NAME
1587 
1588 // Compute P * ln(A)  if P != NULL and ln(A), otherwise.
1589 // P is a LD-address if USE_LD and a LPM-address if USE_LPM.
1590 // Assumption is that P > 0.
1591 
1592 F7_WEAK
f7_logx(f7_t * cc,const f7_t * aa,const f7_t * p)1593 void f7_logx (f7_t *cc, const f7_t *aa, const f7_t *p)
1594 {
1595   uint8_t a_class = f7_classify (aa);
1596 
1597   if (f7_class_nan (a_class) || f7_class_sign (a_class))
1598     return f7_set_nan (cc);
1599 
1600   if (f7_class_inf (a_class))
1601     return f7_set_inf (cc, 0);
1602 
1603   if (f7_class_zero (a_class))
1604     return f7_set_inf (cc, 1);
1605 
1606   f7_t *yy = cc;
1607   f7_t xx7, *xx = &xx7;
1608   f7_t tmp7, *tmp = &tmp7;
1609 
1610   // Y in [1, 2]  =  A * 2 ^ (-a_expo).
1611   int16_t a_expo = aa->expo;
1612   f7_copy (yy, aa);
1613   yy->expo = 0;
1614 
1615   // Y in [1 / sqrt2, sqrt2].
1616 
1617   if (f7_abscmp_msb_ge (yy, F7_(const_sqrt2_msb), F7_(const_sqrt2_expo)))
1618     {
1619       yy->expo = -1;
1620       a_expo = add_ssat16 (a_expo, 1);
1621     }
1622 
1623   const f7_t *one = F7_U16_ADDR (1, & tmp7);
1624 
1625   // X := (Y - 1) / (Y + 1),  |X| <= (sqrt2 - 1) / (sqrt2 + 1)  ~  0.172.
1626   f7_sub (xx, yy, one);
1627   f7_Iadd (yy, one);
1628   f7_Idiv (xx, yy);
1629 
1630   // Y := X^2,  |Y| < 0.03.
1631   f7_square (yy, xx);
1632 
1633   // Y := artanh (X^2) / X
1634   f7_horner (yy, yy, n_coeff_artanh, coeff_artanh, tmp);
1635 
1636   // C = X * Y = ln A - a_expo * ln2.
1637   f7_mul (cc, xx, yy);
1638 
1639   // X := a_expo * ln2.
1640   f7_set_s16 (xx, a_expo);
1641   f7_Imul (xx, F7_CONST_ADDR (ln2, & tmp7));
1642 
1643   // C = ln A.
1644   f7_Iadd (cc, xx);
1645 
1646   if (p && USE_LPM)
1647     f7_Imul (cc, f7_copy_P (tmp, p));
1648   if (p && USE_LD)
1649     f7_Imul (cc, p);
1650 }
1651 #endif // F7MOD_logx_
1652 
1653 
1654 #ifdef F7MOD_exp_
1655 
1656 #define ARRAY_NAME coeff_exp
1657 #include "libf7-array.def"
1658 #undef ARRAY_NAME
1659 
1660 #define STATIC static
1661 #include "libf7-constdef.h" // ln2_low
1662 #undef STATIC
1663 
1664 F7_WEAK
f7_exp(f7_t * cc,const f7_t * aa)1665 void f7_exp (f7_t *cc, const f7_t *aa)
1666 {
1667   uint8_t a_class = f7_classify (aa);
1668 
1669   if (f7_class_nan (a_class))
1670     return f7_set_nan (cc);
1671 
1672   /* The maximal exponent of 2 for a double is 1023, hence we may limit
1673      to  |A| < 1023 * ln2 ~ 709.  We limit to  1024 ~ 1.99 * 2^9  */
1674 
1675   if (f7_class_inf (a_class)
1676       || (f7_class_nonzero (a_class) && aa->expo >= 9))
1677     {
1678       if (f7_class_sign (a_class))
1679 	return f7_clr (cc);
1680       else
1681 	return f7_set_inf (cc, 0);
1682     }
1683 
1684   f7_t const *cst;
1685   f7_t qq7, *qq = &qq7;
1686 
1687   F7_PCONST (cst, ln2);
1688 
1689   // We limited |A| to 1024 and are now dividing by ln2, hence Q will
1690   // be at most 1024 / ln2 ~ 1477 and fit into 11 bits.  We will
1691   // round Q anyway, hence only request 11 bits from the division and
1692   // one additional bit that might be needed to normalize the quotient.
1693   f7_divx (qq, aa, cst, 1 + 11);
1694 
1695   // Use the smallest (by absolute value) remainder system.
1696   f7_round (qq, qq);
1697   int16_t q = f7_get_s16 (qq);
1698 
1699   // Reducing A mod ln2 gives |C| <= ln2 / 2,  C = -A mod ln2.
1700   f7_msub (cc, qq, cst, aa);
1701 
1702   // Corrigendum:  We added Q * ln2; now add Q times the low part of ln2
1703   // for better precision.  Due to |C| < |A| this is not a no-op in general.
1704   const f7_t *yy = F7_CONST_ADDR (ln2_low, &_var_for_ln2);
1705   f7_madd (cc, qq, yy, cc);
1706 
1707   // Because we computed C = -A mod ...
1708   cc->flags ^= F7_FLAG_sign;
1709 
1710   // Reduce further to |C| < ln2 / 8 which is the range of our MiniMax poly.
1711   const uint8_t MAX_LN2_RED = 3;
1712   int8_t scal2 = 0;
1713 
1714   while (f7_abscmp_msb_ge (cc, F7_(const_ln2_msb),
1715 			   F7_(const_ln2_expo) - MAX_LN2_RED))
1716     {
1717       scal2++;
1718       cc->expo--;
1719     }
1720 
1721   f7_horner (cc, cc, n_coeff_exp, coeff_exp, qq);
1722 
1723   while (--scal2 >= 0)
1724     f7_Isquare (cc);
1725 
1726   f7_Ildexp (cc, q);
1727 }
1728 #endif // F7MOD_exp_
1729 
1730 
1731 #ifdef F7MOD_pow10_
1732 F7_WEAK
f7_pow10(f7_t * cc,const f7_t * aa)1733 void f7_pow10 (f7_t *cc, const f7_t *aa)
1734 {
1735   const f7_t *p_ln10;
1736   F7_PCONST (p_ln10, ln10);
1737   f7_mul (cc, aa, p_ln10);
1738   f7_exp (cc, cc);
1739 }
ALIAS(f7_pow10,f7_exp10)1740 ALIAS (f7_pow10, f7_exp10)
1741 #endif // F7MOD_pow10_
1742 
1743 
1744 #ifdef F7MOD_cbrt_
1745 F7_WEAK
1746 void f7_cbrt (f7_t *cc, const f7_t *aa)
1747 {
1748   f7_copy (cc, aa);
1749   const f7_t *p_1_3;
1750   uint8_t c_flags = cc->flags;
1751   cc->flags &= ~F7_FLAG_sign;
1752   f7_log (cc, cc);
1753   F7_PCONST (p_1_3, 1_3);
1754   f7_Imul (cc, p_1_3);
1755   f7_exp (cc, cc);
1756 
1757   if (c_flags & F7_FLAG_sign)
1758     cc->flags |= F7_FLAG_sign;
1759 }
1760 #endif // F7MOD_cbrt_
1761 
1762 
1763 #ifdef F7MOD_pow_
1764 F7_WEAK
f7_pow(f7_t * cc,const f7_t * aa,const f7_t * bb)1765 void f7_pow (f7_t *cc, const f7_t *aa, const f7_t *bb)
1766 {
1767 #if 0
1768   f7_t slots[cc == bb];
1769   f7_t *yy = cc == bb ? slots : cc;
1770 #else
1771   f7_t yy7, *yy = &yy7;
1772 #endif
1773   f7_log (yy, aa);
1774   f7_Imul (yy, bb);
1775   f7_exp (cc, yy);
1776 }
1777 #endif // F7MOD_pow_
1778 
1779 
1780 #ifdef F7MOD_powi_
1781 F7_WEAK
f7_powi(f7_t * cc,const f7_t * aa,int ii)1782 void f7_powi (f7_t *cc, const f7_t *aa, int ii)
1783 {
1784   uint16_t u16 = ii;
1785   f7_t xx27, *xx2 = &xx27;
1786 
1787   if (ii < 0)
1788     u16 = -u16;
1789 
1790   f7_copy (xx2, aa);
1791 
1792   f7_set_u16 (cc, 1);
1793 
1794   while (1)
1795     {
1796       if (u16 & 1)
1797 	f7_Imul (cc, xx2);
1798 
1799       if (! f7_is_nonzero (cc))
1800 	break;
1801 
1802       u16 >>= 1;
1803       if (u16 == 0)
1804 	break;
1805       f7_Isquare (xx2);
1806     }
1807 
1808   if (ii < 0)
1809     f7_div1 (xx2, aa);
1810 }
1811 #endif // F7MOD_powi_
1812 
1813 
1814 #ifdef F7MOD_sincos_
1815 
1816 #define ARRAY_NAME coeff_sin
1817   #define FOR_SIN
1818   #include "libf7-array.def"
1819   #undef  FOR_SIN
1820 #undef ARRAY_NAME
1821 
1822 #define ARRAY_NAME coeff_cos
1823   #define FOR_COS
1824   #include "libf7-array.def"
1825   #undef  FOR_COS
1826 #undef ARRAY_NAME
1827 
1828 #define STATIC static
1829 #include "libf7-constdef.h" // pi_low
1830 #undef STATIC
1831 
1832 typedef union
1833 {
1834   struct
1835   {
1836     bool    neg_sin : 1; // Must be bit F7_FLAGNO_sign.
1837     bool    neg_cos : 1;
1838     bool    do_sin: 1;
1839     bool    do_cos: 1;
1840     bool    swap_sincos : 1;
1841     uint8_t res : 3;
1842   };
1843   uint8_t bits;
1844 } sincos_t;
1845 
1846 
1847 F7_WEAK
f7_sincos(f7_t * ss,f7_t * cc,const f7_t * aa)1848 void f7_sincos (f7_t *ss, f7_t *cc, const f7_t *aa)
1849 {
1850   uint8_t a_class = f7_classify (aa);
1851 
1852   sincos_t sc = { .bits = a_class & F7_FLAG_sign };
1853   if (ss != NULL) sc.do_sin = 1;
1854   if (cc != NULL) sc.do_cos = 1;
1855 
1856   if (f7_class_nan (a_class) || f7_class_inf (a_class))
1857     {
1858       if (sc.do_sin)  f7_set_nan (ss);
1859       if (sc.do_cos)  f7_set_nan (cc);
1860       return;
1861     }
1862 
1863   f7_t pi7, *pi = &pi7;
1864   f7_t xx7, *xx = &xx7;
1865   f7_t yy7, *yy = &yy7;
1866   f7_t *hh = sc.do_sin ? ss : cc;
1867 
1868   // X = |A|
1869   f7_copy (xx, aa);
1870   xx->flags = 0;
1871 
1872   // Y is how often we subtract PI from X.
1873   f7_clr (yy);
1874   f7_const (pi, pi);
1875 
1876   if (f7_abscmp_msb_ge (xx, F7_(const_pi_msb), F7_(const_pi_expo) + 1))
1877     {
1878       pi->expo = 1 + F7_(const_pi_expo);  // 2*pi
1879 
1880       // Y = X / 2pi.
1881       f7_div (yy, xx, pi);
1882 
1883       // The integral part of |A| / pi mod 2 is bit 55 - x_expo.
1884       if (yy->expo >= F7_MANT_BITS && !f7_is_zero (yy))
1885         {
1886 	  // Too big for sensible calculation:  Should this be NaN instead?
1887 	  if (sc.do_sin)  f7_clr (ss);
1888 	  if (sc.do_cos)  f7_clr (cc);
1889 	  return;
1890         }
1891 
1892       // X -= 2pi * [ X / 2pi ]
1893       f7_floor (yy, yy);
1894 
1895       f7_msub (xx, yy, pi, xx);
1896       xx->flags ^= F7_FLAG_sign;
1897 
1898       // We divided by 2pi, but Y should count times we subtracted pi.
1899       yy->expo++;
1900     }
1901 
1902   pi->expo = F7_(const_pi_expo); // pi
1903   f7_sub (hh, xx, pi);
1904   if (!f7_signbit (hh))
1905     {
1906       // H = X - pi >= 0  =>  X >= pi
1907       // sin(x) = -sin(x - pi)
1908       // cos(x) = -cos(x - pi)
1909       f7_copy (xx, hh);
1910       // Y: We subtracted pi one more time.
1911       f7_Iadd (yy, f7_set_u16 (hh, 1));
1912       sc.neg_sin ^= 1;
1913       sc.neg_cos ^= 1;
1914     }
1915 
1916   pi->expo = F7_(const_pi_expo) - 1; // pi/2
1917   if (f7_gt (xx, pi))
1918     {
1919       // x > pi/2
1920       // sin(x) =  sin(pi - x)
1921       // cos(x) = -cos(pi - x)
1922       pi->expo = F7_(const_pi_expo); // pi
1923       f7_IRsub (xx, pi);
1924       // Y: We subtracted pi one more time (and then negated).
1925       f7_Iadd (yy, f7_set_u16 (hh, 1));
1926       yy->flags ^= F7_FLAG_sign;
1927       sc.neg_cos ^= 1;
1928     }
1929 
1930   pi->expo = F7_(const_pi_expo) - 2; // pi/4
1931   if (f7_gt (xx, pi))
1932     {
1933       // x > pi/4
1934       // sin(x) = cos(pi/2 - x)
1935       // cos(x) = sin(pi/2 - x)
1936       pi->expo = F7_(const_pi_expo) - 1; // pi/2
1937       f7_IRsub (xx, pi);
1938       // Y: We subtracted pi/2 one more time (and then negated).
1939       f7_Iadd (yy, f7_set_1pow2 (hh, -1, 0));
1940       yy->flags ^= F7_FLAG_sign;
1941       sc.swap_sincos = 1;
1942     }
1943 
1944   if (!f7_is0 (yy))
1945     {
1946       // Y counts how often we subtracted pi from X in order to
1947       // get 0 <= X < pi/4 as small as possible (Y is 0 mod 0.5).
1948       // Now also subtract the low part of pi:
1949       // f7_const_pi_low = pi - f7_const_pi  in order to get more precise
1950       // results in the cases where the final result is close to 0.
1951       const f7_t *pi_low = F7_CONST_ADDR (pi_low, pi);
1952       //f7_const (pi, pi_low);
1953       f7_Imul (yy, pi_low);
1954       f7_Isub (xx, yy);
1955     }
1956 
1957   // X   in [0, pi/4].
1958   // X^2 in [0, pi^2/16] ~ [0, 0.6169]
1959 
1960   f7_square (yy, xx);
1961 
1962   f7_t *x_sin = xx;
1963   f7_t *x_cos = yy;
1964 
1965   if ((sc.do_sin && !sc.swap_sincos)
1966       || (sc.do_cos && sc.swap_sincos))
1967     {
1968       f7_horner (hh, yy, n_coeff_sin, coeff_sin, NULL);
1969       f7_mul (x_sin, hh, xx);
1970     }
1971 
1972   if ((sc.do_cos && !sc.swap_sincos)
1973       || (sc.do_sin && sc.swap_sincos))
1974     {
1975       f7_horner (x_cos, yy, n_coeff_cos, coeff_cos, hh);
1976     }
1977 
1978   if (sc.swap_sincos)
1979     {
1980       x_sin = yy;
1981       x_cos = xx;
1982     }
1983 
1984   if (sc.do_sin)
1985     {
1986       x_sin->flags ^= sc.bits;
1987       x_sin->flags &= F7_FLAG_sign;
1988       f7_copy (ss, x_sin);
1989     }
1990 
1991   if (sc.do_cos)
1992     {
1993       x_cos->flags = sc.neg_cos;
1994       f7_copy (cc, x_cos);
1995     }
1996 }
1997 #endif // F7MOD_sincos_
1998 
1999 #ifdef F7MOD_sin_
2000 F7_WEAK
f7_sin(f7_t * ss,const f7_t * aa)2001 void f7_sin (f7_t *ss, const f7_t *aa)
2002 {
2003   f7_sincos (ss, NULL, aa);
2004 }
2005 #endif // F7MOD_sin_
2006 
2007 #ifdef F7MOD_cos_
2008 F7_WEAK
f7_cos(f7_t * cc,const f7_t * aa)2009 void f7_cos (f7_t *cc, const f7_t *aa)
2010 {
2011   f7_sincos (NULL, cc, aa);
2012 }
2013 #endif // F7MOD_cos_
2014 
2015 
2016 #ifdef F7MOD_tan_
2017 F7_WEAK
f7_tan(f7_t * tt,const f7_t * aa)2018 void f7_tan (f7_t *tt, const f7_t *aa)
2019 {
2020   f7_t xcos;
2021   f7_sincos (tt, & xcos, aa);
2022   f7_Idiv (tt, & xcos);
2023 }
2024 #endif // F7MOD_tan_
2025 
2026 
2027 #ifdef F7MOD_cotan_
2028 F7_WEAK
f7_cotan(f7_t * ct,const f7_t * aa)2029 void f7_cotan (f7_t *ct, const f7_t *aa)
2030 {
2031   f7_t xcos;
2032   f7_sincos (ct, & xcos, aa);
2033   f7_div (ct, & xcos, ct);
2034 }
2035 #endif // F7MOD_cotan_
2036 
2037 
2038 #ifdef F7MOD_sinhcosh_
2039 F7_WEAK
f7_sinhcosh(f7_t * cc,const f7_t * aa,bool do_sinh)2040 void f7_sinhcosh (f7_t *cc, const f7_t *aa, bool do_sinh)
2041 {
2042   f7_t xx7, *xx = &xx7;
2043   // C = exp(A)
2044   f7_exp (cc, aa);
2045   // X = exp(-A)
2046   f7_div (xx, f7_set_u16 (xx, 1), cc);
2047   // sinh(A) = (exp(A) - exp(-A)) / 2
2048   // cosh(A) = (exp(A) + exp(-A)) / 2
2049   f7_addsub (cc, cc, xx, do_sinh);
2050   cc->expo--;
2051 }
2052 #endif // F7MOD_sinhcosh_
2053 
2054 
2055 #ifdef F7MOD_sinh_
2056 F7_WEAK
f7_sinh(f7_t * cc,const f7_t * aa)2057 void f7_sinh (f7_t *cc, const f7_t *aa)
2058 {
2059   f7_sinhcosh (cc, aa, true);
2060 }
2061 #endif // F7MOD_sinh_
2062 
2063 
2064 #ifdef F7MOD_cosh_
2065 F7_WEAK
f7_cosh(f7_t * cc,const f7_t * aa)2066 void f7_cosh (f7_t *cc, const f7_t *aa)
2067 {
2068   f7_sinhcosh (cc, aa, false);
2069 }
2070 #endif // F7MOD_cosh_
2071 
2072 
2073 #ifdef F7MOD_tanh_
2074 F7_WEAK
f7_tanh(f7_t * cc,const f7_t * aa)2075 void f7_tanh (f7_t *cc, const f7_t *aa)
2076 {
2077   // tanh(A) = (exp(2A) - 1) / (exp(2A) + 1)
2078   f7_t xx7, *xx = &xx7;
2079   F7_PCONST_U16 (const f7_t *one, 1);
2080   // C = 2A
2081   f7_copy (cc, aa);
2082   cc->expo++;
2083   // C = exp(2A)
2084   f7_exp (cc, cc);
2085   // X = exp(2A) + 1
2086   f7_add (xx, cc, one);
2087   // C = exp(2A) - 1
2088   f7_Isub (cc, one);
2089   // C = tanh(A)
2090   f7_Idiv (cc, xx);
2091 }
2092 #endif // F7MOD_tanh_
2093 
2094 
2095 #ifdef F7MOD_atan_
2096 
2097 #define MINIMAX_6_6_IN_0_1
2098 
2099 #define ARRAY_NAME coeff_atan_zahler
2100 #define FOR_NUMERATOR
2101 #include "libf7-array.def"
2102 #undef FOR_NUMERATOR
2103 #undef ARRAY_NAME
2104 
2105 #define ARRAY_NAME coeff_atan_nenner
2106 #define FOR_DENOMINATOR
2107 #include "libf7-array.def"
2108 #undef FOR_DENOMINATOR
2109 #undef ARRAY_NAME
2110 
2111 #include "libf7-constdef.h"
2112 
2113 F7_WEAK
f7_atan(f7_t * cc,const f7_t * aa)2114 void f7_atan (f7_t *cc, const f7_t *aa)
2115 {
2116   uint8_t a_class = f7_classify (aa);
2117   uint8_t flags = a_class & F7_FLAG_sign;
2118 
2119   if (f7_class_nan (a_class))
2120     return f7_set_nan (cc);
2121 
2122   f7_t yy7, *yy = &yy7;
2123   f7_t zz7, *zz = &zz7;
2124 
2125   if (f7_class_inf (a_class))
2126     {
2127       f7_set_u16 (cc, 0);
2128       goto do_Inf;
2129     }
2130 
2131   // C = |A|
2132   f7_copy (cc, aa);
2133   cc->flags = 0;
2134 
2135   if (!f7_class_zero (a_class) && cc->expo >= 0)
2136     {
2137       // C >= 1:  use  atan (x) + atan (1/x) = pi / 2  to reduce to [0, 1].
2138       flags |= 1 << 1;
2139       f7_div (cc, f7_set_u16 (yy, 1), cc);
2140     }
2141 #if !defined (MINIMAX_6_6_IN_0_1)
2142   const uint8_t const_a_msb = 0x89;
2143   const int16_t const_a_expo = -2;
2144   if (f7_abscmp_msb_ge (cc, const_a_msb, const_a_expo))
2145     {
2146       // We have C in [0, 1] and we want to use argument reduction by means
2147       // of addition theorem  atan(x) - atan(y) = atan((x - y) / (1 + xy)).
2148       // We want to split [0, 1] into  [0, a] u [a, 1]  in such a way that
2149       // the upper interval will be mapped to  [-a, a].  The system is easy
2150       // to solve and yiels
2151       //    y = 1 / sqrt (3)       ~  0.57735     atan(y) = pi / 6
2152       //    a = (1 - y) / (1 + y)  ~  0.26795  ~  0x0.8930A2F * 2^-1.
2153       flags |= 1 << 2;
2154       // C <- (C - Y) / (1 + C * Y)  in  [-a, a]
2155       const f7_t *cst = F7_CONST_ADDR (1_sqrt3, zz);
2156       f7_mul (yy, cc, cst);
2157       f7_Isub (cc, cst);
2158       f7_Iadd (yy, F7_U16_ADDR (1, zz));
2159       f7_Idiv (cc, yy);
2160     }
2161 #endif
2162   // C <- C * p(C^2) / q(C^2)
2163   f7_square (yy, cc);
2164   f7_horner (zz, yy, n_coeff_atan_zahler, coeff_atan_zahler, NULL);
2165   f7_Imul (zz, cc);
2166   f7_horner (cc, yy, n_coeff_atan_nenner, coeff_atan_nenner, NULL);
2167   f7_div (cc, zz, cc);
2168 
2169 #if !defined (MINIMAX_6_6_IN_0_1)
2170   if (flags & (1 << 2))
2171     f7_Iadd (cc, F7_CONST_ADDR (pi_6, yy));
2172 #endif
2173 
2174   if (flags & (1 << 1))
2175     {
2176     do_Inf:;
2177       // Y = pi / 2
2178       f7_const (yy, pi);
2179       yy->expo = F7_(const_pi_expo) - 1;
2180       f7_IRsub (cc, yy);
2181     }
2182 
2183   cc->flags = a_class & F7_FLAG_sign;
2184 }
2185 #undef MINIMAX_6_6_IN_0_1
2186 #endif // F7MOD_atan_
2187 
2188 
2189 #ifdef F7MOD_asinacos_
2190 
2191 #define ARRAY_NAME coeff_func_a_zahler
2192 #define FOR_NUMERATOR
2193 #include "libf7-array.def"
2194 #undef  FOR_NUMERATOR
2195 #undef  ARRAY_NAME
2196 
2197 #define ARRAY_NAME coeff_func_a_nenner
2198 #define FOR_DENOMINATOR
2199 #include "libf7-array.def"
2200 #undef  FOR_DENOMINATOR
2201 #undef  ARRAY_NAME
2202 
2203 typedef union
2204 {
2205   struct
2206   {
2207     bool    sign : 1;       // Must be bit F7_FLAGNO_sign.
2208     bool    do_acos : 1;    // From caller.
2209     bool    have_acos : 1;  // What we compute from rational approx p/q.
2210     uint8_t res : 5;
2211   };
2212   uint8_t bits;
2213 } asinacos_t;
2214 
2215 F7_WEAK
f7_asinacos(f7_t * cc,const f7_t * aa,uint8_t what)2216 void f7_asinacos (f7_t *cc, const f7_t *aa, uint8_t what)
2217 {
2218   f7_t xx7, *xx = &xx7;
2219   f7_t xx27, *xx2 = &xx27;
2220 
2221   asinacos_t flags = { .bits = what | f7_signbit (aa) };
2222 
2223   f7_abs (xx, aa);
2224 
2225   int8_t cmp = f7_cmp (xx, f7_set_u16 (cc, 1));
2226 
2227   if (cmp == INT8_MIN
2228       || cmp > 0)
2229     {
2230       return f7_set_nan (cc);
2231     }
2232 
2233   if (xx->expo <= -2 || f7_is_zero (xx))
2234     {
2235       // |A| < 1/2:  asin(x) = x * a(2*x^2)
2236       f7_square (xx2, xx);
2237       xx2->expo ++;
2238     }
2239   else
2240     {
2241       // |A| > 1/2: acos (1-x) = sqrt(2*x) * a(x)
2242       // C is 1 from above.
2243       f7_IRsub (xx, cc);
2244       f7_copy (xx2, xx);
2245       flags.have_acos = 1;
2246     }
2247 
2248   // MiniMax [5/4] numerator.
2249   f7_horner (cc, xx2, n_coeff_func_a_zahler, coeff_func_a_zahler, NULL);
2250 
2251   if (flags.have_acos)
2252     {
2253       xx->expo ++;
2254       f7_Isqrt (xx);
2255     }
2256   f7_Imul (cc, xx);
2257 
2258   // MiniMax [5/4] denominator.
2259   f7_horner (xx, xx2, n_coeff_func_a_nenner, coeff_func_a_nenner, NULL);
2260 
2261   f7_Idiv (cc, xx);
2262 
2263   /*
2264       With the current value of C, we have:
2265 
2266 		|	     |	      do_asin	    |	    do_acos
2267 		|     C	     |	A <= 0	 |  A >= 0  |  A <= 0  |  A >= 0
2268       ----------+------------+-----------+----------+----------+----------
2269       have_asin | asin (|A|) |	  -C	 |    C	    | pi/2 + C | pi/2 - C
2270       have_acos | acos (|A|) | -pi/2 + C | pi/2 - C |  pi - C  |     C
2271 
2272       Result = n_pi2 * pi/2 + C * (c_sign ? -1 : 1)
2273       Result (A, do_asin) = asin (A)
2274       Result (A, do_acos) = acos (A)
2275 
2276       with
2277 	  c_sign = do_acos ^ have_acos ^ a_sign
2278 	  n_pi2	 = do_acos + have_acos * (a_sign ^ do_acos) ? (-1 : 1)
2279 	  n_pi2 in { -1, 0, 1, 2 }
2280   */
2281 
2282   // All what matters for c_sign is bit 0.
2283   uint8_t c_sign = flags.bits;
2284   int8_t n_pi2 = flags.do_acos;
2285   c_sign ^= flags.do_acos;
2286   if (flags.have_acos)
2287     {
2288       n_pi2++;
2289       __asm ("" : "+r" (n_pi2));
2290       if (c_sign & 1)  // c_sign & 1  =  a_sign ^ do_acos
2291 	n_pi2 -= 2;
2292       c_sign++;
2293     }
2294 
2295   cc->flags = c_sign & F7_FLAG_sign;
2296 
2297   if (n_pi2)
2298     {
2299       f7_const (xx, pi);
2300       if (n_pi2 < 0)
2301 	xx->sign = 1;
2302       if (n_pi2 != 2)
2303 	xx->expo = F7_(const_pi_expo) - 1;
2304 
2305       f7_Iadd (cc, xx);
2306     }
2307 }
2308 #endif // F7MOD_asinacos_
2309 
2310 
2311 #ifdef F7MOD_asin_
2312 F7_WEAK
f7_asin(f7_t * cc,const f7_t * aa)2313 void f7_asin (f7_t *cc, const f7_t *aa)
2314 {
2315   f7_asinacos (cc, aa, 0);
2316 }
2317 #endif // F7MOD_asin_
2318 
2319 
2320 #ifdef F7MOD_acos_
2321 F7_WEAK
f7_acos(f7_t * cc,const f7_t * aa)2322 void f7_acos (f7_t *cc, const f7_t *aa)
2323 {
2324   f7_asinacos (cc, aa, 1 << 1);
2325 }
2326 #endif // F7MOD_acos_
2327 
2328 
2329 #ifndef IN_LIBGCC2
2330 
2331 #ifdef F7MOD_put_C_
2332 
2333 #include <stdio.h>
2334 #include <avr/pgmspace.h>
2335 
2336 static F7_INLINE
f7_hex_digit(uint8_t nibble)2337 uint8_t f7_hex_digit (uint8_t nibble)
2338 {
2339   nibble = (uint8_t) (nibble + '0');
2340   if (nibble > '9')
2341     nibble = (uint8_t) (nibble + ('a' - '0' - 10));
2342   return nibble;
2343 }
2344 
f7_put_hex2(uint8_t x,FILE * stream)2345 static void f7_put_hex2 (uint8_t x, FILE *stream)
2346 {
2347   putc ('0', stream);
2348   if (x)
2349     {
2350       putc ('x', stream);
2351       putc (f7_hex_digit (x >> 4), stream);
2352       putc (f7_hex_digit (x & 0xf), stream);
2353     }
2354 }
2355 
2356 #define XPUT(str) \
2357   fputs_P (PSTR (str), stream)
2358 
2359 // Write to STREAM a line that is appropriate for usage in libf7-const.def.
2360 
2361 F7_WEAK
f7_put_CDEF(const char * name,const f7_t * aa,FILE * stream)2362 void f7_put_CDEF (const char *name, const f7_t *aa, FILE *stream)
2363 {
2364   char buf[7];
2365   XPUT ("F7_CONST_DEF (");
2366   fputs (name, stream);
2367   XPUT (",\t");
2368   uint8_t a_class = f7_classify (aa);
2369   if (! f7_class_nonzero (a_class))
2370     {
2371       f7_put_hex2 (a_class & F7_FLAGS, stream);
2372       XPUT (",\t0,0,0,0,0,0,0,\t0)");
2373       return;
2374     }
2375   putc ('0' + (a_class & F7_FLAGS), stream);
2376   XPUT (",\t");
2377 
2378   for (uint8_t i = 0; i < F7_MANT_BYTES; i++)
2379     {
2380       f7_put_hex2 (aa->mant[F7_MANT_BYTES-1 - i], stream);
2381       putc (',', stream);
2382     }
2383   putc ('\t', stream);
2384 
2385   itoa (aa->expo, buf, 10);
2386   fputs (buf, stream);
2387   XPUT (")");
2388 }
2389 
f7_put_C(const f7_t * aa,FILE * stream)2390 void f7_put_C (const f7_t *aa, FILE *stream)
2391 {
2392   char buf[7];
2393 
2394   uint8_t a_class = f7_classify (aa);
2395   if (f7_class_nan (a_class))
2396     {
2397       XPUT ("{ .is_nan = 1 }");
2398       return;
2399     }
2400   bool sign = a_class & F7_FLAG_sign;
2401 
2402   if (f7_class_inf (a_class))
2403     {
2404       XPUT ("{ .is_nan = 1, .sign = ");
2405       putc ('0' + sign, stream);
2406       XPUT (" }");
2407       return;
2408     }
2409 
2410   XPUT ("{ .sign = ");
2411   putc ('0' + sign, stream);
2412 
2413   XPUT (", .mant = { ");
2414   for (uint8_t i = 0; i < F7_MANT_BYTES; i++)
2415     {
2416       f7_put_hex2 (aa->mant[F7_MANT_BYTES-1 - i], stream);
2417       if (i != F7_MANT_BYTES - 1)
2418 	putc (',', stream);
2419     }
2420 
2421   XPUT (" }, .expo = ");
2422   itoa (aa->expo, buf, 10);
2423   fputs (buf, stream);
2424   XPUT (" }");
2425 }
2426 #endif //F7MOD_put_C_
2427 
2428 
2429 #ifdef F7MOD_dump_
2430 
2431 #include <avr/pgmspace.h>
2432 
2433 #ifndef AVRTEST_H
2434 
2435 #include <stdio.h>
2436 
LOG_PSTR(const char * str)2437 static void LOG_PSTR (const char *str)
2438 {
2439   fputs_P (str, stdout);
2440 }
2441 
LOG_PFMT_U16(const char * fmt,uint16_t x)2442 static void LOG_PFMT_U16 (const char *fmt, uint16_t x)
2443 {
2444   printf_P (fmt, x);
2445 }
2446 
LOG_PFMT_FLOAT(const char * fmt,float x)2447 static void LOG_PFMT_FLOAT (const char *fmt, float x)
2448 {
2449   printf_P (fmt, x);
2450 }
2451 
2452 #define LOG_X8(X)               LOG_PFMT_U16 (PSTR (" 0x%02x "), (uint8_t)(X))
2453 #define LOG_PFMT_S16(FMT, X)    LOG_PFMT_U16 (FMT, (unsigned)(X))
2454 #define LOG_PFMT_ADDR(FMT, X)   LOG_PFMT_U16 (FMT, (unsigned)(X))
2455 
2456 #endif // AVRTEST_H
2457 
dump_byte(uint8_t b)2458 static void dump_byte (uint8_t b)
2459 {
2460   LOG_PSTR (PSTR (" "));
2461   for (uint8_t i = 0; i < 8; i++)
2462     {
2463       LOG_PSTR ((b & 0x80) ? PSTR ("1") : PSTR ("0"));
2464       b = (uint8_t) (b << 1);
2465     }
2466 }
2467 
f7_dump_mant(const f7_t * aa)2468 void f7_dump_mant (const f7_t *aa)
2469 {
2470   LOG_PSTR (PSTR ("\tmant   ="));
2471   for (int i = F7_MANT_BYTES - 1; i >= 0; i--)
2472     LOG_X8 (aa->mant[i]);
2473   LOG_PSTR (PSTR ("\n\t       ="));
2474 
2475   for (int i = F7_MANT_BYTES - 1; i >= 0; i--)
2476     dump_byte (aa->mant[i]);
2477   LOG_PSTR (PSTR ("\n"));
2478 }
2479 
f7_dump(const f7_t * aa)2480 void f7_dump (const f7_t *aa)
2481 {
2482   LOG_PFMT_ADDR (PSTR ("\n0x%04x\tflags  = "), aa);
2483   dump_byte (aa->flags);
2484   uint8_t a_class = f7_classify (aa);
2485   LOG_PSTR (PSTR ("  = "));
2486   LOG_PSTR (f7_class_sign (a_class) ? PSTR ("-") : PSTR ("+"));
2487   if (f7_class_inf (a_class))    LOG_PSTR (PSTR ("Inf "));
2488   if (f7_class_nan (a_class))    LOG_PSTR (PSTR ("NaN "));
2489   if (f7_class_zero (a_class))   LOG_PSTR (PSTR ("0 "));
2490   if (f7_class_number (a_class)) LOG_PSTR (PSTR ("Number "));
2491 
2492   LOG_PFMT_FLOAT (PSTR (" = %.10g\n"), f7_get_float (aa));
2493   LOG_PFMT_S16 (PSTR ("\texpo   = %d\n"), aa->expo);
2494 
2495   f7_dump_mant (aa);
2496 }
2497 #endif // F7MOD_dump_
2498 
2499 #endif // ! libgcc
2500 
2501 #endif // !AVR_TINY
2502