1/* IEEE-754 single-precision functions for Xtensa
2   Copyright (C) 2006-2021 Free Software Foundation, Inc.
3   Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
4
5   This file is part of GCC.
6
7   GCC is free software; you can redistribute it and/or modify it
8   under the terms of the GNU General Public License as published by
9   the Free Software Foundation; either version 3, or (at your option)
10   any later version.
11
12   GCC is distributed in the hope that it will be useful, but WITHOUT
13   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
15   License for more details.
16
17   Under Section 7 of GPL version 3, you are granted additional
18   permissions described in the GCC Runtime Library Exception, version
19   3.1, as published by the Free Software Foundation.
20
21   You should have received a copy of the GNU General Public License and
22   a copy of the GCC Runtime Library Exception along with this program;
23   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24   <http://www.gnu.org/licenses/>.  */
25
26#ifdef __XTENSA_EB__
27#define xh a2
28#define xl a3
29#define yh a4
30#define yl a5
31#else
32#define xh a3
33#define xl a2
34#define yh a5
35#define yl a4
36#endif
37
38/*  Warning!  The branch displacements for some Xtensa branch instructions
39    are quite small, and this code has been carefully laid out to keep
40    branch targets in range.  If you change anything, be sure to check that
41    the assembler is not relaxing anything to branch over a jump.  */
42
43#ifdef L_negsf2
44
45	.align	4
46	.global	__negsf2
47	.type	__negsf2, @function
48__negsf2:
49	leaf_entry sp, 16
50	movi	a4, 0x80000000
51	xor	a2, a2, a4
52	leaf_return
53
54#endif /* L_negsf2 */
55
56#ifdef L_addsubsf3
57
58	.literal_position
59	/* Addition */
60__addsf3_aux:
61
62	/* Handle NaNs and Infinities.  (This code is placed before the
63	   start of the function just to keep it in range of the limited
64	   branch displacements.)  */
65
66.Ladd_xnan_or_inf:
67	/* If y is neither Infinity nor NaN, return x.  */
68	bnall	a3, a6, .Ladd_return_nan_or_inf
69	/* If x is a NaN, return it.  Otherwise, return y.  */
70	slli	a7, a2, 9
71	bnez	a7, .Ladd_return_nan
72
73.Ladd_ynan_or_inf:
74	/* Return y.  */
75	mov	a2, a3
76
77.Ladd_return_nan_or_inf:
78	slli	a7, a2, 9
79	bnez	a7, .Ladd_return_nan
80	leaf_return
81
82.Ladd_return_nan:
83	movi	a6, 0x400000	/* make it a quiet NaN */
84	or	a2, a2, a6
85	leaf_return
86
87.Ladd_opposite_signs:
88	/* Operand signs differ.  Do a subtraction.  */
89	slli	a7, a6, 8
90	xor	a3, a3, a7
91	j	.Lsub_same_sign
92
93	.align	4
94	.global	__addsf3
95	.type	__addsf3, @function
96__addsf3:
97	leaf_entry sp, 16
98	movi	a6, 0x7f800000
99
100	/* Check if the two operands have the same sign.  */
101	xor	a7, a2, a3
102	bltz	a7, .Ladd_opposite_signs
103
104.Ladd_same_sign:
105	/* Check if either exponent == 0x7f8 (i.e., NaN or Infinity).  */
106	ball	a2, a6, .Ladd_xnan_or_inf
107	ball	a3, a6, .Ladd_ynan_or_inf
108
109	/* Compare the exponents.  The smaller operand will be shifted
110	   right by the exponent difference and added to the larger
111	   one.  */
112	extui	a7, a2, 23, 9
113	extui	a8, a3, 23, 9
114	bltu	a7, a8, .Ladd_shiftx
115
116.Ladd_shifty:
117	/* Check if the smaller (or equal) exponent is zero.  */
118	bnone	a3, a6, .Ladd_yexpzero
119
120	/* Replace y sign/exponent with 0x008.  */
121	or	a3, a3, a6
122	slli	a3, a3, 8
123	srli	a3, a3, 8
124
125.Ladd_yexpdiff:
126	/* Compute the exponent difference.  */
127	sub	a10, a7, a8
128
	/* Exponent difference of 32 or more -- just return the bigger value.  */
130	bgeui	a10, 32, 1f
131
132	/* Shift y right by the exponent difference.  Any bits that are
133	   shifted out of y are saved in a9 for rounding the result.  */
134	ssr	a10
135	movi	a9, 0
136	src	a9, a3, a9
137	srl	a3, a3
138
139	/* Do the addition.  */
140	add	a2, a2, a3
141
142	/* Check if the add overflowed into the exponent.  */
143	extui	a10, a2, 23, 9
144	beq	a10, a7, .Ladd_round
145	mov	a8, a7
146	j	.Ladd_carry
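
/* Viewed from C, the same-sign path above lines the smaller operand up with
   the larger one and keeps the bits that fall off the bottom for rounding.
   A minimal sketch, assuming two normal, same-sign inputs with
   exponent(x) >= exponent(y) and a difference below 32 (the code simply
   returns x otherwise); the helper name is illustrative only:

     #include <stdint.h>

     static uint32_t
     align_and_add (uint32_t x, uint32_t y, uint32_t *leftover)
     {
       uint32_t diff = ((x >> 23) & 0xff) - ((y >> 23) & 0xff);
       uint32_t my = (y & 0x7fffff) | 0x800000;      // explicit "1.0"
       *leftover = diff ? my << (32 - diff) : 0;     // bits lost to the shift
       // Add the aligned mantissa directly to the packed x; a carry, if any,
       // spills into the exponent field and is fixed up afterwards.
       return x + (my >> diff);
     }
*/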
147
148.Ladd_yexpzero:
149	/* y is a subnormal value.  Replace its sign/exponent with zero,
150	   i.e., no implicit "1.0", and increment the apparent exponent
151	   because subnormals behave as if they had the minimum (nonzero)
152	   exponent.  Test for the case when both exponents are zero.  */
153	slli	a3, a3, 9
154	srli	a3, a3, 9
155	bnone	a2, a6, .Ladd_bothexpzero
156	addi	a8, a8, 1
157	j	.Ladd_yexpdiff
158
159.Ladd_bothexpzero:
160	/* Both exponents are zero.  Handle this as a special case.  There
161	   is no need to shift or round, and the normal code for handling
162	   a carry into the exponent field will not work because it
163	   assumes there is an implicit "1.0" that needs to be added.  */
164	add	a2, a2, a3
1651:	leaf_return
166
167.Ladd_xexpzero:
168	/* Same as "yexpzero" except skip handling the case when both
169	   exponents are zero.  */
170	slli	a2, a2, 9
171	srli	a2, a2, 9
172	addi	a7, a7, 1
173	j	.Ladd_xexpdiff
174
175.Ladd_shiftx:
176	/* Same thing as the "shifty" code, but with x and y swapped.  Also,
177	   because the exponent difference is always nonzero in this version,
178	   the shift sequence can use SLL and skip loading a constant zero.  */
179	bnone	a2, a6, .Ladd_xexpzero
180
181	or	a2, a2, a6
182	slli	a2, a2, 8
183	srli	a2, a2, 8
184
185.Ladd_xexpdiff:
186	sub	a10, a8, a7
187	bgeui	a10, 32, .Ladd_returny
188
189	ssr	a10
190	sll	a9, a2
191	srl	a2, a2
192
193	add	a2, a2, a3
194
195	/* Check if the add overflowed into the exponent.  */
196	extui	a10, a2, 23, 9
197	bne	a10, a8, .Ladd_carry
198
199.Ladd_round:
200	/* Round up if the leftover fraction is >= 1/2.  */
201	bgez	a9, 1f
202	addi	a2, a2, 1
203
204	/* Check if the leftover fraction is exactly 1/2.  */
205	slli	a9, a9, 1
206	beqz	a9, .Ladd_exactlyhalf
2071:	leaf_return
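
/* The test above is round-to-nearest-even: the top bit of the leftover
   fraction is the guard bit, and the remaining bits tell whether the value
   sits exactly halfway between two representable results.  A minimal C
   sketch of the same rule (illustrative only; "leftover" corresponds to a9):

     #include <stdint.h>

     static uint32_t
     round_nearest_even (uint32_t result, uint32_t leftover)
     {
       if (leftover & 0x80000000u)       // leftover fraction >= 1/2
         {
           result += 1;
           if ((leftover << 1) == 0)     // exactly 1/2: round to even by
             result &= ~1u;              // clearing the low mantissa bit
         }
       return result;
     }
*/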
208
209.Ladd_returny:
210	mov	a2, a3
211	leaf_return
212
213.Ladd_carry:
214	/* The addition has overflowed into the exponent field, so the
215	   value needs to be renormalized.  The mantissa of the result
216	   can be recovered by subtracting the original exponent and
217	   adding 0x800000 (which is the explicit "1.0" for the
218	   mantissa of the non-shifted operand -- the "1.0" for the
219	   shifted operand was already added).  The mantissa can then
220	   be shifted right by one bit.  The explicit "1.0" of the
221	   shifted mantissa then needs to be replaced by the exponent,
222	   incremented by one to account for the normalizing shift.
223	   It is faster to combine these operations: do the shift first
224	   and combine the additions and subtractions.  If x is the
225	   original exponent, the result is:
226	       shifted mantissa - (x << 22) + (1 << 22) + (x << 23)
227	   or:
228	       shifted mantissa + ((x + 1) << 22)
229	   Note that the exponent is incremented here by leaving the
230	   explicit "1.0" of the mantissa in the exponent field.  */
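
/* A quick check of the identity above, with x the original exponent and m
   the overflowed sum before the one-bit shift (illustrative C only, not
   used here):

     #include <stdint.h>

     static uint32_t
     renorm_after_carry (uint32_t m, uint32_t x)
     {
       uint32_t step_by_step = (m >> 1) - (x << 22) + (1u << 22) + (x << 23);
       uint32_t combined = (m >> 1) + ((x + 1) << 22);
       // Equal because -(x << 22) + (x << 23) == x << 22.
       return combined == step_by_step ? combined : 0;
     }
*/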
231
232	/* Shift x right by one bit.  Save the lsb.  */
233	mov	a10, a2
234	srli	a2, a2, 1
235
236	/* See explanation above.  The original exponent is in a8.  */
237	addi	a8, a8, 1
238	slli	a8, a8, 22
239	add	a2, a2, a8
240
241	/* Return an Infinity if the exponent overflowed.  */
242	ball	a2, a6, .Ladd_infinity
243
244	/* Same thing as the "round" code except the msb of the leftover
245	   fraction is bit 0 of a10, with the rest of the fraction in a9.  */
246	bbci.l	a10, 0, 1f
247	addi	a2, a2, 1
248	beqz	a9, .Ladd_exactlyhalf
2491:	leaf_return
250
251.Ladd_infinity:
252	/* Clear the mantissa.  */
253	srli	a2, a2, 23
254	slli	a2, a2, 23
255
256	/* The sign bit may have been lost in a carry-out.  Put it back.  */
257	slli	a8, a8, 1
258	or	a2, a2, a8
259	leaf_return
260
261.Ladd_exactlyhalf:
262	/* Round down to the nearest even value.  */
263	srli	a2, a2, 1
264	slli	a2, a2, 1
265	leaf_return
266
267
268	/* Subtraction */
269__subsf3_aux:
270
271	/* Handle NaNs and Infinities.  (This code is placed before the
272	   start of the function just to keep it in range of the limited
273	   branch displacements.)  */
274
275.Lsub_xnan_or_inf:
276	/* If y is neither Infinity nor NaN, return x.  */
277	bnall	a3, a6, .Lsub_return_nan_or_inf
278	/* Both x and y are either NaN or Inf, so the result is NaN.  */
279
280.Lsub_return_nan:
281	movi	a4, 0x400000	/* make it a quiet NaN */
282	or	a2, a2, a4
283	leaf_return
284
285.Lsub_ynan_or_inf:
286	/* Negate y and return it.  */
287	slli	a7, a6, 8
288	xor	a2, a3, a7
289
290.Lsub_return_nan_or_inf:
291	slli	a7, a2, 9
292	bnez	a7, .Lsub_return_nan
293	leaf_return
294
295.Lsub_opposite_signs:
296	/* Operand signs differ.  Do an addition.  */
297	slli	a7, a6, 8
298	xor	a3, a3, a7
299	j	.Ladd_same_sign
300
301	.align	4
302	.global	__subsf3
303	.type	__subsf3, @function
304__subsf3:
305	leaf_entry sp, 16
306	movi	a6, 0x7f800000
307
308	/* Check if the two operands have the same sign.  */
309	xor	a7, a2, a3
310	bltz	a7, .Lsub_opposite_signs
311
312.Lsub_same_sign:
313	/* Check if either exponent == 0x7f8 (i.e., NaN or Infinity).  */
314	ball	a2, a6, .Lsub_xnan_or_inf
315	ball	a3, a6, .Lsub_ynan_or_inf
316
317	/* Compare the operands.  In contrast to addition, the entire
318	   value matters here.  */
319	extui	a7, a2, 23, 8
320	extui	a8, a3, 23, 8
321	bltu	a2, a3, .Lsub_xsmaller
322
323.Lsub_ysmaller:
324	/* Check if the smaller (or equal) exponent is zero.  */
325	bnone	a3, a6, .Lsub_yexpzero
326
327	/* Replace y sign/exponent with 0x008.  */
328	or	a3, a3, a6
329	slli	a3, a3, 8
330	srli	a3, a3, 8
331
332.Lsub_yexpdiff:
333	/* Compute the exponent difference.  */
334	sub	a10, a7, a8
335
	/* Exponent difference of 32 or more -- just return the bigger value.  */
337	bgeui	a10, 32, 1f
338
339	/* Shift y right by the exponent difference.  Any bits that are
340	   shifted out of y are saved in a9 for rounding the result.  */
341	ssr	a10
342	movi	a9, 0
343	src	a9, a3, a9
344	srl	a3, a3
345
346	sub	a2, a2, a3
347
348	/* Subtract the leftover bits in a9 from zero and propagate any
349	   borrow from a2.  */
350	neg	a9, a9
351	addi	a10, a2, -1
352	movnez	a2, a10, a9
353
354	/* Check if the subtract underflowed into the exponent.  */
355	extui	a10, a2, 23, 8
356	beq	a10, a7, .Lsub_round
357	j	.Lsub_borrow
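
/* The neg/addi/movnez triple above is a two-word subtraction: the leftover
   bits are subtracted from zero, and a borrow is taken from the high word
   exactly when they were nonzero.  In C (illustrative only; "hi" plays the
   role of a2 and "leftover" the role of a9):

     #include <stdint.h>

     static void
     propagate_borrow (uint32_t *hi, uint32_t *leftover)
     {
       uint32_t lo = 0 - *leftover;      // low half of the difference
       if (*leftover != 0)               // nonzero leftover means a borrow
         *hi -= 1;
       *leftover = lo;
     }
*/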
358
359.Lsub_yexpzero:
360	/* Return zero if the inputs are equal.  (For the non-subnormal
361	   case, subtracting the "1.0" will cause a borrow from the exponent
362	   and this case can be detected when handling the borrow.)  */
363	beq	a2, a3, .Lsub_return_zero
364
365	/* y is a subnormal value.  Replace its sign/exponent with zero,
366	   i.e., no implicit "1.0".  Unless x is also a subnormal, increment
367	   y's apparent exponent because subnormals behave as if they had
368	   the minimum (nonzero) exponent.  */
369	slli	a3, a3, 9
370	srli	a3, a3, 9
371	bnone	a2, a6, .Lsub_yexpdiff
372	addi	a8, a8, 1
373	j	.Lsub_yexpdiff
374
375.Lsub_returny:
376	/* Negate and return y.  */
377	slli	a7, a6, 8
378	xor	a2, a3, a7
3791:	leaf_return
380
381.Lsub_xsmaller:
382	/* Same thing as the "ysmaller" code, but with x and y swapped and
383	   with y negated.  */
384	bnone	a2, a6, .Lsub_xexpzero
385
386	or	a2, a2, a6
387	slli	a2, a2, 8
388	srli	a2, a2, 8
389
390.Lsub_xexpdiff:
391	sub	a10, a8, a7
392	bgeui	a10, 32, .Lsub_returny
393
394	ssr	a10
395	movi	a9, 0
396	src	a9, a2, a9
397	srl	a2, a2
398
399	/* Negate y.  */
400	slli	a11, a6, 8
401	xor	a3, a3, a11
402
403	sub	a2, a3, a2
404
405	neg	a9, a9
406	addi	a10, a2, -1
407	movnez	a2, a10, a9
408
409	/* Check if the subtract underflowed into the exponent.  */
410	extui	a10, a2, 23, 8
411	bne	a10, a8, .Lsub_borrow
412
413.Lsub_round:
414	/* Round up if the leftover fraction is >= 1/2.  */
415	bgez	a9, 1f
416	addi	a2, a2, 1
417
418	/* Check if the leftover fraction is exactly 1/2.  */
419	slli	a9, a9, 1
420	beqz	a9, .Lsub_exactlyhalf
4211:	leaf_return
422
423.Lsub_xexpzero:
424	/* Same as "yexpzero".  */
425	beq	a2, a3, .Lsub_return_zero
426	slli	a2, a2, 9
427	srli	a2, a2, 9
428	bnone	a3, a6, .Lsub_xexpdiff
429	addi	a7, a7, 1
430	j	.Lsub_xexpdiff
431
432.Lsub_return_zero:
433	movi	a2, 0
434	leaf_return
435
436.Lsub_borrow:
437	/* The subtraction has underflowed into the exponent field, so the
438	   value needs to be renormalized.  Shift the mantissa left as
439	   needed to remove any leading zeros and adjust the exponent
440	   accordingly.  If the exponent is not large enough to remove
441	   all the leading zeros, the result will be a subnormal value.  */
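
/* In C terms, the common case of this renormalization counts the leading
   zeros of the fraction and shifts them away, adjusting the exponent by the
   same amount.  A rough sketch, assuming a nonzero fraction and a result
   that stays normal (the subnormal and total-cancellation cases are handled
   separately below); __builtin_clz stands in for the do_nsau macro:

     #include <stdint.h>

     // "exp" is the exponent field after the borrow, "frac" the 23-bit
     // fraction that has lost its leading "1." to the cancellation.
     static uint32_t
     renormalize (uint32_t sign, uint32_t exp, uint32_t frac)
     {
       uint32_t shift = __builtin_clz (frac) - 8;   // moves the leading 1 of
       frac = (frac << shift) & 0x7fffff;           // frac up to bit 23, where
                                                    // it becomes the implicit bit
       return (sign << 31) | ((exp + 1 - shift) << 23) | frac;
     }
*/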
442
443	slli	a8, a2, 9
444	beqz	a8, .Lsub_xzero
445	do_nsau	a6, a8, a7, a11
446	srli	a8, a8, 9
447	bge	a6, a10, .Lsub_subnormal
448	addi	a6, a6, 1
449
450.Lsub_normalize_shift:
451	/* Shift the mantissa (a8/a9) left by a6.  */
452	ssl	a6
453	src	a8, a8, a9
454	sll	a9, a9
455
456	/* Combine the shifted mantissa with the sign and exponent,
457	   decrementing the exponent by a6.  (The exponent has already
458	   been decremented by one due to the borrow from the subtraction,
459	   but adding the mantissa will increment the exponent by one.)  */
460	srli	a2, a2, 23
461	sub	a2, a2, a6
462	slli	a2, a2, 23
463	add	a2, a2, a8
464	j	.Lsub_round
465
466.Lsub_exactlyhalf:
467	/* Round down to the nearest even value.  */
468	srli	a2, a2, 1
469	slli	a2, a2, 1
470	leaf_return
471
472.Lsub_xzero:
473	/* If there was a borrow from the exponent, and the mantissa and
474	   guard digits are all zero, then the inputs were equal and the
475	   result should be zero.  */
476	beqz	a9, .Lsub_return_zero
477
478	/* Only the guard digit is nonzero.  Shift by min(24, a10).  */
479	addi	a11, a10, -24
480	movi	a6, 24
481	movltz	a6, a10, a11
482	j	.Lsub_normalize_shift
483
484.Lsub_subnormal:
485	/* The exponent is too small to shift away all the leading zeros.
486	   Set a6 to the current exponent (which has already been
487	   decremented by the borrow) so that the exponent of the result
488	   will be zero.  Do not add 1 to a6 in this case, because: (1)
489	   adding the mantissa will not increment the exponent, so there is
490	   no need to subtract anything extra from the exponent to
491	   compensate, and (2) the effective exponent of a subnormal is 1
492	   not 0 so the shift amount must be 1 smaller than normal. */
493	mov	a6, a10
494	j	.Lsub_normalize_shift
495
496#endif /* L_addsubsf3 */
497
498#ifdef L_mulsf3
499
500	/* Multiplication */
501#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
502#define XCHAL_NO_MUL 1
503#endif
504
505	.literal_position
506__mulsf3_aux:
507
508	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
509	   (This code is placed before the start of the function just to
510	   keep it in range of the limited branch displacements.)  */
511
512.Lmul_xexpzero:
513	/* Clear the sign bit of x.  */
514	slli	a2, a2, 1
515	srli	a2, a2, 1
516
517	/* If x is zero, return zero.  */
518	beqz	a2, .Lmul_return_zero
519
520	/* Normalize x.  Adjust the exponent in a8.  */
521	do_nsau	a10, a2, a11, a12
522	addi	a10, a10, -8
523	ssl	a10
524	sll	a2, a2
525	movi	a8, 1
526	sub	a8, a8, a10
527	j	.Lmul_xnormalized
528
529.Lmul_yexpzero:
530	/* Clear the sign bit of y.  */
531	slli	a3, a3, 1
532	srli	a3, a3, 1
533
534	/* If y is zero, return zero.  */
535	beqz	a3, .Lmul_return_zero
536
537	/* Normalize y.  Adjust the exponent in a9.  */
538	do_nsau	a10, a3, a11, a12
539	addi	a10, a10, -8
540	ssl	a10
541	sll	a3, a3
542	movi	a9, 1
543	sub	a9, a9, a10
544	j	.Lmul_ynormalized
545
546.Lmul_return_zero:
547	/* Return zero with the appropriate sign bit.  */
548	srli	a2, a7, 31
549	slli	a2, a2, 31
550	j	.Lmul_done
551
552.Lmul_xnan_or_inf:
553	/* If y is zero, return NaN.  */
554	slli	a8, a3, 1
555	beqz	a8, .Lmul_return_nan
556	/* If y is NaN, return y.  */
557	bnall	a3, a6, .Lmul_returnx
558	slli	a8, a3, 9
559	beqz	a8, .Lmul_returnx
560
561.Lmul_returny:
562	mov	a2, a3
563
564.Lmul_returnx:
565	slli	a8, a2, 9
566	bnez	a8, .Lmul_return_nan
567	/* Set the sign bit and return.  */
568	extui	a7, a7, 31, 1
569	slli	a2, a2, 1
570	ssai	1
571	src	a2, a7, a2
572	j	.Lmul_done
573
574.Lmul_ynan_or_inf:
575	/* If x is zero, return NaN.  */
576	slli	a8, a2, 1
577	bnez	a8, .Lmul_returny
578	mov	a2, a3
579
580.Lmul_return_nan:
581	movi	a4, 0x400000	/* make it a quiet NaN */
582	or	a2, a2, a4
583	j	.Lmul_done
584
585	.align	4
586	.global	__mulsf3
587	.type	__mulsf3, @function
588__mulsf3:
589#if __XTENSA_CALL0_ABI__
590	leaf_entry sp, 32
591	addi	sp, sp, -32
592	s32i	a12, sp, 16
593	s32i	a13, sp, 20
594	s32i	a14, sp, 24
595	s32i	a15, sp, 28
596#elif XCHAL_NO_MUL
597	/* This is not really a leaf function; allocate enough stack space
598	   to allow CALL12s to a helper function.  */
599	leaf_entry sp, 64
600#else
601	leaf_entry sp, 32
602#endif
603	movi	a6, 0x7f800000
604
605	/* Get the sign of the result.  */
606	xor	a7, a2, a3
607
608	/* Check for NaN and infinity.  */
609	ball	a2, a6, .Lmul_xnan_or_inf
610	ball	a3, a6, .Lmul_ynan_or_inf
611
612	/* Extract the exponents.  */
613	extui	a8, a2, 23, 8
614	extui	a9, a3, 23, 8
615
616	beqz	a8, .Lmul_xexpzero
617.Lmul_xnormalized:
618	beqz	a9, .Lmul_yexpzero
619.Lmul_ynormalized:
620
621	/* Add the exponents.  */
622	add	a8, a8, a9
623
624	/* Replace sign/exponent fields with explicit "1.0".  */
625	movi	a10, 0xffffff
626	or	a2, a2, a6
627	and	a2, a2, a10
628	or	a3, a3, a6
629	and	a3, a3, a10
630
631	/* Multiply 32x32 to 64 bits.  The result ends up in a2/a6.  */
632
633#if XCHAL_HAVE_MUL32_HIGH
634
635	mull	a6, a2, a3
636	muluh	a2, a2, a3
637
638#else
639
640	/* Break the inputs into 16-bit chunks and compute 4 32-bit partial
641	   products.  These partial products are:
642
643		0 xl * yl
644
645		1 xl * yh
646		2 xh * yl
647
648		3 xh * yh
649
650	   If using the Mul16 or Mul32 multiplier options, these input
651	   chunks must be stored in separate registers.  For Mac16, the
652	   UMUL.AA.* opcodes can specify that the inputs come from either
653	   half of the registers, so there is no need to shift them out
654	   ahead of time.  If there is no multiply hardware, the 16-bit
655	   chunks can be extracted when setting up the arguments to the
656	   separate multiply function.  */
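
/* The partial-product scheme is the standard 32x32->64 decomposition into
   16-bit halves.  As a C sketch (illustrative helper only; the result
   corresponds to the a2/a6 pair built up below):

     #include <stdint.h>

     static void
     mul32x32 (uint32_t x, uint32_t y, uint32_t *hi, uint32_t *lo)
     {
       uint32_t xl = x & 0xffff, xh = x >> 16;
       uint32_t yl = y & 0xffff, yh = y >> 16;
       uint32_t pp0 = xl * yl;                  // bits  0..31
       uint32_t pp1 = xl * yh;                  // bits 16..47
       uint32_t pp2 = xh * yl;                  // bits 16..47
       uint32_t pp3 = xh * yh;                  // bits 32..63
       uint32_t mid = pp1 + pp2;                // may carry past bit 47
       uint32_t carry = (mid < pp1) ? 1u << 16 : 0;
       *lo = pp0 + (mid << 16);
       *hi = pp3 + (mid >> 16) + carry + ((*lo < pp0) ? 1 : 0);
     }
*/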
657
658#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
659	/* Calling a separate multiply function will clobber a0 and requires
660	   use of a8 as a temporary, so save those values now.  (The function
661	   uses a custom ABI so nothing else needs to be saved.)  */
662	s32i	a0, sp, 0
663	s32i	a8, sp, 4
664#endif
665
666#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32
667
668#define a2h a4
669#define a3h a5
670
671	/* Get the high halves of the inputs into registers.  */
672	srli	a2h, a2, 16
673	srli	a3h, a3, 16
674
675#define a2l a2
676#define a3l a3
677
678#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
679	/* Clear the high halves of the inputs.  This does not matter
680	   for MUL16 because the high bits are ignored.  */
681	extui	a2, a2, 0, 16
682	extui	a3, a3, 0, 16
683#endif
684#endif /* MUL16 || MUL32 */
685
686
687#if XCHAL_HAVE_MUL16
688
689#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
690	mul16u	dst, xreg ## xhalf, yreg ## yhalf
691
692#elif XCHAL_HAVE_MUL32
693
694#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
695	mull	dst, xreg ## xhalf, yreg ## yhalf
696
697#elif XCHAL_HAVE_MAC16
698
699/* The preprocessor insists on inserting a space when concatenating after
700   a period in the definition of do_mul below.  These macros are a workaround
701   using underscores instead of periods when doing the concatenation.  */
702#define umul_aa_ll umul.aa.ll
703#define umul_aa_lh umul.aa.lh
704#define umul_aa_hl umul.aa.hl
705#define umul_aa_hh umul.aa.hh
706
707#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
708	umul_aa_ ## xhalf ## yhalf	xreg, yreg; \
709	rsr	dst, ACCLO
710
711#else /* no multiply hardware */
712
713#define set_arg_l(dst, src) \
714	extui	dst, src, 0, 16
715#define set_arg_h(dst, src) \
716	srli	dst, src, 16
717
718#if __XTENSA_CALL0_ABI__
719#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
720	set_arg_ ## xhalf (a13, xreg); \
721	set_arg_ ## yhalf (a14, yreg); \
722	call0	.Lmul_mulsi3; \
723	mov	dst, a12
724#else
725#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
726	set_arg_ ## xhalf (a14, xreg); \
727	set_arg_ ## yhalf (a15, yreg); \
728	call12	.Lmul_mulsi3; \
729	mov	dst, a14
730#endif /* __XTENSA_CALL0_ABI__ */
731
732#endif /* no multiply hardware */
733
734	/* Add pp1 and pp2 into a6 with carry-out in a9.  */
735	do_mul(a6, a2, l, a3, h)	/* pp 1 */
736	do_mul(a11, a2, h, a3, l)	/* pp 2 */
737	movi	a9, 0
738	add	a6, a6, a11
739	bgeu	a6, a11, 1f
740	addi	a9, a9, 1
7411:
742	/* Shift the high half of a9/a6 into position in a9.  Note that
743	   this value can be safely incremented without any carry-outs.  */
744	ssai	16
745	src	a9, a9, a6
746
747	/* Compute the low word into a6.  */
748	do_mul(a11, a2, l, a3, l)	/* pp 0 */
749	sll	a6, a6
750	add	a6, a6, a11
751	bgeu	a6, a11, 1f
752	addi	a9, a9, 1
7531:
754	/* Compute the high word into a2.  */
755	do_mul(a2, a2, h, a3, h)	/* pp 3 */
756	add	a2, a2, a9
757
758#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
759	/* Restore values saved on the stack during the multiplication.  */
760	l32i	a0, sp, 0
761	l32i	a8, sp, 4
762#endif
763#endif /* ! XCHAL_HAVE_MUL32_HIGH */
764
765	/* Shift left by 9 bits, unless there was a carry-out from the
766	   multiply, in which case, shift by 8 bits and increment the
767	   exponent.  */
768	movi	a4, 9
769	srli	a5, a2, 24 - 9
770	beqz	a5, 1f
771	addi	a4, a4, -1
772	addi	a8, a8, 1
7731:	ssl	a4
774	src	a2, a2, a6
775	sll	a6, a6
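
/* The product of two 24-bit mantissas fits in 48 bits, with its leading 1 at
   bit 46 or 47 of the 64-bit result, so it is shifted left by 9 bits, or by
   8 with an exponent bump when bit 47 is set, leaving a 1.23-format mantissa
   in the high word.  A C sketch of this step (illustrative only; "hi" and
   "lo" play the roles of a2 and a6):

     #include <stdint.h>

     static uint32_t
     normalize_product (uint32_t hi, uint32_t lo, int *exp_bump, uint32_t *leftover)
     {
       int shift = (hi >> 15) ? 8 : 9;      // is bit 47 of the product set?
       *exp_bump = (shift == 8);            // one extra exponent step if so
       *leftover = lo << shift;             // kept for rounding
       return (hi << shift) | (lo >> (32 - shift));
     }
*/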
776
777	/* Subtract the extra bias from the exponent sum (plus one to account
778	   for the explicit "1.0" of the mantissa that will be added to the
779	   exponent in the final result).  */
780	movi	a4, 0x80
781	sub	a8, a8, a4
782
783	/* Check for over/underflow.  The value in a8 is one less than the
784	   final exponent, so values in the range 0..fd are OK here.  */
785	movi	a4, 0xfe
786	bgeu	a8, a4, .Lmul_overflow
787
788.Lmul_round:
789	/* Round.  */
790	bgez	a6, .Lmul_rounded
791	addi	a2, a2, 1
792	slli	a6, a6, 1
793	beqz	a6, .Lmul_exactlyhalf
794
795.Lmul_rounded:
796	/* Add the exponent to the mantissa.  */
797	slli	a8, a8, 23
798	add	a2, a2, a8
799
800.Lmul_addsign:
801	/* Add the sign bit.  */
802	srli	a7, a7, 31
803	slli	a7, a7, 31
804	or	a2, a2, a7
805
806.Lmul_done:
807#if __XTENSA_CALL0_ABI__
808	l32i	a12, sp, 16
809	l32i	a13, sp, 20
810	l32i	a14, sp, 24
811	l32i	a15, sp, 28
812	addi	sp, sp, 32
813#endif
814	leaf_return
815
816.Lmul_exactlyhalf:
817	/* Round down to the nearest even value.  */
818	srli	a2, a2, 1
819	slli	a2, a2, 1
820	j	.Lmul_rounded
821
822.Lmul_overflow:
823	bltz	a8, .Lmul_underflow
824	/* Return +/- Infinity.  */
825	movi	a8, 0xff
826	slli	a2, a8, 23
827	j	.Lmul_addsign
828
829.Lmul_underflow:
830	/* Create a subnormal value, where the exponent field contains zero,
831	   but the effective exponent is 1.  The value of a8 is one less than
832	   the actual exponent, so just negate it to get the shift amount.  */
833	neg	a8, a8
834	mov	a9, a6
835	ssr	a8
836	bgeui	a8, 32, .Lmul_flush_to_zero
837
838	/* Shift a2 right.  Any bits that are shifted out of a2 are saved
839	   in a6 (combined with the shifted-out bits currently in a6) for
840	   rounding the result.  */
841	sll	a6, a2
842	srl	a2, a2
843
844	/* Set the exponent to zero.  */
845	movi	a8, 0
846
847	/* Pack any nonzero bits shifted out into a6.  */
848	beqz	a9, .Lmul_round
849	movi	a9, 1
850	or	a6, a6, a9
851	j	.Lmul_round
852
853.Lmul_flush_to_zero:
854	/* Return zero with the appropriate sign bit.  */
855	srli	a2, a7, 31
856	slli	a2, a2, 31
857	j	.Lmul_done
858
859#if XCHAL_NO_MUL
860
861	/* For Xtensa processors with no multiply hardware, this simplified
862	   version of _mulsi3 is used for multiplying 16-bit chunks of
863	   the floating-point mantissas.  When using CALL0, this function
864	   uses a custom ABI: the inputs are passed in a13 and a14, the
865	   result is returned in a12, and a8 and a15 are clobbered.  */
866	.align	4
867.Lmul_mulsi3:
868	leaf_entry sp, 16
869	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
870	movi	\dst, 0
8711:	add	\tmp1, \src2, \dst
872	extui	\tmp2, \src1, 0, 1
873	movnez	\dst, \tmp1, \tmp2
874
875	do_addx2 \tmp1, \src2, \dst, \tmp1
876	extui	\tmp2, \src1, 1, 1
877	movnez	\dst, \tmp1, \tmp2
878
879	do_addx4 \tmp1, \src2, \dst, \tmp1
880	extui	\tmp2, \src1, 2, 1
881	movnez	\dst, \tmp1, \tmp2
882
883	do_addx8 \tmp1, \src2, \dst, \tmp1
884	extui	\tmp2, \src1, 3, 1
885	movnez	\dst, \tmp1, \tmp2
886
887	srli	\src1, \src1, 4
888	slli	\src2, \src2, 4
889	bnez	\src1, 1b
890	.endm
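
/* The macro is a shift-and-add multiplier that retires four bits of src1
   per iteration, using ADDX2/ADDX4/ADDX8 to scale src2.  Its effect, in C
   (illustrative only; only the low 32 bits of the product are needed since
   the real inputs are 16-bit mantissa chunks):

     #include <stdint.h>

     static uint32_t
     mul_shift_add (uint32_t src1, uint32_t src2)
     {
       uint32_t dst = 0;
       do
         {
           if (src1 & 1) dst += src2;           // bit 0:  + src2
           if (src1 & 2) dst += src2 << 1;      // bit 1:  + 2 * src2
           if (src1 & 4) dst += src2 << 2;      // bit 2:  + 4 * src2
           if (src1 & 8) dst += src2 << 3;      // bit 3:  + 8 * src2
           src1 >>= 4;
           src2 <<= 4;
         }
       while (src1 != 0);
       return dst;
     }
*/
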
891#if __XTENSA_CALL0_ABI__
892	mul_mulsi3_body a12, a13, a14, a15, a8
893#else
894	/* The result will be written into a2, so save that argument in a4.  */
895	mov	a4, a2
896	mul_mulsi3_body a2, a4, a3, a5, a6
897#endif
898	leaf_return
899#endif /* XCHAL_NO_MUL */
900#endif /* L_mulsf3 */
901
902#ifdef L_divsf3
903
904	/* Division */
905
906#if XCHAL_HAVE_FP_DIV
907
908	.align	4
909	.global	__divsf3
910	.type	__divsf3, @function
911__divsf3:
912	leaf_entry	sp, 16
913
914	wfr		f1, a2	/* dividend */
915	wfr		f2, a3	/* divisor */
916
917	div0.s		f3, f2
918	nexp01.s	f4, f2
919	const.s		f5, 1
920	maddn.s		f5, f4, f3
921	mov.s		f6, f3
922	mov.s		f7, f2
923	nexp01.s	f2, f1
924	maddn.s		f6, f5, f6
925	const.s		f5, 1
926	const.s		f0, 0
927	neg.s		f8, f2
928	maddn.s		f5, f4, f6
929	maddn.s		f0, f8, f3
930	mkdadj.s	f7, f1
931	maddn.s		f6, f5, f6
932	maddn.s		f8, f4, f0
933	const.s		f3, 1
934	maddn.s		f3, f4, f6
935	maddn.s		f0, f8, f6
936	neg.s		f2, f2
937	maddn.s		f6, f3, f6
938	maddn.s		f2, f4, f0
939	addexpm.s	f0, f7
940	addexp.s	f6, f7
941	divn.s		f0, f2, f6
942
943	rfr		a2, f0
944
945	leaf_return
946
947#else
948
949	.literal_position
950__divsf3_aux:
951
952	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
953	   (This code is placed before the start of the function just to
954	   keep it in range of the limited branch displacements.)  */
955
956.Ldiv_yexpzero:
957	/* Clear the sign bit of y.  */
958	slli	a3, a3, 1
959	srli	a3, a3, 1
960
961	/* Check for division by zero.  */
962	beqz	a3, .Ldiv_yzero
963
964	/* Normalize y.  Adjust the exponent in a9.  */
965	do_nsau	a10, a3, a4, a5
966	addi	a10, a10, -8
967	ssl	a10
968	sll	a3, a3
969	movi	a9, 1
970	sub	a9, a9, a10
971	j	.Ldiv_ynormalized
972
973.Ldiv_yzero:
974	/* y is zero.  Return NaN if x is also zero; otherwise, infinity.  */
975	slli	a4, a2, 1
976	srli	a4, a4, 1
977	srli	a2, a7, 31
978	slli	a2, a2, 31
979	or	a2, a2, a6
980	bnez	a4, 1f
981	movi	a4, 0x400000	/* make it a quiet NaN */
982	or	a2, a2, a4
9831:	leaf_return
984
985.Ldiv_xexpzero:
986	/* Clear the sign bit of x.  */
987	slli	a2, a2, 1
988	srli	a2, a2, 1
989
990	/* If x is zero, return zero.  */
991	beqz	a2, .Ldiv_return_zero
992
993	/* Normalize x.  Adjust the exponent in a8.  */
994	do_nsau	a10, a2, a4, a5
995	addi	a10, a10, -8
996	ssl	a10
997	sll	a2, a2
998	movi	a8, 1
999	sub	a8, a8, a10
1000	j	.Ldiv_xnormalized
1001
1002.Ldiv_return_zero:
1003	/* Return zero with the appropriate sign bit.  */
1004	srli	a2, a7, 31
1005	slli	a2, a2, 31
1006	leaf_return
1007
1008.Ldiv_xnan_or_inf:
1009	/* Set the sign bit of the result.  */
1010	srli	a7, a3, 31
1011	slli	a7, a7, 31
1012	xor	a2, a2, a7
1013	/* If y is NaN or Inf, return NaN.  */
1014	ball	a3, a6, .Ldiv_return_nan
1015	slli	a7, a2, 9
1016	bnez	a7, .Ldiv_return_nan
1017	leaf_return
1018
1019.Ldiv_ynan_or_inf:
1020	/* If y is Infinity, return zero.  */
1021	slli	a8, a3, 9
1022	beqz	a8, .Ldiv_return_zero
1023	/* y is NaN; return it.  */
1024	mov	a2, a3
1025
1026.Ldiv_return_nan:
1027	movi	a4, 0x400000	/* make it a quiet NaN */
1028	or	a2, a2, a4
1029	leaf_return
1030
1031	.align	4
1032	.global	__divsf3
1033	.type	__divsf3, @function
1034__divsf3:
1035	leaf_entry sp, 16
1036	movi	a6, 0x7f800000
1037
1038	/* Get the sign of the result.  */
1039	xor	a7, a2, a3
1040
1041	/* Check for NaN and infinity.  */
1042	ball	a2, a6, .Ldiv_xnan_or_inf
1043	ball	a3, a6, .Ldiv_ynan_or_inf
1044
1045	/* Extract the exponents.  */
1046	extui	a8, a2, 23, 8
1047	extui	a9, a3, 23, 8
1048
1049	beqz	a9, .Ldiv_yexpzero
1050.Ldiv_ynormalized:
1051	beqz	a8, .Ldiv_xexpzero
1052.Ldiv_xnormalized:
1053
1054	/* Subtract the exponents.  */
1055	sub	a8, a8, a9
1056
1057	/* Replace sign/exponent fields with explicit "1.0".  */
1058	movi	a10, 0xffffff
1059	or	a2, a2, a6
1060	and	a2, a2, a10
1061	or	a3, a3, a6
1062	and	a3, a3, a10
1063
1064	/* The first digit of the mantissa division must be a one.
1065	   Shift x (and adjust the exponent) as needed to make this true.  */
1066	bltu	a3, a2, 1f
1067	slli	a2, a2, 1
1068	addi	a8, a8, -1
10691:
1070	/* Do the first subtraction and shift.  */
1071	sub	a2, a2, a3
1072	slli	a2, a2, 1
1073
1074	/* Put the quotient into a10.  */
1075	movi	a10, 1
1076
1077	/* Divide one bit at a time for 23 bits.  */
1078	movi	a9, 23
1079#if XCHAL_HAVE_LOOPS
1080	loop	a9, .Ldiv_loopend
1081#endif
1082.Ldiv_loop:
1083	/* Shift the quotient << 1.  */
1084	slli	a10, a10, 1
1085
1086	/* Is this digit a 0 or 1?  */
1087	bltu	a2, a3, 1f
1088
1089	/* Output a 1 and subtract.  */
1090	addi	a10, a10, 1
1091	sub	a2, a2, a3
1092
1093	/* Shift the dividend << 1.  */
10941:	slli	a2, a2, 1
1095
1096#if !XCHAL_HAVE_LOOPS
1097	addi	a9, a9, -1
1098	bnez	a9, .Ldiv_loop
1099#endif
1100.Ldiv_loopend:
1101
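/* The loop above is plain restoring division, producing one quotient bit
   per iteration and leaving the remainder (already shifted left once) in a2
   for the rounding step below.  A C sketch of the same computation
   (illustrative only):

     #include <stdint.h>

     // num and den are the 24-bit mantissas, with num already adjusted so
     // that the first quotient bit is a 1 (cf. the shift before the loop).
     static uint32_t
     divide_mantissas (uint32_t num, uint32_t den, uint32_t *rem2)
     {
       uint32_t quo = 1;                   // the leading quotient bit
       uint32_t rem = (num - den) << 1;    // first subtraction and shift
       for (int i = 0; i < 23; i++)
         {
           quo <<= 1;
           if (rem >= den)
             {
               quo += 1;
               rem -= den;
             }
           rem <<= 1;
         }
       *rem2 = rem;      // remainder << 1; >= den means "round up",
       return quo;       // == den means "exactly halfway"
     }
*/
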
1102	/* Add the exponent bias (less one to account for the explicit "1.0"
1103	   of the mantissa that will be added to the exponent in the final
1104	   result).  */
1105	addi	a8, a8, 0x7e
1106
1107	/* Check for over/underflow.  The value in a8 is one less than the
1108	   final exponent, so values in the range 0..fd are OK here.  */
1109	movi	a4, 0xfe
1110	bgeu	a8, a4, .Ldiv_overflow
1111
1112.Ldiv_round:
1113	/* Round.  The remainder (<< 1) is in a2.  */
1114	bltu	a2, a3, .Ldiv_rounded
1115	addi	a10, a10, 1
1116	beq	a2, a3, .Ldiv_exactlyhalf
1117
1118.Ldiv_rounded:
1119	/* Add the exponent to the mantissa.  */
1120	slli	a8, a8, 23
1121	add	a2, a10, a8
1122
1123.Ldiv_addsign:
1124	/* Add the sign bit.  */
1125	srli	a7, a7, 31
1126	slli	a7, a7, 31
1127	or	a2, a2, a7
1128	leaf_return
1129
1130.Ldiv_overflow:
1131	bltz	a8, .Ldiv_underflow
1132	/* Return +/- Infinity.  */
1133	addi	a8, a4, 1	/* 0xff */
1134	slli	a2, a8, 23
1135	j	.Ldiv_addsign
1136
1137.Ldiv_exactlyhalf:
1138	/* Remainder is exactly half the divisor.  Round even.  */
1139	srli	a10, a10, 1
1140	slli	a10, a10, 1
1141	j	.Ldiv_rounded
1142
1143.Ldiv_underflow:
1144	/* Create a subnormal value, where the exponent field contains zero,
1145	   but the effective exponent is 1.  The value of a8 is one less than
1146	   the actual exponent, so just negate it to get the shift amount.  */
1147	neg	a8, a8
1148	ssr	a8
1149	bgeui	a8, 32, .Ldiv_flush_to_zero
1150
1151	/* Shift a10 right.  Any bits that are shifted out of a10 are
1152	   saved in a6 for rounding the result.  */
1153	sll	a6, a10
1154	srl	a10, a10
1155
1156	/* Set the exponent to zero.  */
1157	movi	a8, 0
1158
1159	/* Pack any nonzero remainder (in a2) into a6.  */
1160	beqz	a2, 1f
1161	movi	a9, 1
1162	or	a6, a6, a9
1163
1164	/* Round a10 based on the bits shifted out into a6.  */
11651:	bgez	a6, .Ldiv_rounded
1166	addi	a10, a10, 1
1167	slli	a6, a6, 1
1168	bnez	a6, .Ldiv_rounded
1169	srli	a10, a10, 1
1170	slli	a10, a10, 1
1171	j	.Ldiv_rounded
1172
1173.Ldiv_flush_to_zero:
1174	/* Return zero with the appropriate sign bit.  */
1175	srli	a2, a7, 31
1176	slli	a2, a2, 31
1177	leaf_return
1178
1179#endif /* XCHAL_HAVE_FP_DIV */
1180
1181#endif /* L_divsf3 */
1182
1183#ifdef L_cmpsf2
1184
1185	/* Equal and Not Equal */
1186
1187	.align	4
1188	.global	__eqsf2
1189	.global	__nesf2
1190	.set	__nesf2, __eqsf2
1191	.type	__eqsf2, @function
1192__eqsf2:
1193	leaf_entry sp, 16
1194	bne	a2, a3, 4f
1195
1196	/* The values are equal but NaN != NaN.  Check the exponent.  */
1197	movi	a6, 0x7f800000
1198	ball	a2, a6, 3f
1199
1200	/* Equal.  */
1201	movi	a2, 0
1202	leaf_return
1203
1204	/* Not equal.  */
12052:	movi	a2, 1
1206	leaf_return
1207
1208	/* Check if the mantissas are nonzero.  */
12093:	slli	a7, a2, 9
1210	j	5f
1211
1212	/* Check if x and y are zero with different signs.  */
12134:	or	a7, a2, a3
1214	slli	a7, a7, 1
1215
1216	/* Equal if a7 == 0, where a7 is either abs(x | y) or the mantissa
	   of x when exponent(x) = 0x7f8 and x == y.  */
12185:	movi	a2, 0
1219	movi	a3, 1
1220	movnez	a2, a3, a7
1221	leaf_return
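
/* In other words, __eqsf2 returns zero exactly when the operands are equal:
   identical non-NaN bit patterns, or +0.0 and -0.0.  A C sketch of the same
   predicate (illustrative only, same 0/nonzero convention as above):

     #include <stdint.h>

     static int
     float_bits_ne (uint32_t x, uint32_t y)
     {
       if ((x & 0x7f800000) == 0x7f800000 && (x << 9) != 0)
         return 1;                      // x is NaN: unequal, even to itself
       if (x == y)
         return 0;                      // same bits and not NaN
       return ((x | y) << 1) != 0;      // only +0.0 and -0.0 remain equal
     }
*/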
1222
1223
1224	/* Greater Than */
1225
1226	.align	4
1227	.global	__gtsf2
1228	.type	__gtsf2, @function
1229__gtsf2:
1230	leaf_entry sp, 16
1231	movi	a6, 0x7f800000
1232	ball	a2, a6, 2f
12331:	bnall	a3, a6, .Lle_cmp
1234
1235	/* Check if y is a NaN.  */
1236	slli	a7, a3, 9
1237	beqz	a7, .Lle_cmp
1238	movi	a2, 0
1239	leaf_return
1240
1241	/* Check if x is a NaN.  */
12422:	slli	a7, a2, 9
1243	beqz	a7, 1b
1244	movi	a2, 0
1245	leaf_return
1246
1247
1248	/* Less Than or Equal */
1249
1250	.align	4
1251	.global	__lesf2
1252	.type	__lesf2, @function
1253__lesf2:
1254	leaf_entry sp, 16
1255	movi	a6, 0x7f800000
1256	ball	a2, a6, 2f
12571:	bnall	a3, a6, .Lle_cmp
1258
1259	/* Check if y is a NaN.  */
1260	slli	a7, a3, 9
1261	beqz	a7, .Lle_cmp
1262	movi	a2, 1
1263	leaf_return
1264
1265	/* Check if x is a NaN.  */
12662:	slli	a7, a2, 9
1267	beqz	a7, 1b
1268	movi	a2, 1
1269	leaf_return
1270
1271.Lle_cmp:
1272	/* Check if x and y have different signs.  */
1273	xor	a7, a2, a3
1274	bltz	a7, .Lle_diff_signs
1275
1276	/* Check if x is negative.  */
1277	bltz	a2, .Lle_xneg
1278
1279	/* Check if x <= y.  */
1280	bltu	a3, a2, 5f
12814:	movi	a2, 0
1282	leaf_return
1283
1284.Lle_xneg:
1285	/* Check if y <= x.  */
1286	bgeu	a2, a3, 4b
12875:	movi	a2, 1
1288	leaf_return
1289
1290.Lle_diff_signs:
1291	bltz	a2, 4b
1292
1293	/* Check if both x and y are zero.  */
1294	or	a7, a2, a3
1295	slli	a7, a7, 1
1296	movi	a2, 1
1297	movi	a3, 0
1298	moveqz	a2, a3, a7
1299	leaf_return
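
/* Ordered comparison works directly on the bit patterns, as above: with
   equal signs an unsigned compare of the raw bits decides (reversed when
   both are negative), and with differing signs only the +0.0/-0.0 pair
   needs special care.  A C sketch of the underlying "x <= y" test for
   non-NaN inputs (illustrative only; the entry points here return the usual
   libgcc comparison codes rather than a boolean):

     #include <stdint.h>

     static int
     float_bits_le (uint32_t x, uint32_t y)
     {
       if ((x ^ y) >> 31)                     // signs differ: x <= y when x is
         return (x >> 31) || (((x | y) << 1) == 0);   // negative or both are zero
       if (x >> 31)                           // both negative: larger bits
         return x >= y;                       // mean a smaller value
       return x <= y;                         // both non-negative
     }
*/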
1300
1301
1302	/* Greater Than or Equal */
1303
1304	.align	4
1305	.global	__gesf2
1306	.type	__gesf2, @function
1307__gesf2:
1308	leaf_entry sp, 16
1309	movi	a6, 0x7f800000
1310	ball	a2, a6, 2f
13111:	bnall	a3, a6, .Llt_cmp
1312
1313	/* Check if y is a NaN.  */
1314	slli	a7, a3, 9
1315	beqz	a7, .Llt_cmp
1316	movi	a2, -1
1317	leaf_return
1318
1319	/* Check if x is a NaN.  */
13202:	slli	a7, a2, 9
1321	beqz	a7, 1b
1322	movi	a2, -1
1323	leaf_return
1324
1325
1326	/* Less Than */
1327
1328	.align	4
1329	.global	__ltsf2
1330	.type	__ltsf2, @function
1331__ltsf2:
1332	leaf_entry sp, 16
1333	movi	a6, 0x7f800000
1334	ball	a2, a6, 2f
13351:	bnall	a3, a6, .Llt_cmp
1336
1337	/* Check if y is a NaN.  */
1338	slli	a7, a3, 9
1339	beqz	a7, .Llt_cmp
1340	movi	a2, 0
1341	leaf_return
1342
1343	/* Check if x is a NaN.  */
13442:	slli	a7, a2, 9
1345	beqz	a7, 1b
1346	movi	a2, 0
1347	leaf_return
1348
1349.Llt_cmp:
1350	/* Check if x and y have different signs.  */
1351	xor	a7, a2, a3
1352	bltz	a7, .Llt_diff_signs
1353
1354	/* Check if x is negative.  */
1355	bltz	a2, .Llt_xneg
1356
1357	/* Check if x < y.  */
1358	bgeu	a2, a3, 5f
13594:	movi	a2, -1
1360	leaf_return
1361
1362.Llt_xneg:
1363	/* Check if y < x.  */
1364	bltu	a3, a2, 4b
13655:	movi	a2, 0
1366	leaf_return
1367
1368.Llt_diff_signs:
1369	bgez	a2, 5b
1370
1371	/* Check if both x and y are nonzero.  */
1372	or	a7, a2, a3
1373	slli	a7, a7, 1
1374	movi	a2, 0
1375	movi	a3, -1
1376	movnez	a2, a3, a7
1377	leaf_return
1378
1379
1380	/* Unordered */
1381
1382	.align	4
1383	.global	__unordsf2
1384	.type	__unordsf2, @function
1385__unordsf2:
1386	leaf_entry sp, 16
1387	movi	a6, 0x7f800000
1388	ball	a2, a6, 3f
13891:	ball	a3, a6, 4f
13902:	movi	a2, 0
1391	leaf_return
1392
13933:	slli	a7, a2, 9
1394	beqz	a7, 1b
1395	movi	a2, 1
1396	leaf_return
1397
13984:	slli	a7, a3, 9
1399	beqz	a7, 2b
1400	movi	a2, 1
1401	leaf_return
1402
1403#endif /* L_cmpsf2 */
1404
1405#ifdef L_fixsfsi
1406
1407	.align	4
1408	.global	__fixsfsi
1409	.type	__fixsfsi, @function
1410__fixsfsi:
1411	leaf_entry sp, 16
1412
1413	/* Check for NaN and Infinity.  */
1414	movi	a6, 0x7f800000
1415	ball	a2, a6, .Lfixsfsi_nan_or_inf
1416
1417	/* Extract the exponent and check if 0 < (exp - 0x7e) < 32.  */
1418	extui	a4, a2, 23, 8
1419	addi	a4, a4, -0x7e
1420	bgei	a4, 32, .Lfixsfsi_maxint
1421	blti	a4, 1, .Lfixsfsi_zero
1422
1423	/* Add explicit "1.0" and shift << 8.  */
1424	or	a7, a2, a6
1425	slli	a5, a7, 8
1426
1427	/* Shift back to the right, based on the exponent.  */
1428	ssl	a4		/* shift by 32 - a4 */
1429	srl	a5, a5
1430
1431	/* Negate the result if sign != 0.  */
1432	neg	a2, a5
1433	movgez	a2, a5, a7
1434	leaf_return
1435
1436.Lfixsfsi_nan_or_inf:
1437	/* Handle Infinity and NaN.  */
1438	slli	a4, a2, 9
1439	beqz	a4, .Lfixsfsi_maxint
1440
1441	/* Translate NaN to +maxint.  */
1442	movi	a2, 0
1443
1444.Lfixsfsi_maxint:
1445	slli	a4, a6, 8	/* 0x80000000 */
1446	addi	a5, a4, -1	/* 0x7fffffff */
1447	movgez	a4, a5, a2
1448	mov	a2, a4
1449	leaf_return
1450
1451.Lfixsfsi_zero:
1452	movi	a2, 0
1453	leaf_return
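
/* __fixsfsi follows the usual pattern: saturate NaN, infinities and
   out-of-range values, otherwise shift the mantissa (with its explicit
   "1.0") into place according to the exponent and apply the sign.  A C
   sketch of the same conversion (illustrative only; NaN maps to INT32_MAX,
   matching the code above):

     #include <stdint.h>

     static int32_t
     fixsfsi (uint32_t bits)
     {
       int32_t exp = (int32_t) ((bits >> 23) & 0xff) - 0x7e;
       uint32_t frac = (bits & 0x7fffff) | 0x800000;     // explicit "1.0"
       if ((bits & 0x7f800000) == 0x7f800000 && (bits << 9) != 0)
         return INT32_MAX;                               // NaN
       if (exp >= 32)                                    // too big, or Inf
         return (bits >> 31) ? INT32_MIN : INT32_MAX;
       if (exp < 1)
         return 0;                                       // magnitude < 1.0
       uint32_t mag = (frac << 8) >> (32 - exp);
       return (bits >> 31) ? -(int32_t) mag : (int32_t) mag;
     }
*/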
1454
1455#endif /* L_fixsfsi */
1456
1457#ifdef L_fixsfdi
1458
1459	.align	4
1460	.global	__fixsfdi
1461	.type	__fixsfdi, @function
1462__fixsfdi:
1463	leaf_entry sp, 16
1464
1465	/* Check for NaN and Infinity.  */
1466	movi	a6, 0x7f800000
1467	ball	a2, a6, .Lfixsfdi_nan_or_inf
1468
1469	/* Extract the exponent and check if 0 < (exp - 0x7e) < 64.  */
1470	extui	a4, a2, 23, 8
1471	addi	a4, a4, -0x7e
1472	bgei	a4, 64, .Lfixsfdi_maxint
1473	blti	a4, 1, .Lfixsfdi_zero
1474
1475	/* Add explicit "1.0" and shift << 8.  */
1476	or	a7, a2, a6
1477	slli	xh, a7, 8
1478
1479	/* Shift back to the right, based on the exponent.  */
1480	ssl	a4		/* shift by 64 - a4 */
1481	bgei	a4, 32, .Lfixsfdi_smallshift
1482	srl	xl, xh
1483	movi	xh, 0
1484
1485.Lfixsfdi_shifted:
1486	/* Negate the result if sign != 0.  */
1487	bgez	a7, 1f
1488	neg	xl, xl
1489	neg	xh, xh
1490	beqz	xl, 1f
1491	addi	xh, xh, -1
14921:	leaf_return
1493
1494.Lfixsfdi_smallshift:
1495	movi	xl, 0
1496	sll	xl, xh
1497	srl	xh, xh
1498	j	.Lfixsfdi_shifted
1499
1500.Lfixsfdi_nan_or_inf:
1501	/* Handle Infinity and NaN.  */
1502	slli	a4, a2, 9
1503	beqz	a4, .Lfixsfdi_maxint
1504
1505	/* Translate NaN to +maxint.  */
1506	movi	a2, 0
1507
1508.Lfixsfdi_maxint:
1509	slli	a7, a6, 8	/* 0x80000000 */
1510	bgez	a2, 1f
1511	mov	xh, a7
1512	movi	xl, 0
1513	leaf_return
1514
15151:	addi	xh, a7, -1	/* 0x7fffffff */
1516	movi	xl, -1
1517	leaf_return
1518
1519.Lfixsfdi_zero:
1520	movi	xh, 0
1521	movi	xl, 0
1522	leaf_return
1523
1524#endif /* L_fixsfdi */
1525
1526#ifdef L_fixunssfsi
1527
1528	.align	4
1529	.global	__fixunssfsi
1530	.type	__fixunssfsi, @function
1531__fixunssfsi:
1532	leaf_entry sp, 16
1533
1534	/* Check for NaN and Infinity.  */
1535	movi	a6, 0x7f800000
1536	ball	a2, a6, .Lfixunssfsi_nan_or_inf
1537
1538	/* Extract the exponent and check if 0 <= (exp - 0x7f) < 32.  */
1539	extui	a4, a2, 23, 8
1540	addi	a4, a4, -0x7f
1541	bgei	a4, 32, .Lfixunssfsi_maxint
1542	bltz	a4, .Lfixunssfsi_zero
1543
1544	/* Add explicit "1.0" and shift << 8.  */
1545	or	a7, a2, a6
1546	slli	a5, a7, 8
1547
1548	/* Shift back to the right, based on the exponent.  */
1549	addi	a4, a4, 1
1550	beqi	a4, 32, .Lfixunssfsi_bigexp
1551	ssl	a4		/* shift by 32 - a4 */
1552	srl	a5, a5
1553
1554	/* Negate the result if sign != 0.  */
1555	neg	a2, a5
1556	movgez	a2, a5, a7
1557	leaf_return
1558
1559.Lfixunssfsi_nan_or_inf:
1560	/* Handle Infinity and NaN.  */
1561	slli	a4, a2, 9
1562	beqz	a4, .Lfixunssfsi_maxint
1563
1564	/* Translate NaN to 0xffffffff.  */
1565	movi	a2, -1
1566	leaf_return
1567
1568.Lfixunssfsi_maxint:
1569	slli	a4, a6, 8	/* 0x80000000 */
1570	movi	a5, -1		/* 0xffffffff */
1571	movgez	a4, a5, a2
1572	mov	a2, a4
1573	leaf_return
1574
1575.Lfixunssfsi_zero:
1576	movi	a2, 0
1577	leaf_return
1578
1579.Lfixunssfsi_bigexp:
1580	/* Handle unsigned maximum exponent case.  */
1581	bltz	a2, 1f
1582	mov	a2, a5		/* no shift needed */
1583	leaf_return
1584
1585	/* Return 0x80000000 if negative.  */
15861:	slli	a2, a6, 8
1587	leaf_return
1588
1589#endif /* L_fixunssfsi */
1590
1591#ifdef L_fixunssfdi
1592
1593	.align	4
1594	.global	__fixunssfdi
1595	.type	__fixunssfdi, @function
1596__fixunssfdi:
1597	leaf_entry sp, 16
1598
1599	/* Check for NaN and Infinity.  */
1600	movi	a6, 0x7f800000
1601	ball	a2, a6, .Lfixunssfdi_nan_or_inf
1602
1603	/* Extract the exponent and check if 0 <= (exp - 0x7f) < 64.  */
1604	extui	a4, a2, 23, 8
1605	addi	a4, a4, -0x7f
1606	bgei	a4, 64, .Lfixunssfdi_maxint
1607	bltz	a4, .Lfixunssfdi_zero
1608
1609	/* Add explicit "1.0" and shift << 8.  */
1610	or	a7, a2, a6
1611	slli	xh, a7, 8
1612
1613	/* Shift back to the right, based on the exponent.  */
1614	addi	a4, a4, 1
1615	beqi	a4, 64, .Lfixunssfdi_bigexp
1616	ssl	a4		/* shift by 64 - a4 */
1617	bgei	a4, 32, .Lfixunssfdi_smallshift
1618	srl	xl, xh
1619	movi	xh, 0
1620
1621.Lfixunssfdi_shifted:
1622	/* Negate the result if sign != 0.  */
1623	bgez	a7, 1f
1624	neg	xl, xl
1625	neg	xh, xh
1626	beqz	xl, 1f
1627	addi	xh, xh, -1
16281:	leaf_return
1629
1630.Lfixunssfdi_smallshift:
1631	movi	xl, 0
1632	src	xl, xh, xl
1633	srl	xh, xh
1634	j	.Lfixunssfdi_shifted
1635
1636.Lfixunssfdi_nan_or_inf:
1637	/* Handle Infinity and NaN.  */
1638	slli	a4, a2, 9
1639	beqz	a4, .Lfixunssfdi_maxint
1640
1641	/* Translate NaN to 0xffffffff.... */
16421:	movi	xh, -1
1643	movi	xl, -1
1644	leaf_return
1645
1646.Lfixunssfdi_maxint:
1647	bgez	a2, 1b
16482:	slli	xh, a6, 8	/* 0x80000000 */
1649	movi	xl, 0
1650	leaf_return
1651
1652.Lfixunssfdi_zero:
1653	movi	xh, 0
1654	movi	xl, 0
1655	leaf_return
1656
1657.Lfixunssfdi_bigexp:
1658	/* Handle unsigned maximum exponent case.  */
1659	bltz	a7, 2b
1660	movi	xl, 0
1661	leaf_return		/* no shift needed */
1662
1663#endif /* L_fixunssfdi */
1664
1665#ifdef L_floatsisf
1666
1667	.align	4
1668	.global	__floatunsisf
1669	.type	__floatunsisf, @function
1670__floatunsisf:
1671	leaf_entry sp, 16
1672	beqz	a2, .Lfloatsisf_return
1673
1674	/* Set the sign to zero and jump to the floatsisf code.  */
1675	movi	a7, 0
1676	j	.Lfloatsisf_normalize
1677
1678	.align	4
1679	.global	__floatsisf
1680	.type	__floatsisf, @function
1681__floatsisf:
1682	leaf_entry sp, 16
1683
1684	/* Check for zero.  */
1685	beqz	a2, .Lfloatsisf_return
1686
1687	/* Save the sign.  */
1688	extui	a7, a2, 31, 1
1689
1690	/* Get the absolute value.  */
1691#if XCHAL_HAVE_ABS
1692	abs	a2, a2
1693#else
1694	neg	a4, a2
1695	movltz	a2, a4, a2
1696#endif
1697
1698.Lfloatsisf_normalize:
1699	/* Normalize with the first 1 bit in the msb.  */
1700	do_nsau	a4, a2, a5, a6
1701	ssl	a4
1702	sll	a5, a2
1703
1704	/* Shift the mantissa into position, with rounding bits in a6.  */
1705	srli	a2, a5, 8
1706	slli	a6, a5, (32 - 8)
1707
1708	/* Set the exponent.  */
1709	movi	a5, 0x9d	/* 0x7e + 31 */
1710	sub	a5, a5, a4
1711	slli	a5, a5, 23
1712	add	a2, a2, a5
1713
1714	/* Add the sign.  */
1715	slli	a7, a7, 31
1716	or	a2, a2, a7
1717
1718	/* Round up if the leftover fraction is >= 1/2.  */
1719	bgez	a6, .Lfloatsisf_return
1720	addi	a2, a2, 1	/* Overflow to the exponent is OK.  */
1721
1722	/* Check if the leftover fraction is exactly 1/2.  */
1723	slli	a6, a6, 1
1724	beqz	a6, .Lfloatsisf_exactlyhalf
1725
1726.Lfloatsisf_return:
1727	leaf_return
1728
1729.Lfloatsisf_exactlyhalf:
1730	/* Round down to the nearest even value.  */
1731	srli	a2, a2, 1
1732	slli	a2, a2, 1
1733	leaf_return
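
/* __floatsisf normalizes the absolute value so that its leading 1 ends up
   in the most significant bit, derives the exponent from the shift count,
   and rounds to nearest even.  A C sketch (illustrative only; __builtin_clz
   stands in for the do_nsau macro):

     #include <stdint.h>

     static uint32_t
     floatsisf (int32_t i)
     {
       if (i == 0)
         return 0;
       uint32_t sign = (i < 0) ? 0x80000000u : 0;
       uint32_t mag = (i < 0) ? 0u - (uint32_t) i : (uint32_t) i;
       uint32_t shift = __builtin_clz (mag);
       mag <<= shift;                          // leading 1 in the msb
       // The implicit "1.0" at bit 23 bumps the exponent field by one when
       // the mantissa is added, just as in the assembly above.
       uint32_t result = sign | (((0x9du - shift) << 23) + (mag >> 8));
       uint32_t leftover = mag << 24;          // the 8 bits shifted out
       if (leftover & 0x80000000u)             // round to nearest, ties to
         {                                     // even; a carry into the
           result += 1;                        // exponent field is fine
           if ((leftover << 1) == 0)
             result &= ~1u;
         }
       return result;
     }
*/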
1734
1735#endif /* L_floatsisf */
1736
1737#ifdef L_floatdisf
1738
1739	.align	4
1740	.global	__floatundisf
1741	.type	__floatundisf, @function
1742__floatundisf:
1743	leaf_entry sp, 16
1744
1745	/* Check for zero.  */
1746	or	a4, xh, xl
1747	beqz	a4, 2f
1748
1749	/* Set the sign to zero and jump to the floatdisf code.  */
1750	movi	a7, 0
1751	j	.Lfloatdisf_normalize
1752
1753	.align	4
1754	.global	__floatdisf
1755	.type	__floatdisf, @function
1756__floatdisf:
1757	leaf_entry sp, 16
1758
1759	/* Check for zero.  */
1760	or	a4, xh, xl
1761	beqz	a4, 2f
1762
1763	/* Save the sign.  */
1764	extui	a7, xh, 31, 1
1765
1766	/* Get the absolute value.  */
1767	bgez	xh, .Lfloatdisf_normalize
1768	neg	xl, xl
1769	neg	xh, xh
1770	beqz	xl, .Lfloatdisf_normalize
1771	addi	xh, xh, -1
1772
1773.Lfloatdisf_normalize:
1774	/* Normalize with the first 1 bit in the msb of xh.  */
1775	beqz	xh, .Lfloatdisf_bigshift
1776	do_nsau	a4, xh, a5, a6
1777	ssl	a4
1778	src	xh, xh, xl
1779	sll	xl, xl
1780
1781.Lfloatdisf_shifted:
1782	/* Shift the mantissa into position, with rounding bits in a6.  */
1783	ssai	8
1784	sll	a5, xl
1785	src	a6, xh, xl
1786	srl	xh, xh
1787	beqz	a5, 1f
1788	movi	a5, 1
1789	or	a6, a6, a5
17901:
1791	/* Set the exponent.  */
1792	movi	a5, 0xbd	/* 0x7e + 63 */
1793	sub	a5, a5, a4
1794	slli	a5, a5, 23
1795	add	a2, xh, a5
1796
1797	/* Add the sign.  */
1798	slli	a7, a7, 31
1799	or	a2, a2, a7
1800
1801	/* Round up if the leftover fraction is >= 1/2.  */
1802	bgez	a6, 2f
1803	addi	a2, a2, 1	/* Overflow to the exponent is OK.  */
1804
1805	/* Check if the leftover fraction is exactly 1/2.  */
1806	slli	a6, a6, 1
1807	beqz	a6, .Lfloatdisf_exactlyhalf
18082:	leaf_return
1809
1810.Lfloatdisf_bigshift:
1811	/* xh is zero.  Normalize with first 1 bit of xl in the msb of xh.  */
1812	do_nsau	a4, xl, a5, a6
1813	ssl	a4
1814	sll	xh, xl
1815	movi	xl, 0
1816	addi	a4, a4, 32
1817	j	.Lfloatdisf_shifted
1818
1819.Lfloatdisf_exactlyhalf:
1820	/* Round down to the nearest even value.  */
1821	srli	a2, a2, 1
1822	slli	a2, a2, 1
1823	leaf_return
1824
1825#endif /* L_floatdisf */
1826
1827#if XCHAL_HAVE_FP_SQRT
1828#ifdef L_sqrtf
1829	/* Square root */
1830
1831	.align	4
1832	.global	__ieee754_sqrtf
1833	.type	__ieee754_sqrtf, @function
1834__ieee754_sqrtf:
1835	leaf_entry	sp, 16
1836
1837	wfr		f1, a2
1838
1839	sqrt0.s		f2, f1
1840	const.s		f3, 0
1841	maddn.s		f3, f2, f2
1842	nexp01.s	f4, f1
1843	const.s		f0, 3
1844	addexp.s	f4, f0
1845	maddn.s		f0, f3, f4
1846	nexp01.s	f3, f1
1847	neg.s		f5, f3
1848	maddn.s		f2, f0, f2
1849	const.s		f0, 0
1850	const.s		f6, 0
1851	const.s		f7, 0
1852	maddn.s		f0, f5, f2
1853	maddn.s		f6, f2, f4
1854	const.s		f4, 3
1855	maddn.s		f7, f4, f2
1856	maddn.s		f3, f0, f0
1857	maddn.s		f4, f6, f2
1858	neg.s		f2, f7
1859	maddn.s		f0, f3, f2
1860	maddn.s		f7, f4, f7
1861	mksadj.s	f2, f1
1862	nexp01.s	f1, f1
1863	maddn.s		f1, f0, f0
1864	neg.s		f3, f7
1865	addexpm.s	f0, f2
1866	addexp.s	f3, f2
1867	divn.s		f0, f1, f3
1868
1869	rfr		a2, f0
1870
1871	leaf_return
1872
1873#endif /* L_sqrtf */
1874#endif /* XCHAL_HAVE_FP_SQRT */
1875
1876#if XCHAL_HAVE_FP_RECIP
1877#ifdef L_recipsf2
1878	/* Reciprocal */
1879
1880	.align	4
1881	.global	__recipsf2
1882	.type	__recipsf2, @function
1883__recipsf2:
1884	leaf_entry	sp, 16
1885
1886	wfr		f1, a2
1887
1888	recip0.s	f0, f1
1889	const.s		f2, 1
1890	msub.s		f2, f1, f0
1891	maddn.s		f0, f0, f2
1892	const.s		f2, 1
1893	msub.s		f2, f1, f0
1894	maddn.s		f0, f0, f2
1895
1896	rfr		a2, f0
1897
1898	leaf_return
1899
1900#endif /* L_recipsf2 */
1901#endif /* XCHAL_HAVE_FP_RECIP */
1902
1903#if XCHAL_HAVE_FP_RSQRT
1904#ifdef L_rsqrtsf2
1905	/* Reciprocal square root */
1906
1907	.align	4
1908	.global	__rsqrtsf2
1909	.type	__rsqrtsf2, @function
1910__rsqrtsf2:
1911	leaf_entry	sp, 16
1912
1913	wfr		f1, a2
1914
1915	rsqrt0.s	f0, f1
1916	mul.s		f2, f1, f0
	const.s		f3, 3
1918	mul.s		f4, f3, f0
1919	const.s		f5, 1
1920	msub.s		f5, f2, f0
1921	maddn.s		f0, f4, f5
1922	mul.s		f2, f1, f0
1923	mul.s		f1, f3, f0
1924	const.s		f3, 1
1925	msub.s		f3, f2, f0
1926	maddn.s		f0, f1, f3
1927
1928	rfr		a2, f0
1929
1930	leaf_return
1931
1932#endif /* L_rsqrtsf2 */
1933#endif /* XCHAL_HAVE_FP_RSQRT */
1934