1/* IEEE-754 double-precision functions for Xtensa
2   Copyright (C) 2006 Free Software Foundation, Inc.
3   Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
4
5   This file is part of GCC.
6
7   GCC is free software; you can redistribute it and/or modify it
8   under the terms of the GNU General Public License as published by
9   the Free Software Foundation; either version 2, or (at your option)
10   any later version.
11
12   In addition to the permissions in the GNU General Public License,
13   the Free Software Foundation gives you unlimited permission to link
14   the compiled version of this file into combinations with other
15   programs, and to distribute those combinations without any
16   restriction coming from the use of this file.  (The General Public
17   License restrictions do apply in other respects; for example, they
18   cover modification of the file, and distribution when not linked
19   into a combine executable.)
20
21   GCC is distributed in the hope that it will be useful, but WITHOUT
22   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
23   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
24   License for more details.
25
26   You should have received a copy of the GNU General Public License
27   along with GCC; see the file COPYING.  If not, write to the Free
28   Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
29   02110-1301, USA.  */
30
31#ifdef __XTENSA_EB__
32#define xh a2
33#define xl a3
34#define yh a4
35#define yl a5
36#else
37#define xh a3
38#define xl a2
39#define yh a5
40#define yl a4
41#endif
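	/* A rough C picture of the word ordering these macros encode
	   (an illustrative sketch only; the union and field names are
	   ad-hoc and not part of this file):

	       #include <stdint.h>

	       union double_words {
	           double d;
	           struct {
	       #ifdef __XTENSA_EB__
	               uint32_t hi, lo;   // big endian: sign/exponent word first
	       #else
	               uint32_t lo, hi;   // little endian: low mantissa word first
	       #endif
	           } w;
	       };

	   The first double argument arrives in a2/a3 and the second in a4/a5,
	   so xh/yh always name whichever register of each pair holds the
	   sign/exponent word and xl/yl the low mantissa word.  */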
42
43/*  Warning!  The branch displacements for some Xtensa branch instructions
44    are quite small, and this code has been carefully laid out to keep
45    branch targets in range.  If you change anything, be sure to check that
46    the assembler is not relaxing anything to branch over a jump.  */
47
48#ifdef L_negdf2
49
50	.align	4
51	.global	__negdf2
52	.type	__negdf2, @function
53__negdf2:
54	leaf_entry sp, 16
55	movi	a4, 0x80000000
56	xor	xh, xh, a4
57	leaf_return
58
59#endif /* L_negdf2 */
60
61#ifdef L_addsubdf3
62
63	/* Addition */
64__adddf3_aux:
65
66	/* Handle NaNs and Infinities.  (This code is placed before the
67	   start of the function just to keep it in range of the limited
68	   branch displacements.)  */
69
70.Ladd_xnan_or_inf:
71	/* If y is neither Infinity nor NaN, return x.  */
72	bnall	yh, a6, 1f
73	/* If x is a NaN, return it.  Otherwise, return y.  */
74	slli	a7, xh, 12
75	or	a7, a7, xl
76	beqz	a7, .Ladd_ynan_or_inf
771:	leaf_return
78
79.Ladd_ynan_or_inf:
80	/* Return y.  */
81	mov	xh, yh
82	mov	xl, yl
83	leaf_return
84
85.Ladd_opposite_signs:
86	/* Operand signs differ.  Do a subtraction.  */
87	slli	a7, a6, 11
88	xor	yh, yh, a7
89	j	.Lsub_same_sign
90
91	.align	4
92	.global	__adddf3
93	.type	__adddf3, @function
94__adddf3:
95	leaf_entry sp, 16
96	movi	a6, 0x7ff00000
97
98	/* Check if the two operands have the same sign.  */
99	xor	a7, xh, yh
100	bltz	a7, .Ladd_opposite_signs
101
102.Ladd_same_sign:
103	/* Check if either exponent == 0x7ff (i.e., NaN or Infinity).  */
104	ball	xh, a6, .Ladd_xnan_or_inf
105	ball	yh, a6, .Ladd_ynan_or_inf
106
107	/* Compare the exponents.  The smaller operand will be shifted
108	   right by the exponent difference and added to the larger
109	   one.  */
110	extui	a7, xh, 20, 12
111	extui	a8, yh, 20, 12
112	bltu	a7, a8, .Ladd_shiftx
113
114.Ladd_shifty:
115	/* Check if the smaller (or equal) exponent is zero.  */
116	bnone	yh, a6, .Ladd_yexpzero
117
118	/* Replace yh sign/exponent with 0x001.  */
119	or	yh, yh, a6
120	slli	yh, yh, 11
121	srli	yh, yh, 11
122
123.Ladd_yexpdiff:
124	/* Compute the exponent difference.  Optimize for difference < 32.  */
125	sub	a10, a7, a8
126	bgeui	a10, 32, .Ladd_bigshifty
127
128	/* Shift yh/yl right by the exponent difference.  Any bits that are
129	   shifted out of yl are saved in a9 for rounding the result.  */
130	ssr	a10
131	movi	a9, 0
132	src	a9, yl, a9
133	src	yl, yh, yl
134	srl	yh, yh
135
136.Ladd_addy:
137	/* Do the 64-bit addition.  */
138	add	xl, xl, yl
139	add	xh, xh, yh
140	bgeu	xl, yl, 1f
141	addi	xh, xh, 1
1421:
143	/* Check if the add overflowed into the exponent.  */
144	extui	a10, xh, 20, 12
145	beq	a10, a7, .Ladd_round
146	mov	a8, a7
147	j	.Ladd_carry
148
149.Ladd_yexpzero:
150	/* y is a subnormal value.  Replace its sign/exponent with zero,
151	   i.e., no implicit "1.0", and increment the apparent exponent
152	   because subnormals behave as if they had the minimum (nonzero)
153	   exponent.  Test for the case when both exponents are zero.  */
154	slli	yh, yh, 12
155	srli	yh, yh, 12
156	bnone	xh, a6, .Ladd_bothexpzero
157	addi	a8, a8, 1
158	j	.Ladd_yexpdiff
159
160.Ladd_bothexpzero:
161	/* Both exponents are zero.  Handle this as a special case.  There
162	   is no need to shift or round, and the normal code for handling
163	   a carry into the exponent field will not work because it
164	   assumes there is an implicit "1.0" that needs to be added.  */
165	add	xl, xl, yl
166	add	xh, xh, yh
167	bgeu	xl, yl, 1f
168	addi	xh, xh, 1
1691:	leaf_return
170
171.Ladd_bigshifty:
172	/* Exponent difference >= 64 -- just return the bigger value.  */
173	bgeui	a10, 64, 1b
174
175	/* Shift yh/yl right by the exponent difference.  Any bits that are
176	   shifted out are saved in a9 for rounding the result.  */
177	ssr	a10
178	sll	a11, yl		/* lost bits shifted out of yl */
179	src	a9, yh, yl
180	srl	yl, yh
181	movi	yh, 0
182	beqz	a11, .Ladd_addy
183	or	a9, a9, a10	/* any positive, nonzero value will work */
184	j	.Ladd_addy
185
186.Ladd_xexpzero:
187	/* Same as "yexpzero" except skip handling the case when both
188	   exponents are zero.  */
189	slli	xh, xh, 12
190	srli	xh, xh, 12
191	addi	a7, a7, 1
192	j	.Ladd_xexpdiff
193
194.Ladd_shiftx:
195	/* Same thing as the "shifty" code, but with x and y swapped.  Also,
196	   because the exponent difference is always nonzero in this version,
197	   the shift sequence can use SLL and skip loading a constant zero.  */
198	bnone	xh, a6, .Ladd_xexpzero
199
200	or	xh, xh, a6
201	slli	xh, xh, 11
202	srli	xh, xh, 11
203
204.Ladd_xexpdiff:
205	sub	a10, a8, a7
206	bgeui	a10, 32, .Ladd_bigshiftx
207
208	ssr	a10
209	sll	a9, xl
210	src	xl, xh, xl
211	srl	xh, xh
212
213.Ladd_addx:
214	add	xl, xl, yl
215	add	xh, xh, yh
216	bgeu	xl, yl, 1f
217	addi	xh, xh, 1
2181:
219	/* Check if the add overflowed into the exponent.  */
220	extui	a10, xh, 20, 12
221	bne	a10, a8, .Ladd_carry
222
223.Ladd_round:
224	/* Round up if the leftover fraction is >= 1/2.  */
225	bgez	a9, 1f
226	addi	xl, xl, 1
227	beqz	xl, .Ladd_roundcarry
228
229	/* Check if the leftover fraction is exactly 1/2.  */
230	slli	a9, a9, 1
231	beqz	a9, .Ladd_exactlyhalf
2321:	leaf_return
233
234.Ladd_bigshiftx:
235	/* Mostly the same thing as "bigshifty"....  */
236	bgeui	a10, 64, .Ladd_returny
237
238	ssr	a10
239	sll	a11, xl
240	src	a9, xh, xl
241	srl	xl, xh
242	movi	xh, 0
243	beqz	a11, .Ladd_addx
244	or	a9, a9, a10
245	j	.Ladd_addx
246
247.Ladd_returny:
248	mov	xh, yh
249	mov	xl, yl
250	leaf_return
251
252.Ladd_carry:
253	/* The addition has overflowed into the exponent field, so the
254	   value needs to be renormalized.  The mantissa of the result
255	   can be recovered by subtracting the original exponent and
256	   adding 0x100000 (which is the explicit "1.0" for the
257	   mantissa of the non-shifted operand -- the "1.0" for the
258	   shifted operand was already added).  The mantissa can then
259	   be shifted right by one bit.  The explicit "1.0" of the
260	   shifted mantissa then needs to be replaced by the exponent,
261	   incremented by one to account for the normalizing shift.
262	   It is faster to combine these operations: do the shift first
263	   and combine the additions and subtractions.  If x is the
264	   original exponent, the result is:
265	       shifted mantissa - (x << 19) + (1 << 19) + (x << 20)
266	   or:
267	       shifted mantissa + ((x + 1) << 19)
268	   Note that the exponent is incremented here by leaving the
269	   explicit "1.0" of the mantissa in the exponent field.  */
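	/* The identity above can be checked with a small C sketch
	   (illustrative only; "hi" is the packed high word after the 64-bit
	   add, "exp" the original 12-bit sign+exponent field, and only the
	   high word of the 64-bit shift is modeled):

	       #include <assert.h>
	       #include <stdint.h>

	       uint32_t repack_after_carry(uint32_t hi, uint32_t exp)
	       {
	           // Straightforward form: strip the old exponent, restore the
	           // implicit 1.0 of the unshifted operand, halve, and repack so
	           // the sum's leading 1 bumps the exponent field by one.
	           uint32_t sum  = hi - (exp << 20) + 0x100000;
	           uint32_t slow = (exp << 20) + (sum >> 1);

	           // Combined form used below: one shift and one add.
	           uint32_t fast = (hi >> 1) + ((exp + 1) << 19);

	           assert(slow == fast);
	           return fast;
	       }  */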
270
271	/* Shift xh/xl right by one bit.  Save the lsb of xl.  */
272	mov	a10, xl
273	ssai	1
274	src	xl, xh, xl
275	srl	xh, xh
276
277	/* See explanation above.  The original exponent is in a8.  */
278	addi	a8, a8, 1
279	slli	a8, a8, 19
280	add	xh, xh, a8
281
282	/* Return an Infinity if the exponent overflowed.  */
283	ball	xh, a6, .Ladd_infinity
284
285	/* Same thing as the "round" code except the msb of the leftover
286	   fraction is bit 0 of a10, with the rest of the fraction in a9.  */
287	bbci.l	a10, 0, 1f
288	addi	xl, xl, 1
289	beqz	xl, .Ladd_roundcarry
290	beqz	a9, .Ladd_exactlyhalf
2911:	leaf_return
292
293.Ladd_infinity:
294	/* Clear the mantissa.  */
295	movi	xl, 0
296	srli	xh, xh, 20
297	slli	xh, xh, 20
298
299	/* The sign bit may have been lost in a carry-out.  Put it back.  */
300	slli	a8, a8, 1
301	or	xh, xh, a8
302	leaf_return
303
304.Ladd_exactlyhalf:
305	/* Round down to the nearest even value.  */
306	srli	xl, xl, 1
307	slli	xl, xl, 1
308	leaf_return
309
310.Ladd_roundcarry:
311	/* xl is always zero when the rounding increment overflows, so
312	   there's no need to round it to an even value.  */
313	addi	xh, xh, 1
314	/* Overflow to the exponent is OK.  */
315	leaf_return
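	/* A C sketch of the round-to-nearest-even rule used above
	   (illustrative only): the bits shifted out of the mantissa are kept
	   in a single 32-bit word whose msb is the "round" bit and whose
	   remaining bits act as "sticky" bits.

	       #include <stdint.h>

	       uint64_t round_nearest_even(uint64_t mant, uint32_t leftover)
	       {
	           if (leftover & 0x80000000u) {       // leftover >= 1/2 ulp
	               mant += 1;
	               if ((leftover << 1) == 0)       // exactly 1/2: a tie
	                   mant &= ~(uint64_t)1;       // round to the even value
	           }
	           return mant;
	       }

	   When the increment carries out of the low word, .Ladd_roundcarry
	   propagates it into the high word; a further carry into the
	   exponent field is still correct.  */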
316
317
318	/* Subtraction */
319__subdf3_aux:
320
321	/* Handle NaNs and Infinities.  (This code is placed before the
322	   start of the function just to keep it in range of the limited
323	   branch displacements.)  */
324
325.Lsub_xnan_or_inf:
326	/* If y is neither Infinity nor NaN, return x.  */
327	bnall	yh, a6, 1f
328	/* Both x and y are either NaN or Inf, so the result is NaN.  */
329	movi	a4, 0x80000	/* make it a quiet NaN */
330	or	xh, xh, a4
3311:	leaf_return
332
333.Lsub_ynan_or_inf:
334	/* Negate y and return it.  */
335	slli	a7, a6, 11
336	xor	xh, yh, a7
337	mov	xl, yl
338	leaf_return
339
340.Lsub_opposite_signs:
341	/* Operand signs differ.  Do an addition.  */
342	slli	a7, a6, 11
343	xor	yh, yh, a7
344	j	.Ladd_same_sign
345
346	.align	4
347	.global	__subdf3
348	.type	__subdf3, @function
349__subdf3:
350	leaf_entry sp, 16
351	movi	a6, 0x7ff00000
352
353	/* Check if the two operands have the same sign.  */
354	xor	a7, xh, yh
355	bltz	a7, .Lsub_opposite_signs
356
357.Lsub_same_sign:
358	/* Check if either exponent == 0x7ff (i.e., NaN or Infinity).  */
359	ball	xh, a6, .Lsub_xnan_or_inf
360	ball	yh, a6, .Lsub_ynan_or_inf
361
362	/* Compare the operands.  In contrast to addition, the entire
363	   value matters here.  */
364	extui	a7, xh, 20, 11
365	extui	a8, yh, 20, 11
366	bltu	xh, yh, .Lsub_xsmaller
367	beq	xh, yh, .Lsub_compare_low
368
369.Lsub_ysmaller:
370	/* Check if the smaller (or equal) exponent is zero.  */
371	bnone	yh, a6, .Lsub_yexpzero
372
373	/* Replace yh sign/exponent with 0x001.  */
374	or	yh, yh, a6
375	slli	yh, yh, 11
376	srli	yh, yh, 11
377
378.Lsub_yexpdiff:
379	/* Compute the exponent difference.  Optimize for difference < 32.  */
380	sub	a10, a7, a8
381	bgeui	a10, 32, .Lsub_bigshifty
382
383	/* Shift yh/yl right by the exponent difference.  Any bits that are
384	   shifted out of yl are saved in a9 for rounding the result.  */
385	ssr	a10
386	movi	a9, 0
387	src	a9, yl, a9
388	src	yl, yh, yl
389	srl	yh, yh
390
391.Lsub_suby:
392	/* Do the 64-bit subtraction.  */
393	sub	xh, xh, yh
394	bgeu	xl, yl, 1f
395	addi	xh, xh, -1
3961:	sub	xl, xl, yl
397
398	/* Subtract the leftover bits in a9 from zero and propagate any
399	   borrow from xh/xl.  */
400	neg	a9, a9
401	beqz	a9, 1f
402	addi	a5, xh, -1
403	moveqz	xh, a5, xl
404	addi	xl, xl, -1
4051:
406	/* Check if the subtract underflowed into the exponent.  */
407	extui	a10, xh, 20, 11
408	beq	a10, a7, .Lsub_round
409	j	.Lsub_borrow
410
411.Lsub_compare_low:
412	/* The high words are equal.  Compare the low words.  */
413	bltu	xl, yl, .Lsub_xsmaller
414	bltu	yl, xl, .Lsub_ysmaller
415	/* The operands are equal.  Return 0.0.  */
416	movi	xh, 0
417	movi	xl, 0
4181:	leaf_return
419
420.Lsub_yexpzero:
421	/* y is a subnormal value.  Replace its sign/exponent with zero,
422	   i.e., no implicit "1.0".  Unless x is also a subnormal, increment
423	   y's apparent exponent because subnormals behave as if they had
424	   the minimum (nonzero) exponent.  */
425	slli	yh, yh, 12
426	srli	yh, yh, 12
427	bnone	xh, a6, .Lsub_yexpdiff
428	addi	a8, a8, 1
429	j	.Lsub_yexpdiff
430
431.Lsub_bigshifty:
432	/* Exponent difference >= 64 -- just return the bigger value.  */
433	bgeui	a10, 64, 1b
434
435	/* Shift yh/yl right by the exponent difference.  Any bits that are
436	   shifted out are saved in a9 for rounding the result.  */
437	ssr	a10
438	sll	a11, yl		/* lost bits shifted out of yl */
439	src	a9, yh, yl
440	srl	yl, yh
441	movi	yh, 0
442	beqz	a11, .Lsub_suby
443	or	a9, a9, a10	/* any positive, nonzero value will work */
444	j	.Lsub_suby
445
446.Lsub_xsmaller:
447	/* Same thing as the "ysmaller" code, but with x and y swapped and
448	   with y negated.  */
449	bnone	xh, a6, .Lsub_xexpzero
450
451	or	xh, xh, a6
452	slli	xh, xh, 11
453	srli	xh, xh, 11
454
455.Lsub_xexpdiff:
456	sub	a10, a8, a7
457	bgeui	a10, 32, .Lsub_bigshiftx
458
459	ssr	a10
460	movi	a9, 0
461	src	a9, xl, a9
462	src	xl, xh, xl
463	srl	xh, xh
464
465	/* Negate y.  */
466	slli	a11, a6, 11
467	xor	yh, yh, a11
468
469.Lsub_subx:
470	sub	xl, yl, xl
471	sub	xh, yh, xh
472	bgeu	yl, xl, 1f
473	addi	xh, xh, -1
4741:
475	/* Subtract the leftover bits in a9 from zero and propagate any
476	   borrow from xh/xl.  */
477	neg	a9, a9
478	beqz	a9, 1f
479	addi	a5, xh, -1
480	moveqz	xh, a5, xl
481	addi	xl, xl, -1
4821:
483	/* Check if the subtract underflowed into the exponent.  */
484	extui	a10, xh, 20, 11
485	bne	a10, a8, .Lsub_borrow
486
487.Lsub_round:
488	/* Round up if the leftover fraction is >= 1/2.  */
489	bgez	a9, 1f
490	addi	xl, xl, 1
491	beqz	xl, .Lsub_roundcarry
492
493	/* Check if the leftover fraction is exactly 1/2.  */
494	slli	a9, a9, 1
495	beqz	a9, .Lsub_exactlyhalf
4961:	leaf_return
497
498.Lsub_xexpzero:
499	/* Same as "yexpzero".  */
500	slli	xh, xh, 12
501	srli	xh, xh, 12
502	bnone	yh, a6, .Lsub_xexpdiff
503	addi	a7, a7, 1
504	j	.Lsub_xexpdiff
505
506.Lsub_bigshiftx:
507	/* Mostly the same thing as "bigshifty", but with the sign bit of the
508	   shifted value set so that the subsequent subtraction flips the
509	   sign of y.  */
510	bgeui	a10, 64, .Lsub_returny
511
512	ssr	a10
513	sll	a11, xl
514	src	a9, xh, xl
515	srl	xl, xh
516	slli	xh, a6, 11	/* set sign bit of xh */
517	beqz	a11, .Lsub_subx
518	or	a9, a9, a10
519	j	.Lsub_subx
520
521.Lsub_returny:
522	/* Negate and return y.  */
523	slli	a7, a6, 11
524	xor	xh, yh, a7
525	mov	xl, yl
526	leaf_return
527
528.Lsub_borrow:
529	/* The subtraction has underflowed into the exponent field, so the
530	   value needs to be renormalized.  Shift the mantissa left as
531	   needed to remove any leading zeros and adjust the exponent
532	   accordingly.  If the exponent is not large enough to remove
533	   all the leading zeros, the result will be a subnormal value.  */
534
535	slli	a8, xh, 12
536	beqz	a8, .Lsub_xhzero
537	do_nsau	a6, a8, a7, a11
538	srli	a8, a8, 12
539	bge	a6, a10, .Lsub_subnormal
540	addi	a6, a6, 1
541
542.Lsub_shift_lt32:
543	/* Shift the mantissa (a8/xl/a9) left by a6.  */
544	ssl	a6
545	src	a8, a8, xl
546	src	xl, xl, a9
547	sll	a9, a9
548
549	/* Combine the shifted mantissa with the sign and exponent,
550	   decrementing the exponent by a6.  (The exponent has already
551	   been decremented by one due to the borrow from the subtraction,
552	   but adding the mantissa will increment the exponent by one.)  */
553	srli	xh, xh, 20
554	sub	xh, xh, a6
555	slli	xh, xh, 20
556	add	xh, xh, a8
557	j	.Lsub_round
558
559.Lsub_exactlyhalf:
560	/* Round down to the nearest even value.  */
561	srli	xl, xl, 1
562	slli	xl, xl, 1
563	leaf_return
564
565.Lsub_roundcarry:
566	/* xl is always zero when the rounding increment overflows, so
567	   there's no need to round it to an even value.  */
568	addi	xh, xh, 1
569	/* Overflow to the exponent is OK.  */
570	leaf_return
571
572.Lsub_xhzero:
573	/* When normalizing the result, all the mantissa bits in the high
574	   word are zero.  Shift by "20 + (leading zero count of xl) + 1".  */
575	do_nsau	a6, xl, a7, a11
576	addi	a6, a6, 21
577	blt	a10, a6, .Lsub_subnormal
578
579.Lsub_normalize_shift:
580	bltui	a6, 32, .Lsub_shift_lt32
581
582	ssl	a6
583	src	a8, xl, a9
584	sll	xl, a9
585	movi	a9, 0
586
587	srli	xh, xh, 20
588	sub	xh, xh, a6
589	slli	xh, xh, 20
590	add	xh, xh, a8
591	j	.Lsub_round
592
593.Lsub_subnormal:
594	/* The exponent is too small to shift away all the leading zeros.
595	   Set a6 to the current exponent (which has already been
596	   decremented by the borrow) so that the exponent of the result
597	   will be zero.  Do not add 1 to a6 in this case, because: (1)
598	   adding the mantissa will not increment the exponent, so there is
599	   no need to subtract anything extra from the exponent to
600	   compensate, and (2) the effective exponent of a subnormal is 1
601	   not 0 so the shift amount must be 1 smaller than normal. */
602	mov	a6, a10
603	j	.Lsub_normalize_shift
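	/* A C sketch of the renormalization strategy used by .Lsub_borrow
	   (illustrative only; exp is the biased exponent the result would
	   have without cancellation and mant the raw 53-bit difference, so
	   the off-by-one bookkeeping of the assembly is folded away):

	       #include <stdint.h>

	       void renormalize(uint64_t *mant, int32_t *exp)
	       {
	           if (*mant == 0) { *exp = 0; return; }   // exact zero
	           int shift = 0;
	           while (!(*mant >> 52)) {                // leading 1 belongs at bit 52
	               *mant <<= 1;
	               shift++;
	           }
	           if (shift >= *exp) {                    // not enough exponent range:
	               // produce a subnormal: exponent field 0, significand
	               // shifted left by exp-1 overall.
	               *mant >>= shift - (*exp - 1);
	               *exp = 0;
	           } else {
	               *exp -= shift;
	           }
	       }  */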
604
605#endif /* L_addsubdf3 */
606
607#ifdef L_muldf3
608
609	/* Multiplication */
610__muldf3_aux:
611
612	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
613	   (This code is placed before the start of the function just to
614	   keep it in range of the limited branch displacements.)  */
615
616.Lmul_xexpzero:
617	/* Clear the sign bit of x.  */
618	slli	xh, xh, 1
619	srli	xh, xh, 1
620
621	/* If x is zero, return zero.  */
622	or	a10, xh, xl
623	beqz	a10, .Lmul_return_zero
624
625	/* Normalize x.  Adjust the exponent in a8.  */
626	beqz	xh, .Lmul_xh_zero
627	do_nsau	a10, xh, a11, a12
628	addi	a10, a10, -11
629	ssl	a10
630	src	xh, xh, xl
631	sll	xl, xl
632	movi	a8, 1
633	sub	a8, a8, a10
634	j	.Lmul_xnormalized
635.Lmul_xh_zero:
636	do_nsau	a10, xl, a11, a12
637	addi	a10, a10, -11
638	movi	a8, -31
639	sub	a8, a8, a10
640	ssl	a10
641	bltz	a10, .Lmul_xl_srl
642	sll	xh, xl
643	movi	xl, 0
644	j	.Lmul_xnormalized
645.Lmul_xl_srl:
646	srl	xh, xl
647	sll	xl, xl
648	j	.Lmul_xnormalized
649
650.Lmul_yexpzero:
651	/* Clear the sign bit of y.  */
652	slli	yh, yh, 1
653	srli	yh, yh, 1
654
655	/* If y is zero, return zero.  */
656	or	a10, yh, yl
657	beqz	a10, .Lmul_return_zero
658
659	/* Normalize y.  Adjust the exponent in a9.  */
660	beqz	yh, .Lmul_yh_zero
661	do_nsau	a10, yh, a11, a12
662	addi	a10, a10, -11
663	ssl	a10
664	src	yh, yh, yl
665	sll	yl, yl
666	movi	a9, 1
667	sub	a9, a9, a10
668	j	.Lmul_ynormalized
669.Lmul_yh_zero:
670	do_nsau	a10, yl, a11, a12
671	addi	a10, a10, -11
672	movi	a9, -31
673	sub	a9, a9, a10
674	ssl	a10
675	bltz	a10, .Lmul_yl_srl
676	sll	yh, yl
677	movi	yl, 0
678	j	.Lmul_ynormalized
679.Lmul_yl_srl:
680	srl	yh, yl
681	sll	yl, yl
682	j	.Lmul_ynormalized
683
684.Lmul_return_zero:
685	/* Return zero with the appropriate sign bit.  */
686	srli	xh, a7, 31
687	slli	xh, xh, 31
688	movi	xl, 0
689	j	.Lmul_done
690
691.Lmul_xnan_or_inf:
692	/* If y is zero, return NaN.  */
693	bnez	yl, 1f
694	slli	a8, yh, 1
695	bnez	a8, 1f
696	movi	a4, 0x80000	/* make it a quiet NaN */
697	or	xh, xh, a4
698	j	.Lmul_done
6991:
700	/* If y is NaN, return y.  */
701	bnall	yh, a6, .Lmul_returnx
702	slli	a8, yh, 12
703	or	a8, a8, yl
704	beqz	a8, .Lmul_returnx
705
706.Lmul_returny:
707	mov	xh, yh
708	mov	xl, yl
709
710.Lmul_returnx:
711	/* Set the sign bit and return.  */
712	extui	a7, a7, 31, 1
713	slli	xh, xh, 1
714	ssai	1
715	src	xh, a7, xh
716	j	.Lmul_done
717
718.Lmul_ynan_or_inf:
719	/* If x is zero, return NaN.  */
720	bnez	xl, .Lmul_returny
721	slli	a8, xh, 1
722	bnez	a8, .Lmul_returny
723	movi	a7, 0x80000	/* make it a quiet NaN */
724	or	xh, yh, a7
725	j	.Lmul_done
726
727	.align	4
728	.global	__muldf3
729	.type	__muldf3, @function
730__muldf3:
731	leaf_entry sp, 32
732#if __XTENSA_CALL0_ABI__
733	addi	sp, sp, -32
734	s32i	a12, sp, 16
735	s32i	a13, sp, 20
736	s32i	a14, sp, 24
737	s32i	a15, sp, 28
738#endif
739	movi	a6, 0x7ff00000
740
741	/* Get the sign of the result.  */
742	xor	a7, xh, yh
743
744	/* Check for NaN and infinity.  */
745	ball	xh, a6, .Lmul_xnan_or_inf
746	ball	yh, a6, .Lmul_ynan_or_inf
747
748	/* Extract the exponents.  */
749	extui	a8, xh, 20, 11
750	extui	a9, yh, 20, 11
751
752	beqz	a8, .Lmul_xexpzero
753.Lmul_xnormalized:
754	beqz	a9, .Lmul_yexpzero
755.Lmul_ynormalized:
756
757	/* Add the exponents.  */
758	add	a8, a8, a9
759
760	/* Replace sign/exponent fields with explicit "1.0".  */
761	movi	a10, 0x1fffff
762	or	xh, xh, a6
763	and	xh, xh, a10
764	or	yh, yh, a6
765	and	yh, yh, a10
766
767	/* Multiply 64x64 to 128 bits.  The result ends up in xh/xl/a6.
768	   The least-significant word of the result is thrown away except
769	   that if it is nonzero, the lsb of a6 is set to 1.  */
770#if XCHAL_HAVE_MUL32_HIGH
771
772	/* Compute a6 with any carry-outs in a10.  */
773	movi	a10, 0
774	mull	a6, xl, yh
775	mull	a11, xh, yl
776	add	a6, a6, a11
777	bgeu	a6, a11, 1f
778	addi	a10, a10, 1
7791:
780	muluh	a11, xl, yl
781	add	a6, a6, a11
782	bgeu	a6, a11, 1f
783	addi	a10, a10, 1
7841:
785	/* If the low word of the result is nonzero, set the lsb of a6.  */
786	mull	a11, xl, yl
787	beqz	a11, 1f
788	movi	a9, 1
789	or	a6, a6, a9
7901:
791	/* Compute xl with any carry-outs in a9.  */
792	movi	a9, 0
793	mull	a11, xh, yh
794	add	a10, a10, a11
795	bgeu	a10, a11, 1f
796	addi	a9, a9, 1
7971:
798	muluh	a11, xh, yl
799	add	a10, a10, a11
800	bgeu	a10, a11, 1f
801	addi	a9, a9, 1
8021:
803	muluh	xl, xl, yh
804	add	xl, xl, a10
805	bgeu	xl, a10, 1f
806	addi	a9, a9, 1
8071:
808	/* Compute xh.  */
809	muluh	xh, xh, yh
810	add	xh, xh, a9
811
812#else
813
814	/* Break the inputs into 16-bit chunks and compute 16 32-bit partial
815	   products.  These partial products are:
816
817		0 xll * yll
818
819		1 xll * ylh
820		2 xlh * yll
821
822		3 xll * yhl
823		4 xlh * ylh
824		5 xhl * yll
825
826		6 xll * yhh
827		7 xlh * yhl
828		8 xhl * ylh
829		9 xhh * yll
830
831		10 xlh * yhh
832		11 xhl * yhl
833		12 xhh * ylh
834
835		13 xhl * yhh
836		14 xhh * yhl
837
838		15 xhh * yhh
839
840	   where the input chunks are (hh, hl, lh, ll).  If using the Mul16
841	   or Mul32 multiplier options, these input chunks must be stored in
842	   separate registers.  For Mac16, the UMUL.AA.* opcodes can specify
843	   that the inputs come from either half of the registers, so there
844	   is no need to shift them out ahead of time.  If there is no
845	   multiply hardware, the 16-bit chunks can be extracted when setting
846	   up the arguments to the separate multiply function.  */
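	/* As a reference for the partial-product schedule above, a plain C
	   sketch of what the whole sequence computes (illustrative only; it
	   leans on GCC's unsigned __int128 purely for brevity):

	       #include <stdint.h>

	       // Multiply two 64-bit significands, keep the top 96 bits of the
	       // 128-bit product, and fold the discarded low word into the lsb
	       // of the third result word as a sticky bit.
	       void mul64x64_top96(uint64_t x, uint64_t y,
	                           uint32_t *hi, uint32_t *mid, uint32_t *low)
	       {
	           unsigned __int128 p = (unsigned __int128)x * y;
	           *hi  = (uint32_t)(p >> 96);
	           *mid = (uint32_t)(p >> 64);
	           *low = (uint32_t)(p >> 32);
	           if ((uint32_t)p != 0)
	               *low |= 1;
	       }

	   Here hi, mid and low correspond to xh, xl and a6 in the code
	   below.  */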
847
848	/* Save a7 since it is needed to hold a temporary value.  */
849	s32i	a7, sp, 4
850#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
851	/* Calling a separate multiply function will clobber a0 and requires
852	   use of a8 as a temporary, so save those values now.  (The function
853	   uses a custom ABI so nothing else needs to be saved.)  */
854	s32i	a0, sp, 0
855	s32i	a8, sp, 8
856#endif
857
858#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32
859
860#define xlh a12
861#define ylh a13
862#define xhh a14
863#define yhh a15
864
865	/* Get the high halves of the inputs into registers.  */
866	srli	xlh, xl, 16
867	srli	ylh, yl, 16
868	srli	xhh, xh, 16
869	srli	yhh, yh, 16
870
871#define xll xl
872#define yll yl
873#define xhl xh
874#define yhl yh
875
876#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
877	/* Clear the high halves of the inputs.  This does not matter
878	   for MUL16 because the high bits are ignored.  */
879	extui	xl, xl, 0, 16
880	extui	xh, xh, 0, 16
881	extui	yl, yl, 0, 16
882	extui	yh, yh, 0, 16
883#endif
884#endif /* MUL16 || MUL32 */
885
886
887#if XCHAL_HAVE_MUL16
888
889#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
890	mul16u	dst, xreg ## xhalf, yreg ## yhalf
891
892#elif XCHAL_HAVE_MUL32
893
894#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
895	mull	dst, xreg ## xhalf, yreg ## yhalf
896
897#elif XCHAL_HAVE_MAC16
898
899/* The preprocessor insists on inserting a space when concatenating after
900   a period in the definition of do_mul below.  These macros are a workaround
901   using underscores instead of periods when doing the concatenation.  */
902#define umul_aa_ll umul.aa.ll
903#define umul_aa_lh umul.aa.lh
904#define umul_aa_hl umul.aa.hl
905#define umul_aa_hh umul.aa.hh
906
907#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
908	umul_aa_ ## xhalf ## yhalf	xreg, yreg; \
909	rsr	dst, ACCLO
910
911#else /* no multiply hardware */
912
913#define set_arg_l(dst, src) \
914	extui	dst, src, 0, 16
915#define set_arg_h(dst, src) \
916	srli	dst, src, 16
917
918#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
919	set_arg_ ## xhalf (a13, xreg); \
920	set_arg_ ## yhalf (a14, yreg); \
921	call0	.Lmul_mulsi3; \
922	mov	dst, a12
923#endif
924
925	/* Add pp1 and pp2 into a10 with carry-out in a9.  */
926	do_mul(a10, xl, l, yl, h)	/* pp 1 */
927	do_mul(a11, xl, h, yl, l)	/* pp 2 */
928	movi	a9, 0
929	add	a10, a10, a11
930	bgeu	a10, a11, 1f
931	addi	a9, a9, 1
9321:
933	/* Initialize a6 with a9/a10 shifted into position.  Note that
934	   this value can be safely incremented without any carry-outs.  */
935	ssai	16
936	src	a6, a9, a10
937
938	/* Compute the low word into a10.  */
939	do_mul(a11, xl, l, yl, l)	/* pp 0 */
940	sll	a10, a10
941	add	a10, a10, a11
942	bgeu	a10, a11, 1f
943	addi	a6, a6, 1
9441:
945	/* Compute the contributions of pp0-5 to a6, with carry-outs in a9.
946	   This is good enough to determine the low half of a6, so that any
947	   nonzero bits from the low word of the result can be collapsed
948	   into a6, freeing up a register.  */
949	movi	a9, 0
950	do_mul(a11, xl, l, yh, l)	/* pp 3 */
951	add	a6, a6, a11
952	bgeu	a6, a11, 1f
953	addi	a9, a9, 1
9541:
955	do_mul(a11, xl, h, yl, h)	/* pp 4 */
956	add	a6, a6, a11
957	bgeu	a6, a11, 1f
958	addi	a9, a9, 1
9591:
960	do_mul(a11, xh, l, yl, l)	/* pp 5 */
961	add	a6, a6, a11
962	bgeu	a6, a11, 1f
963	addi	a9, a9, 1
9641:
965	/* Collapse any nonzero bits from the low word into a6.  */
966	beqz	a10, 1f
967	movi	a11, 1
968	or	a6, a6, a11
9691:
970	/* Add pp6-9 into a11 with carry-outs in a10.  */
971	do_mul(a7, xl, l, yh, h)	/* pp 6 */
972	do_mul(a11, xh, h, yl, l)	/* pp 9 */
973	movi	a10, 0
974	add	a11, a11, a7
975	bgeu	a11, a7, 1f
976	addi	a10, a10, 1
9771:
978	do_mul(a7, xl, h, yh, l)	/* pp 7 */
979	add	a11, a11, a7
980	bgeu	a11, a7, 1f
981	addi	a10, a10, 1
9821:
983	do_mul(a7, xh, l, yl, h)	/* pp 8 */
984	add	a11, a11, a7
985	bgeu	a11, a7, 1f
986	addi	a10, a10, 1
9871:
988	/* Shift a10/a11 into position, and add low half of a11 to a6.  */
989	src	a10, a10, a11
990	add	a10, a10, a9
991	sll	a11, a11
992	add	a6, a6, a11
993	bgeu	a6, a11, 1f
994	addi	a10, a10, 1
9951:
996	/* Add pp10-12 into xl with carry-outs in a9.  */
997	movi	a9, 0
998	do_mul(xl, xl, h, yh, h)	/* pp 10 */
999	add	xl, xl, a10
1000	bgeu	xl, a10, 1f
1001	addi	a9, a9, 1
10021:
1003	do_mul(a10, xh, l, yh, l)	/* pp 11 */
1004	add	xl, xl, a10
1005	bgeu	xl, a10, 1f
1006	addi	a9, a9, 1
10071:
1008	do_mul(a10, xh, h, yl, h)	/* pp 12 */
1009	add	xl, xl, a10
1010	bgeu	xl, a10, 1f
1011	addi	a9, a9, 1
10121:
1013	/* Add pp13-14 into a11 with carry-outs in a10.  */
1014	do_mul(a11, xh, l, yh, h)	/* pp 13 */
1015	do_mul(a7, xh, h, yh, l)	/* pp 14 */
1016	movi	a10, 0
1017	add	a11, a11, a7
1018	bgeu	a11, a7, 1f
1019	addi	a10, a10, 1
10201:
1021	/* Shift a10/a11 into position, and add low half of a11 to a6.  */
1022	src	a10, a10, a11
1023	add	a10, a10, a9
1024	sll	a11, a11
1025	add	xl, xl, a11
1026	bgeu	xl, a11, 1f
1027	addi	a10, a10, 1
10281:
1029	/* Compute xh.  */
1030	do_mul(xh, xh, h, yh, h)	/* pp 15 */
1031	add	xh, xh, a10
1032
1033	/* Restore values saved on the stack during the multiplication.  */
1034	l32i	a7, sp, 4
1035#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
1036	l32i	a0, sp, 0
1037	l32i	a8, sp, 8
1038#endif
1039#endif
1040
1041	/* Shift left by 12 bits, unless there was a carry-out from the
1042	   multiply, in which case, shift by 11 bits and increment the
1043	   exponent.  Note: It is convenient to use the constant 0x3ff
1044	   instead of 0x400 when removing the extra exponent bias (so that
1045	   it is easy to construct 0x7fe for the overflow check).  Reverse
1046	   the logic here to decrement the exponent sum by one unless there
1047	   was a carry-out.  */
1048	movi	a4, 11
1049	srli	a5, xh, 21 - 12
1050	bnez	a5, 1f
1051	addi	a4, a4, 1
1052	addi	a8, a8, -1
10531:	ssl	a4
1054	src	xh, xh, xl
1055	src	xl, xl, a6
1056	sll	a6, a6
1057
1058	/* Subtract the extra bias from the exponent sum (plus one to account
1059	   for the explicit "1.0" of the mantissa that will be added to the
1060	   exponent in the final result).  */
1061	movi	a4, 0x3ff
1062	sub	a8, a8, a4
1063
1064	/* Check for over/underflow.  The value in a8 is one less than the
1065	   final exponent, so values in the range 0..7fd are OK here.  */
1066	slli	a4, a4, 1	/* 0x7fe */
1067	bgeu	a8, a4, .Lmul_overflow
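	/* In C terms, the range check above is roughly (illustrative only):

	       #include <stdbool.h>

	       bool exponent_in_range(int exp_minus_bias)   // the value kept in a8
	       {
	           // a single unsigned comparison catches both underflow
	           // (negative values) and overflow (values >= 0x7fe)
	           return (unsigned)exp_minus_bias < 0x7fe;
	       }  */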
1068
1069.Lmul_round:
1070	/* Round.  */
1071	bgez	a6, .Lmul_rounded
1072	addi	xl, xl, 1
1073	beqz	xl, .Lmul_roundcarry
1074	slli	a6, a6, 1
1075	beqz	a6, .Lmul_exactlyhalf
1076
1077.Lmul_rounded:
1078	/* Add the exponent to the mantissa.  */
1079	slli	a8, a8, 20
1080	add	xh, xh, a8
1081
1082.Lmul_addsign:
1083	/* Add the sign bit.  */
1084	srli	a7, a7, 31
1085	slli	a7, a7, 31
1086	or	xh, xh, a7
1087
1088.Lmul_done:
1089#if __XTENSA_CALL0_ABI__
1090	l32i	a12, sp, 16
1091	l32i	a13, sp, 20
1092	l32i	a14, sp, 24
1093	l32i	a15, sp, 28
1094	addi	sp, sp, 32
1095#endif
1096	leaf_return
1097
1098.Lmul_exactlyhalf:
1099	/* Round down to the nearest even value.  */
1100	srli	xl, xl, 1
1101	slli	xl, xl, 1
1102	j	.Lmul_rounded
1103
1104.Lmul_roundcarry:
1105	/* xl is always zero when the rounding increment overflows, so
1106	   there's no need to round it to an even value.  */
1107	addi	xh, xh, 1
1108	/* Overflow is OK -- it will be added to the exponent.  */
1109	j	.Lmul_rounded
1110
1111.Lmul_overflow:
1112	bltz	a8, .Lmul_underflow
1113	/* Return +/- Infinity.  */
1114	addi	a8, a4, 1	/* 0x7ff */
1115	slli	xh, a8, 20
1116	movi	xl, 0
1117	j	.Lmul_addsign
1118
1119.Lmul_underflow:
1120	/* Create a subnormal value, where the exponent field contains zero,
1121	   but the effective exponent is 1.  The value of a8 is one less than
1122	   the actual exponent, so just negate it to get the shift amount.  */
1123	neg	a8, a8
1124	mov	a9, a6
1125	ssr	a8
1126	bgeui	a8, 32, .Lmul_bigshift
1127
1128	/* Shift xh/xl right.  Any bits that are shifted out of xl are saved
1129	   in a6 (combined with the shifted-out bits currently in a6) for
1130	   rounding the result.  */
1131	sll	a6, xl
1132	src	xl, xh, xl
1133	srl	xh, xh
1134	j	1f
1135
1136.Lmul_bigshift:
1137	bgeui	a8, 64, .Lmul_flush_to_zero
1138	sll	a10, xl		/* lost bits shifted out of xl */
1139	src	a6, xh, xl
1140	srl	xl, xh
1141	movi	xh, 0
1142	or	a9, a9, a10
1143
1144	/* Set the exponent to zero.  */
11451:	movi	a8, 0
1146
1147	/* Pack any nonzero bits shifted out into a6.  */
1148	beqz	a9, .Lmul_round
1149	movi	a9, 1
1150	or	a6, a6, a9
1151	j	.Lmul_round
1152
1153.Lmul_flush_to_zero:
1154	/* Return zero with the appropriate sign bit.  */
1155	srli	xh, a7, 31
1156	slli	xh, xh, 31
1157	movi	xl, 0
1158	j	.Lmul_done
1159
1160#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
1161
1162	/* For Xtensa processors with no multiply hardware, this simplified
1163	   version of _mulsi3 is used for multiplying 16-bit chunks of
1164	   the floating-point mantissas.  It uses a custom ABI:	the inputs
1165	   are passed in a13 and a14, the result is returned in a12, and
1166	   a8 and a15 are clobbered.  */
1167	.align	4
1168.Lmul_mulsi3:
1169	movi	a12, 0
1170.Lmul_mult_loop:
1171	add	a15, a14, a12
1172	extui	a8, a13, 0, 1
1173	movnez	a12, a15, a8
1174
1175	do_addx2 a15, a14, a12, a15
1176	extui	a8, a13, 1, 1
1177	movnez	a12, a15, a8
1178
1179	do_addx4 a15, a14, a12, a15
1180	extui	a8, a13, 2, 1
1181	movnez	a12, a15, a8
1182
1183	do_addx8 a15, a14, a12, a15
1184	extui	a8, a13, 3, 1
1185	movnez	a12, a15, a8
1186
1187	srli	a13, a13, 4
1188	slli	a14, a14, 4
1189	bnez	a13, .Lmul_mult_loop
1190	ret
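	/* A C sketch of the loop above (illustrative only): each iteration
	   consumes four multiplier bits, conditionally accumulating 1x, 2x,
	   4x and 8x of the current multiplicand, then steps both operands by
	   four bit positions.

	       #include <stdint.h>

	       uint32_t mulsi3_sketch(uint32_t a, uint32_t b)   // a ~ a13, b ~ a14
	       {
	           uint32_t acc = 0;                            // ~ a12
	           while (a != 0) {
	               if (a & 1) acc += b;
	               if (a & 2) acc += b << 1;
	               if (a & 4) acc += b << 2;
	               if (a & 8) acc += b << 3;
	               a >>= 4;
	               b <<= 4;
	           }
	           return acc;
	       }  */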
1191#endif /* !MUL16 && !MUL32 && !MAC16 */
1192#endif /* L_muldf3 */
1193
1194#ifdef L_divdf3
1195
1196	/* Division */
1197__divdf3_aux:
1198
1199	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
1200	   (This code is placed before the start of the function just to
1201	   keep it in range of the limited branch displacements.)  */
1202
1203.Ldiv_yexpzero:
1204	/* Clear the sign bit of y.  */
1205	slli	yh, yh, 1
1206	srli	yh, yh, 1
1207
1208	/* Check for division by zero.  */
1209	or	a10, yh, yl
1210	beqz	a10, .Ldiv_yzero
1211
1212	/* Normalize y.  Adjust the exponent in a9.  */
1213	beqz	yh, .Ldiv_yh_zero
1214	do_nsau	a10, yh, a11, a9
1215	addi	a10, a10, -11
1216	ssl	a10
1217	src	yh, yh, yl
1218	sll	yl, yl
1219	movi	a9, 1
1220	sub	a9, a9, a10
1221	j	.Ldiv_ynormalized
1222.Ldiv_yh_zero:
1223	do_nsau	a10, yl, a11, a9
1224	addi	a10, a10, -11
1225	movi	a9, -31
1226	sub	a9, a9, a10
1227	ssl	a10
1228	bltz	a10, .Ldiv_yl_srl
1229	sll	yh, yl
1230	movi	yl, 0
1231	j	.Ldiv_ynormalized
1232.Ldiv_yl_srl:
1233	srl	yh, yl
1234	sll	yl, yl
1235	j	.Ldiv_ynormalized
1236
1237.Ldiv_yzero:
1238	/* y is zero.  Return NaN if x is also zero; otherwise, infinity.  */
1239	slli	xh, xh, 1
1240	srli	xh, xh, 1
1241	or	xl, xl, xh
1242	srli	xh, a7, 31
1243	slli	xh, xh, 31
1244	or	xh, xh, a6
1245	bnez	xl, 1f
1246	movi	a4, 0x80000	/* make it a quiet NaN */
1247	or	xh, xh, a4
12481:	movi	xl, 0
1249	leaf_return
1250
1251.Ldiv_xexpzero:
1252	/* Clear the sign bit of x.  */
1253	slli	xh, xh, 1
1254	srli	xh, xh, 1
1255
1256	/* If x is zero, return zero.  */
1257	or	a10, xh, xl
1258	beqz	a10, .Ldiv_return_zero
1259
1260	/* Normalize x.  Adjust the exponent in a8.  */
1261	beqz	xh, .Ldiv_xh_zero
1262	do_nsau	a10, xh, a11, a8
1263	addi	a10, a10, -11
1264	ssl	a10
1265	src	xh, xh, xl
1266	sll	xl, xl
1267	movi	a8, 1
1268	sub	a8, a8, a10
1269	j	.Ldiv_xnormalized
1270.Ldiv_xh_zero:
1271	do_nsau	a10, xl, a11, a8
1272	addi	a10, a10, -11
1273	movi	a8, -31
1274	sub	a8, a8, a10
1275	ssl	a10
1276	bltz	a10, .Ldiv_xl_srl
1277	sll	xh, xl
1278	movi	xl, 0
1279	j	.Ldiv_xnormalized
1280.Ldiv_xl_srl:
1281	srl	xh, xl
1282	sll	xl, xl
1283	j	.Ldiv_xnormalized
1284
1285.Ldiv_return_zero:
1286	/* Return zero with the appropriate sign bit.  */
1287	srli	xh, a7, 31
1288	slli	xh, xh, 31
1289	movi	xl, 0
1290	leaf_return
1291
1292.Ldiv_xnan_or_inf:
1293	/* Set the sign bit of the result.  */
1294	srli	a7, yh, 31
1295	slli	a7, a7, 31
1296	xor	xh, xh, a7
1297	/* If y is NaN or Inf, return NaN.  */
1298	bnall	yh, a6, 1f
1299	movi	a4, 0x80000	/* make it a quiet NaN */
1300	or	xh, xh, a4
13011:	leaf_return
1302
1303.Ldiv_ynan_or_inf:
1304	/* If y is Infinity, return zero.  */
1305	slli	a8, yh, 12
1306	or	a8, a8, yl
1307	beqz	a8, .Ldiv_return_zero
1308	/* y is NaN; return it.  */
1309	mov	xh, yh
1310	mov	xl, yl
1311	leaf_return
1312
1313.Ldiv_highequal1:
1314	bltu	xl, yl, 2f
1315	j	3f
1316
1317	.align	4
1318	.global	__divdf3
1319	.type	__divdf3, @function
1320__divdf3:
1321	leaf_entry sp, 16
1322	movi	a6, 0x7ff00000
1323
1324	/* Get the sign of the result.  */
1325	xor	a7, xh, yh
1326
1327	/* Check for NaN and infinity.  */
1328	ball	xh, a6, .Ldiv_xnan_or_inf
1329	ball	yh, a6, .Ldiv_ynan_or_inf
1330
1331	/* Extract the exponents.  */
1332	extui	a8, xh, 20, 11
1333	extui	a9, yh, 20, 11
1334
1335	beqz	a9, .Ldiv_yexpzero
1336.Ldiv_ynormalized:
1337	beqz	a8, .Ldiv_xexpzero
1338.Ldiv_xnormalized:
1339
1340	/* Subtract the exponents.  */
1341	sub	a8, a8, a9
1342
1343	/* Replace sign/exponent fields with explicit "1.0".  */
1344	movi	a10, 0x1fffff
1345	or	xh, xh, a6
1346	and	xh, xh, a10
1347	or	yh, yh, a6
1348	and	yh, yh, a10
1349
1350	/* Set SAR for left shift by one.  */
1351	ssai	(32 - 1)
1352
1353	/* The first digit of the mantissa division must be a one.
1354	   Shift x (and adjust the exponent) as needed to make this true.  */
1355	bltu	yh, xh, 3f
1356	beq	yh, xh, .Ldiv_highequal1
13572:	src	xh, xh, xl
1358	sll	xl, xl
1359	addi	a8, a8, -1
13603:
1361	/* Do the first subtraction and shift.  */
1362	sub	xh, xh, yh
1363	bgeu	xl, yl, 1f
1364	addi	xh, xh, -1
13651:	sub	xl, xl, yl
1366	src	xh, xh, xl
1367	sll	xl, xl
1368
1369	/* Put the quotient into a10/a11.  */
1370	movi	a10, 0
1371	movi	a11, 1
1372
1373	/* Divide one bit at a time for 52 bits.  */
1374	movi	a9, 52
1375#if XCHAL_HAVE_LOOPS
1376	loop	a9, .Ldiv_loopend
1377#endif
1378.Ldiv_loop:
1379	/* Shift the quotient << 1.  */
1380	src	a10, a10, a11
1381	sll	a11, a11
1382
1383	/* Is this digit a 0 or 1?  */
1384	bltu	xh, yh, 3f
1385	beq	xh, yh, .Ldiv_highequal2
1386
1387	/* Output a 1 and subtract.  */
13882:	addi	a11, a11, 1
1389	sub	xh, xh, yh
1390	bgeu	xl, yl, 1f
1391	addi	xh, xh, -1
13921:	sub	xl, xl, yl
1393
1394	/* Shift the dividend << 1.  */
13953:	src	xh, xh, xl
1396	sll	xl, xl
1397
1398#if !XCHAL_HAVE_LOOPS
1399	addi	a9, a9, -1
1400	bnez	a9, .Ldiv_loop
1401#endif
1402.Ldiv_loopend:
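	/* A C sketch of the quotient loop above (illustrative only): classic
	   restoring division, one quotient bit per iteration, with x kept as
	   the running remainder.  It assumes x >= y on entry, which the code
	   above guarantees by pre-shifting x and adjusting the exponent.

	       #include <stdint.h>

	       // x, y: normalized significands with the leading 1 at bit 52.
	       // Returns the 53-bit quotient; *rem gets the remainder already
	       // doubled once, which is what .Ldiv_round compares against y.
	       uint64_t divide_mantissas(uint64_t x, uint64_t y, uint64_t *rem)
	       {
	           uint64_t q = 1;              // the first quotient digit is 1
	           x = (x - y) << 1;
	           for (int i = 0; i < 52; i++) {
	               q <<= 1;
	               if (x >= y) {
	                   q |= 1;
	                   x -= y;
	               }
	               x <<= 1;
	           }
	           *rem = x;
	           return q;
	       }  */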
1403
1404	/* Add the exponent bias (less one to account for the explicit "1.0"
1405	   of the mantissa that will be added to the exponent in the final
1406	   result).  */
1407	movi	a9, 0x3fe
1408	add	a8, a8, a9
1409
1410	/* Check for over/underflow.  The value in a8 is one less than the
1411	   final exponent, so values in the range 0..7fd are OK here.  */
1412	addmi	a9, a9, 0x400	/* 0x7fe */
1413	bgeu	a8, a9, .Ldiv_overflow
1414
1415.Ldiv_round:
1416	/* Round.  The remainder (<< 1) is in xh/xl.  */
1417	bltu	xh, yh, .Ldiv_rounded
1418	beq	xh, yh, .Ldiv_highequal3
1419.Ldiv_roundup:
1420	addi	a11, a11, 1
1421	beqz	a11, .Ldiv_roundcarry
1422
1423.Ldiv_rounded:
1424	mov	xl, a11
1425	/* Add the exponent to the mantissa.  */
1426	slli	a8, a8, 20
1427	add	xh, a10, a8
1428
1429.Ldiv_addsign:
1430	/* Add the sign bit.  */
1431	srli	a7, a7, 31
1432	slli	a7, a7, 31
1433	or	xh, xh, a7
1434	leaf_return
1435
1436.Ldiv_highequal2:
1437	bgeu	xl, yl, 2b
1438	j	3b
1439
1440.Ldiv_highequal3:
1441	bltu	xl, yl, .Ldiv_rounded
1442	bne	xl, yl, .Ldiv_roundup
1443
1444	/* Remainder is exactly half the divisor.  Round even.  */
1445	addi	a11, a11, 1
1446	beqz	a11, .Ldiv_roundcarry
1447	srli	a11, a11, 1
1448	slli	a11, a11, 1
1449	j	.Ldiv_rounded
1450
1451.Ldiv_overflow:
1452	bltz	a8, .Ldiv_underflow
1453	/* Return +/- Infinity.  */
1454	addi	a8, a9, 1	/* 0x7ff */
1455	slli	xh, a8, 20
1456	movi	xl, 0
1457	j	.Ldiv_addsign
1458
1459.Ldiv_underflow:
1460	/* Create a subnormal value, where the exponent field contains zero,
1461	   but the effective exponent is 1.  The value of a8 is one less than
1462	   the actual exponent, so just negate it to get the shift amount.  */
1463	neg	a8, a8
1464	ssr	a8
1465	bgeui	a8, 32, .Ldiv_bigshift
1466
1467	/* Shift a10/a11 right.  Any bits that are shifted out of a11 are
1468	   saved in a6 for rounding the result.  */
1469	sll	a6, a11
1470	src	a11, a10, a11
1471	srl	a10, a10
1472	j	1f
1473
1474.Ldiv_bigshift:
1475	bgeui	a8, 64, .Ldiv_flush_to_zero
1476	sll	a9, a11		/* lost bits shifted out of a11 */
1477	src	a6, a10, a11
1478	srl	a11, a10
1479	movi	a10, 0
1480	or	xl, xl, a9
1481
1482	/* Set the exponent to zero.  */
14831:	movi	a8, 0
1484
1485	/* Pack any nonzero remainder (in xh/xl) into a6.  */
1486	or	xh, xh, xl
1487	beqz	xh, 1f
1488	movi	a9, 1
1489	or	a6, a6, a9
1490
1491	/* Round a10/a11 based on the bits shifted out into a6.  */
14921:	bgez	a6, .Ldiv_rounded
1493	addi	a11, a11, 1
1494	beqz	a11, .Ldiv_roundcarry
1495	slli	a6, a6, 1
1496	bnez	a6, .Ldiv_rounded
1497	srli	a11, a11, 1
1498	slli	a11, a11, 1
1499	j	.Ldiv_rounded
1500
1501.Ldiv_roundcarry:
1502	/* a11 is always zero when the rounding increment overflows, so
1503	   there's no need to round it to an even value.  */
1504	addi	a10, a10, 1
1505	/* Overflow to the exponent field is OK.  */
1506	j	.Ldiv_rounded
1507
1508.Ldiv_flush_to_zero:
1509	/* Return zero with the appropriate sign bit.  */
1510	srli	xh, a7, 31
1511	slli	xh, xh, 31
1512	movi	xl, 0
1513	leaf_return
1514
1515#endif /* L_divdf3 */
1516
1517#ifdef L_cmpdf2
1518
1519	/* Equal and Not Equal */
1520
1521	.align	4
1522	.global	__eqdf2
1523	.global	__nedf2
1524	.set	__nedf2, __eqdf2
1525	.type	__eqdf2, @function
1526__eqdf2:
1527	leaf_entry sp, 16
1528	bne	xl, yl, 2f
1529	bne	xh, yh, 4f
1530
1531	/* The values are equal, but NaN != NaN, so check the exponent for NaN.  */
1532	movi	a6, 0x7ff00000
1533	ball	xh, a6, 3f
1534
1535	/* Equal.  */
1536	movi	a2, 0
1537	leaf_return
1538
1539	/* Not equal.  */
15402:	movi	a2, 1
1541	leaf_return
1542
1543	/* Check if the mantissas are nonzero.  */
15443:	slli	a7, xh, 12
1545	or	a7, a7, xl
1546	j	5f
1547
1548	/* Check if x and y are zero with different signs.  */
15494:	or	a7, xh, yh
1550	slli	a7, a7, 1
1551	or	a7, a7, xl	/* xl == yl here */
1552
1553	/* Equal if a7 == 0, where a7 is either abs(x | y) or the mantissa
1554	   of x when exponent(x) == 0x7ff and x == y.  */
15555:	movi	a2, 0
1556	movi	a3, 1
1557	movnez	a2, a3, a7
1558	leaf_return
1559
1560
1561	/* Greater Than */
1562
1563	.align	4
1564	.global	__gtdf2
1565	.type	__gtdf2, @function
1566__gtdf2:
1567	leaf_entry sp, 16
1568	movi	a6, 0x7ff00000
1569	ball	xh, a6, 2f
15701:	bnall	yh, a6, .Lle_cmp
1571
1572	/* Check if y is a NaN.  */
1573	slli	a7, yh, 12
1574	or	a7, a7, yl
1575	beqz	a7, .Lle_cmp
1576	movi	a2, 0
1577	leaf_return
1578
1579	/* Check if x is a NaN.  */
15802:	slli	a7, xh, 12
1581	or	a7, a7, xl
1582	beqz	a7, 1b
1583	movi	a2, 0
1584	leaf_return
1585
1586
1587	/* Less Than or Equal */
1588
1589	.align	4
1590	.global	__ledf2
1591	.type	__ledf2, @function
1592__ledf2:
1593	leaf_entry sp, 16
1594	movi	a6, 0x7ff00000
1595	ball	xh, a6, 2f
15961:	bnall	yh, a6, .Lle_cmp
1597
1598	/* Check if y is a NaN.  */
1599	slli	a7, yh, 12
1600	or	a7, a7, yl
1601	beqz	a7, .Lle_cmp
1602	movi	a2, 1
1603	leaf_return
1604
1605	/* Check if x is a NaN.  */
16062:	slli	a7, xh, 12
1607	or	a7, a7, xl
1608	beqz	a7, 1b
1609	movi	a2, 1
1610	leaf_return
1611
1612.Lle_cmp:
1613	/* Check if x and y have different signs.  */
1614	xor	a7, xh, yh
1615	bltz	a7, .Lle_diff_signs
1616
1617	/* Check if x is negative.  */
1618	bltz	xh, .Lle_xneg
1619
1620	/* Check if x <= y.  */
1621	bltu	xh, yh, 4f
1622	bne	xh, yh, 5f
1623	bltu	yl, xl, 5f
16244:	movi	a2, 0
1625	leaf_return
1626
1627.Lle_xneg:
1628	/* Check if y <= x.  */
1629	bltu	yh, xh, 4b
1630	bne	yh, xh, 5f
1631	bgeu	xl, yl, 4b
16325:	movi	a2, 1
1633	leaf_return
1634
1635.Lle_diff_signs:
1636	bltz	xh, 4b
1637
1638	/* Check if both x and y are zero.  */
1639	or	a7, xh, yh
1640	slli	a7, a7, 1
1641	or	a7, a7, xl
1642	or	a7, a7, yl
1643	movi	a2, 1
1644	movi	a3, 0
1645	moveqz	a2, a3, a7
1646	leaf_return
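	/* A C sketch of the ordering rule behind these comparisons
	   (illustrative only; the real routines return an integer whose sign
	   encodes the result rather than a bool): once NaNs have been
	   screened out, same-sign values order like unsigned integers on
	   their raw bit patterns (reversed when both are negative), and
	   opposite-sign values only tie when both are zero.

	       #include <stdbool.h>
	       #include <stdint.h>

	       bool le_sketch(uint32_t xhi, uint32_t xlo, uint32_t yhi, uint32_t ylo)
	       {
	           uint64_t x = (uint64_t)xhi << 32 | xlo;
	           uint64_t y = (uint64_t)yhi << 32 | ylo;
	           if ((int64_t)(x ^ y) < 0)                    // different signs
	               return (int64_t)x < 0 || ((x | y) << 1) == 0;
	           if ((int64_t)x < 0)                          // both negative
	               return y <= x;                           // order reverses
	           return x <= y;                               // both non-negative
	       }  */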
1647
1648
1649	/* Greater Than or Equal */
1650
1651	.align	4
1652	.global	__gedf2
1653	.type	__gedf2, @function
1654__gedf2:
1655	leaf_entry sp, 16
1656	movi	a6, 0x7ff00000
1657	ball	xh, a6, 2f
16581:	bnall	yh, a6, .Llt_cmp
1659
1660	/* Check if y is a NaN.  */
1661	slli	a7, yh, 12
1662	or	a7, a7, yl
1663	beqz	a7, .Llt_cmp
1664	movi	a2, -1
1665	leaf_return
1666
1667	/* Check if x is a NaN.  */
16682:	slli	a7, xh, 12
1669	or	a7, a7, xl
1670	beqz	a7, 1b
1671	movi	a2, -1
1672	leaf_return
1673
1674
1675	/* Less Than */
1676
1677	.align	4
1678	.global	__ltdf2
1679	.type	__ltdf2, @function
1680__ltdf2:
1681	leaf_entry sp, 16
1682	movi	a6, 0x7ff00000
1683	ball	xh, a6, 2f
16841:	bnall	yh, a6, .Llt_cmp
1685
1686	/* Check if y is a NaN.  */
1687	slli	a7, yh, 12
1688	or	a7, a7, yl
1689	beqz	a7, .Llt_cmp
1690	movi	a2, 0
1691	leaf_return
1692
1693	/* Check if x is a NaN.  */
16942:	slli	a7, xh, 12
1695	or	a7, a7, xl
1696	beqz	a7, 1b
1697	movi	a2, 0
1698	leaf_return
1699
1700.Llt_cmp:
1701	/* Check if x and y have different signs.  */
1702	xor	a7, xh, yh
1703	bltz	a7, .Llt_diff_signs
1704
1705	/* Check if x is negative.  */
1706	bltz	xh, .Llt_xneg
1707
1708	/* Check if x < y.  */
1709	bltu	xh, yh, 4f
1710	bne	xh, yh, 5f
1711	bgeu	xl, yl, 5f
17124:	movi	a2, -1
1713	leaf_return
1714
1715.Llt_xneg:
1716	/* Check if y < x.  */
1717	bltu	yh, xh, 4b
1718	bne	yh, xh, 5f
1719	bltu	yl, xl, 4b
17205:	movi	a2, 0
1721	leaf_return
1722
1723.Llt_diff_signs:
1724	bgez	xh, 5b
1725
1726	/* Check if both x and y are nonzero.  */
1727	or	a7, xh, yh
1728	slli	a7, a7, 1
1729	or	a7, a7, xl
1730	or	a7, a7, yl
1731	movi	a2, 0
1732	movi	a3, -1
1733	movnez	a2, a3, a7
1734	leaf_return
1735
1736
1737	/* Unordered */
1738
1739	.align	4
1740	.global	__unorddf2
1741	.type	__unorddf2, @function
1742__unorddf2:
1743	leaf_entry sp, 16
1744	movi	a6, 0x7ff00000
1745	ball	xh, a6, 3f
17461:	ball	yh, a6, 4f
17472:	movi	a2, 0
1748	leaf_return
1749
17503:	slli	a7, xh, 12
1751	or	a7, a7, xl
1752	beqz	a7, 1b
1753	movi	a2, 1
1754	leaf_return
1755
17564:	slli	a7, yh, 12
1757	or	a7, a7, yl
1758	beqz	a7, 2b
1759	movi	a2, 1
1760	leaf_return
1761
1762#endif /* L_cmpdf2 */
1763
1764#ifdef L_fixdfsi
1765
1766	.align	4
1767	.global	__fixdfsi
1768	.type	__fixdfsi, @function
1769__fixdfsi:
1770	leaf_entry sp, 16
1771
1772	/* Check for NaN and Infinity.  */
1773	movi	a6, 0x7ff00000
1774	ball	xh, a6, .Lfixdfsi_nan_or_inf
1775
1776	/* Extract the exponent and check if 0 < (exp - 0x3fe) < 32.  */
1777	extui	a4, xh, 20, 11
1778	extui	a5, a6, 19, 10	/* 0x3fe */
1779	sub	a4, a4, a5
1780	bgei	a4, 32, .Lfixdfsi_maxint
1781	blti	a4, 1, .Lfixdfsi_zero
1782
1783	/* Add explicit "1.0" and shift << 11.  */
1784	or	a7, xh, a6
1785	ssai	(32 - 11)
1786	src	a5, a7, xl
1787
1788	/* Shift back to the right, based on the exponent.  */
1789	ssl	a4		/* shift by 32 - a4 */
1790	srl	a5, a5
1791
1792	/* Negate the result if sign != 0.  */
1793	neg	a2, a5
1794	movgez	a2, a5, a7
1795	leaf_return
1796
1797.Lfixdfsi_nan_or_inf:
1798	/* Handle Infinity and NaN.  */
1799	slli	a4, xh, 12
1800	or	a4, a4, xl
1801	beqz	a4, .Lfixdfsi_maxint
1802
1803	/* Translate NaN to +maxint.  */
1804	movi	xh, 0
1805
1806.Lfixdfsi_maxint:
1807	slli	a4, a6, 11	/* 0x80000000 */
1808	addi	a5, a4, -1	/* 0x7fffffff */
1809	movgez	a4, a5, xh
1810	mov	a2, a4
1811	leaf_return
1812
1813.Lfixdfsi_zero:
1814	movi	a2, 0
1815	leaf_return
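	/* A C sketch of the conversion above (illustrative only): like the
	   assembly it truncates toward zero, saturates out-of-range values,
	   and maps NaN to the most positive integer.

	       #include <stdint.h>

	       int32_t fixdfsi_sketch(uint32_t hi, uint32_t lo)
	       {
	           uint32_t e = (hi >> 20) & 0x7ff;
	           if (e == 0x7ff && ((hi << 12) | lo))          // NaN
	               return INT32_MAX;
	           int32_t shift = (int32_t)e - 0x3fe;
	           if (shift < 1)                                // |value| < 1
	               return 0;
	           if (shift > 31)                               // too big, or Inf
	               return (int32_t)hi < 0 ? INT32_MIN : INT32_MAX;
	           uint64_t mant = (1ull << 52)
	                           | ((uint64_t)(hi & 0xfffff) << 32) | lo;
	           uint32_t v = (uint32_t)(mant >> (53 - shift));
	           return (int32_t)hi < 0 ? -(int32_t)v : (int32_t)v;
	       }  */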
1816
1817#endif /* L_fixdfsi */
1818
1819#ifdef L_fixdfdi
1820
1821	.align	4
1822	.global	__fixdfdi
1823	.type	__fixdfdi, @function
1824__fixdfdi:
1825	leaf_entry sp, 16
1826
1827	/* Check for NaN and Infinity.  */
1828	movi	a6, 0x7ff00000
1829	ball	xh, a6, .Lfixdfdi_nan_or_inf
1830
1831	/* Extract the exponent and check if 0 < (exp - 0x3fe) < 64.  */
1832	extui	a4, xh, 20, 11
1833	extui	a5, a6, 19, 10	/* 0x3fe */
1834	sub	a4, a4, a5
1835	bgei	a4, 64, .Lfixdfdi_maxint
1836	blti	a4, 1, .Lfixdfdi_zero
1837
1838	/* Add explicit "1.0" and shift << 11.  */
1839	or	a7, xh, a6
1840	ssai	(32 - 11)
1841	src	xh, a7, xl
1842	sll	xl, xl
1843
1844	/* Shift back to the right, based on the exponent.  */
1845	ssl	a4		/* shift by 64 - a4 */
1846	bgei	a4, 32, .Lfixdfdi_smallshift
1847	srl	xl, xh
1848	movi	xh, 0
1849
1850.Lfixdfdi_shifted:
1851	/* Negate the result if sign != 0.  */
1852	bgez	a7, 1f
1853	neg	xl, xl
1854	neg	xh, xh
1855	beqz	xl, 1f
1856	addi	xh, xh, -1
18571:	leaf_return
1858
1859.Lfixdfdi_smallshift:
1860	src	xl, xh, xl
1861	srl	xh, xh
1862	j	.Lfixdfdi_shifted
1863
1864.Lfixdfdi_nan_or_inf:
1865	/* Handle Infinity and NaN.  */
1866	slli	a4, xh, 12
1867	or	a4, a4, xl
1868	beqz	a4, .Lfixdfdi_maxint
1869
1870	/* Translate NaN to +maxint.  */
1871	movi	xh, 0
1872
1873.Lfixdfdi_maxint:
1874	slli	a7, a6, 11	/* 0x80000000 */
1875	bgez	xh, 1f
1876	mov	xh, a7
1877	movi	xl, 0
1878	leaf_return
1879
18801:	addi	xh, a7, -1	/* 0x7fffffff */
1881	movi	xl, -1
1882	leaf_return
1883
1884.Lfixdfdi_zero:
1885	movi	xh, 0
1886	movi	xl, 0
1887	leaf_return
1888
1889#endif /* L_fixdfdi */
1890
1891#ifdef L_fixunsdfsi
1892
1893	.align	4
1894	.global	__fixunsdfsi
1895	.type	__fixunsdfsi, @function
1896__fixunsdfsi:
1897	leaf_entry sp, 16
1898
1899	/* Check for NaN and Infinity.  */
1900	movi	a6, 0x7ff00000
1901	ball	xh, a6, .Lfixunsdfsi_nan_or_inf
1902
1903	/* Extract the exponent and check if 0 <= (exp - 0x3ff) < 32.  */
1904	extui	a4, xh, 20, 11
1905	extui	a5, a6, 20, 10	/* 0x3ff */
1906	sub	a4, a4, a5
1907	bgei	a4, 32, .Lfixunsdfsi_maxint
1908	bltz	a4, .Lfixunsdfsi_zero
1909
1910	/* Add explicit "1.0" and shift << 11.  */
1911	or	a7, xh, a6
1912	ssai	(32 - 11)
1913	src	a5, a7, xl
1914
1915	/* Shift back to the right, based on the exponent.  */
1916	addi	a4, a4, 1
1917	beqi	a4, 32, .Lfixunsdfsi_bigexp
1918	ssl	a4		/* shift by 32 - a4 */
1919	srl	a5, a5
1920
1921	/* Negate the result if sign != 0.  */
1922	neg	a2, a5
1923	movgez	a2, a5, a7
1924	leaf_return
1925
1926.Lfixunsdfsi_nan_or_inf:
1927	/* Handle Infinity and NaN.  */
1928	slli	a4, xh, 12
1929	or	a4, a4, xl
1930	beqz	a4, .Lfixunsdfsi_maxint
1931
1932	/* Translate NaN to 0xffffffff.  */
1933	movi	a2, -1
1934	leaf_return
1935
1936.Lfixunsdfsi_maxint:
1937	slli	a4, a6, 11	/* 0x80000000 */
1938	movi	a5, -1		/* 0xffffffff */
1939	movgez	a4, a5, xh
1940	mov	a2, a4
1941	leaf_return
1942
1943.Lfixunsdfsi_zero:
1944	movi	a2, 0
1945	leaf_return
1946
1947.Lfixunsdfsi_bigexp:
1948	/* Handle unsigned maximum exponent case.  */
1949	bltz	xh, 1f
1950	mov	a2, a5		/* no shift needed */
1951	leaf_return
1952
1953	/* Return 0x80000000 if negative.  */
19541:	slli	a2, a6, 11
1955	leaf_return
1956
1957#endif /* L_fixunsdfsi */
1958
1959#ifdef L_fixunsdfdi
1960
1961	.align	4
1962	.global	__fixunsdfdi
1963	.type	__fixunsdfdi, @function
1964__fixunsdfdi:
1965	leaf_entry sp, 16
1966
1967	/* Check for NaN and Infinity.  */
1968	movi	a6, 0x7ff00000
1969	ball	xh, a6, .Lfixunsdfdi_nan_or_inf
1970
1971	/* Extract the exponent and check if 0 <= (exp - 0x3ff) < 64.  */
1972	extui	a4, xh, 20, 11
1973	extui	a5, a6, 20, 10	/* 0x3ff */
1974	sub	a4, a4, a5
1975	bgei	a4, 64, .Lfixunsdfdi_maxint
1976	bltz	a4, .Lfixunsdfdi_zero
1977
1978	/* Add explicit "1.0" and shift << 11.  */
1979	or	a7, xh, a6
1980	ssai	(32 - 11)
1981	src	xh, a7, xl
1982	sll	xl, xl
1983
1984	/* Shift back to the right, based on the exponent.  */
1985	addi	a4, a4, 1
1986	beqi	a4, 64, .Lfixunsdfdi_bigexp
1987	ssl	a4		/* shift by 64 - a4 */
1988	bgei	a4, 32, .Lfixunsdfdi_smallshift
1989	srl	xl, xh
1990	movi	xh, 0
1991
1992.Lfixunsdfdi_shifted:
1993	/* Negate the result if sign != 0.  */
1994	bgez	a7, 1f
1995	neg	xl, xl
1996	neg	xh, xh
1997	beqz	xl, 1f
1998	addi	xh, xh, -1
19991:	leaf_return
2000
2001.Lfixunsdfdi_smallshift:
2002	src	xl, xh, xl
2003	srl	xh, xh
2004	j	.Lfixunsdfdi_shifted
2005
2006.Lfixunsdfdi_nan_or_inf:
2007	/* Handle Infinity and NaN.  */
2008	slli	a4, xh, 12
2009	or	a4, a4, xl
2010	beqz	a4, .Lfixunsdfdi_maxint
2011
2012	/* Translate NaN to 0xffffffff.... */
20131:	movi	xh, -1
2014	movi	xl, -1
2015	leaf_return
2016
2017.Lfixunsdfdi_maxint:
2018	bgez	xh, 1b
20192:	slli	xh, a6, 11	/* 0x80000000 */
2020	movi	xl, 0
2021	leaf_return
2022
2023.Lfixunsdfdi_zero:
2024	movi	xh, 0
2025	movi	xl, 0
2026	leaf_return
2027
2028.Lfixunsdfdi_bigexp:
2029	/* Handle unsigned maximum exponent case.  */
2030	bltz	a7, 2b
2031	leaf_return		/* no shift needed */
2032
2033#endif /* L_fixunsdfdi */
2034
2035#ifdef L_floatsidf
2036
2037	.align	4
2038	.global	__floatunsidf
2039	.type	__floatunsidf, @function
2040__floatunsidf:
2041	leaf_entry sp, 16
2042	beqz	a2, .Lfloatsidf_return_zero
2043
2044	/* Set the sign to zero and jump to the floatsidf code.  */
2045	movi	a7, 0
2046	j	.Lfloatsidf_normalize
2047
2048	.align	4
2049	.global	__floatsidf
2050	.type	__floatsidf, @function
2051__floatsidf:
2052	leaf_entry sp, 16
2053
2054	/* Check for zero.  */
2055	beqz	a2, .Lfloatsidf_return_zero
2056
2057	/* Save the sign.  */
2058	extui	a7, a2, 31, 1
2059
2060	/* Get the absolute value.  */
2061#if XCHAL_HAVE_ABS
2062	abs	a2, a2
2063#else
2064	neg	a4, a2
2065	movltz	a2, a4, a2
2066#endif
2067
2068.Lfloatsidf_normalize:
2069	/* Normalize with the first 1 bit in the msb.  */
2070	do_nsau	a4, a2, a5, a6
2071	ssl	a4
2072	sll	a5, a2
2073
2074	/* Shift the mantissa into position.  */
2075	srli	xh, a5, 11
2076	slli	xl, a5, (32 - 11)
2077
2078	/* Set the exponent.  */
2079	movi	a5, 0x41d	/* 0x3fe + 31 */
2080	sub	a5, a5, a4
2081	slli	a5, a5, 20
2082	add	xh, xh, a5
2083
2084	/* Add the sign and return. */
2085	slli	a7, a7, 31
2086	or	xh, xh, a7
2087	leaf_return
2088
2089.Lfloatsidf_return_zero:
2090	movi	a3, 0		/* a2 is already zero, so this clears the whole result */
2091	leaf_return
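	/* A C sketch of the conversion above (illustrative only; it relies on
	   GCC's __builtin_clz in place of do_nsau): take the absolute value,
	   normalize it so its leading 1 becomes the implicit bit, and derive
	   the exponent from the leading-zero count.  The conversion is always
	   exact, so no rounding is needed.

	       #include <stdint.h>

	       void floatsidf_sketch(int32_t v, uint32_t *hi, uint32_t *lo)
	       {
	           if (v == 0) { *hi = 0; *lo = 0; return; }
	           uint32_t sign = (uint32_t)v & 0x80000000u;
	           uint32_t a = v < 0 ? -(uint32_t)v : (uint32_t)v;
	           int lz = __builtin_clz(a);
	           uint64_t mant = (uint64_t)(a << lz) << 21;    // leading 1 at bit 52
	           uint32_t exp  = 0x41e - lz;                   // 0x3ff + 31 - lz
	           *hi = sign | (exp << 20) | ((uint32_t)(mant >> 32) & 0xfffff);
	           *lo = (uint32_t)mant;
	       }  */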
2092
2093#endif /* L_floatsidf */
2094
2095#ifdef L_floatdidf
2096
2097	.align	4
2098	.global	__floatundidf
2099	.type	__floatundidf, @function
2100__floatundidf:
2101	leaf_entry sp, 16
2102
2103	/* Check for zero.  */
2104	or	a4, xh, xl
2105	beqz	a4, 2f
2106
2107	/* Set the sign to zero and jump to the floatdidf code.  */
2108	movi	a7, 0
2109	j	.Lfloatdidf_normalize
2110
2111	.align	4
2112	.global	__floatdidf
2113	.type	__floatdidf, @function
2114__floatdidf:
2115	leaf_entry sp, 16
2116
2117	/* Check for zero.  */
2118	or	a4, xh, xl
2119	beqz	a4, 2f
2120
2121	/* Save the sign.  */
2122	extui	a7, xh, 31, 1
2123
2124	/* Get the absolute value.  */
2125	bgez	xh, .Lfloatdidf_normalize
2126	neg	xl, xl
2127	neg	xh, xh
2128	beqz	xl, .Lfloatdidf_normalize
2129	addi	xh, xh, -1
2130
2131.Lfloatdidf_normalize:
2132	/* Normalize with the first 1 bit in the msb of xh.  */
2133	beqz	xh, .Lfloatdidf_bigshift
2134	do_nsau	a4, xh, a5, a6
2135	ssl	a4
2136	src	xh, xh, xl
2137	sll	xl, xl
2138
2139.Lfloatdidf_shifted:
2140	/* Shift the mantissa into position, with rounding bits in a6.  */
2141	ssai	11
2142	sll	a6, xl
2143	src	xl, xh, xl
2144	srl	xh, xh
2145
2146	/* Set the exponent.  */
2147	movi	a5, 0x43d	/* 0x3fe + 63 */
2148	sub	a5, a5, a4
2149	slli	a5, a5, 20
2150	add	xh, xh, a5
2151
2152	/* Add the sign.  */
2153	slli	a7, a7, 31
2154	or	xh, xh, a7
2155
2156	/* Round up if the leftover fraction is >= 1/2.  */
2157	bgez	a6, 2f
2158	addi	xl, xl, 1
2159	beqz	xl, .Lfloatdidf_roundcarry
2160
2161	/* Check if the leftover fraction is exactly 1/2.  */
2162	slli	a6, a6, 1
2163	beqz	a6, .Lfloatdidf_exactlyhalf
21642:	leaf_return
2165
2166.Lfloatdidf_bigshift:
2167	/* xh is zero.  Normalize with first 1 bit of xl in the msb of xh.  */
2168	do_nsau	a4, xl, a5, a6
2169	ssl	a4
2170	sll	xh, xl
2171	movi	xl, 0
2172	addi	a4, a4, 32
2173	j	.Lfloatdidf_shifted
2174
2175.Lfloatdidf_exactlyhalf:
2176	/* Round down to the nearest even value.  */
2177	srli	xl, xl, 1
2178	slli	xl, xl, 1
2179	leaf_return
2180
2181.Lfloatdidf_roundcarry:
2182	/* xl is always zero when the rounding increment overflows, so
2183	   there's no need to round it to an even value.  */
2184	addi	xh, xh, 1
2185	/* Overflow to the exponent is OK.  */
2186	leaf_return
2187
2188#endif /* L_floatdidf */
2189
2190#ifdef L_truncdfsf2
2191
2192	.align	4
2193	.global	__truncdfsf2
2194	.type	__truncdfsf2, @function
2195__truncdfsf2:
2196	leaf_entry sp, 16
2197
2198	/* Adjust the exponent bias.  */
2199	movi	a4, (0x3ff - 0x7f) << 20
2200	sub	a5, xh, a4
2201
2202	/* Check for underflow.  */
2203	xor	a6, xh, a5
2204	bltz	a6, .Ltrunc_underflow
2205	extui	a6, a5, 20, 11
2206	beqz	a6, .Ltrunc_underflow
2207
2208	/* Check for overflow.  */
2209	movi	a4, 255
2210	bge	a6, a4, .Ltrunc_overflow
2211
2212	/* Shift a5/xl << 3 into a5/a4.  */
2213	ssai	(32 - 3)
2214	src	a5, a5, xl
2215	sll	a4, xl
2216
2217.Ltrunc_addsign:
2218	/* Add the sign bit.  */
2219	extui	a6, xh, 31, 1
2220	slli	a6, a6, 31
2221	or	a2, a6, a5
2222
2223	/* Round up if the leftover fraction is >= 1/2.  */
2224	bgez	a4, 1f
2225	addi	a2, a2, 1
2226	/* Overflow to the exponent is OK.  The answer will be correct.  */
2227
2228	/* Check if the leftover fraction is exactly 1/2.  */
2229	slli	a4, a4, 1
2230	beqz	a4, .Ltrunc_exactlyhalf
22311:	leaf_return
2232
2233.Ltrunc_exactlyhalf:
2234	/* Round down to the nearest even value.  */
2235	srli	a2, a2, 1
2236	slli	a2, a2, 1
2237	leaf_return
2238
2239.Ltrunc_overflow:
2240	/* Check if exponent == 0x7ff.  */
2241	movi	a4, 0x7ff00000
2242	bnall	xh, a4, 1f
2243
2244	/* Check if mantissa is nonzero.  */
2245	slli	a5, xh, 12
2246	or	a5, a5, xl
2247	beqz	a5, 1f
2248
2249	/* Shift a4 to set a bit in the mantissa, making a quiet NaN.  */
2250	srli	a4, a4, 1
2251
22521:	slli	a4, a4, 4	/* 0xff000000 or 0xff800000 */
2253	/* Add the sign bit.  */
2254	extui	a6, xh, 31, 1
2255	ssai	1
2256	src	a2, a6, a4
2257	leaf_return
2258
2259.Ltrunc_underflow:
2260	/* Find shift count for a subnormal.  Flush to zero if >= 32.  */
2261	extui	a6, xh, 20, 11
2262	movi	a5, 0x3ff - 0x7f
2263	sub	a6, a5, a6
2264	addi	a6, a6, 1
2265	bgeui	a6, 32, 1f
2266
2267	/* Replace the exponent with an explicit "1.0".  */
2268	slli	a5, a5, 13	/* 0x700000 */
2269	or	a5, a5, xh
2270	slli	a5, a5, 11
2271	srli	a5, a5, 11
2272
2273	/* Shift the mantissa left by 3 bits (into a5/a4).  */
2274	ssai	(32 - 3)
2275	src	a5, a5, xl
2276	sll	a4, xl
2277
2278	/* Shift right by a6.  */
2279	ssr	a6
2280	sll	a7, a4
2281	src	a4, a5, a4
2282	srl	a5, a5
2283	beqz	a7, .Ltrunc_addsign
2284	or	a4, a4, a6	/* any positive, nonzero value will work */
2285	j	.Ltrunc_addsign
2286
2287	/* Return +/- zero.  */
22881:	extui	a2, xh, 31, 1
2289	slli	a2, a2, 31
2290	leaf_return
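	/* A C sketch of the normal (non-underflowing) path above
	   (illustrative only): rebias the exponent, keep the top 23 fraction
	   bits plus the shifted-out bits as round/sticky information, and
	   round to nearest even.  The subnormal-result path, which
	   .Ltrunc_underflow handles with an extra right shift and sticky
	   collection, is not repeated here.

	       #include <stdint.h>

	       uint32_t truncdfsf2_sketch(uint32_t hi, uint32_t lo)
	       {
	           uint32_t sign = hi & 0x80000000u;
	           int32_t  e    = (int32_t)((hi >> 20) & 0x7ff) - (0x3ff - 0x7f);
	           if (e >= 255) {                           // Inf, NaN or overflow
	               if (((hi >> 20) & 0x7ff) == 0x7ff && ((hi << 12) | lo))
	                   return sign | 0x7fc00000;         // quiet NaN
	               return sign | 0x7f800000;             // +/- Infinity
	           }
	           if (e <= 0)                               // handled by .Ltrunc_underflow
	               return sign;
	           uint32_t frac  = ((hi & 0xfffff) << 3) | (lo >> 29);
	           uint32_t guard = lo << 3;
	           uint32_t r = sign | ((uint32_t)e << 23) | frac;
	           if (guard & 0x80000000u) {                // round to nearest...
	               r += 1;                               // (a carry into the exponent
	               if ((guard << 1) == 0)                // is still correct)
	                   r &= ~1u;                         // ...ties to even
	           }
	           return r;
	       }  */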
2291
2292#endif /* L_truncdfsf2 */
2293
2294#ifdef L_extendsfdf2
2295
2296	.align	4
2297	.global	__extendsfdf2
2298	.type	__extendsfdf2, @function
2299__extendsfdf2:
2300	leaf_entry sp, 16
2301
2302	/* Save the sign bit and then shift it off.  */
2303	extui	a5, a2, 31, 1
2304	slli	a5, a5, 31
2305	slli	a4, a2, 1
2306
2307	/* Extract and check the exponent.  */
2308	extui	a6, a2, 23, 8
2309	beqz	a6, .Lextend_expzero
2310	addi	a6, a6, 1
2311	beqi	a6, 256, .Lextend_nan_or_inf
2312
2313	/* Shift >> 3 into a4/xl.  */
2314	srli	a4, a4, 4
2315	slli	xl, a2, (32 - 3)
2316
2317	/* Adjust the exponent bias.  */
2318	movi	a6, (0x3ff - 0x7f) << 20
2319	add	a4, a4, a6
2320
2321	/* Add the sign bit.  */
2322	or	xh, a4, a5
2323	leaf_return
2324
2325.Lextend_nan_or_inf:
2326	movi	a4, 0x7ff00000
2327
2328	/* Check for NaN.  */
2329	slli	a7, a2, 9
2330	beqz	a7, 1f
2331
2332	slli	a6, a6, 11	/* 0x80000 */
2333	or	a4, a4, a6
2334
2335	/* Add the sign and return.  */
23361:	or	xh, a4, a5
2337	movi	xl, 0
2338	leaf_return
2339
2340.Lextend_expzero:
2341	beqz	a4, 1b
2342
2343	/* Normalize it to have 8 zero bits before the first 1 bit.  */
2344	do_nsau	a7, a4, a2, a3
2345	addi	a7, a7, -8
2346	ssl	a7
2347	sll	a4, a4
2348
2349	/* Shift >> 3 into a4/xl.  */
2350	slli	xl, a4, (32 - 3)
2351	srli	a4, a4, 3
2352
2353	/* Set the exponent.  */
2354	movi	a6, 0x3fe - 0x7f
2355	sub	a6, a6, a7
2356	slli	a6, a6, 20
2357	add	a4, a4, a6
2358
2359	/* Add the sign and return.  */
2360	or	xh, a4, a5
2361	leaf_return
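	/* A C sketch of the widening above (illustrative only; __builtin_clz
	   stands in for do_nsau): the conversion is exact, so it is just an
	   exponent rebias and a fraction shift, with zeros, subnormals,
	   Infinities and NaNs picked off first.

	       #include <stdint.h>

	       void extendsfdf2_sketch(uint32_t f, uint32_t *hi, uint32_t *lo)
	       {
	           uint32_t sign = f & 0x80000000u;
	           uint32_t e    = (f >> 23) & 0xff;
	           uint32_t frac = f & 0x7fffff;
	           if (e == 0xff) {                          // Inf or NaN
	               *hi = sign | 0x7ff00000 | (frac ? 0x80000 : 0);
	               *lo = 0;
	               return;
	           }
	           if (e == 0) {
	               if (frac == 0) { *hi = sign; *lo = 0; return; }
	               // subnormal input: renormalize into a normal double
	               int p = 31 - __builtin_clz(frac);     // position of the leading 1
	               uint64_t m = ((uint64_t)frac << (52 - p)) & ((1ull << 52) - 1);
	               *hi = sign | ((uint32_t)(0x36a + p) << 20) | (uint32_t)(m >> 32);
	               *lo = (uint32_t)m;
	               return;
	           }
	           *hi = sign | ((e + (0x3ff - 0x7f)) << 20) | (frac >> 3);
	           *lo = frac << 29;
	       }  */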
2362
2363#endif /* L_extendsfdf2 */
2364
2365
2366