/* Assembly functions for the Xtensa version of libgcc1.
   Copyright (C) 2001-2021 Free Software Foundation, Inc.
   Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include "xtensa-config.h"

/* Define macros for the ABS and ADDX* instructions to handle cases
   where they are not included in the Xtensa processor configuration.  */

	.macro	do_abs dst, src, tmp
#if XCHAL_HAVE_ABS
	abs	\dst, \src
#else
	neg	\tmp, \src
	movgez	\tmp, \src, \src
	mov	\dst, \tmp
#endif
	.endm

	.macro	do_addx2 dst, as, at, tmp
#if XCHAL_HAVE_ADDX
	addx2	\dst, \as, \at
#else
	slli	\tmp, \as, 1
	add	\dst, \tmp, \at
#endif
	.endm

	.macro	do_addx4 dst, as, at, tmp
#if XCHAL_HAVE_ADDX
	addx4	\dst, \as, \at
#else
	slli	\tmp, \as, 2
	add	\dst, \tmp, \at
#endif
	.endm

	.macro	do_addx8 dst, as, at, tmp
#if XCHAL_HAVE_ADDX
	addx8	\dst, \as, \at
#else
	slli	\tmp, \as, 3
	add	\dst, \tmp, \at
#endif
	.endm
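
/* For reference, these macros compute the following C equivalents
   (a rough sketch assuming <stdint.h>; the function names here are
   illustrative only and are not part of this file):

     int32_t  do_abs (int32_t x) { return x < 0 ? -x : x; }
     uint32_t do_addx2 (uint32_t a, uint32_t b) { return (a << 1) + b; }
     uint32_t do_addx4 (uint32_t a, uint32_t b) { return (a << 2) + b; }
     uint32_t do_addx8 (uint32_t a, uint32_t b) { return (a << 3) + b; }

   In the do_abs fallback, MOVGEZ conditionally replaces the negated
   value in tmp with the original src when src is nonnegative.  */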

/* Define macros for leaf function entry and return, supporting either the
   standard register windowed ABI or the non-windowed call0 ABI.  These
   macros do not allocate any extra stack space, so they only work for
   leaf functions that do not need to spill anything to the stack.  */

	.macro leaf_entry reg, size
#if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__
	entry \reg, \size
#else
	/* do nothing */
#endif
	.endm

	.macro leaf_return
#if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__
	retw
#else
	ret
#endif
	.endm


#ifdef L_mulsi3
	.align	4
	.global	__mulsi3
	.type	__mulsi3, @function
__mulsi3:
	leaf_entry sp, 16

#if XCHAL_HAVE_MUL32
	mull	a2, a2, a3

#elif XCHAL_HAVE_MUL16
	or	a4, a2, a3
	srai	a4, a4, 16
	bnez	a4, .LMUL16
	mul16u	a2, a2, a3
	leaf_return
.LMUL16:
	srai	a4, a2, 16
	srai	a5, a3, 16
	mul16u	a7, a4, a3
	mul16u	a6, a5, a2
	mul16u	a4, a2, a3
	add	a7, a7, a6
	slli	a7, a7, 16
	add	a2, a7, a4
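
	/* The three mul16u products above compute, in C terms (a rough
	   sketch assuming <stdint.h>; illustrative only):

	     uint32_t mul32 (uint32_t a, uint32_t b)
	     {
	       uint32_t al = a & 0xffff, ah = a >> 16;
	       uint32_t bl = b & 0xffff, bh = b >> 16;
	       return al * bl + ((ah * bl + al * bh) << 16);
	     }

	   The ah * bh term would be shifted entirely past bit 31, so it
	   cannot affect the low 32 bits and is omitted.  */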

#elif XCHAL_HAVE_MAC16
	mul.aa.hl a2, a3
	mula.aa.lh a2, a3
	rsr	a5, ACCLO
	umul.aa.ll a2, a3
	rsr	a4, ACCLO
	slli	a5, a5, 16
	add	a2, a4, a5

#else /* !MUL32 && !MUL16 && !MAC16 */

	/* Multiply one bit at a time, but unroll the loop 4x to better
	   exploit the addx instructions and avoid overhead.
	   Peel the first iteration to save a cycle on init.  */
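
	/* In C terms, the unrolled loop below is roughly (a sketch
	   assuming <stdint.h>; illustrative only, with the sign
	   handling already done):

	     uint32_t mul32 (uint32_t a, uint32_t b)
	     {
	       uint32_t p = 0;
	       do
	         {
	           if (b & 1) p = a + p;
	           if (b & 2) p = (a << 1) + p;
	           if (b & 4) p = (a << 2) + p;
	           if (b & 8) p = (a << 3) + p;
	           b >>= 4;
	           a <<= 4;
	         }
	       while (b != 0);
	       return p;
	     }

	   with b chosen as the smaller operand so the loop exits sooner.  */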

	/* Avoid negative numbers.  */
	xor	a5, a2, a3	/* Top bit is 1 if one input is negative.  */
	do_abs	a3, a3, a6
	do_abs	a2, a2, a6

	/* Swap so the second argument is smaller.  */
	sub	a7, a2, a3
	mov	a4, a3
	movgez	a4, a2, a7	/* a4 = max (a2, a3) */
	movltz	a3, a2, a7	/* a3 = min (a2, a3) */

	movi	a2, 0
	extui	a6, a3, 0, 1
	movnez	a2, a4, a6

	do_addx2 a7, a4, a2, a7
	extui	a6, a3, 1, 1
	movnez	a2, a7, a6

	do_addx4 a7, a4, a2, a7
	extui	a6, a3, 2, 1
	movnez	a2, a7, a6

	do_addx8 a7, a4, a2, a7
	extui	a6, a3, 3, 1
	movnez	a2, a7, a6

	bgeui	a3, 16, .Lmult_main_loop
	neg	a3, a2
	movltz	a2, a3, a5
	leaf_return

	.align	4
.Lmult_main_loop:
	srli	a3, a3, 4
	slli	a4, a4, 4

	add	a7, a4, a2
	extui	a6, a3, 0, 1
	movnez	a2, a7, a6

	do_addx2 a7, a4, a2, a7
	extui	a6, a3, 1, 1
	movnez	a2, a7, a6

	do_addx4 a7, a4, a2, a7
	extui	a6, a3, 2, 1
	movnez	a2, a7, a6

	do_addx8 a7, a4, a2, a7
	extui	a6, a3, 3, 1
	movnez	a2, a7, a6

	bgeui	a3, 16, .Lmult_main_loop

	neg	a3, a2
	movltz	a2, a3, a5

#endif /* !MUL32 && !MUL16 && !MAC16 */

	leaf_return
	.size	__mulsi3, . - __mulsi3

#endif /* L_mulsi3 */


#ifdef L_umulsidi3

#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
#define XCHAL_NO_MUL 1
#endif

	.align	4
	.global	__umulsidi3
	.type	__umulsidi3, @function
__umulsidi3:
#if __XTENSA_CALL0_ABI__
	leaf_entry sp, 32
	addi	sp, sp, -32
	s32i	a12, sp, 16
	s32i	a13, sp, 20
	s32i	a14, sp, 24
	s32i	a15, sp, 28
#elif XCHAL_NO_MUL
	/* This is not really a leaf function; allocate enough stack space
	   to allow CALL12s to a helper function.  */
	leaf_entry sp, 48
#else
	leaf_entry sp, 16
#endif

#ifdef __XTENSA_EB__
#define wh a2
#define wl a3
#else
#define wh a3
#define wl a2
#endif /* __XTENSA_EB__ */

	/* This code is taken from the mulsf3 routine in ieee754-sf.S.
	   See more comments there.  */
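
	/* In outline, the partial-product scheme used below is (a rough
	   C sketch assuming <stdint.h>; not part of this file):

	     uint64_t umul64 (uint32_t a, uint32_t b)
	     {
	       uint64_t pp0 = (uint64_t) (a & 0xffff) * (b & 0xffff);
	       uint64_t pp1 = (uint64_t) (a & 0xffff) * (b >> 16);
	       uint64_t pp2 = (uint64_t) (a >> 16) * (b & 0xffff);
	       uint64_t pp3 = (uint64_t) (a >> 16) * (b >> 16);
	       return (pp3 << 32) + ((pp1 + pp2) << 16) + pp0;
	     }

	   Since each partial product is computed in a 32-bit register,
	   the carry out of pp1 + pp2 (and out of the low word) must be
	   tracked explicitly; that is what a9 does below.  */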

#if XCHAL_HAVE_MUL32_HIGH
	mull	a6, a2, a3
	muluh	wh, a2, a3
	mov	wl, a6

#else /* ! MUL32_HIGH */

#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
	/* a0 and a8 will be clobbered by calling the multiply function
	   but a8 is not used here and need not be saved.  */
	s32i	a0, sp, 0
#endif

#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32

#define a2h a4
#define a3h a5

	/* Get the high halves of the inputs into registers.  */
	srli	a2h, a2, 16
	srli	a3h, a3, 16

#define a2l a2
#define a3l a3

#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
	/* Clear the high halves of the inputs.  This does not matter
	   for MUL16 because the high bits are ignored.  */
	extui	a2, a2, 0, 16
	extui	a3, a3, 0, 16
#endif
#endif /* MUL16 || MUL32 */


#if XCHAL_HAVE_MUL16

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	mul16u	dst, xreg ## xhalf, yreg ## yhalf

#elif XCHAL_HAVE_MUL32

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	mull	dst, xreg ## xhalf, yreg ## yhalf

#elif XCHAL_HAVE_MAC16

/* The preprocessor insists on inserting a space when concatenating after
   a period in the definition of do_mul below.  These macros are a workaround
   using underscores instead of periods when doing the concatenation.  */
#define umul_aa_ll umul.aa.ll
#define umul_aa_lh umul.aa.lh
#define umul_aa_hl umul.aa.hl
#define umul_aa_hh umul.aa.hh

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	umul_aa_ ## xhalf ## yhalf	xreg, yreg; \
	rsr	dst, ACCLO

#else /* no multiply hardware */

#define set_arg_l(dst, src) \
	extui	dst, src, 0, 16
#define set_arg_h(dst, src) \
	srli	dst, src, 16

#if __XTENSA_CALL0_ABI__
#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	set_arg_ ## xhalf (a13, xreg); \
	set_arg_ ## yhalf (a14, yreg); \
	call0	.Lmul_mulsi3; \
	mov	dst, a12
#else
#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	set_arg_ ## xhalf (a14, xreg); \
	set_arg_ ## yhalf (a15, yreg); \
	call12	.Lmul_mulsi3; \
	mov	dst, a14
#endif /* __XTENSA_CALL0_ABI__ */

#endif /* no multiply hardware */

	/* Add pp1 and pp2 into a6 with carry-out in a9.  */
	do_mul(a6, a2, l, a3, h)	/* pp 1 */
	do_mul(a11, a2, h, a3, l)	/* pp 2 */
	movi	a9, 0
	add	a6, a6, a11
	bgeu	a6, a11, 1f
	addi	a9, a9, 1
1:
	/* Shift the high half of a9/a6 into position in a9.  Note that
	   this value can be safely incremented without any carry-outs.  */
	ssai	16
	src	a9, a9, a6

	/* Compute the low word into a6.  */
	do_mul(a11, a2, l, a3, l)	/* pp 0 */
	sll	a6, a6
	add	a6, a6, a11
	bgeu	a6, a11, 1f
	addi	a9, a9, 1
1:
	/* Compute the high word into wh.  */
	do_mul(wh, a2, h, a3, h)	/* pp 3 */
	add	wh, wh, a9
	mov	wl, a6

#endif /* !MUL32_HIGH */

#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
	/* Restore the original return address.  */
	l32i	a0, sp, 0
#endif
#if __XTENSA_CALL0_ABI__
	l32i	a12, sp, 16
	l32i	a13, sp, 20
	l32i	a14, sp, 24
	l32i	a15, sp, 28
	addi	sp, sp, 32
#endif
	leaf_return

#if XCHAL_NO_MUL

	/* For Xtensa processors with no multiply hardware, this simplified
	   version of _mulsi3 is used for multiplying 16-bit chunks of
	   the floating-point mantissas.  When using CALL0, this function
	   uses a custom ABI: the inputs are passed in a13 and a14, the
	   result is returned in a12, and a8 and a15 are clobbered.  */
	.align	4
.Lmul_mulsi3:
	leaf_entry sp, 16
	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
	movi	\dst, 0
1:	add	\tmp1, \src2, \dst
	extui	\tmp2, \src1, 0, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx2 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 1, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx4 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 2, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx8 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 3, 1
	movnez	\dst, \tmp1, \tmp2

	srli	\src1, \src1, 4
	slli	\src2, \src2, 4
	bnez	\src1, 1b
	.endm
#if __XTENSA_CALL0_ABI__
	mul_mulsi3_body a12, a13, a14, a15, a8
#else
	/* The result will be written into a2, so save that argument in a4.  */
	mov	a4, a2
	mul_mulsi3_body a2, a4, a3, a5, a6
#endif
	leaf_return
#endif /* XCHAL_NO_MUL */

	.size	__umulsidi3, . - __umulsidi3

#endif /* L_umulsidi3 */


/* Define a macro for the NSAU (unsigned normalize shift amount)
   instruction, which computes the number of leading zero bits,
   to handle cases where it is not included in the Xtensa processor
   configuration.  */

	.macro	do_nsau cnt, val, tmp, a
#if XCHAL_HAVE_NSA
	nsau	\cnt, \val
#else
	mov	\a, \val
	movi	\cnt, 0
	extui	\tmp, \a, 16, 16
	bnez	\tmp, 0f
	movi	\cnt, 16
	slli	\a, \a, 16
0:
	extui	\tmp, \a, 24, 8
	bnez	\tmp, 1f
	addi	\cnt, \cnt, 8
	slli	\a, \a, 8
1:
	movi	\tmp, __nsau_data
	extui	\a, \a, 24, 8
	add	\tmp, \tmp, \a
	l8ui	\tmp, \tmp, 0
	add	\cnt, \cnt, \tmp
#endif /* !XCHAL_HAVE_NSA */
	.endm
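
/* In C terms, the table-based fallback above computes (a rough sketch
   assuming <stdint.h>; the function name is illustrative only):

     extern const unsigned char __nsau_data[256];

     uint32_t nsau (uint32_t v)
     {
       uint32_t cnt = 0;
       if ((v >> 16) == 0) { cnt = 16; v <<= 16; }
       if ((v >> 24) == 0) { cnt += 8; v <<= 8; }
       return cnt + __nsau_data[v >> 24];
     }

   Since __nsau_data[0] == 8, an input of zero yields 16 + 8 + 8 == 32,
   matching the NSAU instruction.  */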

#ifdef L_clz
	.section .rodata
	.align	4
	.global	__nsau_data
	.type	__nsau_data, @object
__nsau_data:
#if !XCHAL_HAVE_NSA
	.byte	8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
	.byte	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
	.byte	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
	.byte	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
#endif /* !XCHAL_HAVE_NSA */
	.size	__nsau_data, . - __nsau_data
	.hidden	__nsau_data
#endif /* L_clz */


#ifdef L_clzsi2
	.align	4
	.global	__clzsi2
	.type	__clzsi2, @function
__clzsi2:
	leaf_entry sp, 16
	do_nsau	a2, a2, a3, a4
	leaf_return
	.size	__clzsi2, . - __clzsi2

#endif /* L_clzsi2 */


#ifdef L_ctzsi2
	.align	4
	.global	__ctzsi2
	.type	__ctzsi2, @function
__ctzsi2:
	leaf_entry sp, 16
	neg	a3, a2
	and	a3, a3, a2
	do_nsau	a2, a3, a4, a5
	neg	a2, a2
	addi	a2, a2, 31
	leaf_return
	.size	__ctzsi2, . - __ctzsi2

#endif /* L_ctzsi2 */
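
/* A rough C equivalent of __ctzsi2 (using the hypothetical nsau()
   sketched above; not part of this file):

     int ctzsi2 (uint32_t x) { return 31 - nsau (x & -x); }

   x & -x isolates the lowest set bit, and NSAU then gives its distance
   from bit 31.  An input of zero yields 31 - 32 == -1, but the result
   for zero is undefined for __builtin_ctz anyway.  */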


#ifdef L_ffssi2
	.align	4
	.global	__ffssi2
	.type	__ffssi2, @function
__ffssi2:
	leaf_entry sp, 16
	neg	a3, a2
	and	a3, a3, a2
	do_nsau	a2, a3, a4, a5
	neg	a2, a2
	addi	a2, a2, 32
	leaf_return
	.size	__ffssi2, . - __ffssi2

#endif /* L_ffssi2 */
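
/* A rough C equivalent of __ffssi2 (same hypothetical nsau()):

     int ffssi2 (uint32_t x) { return 32 - nsau (x & -x); }

   Because nsau (0) == 32, this correctly returns 0 when no bits are
   set, as ffs() requires.  */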


#ifdef L_udivsi3
	.align	4
	.global	__udivsi3
	.type	__udivsi3, @function
__udivsi3:
	leaf_entry sp, 16
#if XCHAL_HAVE_DIV32
	quou	a2, a2, a3
#else
	bltui	a3, 2, .Lle_one	/* check if the divisor <= 1 */

	mov	a6, a2		/* keep dividend in a6 */
	do_nsau	a5, a6, a2, a7	/* dividend_shift = nsau (dividend) */
	do_nsau	a4, a3, a2, a7	/* divisor_shift = nsau (divisor) */
	bgeu	a5, a4, .Lspecial

	sub	a4, a4, a5	/* count = divisor_shift - dividend_shift */
	ssl	a4
	sll	a3, a3		/* divisor <<= count */
	movi	a2, 0		/* quotient = 0 */

	/* test-subtract-and-shift loop; one quotient bit on each iteration */
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lloopend
#endif /* XCHAL_HAVE_LOOPS */
.Lloop:
	bltu	a6, a3, .Lzerobit
	sub	a6, a6, a3
	addi	a2, a2, 1
.Lzerobit:
	slli	a2, a2, 1
	srli	a3, a3, 1
#if !XCHAL_HAVE_LOOPS
	addi	a4, a4, -1
	bnez	a4, .Lloop
#endif /* !XCHAL_HAVE_LOOPS */
.Lloopend:

	bltu	a6, a3, .Lreturn
	addi	a2, a2, 1	/* increment quotient if dividend >= divisor */
.Lreturn:
	leaf_return

.Lle_one:
	beqz	a3, .Lerror	/* if divisor == 1, return the dividend */
	leaf_return

.Lspecial:
	/* return dividend >= divisor */
	bltu	a6, a3, .Lreturn0
	movi	a2, 1
	leaf_return

.Lerror:
	/* Divide by zero: Use an illegal instruction to force an exception.
	   The subsequent "DIV0" string can be recognized by the exception
	   handler to identify the real cause of the exception.  */
	ill
	.ascii	"DIV0"

.Lreturn0:
	movi	a2, 0
#endif /* XCHAL_HAVE_DIV32 */
	leaf_return
	.size	__udivsi3, . - __udivsi3

#endif /* L_udivsi3 */
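
/* The shift-subtract loop above corresponds to this C sketch (a rough
   outline assuming <stdint.h> and the hypothetical nsau(); it covers
   only the main path, where d >= 2 and nsau (n) < nsau (d)):

     uint32_t udiv (uint32_t n, uint32_t d)
     {
       uint32_t count = nsau (d) - nsau (n);
       uint32_t q = 0;
       d <<= count;
       while (count-- != 0)
         {
           if (n >= d) { n -= d; q++; }
           q <<= 1;
           d >>= 1;
         }
       if (n >= d) q++;
       return q;
     }

   The final conditional increment supplies the last of the count + 1
   quotient bits.  */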


#ifdef L_divsi3
	.align	4
	.global	__divsi3
	.type	__divsi3, @function
__divsi3:
	leaf_entry sp, 16
#if XCHAL_HAVE_DIV32
	quos	a2, a2, a3
#else
	xor	a7, a2, a3	/* sign = dividend ^ divisor */
	do_abs	a6, a2, a4	/* udividend = abs (dividend) */
	do_abs	a3, a3, a4	/* udivisor = abs (divisor) */
	bltui	a3, 2, .Lle_one	/* check if udivisor <= 1 */
	do_nsau	a5, a6, a2, a8	/* udividend_shift = nsau (udividend) */
	do_nsau	a4, a3, a2, a8	/* udivisor_shift = nsau (udivisor) */
	bgeu	a5, a4, .Lspecial

	sub	a4, a4, a5	/* count = udivisor_shift - udividend_shift */
	ssl	a4
	sll	a3, a3		/* udivisor <<= count */
	movi	a2, 0		/* quotient = 0 */

	/* test-subtract-and-shift loop; one quotient bit on each iteration */
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lloopend
#endif /* XCHAL_HAVE_LOOPS */
.Lloop:
	bltu	a6, a3, .Lzerobit
	sub	a6, a6, a3
	addi	a2, a2, 1
.Lzerobit:
	slli	a2, a2, 1
	srli	a3, a3, 1
#if !XCHAL_HAVE_LOOPS
	addi	a4, a4, -1
	bnez	a4, .Lloop
#endif /* !XCHAL_HAVE_LOOPS */
.Lloopend:

	bltu	a6, a3, .Lreturn
	addi	a2, a2, 1	/* increment if udividend >= udivisor */
.Lreturn:
	neg	a5, a2
	movltz	a2, a5, a7	/* return (sign < 0) ? -quotient : quotient */
	leaf_return

.Lle_one:
	beqz	a3, .Lerror
	neg	a2, a6		/* if udivisor == 1, then return... */
	movgez	a2, a6, a7	/* (sign < 0) ? -udividend : udividend */
	leaf_return

.Lspecial:
	bltu	a6, a3, .Lreturn0 /* if dividend < divisor, return 0 */
	movi	a2, 1
	movi	a4, -1
	movltz	a2, a4, a7	/* else return (sign < 0) ? -1 : 1 */
	leaf_return

.Lerror:
	/* Divide by zero: Use an illegal instruction to force an exception.
	   The subsequent "DIV0" string can be recognized by the exception
	   handler to identify the real cause of the exception.  */
	ill
	.ascii	"DIV0"

.Lreturn0:
	movi	a2, 0
#endif /* XCHAL_HAVE_DIV32 */
	leaf_return
	.size	__divsi3, . - __divsi3

#endif /* L_divsi3 */
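
/* The wrapper logic above is the usual truncating-division sign rule;
   as a rough C sketch (with the hypothetical udiv() from the previous
   section):

     int32_t div32 (int32_t a, int32_t b)
     {
       uint32_t q = udiv (a < 0 ? -(uint32_t) a : a,
			  b < 0 ? -(uint32_t) b : b);
       return ((a ^ b) < 0) ? -(int32_t) q : (int32_t) q;
     }  */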


#ifdef L_umodsi3
	.align	4
	.global	__umodsi3
	.type	__umodsi3, @function
__umodsi3:
	leaf_entry sp, 16
#if XCHAL_HAVE_DIV32
	remu	a2, a2, a3
#else
	bltui	a3, 2, .Lle_one	/* check if the divisor is <= 1 */

	do_nsau	a5, a2, a6, a7	/* dividend_shift = nsau (dividend) */
	do_nsau	a4, a3, a6, a7	/* divisor_shift = nsau (divisor) */
	bgeu	a5, a4, .Lspecial

	sub	a4, a4, a5	/* count = divisor_shift - dividend_shift */
	ssl	a4
	sll	a3, a3		/* divisor <<= count */

	/* test-subtract-and-shift loop */
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lloopend
#endif /* XCHAL_HAVE_LOOPS */
.Lloop:
	bltu	a2, a3, .Lzerobit
	sub	a2, a2, a3
.Lzerobit:
	srli	a3, a3, 1
#if !XCHAL_HAVE_LOOPS
	addi	a4, a4, -1
	bnez	a4, .Lloop
#endif /* !XCHAL_HAVE_LOOPS */
.Lloopend:

.Lspecial:
	bltu	a2, a3, .Lreturn
	sub	a2, a2, a3	/* subtract once more if dividend >= divisor */
.Lreturn:
	leaf_return

.Lle_one:
	bnez	a3, .Lreturn0

	/* Divide by zero: Use an illegal instruction to force an exception.
	   The subsequent "DIV0" string can be recognized by the exception
	   handler to identify the real cause of the exception.  */
	ill
	.ascii	"DIV0"

.Lreturn0:
	movi	a2, 0
#endif /* XCHAL_HAVE_DIV32 */
	leaf_return
	.size	__umodsi3, . - __umodsi3

#endif /* L_umodsi3 */
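
/* __umodsi3 runs the same shift-subtract loop but keeps only the
   remainder; as a rough C sketch (same assumptions as the udiv()
   sketch above, i.e. d >= 2 and nsau (n) < nsau (d)):

     uint32_t umod (uint32_t n, uint32_t d)
     {
       uint32_t count = nsau (d) - nsau (n);
       d <<= count;
       while (count-- != 0)
         {
           if (n >= d) n -= d;
           d >>= 1;
         }
       if (n >= d) n -= d;
       return n;
     }  */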


#ifdef L_modsi3
	.align	4
	.global	__modsi3
	.type	__modsi3, @function
__modsi3:
	leaf_entry sp, 16
#if XCHAL_HAVE_DIV32
	rems	a2, a2, a3
#else
	mov	a7, a2		/* save original (signed) dividend */
	do_abs	a2, a2, a4	/* udividend = abs (dividend) */
	do_abs	a3, a3, a4	/* udivisor = abs (divisor) */
	bltui	a3, 2, .Lle_one	/* check if udivisor <= 1 */
	do_nsau	a5, a2, a6, a8	/* udividend_shift = nsau (udividend) */
	do_nsau	a4, a3, a6, a8	/* udivisor_shift = nsau (udivisor) */
	bgeu	a5, a4, .Lspecial

	sub	a4, a4, a5	/* count = udivisor_shift - udividend_shift */
	ssl	a4
	sll	a3, a3		/* udivisor <<= count */

	/* test-subtract-and-shift loop */
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lloopend
#endif /* XCHAL_HAVE_LOOPS */
.Lloop:
	bltu	a2, a3, .Lzerobit
	sub	a2, a2, a3
.Lzerobit:
	srli	a3, a3, 1
#if !XCHAL_HAVE_LOOPS
	addi	a4, a4, -1
	bnez	a4, .Lloop
#endif /* !XCHAL_HAVE_LOOPS */
.Lloopend:

.Lspecial:
	bltu	a2, a3, .Lreturn
	sub	a2, a2, a3	/* subtract again if udividend >= udivisor */
.Lreturn:
	bgez	a7, .Lpositive
	neg	a2, a2		/* if (dividend < 0), return -udividend */
.Lpositive:
	leaf_return

.Lle_one:
	bnez	a3, .Lreturn0

	/* Divide by zero: Use an illegal instruction to force an exception.
	   The subsequent "DIV0" string can be recognized by the exception
	   handler to identify the real cause of the exception.  */
	ill
	.ascii	"DIV0"

.Lreturn0:
	movi	a2, 0
#endif /* XCHAL_HAVE_DIV32 */
	leaf_return
	.size	__modsi3, . - __modsi3

#endif /* L_modsi3 */
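
/* As in C, the remainder takes the sign of the dividend, which is what
   the final negation in __modsi3 implements.  A rough sketch (with the
   hypothetical umod() above):

     int32_t mod32 (int32_t a, int32_t b)
     {
       uint32_t r = umod (a < 0 ? -(uint32_t) a : a,
			  b < 0 ? -(uint32_t) b : b);
       return (a < 0) ? -(int32_t) r : (int32_t) r;
     }  */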


#ifdef __XTENSA_EB__
#define uh a2
#define ul a3
#else
#define uh a3
#define ul a2
#endif /* __XTENSA_EB__ */


#ifdef L_ashldi3
	.align	4
	.global	__ashldi3
	.type	__ashldi3, @function
__ashldi3:
	leaf_entry sp, 16
	ssl	a4
	bgei	a4, 32, .Llow_only
	src	uh, uh, ul
	sll	ul, ul
	leaf_return

.Llow_only:
	sll	uh, ul
	movi	ul, 0
	leaf_return
	.size	__ashldi3, . - __ashldi3

#endif /* L_ashldi3 */
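
/* The two cases above are the standard double-word left shift.  As a
   rough C sketch for shift amounts 0..63 (assuming <stdint.h>;
   illustrative only):

     uint64_t ashl64 (uint32_t hi, uint32_t lo, uint32_t n)
     {
       if (n >= 32)
	 return (uint64_t) (lo << (n - 32)) << 32;
       if (n != 0)
	 hi = (hi << n) | (lo >> (32 - n));
       return ((uint64_t) hi << 32) | (lo << n);
     }

   The SRC instruction computes the (hi << n) | (lo >> (32 - n)) funnel
   shift in one step with the amount taken from SAR, so the assembly
   needs no special case for n == 0.  __ashrdi3 and __lshrdi3 below are
   the mirror images, filling with sign bits or zeros respectively.  */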


#ifdef L_ashrdi3
	.align	4
	.global	__ashrdi3
	.type	__ashrdi3, @function
__ashrdi3:
	leaf_entry sp, 16
	ssr	a4
	bgei	a4, 32, .Lhigh_only
	src	ul, uh, ul
	sra	uh, uh
	leaf_return

.Lhigh_only:
	sra	ul, uh
	srai	uh, uh, 31
	leaf_return
	.size	__ashrdi3, . - __ashrdi3

#endif /* L_ashrdi3 */


#ifdef L_lshrdi3
	.align	4
	.global	__lshrdi3
	.type	__lshrdi3, @function
__lshrdi3:
	leaf_entry sp, 16
	ssr	a4
	bgei	a4, 32, .Lhigh_only1
	src	ul, uh, ul
	srl	uh, uh
	leaf_return

.Lhigh_only1:
	srl	ul, uh
	movi	uh, 0
	leaf_return
	.size	__lshrdi3, . - __lshrdi3

#endif /* L_lshrdi3 */


#ifdef L_bswapsi2
	.align	4
	.global	__bswapsi2
	.type	__bswapsi2, @function
__bswapsi2:
	leaf_entry sp, 16
	ssai	8
	srli	a3, a2, 16
	src	a3, a3, a2
	src	a3, a3, a3
	src	a2, a2, a3
	leaf_return
	.size	__bswapsi2, . - __bswapsi2

#endif /* L_bswapsi2 */
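
/* With SAR set to 8, each SRC above extracts a byte-rotated window
   from the 64-bit concatenation of its two operands; three such steps
   reverse the byte order.  The net effect, as a rough C sketch
   (assuming <stdint.h>; illustrative only):

     uint32_t bswap32 (uint32_t x)
     {
       return ((x & 0x000000ff) << 24) | ((x & 0x0000ff00) << 8)
	      | ((x & 0x00ff0000) >> 8) | (x >> 24);
     }  */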


#ifdef L_bswapdi2
	.align	4
	.global	__bswapdi2
	.type	__bswapdi2, @function
__bswapdi2:
	leaf_entry sp, 16
	ssai	8
	srli	a4, a2, 16
	src	a4, a4, a2
	src	a4, a4, a4
	src	a4, a2, a4
	srli	a2, a3, 16
	src	a2, a2, a3
	src	a2, a2, a2
	src	a2, a3, a2
	mov	a3, a4
	leaf_return
	.size	__bswapdi2, . - __bswapdi2

#endif /* L_bswapdi2 */
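
/* __bswapdi2 byte-swaps each 32-bit half with the same SRC sequence
   and exchanges the halves; in C terms (with the hypothetical
   bswap32() above):

     uint64_t bswap64 (uint64_t x)
     {
       return ((uint64_t) bswap32 ((uint32_t) x) << 32)
	      | bswap32 ((uint32_t) (x >> 32));
     }  */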


#include "ieee754-df.S"
#include "ieee754-sf.S"