/* Assembly functions for the Xtensa version of libgcc1.
   Copyright (C) 2001,2002,2003 Free Software Foundation, Inc.
   Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

In addition to the permissions in the GNU General Public License, the
Free Software Foundation gives you unlimited permission to link the
compiled version of this file into combinations with other programs,
and to distribute those combinations without any restriction coming
from the use of this file.  (The General Public License restrictions
do apply in other respects; for example, they cover modification of
the file, and distribution when not linked into a combine
executable.)

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.  */

#include "xtensa-config.h"

# Note: These functions use a minimum stack frame size of 32.  This is
# necessary for Xtensa configurations that only support a fixed register
# window size of 8, where even leaf functions (such as these) need to
# allocate space for a 4-word "extra save area".

# Define macros for the ABS and ADDX* instructions to handle cases
# where they are not included in the Xtensa processor configuration.

	.macro	do_abs dst, src, tmp
#if XCHAL_HAVE_ABS
	abs	\dst, \src
#else
	neg	\tmp, \src
	movgez	\tmp, \src, \src
	mov	\dst, \tmp
#endif
	.endm
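# In the fallback above, \tmp is first set to -\src and then, when \src
# is non-negative, overwritten with \src by MOVGEZ, so \dst always ends
# up holding the absolute value of \src.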

	.macro	do_addx2 dst, as, at, tmp
#if XCHAL_HAVE_ADDX
	addx2	\dst, \as, \at
#else
	slli	\tmp, \as, 1
	add	\dst, \tmp, \at
#endif
	.endm

	.macro	do_addx4 dst, as, at, tmp
#if XCHAL_HAVE_ADDX
	addx4	\dst, \as, \at
#else
	slli	\tmp, \as, 2
	add	\dst, \tmp, \at
#endif
	.endm

	.macro	do_addx8 dst, as, at, tmp
#if XCHAL_HAVE_ADDX
	addx8	\dst, \as, \at
#else
	slli	\tmp, \as, 3
	add	\dst, \tmp, \at
#endif
	.endm
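# Each ADDX* fallback above computes dst = (as << N) + at with a separate
# shift and add, matching the one-instruction semantics of ADDX2 (N = 1),
# ADDX4 (N = 2), and ADDX8 (N = 3).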

# Define macros for function entry and return, supporting either the
# standard register windowed ABI or the non-windowed call0 ABI.  These
# macros do not allocate any extra stack space, so they only work for
# leaf functions that do not need to spill anything to the stack.

	.macro abi_entry reg, size
#if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__
	entry \reg, \size
#else
	/* do nothing */
#endif
	.endm

	.macro abi_return
#if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__
	retw
#else
	ret
#endif
	.endm
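# With the windowed ABI, ENTRY sets up the callee's register window and
# stack frame and RETW unwinds it on return; under the call0 ABI a leaf
# function needs no prologue at all and returns with a plain RET.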


#ifdef L_mulsi3
	.align	4
	.global	__mulsi3
	.type	__mulsi3,@function
__mulsi3:
	abi_entry sp, 32

#if XCHAL_HAVE_MUL16
	or	a4, a2, a3
	srai	a4, a4, 16
	bnez	a4, .LMUL16
	mul16u	a2, a2, a3
	abi_return
.LMUL16:
	srai	a4, a2, 16
	srai	a5, a3, 16
	mul16u	a7, a4, a3
	mul16u	a6, a5, a2
	mul16u	a4, a2, a3
	add	a7, a7, a6
	slli	a7, a7, 16
	add	a2, a7, a4
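	# Writing a2 = (ah << 16) + al and a3 = (bh << 16) + bl, the low
	# 32 bits of the product are ((ah*bl + al*bh) << 16) + al*bl; the
	# ah*bh partial product only affects bits 32 and up and is never
	# computed.  The shortcut above uses a single MUL16U when the OR
	# of the operands has no bits set above bit 15, i.e., both values
	# fit in 16 bits.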

#elif XCHAL_HAVE_MAC16
	mul.aa.hl a2, a3
	mula.aa.lh a2, a3
	rsr	a5, 16 # ACCLO
	umul.aa.ll a2, a3
	rsr	a4, 16 # ACCLO
	slli	a5, a5, 16
	add	a2, a4, a5
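	# The MAC16 path forms the same partial products with the
	# multiply-accumulate unit: the high*low and low*high products are
	# summed in the accumulator and read back through ACCLO (special
	# register 16), then shifted up by 16 and added to the unsigned
	# low*low product.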

#else /* !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MAC16 */

	# Multiply one bit at a time, but unroll the loop 4x to better
	# exploit the addx instructions and avoid overhead.
	# Peel the first iteration to save a cycle on init.
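	# The code below is shift-and-add multiplication on the absolute
	# values: a4 holds the shifted multiplicand, a3 the remaining
	# multiplier bits, and a2 the accumulated product.  Each unrolled
	# step extracts one multiplier bit with EXTUI and, if it is set,
	# adds the appropriately shifted multiplicand into the product
	# (via the do_addx macros).  The sign is fixed up at the end using
	# the saved XOR of the original operands.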

	# Avoid negative numbers.
	xor	a5, a2, a3  # top bit is 1 iff one of the inputs is negative
	do_abs	a3, a3, a6
	do_abs	a2, a2, a6

	# Swap so the second argument is smaller.
	sub	a7, a2, a3
	mov	a4, a3
	movgez	a4, a2, a7  # a4 = max(a2, a3)
	movltz	a3, a2, a7  # a3 = min(a2, a3)

	movi	a2, 0
	extui	a6, a3, 0, 1
	movnez	a2, a4, a6

	do_addx2 a7, a4, a2, a7
	extui	a6, a3, 1, 1
	movnez	a2, a7, a6

	do_addx4 a7, a4, a2, a7
	extui	a6, a3, 2, 1
	movnez	a2, a7, a6

	do_addx8 a7, a4, a2, a7
	extui	a6, a3, 3, 1
	movnez	a2, a7, a6

	bgeui	a3, 16, .Lmult_main_loop
	neg	a3, a2
	movltz	a2, a3, a5
	abi_return

	.align	4
.Lmult_main_loop:
	srli	a3, a3, 4
	slli	a4, a4, 4

	add	a7, a4, a2
	extui	a6, a3, 0, 1
	movnez	a2, a7, a6

	do_addx2 a7, a4, a2, a7
	extui	a6, a3, 1, 1
	movnez	a2, a7, a6

	do_addx4 a7, a4, a2, a7
	extui	a6, a3, 2, 1
	movnez	a2, a7, a6

	do_addx8 a7, a4, a2, a7
	extui	a6, a3, 3, 1
	movnez	a2, a7, a6

	bgeui	a3, 16, .Lmult_main_loop

	neg	a3, a2
	movltz	a2, a3, a5

#endif /* !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MAC16 */

	abi_return
	.size	__mulsi3,.-__mulsi3

#endif /* L_mulsi3 */


# Define a macro for the NSAU (unsigned normalize shift amount)
# instruction, which computes the number of leading zero bits,
# to handle cases where it is not included in the Xtensa processor
# configuration.

	.macro	do_nsau cnt, val, tmp, a
#if XCHAL_HAVE_NSA
	nsau	\cnt, \val
#else
	mov	\a, \val
	movi	\cnt, 0
	extui	\tmp, \a, 16, 16
	bnez	\tmp, 0f
	movi	\cnt, 16
	slli	\a, \a, 16
0:
	extui	\tmp, \a, 24, 8
	bnez	\tmp, 1f
	addi	\cnt, \cnt, 8
	slli	\a, \a, 8
1:
	movi	\tmp, __nsau_data
	extui	\a, \a, 24, 8
	add	\tmp, \tmp, \a
	l8ui	\tmp, \tmp, 0
	add	\cnt, \cnt, \tmp
#endif /* !XCHAL_HAVE_NSA */
	.endm
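# The fallback above locates the most significant set bit in two steps
# (16-bit half, then 8-bit half), shifting the value up and adding 16 or
# 8 to the count as needed, and then adds the leading-zero count of the
# remaining top byte found in the 256-entry __nsau_data table.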

#ifdef L_nsau
	.section .rodata
	.align	4
	.global	__nsau_data
	.type	__nsau_data,@object
__nsau_data:
#if !XCHAL_HAVE_NSA
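	# Entry i of this table is the number of leading zero bits in the
	# 8-bit value i: entry 0 is 8, entry 1 is 7, entries 2-3 are 6,
	# and so on down to 0 for entries 128-255.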
	.byte	8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
	.byte	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
	.byte	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
	.byte	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
#endif /* !XCHAL_HAVE_NSA */
	.size	__nsau_data,.-__nsau_data
	.hidden	__nsau_data
#endif /* L_nsau */


#ifdef L_udivsi3
	.align	4
	.global	__udivsi3
	.type	__udivsi3,@function
__udivsi3:
	abi_entry sp, 32
	bltui	a3, 2, .Lle_one	# check if the divisor <= 1

	mov	a6, a2		# keep dividend in a6
	do_nsau	a5, a6, a2, a7	# dividend_shift = nsau(dividend)
	do_nsau	a4, a3, a2, a7	# divisor_shift = nsau(divisor)
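	# If the dividend has no more significant bits than the divisor
	# (dividend_shift >= divisor_shift), the quotient can only be 0 or
	# 1, so that case is handled separately at .Lspecial.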
	bgeu	a5, a4, .Lspecial

	sub	a4, a4, a5	# count = divisor_shift - dividend_shift
	ssl	a4
	sll	a3, a3		# divisor <<= count
	movi	a2, 0		# quotient = 0

	# test-subtract-and-shift loop; one quotient bit on each iteration
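	# This is restoring division on the normalized operands: the
	# divisor has been shifted left so that its most significant bit
	# lines up with the dividend's.  Each iteration subtracts the
	# shifted divisor from the remainder when it fits, records that as
	# the next quotient bit, and then shifts the quotient left and the
	# divisor right by one.  A final compare/add after the loop
	# accounts for the last bit.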
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lloopend
#endif /* XCHAL_HAVE_LOOPS */
.Lloop:
	bltu	a6, a3, .Lzerobit
	sub	a6, a6, a3
	addi	a2, a2, 1
.Lzerobit:
	slli	a2, a2, 1
	srli	a3, a3, 1
#if !XCHAL_HAVE_LOOPS
	addi	a4, a4, -1
	bnez	a4, .Lloop
#endif /* !XCHAL_HAVE_LOOPS */
.Lloopend:

	bltu	a6, a3, .Lreturn
	addi	a2, a2, 1	# increment quotient if dividend >= divisor
.Lreturn:
	abi_return

.Lspecial:
	# return dividend >= divisor
	movi	a2, 0
	bltu	a6, a3, .Lreturn2
	movi	a2, 1
.Lreturn2:
	abi_return

.Lle_one:
	beqz	a3, .Lerror	# if divisor == 1, return the dividend
	abi_return
.Lerror:
	movi	a2, 0		# just return 0; could throw an exception
	abi_return
	.size	__udivsi3,.-__udivsi3

#endif /* L_udivsi3 */


#ifdef L_divsi3
	.align	4
	.global	__divsi3
	.type	__divsi3,@function
__divsi3:
	abi_entry sp, 32
	xor	a7, a2, a3	# sign = dividend ^ divisor
	do_abs	a6, a2, a4	# udividend = abs(dividend)
	do_abs	a3, a3, a4	# udivisor = abs(divisor)
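	# The result must be negated exactly when the operands' signs
	# differ, so only the sign bit of dividend ^ divisor (saved in a7)
	# is needed for the final fix-up.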
	bltui	a3, 2, .Lle_one	# check if udivisor <= 1
	do_nsau	a5, a6, a2, a8	# udividend_shift = nsau(udividend)
	do_nsau	a4, a3, a2, a8	# udivisor_shift = nsau(udivisor)
	bgeu	a5, a4, .Lspecial

	sub	a4, a4, a5	# count = udivisor_shift - udividend_shift
	ssl	a4
	sll	a3, a3		# udivisor <<= count
	movi	a2, 0		# quotient = 0

	# test-subtract-and-shift loop; one quotient bit on each iteration
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lloopend
#endif /* XCHAL_HAVE_LOOPS */
.Lloop:
	bltu	a6, a3, .Lzerobit
	sub	a6, a6, a3
	addi	a2, a2, 1
.Lzerobit:
	slli	a2, a2, 1
	srli	a3, a3, 1
#if !XCHAL_HAVE_LOOPS
	addi	a4, a4, -1
	bnez	a4, .Lloop
#endif /* !XCHAL_HAVE_LOOPS */
.Lloopend:

	bltu	a6, a3, .Lreturn
	addi	a2, a2, 1	# increment quotient if udividend >= udivisor
.Lreturn:
	neg	a5, a2
	movltz	a2, a5, a7	# return (sign < 0) ? -quotient : quotient
	abi_return

.Lspecial:
	movi	a2, 0
	bltu	a6, a3, .Lreturn2 # if dividend < divisor, return 0
	movi	a2, 1
	movi	a4, -1
	movltz	a2, a4, a7	# else return (sign < 0) ? -1 : 1
.Lreturn2:
	abi_return

.Lle_one:
	beqz	a3, .Lerror
	neg	a2, a6		# if udivisor == 1, then return...
	movgez	a2, a6, a7	# (sign < 0) ? -udividend : udividend
	abi_return
.Lerror:
	movi	a2, 0		# just return 0; could throw an exception
	abi_return
	.size	__divsi3,.-__divsi3

#endif /* L_divsi3 */


#ifdef L_umodsi3
	.align	4
	.global	__umodsi3
	.type	__umodsi3,@function
__umodsi3:
	abi_entry sp, 32
	bltui	a3, 2, .Lle_one	# check if the divisor is <= 1

	do_nsau	a5, a2, a6, a7	# dividend_shift = nsau(dividend)
	do_nsau	a4, a3, a6, a7	# divisor_shift = nsau(divisor)
	bgeu	a5, a4, .Lspecial

	sub	a4, a4, a5	# count = divisor_shift - dividend_shift
	ssl	a4
	sll	a3, a3		# divisor <<= count

	# test-subtract-and-shift loop
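	# Same normalize and test-subtract-and-shift sequence as in
	# __udivsi3, except that only the remainder (kept in a2) matters,
	# so no quotient bits are accumulated.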
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lloopend
#endif /* XCHAL_HAVE_LOOPS */
.Lloop:
	bltu	a2, a3, .Lzerobit
	sub	a2, a2, a3
.Lzerobit:
	srli	a3, a3, 1
#if !XCHAL_HAVE_LOOPS
	addi	a4, a4, -1
	bnez	a4, .Lloop
#endif /* !XCHAL_HAVE_LOOPS */
.Lloopend:

	bltu	a2, a3, .Lreturn
	sub	a2, a2, a3	# subtract once more if dividend >= divisor
.Lreturn:
	abi_return

.Lspecial:
	bltu	a2, a3, .Lreturn2
	sub	a2, a2, a3	# subtract once if dividend >= divisor
.Lreturn2:
	abi_return

.Lle_one:
	# the divisor is either 0 or 1, so just return 0.
	# someday we may want to throw an exception if the divisor is 0.
	movi	a2, 0
	abi_return
	.size	__umodsi3,.-__umodsi3

#endif /* L_umodsi3 */


#ifdef L_modsi3
	.align	4
	.global	__modsi3
	.type	__modsi3,@function
__modsi3:
	abi_entry sp, 32
	mov	a7, a2		# save original (signed) dividend
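	# Only the dividend's sign is saved: as with C's % operator, the
	# remainder takes the sign of the dividend, so the divisor's sign
	# never affects the result.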
	do_abs	a2, a2, a4	# udividend = abs(dividend)
	do_abs	a3, a3, a4	# udivisor = abs(divisor)
	bltui	a3, 2, .Lle_one	# check if udivisor <= 1
	do_nsau	a5, a2, a6, a8	# udividend_shift = nsau(udividend)
	do_nsau	a4, a3, a6, a8	# udivisor_shift = nsau(udivisor)
	bgeu	a5, a4, .Lspecial

	sub	a4, a4, a5	# count = udivisor_shift - udividend_shift
	ssl	a4
	sll	a3, a3		# udivisor <<= count

	# test-subtract-and-shift loop
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lloopend
#endif /* XCHAL_HAVE_LOOPS */
.Lloop:
	bltu	a2, a3, .Lzerobit
	sub	a2, a2, a3
.Lzerobit:
	srli	a3, a3, 1
#if !XCHAL_HAVE_LOOPS
	addi	a4, a4, -1
	bnez	a4, .Lloop
#endif /* !XCHAL_HAVE_LOOPS */
.Lloopend:

	bltu	a2, a3, .Lreturn
	sub	a2, a2, a3	# subtract once more if udividend >= udivisor
.Lreturn:
	bgez	a7, .Lpositive
	neg	a2, a2		# if (dividend < 0), negate the remainder
.Lpositive:
	abi_return

.Lspecial:
	bltu	a2, a3, .Lreturn2
	sub	a2, a2, a3	# subtract once if dividend >= divisor
.Lreturn2:
	bgez	a7, .Lpositive2
	neg	a2, a2		# if (dividend < 0), negate the remainder
.Lpositive2:
	abi_return

.Lle_one:
	# udivisor is either 0 or 1, so just return 0.
	# someday we may want to throw an exception if udivisor is 0.
	movi	a2, 0
	abi_return
	.size	__modsi3,.-__modsi3

#endif /* L_modsi3 */
