/* Copyright (C) 2010-2020 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>.

This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

	;; ABI considerations for the divide functions
	;; The following registers are call-used:
	;; __c6xabi_divi A0,A1,A2,A4,A6,B0,B1,B2,B4,B5
	;; __c6xabi_divu A0,A1,A2,A4,A6,B0,B1,B2,B4
	;; __c6xabi_remi A1,A2,A4,A5,A6,B0,B1,B2,B4
	;; __c6xabi_remu A1,A4,A5,A7,B0,B1,B2,B4
	;;
	;; In our implementation, divu and remu are leaf functions,
	;; while both divi and remi call into divu.
	;; A0 is not clobbered by any of the functions.
	;; divu does not clobber B2 either, which is taken advantage of
	;; in remi.
	;; divi uses B5 to hold the original return address during
	;; the call to divu.
	;; remi uses B2 and A5 to hold the input values during the
	;; call to divu.  It stores B3 on the stack.

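	;; The signed routines reduce to the unsigned divide: take absolute
	;; values, divide, then fix up the signs.  A C sketch of the scheme
	;; (hypothetical helper names; __c6xabi_divu is the real workhorse):
	;;
	;;   uint32_t udiv32(uint32_t n, uint32_t d);   /* == __c6xabi_divu */
	;;
	;;   int32_t divi(int32_t a, int32_t b)
	;;   {
	;;       /* 0u - x negates without signed-overflow UB, matching the
	;;          two's complement NEG instructions used below.  */
	;;       uint32_t ua = a < 0 ? 0u - (uint32_t)a : (uint32_t)a;
	;;       uint32_t ub = b < 0 ? 0u - (uint32_t)b : (uint32_t)b;
	;;       uint32_t q = udiv32(ua, ub);
	;;       return (a < 0) != (b < 0) ? (int32_t)(0u - q) : (int32_t)q;
	;;   }
	;;
	;;   int32_t remi(int32_t a, int32_t b)
	;;   {
	;;       return a - divi(a, b) * b;  /* remainder takes the sign of a */
	;;   }
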
#ifdef L_divsi3
.text
.align 2
.global __c6xabi_divi
.hidden __c6xabi_divi
.type __c6xabi_divi, STT_FUNC

__c6xabi_divi:
	call .s2	__c6xabi_divu
||	mv .d2		B3, B5
||	cmpgt .l1	0, A4, A1
||	cmpgt .l2	0, B4, B1

	[A1] neg .l1	A4, A4
||	[B1] neg .l2	B4, B4
||	xor .s1x	A1, B1, A1

#ifdef _TMS320C6400
	[A1] addkpc .s2	1f, B3, 4
#else
	[A1] mvkl .s2	1f, B3
	[A1] mvkh .s2	1f, B3
	nop		2
#endif
1:
	neg .l1		A4, A4
||	mv .l2		B3,B5
||	ret .s2		B5
	nop		5
#endif

#if defined L_modsi3 || defined L_divmodsi4
.align 2
#ifdef L_modsi3
#define MOD_OUTPUT_REG A4
.global __c6xabi_remi
.hidden __c6xabi_remi
.type __c6xabi_remi, STT_FUNC
#else
#define MOD_OUTPUT_REG A5
.global __c6xabi_divremi
.hidden __c6xabi_divremi
.type __c6xabi_divremi, STT_FUNC
__c6xabi_divremi:
#endif

__c6xabi_remi:
	stw .d2t2	B3, *B15--[2]
||	cmpgt .l1	0, A4, A1
||	cmpgt .l2	0, B4, B2
||	mv .s1		A4, A5
||	call .s2	__c6xabi_divu

	[A1] neg .l1	A4, A4
||	[B2] neg .l2	B4, B4
||	xor .s2x	B2, A1, B0
||	mv .d2		B4, B2

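	;; At this point A5 holds the original (signed) dividend and B2 the
	;; original (signed) divisor: both mv instructions read their sources
	;; before the parallel neg writes land.  B0 is set when the signs
	;; differ, selecting the return path below that negates the quotient
	;; divu leaves in A4; the remainder A5 - quotient * B2 then takes the
	;; sign of the dividend automatically.
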
#ifdef _TMS320C6400
	[B0] addkpc .s2	1f, B3, 1
	[!B0] addkpc .s2 2f, B3, 1
	nop		2
#else
	[B0] mvkl .s2	1f,B3
	[!B0] mvkl .s2	2f,B3

	[B0] mvkh .s2	1f,B3
	[!B0] mvkh .s2	2f,B3
#endif
1:
	neg .l1		A4, A4
2:
	ldw .d2t2	*++B15[2], B3

#ifdef _TMS320C6400_PLUS
	mpy32 .m1x	A4, B2, A6
	nop		3
	ret .s2		B3
	sub .l1		A5, A6, MOD_OUTPUT_REG
	nop		4
#else
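	;; Without MPY32, build the low 32 bits of quotient * divisor from
	;; 16-bit multiplies:
	;;   lo32(a * b) = lo16(a) * lo16(b)
	;;               + ((lo16(a) * hi16(b) + lo16(b) * hi16(a)) << 16)
	;; mpyu forms the low product; the two mpylhu instructions form the
	;; cross products, which are summed and shifted into place.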
	mpyu .m1x	A4, B2, A1
	nop		1
	mpylhu .m1x	A4, B2, A6
||	mpylhu .m2x	B2, A4, B2
	nop		1
	add .l1x	A6, B2, A6
||	ret .s2		B3
	shl .s1		A6, 16, A6
	add .d1		A6, A1, A6
	sub .l1		A5, A6, MOD_OUTPUT_REG
	nop		2
#endif

#endif

#if defined L_udivsi3 || defined L_udivmodsi4
.align 2
#ifdef L_udivsi3
.global __c6xabi_divu
.hidden __c6xabi_divu
.type __c6xabi_divu, STT_FUNC
__c6xabi_divu:
#else
.global __c6xabi_divremu
.hidden __c6xabi_divremu
.type __c6xabi_divremu, STT_FUNC
__c6xabi_divremu:
#endif
	;; We use a series of up to 31 subc instructions.  First, we find
	;; out how many leading zero bits there are in the divisor.  This
	;; gives us both a shift count for aligning (shifting) the divisor
	;; so that its leading one reaches the top of the register, and the
	;; number of times we have to execute subc.

	;; At the end, we have both the remainder and most of the quotient
	;; in A4.  The top bit of the quotient is computed first and is
	;; placed in A2.
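
	;; For reference, a C model of the algorithm (hedged sketch; subc()
	;; mirrors the C6x SUBC conditional-subtract-and-shift instruction,
	;; udivmod32() is a hypothetical name for what divu/divremu compute):
	;;
	;;   static uint32_t subc(uint32_t a, uint32_t b)
	;;   {
	;;       return a >= b ? ((a - b) << 1) | 1 : a << 1;
	;;   }
	;;
	;;   uint32_t udivmod32(uint32_t n, uint32_t d, uint32_t *rem)
	;;   {
	;;       if (n == 0) { *rem = 0; return 0; }
	;;       int k = __builtin_clz(d);       /* lmbd 1, d  (d != 0) */
	;;       uint32_t b = d << k;            /* leading one at bit 31 */
	;;       uint32_t top = n >= b;          /* top quotient bit (A2) */
	;;       uint32_t a = top ? n - b : n;
	;;       b >>= 1;
	;;       for (int i = 0; i < k; i++)     /* up to 31 subc steps */
	;;           a = subc(a, b);
	;;       *rem = a >> k;                  /* high bits: remainder */
	;;       return (top << k) | (a & ((1u << k) - 1));
	;;   }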

	;; Return immediately if the dividend is zero.  Setting B4 to 1
	;; is a trick to allow us to leave the following insns in the jump
	;; delay slot without affecting the result.
	mv	.s2x	A4, B1

#ifndef _TMS320C6400
[!b1]	mvk	.s2	1, B4
#endif
[b1]	lmbd	.l2	1, B4, B1
||[!b1] b	.s2	B3	; RETURN A
#ifdef _TMS320C6400
||[!b1] mvk	.d2	1, B4
#endif
#ifdef L_udivmodsi4
||[!b1] zero	.s1	A5
#endif
	mv	.l1x	B1, A6
||	shl	.s2	B4, B1, B4

	;; The loop performs a maximum of 28 steps, so we do the
	;; first 3 here.
	cmpltu	.l1x	A4, B4, A2
[!A2]	sub	.l1x	A4, B4, A4
||	shru	.s2	B4, 1, B4
||	xor	.s1	1, A2, A2

	shl	.s1	A2, 31, A2
|| [b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
[b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1

	;; RETURN A may happen here (note: must happen before the next branch)
0:
	cmpgt	.l2	B1, 7, B0
|| [b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
[b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
|| [b0] b	.s1	0b
[b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
[b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
[b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
[b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
[b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
	;; loop backwards branch happens here

	ret	.s2	B3
||	mvk	.s1	32, A1
	sub	.l1	A1, A6, A6
#ifdef L_udivmodsi4
||	extu	.s1	A4, A6, A5
#endif
	shl	.s1	A4, A6, A4
	shru	.s1	A4, 1, A4
||	sub	.l1	A6, 1, A6
	or	.l1	A2, A4, A4
	shru	.s1	A4, A6, A4
	nop

#endif

#ifdef L_umodsi3
.align 2
.global __c6xabi_remu
.hidden __c6xabi_remu
.type __c6xabi_remu, STT_FUNC
__c6xabi_remu:
	;; The ABI seems designed to prevent these functions calling each other,
	;; so we duplicate most of the divu code here.
	mv	.s2x	A4, B1
#ifndef _TMS320C6400
[!b1]	mvk	.s2	1, B4
#endif
	lmbd	.l2	1, B4, B1
||[!b1] b	.s2	B3	; RETURN A
#ifdef _TMS320C6400
||[!b1] mvk	.d2	1, B4
#endif

	mv	.l1x	B1, A7
||	shl	.s2	B4, B1, B4

	cmpltu	.l1x	A4, B4, A1
[!a1]	sub	.l1x	A4, B4, A4
	shru	.s2	B4, 1, B4

0:
	cmpgt	.l2	B1, 7, B0
|| [b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
	;; RETURN A may happen here (note: must happen before the next branch)
[b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
|| [b0] b	.s1	0b
[b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
[b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
[b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
[b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
[b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
	;; loop backwards branch happens here

	ret	.s2	B3
[b1]	subc	.l1x	A4,B4,A4
|| [b1]	add	.s2	-1, B1, B1
[b1]	subc	.l1x	A4,B4,A4

	extu	.s1	A4, A7, A4
	nop	2
#endif

#if defined L_strasgi_64plus && defined _TMS320C6400_PLUS

.align 2
.global __c6xabi_strasgi_64plus
.hidden __c6xabi_strasgi_64plus
.type __c6xabi_strasgi_64plus, STT_FUNC
__c6xabi_strasgi_64plus:
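	;; Word-for-word copy of A6 bytes (a multiple of 4) from *B4 to *A4,
	;; using the C64x+ SPLOOP hardware loop buffer with a single-cycle
	;; kernel.  ILC is loaded with the word count minus 4, as the SPLOOPD
	;; form requires to account for the iterations it issues implicitly.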
	shru	.s2x	a6, 2, b31
||	mv	.s1	a4, a30
||	mv	.d2	b4, b30

	add	.s2	-4, b31, b31

	sploopd		1
||	mvc	.s2	b31, ilc
	ldw	.d2t2	*b30++, b31
	nop	4
	mv	.s1x	b31,a31
	spkernel	6, 0
||	stw	.d1t1	a31, *a30++

	ret	.s2	b3
	nop 5
#endif

#ifdef L_strasgi
.global __c6xabi_strasgi
.type __c6xabi_strasgi, STT_FUNC
__c6xabi_strasgi:
	;; This is essentially memcpy, with alignment known to be at least
	;; 4, and the size a multiple of 4 greater than or equal to 28.
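	;; Functionally (hedged C sketch; c6x_strasgi is a hypothetical name
	;; for this routine's contract):
	;;
	;;   void c6x_strasgi(uint32_t *dst, const uint32_t *src, uint32_t len)
	;;   {
	;;       /* len % 4 == 0 and len >= 28; regions do not overlap.  */
	;;       for (uint32_t i = 0; i < len / 4; i++)
	;;           dst[i] = src[i];
	;;   }
	;;
	;; The code below keeps six words in flight (A0, A1, A5, A7, A8, A9)
	;; so that loads, stores and the loop branch overlap.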
	ldw	.d2t1	*B4++, A0
||	mvk	.s2	16, B1
	ldw	.d2t1	*B4++, A1
||	mvk	.s2	20, B2
||	sub	.d1	A6, 24, A6
	ldw	.d2t1	*B4++, A5
	ldw	.d2t1	*B4++, A7
||	mv	.l2x	A6, B7
	ldw	.d2t1	*B4++, A8
	ldw	.d2t1	*B4++, A9
||	mv	.s2x	A0, B5
||	cmpltu	.l2	B2, B7, B0

0:
	stw	.d1t2	B5, *A4++
||[b0]	ldw	.d2t1	*B4++, A0
||	mv	.s2x	A1, B5
||	mv	.l2	B7, B6

[b0]	sub	.d2	B6, 24, B7
||[b0]	b	.s2	0b
||	cmpltu	.l2	B1, B6, B0

[b0]	ldw	.d2t1	*B4++, A1
||	stw	.d1t2	B5, *A4++
||	mv	.s2x	A5, B5
||	cmpltu	.l2	12, B6, B0

[b0]	ldw	.d2t1	*B4++, A5
||	stw	.d1t2	B5, *A4++
||	mv	.s2x	A7, B5
||	cmpltu	.l2	8, B6, B0

[b0]	ldw	.d2t1	*B4++, A7
||	stw	.d1t2	B5, *A4++
||	mv	.s2x	A8, B5
||	cmpltu	.l2	4, B6, B0

[b0]	ldw	.d2t1	*B4++, A8
||	stw	.d1t2	B5, *A4++
||	mv	.s2x	A9, B5
||	cmpltu	.l2	0, B6, B0

[b0]	ldw	.d2t1	*B4++, A9
||	stw	.d1t2	B5, *A4++
||	mv	.s2x	A0, B5
||	cmpltu	.l2	B2, B7, B0

	;; loop back branch happens here

	cmpltu	.l2	B1, B6, B0
||	ret	.s2	b3

[b0]	stw	.d1t1	A1, *A4++
||	cmpltu	.l2	12, B6, B0
[b0]	stw	.d1t1	A5, *A4++
||	cmpltu	.l2	8, B6, B0
[b0]	stw	.d1t1	A7, *A4++
||	cmpltu	.l2	4, B6, B0
[b0]	stw	.d1t1	A8, *A4++
||	cmpltu	.l2	0, B6, B0
[b0]	stw	.d1t1	A9, *A4++

	;; return happens here

#endif

#ifdef _TMS320C6400_PLUS
#ifdef L_push_rts
.align 2
.global __c6xabi_push_rts
.hidden __c6xabi_push_rts
.type __c6xabi_push_rts, STT_FUNC
__c6xabi_push_rts:
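	;; Save the callee-saved registers B14, A15:A14, B13:B12, A13:A12,
	;; B11:B10, A11:A10 and B3:B2 below B15, then return through A3.
	;; Compilers call this helper to shrink function prologues; the
	;; branch is issued early so the stores fill its delay slots.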
	stw .d2t2	B14, *B15--[2]
	stdw .d2t1	A15:A14, *B15--
||	b .s2x		A3
	stdw .d2t2	B13:B12, *B15--
	stdw .d2t1	A13:A12, *B15--
	stdw .d2t2	B11:B10, *B15--
	stdw .d2t1	A11:A10, *B15--
	stdw .d2t2	B3:B2, *B15--
#endif

#ifdef L_pop_rts
.align 2
.global __c6xabi_pop_rts
.hidden __c6xabi_pop_rts
.type __c6xabi_pop_rts, STT_FUNC
__c6xabi_pop_rts:
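	;; Undo __c6xabi_push_rts: reload the saved registers in reverse
	;; order and return through the restored B3, so a function epilogue
	;; can branch here instead of restoring the registers itself.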
	lddw .d2t2	*++B15, B3:B2
	lddw .d2t1	*++B15, A11:A10
	lddw .d2t2	*++B15, B11:B10
	lddw .d2t1	*++B15, A13:A12
	lddw .d2t2	*++B15, B13:B12
	lddw .d2t1	*++B15, A15:A14
||	b .s2		B3
	ldw .d2t2	*++B15[2], B14
	nop		4
#endif

#ifdef L_call_stub
.align 2
.global __c6xabi_call_stub
.type __c6xabi_call_stub, STT_FUNC
__c6xabi_call_stub:
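	;; Call the function whose address is in B31 while preserving the
	;; registers A0-A2, A6:A7 and B0-B7 around the call: they are saved
	;; below B15, addkpc points B3 at the restore sequence at 1f, and
	;; the final branch returns through the caller's B3, which is
	;; restored from the stack.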
	stw .d2t1	A2, *B15--[2]
	stdw .d2t1	A7:A6, *B15--
||	call .s2	B31
	stdw .d2t1	A1:A0, *B15--
	stdw .d2t2	B7:B6, *B15--
	stdw .d2t2	B5:B4, *B15--
	stdw .d2t2	B1:B0, *B15--
	stdw .d2t2	B3:B2, *B15--
||	addkpc .s2	1f, B3, 0
1:
	lddw .d2t2	*++B15, B3:B2
	lddw .d2t2	*++B15, B1:B0
	lddw .d2t2	*++B15, B5:B4
	lddw .d2t2	*++B15, B7:B6
	lddw .d2t1	*++B15, A1:A0
	lddw .d2t1	*++B15, A7:A6
||	b .s2		B3
	ldw .d2t1	*++B15[2], A2
	nop		4
#endif

#endif