xref: /openbsd/gnu/gcc/gcc/config/c4x/libgcc.S (revision 404b540a)
1/* libgcc routines for the Texas Instruments TMS320C[34]x
2   Copyright (C) 1997,98, 1999 Free Software Foundation, Inc.
3
4 Contributed by Michael Hayes (m.hayes@elec.canterbury.ac.nz)
5            and Herman Ten Brugge (Haj.Ten.Brugge@net.HCC.nl).
6
7
8This file is part of GCC.
9
10GCC is free software; you can redistribute it and/or modify it
11under the terms of the GNU General Public License as published by the
12Free Software Foundation; either version 2, or (at your option) any
13later version.
14
15In addition to the permissions in the GNU General Public License, the
16Free Software Foundation gives you unlimited permission to link the
17compiled version of this file into combinations with other programs,
18and to distribute those combinations without any restriction coming
19from the use of this file.  (The General Public License restrictions
20do apply in other respects; for example, they cover modification of
21the file, and distribution when not linked into a combine
22executable.)
23
24This file is distributed in the hope that it will be useful, but
25WITHOUT ANY WARRANTY; without even the implied warranty of
26MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
27General Public License for more details.
28
29You should have received a copy of the GNU General Public License
30along with this program; see the file COPYING.  If not, write to
31the Free Software Foundation, 51 Franklin Street, Fifth Floor,
32Boston, MA 02110-1301, USA.  */
33
; These routines are called using the standard TI register argument
; passing model.
; The following registers do not have to be saved:
; r0, r1, r2, r3, ar0, ar1, ar2, ir0, ir1, bk, rs, rc, re, (r9, r10, r11)
;
; Perform floating point divqf3
;
; This routine performs a reciprocal of the divisor using the method
; described in the C30/C40 user manuals.  It then multiplies that
; result by the dividend.
;
; Let r be the reciprocal of the divisor v and let the ith estimate
; of r be denoted by r[i].  An iterative approach can be used to
; improve the estimate of r, given an initial estimate r[0], where
;
; r[i + 1] = r[i] * (2.0 - v * r[i])
;
; The normalized error e[i] at the ith iteration is
;
; e[i] = (r - r[i]) / r = (1 / v - r[i]) * v = (1 - v * r[i])
;
; Note that
;
; e[i + 1]  = (1 - v * r[i + 1]) = 1 - 2 * v * r[i] + v^2 * (r[i])^2
;           = (1 - v * r[i])^2 = (e[i])^2
;
; i.e. the normalized error is squared by every iteration.

; r2 dividend, r3 divisor, r0 quotient
; clobbers r1, ar1 (and ar0 when .REGPARM == 0)
#ifdef L_divsf3
	.text
	.global ___divqf3
___divqf3:

#ifdef _TMS320C4x
	.if .REGPARM == 0
	; Stack arguments: fetch the divisor now.  The dividend is read
	; straight from *-ar0(1) in the final multiply below.
	lda	sp,ar0
	ldf	*-ar0(2), r3
	.endif

	pop	ar1		; Pop return address

; r0 = estimate of r, r1 = tmp, r2 = dividend, r3 = divisor
	rcpf	r3, r0		; Compute initial estimate r[0]

	mpyf3	r0, r3, r1	; r1 = r[0] * v
	subrf	2.0, r1		; r1 = 2.0 - r[0] * v
	mpyf	r1, r0		; r0 = r[0] * (2.0 - r[0] * v) = r[1]
; End of 1st iteration (16 bits accuracy)

	mpyf3	r0, r3, r1	; r1 = r[1] * v
	subrf	2.0, r1		; r1 = 2.0 - r[1] * v

	bud	ar1		; Delayed branch; next three instructions still run
	mpyf	r1, r0		; r0 = r[1] * (2.0 - r[1] * v) = r[2]
; End of 2nd iteration (32 bits accuracy)
	.if .REGPARM == 0
	mpyf	*-ar0(1), r0	; Multiply by the dividend
	.else
	mpyf	r2, r0		; Multiply by the dividend
	.endif
	rnd	r0
	; Branch occurs here
#else
	.if .REGPARM == 0
	; Stack arguments: fetch the divisor now, the dividend later.
	ldiu	sp,ar0
	ldf	*-ar0(2), r3
	.endif

	pop	ar1		; Pop return address

; Initial estimate       r[0] = 1.0 * 2^(-e - 1)
; where                  v = m * 2^e

; r0 = estimate of r, r1 = tmp, r2 = dividend, r3 = divisor

; Calculate initial estimate r[0]
	pushf	r3
	pop	r0		; r0 = bit pattern of the divisor
	not	r0		; r0 = -e
				; complement exponent = -e -1
				; complement sign (side effect)
				; complement mantissa (almost 3 bit accurate)
	push	r0
	popf	r0		; r0 = 1.0 * 2^(-e - 1) + inverted mantissa
	ldf	-1.0, r1	; undo complement sign bit
	xor	r1, r0

	mpyf3	r0, r3, r1	; r1 = r[0] * v
	subrf	2.0, r1		; r1 = 2.0 - r[0] * v
	mpyf	r1, r0		; r0 = r[0] * (2.0 - r[0] * v) = r[1]
; End of 1st iteration

	mpyf3	r0, r3, r1	; r1 = r[1] * v
	subrf	2.0, r1		; r1 = 2.0 - r[1] * v
	mpyf	r1, r0		; r0 = r[1] * (2.0 - r[1] * v) = r[2]
; End of 2nd iteration

	mpyf3	r0, r3, r1	; r1 = r[2] * v
	subrf	2.0, r1		; r1 = 2.0 - r[2] * v
	mpyf	r1, r0		; r0 = r[2] * (2.0 - r[2] * v) = r[3]
; End of 3rd iteration

	rnd	r0		; Minimize error in r[3]'s LSBs

; Use modified last iteration
; r[4] = (r[3] * (1.0 - (v * r[3]))) + r[3]
	mpyf3	r0, r3, r1	; r1 = r[3] * v
	subrf	1.0, r1		; r1 = 1.0 - r[3] * v
	mpyf	r0, r1		; r1 = r[3] * (1.0 - r[3] * v)
	addf	r1, r0		; r0 = r[3] * (1.0 - r[3] * v) + r[3] = r[4]

	rnd	r0		; Minimize error in r[4]'s LSBs

	bud	ar1		; Delayed branch; next three instructions still run

	.if .REGPARM == 0
	ldfu	*-ar0(1), r2	; Dividend in mem has only 24 bits significance
	.else
	rnd	r2		; Minimize error in reg dividend's LSBs
				; since this may have 32 bit significance
	.endif

	mpyf	r2, r0		; Multiply by the dividend
	rnd	r0		; Round result to 32 bits

	; Branch occurs here
#endif

#endif
;
; Integer signed division
;
; ar2 dividend, r2 divisor, r0 quotient
; clobbers r1, r3, ar0, ar1, ir0, ir1, rc, rs, re
;
; The quotient is computed from the absolute values of both operands
; and negated at the end when the operand signs differ.  A divisor
; larger than the dividend returns 0.  When taking the absolute value
; of the dividend overflows, the work is handed to udivqi3n.
;
#ifdef L_divsi3
	.text
	.global ___divqi3
	.ref	udivqi3n
___divqi3:
	.if .REGPARM == 0
	; Stack arguments: load dividend and divisor.
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	ldi	*-ar0(2), r2
	.endif

	xor3	ar2, r2, r3	; Get the sign
	absi	ar2, r0
	bvd	divq32		; Delayed branch if the absi above overflowed
	ldi	r0, ar2		; (delay slot) ar2 = abs(dividend)
	absi	r2, r2		; (delay slot) r2 = abs(divisor)
	cmpi	ar2, r2		; (delay slot) Divisor > dividend?

	pop	ir1		; Return address (not yet popped on the divq32 path)
	bhid	zero		; If so, return 0
				; The next three instructions are its delay
				; slots; the pushf/pop pair leaves sp balanced.

;
; Normalize operands.  Use difference exponents as shift count
; for divisor, and as repeat count for "subc"
;
	float	ar2, r1		; Normalize dividend
	pushf	r1		; Get as integer
	pop	ar0
	lsh	-24, ar0	; Get exponent

	float	r2, r1		; Normalize divisor
	pushf	r1		; Get as integer
	pop	ir0
	lsh	-24, ir0	; Get exponent

	subi	ir0, ar0	; Get difference of exponents
	lsh	ar0, r2		; Align divisor with dividend

;
; Do count + 1 subtracts and shifts
;
	rpts	ar0
		subc	r2, ar2

;
; Mask off the lower count+1 bits of ar2
;
	subri	31, ar0		; Shift count is (32 - (ar0 + 1))
	lsh	ar0, ar2	; Shift left
	negi	ar0, ar0
	lsh3	ar0, ar2, r0	; Shift right and put result in r0

;
; Check sign and negate result if necessary
;
	bud	ir1		; Delayed return
	negi	r0, r1		; Negate result
	ash	-31, r3		; Check sign
	ldinz	r1, r0		; If set, use negative result
	; Branch occurs here

zero:	bud	ir1		; Delayed branch
	ldi	0, r0
	nop
	nop
	; Branch occurs here
;
; special case where ar2 = abs(ar2) = 0x80000000.  We handle this by
; calling unsigned divide and negating the result if necessary.
; The return address of ___divqi3 is still on the stack here, below
; the saved sign word.
;
divq32:
	push	r3		; Save sign
	call	udivqi3n
	pop	r3
	pop	ir1
	bd	ir1
	negi	r0, r1		; Negate result
	ash	-31, r3		; Check sign
	ldinz	r1, r0		; If set, use negative result
	; Branch occurs here
#endif
;
; Integer unsigned division
;
; ar2 dividend, r2 divisor, r0 quotient,
; clobbers r1, r3, ar0, ar1, ir0, ir1, rc, rs, re
;
; udivqi3n is the register-argument entry point used by the other
; routines in this file.  A zero divisor returns 0.
;
#ifdef L_udivsi3
	.text
	.global ___udivqi3
	.global udivqi3n
___udivqi3:
	.if .REGPARM == 0
	; Stack arguments: load dividend and divisor.
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	ldi	*-ar0(2), r2
	.endif

udivqi3n:
	pop	ir1		; Return address

	cmpi	ar2, r2		; If divisor > dividend
	bhi	qzero		; return zero
	ldi	r2, ar1		; Store divisor in ar1

	tstb	ar2, ar2	; Check top bit, jump if set to special handler
	bld	div_32		; Delayed branch

;
; Get divisor exponent (these three instructions are the delay slots
; of the branch above, so rc is valid on entry to div_32 as well)
;
	float	ar1, r1		; Normalize the divisor
	pushf	r1		; Get into int register
	pop	rc
	; branch occurs here

	bzd	qzero		; if (float) divisor zero, return zero
				; Delay slots below keep sp balanced.

	float	ar2, r1		; Normalize the dividend
	pushf	r1		; Get into int register
	pop	ar0
	lsh	-24, ar0	; Get both the exponents
	lsh	-24, rc

	subi	rc, ar0		; Get the difference between the exponents
	lsh	ar0, ar1	; Normalize the divisor with the dividend

;
; Do count + 1 subtracts and shifts
;
	rpts	ar0
		subc	ar1, ar2

;
; mask off the lower count+1 bits
;
	subri	31, ar0		; Shift count (32 - (ar0 + 1))
	bud	ir1		; Delayed return
	lsh3	ar0, ar2, r0	; Shift left to drop the remainder bits
	negi	ar0, ar0
	lsh	ar0, r0		; Shift back right: r0 = quotient
	; Branch occurs here

;
; Handle a full 32-bit dividend
;
div_32:	tstb	ar1, ar1
	bld	qone		; if divisor high bit is one, the result is one
	lsh	-24, rc		; (delay slot) divisor exponent
	subri	31, rc		; (delay slot) shift count = 31 - exponent
	lsh	rc, ar1		; (delay slot) Line up the divisor

;
; Now divisor and dividend are aligned.  Do first SUBC by hand, save
; off the first quotient digit.  Then, shift divisor right rather
; than shifting dividend left.  This leaves a zero in the top bit of
; the dividend
;
	ldi	1, ar0		; Initialize MSB of quotient
	lsh	rc, ar0		; create a mask for MSBs
	subi	1, ar0		; mask is (2 << count) - 1

	subi3	ar1, ar2, r1	; r1 = dividend - aligned divisor
	ldihs	r1, ar2		; keep the difference if there was no borrow
	ldihs	1, r1		; first quotient digit is 1 ...
	ldilo	0, r1		; ... or 0
	lsh	rc, r1		; move it to its final bit position

	lsh	-1, ar1
	subi	1, rc
;
; do the rest of the shifts and subtracts
;
	rpts	rc
		subc	ar1, ar2

	bud	ir1
	and	ar0, ar2	; keep only the quotient bits
	or3	r1, ar2, r0	; merge in the first quotient digit
	nop

qone:
	bud	ir1
	ldi	1, r0
	nop
	nop

qzero:
	bud	ir1
	ldi	0, r0
	nop
	nop
#endif
367
;
; Integer unsigned modulo
;
; ar2 dividend, r2 divisor, r0 remainder
; Writes r0, r1, ar0, ar1, ar2, rc and ir1.
;
; umodqi3n is the register-argument entry point used by the other
; routines in this file.  A zero divisor, or a divisor larger than the
; dividend, returns the dividend unchanged.
;
#ifdef L_umodsi3
	.text
	.global	___umodqi3
	.global	umodqi3n
___umodqi3:
	.if .REGPARM == 0
	; Stack arguments: load dividend and divisor.
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	ldi	*-ar0(2), r2
	.endif

umodqi3n:
	pop	ir1		; return address
	cmpi	ar2, r2		; divisor > dividend ?
	bhi	uzero		;    if so, return dividend
	ldi	r2, ar1		; load divisor
;
; If top bit of dividend is set, handle specially.
;
	tstb	ar2, ar2	; check top bit
	bld	umod_32		; get divisor exponent, then jump.
;
; Get divisor exponent by converting to float.  These three
; instructions are the delay slots of the branch above.
;
	float	ar1, r1		; normalize divisor
	pushf	r1		; push as float
	pop	rc		; pop as int to get exponent
	bzd	uzero		; if (float)divisor was zero, return
;
; 31 or less bits in dividend.  Get dividend exponent.
;
	float	ar2, r1		; normalize dividend
	pushf	r1		; push as float
	pop	ar0		; pop as int to get exponent
;
; Use difference in exponents as shift count to line up MSBs.
;
	lsh	-24, rc		; divisor exponent
	lsh	-24, ar0	; dividend exponent
	subi	rc, ar0		; difference
	lsh	ar0, ar1	; shift divisor up
;
; Do COUNT+1 subtract & shifts.
;
	rpts	ar0
		subc	ar1, ar2
;
;  Remainder is in upper 31-COUNT bits.
;
	bud	ir1		; delayed branch to return
	addi	1, ar0		; shift count is COUNT+1
	negi	ar0, ar0	; negate for right shift
	lsh3	ar0, ar2, r0	; shift to get result
	; Return occurs here

;
; The following code handles cases of a full 32-bit dividend.  Before
; SUBC can be used, the top bit must be cleared (otherwise SUBC can
; possibly shift a significant 1 out the top of the dividend).  This
; is accomplished by first doing a normal subtraction, then proceeding
; with SUBCs.
;
umod_32:
;
; If the top bit of the divisor is set too, the remainder is simply
; the difference between the dividend and divisor.  Otherwise, shift
; the divisor up to line up the MSBs.
;
	tstb	ar1, ar1	; check divisor
	bld	uone		; if negative, remainder is diff

	lsh	-24, rc		; divisor exponent
	subri	31, rc		; shift count = 31 - exp
	negi	rc, ar0		; used later as shift count
	lsh	rc, ar1		; shift up to line up MSBs
;
; Now MSBs are aligned.  Do first SUBC by hand using a plain subtraction.
; Then, shift divisor right rather than shifting dividend left.  This leaves
; a 0 in the top bit of the dividend.
;
	subi3	ar1, ar2, r1	; subtract
	ldihs	r1, ar2		; if positive, replace dividend
	subi	1, rc		; first iteration is done
	lsh	-1, ar1		; shift divisor down
;
; Do EXP subtract & shifts.
;
	rpts	rc
		subc	ar1, ar2
;
;  Quotient is in EXP+1 LSBs; shift remainder (in MSBs) down.
;
	bud	ir1
	lsh3	ar0, ar2, r0	; COUNT contains -(EXP+1)
	nop
	nop
;
;  Return (dividend - divisor).
;
uone:	bud	ir1
	subi3	r2, ar2, r0
	nop
	nop
;
;  Return dividend.
;
uzero:	bud	ir1
	ldi	ar2, r0		; set status from result
	nop
	nop
#endif
483
;
; Integer signed modulo
;
; ar2 dividend, r2 divisor, r0 remainder
; The remainder takes the sign of the dividend.  A zero divisor, or a
; divisor whose magnitude exceeds the dividend, returns the dividend.
; Writes r0, r1, ar0, ar1, rc and ir1 (plus whatever umodqi3n writes
; on the mod_32 path).
;
#ifdef L_modsi3
	.text
	.global	___modqi3
	.ref umodqi3n
___modqi3:
	.if .REGPARM == 0
	; Stack arguments: load dividend and divisor.
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	ldi	*-ar0(2), r2
	.endif

;
; Determine sign of result.  Get absolute value of operands.
;
	ldi	ar2, ar0	; sign of result same as dividend
	absi	ar2, r0		; make dividend positive
	bvd	mod_32		; if still negative, escape
	absi	r2, r1		; (delay slot) make divisor positive
	ldi	r1, ar1		; (delay slot) save in ar1
	cmpi	r0, ar1		; (delay slot) divisor > dividend ?

	pop	ir1		; return address
	bhid	return 		;   if so, return dividend
;
; Normalize operands.  Use difference in exponents as shift count
; for divisor, and as repeat count for SUBC.
;
	float	r1, r1		; normalize divisor
	pushf	r1		; push as float
	pop	rc		; pop as int
	bzd	return		; if (float)divisor was zero, return

	float	r0, r1		; normalize dividend
	pushf	r1		; push as float
	pop	r1		; pop as int

	lsh	-24, rc		; get divisor exponent
	lsh	-24, r1		; get dividend exponent
	subi	rc, r1		; get difference in exponents
	lsh	r1, ar1		; align divisor with dividend
;
; Do COUNT+1 subtract & shifts.
;
	rpts	r1
		subc	ar1, r0
;
;  Remainder is in upper bits of R0
;
	addi	1, r1		; shift count is -(r1+1)
	negi	r1, r1
	lsh	r1, r0		; shift right
;
;  Check sign and negate result if necessary.
;
return:
	bud	ir1		; delayed branch to return
	negi	r0, r1		; negate result
	cmpi	0, ar0		; check sign
	ldin	r1, r0		; if set, use negative result
	; Return occurs here
;
; The following code handles cases of a full 32-bit dividend.  This occurs
; when R0 = abs(R0) = 080000000h.  Handle this by calling the unsigned mod
; function, then negating the result if necessary.
;
; umodqi3n takes its divisor in r2.  The delay slots of the bvd above
; left abs(divisor) in r1 only, so r2 still holds the signed divisor
; at this point; copy the absolute value across so that a negative
; divisor is not treated as a huge unsigned number.
;
mod_32:
	push	ar0		; remember sign
	ldi	r1, r2		; r2 = abs(divisor) for umodqi3n
	call	umodqi3n	; do divide

	brd	return		; return
	pop	ar0		; restore sign
	pop	ir1		; return address
	nop
#endif
562
;
; Floating point constant 2^32.  The unsigned conversion and ffs
; routines in this file add it to the result of a signed "float"
; that came out negative.
;
#ifdef L_unsfltconst
	.section .const
	.global ___unsfltconst
___unsfltconst:   .float 4294967296.0
#endif
568
;
; Floating point constant 2^31.  Not referenced by any routine in this
; file; presumably used by compiler-generated code (verify against the
; c4x machine description).
;
#ifdef L_unsfltcompare
	.section .const
	.global ___unsfltcompare
___unsfltcompare: .float 2147483648.0
#endif
574
; Integer 32-bit signed multiplication
;
; The TMS320C3x MPYI instruction takes two 24-bit signed integers
; and produces a 48-bit signed result which is truncated to 32-bits.
;
; A 32-bit by 32-bit multiplication thus requires a number of steps.
;
; Consider the product of two 32-bit signed integers,
;
;	z = x * y
;
; where x = (b << 16) + a,  y = (d << 16) + c
;
; This can be expressed as
;
;	z = ((b << 16) + a) * ((d << 16) + c)
;
;          = ((b * d) << 32) + ((b * c + a * d) << 16) + a * c
;
; Let z = (f << 16) + e where f < (1 << 16).
;
; Since we are only interested in a 32-bit result, we can ignore the
; (b * d) << 32 term, and thus
;
;	f = b * c + a * d,  e = a * c
;
; We can simplify things if we have some a priori knowledge of the
; operands, for example, if -32768 <= y <= 32767, then y = c and d = 0 and thus
;
;	f = b * c,  e = a * c
;
; ar2 multiplier, r2 multiplicand, r0 product
; clobbers r2, r3, ar2, ir1 (and ar0 when .REGPARM == 0)
#ifdef L_mulsi3
	.text
	.global	___mulqi3
___mulqi3:
	.if .REGPARM == 0
	; Stack arguments: load multiplier and multiplicand.
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	ldi	*-ar0(2), r2
	.endif

	pop	ir1		; return address
	ldi	ar2, r0		;
	and	0ffffh, r0	; a
	lsh	-16, ar2	; b
	ldi	r2, r3		;
	and	0ffffh, r3	; c
	mpyi	r3, ar2		; c * b
	lsh	-16, r2		; d
	mpyi	r0, r2		; a * d
	addi	ar2, r2		; c * b + a * d
	bd	ir1		; delayed branch to return
	lsh	16, r2		; (c * b + a * d) << 16
	mpyi	r3, r0		; a * c
	addi	r2, r0		; a * c + (c * b + a * d) << 16
; branch occurs here

#endif
639
;
; Integer 64 by 64 multiply
; long1 and long2 on stack
; result in r0,r1
;
; Only the low 64 bits of the product are produced: one full
; 32x32->64 product of two operand words, plus the low 32 bits of the
; two cross products added into r1.
;
#ifdef L_muldi3
	.text
	.global	___mulhi3
#ifdef _TMS320C4x
___mulhi3:
	pop	ar0		; return address
	ldi	sp,ar2		; offsets below are relative to sp after the pop
	ldi	*-ar2(1),r2
	ldi	*-ar2(3),r3
	mpyi3	r2,r3,r0	; r0 = low word of r2 * r3
	mpyuhi3	r2,r3,r1	; r1 = unsigned high word of r2 * r3
	mpyi	*-ar2(2),r2	; cross product 1
	bd	ar0		; delayed return
	mpyi	*-ar2(0),r3	; cross product 2
	addi	r2,r1
	addi	r3,r1
#else
___mulhi3:
	; No 32x32 high-part multiply is used here, so every word is
	; split into 16-bit halves (low half in r2/r3, high half in
	; ar0/ar1) and the partial products are summed by hand.
	ldi	sp,ar2		; return address is still on the stack
	ldi	-16,rs		; rs = shift count for "logical right by 16"
	ldi	*-ar2(2),ar0
	ldi	*-ar2(4),ar1
	ldi	ar0,r2
	and	0ffffh,r2
	ldi	ar1,r3
	and	0ffffh,r3
	lsh	rs,ar0
	lsh	rs,ar1

	; r1:r0 = 64-bit product of the words at offsets 2 and 4
	mpyi	r2,r3,r0	; low * low
	mpyi	ar0,ar1,r1	; high * high
	mpyi	r2,ar1,rc	; first cross term
	lsh	rs,rc,re
	addi	re,r1		; its upper half goes to r1
	lsh	16,rc
	addi	rc,r0		; its lower half goes to r0 ...
	addc	0,r1		; ... with carry into r1
	mpyi	r3,ar0,rc	; second cross term, same treatment
	lsh	rs,rc,re
	addi	re,r1
	lsh	16,rc
	addi	rc,r0
	addc	0,r1

	; r1 += low 32 bits of (word at offset 1) * (word at offset 4)
	ldi	*-ar2(1),ar0
	ldi	ar0,r2
	and	0ffffh,r2
	lsh	rs,ar0
	mpyi	r2,r3,rc
	addi	rc,r1
	mpyi	r2,ar1,rc
	mpyi	r3,ar0,re
	addi	re,rc
	lsh	16,rc
	addi	rc,r1

	; r1 += low 32 bits of (word at offset 2) * (word at offset 3)
	ldi	*-ar2(2),ar0
	ldi	*-ar2(3),ar1
	ldi	ar0,r2
	and	0ffffh,r2
	ldi	ar1,r3
	and	0ffffh,r3
	lsh	rs,ar0
	lsh	rs,ar1
	mpyi	r2,r3,rc
	addi	rc,r1
	mpyi	r2,ar1,rc
	mpyi	r3,ar0,re
	pop	ar0		; return address
	bd	ar0		; delayed return
	addi	re,rc
	lsh	16,rc
	addi	rc,r1
#endif
#endif
720
;
; Integer 32 by 32 multiply highpart unsigned
; src1 in ar2
; src2 in r2
; result in r0
;
; Both operands are split into 16-bit halves.  r1 accumulates the low
; word of the product only so that its carries can be propagated into
; the high word in r0.
; Writes r0, r1, r2, r3, ar0, ar1, ar2, rc, re and rs.
;
#ifdef L_umuldi3_high
	.text
	.global	___umulhi3_high
___umulhi3_high:
	.if .REGPARM == 0
	; Stack arguments: load both sources.
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	ldi	*-ar0(2), r2
	.endif

	ldi	-16,rs		; rs = shift count for "logical right by 16"
	ldi	r2,r3
	and	0ffffh,r2	; r2  = low half of src2
	ldi	ar2,ar1
	and	0ffffh,ar2	; ar2 = low half of src1
	lsh	rs,r3		; r3  = high half of src2
	lsh	rs,ar1		; ar1 = high half of src1

	mpyi	ar2,r2,r1	; low * low   -> low word
	mpyi	ar1,r3,r0	; high * high -> high word
	mpyi	ar2,r3,rc	; first cross term
	lsh	rs,rc,re
	addi	re,r0		; its upper half goes to the high word
	lsh	16,rc
	addi	rc,r1		; its lower half goes to the low word ...
	addc	0,r0		; ... with carry into the high word
	mpyi	r2,ar1,rc	; second cross term, same treatment
	lsh	rs,rc,re
	addi	re,r0
	pop	ar0		; return address
	bd	ar0		; delayed return
	lsh	16,rc
	addi	rc,r1
	addc	0,r0
#endif
766
;
; Integer 32 by 32 multiply highpart signed
; src1 in ar2
; src2 in r2
; result in r0
;
; Computes the unsigned high part exactly as ___umulhi3_high does and
; adds a correction term built in rc beforehand:
;	rc  = 0
;	rc  = -src1        if src2 < 0
;	rc  = rc - src2    if src1 < 0
; Writes r0, r1, r2, r3, ar0, ar1, ar2, rc, re and rs.
;
#ifdef L_smuldi3_high
	.text
	.global	___smulhi3_high
___smulhi3_high:
	.if .REGPARM == 0
	; Stack arguments: load both sources.
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	ldi	*-ar0(2), r2
	.endif

	ldi	-16,rs		; rs = shift count for "logical right by 16"
	ldi	0,rc		; rc = signed correction, initially 0
	subi3	ar2,rc,r0	; r0 = -src1
	ldi	r2,r3		; copy src2 (the ldilt below tests src2)
	ldilt	r0,rc		; src2 < 0: rc = -src1
	subi3	r2,rc,r0	; r0 = rc - src2
	ldi	ar2,ar1
	tstb	ar1,ar1		; test sign of src1
	ldilt	r0,rc		; src1 < 0: rc = rc - src2
	and	0ffffh,r2	; r2  = low half of src2
	and	0ffffh,ar2	; ar2 = low half of src1
	lsh	rs,r3		; r3  = high half of src2
	lsh	rs,ar1		; ar1 = high half of src1

	mpyi	ar2,r2,r1	; low * low   -> low word
	mpyi	ar1,r3,r0	; high * high -> high word
	addi	rc,r0		; apply the signed correction
	mpyi	ar2,r3,rc	; first cross term
	lsh	rs,rc,re
	addi	re,r0
	lsh	16,rc
	addi	rc,r1
	addc	0,r0
	mpyi	r2,ar1,rc	; second cross term
	lsh	rs,rc,re
	addi	re,r0
	pop	ar0		; return address
	bd	ar0		; delayed return
	lsh	16,rc
	addi	rc,r1
	addc	0,r0
#endif
819
;
; Integer 64 by 64 unsigned divide
; long1 and long2 on stack
; divide in r0,r1
; modulo in r2,r3
; routine takes a maximum of 64*8+23=535 cycles = 21.4 us @ 50Mhz
;
; ___udivide and ___umodulo are register-argument entry points:
;	r0,r1   = dividend (low, high)
;	ar0,ar1 = divisor  (low, high)
; When both high words are zero they defer to the 32-bit routines
; (___udivide fills r0,r1 only; ___umodulo fills r2,r3, leaving r0 a
; copy of the remainder).  Otherwise they fall into a 64-iteration
; shift and subtract loop that produces quotient and remainder together.
;
#ifdef L_udivdi3
	.text
	.global	___udivhi3
	.global	___udivide
	.global	___umodulo
	.ref udivqi3n
	.ref umodqi3n
___udivhi3:
	ldi	sp,ar2
	ldi	*-ar2(4),ar0	; divisor low
	ldi	*-ar2(3),ar1	; divisor high
	ldi	*-ar2(2),r0	; dividend low
	ldi	*-ar2(1),r1	; dividend high

___udivide:
	or	r1,ar1,r2	; any bit set in either high word?
	bne	udiv0
	ldi	ar0,r2		; no: plain 32-bit divide
	ldi	r0,ar2
	call	udivqi3n
	ldiu	0,r1
	rets

___umodulo:
	or	r1,ar1,r2	; any bit set in either high word?
	bne	udiv0
	ldi	ar0,r2		; no: plain 32-bit modulo
	ldi	r0,ar2
	call	umodqi3n
	ldi	r0,r2
	ldiu	0,r3
	rets

;
; 64-bit dividend.  If the divisor has a zero high word and a clear
; top bit in its low word, a 32-bit running remainder (r2) is enough;
; otherwise use the 64-bit remainder loop at udiv1.
;
udiv0:
	tstb	ar1,ar1
	bne	udiv1
	tstb	ar0,ar0
	bn	udiv1

	ldiu	63,rc		; repeat count for the 64 loop iterations
#ifdef _TMS320C4x
	rptbd	udivend0	; delayed: next three instructions run first
	ldiu	0,r2
	addi	r0,r0
	rolc	r1
#else
	ldiu	0,r2
	addi	r0,r0
	rolc	r1
	rptb	udivend0
#endif

	; Loop body: rotate the next dividend bit into the remainder,
	; try the subtraction and rotate the resulting carry into the
	; quotient (which is built in place in r0,r1).
	rolc	r2
	subi3	ar0,r2,r3
	ldinc	r3,r2		; keep the difference when carry is clear
	rolc	r0
udivend0:
	rolc	r1

	not	r0		; the rotated-in bits are the complement
	not	r1		; of the quotient bits, so invert them
	ldiu	0,r3		; remainder high word is zero on this path
	rets
udiv1:
	push	r4		; r4 and r5 are scratch for this loop only
	push	r5
	ldiu	63,rc		; repeat count for the 64 loop iterations
	ldiu	0,r2
#ifdef _TMS320C4x
	rptbd	udivend1	; delayed: next three instructions run first
	ldiu	0,r3
	addi	r0,r0
	rolc	r1
#else
	ldiu	0,r3
	addi	r0,r0
	rolc	r1
	rptb	udivend1
#endif

	; Same loop with a 64-bit running remainder in r2,r3.
	rolc	r2
	rolc	r3
	subi3	ar0,r2,r4
	subb3	ar1,r3,r5
	ldinc	r4,r2		; keep the difference when carry is clear
	ldinc	r5,r3
	rolc	r0
udivend1:
	rolc	r1

	not	r0		; invert the accumulated quotient bits
	not	r1
	pop	r5
	pop	r4
	rets
#endif
923
;
; Integer 64 by 64 unsigned modulo
; long1 and long2 on stack
; result in r0,r1
;
; Thin wrapper around ___umodulo (defined with ___udivhi3), which
; leaves the remainder in r2,r3; it is moved to r0,r1 in the delay
; slots of the return branch.
;
#ifdef L_umoddi3
	.text
	.global	___umodhi3
	.ref ___umodulo
___umodhi3:
	ldi	sp,ar2
	ldi	*-ar2(4),ar0	; divisor low
	ldi	*-ar2(3),ar1	; divisor high
	ldi	*-ar2(2),r0	; dividend low
	ldi	*-ar2(1),r1	; dividend high
	call	___umodulo
	pop	ar0		; return address
	bd	ar0		; delayed return
	ldi	r2,r0
	ldi	r3,r1
	nop
#endif
946
;
; Integer 64 by 64 signed divide
; long1 and long2 on stack
; result in r0,r1
;
; Both operands are made non-negative, ___udivide does the work, and
; the quotient is negated when exactly one operand was negative.
; ir0 tracks the sign: it starts at 0 and is complemented once per
; negative operand.
;
#ifdef L_divdi3
	.text
	.global	___divhi3
	.ref ___udivide
___divhi3:
	ldi	0,ir0
	ldi	sp,ar2
	ldi	*-ar2(4),r0	; divisor low
	ldi	*-ar2(3),r1	; divisor high (sets the flags tested below)
	bge	div1
	not	ir0
	negi	r0		; 64-bit negate: low word ...
	negb	r1		; ... then high word with borrow
div1:
	ldi	r0,ar0		; ___udivide wants the divisor in ar0,ar1
	ldi	r1,ar1
	ldi	*-ar2(2),r0	; dividend low
	ldi	*-ar2(1),r1	; dividend high (sets the flags tested below)
	bge	div2
	not	ir0
	negi	r0
	negb	r1
div2:
	call	___udivide
	tstb	ir0,ir0		; signs differed?
	bge	div3
	negi	r0
	negb	r1
div3:
	rets
#endif
983
;
; Integer 64 by 64 signed modulo
; long1 and long2 on stack
; result in r0,r1
;
; Both operands are made non-negative and ___umodulo does the work.
; The remainder takes the sign of the dividend only, which matches
; ___modqi3 and keeps (a / b) * b + a % b == a with the truncating
; ___divhi3.  The sign of the divisor therefore must not toggle ir0.
;
#ifdef L_moddi3
	.text
	.global	___modhi3
	.ref ___umodulo
___modhi3:
	ldi	0,ir0		; ir0 = 0: result stays positive
	ldi	sp,ar2
	ldi	*-ar2(4),r0	; divisor low
	ldi	*-ar2(3),r1	; divisor high (sets the flags tested below)
	bge	mod1
	negi	r0		; use abs(divisor); result sign is unaffected
	negb	r1
mod1:
	ldi	r0,ar0		; ___umodulo wants the divisor in ar0,ar1
	ldi	r1,ar1
	ldi	*-ar2(2),r0	; dividend low
	ldi	*-ar2(1),r1	; dividend high (sets the flags tested below)
	bge	mod2
	not	ir0		; negative dividend: negate the remainder
	negi	r0
	negb	r1
mod2:
	call	___umodulo
	ldi	r2,r0		; remainder comes back in r2,r3
	ldi	r3,r1
	tstb	ir0,ir0		; was the dividend negative?
	bge	mod3
	negi	r0
	negb	r1
mod3:
	rets
#endif
1022
;
; double to signed long long conversion
; input in r2
; result in r0,r1
;
; Non-negative inputs branch straight into ufix_truncqfhi2n, which
; returns to our caller.  Negative inputs are negated, converted, and
; the 64-bit result is negated again.
;
#ifdef L_fix_truncsfdi2
	.text
	.global	___fix_truncqfhi2
	.ref ufix_truncqfhi2n
___fix_truncqfhi2:
	.if .REGPARM == 0
	; Stack argument: load the input.
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldf	*-ar0(1), r2
	.endif

	cmpf	0.0,r2
	bge	ufix_truncqfhi2n
	negf	r2
	call	ufix_truncqfhi2n
	negi	r0		; 64-bit negate: low word ...
	negb	r1		; ... then high word with borrow
	rets
#endif
1050
;
; double to unsigned long long conversion
; input in r2
; result in r0,r1
;
; Returns 0 for inputs <= 0 and whenever the shift count computed
; below (exponent - 31) is greater than 32 or not greater than -32.
; ufix_truncqfhi2n is the register-argument entry point.
;
#ifdef L_ufix_truncsfdi2
	.text
	.global	___ufix_truncqfhi2
	.global	ufix_truncqfhi2n
___ufix_truncqfhi2:
	.if .REGPARM == 0
	; Stack argument: load the input.
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldf	*-ar0(1), r2
	.endif

ufix_truncqfhi2n:
	cmpf	0.0,r2
	ble	ufix1		; zero or negative: result is 0
	pushf	r2
	pop	r3		; r3 = bit pattern of the input
	ash	-24,r3		; r3 = exponent
	subi	31,r3		; r3 = shift count for the low word
	cmpi	32,r3
	bgt	ufix1		; out of range: result is 0
	cmpi	-32,r3
	ble	ufix1		; out of range: result is 0
	ldi	1,r0
	ash	31,r0		; r0 = 080000000h
	or3	r0,r2,r0	; presumably sets the implied leading
				; mantissa bit -- confirm against the
				; float format
	ldi	r0,r1
	lsh3	r3,r0,r0	; low word  = mantissa shifted by r3
	subi	32,r3
	cmpi	-32,r3
	ldile	0,r1		; shift of -32 or beyond: high word is 0
	lsh3	r3,r1,r1	; high word = mantissa shifted by r3 - 32
	rets
ufix1:
	ldi	0,r0
	ldi	0,r1
	rets
#endif
1096
;
; signed long long to double conversion
; input on stack
; result in r0
;
; Non-negative inputs branch straight into ufloathiqf2n, which pops
; the return address itself.  Negative inputs are negated, converted
; via a call, and the floating point result is negated again.
;
#ifdef L_floatdisf2
	.text
	.global	___floathiqf2
	.ref ufloathiqf2n
___floathiqf2:
	ldi	sp,ar2
	ldi	*-ar2(2),r0	; low word
	ldi	*-ar2(1),r1	; high word (sets the flags tested below)
	bge	ufloathiqf2n
	negi	r0		; 64-bit negate: low word ...
	negb	r1		; ... then high word with borrow
	call	ufloathiqf2n
	negf	r0
	rets
#endif
1117
;
; unsigned long long to double conversion
; input on stack
; result in r0
;
; ufloathiqf2n is the register-argument entry point: r0 = low word,
; r1 = high word.  Each word is converted with a signed "float" and
; 2^32 (___unsfltconst) is added back when the result came out
; negative.  The result is high * 2^32 + low.
;
#ifdef L_ufloatdisf2
	.text
	.global	___ufloathiqf2
	.global	ufloathiqf2n
	.ref ___unsfltconst
___ufloathiqf2:
	ldi	sp,ar2
	ldi	*-ar2(2),r0	; low word
	ldi	*-ar2(1),r1	; high word
ufloathiqf2n:
	.if .BIGMODEL
	; Big memory model: set the data page before the direct load.
#ifdef _TMS320C4x
	ldpk	@___unsfltconst
#else
	ldp	@___unsfltconst
#endif
	.endif
	ldf	@___unsfltconst,r2	; r2 = 2^32
	float	r0
	bge	uflt1
	addf	r2,r0		; top bit was set: correct by 2^32
uflt1:
	float	r1
	bge	uflt2
	addf	r2,r1		; top bit was set: correct by 2^32
uflt2:
#ifdef _TMS320C4x
	pop	r3		; return address
	bd	r3		; delayed return
	mpyf	r2,r1		; r1 = high * 2^32
	addf	r1,r0		; r0 = low + high * 2^32
	nop
#else
	; The low eight mantissa bits of r1 are split off into r3 and
	; multiplied separately, presumably because mpyf does not use
	; them -- confirm against the C3x manual.
	ldf	r1,r3
	and	0ffh,r3
	norm	r3,r3
	mpyf	r2,r3
	pop	ar2		; return address
	bd	ar2		; delayed return
	addf	r3,r0
	mpyf	r2,r1
	addf	r1,r0
#endif
#endif
1167
;
; long double to signed long long conversion
; input in r2
; result in r0,r1
;
; Non-negative inputs branch straight into ufix_trunchfhi2n, which
; returns to our caller.  Negative inputs are negated, converted, and
; the 64-bit result is negated again.
;
#ifdef L_fix_truncdfdi2
	.text
	.global	___fix_trunchfhi2
	.ref ufix_trunchfhi2n
___fix_trunchfhi2:
	.if .REGPARM == 0
	; Stack argument: the input occupies two words, loaded with an
	; ldf / ldi pair into the same register.
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldf	*-ar0(2), r2
	ldi	*-ar0(1), r2
	.endif

	cmpf	0.0,r2
	bge	ufix_trunchfhi2n
	negf	r2
	call	ufix_trunchfhi2n
	negi	r0		; 64-bit negate: low word ...
	negb	r1		; ... then high word with borrow
	rets
#endif
1196
;
; long double to unsigned long long conversion
; input in r2
; result in r0,r1
;
; Returns 0 for inputs <= 0 and whenever the shift count computed
; below (exponent - 31) is greater than 32 or not greater than -32.
; ufix_trunchfhi2n is the register-argument entry point.
;
#ifdef L_ufix_truncdfdi2
	.text
	.global	___ufix_trunchfhi2
	.global	ufix_trunchfhi2n
___ufix_trunchfhi2:
	.if .REGPARM == 0
	; Stack argument: the input occupies two words, loaded with an
	; ldf / ldi pair into the same register.
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldf	*-ar0(2), r2
	ldi	*-ar0(1), r2
	.endif

ufix_trunchfhi2n:
	cmpf	0.0,r2
	ble	ufixh1		; zero or negative: result is 0
	pushf	r2
	pop	r3		; r3 = bit pattern of the input
	ash	-24,r3		; r3 = exponent
	subi	31,r3		; r3 = shift count for the low word
	cmpi	32,r3
	bgt	ufixh1		; out of range: result is 0
	cmpi	-32,r3
	ble	ufixh1		; out of range: result is 0
	ldi	1,r0
	ash	31,r0		; r0 = 080000000h
	or3	r0,r2,r0	; presumably sets the implied leading
				; mantissa bit -- confirm against the
				; float format
	ldi	r0,r1
	lsh3	r3,r0,r0	; low word  = mantissa shifted by r3
	subi	32,r3
	cmpi	-32,r3
	ldile	0,r1		; shift of -32 or beyond: high word is 0
	lsh3	r3,r1,r1	; high word = mantissa shifted by r3 - 32
	rets
ufixh1:
	ldi	0,r0
	ldi	0,r1
	rets
#endif
1243
;
; signed long long to long double conversion
; input on stack
; result in r0
;
; Non-negative inputs branch straight into ufloathihf2n, which pops
; the return address itself.  Negative inputs are negated, converted
; via a call, and the floating point result is negated again.
;
#ifdef L_floatdidf2
	.text
	.global	___floathihf2
	.ref ufloathihf2n
___floathihf2:
	ldi	sp,ar2
	ldi	*-ar2(2),r0	; low word
	ldi	*-ar2(1),r1	; high word (sets the flags tested below)
	bge	ufloathihf2n
	negi	r0		; 64-bit negate: low word ...
	negb	r1		; ... then high word with borrow
	call	ufloathihf2n
	negf	r0
	rets
#endif
1264
;
; unsigned long long to long double conversion
; input on stack
; result in r0
;
; ufloathihf2n is the register-argument entry point: r0 = low word,
; r1 = high word.  Each word is converted with a signed "float" and
; 2^32 (___unsfltconst) is added back when the result came out
; negative.  The result is high * 2^32 + low.
;
#ifdef L_ufloatdidf2
	.text
	.global	___ufloathihf2
	.global	ufloathihf2n
	.ref ___unsfltconst
___ufloathihf2:
	ldi	sp,ar2
	ldi	*-ar2(2),r0	; low word
	ldi	*-ar2(1),r1	; high word
ufloathihf2n:
	.if .BIGMODEL
	; Big memory model: set the data page before the direct load.
#ifdef _TMS320C4x
	ldpk	@___unsfltconst
#else
	ldp	@___unsfltconst
#endif
	.endif
	ldf	@___unsfltconst,r2	; r2 = 2^32
	float	r0
	bge	uflth1
	addf	r2,r0		; top bit was set: correct by 2^32
uflth1:
	float	r1
	bge	uflth2
	addf	r2,r1		; top bit was set: correct by 2^32
uflth2:
#ifdef _TMS320C4x
	pop	r3		; return address
	bd	r3		; delayed return
	mpyf	r2,r1		; r1 = high * 2^32
	addf	r1,r0		; r0 = low + high * 2^32
	nop
#else
	; The low eight mantissa bits of r1 are split off into r3 and
	; multiplied separately, presumably because mpyf does not use
	; them -- confirm against the C3x manual.
	ldf	r1,r3
	and	0ffh,r3
	norm	r3,r3
	mpyf	r2,r3
	pop	ar2		; return address
	bd	ar2		; delayed return
	addf	r3,r0
	mpyf	r2,r1
	addf	r1,r0
#endif
#endif
1314
;
; calculate ffs
; input in ar2
; result in r0
;
; The lowest set bit is isolated with (x & -x) and converted to
; floating point; its exponent is then the bit index.  The result is
; exponent + 1, or 0 when the exponent comes out negative (the
; zero-input case, presumably -- confirm the exponent "float" gives
; for 0).
;
#ifdef L_ffs
	.global	___ffs
	.ref ___unsfltconst
	.text
___ffs:
	.if .REGPARM == 0
	; Stack argument: load the input.
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldi	*-ar0(1), ar2
	.endif

	negi	ar2,r0
	and	ar2,r0		; r0 = x & -x: only the lowest set bit
	float	r0,r0
	ldfu	0.0,r1		; correction defaults to 0.0
	.if .BIGMODEL
	; Big memory model: set the data page before the direct load.
#ifdef _TMS320C4x
	ldpk	@___unsfltconst
#else
	ldp	@___unsfltconst
#endif
	.endif
	ldflt	@___unsfltconst,r1	; negative float: correct by 2^32
	addf	r1,r0
	pushf	r0
	pop	r0		; r0 = bit pattern of the float
	pop	ar0		; return address
	bd	ar0		; delayed return
	ash	-24,r0		; r0 = exponent
	ldilt	-1,r0		; negative exponent: force result to 0
	addi	1,r0		; bit positions are numbered from 1
#endif
1355
;
; calculate long double * long double
; input in r2, r3
; output in r0
;
; The low eight mantissa bits of each operand ("lsb0", "lsb1") are
; split off and multiplied separately, then added to the product of
; the operands themselves.
; Writes r0, r1, r3 and ar2 (and ar0 when .REGPARM == 0).
;
#ifdef L_muldf3
	.global ___mulhf3
	.text
___mulhf3:
	.if .REGPARM == 0
	; Stack arguments: each operand occupies two words, loaded
	; with an ldf / ldi pair into the same register.
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldf	*-ar0(2), r2
	ldi	*-ar0(1), r2
	ldf	*-ar0(4), r3
	ldi	*-ar0(3), r3
	.endif

	pop	ar2		; return ad
	ldf	r2,r0		; copy lsb0
	ldf	r3,r1		; copy lsb1
	and	0ffh,r0		; mask lsb0
	and	0ffh,r1		; mask lsb1
	norm	r0,r0		; correct lsb0
	norm	r1,r1		; correct lsb1
	mpyf	r2,r1		; arg0*lsb1
	mpyf	r3,r0		; arg1*lsb0
	bd	ar2		; return (delayed)
	addf	r0,r1		; arg0*lsb1 + arg1*lsb0
	mpyf	r2,r3,r0	; msb0*msb1
	addf	r1,r0		; msb0*msb1 + arg0*lsb1 + arg1*lsb0
#endif
1391
;
; calculate long double / long double
; r2 dividend, r3 divisor, r0 quotient
;
; Same reciprocal iteration as ___divqf3:
;	r[i + 1] = r[i] * (2.0 - v * r[i])
; followed by a multiply with the dividend.  On the C3x path the last
; refinement and the final multiply are carried out in extended
; precision, using the same "low eight mantissa bits" split as
; ___mulhf3.
; Writes r0, r1, r3 and ar1 (and ar0 when .REGPARM == 0); r4 is saved
; and restored on the C3x path.
;
#ifdef L_divdf3
	.global ___divhf3
	.text
___divhf3:
	.if .REGPARM == 0
	; Stack arguments: each operand occupies two words, loaded
	; with an ldf / ldi pair into the same register.
#ifdef _TMS320C4x
	lda	sp,ar0
#else
	ldiu	sp,ar0
#endif
	ldf	*-ar0(2), r2
	ldi	*-ar0(1), r2
	ldf	*-ar0(4), r3
	ldi	*-ar0(3), r3
	.endif

#ifdef _TMS320C4x
	pop	ar1		; return address
	rcpf	r3, r0		; initial estimate r[0]
	mpyf3	r0, r3, r1	; r1 = r[0] * v
	subrf	2.0, r1		; r1 = 2.0 - r[0] * v
	mpyf	r1, r0		; r0 = r[1]
	mpyf3	r0, r3, r1	; r1 = r[1] * v
	bud	ar1		; delayed return
	subrf	2.0, r1		; r1 = 2.0 - r[1] * v
	mpyf	r1, r0		; r0 = r[2]
	mpyf	r2, r0		; Multiply by the dividend
#else
	pop	ar1		; return address
	; Initial estimate from the complemented bit pattern of the
	; divisor, exactly as in ___divqf3.
	pushf	r3
	pop	r0
	not	r0
	push	r0
	popf	r0
	ldf	-1.0, r1	; undo complement sign bit
	xor	r1, r0

	mpyf3	r0, r3, r1	; r1 = r[0] * v
	subrf	2.0, r1		; r1 = 2.0 - r[0] * v
	mpyf	r1, r0		; r0 = r[0] * (2.0 - r[0] * v) = r[1]
; End of 1st iteration

	mpyf3	r0, r3, r1	; r1 = r[1] * v
	subrf	2.0, r1		; r1 = 2.0 - r[1] * v
	mpyf	r1, r0		; r0 = r[1] * (2.0 - r[1] * v) = r[2]
; End of 2nd iteration

	mpyf3	r0, r3, r1	; r1 = r[2] * v
	subrf	2.0, r1		; r1 = 2.0 - r[2] * v
	mpyf	r1, r0		; r0 = r[2] * (2.0 - r[2] * v) = r[3]
; End of 3rd iteration

	or	080h, r0
	rnd	r0

;	mpyf3	r0, r3, r1	; r1 = r[3] * v
; Extended precision version of the multiply above; r4 is scratch.
	push	r4		; save integer part of r4
	pushf	r4		; save floating point part of r4
	mpyf	r0, r3, r1

	ldf	r0, r4
	and	0ffh, r4
	norm	r4, r4
	mpyf	r3, r4
	addf	r4, r1		; + v * lsb(r[3])

	ldf	r3, r4
	and	0ffh, r4
	norm 	r4, r4
	mpyf	r0, r4
	addf	r4, r1		; + r[3] * lsb(v)

	subrf	2.0, r1		; r1 = 2.0 - r[3] * v

	mpyf	r1, r0, r3	; r3 = r[3] * (2.0 - r[3] * v) = r[4]

	ldf	r1, r4
	and	0ffh, r4
	norm	r4, r4
	mpyf	r0, r4
	addf	r4, r3

	ldf	r0, r4
	and	0ffh, r4
	norm 	r4, r4
	mpyf	r1, r4
	addf	r4, r3

	mpyf	r2, r3, r0	; Multiply by the dividend

	ldf	r2, r4
	and	0ffh, r4
	norm	r4, r4
	mpyf	r3, r4
	addf	r4, r0

	ldf	r3, r4
	and	0ffh, r4
	norm 	r4, r4
	mpyf	r2, r4
	bd	ar1		; delayed return
	addf	r4, r0

	popf	r4		; restore r4 in reverse order
	pop	r4
#endif
#endif
1503