/* Copyright (C) 2000-2020 Free Software Foundation, Inc.
   Contributed by James E. Wilson <wilson@cygnus.com>.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
#ifdef L__divxf3
// Compute a 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.

	.text
	.align 16
	.global __divxf3
	.proc __divxf3
__divxf3:
#ifdef SHARED
	.global __divtf3
__divtf3:
#endif
	cmp.eq p7, p0 = r0, r0		// p7 = 1: assume frcpa delivers the final result
	frcpa.s0 f10, p6 = farg0, farg1	// f10 ~= 1/farg1; p6 = 1 when refinement is needed
	;;
(p6)	cmp.ne p7, p0 = r0, r0		// normal case: clear p7, take refinement path
	.pred.rel.mutex p6, p7
	// Newton-Raphson refinement: f10 converges to 1/farg1, f11/f12 to
	// the quotient farg0/farg1, all in register (double-extended) precision.
(p6)	fnma.s1 f11 = farg1, f10, f1	// e = 1 - b*y0
(p6)	fma.s1 f12 = farg0, f10, f0	// q0 = a*y0
	;;
(p6)	fma.s1 f13 = f11, f11, f0	// e^2
(p6)	fma.s1 f14 = f11, f11, f11	// e + e^2
	;;
(p6)	fma.s1 f11 = f13, f13, f11	// e + e^4
(p6)	fma.s1 f13 = f14, f10, f10	// refined reciprocal
	;;
(p6)	fma.s1 f10 = f13, f11, f10	// further refined reciprocal
(p6)	fnma.s1 f11 = farg1, f12, farg0	// r = a - b*q0
	;;
(p6)	fma.s1 f11 = f11, f10, f12	// refined quotient
(p6)	fnma.s1 f12 = farg1, f10, f1	// residual error of reciprocal
	;;
(p6)	fma.s1 f10 = f12, f10, f10	// final reciprocal
(p6)	fnma.s1 f12 = farg1, f11, farg0	// final remainder r = a - b*q
	;;
(p6)	fma.s0 fret0 = f12, f10, f11	// correctly rounded quotient (.s0)
(p7)	mov fret0 = f10			// special case fully handled by frcpa
	br.ret.sptk rp
	.endp __divxf3
#endif
7263d1a8abSmrg
#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
__divdf3:
	cmp.eq p7, p0 = r0, r0		// p7 = 1: assume frcpa delivers the final result
	frcpa.s0 f10, p6 = farg0, farg1	// f10 ~= 1/farg1; p6 = 1 when refinement is needed
	;;
(p6)	cmp.ne p7, p0 = r0, r0		// normal case: clear p7, take refinement path
	.pred.rel.mutex p6, p7
	// Newton-Raphson refinement: f10 converges to the reciprocal and
	// f11 to the quotient; .d completers round intermediate results
	// to double precision where required for correct rounding.
(p6)	fmpy.s1 f11 = farg0, f10	// q0 = a*y0
(p6)	fnma.s1 f12 = farg1, f10, f1	// e = 1 - b*y0
	;;
(p6)	fma.s1 f11 = f12, f11, f11	// refine quotient
(p6)	fmpy.s1 f13 = f12, f12		// e^2
	;;
(p6)	fma.s1 f10 = f12, f10, f10	// refine reciprocal
(p6)	fma.s1 f11 = f13, f11, f11	// refine quotient
	;;
(p6)	fmpy.s1 f12 = f13, f13		// e^4
(p6)	fma.s1 f10 = f13, f10, f10	// refine reciprocal
	;;
(p6)	fma.d.s1 f11 = f12, f11, f11	// quotient rounded to double
(p6)	fma.s1 f10 = f12, f10, f10	// final reciprocal
	;;
(p6)	fnma.d.s1 f8 = farg1, f11, farg0 // remainder r = a - b*q
	;;
(p6)	fma.d fret0 = f8, f10, f11	// correctly rounded double quotient
(p7)	mov fret0 = f10			// special case fully handled by frcpa
	br.ret.sptk rp
	;;
	.endp __divdf3
#endif
11463d1a8abSmrg
#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
__divsf3:
	cmp.eq p7, p0 = r0, r0		// p7 = 1: assume frcpa delivers the final result
	frcpa.s0 f10, p6 = farg0, farg1	// f10 ~= 1/farg1; p6 = 1 when refinement is needed
	;;
(p6)	cmp.ne p7, p0 = r0, r0		// normal case: clear p7, take refinement path
	.pred.rel.mutex p6, p7
	// Newton-Raphson refinement: f8 converges to the quotient,
	// f9 holds the shrinking error term.
(p6)	fmpy.s1 f8 = farg0, f10		// q0 = a*y0
(p6)	fnma.s1 f9 = farg1, f10, f1	// e = 1 - b*y0
	;;
(p6)	fma.s1 f8 = f9, f8, f8		// refine quotient
(p6)	fmpy.s1 f9 = f9, f9		// e^2
	;;
(p6)	fma.s1 f8 = f9, f8, f8		// refine quotient
(p6)	fmpy.s1 f9 = f9, f9		// e^4
	;;
(p6)	fma.d.s1 f10 = f9, f8, f8	// final refinement, double precision
	;;
(p6)	fnorm.s.s0 fret0 = f10		// round to single with IEEE semantics
(p7)	mov fret0 = f10			// special case fully handled by frcpa
	br.ret.sptk rp
	;;
	.endp __divsf3
#endif
15063d1a8abSmrg
#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
__divdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
(p7)	break 1				// trap: integer divide by zero
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1	// e = 1 - b*y0
(p6)	fmpy.s1 f12 = f8, f10		// q0 = a*y0
	;;
(p6)	fmpy.s1 f13 = f11, f11		// e^2
(p6)	fma.s1 f12 = f11, f12, f12	// refine quotient
	;;
(p6)	fma.s1 f10 = f11, f10, f10	// refine reciprocal
(p6)	fma.s1 f11 = f13, f12, f12	// refine quotient
	;;
(p6)	fma.s1 f10 = f13, f10, f10	// refine reciprocal
(p6)	fnma.s1 f12 = f9, f11, f8	// remainder r = a - b*q
	;;
(p6)	fma.s1 f10 = f12, f10, f11	// final quotient estimate
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10	// truncate toward zero (C semantics)
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdi3
#endif
20363d1a8abSmrg
#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
// The result is computed as r = q * (-b) + a, with q = trunc(a/b).

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
__moddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.  f14 keeps the raw dividend
	// bits for the final xma.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
(p7)	break 1				// trap: integer divide by zero
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10		// q0 = a*y0
(p6)	fnma.s1 f11 = f9, f10, f1	// e = 1 - b*y0
	;;
(p6)	fma.s1 f12 = f11, f12, f12	// refine quotient
(p6)	fmpy.s1 f13 = f11, f11		// e^2
	;;
(p6)	fma.s1 f10 = f11, f10, f10	// refine reciprocal
(p6)	fma.s1 f11 = f13, f12, f12	// refine quotient
	;;
	sub in1 = r0, in1		// -b, used below for r = q * (-b) + a
(p6)	fma.s1 f10 = f13, f10, f10	// refine reciprocal
(p6)	fnma.s1 f12 = f9, f11, f8	// remainder r = a - b*q
	;;
	setf.sig f9 = in1		// f9 = -b as an integer significand
(p6)	fma.s1 f10 = f12, f10, f11	// final quotient estimate
	;;
	fcvt.fx.trunc.s1 f10 = f10	// q = trunc(a/b)
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __moddi3
#endif
26063d1a8abSmrg
#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
__udivdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f8		// unsigned integer -> FP
	fcvt.xuf.s1 f9 = f9
(p7)	break 1				// trap: integer divide by zero
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1	// e = 1 - b*y0
(p6)	fmpy.s1 f12 = f8, f10		// q0 = a*y0
	;;
(p6)	fmpy.s1 f13 = f11, f11		// e^2
(p6)	fma.s1 f12 = f11, f12, f12	// refine quotient
	;;
(p6)	fma.s1 f10 = f11, f10, f10	// refine reciprocal
(p6)	fma.s1 f11 = f13, f12, f12	// refine quotient
	;;
(p6)	fma.s1 f10 = f13, f10, f10	// refine reciprocal
(p6)	fnma.s1 f12 = f9, f11, f8	// remainder r = a - b*q
	;;
(p6)	fma.s1 f10 = f12, f10, f11	// final quotient estimate
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivdi3
#endif
31363d1a8abSmrg
#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
// The result is computed as r = q * (-b) + a, with q = trunc(a/b).

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
__umoddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.  f14 keeps the raw dividend
	// bits for the final xma.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software assist faults.
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
(p7)	break 1				// trap: integer divide by zero
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10		// q0 = a*y0
(p6)	fnma.s1 f11 = f9, f10, f1	// e = 1 - b*y0
	;;
(p6)	fma.s1 f12 = f11, f12, f12	// refine quotient
(p6)	fmpy.s1 f13 = f11, f11		// e^2
	;;
(p6)	fma.s1 f10 = f11, f10, f10	// refine reciprocal
(p6)	fma.s1 f11 = f13, f12, f12	// refine quotient
	;;
	sub in1 = r0, in1		// -b, used below for r = q * (-b) + a
(p6)	fma.s1 f10 = f13, f10, f10	// refine reciprocal
(p6)	fnma.s1 f12 = f9, f11, f8	// remainder r = a - b*q
	;;
	setf.sig f9 = in1		// f9 = -b as an integer significand
(p6)	fma.s1 f10 = f12, f10, f11	// final quotient estimate
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umoddi3
#endif
37163d1a8abSmrg
#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
__divsi3:
	.regstk 2,0,0,0
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	sxt4 in0 = in0			// sign-extend the 32-bit inputs
	sxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
(p7)	break 1				// trap: integer divide by zero
	;;
	mov r2 = 0x0ffdd		// biased exponent for the correction constant
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2		// small power-of-two correction term (2^-34)
	frcpa.s1 f10, p6 = f8, f9	// reciprocal approximation
	;;
	// Single shortened Newton-Raphson refinement; sufficient accuracy
	// for 32-bit quotients.
(p6)	fmpy.s1 f8 = f8, f10		// q0 = a*y0
(p6)	fnma.s1 f9 = f9, f10, f1	// e = 1 - b*y0
	;;
(p6)	fma.s1 f8 = f9, f8, f8		// refine quotient
(p6)	fma.s1 f9 = f9, f9, f11		// e^2 + correction
	;;
(p6)	fma.s1 f10 = f9, f8, f8		// final quotient estimate
	;;
	fcvt.fx.trunc.s1 f10 = f10	// truncate toward zero (C semantics)
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsi3
#endif
41763d1a8abSmrg
#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
// The result is computed as r = q * (-b) + a, with q = trunc(a/b).

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
__modsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd		// biased exponent for the correction constant
	sxt4 in0 = in0			// sign-extend the 32-bit inputs
	sxt4 in1 = in1
	;;
	// Use the in0/in1 aliases (== r32/r33 under .regstk 2,0,0,0),
	// consistent with the other routines in this file.
	setf.sig f13 = in0		// keep the dividend for the final xma
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	sub in1 = r0, in1		// -b, used below for r = q * (-b) + a
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2		// small power-of-two correction term (2^-34)
	frcpa.s1 f10, p6 = f8, f9	// reciprocal approximation
(p7)	break 1				// trap: integer divide by zero
	;;
(p6)	fmpy.s1 f12 = f8, f10		// q0 = a*y0
(p6)	fnma.s1 f10 = f9, f10, f1	// e = 1 - b*y0
	;;
	setf.sig f9 = in1		// f9 = -b as an integer significand
(p6)	fma.s1 f12 = f10, f12, f12	// refine quotient
(p6)	fma.s1 f10 = f10, f10, f11	// e^2 + correction
	;;
(p6)	fma.s1 f10 = f10, f12, f12	// final quotient estimate
	;;
	fcvt.fx.trunc.s1 f10 = f10	// q = trunc(a/b)
	;;
	xma.l f10 = f10, f9, f13	// r = q * (-b) + a
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __modsi3
#endif
46763d1a8abSmrg
#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
__udivsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd		// biased exponent for the correction constant
	zxt4 in0 = in0			// zero-extend the 32-bit inputs
	zxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
(p7)	break 1				// trap: integer divide by zero
	;;
	setf.exp f11 = r2		// small power-of-two correction term (2^-34)
	frcpa.s1 f10, p6 = f8, f9	// reciprocal approximation
	;;
	// Single shortened Newton-Raphson refinement; sufficient accuracy
	// for 32-bit quotients.
(p6)	fmpy.s1 f8 = f8, f10		// q0 = a*y0
(p6)	fnma.s1 f9 = f9, f10, f1	// e = 1 - b*y0
	;;
(p6)	fma.s1 f8 = f9, f8, f8		// refine quotient
(p6)	fma.s1 f9 = f9, f9, f11		// e^2 + correction
	;;
(p6)	fma.s1 f10 = f9, f8, f8		// final quotient estimate
	;;
	fcvt.fxu.trunc.s1 f10 = f10	// truncate to unsigned integer
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivsi3
#endif
51363d1a8abSmrg
#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
// The result is computed as r = q * (-b) + a, with q = trunc(a/b).

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
__umodsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd		// biased exponent for the correction constant
	zxt4 in0 = in0			// zero-extend the 32-bit inputs
	zxt4 in1 = in1
	;;
	setf.sig f13 = in0		// keep the dividend for the final xma
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	sub in1 = r0, in1		// -b, used below for r = q * (-b) + a
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2		// small power-of-two correction term (2^-34)
	frcpa.s1 f10, p6 = f8, f9	// reciprocal approximation
(p7)	break 1				// trap: integer divide by zero
	;;
(p6)	fmpy.s1 f12 = f8, f10		// q0 = a*y0
(p6)	fnma.s1 f10 = f9, f10, f1	// e = 1 - b*y0
	;;
	setf.sig f9 = in1		// f9 = -b as an integer significand
(p6)	fma.s1 f12 = f10, f12, f12	// refine quotient
(p6)	fma.s1 f10 = f10, f10, f11	// e^2 + correction
	;;
(p6)	fma.s1 f10 = f10, f12, f12	// final quotient estimate
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	xma.l f10 = f10, f9, f13	// r = q * (-b) + a
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umodsi3
#endif
56363d1a8abSmrg
#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
//
// Save-area layout written here (and read back by the nonlocal-goto /
// restore routines below):
//   [ 0] stack pointer   [ 8] ar.bsp   [16] ar.rnat   [24] ar.pfs

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	{ .mmf
	  alloc r18 = ar.pfs, 2, 0, 0, 0
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs			// flush dirty regs so ar.bsp == ar.bspstore
	  st8 [in0] = in1, 24		// save_area[0] = stack pointer; in0 += 24
	  and r19 = 0x1c, r19		// clear RSE mode bits (enforced lazy)
	  ;;
	}
	{ .mmi
	  st8 [in0] = r18, -16		// save_area[24] = ar.pfs; in0 -> save_area+8
	  mov ar.rsc = r19		// lazy mode so ar.rnat can be read safely
	  or r19 = 0x3, r19		// mode value (eager) to set on exit
	  ;;
	}
	{ .mmi
	  mov r16 = ar.bsp
	  mov r17 = ar.rnat
	  adds r2 = 8, in0		// r2 -> save_area+16
	  ;;
	}
	{ .mmi
	  st8 [in0] = r16		// save_area[8] = ar.bsp
	  st8 [r2] = r17		// save_area[16] = ar.rnat
	}
	{ .mib
	  mov ar.rsc = r19		// re-enable eager RSE mode
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_save_stack_nonlocal
#endif
61263d1a8abSmrg
#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);
//
// Restores the machine state stored by __ia64_save_stack_nonlocal
// (layout: [0] sp, [8] ar.bsp, [16] ar.rnat, [24] ar.pfs) and
// "returns" to target_label with the static chain in r15.

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	{ .mmi
	  alloc r20 = ar.pfs, 3, 0, 0, 0
	  ld8 r12 = [in1], 8		// restore saved stack pointer
	  mov.ret.sptk rp = in0, .L0	// return address = target label
	  ;;
	}
	{ .mmf
	  ld8 r16 = [in1], 8		// saved ar.bsp
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs			// make ar.bsp == ar.bspstore before rewriting it
	  ld8 r17 = [in1], 8		// saved ar.rnat
	  and r19 = 0x1c, r19		// clear RSE mode bits (enforced lazy)
	  ;;
	}
	{ .mmi
	  ld8 r18 = [in1]		// saved ar.pfs
	  mov ar.rsc = r19		// lazy mode while ar.bspstore/ar.rnat change
	  or r19 = 0x3, r19		// mode value (eager) to set on exit
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16		// restore backing-store pointer
	  ;;
	  mov ar.rnat = r17		// restore NaT collection bits
	  ;;
	}
	{ .mmi
	  loadrs			// discard frames above the restored bspstore
	  invala			// invalidate ALAT entries
	  mov r15 = in2			// static chain register
	  ;;
	}
.L0:	{ .mib
	  mov ar.rsc = r19		// re-enable eager RSE mode
	  mov ar.pfs = r18
	  br.ret.sptk.few rp		// branch to the target label
	  ;;
	}
	.endp __ia64_nonlocal_goto
#endif
66563d1a8abSmrg
#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above, but returns to the
// caller normally instead of branching to a saved label.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)
//
// Save-area layout (as written by __ia64_save_stack_nonlocal):
//   [0] sp   [8] ar.bsp   [16] ar.rnat   [24] ar.pfs

	.text
	.align 16
	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
	{ .mmf
	  alloc r20 = ar.pfs, 4, 0, 0, 0
	  ld8 r12 = [in0], 8		// restore saved stack pointer
	  ;;
	}
	{ .mmb
	  ld8 r16=[in0], 8		// saved ar.bsp
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs			// make ar.bsp == ar.bspstore before rewriting it
	  ld8 r17 = [in0], 8		// saved ar.rnat
	  and r19 = 0x1c, r19		// clear RSE mode bits (enforced lazy)
	  ;;
	}
	{ .mmf
	  ld8 r18 = [in0]		// saved ar.pfs
	  mov ar.rsc = r19		// lazy mode while ar.bspstore/ar.rnat change
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16		// restore backing-store pointer
	  ;;
	  mov ar.rnat = r17		// restore NaT collection bits
	  or r19 = 0x3, r19		// mode value (eager) to set on exit
	  ;;
	}
	{ .mmf
	  loadrs			// discard frames above the restored bspstore
	  invala			// invalidate ALAT entries
	  ;;
	}
	// (No label here: unlike __ia64_nonlocal_goto, nothing branches
	// into this final bundle.)
	{ .mib
	  mov ar.rsc = r19		// re-enable eager RSE mode
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_restore_stack_nonlocal
#endif
71863d1a8abSmrg
#ifdef L__trampoline
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//		+-------------------+ >
//	TRAMP:	| __ia64_trampoline | |
//		+-------------------+  > fake function descriptor
//		| TRAMP+16          | |
//		+-------------------+ >
//		| target descriptor |
//		+-------------------+
//		| static link	    |
//		+-------------------+
//
// On entry r1 holds the fake gp, i.e. TRAMP+16, which points at the
// target descriptor address followed by the static link.

	.text
	.align 16
	.global __ia64_trampoline
	.proc __ia64_trampoline
__ia64_trampoline:
	{ .mmi
	  ld8 r2 = [r1], 8		// r2 = address of target descriptor
	  ;;
	  ld8 r15 = [r1]		// r15 = static link (static-chain register)
	}
	{ .mmi
	  ld8 r3 = [r2], 8		// target entry point
	  ;;
	  ld8 r1 = [r2]			// target gp
	  mov b6 = r3
	}
	{ .bbb
	  br.sptk.many b6		// tail-jump to the nested function
	  ;;
	}
	.endp __ia64_trampoline
#endif
75863d1a8abSmrg
75963d1a8abSmrg#ifdef SHARED
76063d1a8abSmrg// Thunks for backward compatibility.
#ifdef L_fixtfdi
	// Backward-compatibility alias: forward __fixtfti to __fixxfti.
	.text
	.align 16
	.global __fixtfti
	.proc __fixtfti
__fixtfti:
	{ .bbb
	  br.sptk.many __fixxfti
	  ;;
	}
	.endp __fixtfti
#endif
#ifdef L_fixunstfdi
	// Backward-compatibility alias: forward __fixunstfti to __fixunsxfti.
	// Select .text explicitly: each L_* block is assembled stand-alone,
	// so it cannot rely on a .text directive from a sibling block.
	.text
	.align 16
	.global __fixunstfti
	.proc __fixunstfti
__fixunstfti:
	{ .bbb
	  br.sptk.many __fixunsxfti
	  ;;
	}
	.endp __fixunstfti
#endif
#ifdef L_floatditf
	// Backward-compatibility alias: forward __floattitf to __floattixf.
	// Select .text explicitly: each L_* block is assembled stand-alone,
	// so it cannot rely on a .text directive from a sibling block.
	.text
	.align 16
	.global __floattitf
	.proc __floattitf
__floattitf:
	{ .bbb
	  br.sptk.many __floattixf
	  ;;
	}
	.endp __floattitf
#endif
79563d1a8abSmrg#endif
796