1/* libgcc functions for Blackfin.
2   Copyright (C) 2005-2019 Free Software Foundation, Inc.
3   Contributed by Analog Devices.
4
5This file is part of GCC.
6
7GCC is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 3, or (at your option)
10any later version.
11
12GCC is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15GNU General Public License for more details.
16
17Under Section 7 of GPL version 3, you are granted additional
18permissions described in the GCC Runtime Library Exception, version
193.1, as published by the Free Software Foundation.
20
21You should have received a copy of the GNU General Public License and
22a copy of the GCC Runtime Library Exception along with this program;
23see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
24<http://www.gnu.org/licenses/>.  */
25
#ifdef L_divsi3
/* int __divsi3 (int dividend, int divisor)
   Signed 32-bit division: take absolute values, divide with
   ___udivsi3, then negate the quotient if the operand signs differed.
   In:  R0 = dividend, R1 = divisor.
   Out: R0 = quotient.
   NOTE(review): as usual for libgcc, INT_MIN / -1 overflows and the
   result is undefined; division by zero is not checked either.  */
.text
.align 2
.global ___divsi3;
.type ___divsi3, STT_FUNC;

___divsi3:
        [--SP]= RETS;			/* save return address around the CALL below */
	[--SP] = R7;			/* R7 is used to hold the result sign across the CALL */

	/* R0 = |R0|; R7 = 1 iff the dividend was negative.  */
	R2 = -R0;
        CC = R0 < 0;
	IF CC R0 = R2;
	R7 = CC;

	/* R1 = |R1|; R7 = sign(dividend) ^ sign(divisor).  */
	R2 = -R1;
        CC = R1 < 0;
	IF CC R1 = R2;
	R2 = CC;
	R7 = R7 ^ R2;

        CALL ___udivsi3;		/* R0 = |dividend| / |divisor| */

	/* Negate the quotient if exactly one operand was negative.  */
	CC = R7;
	R1 = -R0;
	IF CC R0 = R1;

	R7 = [SP++];
        RETS = [SP++];
        RTS;
#endif
57
#ifdef L_modsi3
/* int __modsi3 (int dividend, int divisor)
   Signed remainder, computed as dividend - (dividend / divisor) * divisor,
   so the result has the sign of the dividend (C truncating semantics).
   In:  R0 = dividend, R1 = divisor.
   Out: R0 = remainder.  */
.align 2
.global ___modsi3;
.type ___modsi3, STT_FUNC;

___modsi3:
	[--SP] = RETS;			/* save return address around the CALL */
	[--SP] = R0;			/* preserve original dividend */
	[--SP] = R1;			/* preserve original divisor */
	CALL ___divsi3;			/* R0 = dividend / divisor */
	R2 = [SP++];			/* R2 = divisor */
	R1 = [SP++];			/* R1 = dividend */
	R2 *= R0;			/* R2 = quotient * divisor */
	R0 = R1 - R2;			/* remainder = dividend - quotient * divisor */
	RETS = [SP++];
	RTS;
#endif
75
#ifdef L_udivsi3
/* unsigned __udivsi3 (unsigned dividend, unsigned divisor)
   Unsigned 32-bit division by 32 steps of shift-and-subtract, driven
   by the hardware loop (LSETUP/LC0) and using ROT-through-CC to build
   the quotient.  CC is set when the partial remainder is still smaller
   than the divisor, so the quotient bits enter R0 *inverted* and are
   fixed up by the final complement.
   In:  R0 = dividend, R1 = divisor.
   Out: R0 = quotient, R3 = remainder (___umodsi3 relies on R3).
   NOTE(review): no divide-by-zero check, as is conventional in libgcc.  */
.align 2
.global ___udivsi3;
.type ___udivsi3, STT_FUNC;

___udivsi3:
        P0 = 32;
        LSETUP (0f, 1f) LC0 = P0;	/* hardware loop: 32 iterations, no branch overhead */
	/* upper half of dividend */
        R3 = 0;
0:
	/* The first time round in the loop we shift in garbage, but since we
	   perform 33 shifts, it doesn't matter.  */
	R0 = ROT R0 BY 1;		/* dividend MSB -> CC; previous CC (quotient bit) -> R0 LSB */
	R3 = ROT R3 BY 1;		/* that MSB becomes the partial remainder's new LSB */
	R2 = R3 - R1;			/* tentative subtraction of the divisor */
        CC = R3 < R1 (IU);		/* CC = 1 when the subtraction would underflow */
1:
	/* Last instruction of the loop.  */
	IF ! CC R3 = R2;		/* commit the subtraction only when it fits */

	/* Shift in the last bit.  */
	R0 = ROT R0 BY 1;
	/* R0 is the result, R3 contains the remainder.  */
	R0 = ~ R0;			/* quotient bits were shifted in inverted; flip them */
        RTS;
#endif
103
#ifdef L_umodsi3
/* unsigned __umodsi3 (unsigned dividend, unsigned divisor)
   Unsigned remainder.  ___udivsi3 leaves the remainder in R3 as a
   side effect, so this is just a call plus a register move.
   In:  R0 = dividend, R1 = divisor.
   Out: R0 = remainder.  */
.align 2
.global ___umodsi3;
.type ___umodsi3, STT_FUNC;

___umodsi3:
	[--SP] = RETS;			/* save return address around the CALL */
	CALL ___udivsi3;
	R0 = R3;			/* remainder is returned in R3 by ___udivsi3 */
	RETS = [SP++];
	RTS;
#endif
116
#ifdef L_umulsi3_highpart
/* unsigned __umulsi3_highpart (unsigned a, unsigned b)
   Return the high 32 bits of the unsigned 64-bit product a * b,
   built from four 16x16 partial products in FU (full unsigned) mode.
   In:  R0 = a, R1 = b.
   Out: R0 = (uint64_t)a * b >> 32.  */
.align 2
.global ___umulsi3_highpart;
.type ___umulsi3_highpart, STT_FUNC;

___umulsi3_highpart:
	A1 = R1.L * R0.L (FU);		/* low x low: only its carry into bit 16 matters */
	A1 = A1 >> 16;			/* keep just that carry contribution */
	A0 = R1.H * R0.H, A1 += R1.L * R0.H (FU);	/* high x high; add first cross product */
	A1 += R0.L * R1.H (FU);		/* add second cross product */
	A1 = A1 >> 16;			/* carries out of the middle 32 bits of the product */
	A0 += A1;			/* fold carries into the high word */
	R0 = A0 (FU);			/* extract; the high word fits in 32 bits, no saturation */
	RTS;
#endif
132
#ifdef L_smulsi3_highpart
/* int __smulsi3_highpart (int a, int b)
   Return the high 32 bits of the signed 64-bit product a * b.
   The low x low partial product is unsigned (FU); the partial products
   involving a high half use mixed signed-by-unsigned mode (IS,M), and
   the carry propagation out of the middle bits is an arithmetic shift.
   In:  R0 = a, R1 = b.
   Out: R0 = (int64_t)a * b >> 32.  */
.align 2
.global ___smulsi3_highpart;
.type ___smulsi3_highpart, STT_FUNC;

___smulsi3_highpart:
	A1 = R1.L * R0.L (FU);		/* low x low, unsigned: only its carry matters */
	A1 = A1 >> 16;
	A0 = R0.H * R1.H, A1 += R0.H * R1.L (IS,M);	/* signed high x high; mixed-mode cross product */
	A1 += R1.H * R0.L (IS,M);	/* second mixed-mode cross product */
	A1 = A1 >>> 16;			/* arithmetic shift: cross products are signed */
	R0 = (A0 += A1);		/* fold carries and extract the high word */
	RTS;
#endif
147
#ifdef L_muldi3
.align 2
.global ___muldi3;
.type ___muldi3, STT_FUNC;

/*
	   R1:R0 * R3:R2
	 = R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l
[X]	 = (R1.h * R3.h) * 2^96
[X]	   + (R1.h * R3.l + R1.l * R3.h) * 2^80
[X]	   + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64
[T1]	   + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48
[T2]	   + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32
[T3]	   + (R0.l * R2.h + R2.l * R0.h) * 2^16
[T4]	   + (R0.l * R2.l)

	We can discard the first three lines marked "X" since we produce
	only a 64 bit result.  So, we need ten 16-bit multiplies.

	Individual mul-acc results:
[E1]	 =  R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h
[E2]	 =  R1.l * R2.l + R3.l * R0.l + R0.h * R2.h
[E3]	 =  R0.l * R2.h + R2.l * R0.h
[E4]	 =  R0.l * R2.l

	We also need to add high parts from lower-level results to higher ones:
	E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4

	One interesting property is that all parts of the result that depend
	on the sign of the multiplication are discarded.  Those would be the
	multiplications involving R1.h and R3.h, but only the top 16 bit of
	the 32 bit result depend on the sign, and since R1.h and R3.h only
	occur in E1, the top half of these results is cut off.
	So, we can just use FU mode for all of the 16-bit multiplies, and
	ignore questions of when to use mixed mode.  */

/* long long __muldi3 (long long a, long long b)
   64-bit multiply returning the low 64 bits of the product.
   In:  a in R1:R0 (high:low); b's low word in R2 and its high word
	on the stack, loaded into R3 from [SP + 12] below.
   Out: result in R1:R0 (high:low).
   R4 is call-preserved, so it is spilled to the caller's slot at [SP]
   and restored before returning.  */
___muldi3:
	/* [SP] technically is part of the caller's frame, but we can
	   use it as scratch space.  */
	A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12];	/* E1 */
	A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4;		/* E1 */
	A0 += A1;							/* E1 */
	R4 = A0.w;							/* R4 = E1 (only its low 16 bits survive) */
	A0 = R0.l * R3.l (FU);						/* E2 */
	A0 += R2.l * R1.l (FU);						/* E2 */

	A1 = R2.L * R0.L (FU);						/* E4 */
	R3 = A1.w;							/* R3 = E4; its low half is the result's low half */
	A1 = A1 >> 16;							/* E3c */
	A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU);			/* E2, E3c */
	A1 += R0.L * R2.H (FU);						/* E3c */
	R0 = A1.w;							/* R0 = E3c */
	A1 = A1 >> 16;							/* E2c */
	A0 += A1;							/* E2c */
	R1 = A0.w;							/* R1 = E2c */

	/* low(result) = low(E3c):low(E4) */
	R0 = PACK (R0.l, R3.l);
	/* high(result) = E2c + (E1 << 16) */
	R1.h = R1.h + R4.l (NS) || R4 = [SP];				/* also restore R4 */
	RTS;

.size ___muldi3, .-___muldi3
#endif
212