//===-- udivsi3.S - 32-bit unsigned integer divide ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the __udivsi3 (32-bit unsigned integer divide)
// function for the ARM 32-bit architecture.
//
//===----------------------------------------------------------------------===//

#include "../assembly.h"

	// Unified (UAL) syntax, so the same source can be assembled as ARM or
	// Thumb code.
	.syntax unified
	.text

	// Macro from assembly.h; presumably selects the instruction set state
	// (ARM vs. Thumb) for the code that follows -- confirm in assembly.h.
DEFINE_CODE_STATE

	// Align the function entry point to a 4-byte boundary.
	.p2align 2
	// Macro from assembly.h; by its name and arguments it publishes the
	// AEABI name __aeabi_uidiv as an alias of __udivsi3 -- confirm there.
DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_uidiv, __udivsi3)

@ unsigned int __udivsi3(unsigned int dividend, unsigned int divisor)
@   Calculate and return the quotient of the (unsigned) division.

DEFINE_COMPILERRT_FUNCTION(__udivsi3)
	// In:  r0 = dividend (numerator), r1 = divisor (denominator).
	// Out: r0 = quotient of the unsigned division.
	// A zero divisor sets r0 to 0; on __ARM_EABI__ targets __aeabi_idiv0 is
	// then invoked, otherwise the function simply returns 0.
	// The software path uses r2, r3 and ip as scratch and changes the flags.
#if __ARM_ARCH_EXT_IDIV__
	// Hardware divide is available; only a zero divisor needs special care.
	tst     r1, r1
	beq     LOCAL_LABEL(divby0)
	udiv	r0, r0, r1
	bx  	lr

LOCAL_LABEL(divby0):
	// Use movs for compatibility with v8-m.base.
	movs    r0, #0
#  ifdef __ARM_EABI__
	// Tail-branch to the handler; lr still holds our caller's return address.
	b       __aeabi_idiv0
#  else
	JMP(lr)
#  endif

#else // ! __ARM_ARCH_EXT_IDIV__
	// One compare classifies the divisor: below 1 (i.e. zero) goes to the
	// divide-by-zero handler, exactly 1 returns the dividend unchanged.
	cmp	r1, #1
	bcc	LOCAL_LABEL(divby0)
#if defined(USE_THUMB_1)
	bne LOCAL_LABEL(num_neq_denom)
	JMP(lr)
LOCAL_LABEL(num_neq_denom):
#else
	IT(eq)
	JMPc(lr, eq)
#endif
	// Dividend smaller than divisor: the quotient is 0.
	cmp	r0, r1
#if defined(USE_THUMB_1)
	bhs LOCAL_LABEL(num_ge_denom)
	movs r0, #0
	JMP(lr)
LOCAL_LABEL(num_ge_denom):
#else
	ITT(cc)
	movcc	r0, #0
	JMPc(lr, cc)
#endif

	// Implement division using binary long division algorithm.
	//
	// r0 is the numerator, r1 the denominator. From here on r0 >= r1 > 1.
	//
	// The code before the computed jump picks the starting shift I. With CLZ
	// this is I = clz(r1) - clz(r0), so that r0 and (r1 << I) have the
	// highest bit set in the same position; without CLZ a binary search over
	// shifts of 16, 8, 4, 2 and 1 selects the starting block.
	// The jump target is div0block - (block size) * I, which depends on the
	// fixed instruction size of block: 12 bytes in ARM mode, 14 bytes in
	// Thumb-2 mode (CLZ path) and 10 bytes in Thumb-1 mode (BLOCK_SIZE).
	//
	// block(shift) implements the test-and-update-quotient core.
	// It assumes (r1 << shift) can be computed without overflow and
	// that r0 < 2 * (r1 << shift). The quotient is stored in r3.

#  if defined(__ARM_FEATURE_CLZ)
	clz	ip, r0
	clz	r3, r1
	// r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3.
	sub	r3, r3, ip
	// r3 = I. Build ip = div0block - I * (block size) from shifted
	// subtractions: 4*I + 8*I = 12*I, plus 2*I more in Thumb-2 for 14*I.
#    if defined(USE_THUMB_2)
	// The +1 sets bit 0 so that "bx ip" stays in Thumb state.
	adr	ip, LOCAL_LABEL(div0block) + 1
	sub	ip, ip, r3, lsl #1
#    else
	adr	ip, LOCAL_LABEL(div0block)
#    endif
	sub	ip, ip, r3, lsl #2
	sub	ip, ip, r3, lsl #3
	// Clear the quotient accumulator before entering the block chain.
	mov	r3, #0
	bx	ip
#  else // No CLZ Feature
#    if defined(USE_THUMB_2)
#    error THUMB mode requires CLZ or UDIV
#    endif
	// Size in bytes of one expanded block(shift).
#    if defined(USE_THUMB_1)
#      define BLOCK_SIZE 10
#    else
#      define BLOCK_SIZE 12
#    endif

	// r2 is a working copy of the numerator that is shifted right as the
	// search proceeds; r3 holds each trial value.
	mov	r2, r0
#    if defined(USE_THUMB_1)
	// Thumb-1 keeps the jump target in the low register r0 (adjusted below
	// with subs #imm) and parks the numerator in ip until the jump.
	// The added 1 sets bit 0 so the final branch stays in Thumb state.
	mov ip, r0
	adr r0, LOCAL_LABEL(div0block)
	adds r0, #1
#    else
	adr	ip, LOCAL_LABEL(div0block)
#    endif
	// If (r2 >> 16) >= r1: keep the shifted value, start 16 blocks earlier.
	lsrs	r3, r2, #16
	cmp	r3, r1
#    if defined(USE_THUMB_1)
	blo LOCAL_LABEL(skip_16)
	movs r2, r3
	subs r0, r0, #(16 * BLOCK_SIZE)
LOCAL_LABEL(skip_16):
#    else
	movhs	r2, r3
	subhs	ip, ip, #(16 * BLOCK_SIZE)
#    endif

	// Same test for a further shift of 8.
	lsrs	r3, r2, #8
	cmp	r3, r1
#    if defined(USE_THUMB_1)
	blo LOCAL_LABEL(skip_8)
	movs r2, r3
	subs r0, r0, #(8 * BLOCK_SIZE)
LOCAL_LABEL(skip_8):
#    else
	movhs	r2, r3
	subhs	ip, ip, #(8 * BLOCK_SIZE)
#    endif

	// Same test for a further shift of 4.
	lsrs	r3, r2, #4
	cmp	r3, r1
#    if defined(USE_THUMB_1)
	blo LOCAL_LABEL(skip_4)
	movs r2, r3
	subs r0, r0, #(4 * BLOCK_SIZE)
LOCAL_LABEL(skip_4):
#    else
	movhs	r2, r3
	subhs	ip, ip, #(4 * BLOCK_SIZE)
#    endif

	// Same test for a further shift of 2.
	lsrs	r3, r2, #2
	cmp	r3, r1
#    if defined(USE_THUMB_1)
	blo LOCAL_LABEL(skip_2)
	movs r2, r3
	subs r0, r0, #(2 * BLOCK_SIZE)
LOCAL_LABEL(skip_2):
#    else
	movhs	r2, r3
	subhs	ip, ip, #(2 * BLOCK_SIZE)
#    endif

	// Last block, no need to update r2 or r3.
#    if defined(USE_THUMB_1)
	lsrs r3, r2, #1
	cmp r3, r1
	blo LOCAL_LABEL(skip_1)
	subs r0, r0, #(1 * BLOCK_SIZE)
LOCAL_LABEL(skip_1):
	// Move the jump target to r2, restore the numerator from ip into r0 and
	// clear the quotient accumulator before jumping into the block chain.
	movs r2, r0
	mov r0, ip
	movs r3, #0
	JMP (r2)

#    else
	// Step back one more block when r1 <= (r2 >> 1).
	cmp	r1, r2, lsr #1
	subls	ip, ip, #(1 * BLOCK_SIZE)

	// Clear the quotient accumulator.
	movs	r3, #0

	JMP(ip)
#    endif
#  endif // __ARM_FEATURE_CLZ


	// IMM lets block() emit the '#' immediate prefix: inside a function-like
	// macro body a literal '#' would be taken as the stringification operator.
#define	IMM	#
	// due to the range limit of branch in Thumb1, we have to place the
	// block closer
LOCAL_LABEL(divby0):
	// Division by zero: the result reported to the caller is 0.
	movs	r0, #0
#      if defined(__ARM_EABI__)
	// Real call rather than a tail branch, so lr is saved and the return
	// goes through pop {pc}; r7 is pushed and popped alongside it.
	push {r7, lr}
	bl	__aeabi_idiv0 // due to relocation limit, can't use b.
	pop  {r7, pc}
#      else
	JMP(lr)
#      endif


#if defined(USE_THUMB_1)
	// Thumb-1 block: r2 = r1 << shift; if r0 >= r2, subtract it from r0.
	// The final adcs shifts the resulting carry into r3 as the next quotient
	// bit. Five instructions, matching BLOCK_SIZE 10.
#define block(shift)                                                           \
	lsls r2, r1, IMM shift;                                                      \
	cmp r0, r2;                                                                  \
	blo LOCAL_LABEL(block_skip_##shift);                                         \
	subs r0, r0, r2;                                                             \
	LOCAL_LABEL(block_skip_##shift) :;                                           \
	adcs r3, r3 // same as ((r3 << 1) | Carry). Carry is set if r0 >= r2.

	// TODO: if current location counter is not word aligned, we don't
	// need the .p2align and nop
	// Label div0block must be word-aligned. First align block 31
	.p2align 2
	nop // Padding to align div0block as 31 blocks = 310 bytes

#else
	// ARM / Thumb-2 block: if r0 >= (r1 << shift), set quotient bit "shift"
	// in r3 and subtract (r1 << shift) from r0. IT and WIDE come from
	// assembly.h; presumably they keep the block at a fixed size in both
	// instruction sets -- confirm there.
#define block(shift)                                                           \
	cmp	r0, r1, lsl IMM shift;                                         \
	ITT(hs);                                                               \
	WIDE(addhs)	r3, r3, IMM (1 << shift);                              \
	WIDE(subhs)	r0, r0, r1, lsl IMM shift
#endif

	// Fully unrolled long division. The computed jump above enters this
	// chain at block(I) and execution falls through down to block(0).
	block(31)
	block(30)
	block(29)
	block(28)
	block(27)
	block(26)
	block(25)
	block(24)
	block(23)
	block(22)
	block(21)
	block(20)
	block(19)
	block(18)
	block(17)
	block(16)
	block(15)
	block(14)
	block(13)
	block(12)
	block(11)
	block(10)
	block(9)
	block(8)
	block(7)
	block(6)
	block(5)
	block(4)
	block(3)
	block(2)
	block(1)
LOCAL_LABEL(div0block):
	block(0)

	// Return the accumulated quotient.
	mov	r0, r3
	JMP(lr)
#endif // __ARM_ARCH_EXT_IDIV__

END_COMPILERRT_FUNCTION(__udivsi3)

// Macro from assembly.h; by its name it emits whatever directive marks the
// stack as non-executable for this object file -- confirm in assembly.h.
NO_EXEC_STACK_DIRECTIVE
