/*
 * strcmp for ARMv7
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */
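
/* The core trick, sketched below in illustrative C (the snippet is an
   explanatory aid, not part of the build): UADD8 of a data word with
   0xffffffff sets the per-byte GE flags only for lanes whose byte is
   non-zero, and SEL then builds a "syndrome" that holds 0xff in any lane
   containing a NUL and the XOR difference elsewhere:

	uint32_t strcmp_syndrome (uint32_t data1, uint32_t data2)
	{
	  uint32_t diff = data1 ^ data2;
	  uint32_t synd = 0;
	  for (int i = 0; i < 4; i++)
	    {
	      uint32_t b1 = (data1 >> (8 * i)) & 0xff;
	      uint32_t d = (diff >> (8 * i)) & 0xff;
	      synd |= (b1 == 0 ? 0xffu : d) << (8 * i);
	    }
	  return synd;
	}

   A non-zero syndrome therefore means a difference or the end of the
   string, and the position of its first marked byte tells us where.  */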

#include "../asmdefs.h"

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (eg because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

#define STRCMP_NO_PRECHECK	0

	/* This version uses Thumb-2 code.  */
	.thumb
	.syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO lsl
#define S2LOEQ lsleq
#define S2HI lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not  __ARM_BIG_ENDIAN */
#define S2LO lsr
#define S2LOEQ lsreq
#define S2HI lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not  __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
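	/* Worked example (illustrative values, not taken from the source):
	   if the highest syndrome bit is bit 18 of the word, CLZ returns 13
	   and the shifts below bring that bit to bit 31; the two 8-bit
	   fields extracted by LSR #24 then first differ in their top bit,
	   so the subtraction has the sign of the first differing byte.  A
	   NUL in data1 puts the syndrome bit at bit 7 of that byte, so the
	   extracted field is that whole (zero) byte and the result is minus
	   the corresponding byte of the other string.  */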
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1, lsr #24
	bx	lr
#else
	/* To use the big-endian trick we'd have to reverse all three words;
	   that's slower than this approach.  */
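	/* REV moves the syndrome's lowest-addressed byte to the top, so CLZ
	   finds the first marked byte in string order; BIC #7 rounds the bit
	   index down to a byte boundary, and shifting the (unreversed) data
	   words right by that amount drops the wanted byte into bits 7:0,
	   where the AND #255 below extracts it.  */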
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1

	bx	lr
#endif
	.endm

	.text
	.p2align	5
L(strcmp_start_addr):
#if STRCMP_NO_PRECHECK == 0
L(fastpath_exit):
	sub	r0, r2, r3
	bx	lr
	nop
#endif
ENTRY_ALIGN (__strcmp_arm, 0)
#if STRCMP_NO_PRECHECK == 0
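	/* Pre-check the first byte of each string: CMP r2, #1 clears the
	   carry only when *src1 is NUL, and the conditional CMPCS then
	   compares the two leading bytes.  Either a terminator in src1 or a
	   mismatch takes the early exit, which returns *src1 - *src2.  */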
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	L(fastpath_exit)
#endif
	strd	r4, r5, [sp, #-16]!
	.cfi_def_cfa_offset 16
	.cfi_offset 4, -16
	.cfi_offset 5, -12
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_offset 6, -8
	.cfi_offset 7, -4
	mvn	const_m1, #0
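	/* tmp1 holds src1 | src2; shifting it left by 29 keeps only the low
	   three bits, so r2 is zero iff both pointers are already 8-byte
	   aligned and we can enter the ldrd loop directly.  */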
	lsl	r2, tmp1, #29
	cbz	r2, L(loop_aligned8)

L(not_aligned):
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	L(misaligned8)

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
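	/* For example (illustrative offsets): with src1 = p + 5 and
	   src2 = q + 5, both pointers are rounded down by five bytes and
	   the five leading bytes of each loaded double-word are forced to
	   0xff by the MOV/ORN sequence below, so they compare equal and
	   cannot be mistaken for a NUL terminator.  */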
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	L(start_realigned8)
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	L(start_realigned8)

	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
L(loop_aligned8):
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
L(start_realigned8):
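	/* A non-zero syndrome in either half means that half contains a
	   difference or the end of the string; see the note at the top of
	   the file for how UADD8/SEL construct the syndrome.  */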
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, L(diff_in_a)
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, L(diff_in_b)

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	beq	L(loop_aligned8)

L(diff_found):
	cbnz	syndrome_a, L(diff_in_a)

L(diff_in_b):
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

L(diff_in_a):
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
L(misaligned8):
	tst	tmp1, #3
	bne	L(misaligned4)
	ands	tmp1, src1, #3
	bne	L(mutual_align4)

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
L(loop_aligned4):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned4):
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, L(aligned4_done)
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	L(loop_aligned4)

L(aligned4_done):
	strcmp_epilogue_aligned syndrome, data1, data2, 0

L(mutual_align4):
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	L(start_realigned4)

L(misaligned4):
	ands	tmp1, src1, #3
	beq	L(src1_aligned)
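	/* src1 is 1, 2 or 3 bytes past a word boundary, with the offset in
	   tmp1.  LSLS #31 moves bit 0 into the result and bit 1 into the
	   carry, so the flags dispatch on the offset: Z=1 selects offset 2,
	   C=1 selects offset 3, and the fall-through handles offset 1.
	   src2 is stepped back by the same offset so that, once the leading
	   bytes have been checked, both pointers advance in step.  */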
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	L(aligned_m2)
	bcs	L(aligned_m1)

#if STRCMP_NO_PRECHECK == 1
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m1):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	add	src2, src2, #4
	cbnz	data2, L(src1_aligned)
#else  /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbnz	data2, L(aligned_m1)
#endif

L(misaligned_exit):
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	bx	lr

#if STRCMP_NO_PRECHECK == 0
L(aligned_m1):
	add	src2, src2, #4
#endif
L(src1_aligned):
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
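	/* src2 is rounded down to a word boundary and each loop iteration
	   below stitches bytes from two consecutive src2 words together to
	   compare against one src1 word; the overlapN labels are named for
	   how many bytes of each aligned src2 word overlap the current src1
	   word.  */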
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	L(overlap1)		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	L(overlap2)		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
L(overlap3):
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap3)
4:
	S2LO	data2, data2, #8
	b	L(strcmp_tail)

5:
	bics	syndrome, syndrome, #MSB
	bne	L(strcmp_done_equal)

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	neg	result, result
	bx	lr

6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap2):
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap2)
4:
	S2LO	data2, data2, #16
	b	L(strcmp_tail)
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	L(strcmp_done_equal)

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	L(strcmp_tail)

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap1):
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap1)
4:
	S2LO	data2, data2, #24
	b	L(strcmp_tail)
5:
	tst	syndrome, #LSB
	bne	L(strcmp_done_equal)
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	L(strcmp_tail)

L(strcmp_done_equal):
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	bx	lr

L(strcmp_tail):
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
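	/* From here the extraction matches the big-endian half of
	   strcmp_epilogue_aligned above: CLZ locates the first marked bit,
	   the shifts align it at the top of both data words, and the top
	   bytes are subtracted to form the result.  */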
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	sub	result, result, data2, lsr #24
	bx	lr

END (__strcmp_arm)

#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1  */
