/*
 * Copyright (c) 2012-2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

	/* Implementation of strcmp for ARMv7 when DSP instructions are
	   available.  Use ldrd to support wider loads, provided the data
	   is sufficiently aligned.  Use saturating arithmetic to optimize
	   the compares.  */
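
	/* How the NUL/difference detection below works: UADD8 performs
	   four parallel byte additions and sets the four APSR.GE flags to
	   the per-byte carry-outs.  Adding 0xff to a byte carries out
	   exactly when that byte is non-zero, so with const_m1 == -1 the
	   GE flags record which bytes of the data word are non-NUL.  SEL
	   then builds a "syndrome" word, taking each byte of data1 ^ data2
	   where the corresponding GE flag is set and 0xff where it is
	   clear.  The syndrome is zero only if all four byte pairs match
	   and none of them is NUL; otherwise its first set bit marks the
	   first mismatch or end of string.  For example, with
	   data1 == 0x00414243 the GE flags are 0b0111, so byte 3 of the
	   syndrome is forced to 0xff even when data2 agrees.  Roughly, as
	   an illustrative C sketch:
	       ge[i]       = (data1_byte[i] != 0);
	       syndrome[i] = ge[i] ? (data1_byte[i] ^ data2_byte[i]) : 0xff;  */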

	/* Build Options:
	   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
	   byte in the string.  If comparing completely random strings
	   the pre-check will save time, since there is a very high
	   probability of a mismatch in the first character: we save
	   significant overhead if this is the common case.  However,
	   if strings are likely to be identical (e.g. because we're
	   verifying a hit in a hash table), then this check is largely
	   redundant.  */

	/* This version uses Thumb-2 code.  */
	.thumb
	.syntax unified

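/* S2LO ("shift towards the lower-addressed string bytes"), S2HI, MSB,
   LSB and the BYTEn_OFFSET constants used below, together with def_fn,
   are not defined in this fragment; in the newlib tree they are
   supplied by the wrapper that includes this file.  A sketch of the
   assumed, endian-dependent definitions is reproduced here so the
   fragment stands alone.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

#ifdef __ARM_BIG_ENDIAN
#define S2LO		lsl
#define S2HI		lsr
#define MSB		0x000000ff
#define LSB		0xff000000
#define BYTE0_OFFSET	24
#define BYTE1_OFFSET	16
#define BYTE2_OFFSET	8
#define BYTE3_OFFSET	0
#else /* not  __ARM_BIG_ENDIAN */
#define S2LO		lsr
#define S2HI		lsl
#define MSB		0xff000000
#define LSB		0x000000ff
#define BYTE0_OFFSET	0
#define BYTE1_OFFSET	8
#define BYTE2_OFFSET	16
#define BYTE3_OFFSET	24
#endif /* not  __ARM_BIG_ENDIAN */
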
/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


	/* Macro to compute and return the result value for word-aligned
	   cases.  */
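	/* \synd is the syndrome word described above, \d1 and \d2 are the
	   corresponding data words, and \restore_r6 is non-zero when r6/r7
	   were saved on the stack and must be reloaded before returning.  */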
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1, lsr #24
	bx	lr
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   That's slower than this approach.  */
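	/* Reversing the syndrome moves its lowest-addressed (first in
	   string order) marked byte to the most significant end, so CLZ
	   finds the bit offset of the first mismatch or NUL; BIC #7 rounds
	   that down to the start of its byte, and shifting the data words
	   right by the result brings the deciding byte pair into bits 7-0
	   for the final subtraction.  */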
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1

	bx	lr
#endif
	.endm

	.text
	.p2align	5
.Lstrcmp_start_addr:
#ifndef STRCMP_NO_PRECHECK
.Lfastpath_exit:
	sub	r0, r2, r3
	bx	lr
	nop
#endif
def_fn	strcmp
#ifndef STRCMP_NO_PRECHECK
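	/* Pre-check: compare the first byte of each string before setting
	   up the word-at-a-time code.  CMP r2, #1 sets C exactly when the
	   first src1 byte is non-NUL, and only then does the IT block redo
	   the comparison against r3; the BNE is therefore taken, and the
	   result returned through .Lfastpath_exit, whenever the first
	   bytes differ or src1 starts with a NUL.  */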
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	.Lfastpath_exit
#endif
	.cfi_sections .debug_frame
	.cfi_startproc
	strd	r4, r5, [sp, #-16]!
	.cfi_def_cfa_offset 16
	.cfi_offset 4, -16
	.cfi_offset 5, -12
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_offset 6, -8
	.cfi_offset 7, -4
	mvn	const_m1, #0
	lsl	r2, tmp1, #29	/* Keep only the low 3 bits of src1 | src2.  */
	cbz	r2, .Lloop_aligned8	/* Both pointers are 8-byte aligned.  */

.Lnot_aligned:
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	.Lmisaligned8

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
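	/* The mask ORs 0xff into every byte that precedes the real start
	   of the strings, so those bytes compare equal and can never look
	   like a NUL.  tmp2 is the misalignment within a word, in bits;
	   the TST of bit 2 below decides whether the strings start in the
	   second word of the doubleword, in which case the whole first
	   word is forced to all-ones and the partial mask is applied to
	   the second word instead.  */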
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	.Lstart_realigned8
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	.Lstart_realigned8

	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
.Lloop_aligned8:
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
.Lstart_realigned8:
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, .Ldiff_in_a
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, .Ldiff_in_b

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	beq	.Lloop_aligned8

.Ldiff_found:
	cbnz	syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
.Lmisaligned8:
	tst	tmp1, #3
	bne	.Lmisaligned4
	ands	tmp1, src1, #3
	bne	.Lmutual_align4

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
.Lloop_aligned4:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned4:
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, .Laligned4_done
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	.Lloop_aligned4

.Laligned4_done:
	strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	.Lstart_realigned4

.Lmisaligned4:
	ands	tmp1, src1, #3
	beq	.Lsrc1_aligned
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	.Laligned_m2
	bcs	.Laligned_m1

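	/* src1 has just been aligned down and the word holding its first
	   bytes loaded into data1.  The code below compares the string
	   bytes that lie before src1's next word boundary (three, two or
	   one of them, depending on the misalignment) one at a time: each
	   src1 byte is extracted from data1 with UXTB+ROR and checked
	   against a byte loaded from src2, which was biased back by the
	   same amount so that [src2, #k] matches byte k of data1.  On a
	   difference or a NUL we exit; otherwise src1 is now word-aligned
	   and we continue at .Lsrc1_aligned.  */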
#ifdef STRCMP_NO_PRECHECK
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m1:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	add	src2, src2, #4
	cbnz	data2, .Lsrc1_aligned
#else  /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbnz	data2, .Laligned_m1
#endif

.Lmisaligned_exit:
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	bx	lr

#ifndef STRCMP_NO_PRECHECK
.Laligned_m1:
	add	src2, src2, #4
#endif
.Lsrc1_aligned:
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */

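	/* .LoverlapN handles src2 % 4 == 4 - N: N bytes of each aligned
	   src2 word overlap the current src1 word and the remaining 4 - N
	   bytes straddle into the next src2 word.  Each iteration first
	   checks the overlapping bytes (with a NUL test on data1), then
	   the straddling byte(s) against the next src2 word.  */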
	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap3
4:
	S2LO	data2, data2, #8
	b	.Lstrcmp_tail

5:
	bics	syndrome, syndrome, #MSB
	bne	.Lstrcmp_done_equal

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 Not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	neg	result, result
	bx	lr

6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap2
4:
	S2LO	data2, data2, #16
	b	.Lstrcmp_tail
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	.Lstrcmp_done_equal

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	.Lstrcmp_tail

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap1
4:
	S2LO	data2, data2, #24
	b	.Lstrcmp_tail
5:
	tst	syndrome, #LSB
	bne	.Lstrcmp_done_equal
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	.Lstrcmp_tail

.Lstrcmp_done_equal:
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	bx	lr

.Lstrcmp_tail:
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
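	/* The words are now ordered most-significant-byte-first regardless
	   of endianness, with the next unmatched string byte at the top,
	   and every path into the tail guarantees a difference or a NUL in
	   data1 within them.  One more syndrome plus CLZ locates the first
	   interesting bit; both words are shifted up by that amount and the
	   top eight bits of each are subtracted, which (as in the epilogue
	   macro above) is enough to give a correctly signed result.  */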
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	sub	result, result, data2, lsr #24
	bx	lr
	.cfi_endproc
	.size strcmp, . - .Lstrcmp_start_addr