1/* Copyright (c) 2012, Linaro Limited
2   All rights reserved.
3
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions are met:
6       * Redistributions of source code must retain the above copyright
7         notice, this list of conditions and the following disclaimer.
8       * Redistributions in binary form must reproduce the above copyright
9         notice, this list of conditions and the following disclaimer in the
10         documentation and/or other materials provided with the distribution.
11       * Neither the name of the Linaro nor the
12         names of its contributors may be used to endorse or promote products
13         derived from this software without specific prior written permission.
14
15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
26
27/* Assumptions:
28 *
29 * ARMv8-a, AArch64
30 */
31
32	.macro def_fn f p2align=0
33	.text
34	.p2align \p2align
35	.global \f
36	.type \f, %function
37\f:
38	.endm
39
40#define REP8_01 0x0101010101010101
41#define REP8_7f 0x7f7f7f7f7f7f7f7f
42#define REP8_80 0x8080808080808080
43
44/* Parameters and result.  */
45#define src1		x0
46#define src2		x1
47#define result		x0
48
49/* Internal variables.  */
50#define data1		x2
51#define data1w		w2
52#define data2		x3
53#define data2w		w3
54#define has_nul		x4
55#define diff		x5
56#define syndrome	x6
57#define tmp1		x7
58#define tmp2		x8
59#define tmp3		x9
60#define zeroones	x10
61#define pos		x11
62
63	/* Start of performance-critical section  -- one 64B cache line.  */
64def_fn strcmp p2align=6
65	eor	tmp1, src1, src2
66	mov	zeroones, #REP8_01
67	tst	tmp1, #7
68	b.ne	.Lmisaligned8
69	ands	tmp1, src1, #7
70	b.ne	.Lmutual_align
71	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
72	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
73	   can be done in parallel across the entire word.  */
74.Lloop_aligned:
75	ldr	data1, [src1], #8
76	ldr	data2, [src2], #8
77.Lstart_realigned:
78	sub	tmp1, data1, zeroones
79	orr	tmp2, data1, #REP8_7f
80	eor	diff, data1, data2	/* Non-zero if differences found.  */
81	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
82	orr	syndrome, diff, has_nul
83	cbz	syndrome, .Lloop_aligned
84	/* End of performance-critical section  -- one 64B cache line.  */
85
86#ifndef	__AARCH64EB__
87	rev	syndrome, syndrome
88	rev	data1, data1
89	/* The MS-non-zero bit of the syndrome marks either the first bit
90	   that is different, or the top bit of the first zero byte.
91	   Shifting left now will bring the critical information into the
92	   top bits.  */
93	clz	pos, syndrome
94	rev	data2, data2
95	lsl	data1, data1, pos
96	lsl	data2, data2, pos
97	/* But we need to zero-extend (char is unsigned) the value and then
98	   perform a signed 32-bit subtraction.  */
99	lsr	data1, data1, #56
100	sub	result, data1, data2, lsr #56
101	ret
102#else
103	/* For big-endian we cannot use the trick with the syndrome value
104	   as carry-propagation can corrupt the upper bits if the trailing
105	   bytes in the string contain 0x01.  */
106	/* However, if there is no NUL byte in the dword, we can generate
107	   the result directly.  We can't just subtract the bytes as the
108	   MSB might be significant.  */
109	cbnz	has_nul, 1f
110	cmp	data1, data2
111	cset	result, ne
112	cneg	result, result, lo
113	ret
1141:
115	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
116	rev	tmp3, data1
117	sub	tmp1, tmp3, zeroones
118	orr	tmp2, tmp3, #REP8_7f
119	bic	has_nul, tmp1, tmp2
120	rev	has_nul, has_nul
121	orr	syndrome, diff, has_nul
122	clz	pos, syndrome
123	/* The MS-non-zero bit of the syndrome marks either the first bit
124	   that is different, or the top bit of the first zero byte.
125	   Shifting left now will bring the critical information into the
126	   top bits.  */
127	lsl	data1, data1, pos
128	lsl	data2, data2, pos
129	/* But we need to zero-extend (char is unsigned) the value and then
130	   perform a signed 32-bit subtraction.  */
131	lsr	data1, data1, #56
132	sub	result, data1, data2, lsr #56
133	ret
134#endif
135
136.Lmutual_align:
137	/* Sources are mutually aligned, but are not currently at an
138	   alignment boundary.  Round down the addresses and then mask off
139	   the bytes that preceed the start point.  */
140	bic	src1, src1, #7
141	bic	src2, src2, #7
142	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
143	ldr	data1, [src1], #8
144	neg	tmp1, tmp1		/* Bits to alignment -64.  */
145	ldr	data2, [src2], #8
146	mov	tmp2, #~0
147#ifdef __AARCH64EB__
148	/* Big-endian.  Early bytes are at MSB.  */
149	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
150#else
151	/* Little-endian.  Early bytes are at LSB.  */
152	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
153#endif
154	orr	data1, data1, tmp2
155	orr	data2, data2, tmp2
156	b	.Lstart_realigned
157
158.Lmisaligned8:
159	/* We can do better than this.  */
160	ldrb	data1w, [src1], #1
161	ldrb	data2w, [src2], #1
162	cmp	data1w, #1
163	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
164	b.eq	.Lmisaligned8
165	sub	result, data1, data2
166	ret
167