/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

#include "../asmdefs.h"

/* To build as stpcpy, define BUILD_STPCPY before compiling this file.

   To test the page crossing code path more thoroughly, compile with
   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
   entry path.  This option is not intended for production use.  */

/* Arguments and results.  */
#define dstin		x0
#define srcin		x1

/* Locals and temporaries.  */
#define src		x2
#define dst		x3
#define data1		x4
#define data1w		w4
#define data2		x5
#define data2w		w5
#define has_nul1	x6
#define has_nul2	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define tmp4		x11
#define zeroones	x12
#define data1a		x13
#define data2a		x14
#define pos		x15
#define len		x16
#define to_align	x17

#ifdef BUILD_STPCPY
#define STRCPY __stpcpy_aarch64
#else
#define STRCPY __strcpy_aarch64
#endif

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
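
	/* Worked example of the syndrome: if a loaded dword X is
	   0x4142004344454647 (a zero in byte 5, counting from the least
	   significant byte), then
	     X - REP8_01                    = 0x4040ff4243444546
	     X | REP8_7f                    = 0x7f7f7f7f7f7f7f7f
	     (X - REP8_01) & ~(X | REP8_7f) = 0x0000800000000000
	   i.e. 0x80 appears in the byte that was zero.  */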

	/* AArch64 systems have a minimum page size of 4k.  We can do a quick
	   page size check for crossing this boundary on entry and if we
	   do not, then we can short-circuit much of the entry code.  We
	   expect early page-crossing strings to be rare (probability of
	   16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
	   predictable, even with random strings.

	   We don't bother checking for larger page sizes; the cost of setting
	   up the correct page size is just not worth the extra gain from
	   a small reduction in the cases taking the slow path.  Note that
	   we only care about whether the first fetch, which may be
	   misaligned, crosses a page boundary - after that we move to aligned
	   fetches for the remainder of the string.  */

#ifdef STRCPY_TEST_PAGE_CROSS
	/* Make everything that isn't Qword aligned look like a page cross.  */
#define MIN_PAGE_P2 4
#else
#define MIN_PAGE_P2 12
#endif

#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)

ENTRY (STRCPY)
	/* For moderately short strings, the fastest way to do the copy is to
	   calculate the length of the string in the same way as strlen, then
	   essentially do a memcpy of the result.  This avoids the need for
	   multiple byte copies and further means that by the time we
	   reach the bulk copy loop we know we can always use DWord
	   accesses.  We expect __strcpy_aarch64 to rarely be called repeatedly
	   with the same source string, so branch prediction is likely to
	   always be difficult - we mitigate against this by preferring
	   conditional select operations over branches whenever this is
	   feasible.  */
	and	tmp2, srcin, #(MIN_PAGE_SIZE - 1)
	mov	zeroones, #REP8_01
	and	to_align, srcin, #15
	cmp	tmp2, #(MIN_PAGE_SIZE - 16)
	neg	tmp1, to_align
	/* The first fetch will straddle a (possible) page boundary iff
	   srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
	   aligned string will never fail the page align check, so will
	   always take the fast path.  */
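	/* For example, with the default MIN_PAGE_SIZE of 4096: if srcin
	   ends in ...ff8, then tmp2 = 0xff8 > 0xff0 and the 16-byte fetch
	   would run into the next page, so we branch; if it ends in ...ff0,
	   the fetch stays within the page and we fall through.  */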
	b.gt	L(page_cross)

L(page_cross_ok):
	ldp	data1, data2, [srcin]
#ifdef __AARCH64EB__
	/* Because we expect the end to be found within 16 characters
	   (profiling shows this is the most common case), it's worth
	   swapping the bytes now to save having to recalculate the
	   termination syndrome later.  We preserve data1 and data2
	   so that we can re-use the values later on.  */
	rev	tmp2, data1
	sub	tmp1, tmp2, zeroones
	orr	tmp2, tmp2, #REP8_7f
	bics	has_nul1, tmp1, tmp2
	b.ne	L(fp_le8)
	rev	tmp4, data2
	sub	tmp3, tmp4, zeroones
	orr	tmp4, tmp4, #REP8_7f
#else
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	bics	has_nul1, tmp1, tmp2
	b.ne	L(fp_le8)
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
#endif
	bics	has_nul2, tmp3, tmp4
	b.eq	L(bulk_entry)

	/* The string is short (<=16 bytes).  We don't know exactly how
	   short though, yet.  Work out the exact length so that we can
	   quickly select the optimal copy strategy.  */
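	/* fp_gt8: the NUL is somewhere in data2, so the string (including
	   its terminator) is 9 to 16 bytes long.  Rather than branching on
	   the exact length, data2 is shifted so that its NUL byte becomes
	   the final byte of an unaligned dword store ending exactly at the
	   NUL; together with the dword store of data1 this copies the whole
	   string with two (possibly overlapping) stores.  */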
L(fp_gt8):
	rev	has_nul2, has_nul2
	clz	pos, has_nul2
	mov	tmp2, #56
	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
	sub	pos, tmp2, pos
#ifdef __AARCH64EB__
	lsr	data2, data2, pos
#else
	lsl	data2, data2, pos
#endif
	str	data2, [dst, #1]
	str	data1, [dstin]
#ifdef BUILD_STPCPY
	add	dstin, dst, #8
#endif
	ret

L(fp_le8):
	rev	has_nul1, has_nul1
	clz	pos, has_nul1
	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
	subs	tmp2, pos, #24			/* Pos in bits. */
	b.lt	L(fp_lt4)
#ifdef __AARCH64EB__
	mov	tmp2, #56
	sub	pos, tmp2, pos
	lsr	data2, data1, pos
	lsr	data1, data1, #32
#else
	lsr	data2, data1, tmp2
#endif
	/* 4->7 bytes to copy.  */
	str	data2w, [dst, #-3]
	str	data1w, [dstin]
#ifdef BUILD_STPCPY
	mov	dstin, dst
#endif
	ret
L(fp_lt4):
	cbz	pos, L(fp_lt2)
	/* 2->3 bytes to copy.  */
#ifdef __AARCH64EB__
	lsr	data1, data1, #48
#endif
	strh	data1w, [dstin]
	/* Fall-through, one byte (max) to go.  */
L(fp_lt2):
	/* Null-terminated string.  Last character must be zero!  */
	strb	wzr, [dst]
#ifdef BUILD_STPCPY
	mov	dstin, dst
#endif
	ret

	.p2align 6
	/* Aligning here ensures that the entry code and main loop all lie
	   within one 64-byte cache line.  */
L(bulk_entry):
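	/* No NUL in the first 16 bytes: store them, then advance src and
	   dst to the first 16-byte-aligned source address above srcin and
	   its matching destination (re-copying up to 15 bytes) so that the
	   main loop can use aligned fetches for the rest of the string.  */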
	sub	to_align, to_align, #16
	stp	data1, data2, [dstin]
	sub	src, srcin, to_align
	sub	dst, dstin, to_align
	b	L(entry_no_page_cross)

	/* The inner loop deals with two Dwords at a time.  This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations.  */
L(main_loop):
	stp	data1, data2, [dst], #16
L(entry_no_page_cross):
	ldp	data1, data2, [src], #16
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bics	has_nul2, tmp3, tmp4
	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
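	/* If has_nul2 was zero (eq), compare has_nul1 against zero;
	   otherwise force the flags to NZCV = 0000 so the branch below is
	   not taken.  The loop therefore continues only while both
	   syndromes are zero.  */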
	b.eq	L(main_loop)

	/* Since we know we are copying at least 16 bytes, the fastest way
	   to deal with the tail is to determine the location of the
	   trailing NUL, then (re)copy the 16 bytes leading up to that.  */
	cmp	has_nul1, #0
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	csel	data1, data1, data2, ne
	rev	data1, data1
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	bic	has_nul1, tmp1, tmp2
#else
	csel	has_nul1, has_nul1, has_nul2, ne
#endif
	rev	has_nul1, has_nul1
	clz	pos, has_nul1
	add	tmp1, pos, #72
	add	pos, pos, #8
	csel	pos, pos, tmp1, ne
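	/* pos is now 8 * (1 + byte offset of the NUL within the last
	   16-byte block), so the copy below ends exactly at the NUL in
	   both source and destination.  */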
	add	src, src, pos, lsr #3
	add	dst, dst, pos, lsr #3
	ldp	data1, data2, [src, #-32]
	stp	data1, data2, [dst, #-16]
#ifdef BUILD_STPCPY
	sub	dstin, dst, #1
#endif
	ret

L(page_cross):
	bic	src, srcin, #15
	/* Start by loading two words at [srcin & ~15], then forcing the
	   bytes that precede srcin to 0xff.  This means they never look
	   like termination bytes.  */
	ldp	data1, data2, [src]
	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits.  */
	tst	to_align, #7
	csetm	tmp2, ne
#ifdef __AARCH64EB__
	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#else
	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#endif
	orr	data1, data1, tmp2
	orr	data2a, data2, tmp2
	cmp	to_align, #8
	csinv	data1, data1, xzr, lt
	csel	data2, data2, data2a, lt
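	/* If srcin lies in the second dword (to_align >= 8), data1 is
	   replaced with all-ones so it can never produce a NUL syndrome
	   and data2 takes the masked copy; otherwise the mask applies to
	   data1 and data2 is used as loaded.  */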
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bics	has_nul2, tmp3, tmp4
	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
	b.eq	L(page_cross_ok)
	/* We now need to make data1 and data2 look like they've been
	   loaded directly from srcin.  Do a rotate on the 128-bit value.  */
	lsl	tmp1, to_align, #3	/* Bytes->bits.  */
	neg	tmp2, to_align, lsl #3
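	/* tmp1 = 8 * to_align and tmp2 = -8 * to_align; the variable shifts
	   below use these amounts modulo 64.  If srcin was in the first
	   dword (to_align < 8), the new data1 combines the tail of data1
	   with the head of data2; otherwise data2, shifted so that the
	   byte at srcin becomes its first byte, takes data1's place.  */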
#ifdef __AARCH64EB__
	lsl	data1a, data1, tmp1
	lsr	tmp4, data2, tmp2
	lsl	data2, data2, tmp1
	orr	tmp4, tmp4, data1a
	cmp	to_align, #8
	csel	data1, tmp4, data2, lt
	rev	tmp2, data1
	rev	tmp4, data2
	sub	tmp1, tmp2, zeroones
	orr	tmp2, tmp2, #REP8_7f
	sub	tmp3, tmp4, zeroones
	orr	tmp4, tmp4, #REP8_7f
#else
	lsr	data1a, data1, tmp1
	lsl	tmp4, data2, tmp2
	lsr	data2, data2, tmp1
	orr	tmp4, tmp4, data1a
	cmp	to_align, #8
	csel	data1, tmp4, data2, lt
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
#endif
	bic	has_nul1, tmp1, tmp2
	cbnz	has_nul1, L(fp_le8)
	bic	has_nul2, tmp3, tmp4
	b	L(fp_gt8)

END (STRCPY)