1/*
2 * strcpy/stpcpy - copy a string returning pointer to start/end.
3 *
4 * Copyright (c) 2020-2023, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, Advanced SIMD.
11 * MTE compatible.
12 */
13
14#include "asmdefs.h"
15
16#define dstin		x0
17#define srcin		x1
18#define result		x0
19
20#define src		x2
21#define dst		x3
22#define len		x4
23#define synd		x4
24#define	tmp		x5
25#define shift		x5
26#define data1		x6
27#define dataw1		w6
28#define data2		x7
29#define dataw2		w7
30
31#define dataq		q0
32#define vdata		v0
33#define vhas_nul	v1
34#define vend		v2
35#define dend		d2
36#define dataq2		q1
37
38#ifdef BUILD_STPCPY
39# define STRCPY __stpcpy_aarch64
40# define IFSTPCPY(X,...) X,__VA_ARGS__
41#else
42# define STRCPY __strcpy_aarch64
43# define IFSTPCPY(X,...)
44#endif
45
46/*
47   Core algorithm:
48   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
49 per byte. We take 4 bits of every comparison byte using a shift right and
50 narrow by 4 instruction. Since the bits in the nibble mask reflect the order in
51   which things occur in the original string, counting leading zeros identifies
52   exactly which byte matched.  */
53
54ENTRY (STRCPY)
	/* In: dstin (x0) = destination, srcin (x1) = source string.
	   Out: result (x0) = dstin for strcpy; for stpcpy (BUILD_STPCPY,
	   via the IFSTPCPY lines) the address of the copied NUL.  */
55	PTR_ARG (0)
56	PTR_ARG (1)
	/* Align src down to 16 so the first vector load stays within the
	   16-byte granule containing srcin (MTE/page safe).  */
57	bic	src, srcin, 15
58	ld1	{vdata.16b}, [src]
59	cmeq	vhas_nul.16b, vdata.16b, 0
	/* shift = 4 * (srcin & 15): the nibble mask has 4 bits per byte and
	   the lsr below uses only the low 6 bits of the shift register.  */
60	lsl	shift, srcin, 2
	/* Narrow the per-byte compare result to a 64-bit nibble mask
	   (4 bits per input byte, see the algorithm comment above).  */
61	shrn	vend.8b, vhas_nul.8h, 4
62	fmov	synd, dend
63	lsr	synd, synd, shift	/* Drop mask bits for bytes before srcin.  */
64	cbnz	synd, L(tail)		/* NUL within the first aligned chunk.  */
65
	/* No NUL yet: examine the next aligned 16-byte chunk (src += 16).  */
66	ldr	dataq, [src, 16]!
67	cmeq	vhas_nul.16b, vdata.16b, 0
68	shrn	vend.8b, vhas_nul.8h, 4
69	fmov	synd, dend
70	cbz	synd, L(start_loop)	/* Still no NUL: enter the main loop.  */
71
	/* NUL is in the second chunk: recover the string length.  */
72#ifndef __AARCH64EB__
73	rbit	synd, synd		/* LE: make clz find the lowest set nibble.  */
74#endif
75	sub	tmp, src, srcin		/* Bytes covered by the first chunk (1..16).  */
76	clz	len, synd
77	add	len, tmp, len, lsr 2	/* len = strlen: clz/4 = byte index in chunk.  */
78	tbz	len, 4, L(less16)	/* len < 16? (len is at most 31 here.)  */
	/* Copy 16..31 bytes (NUL included) with two possibly overlapping
	   16-byte loads/stores.  */
79	sub	tmp, len, 15
80	ldr	dataq, [srcin]
81	ldr	dataq2, [srcin, tmp]
82	str	dataq, [dstin]
83	str	dataq2, [dstin, tmp]
84	IFSTPCPY (add result, dstin, len)
85	ret
86
87L(tail):
	/* NUL in the first chunk: len = strlen is 0..15.  */
88	rbit	synd, synd
89	clz	len, synd
90	lsr	len, len, 2
91L(less16):
92	tbz	len, 3, L(less8)
	/* Copy 8..15 bytes with two overlapping 8-byte moves.  */
93	sub	tmp, len, 7
94	ldr	data1, [srcin]
95	ldr	data2, [srcin, tmp]
96	str	data1, [dstin]
97	str	data2, [dstin, tmp]
98	IFSTPCPY (add result, dstin, len)
99	ret
100
101	.p2align 4
102L(less8):
103	subs	tmp, len, 3
104	b.lo	L(less4)
	/* Copy 3..7 bytes with two overlapping 4-byte moves.  */
105	ldr	dataw1, [srcin]
106	ldr	dataw2, [srcin, tmp]
107	str	dataw1, [dstin]
108	str	dataw2, [dstin, tmp]
109	IFSTPCPY (add result, dstin, len)
110	ret
111
112L(less4):
	/* len is 0..2: copy two bytes unless the string is empty, then
	   always store the NUL terminator explicitly.  */
113	cbz	len, L(zerobyte)
114	ldrh	dataw1, [srcin]
115	strh	dataw1, [dstin]
116L(zerobyte):
117	strb	wzr, [dstin, len]
118	IFSTPCPY (add result, dstin, len)
119	ret
120
121	.p2align 4
122L(start_loop):
	/* Copy the first 16 bytes unaligned, then run dst in lockstep with
	   the aligned src through the constant offset tmp = srcin - dstin.  */
123	sub	tmp, srcin, dstin
124	ldr	dataq2, [srcin]
125	sub	dst, src, tmp		/* dst = dstin + (src - srcin).  */
126	str	dataq2, [dstin]
	/* Main loop, unrolled x2: 32 bytes per iteration.  dataq always
	   holds the already-checked chunk that the next store writes.  */
127L(loop):
128	str	dataq, [dst], 32
129	ldr	dataq, [src, 16]
130	cmeq	vhas_nul.16b, vdata.16b, 0
131	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* Cheap any-NUL test.  */
132	fmov	synd, dend
133	cbnz	synd, L(loopend)
134	str	dataq, [dst, -16]
135	ldr	dataq, [src, 32]!
136	cmeq	vhas_nul.16b, vdata.16b, 0
137	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
138	fmov	synd, dend
139	cbz	synd, L(loop)
140	add	dst, dst, 16		/* Equalise dst for both loop exits.  */
141L(loopend):
	/* Rebuild the precise nibble mask (umaxp above only said "some NUL")
	   to locate the terminator within the final chunk.  */
142	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
143	fmov	synd, dend
144	sub	dst, dst, 31
145#ifndef __AARCH64EB__
146	rbit	synd, synd
147#endif
148	clz	len, synd
149	lsr	len, len, 2		/* len = byte index of NUL in the chunk.  */
	/* Final (possibly overlapping) 16-byte copy ends exactly at the NUL:
	   after the add, dst + 15 is the destination address of the
	   terminator, and dst + tmp is the matching source address.  */
150	add	dst, dst, len
151	ldr	dataq, [dst, tmp]
152	str	dataq, [dst]
153	IFSTPCPY (add result, dst, 15)
154	ret
155
156END (STRCPY)
157