/*
 * memcpy - copy memory area
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "../asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14
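
/* G_l/G_h, H_l/H_h and tmp1 reuse registers (count, dst, src, srcend and
   E_l respectively) whose old values appear to be dead at every point where
   the aliases are read, so the routine fits entirely in caller-clobbered
   registers with no stack spills.  */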

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/
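
/* For reference, a rough C sketch of the small-copy strategy (illustrative
   only, not a line-for-line translation of the assembly; the fixed-size
   memcpy calls stand in for the unaligned load/store pairs used below):

     #include <stddef.h>
     #include <string.h>

     static void small_copy (char *d, const char *s, size_t n)   // n <= 32
     {
       if (n >= 16)     { memcpy (d, s, 16); memcpy (d + n - 16, s + n - 16, 16); }
       else if (n >= 8) { memcpy (d, s, 8);  memcpy (d + n - 8,  s + n - 8,  8); }
       else if (n >= 4) { memcpy (d, s, 4);  memcpy (d + n - 4,  s + n - 4,  4); }
       else if (n)      { d[0] = s[0]; d[n >> 1] = s[n >> 1]; d[n - 1] = s[n - 1]; }
     }

   The two fixed-size copies in each case may overlap but together cover
   exactly n bytes.  Unlike this sketch, the assembly issues all loads
   before any store, so the same sequence also handles overlapping
   buffers.  */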

ENTRY (__memcpy_aarch64)
ENTRY_ALIAS (__memmove_aarch64)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
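	/* With count in 1..3, the bytes at offsets 0, count/2 and count - 1
	   cover the whole range: count = 1 writes byte 0 three times,
	   count = 2 writes bytes 0 and 1, count = 3 writes bytes 0, 1, 2.  */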
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
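	/* Both medium paths issue every load before the first store, so they
	   stay correct for overlapping buffers without an explicit check; the
	   leading and trailing 32-byte blocks overlap each other whenever
	   count < 64.  */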
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)
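
	/* The single unsigned compare catches exactly the overlap a forward
	   copy would corrupt: dst starting inside [src, src + count).  When
	   dst is below src the subtraction wraps to a large value and the
	   forward path is taken.  In C terms (illustrative):

	     if ((uintptr_t) dstin - (uintptr_t) src < count)
	       copy backwards;  */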

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
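	/* dst is rounded down to a 16-byte boundary and src is moved back by
	   the same amount, preserving the relative misalignment; count grows
	   by the same offset.  The unconditional stp of D below covers the
	   0..15 bytes in front of the aligned dst.  */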

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
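	/* At most 64 bytes remain beyond the four pairs already loaded, so
	   store the pending A-D pairs and then copy the final 64 bytes from
	   srcend/dstend; the overlapping stores absorb the remainder without
	   a byte-by-byte tail loop.  */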
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dstend to 16-byte alignment.  */
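	/* Mirror of the forward prologue: dstend is rounded down to a 16-byte
	   boundary and srcend moves back by the same offset; the stp of D at
	   the original dstend - 16 covers the trailing 0..15 bytes.  */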
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
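	/* Counterpart of copy64_from_end: store the pending A-D pairs, then
	   copy the first 64 bytes from src/dstin, again letting overlapping
	   stores absorb the remainder.  */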
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (__memcpy_aarch64)
