xref: /freebsd/sys/arm64/arm64/memcpy.S (revision c7046f76)
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include <machine/asm.h>

#define L(l) .L ## l

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/
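/* For orientation, the entry-point dispatch below amounts to:

     count <= 32   -> small copy   (L(copy16), L(copy8), L(copy4))
     count <= 128  -> medium copy  (L(copy32_128), L(copy128))
     otherwise     -> large copy   (L(copy_long)), with an overlap check
                      choosing between forward and backward copying.  */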

ENTRY(memcpy)
EENTRY(memmove)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
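	/* 16..32 bytes: the first and last 16 bytes are loaded before either
	   store, so the possibly overlapping pair of stores below is safe
	   even when source and destination overlap.  */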
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
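	/* For 1 <= count <= 3, the bytes at src[0], src[count / 2] and
	   src[count - 1] cover the whole range (the accesses overlap when
	   count < 3), so three unconditional byte copies suffice.  */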
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
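	/* Load the first 32 and the last 32 bytes before any store.  For
	   33..64 bytes the four overlapping stores below cover the whole
	   range; 65..128 bytes branch to L(copy128).  */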
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
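	/* A/B hold the first 32 bytes and C/D the last 32.  E/F extend the
	   head to 64 bytes, so 65..96 bytes are covered by the first 64 plus
	   the last 32; 97..128 bytes also copy the last 64 via G/H.  */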
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
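	/* tmp1 = dstin - src, compared as unsigned: it is below count only
	   when dstin lies inside [src, src + count), i.e. when a forward
	   copy would overwrite source bytes before they are read.  Equal
	   pointers need no copy at all.  */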
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
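	/* The first 16 bytes are copied unaligned via D.  dst is rounded
	   down to a 16-byte boundary and src is moved back by the same
	   offset, so the later [dst, 16], [dst, 32], ... stores are all
	   aligned (loads may still be unaligned).  count is increased by
	   that offset, leaving it 16 larger than the bytes remaining from
	   dst + 16, hence the adjustments noted below.  */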

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

L(loop64):
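	/* Software pipelined: each iteration stores the 64 bytes loaded on
	   the previous pass while loading the next 64, with pre-index
	   writeback advancing dst and src by 64 per iteration.  */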
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
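	/* Store the 64 bytes loaded by the final loop iteration, then reload
	   and store the last 64 bytes relative to srcend/dstend.  The two
	   ranges may overlap, which removes the need for a tail loop.  */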
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dst to 16-byte alignment.  */
L(copy_long_backwards):
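	/* Mirror image of the forward path: the last 16 bytes are stored
	   from D, dstend is aligned down to 16 bytes with srcend moved back
	   by the same offset, and the copy proceeds downwards in 64-byte
	   blocks.  Here count shrinks by the offset instead of growing.  */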
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
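	/* Counterpart of L(copy64_from_end): store the 64 bytes from the
	   last backward iteration, then reload and store the first 64 bytes
	   from src/dstin, overlapping already-written data as needed.  */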
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret
EEND(memmove)
END(memcpy)