/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * Copyright (c) 2015 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>
__FBSDID("$FreeBSD$");

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
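/* The E and F pairs alias the argument registers; they are only written
   once every load through the aliased values has completed.  Note that
   B_h and tmp1 both name x9, so tmp1 must be dead before B is loaded.  */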
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define tmp1	x9

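/* L(label) expands to a .L-prefixed name, which the assembler treats as
   a local label and omits from the object file's symbol table.  */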
#define L(l) .L ## l

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   Small and medium copies read all data before writing, allowing any
   kind of overlap, and memmove tailcalls memcpy for these cases as
   well as non-overlapping copies.
*/

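/* The dispatch below behaves like this C sketch (informal; the helper
   names are made-up labels for the paths that follow, and the real code
   interleaves its loads and stores rather than calling out):

	void *memcpy(void *dstin, const void *src, size_t count)
	{
		if (count <= 16)
			copy_0_16();		// L(copy16)
		else if (count <= 96)
			copy_17_96();		// medium path, fully unrolled
		else
			copy_long();		// aligned 64-byte loop
		return (dstin);
	}
 */
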
ENTRY(memcpy)
	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
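	/* tmp1 = count - 1 lies in 16..95.  Bit 6 of tmp1 is set iff
	   count is 65..96 (handled by copy96); bit 5 is set iff count is
	   33..64, in which case the middle 32 bytes are copied too.  */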
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
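	/* Each size class copies one word from the start and one from the
	   end of the buffer; when count is below twice the word size the
	   two copies overlap in the middle, still covering every byte.  */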
L(copy16):
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
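	/* With tmp1 = count / 2, the three stores hit bytes 0, tmp1 and
	   count - 1.  For count==3 that is bytes 0, 1, 2; for count==2,
	   bytes 0, 1, 1; for count==1, byte 0 three times.  */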
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

	.p2align 4
	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
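	/* The two regions overlap when count < 96.  That is harmless:
	   every load is issued before the first store, so this path (like
	   all medium copies) also tolerates overlapping buffers.  */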
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration, with the loads running one
	   iteration ahead of the stores.  */

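/* The alignment fixup at the top of copy_long is equivalent to this C
   sketch (informal; tmp1 holds the destination misalignment):

	tmp1 = dstin & 15;	// bytes of dst misalignment
	dst = dstin & ~15;	// round dst down to 16 bytes
	src -= tmp1;		// keep src and dst in lockstep
	count += tmp1;		// now 16 too large, readjusted below
 */
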
	.p2align 4
L(copy_long):
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	2f
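	/* A..D now hold the next 64 bytes.  Each iteration stores the set
	   loaded by the previous one, then refills.  Of the 128 + 16
	   above, 16 undoes the alignment inflation and 128 covers the 64
	   bytes already in registers plus the final 64 copied from the
	   end, so the loop exits with at most 64 bytes left over.  */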
1:
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
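	/* The copy from the end may rewrite bytes just stored from A..D,
	   but with identical values, since both stores take their data
	   from the same source offsets.  */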
2:
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret
END(memcpy)