/*
 * memcpy - copy memory area
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "../asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The source pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/
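
/* For reference, the overall strategy corresponds roughly to the C sketch
   below.  It is illustrative only and not part of the build: the name
   copy_sketch is made up, fixed-size temporaries stand in for the SIMD
   registers, and the plain byte loops stand in for the pipelined 64-byte
   loop and the branchless small-copy sequences used by the real code.

   #include <stddef.h>
   #include <stdint.h>
   #include <string.h>

   static void *copy_sketch (void *dstin, const void *srcin, size_t count)
   {
     unsigned char *dst = dstin;
     const unsigned char *src = srcin;

     if (count <= 32)
       {
         unsigned char head[16], tail[16];
         if (count < 16)
           {                                    // tiny: load all, then store all
             memcpy (head, src, count);
             memcpy (dst, head, count);
             return dstin;
           }
         memcpy (head, src, 16);                // 16..32: 16 bytes from each end;
         memcpy (tail, src + count - 16, 16);   // the two stores may overlap
         memcpy (dst, head, 16);
         memcpy (dst + count - 16, tail, 16);
       }
     else if (count <= 128)
       {
         unsigned char head[64], tail[64];
         size_t blk = count <= 64 ? 32 : 64;    // 32 or 64 bytes from each end
         memcpy (head, src, blk);
         memcpy (tail, src + count - blk, blk);
         memcpy (dst, head, blk);
         memcpy (dst + count - blk, tail, blk);
       }
     else if ((uintptr_t) dst - (uintptr_t) src < count)
       {                                        // destructive overlap: backwards
         for (size_t i = count; i-- > 0; )
           dst[i] = src[i];
       }
     else
       {                                        // forward; the real code does
         for (size_t i = 0; i < count; i++)     // this 64 bytes at a time
           dst[i] = src[i];
       }
     return dstin;
   }
*/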

ENTRY (__memcpy_aarch64_simd)
ENTRY_ALIAS (__memmove_aarch64_simd)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
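	/* 16..32 bytes: copy 16 bytes from the start and 16 bytes from the
	   end; for counts below 32 the two stores overlap.  */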
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
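	/* count is 1..3: copy the first byte, the byte at count/2 and the
	   last byte; together these cover every case and the stores may
	   overlap.  */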
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
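	/* Load 32 bytes from each end up front; for 33..64 bytes these four
	   registers alone cover the whole copy (the stores may overlap).  */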
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
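	/* 65..96 bytes: the first 64 and the last 32 bytes cover the copy.
	   97..128 bytes: also copy the last 64 bytes.  */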
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
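	/* (dstin - src) is below count, as an unsigned value, exactly when
	   dstin lies inside [src, src + count), i.e. when a forward copy
	   would overwrite source bytes before they have been read.  */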
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
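	/* Bias dst down by the same amount as src so that the loop's
	   [dst, 16] stores line up with the aligned [src, 16] loads.  */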
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
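	/* Each iteration stores the 64 bytes loaded by the previous one while
	   loading the next 64, keeping the loads ahead of the stores.  */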
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
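	/* 65..128 bytes remain to be stored here; the 64 bytes already held
	   in A_q..D_q plus the last 64 bytes of the buffer cover them, so
	   some bytes may be written twice.  */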
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
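	/* tmp1 is dstin - src; if it is zero the buffers are identical and
	   there is nothing to copy.  */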
	cbz	tmp1, L(copy0)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

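	/* Mirror image of L(loop64): walk backwards with the loads one
	   iteration ahead of the stores.  */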
L(loop64_backwards):
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	stp	C_q, D_q, [dstend, -64]
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	sub	dstend, dstend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
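	/* Mirror of L(copy64_from_end): store the pending registers and copy
	   the first 64 bytes of the buffer to finish off.  */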
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
	ret

END (__memcpy_aarch64_simd)
