/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * Copyright (c) 2015 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

/* Address and length registers.  The routine receives its three operands
   in x0-x2.  x0 is never written, so it still holds the destination
   pointer when the routine returns.  */
#define dstin	x0
#define src	x1
#define count	x2
/* dst is the 16-byte-aligned store cursor used by the large-copy loop.  */
#define dst	x3
/* srcend = src + count and dstend = dstin + count (one past the end).  */
#define srcend	x4
#define dstend	x5

/* Data registers.  Each letter names a 16-byte pair (_l/_h); the *w names
   are the 32-bit views used by the 4-byte and 1-byte paths.  */
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
/* E and F are aliases of src/count and dst/srcend.  Loading them destroys
   those values, so they are only loaded once the aliased registers are no
   longer needed.  */
#define E_l	src
#define E_h	count
#define F_l	dst
#define F_h	srcend
/* tmp1 shares x9 with B_h: it must be dead before pair B is loaded.  */
#define tmp1	x9

/* L(name) expands to the assembler-local label .Lname.  */
#define L(l) .L ## l

/* def_fn NAME [p2align=N]

   Open the definition of global function NAME: switch to .text, align to
   a 2^N-byte boundary (N defaults to 0), export NAME, mark it as a
   function symbol and emit its entry label.  The function body and the
   matching .size directive are supplied by the user of the macro.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/* void *memcpy(void *dstin, const void *src, size_t count)

   In:    x0 = dstin, x1 = src, x2 = count
   Out:   x0 = dstin (x0 is never written)
   Uses:  x1-x13 and the condition flags are overwritten; no stack access.

   Copies are split into 3 main cases:
     - small copies of up to 16 bytes;
     - medium copies of 17..96 bytes, which are fully unrolled;
     - large copies of more than 96 bytes, which align the destination
       and use an unrolled loop processing 64 bytes per iteration.

   Small and medium copies read all data before writing, allowing any
   kind of overlap.  The large-copy path stores while it is still loading
   and therefore does not have that property.  The original notes state
   that memmove tailcalls memcpy for the small/medium cases as well as
   for non-overlapping copies; memmove itself is not part of this file.
*/

def_fn memcpy p2align=6
	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  tmp1 = count - 1 is in 16..95, so
	   bit 6 set means 65..96 bytes and bit 5 set means 33..64 bytes.  */
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	/* 33..64 bytes: additionally copy the second and the second-to-last
	   16 bytes.  Loading B overwrites tmp1 (both are x9), which has
	   already been tested.  */
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	/* Store the first and the last 16 bytes; for short lengths the two
	   stores overlap.  */
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
	cmp	count, 8
	b.lo	1f
	/* 8..16 bytes: first and last 8 bytes, possibly overlapping.  */
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	/* count < 8 here, so bit 2 set means 4..7 bytes: first and last
	   4 bytes, possibly overlapping.  */
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.
	   The bytes copied are the first, the last, and the one at index
	   count / 2.  */
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

	.p2align 4
	/* Copy 65..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  A was already loaded by the caller path.
	   E and F alias src/count and dst/srcend, so those values are
	   destroyed here; srcend is the base of the final load and is
	   itself overwritten by it.  */
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are more than 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and loads one iteration ahead of
	   its stores.  */

	.p2align 4
L(copy_long):
	and	tmp1, dstin, 15		/* tmp1 = misalignment of dstin.  */
	bic	dst, dstin, 15		/* dst = dstin rounded down to 16.  */
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1		/* Keep src in step with dst.  */
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]	/* First 16 bytes, unaligned.  */
	ldp	B_l, B_h, [src, 32]	/* Overwrites tmp1 (x9); it is dead.  */
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	/* Remove the extra 16 plus the 128 bytes that the 64 bytes already
	   loaded and the 64-byte tail below will cover.  */
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	2f
1:
	/* Store the 64 bytes loaded on the previous pass while loading the
	   next 64.  */
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  The E load overwrites src and count,
	   which are no longer needed.  */
2:
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.size	memcpy, . - memcpy