/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

#include "asmdefs.h"

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

/* Select the build variant from the compiler's predefined macros.  Each
   variant sets the target architecture/FPU, the size of the stack frame
   the copy routine allocates (FRAME_SIZE), and which optional instruction
   sets the code below may use (USE_VFP, USE_NEON).  */
#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
	/* With both VFP and NEON available the routine only ever stores
	   tmp2 in its frame, so a single word is enough.  */
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
	/* tmp2 at offset 0 plus three register pairs at offsets 8, 16
	   and 24 (used by the general-purpose-register bulk loops).  */
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
	/* Same frame layout as the VFP-only variant.  */
# define FRAME_SIZE	32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

/* Constants used when computing "add pc, pc, ..." jump distances: in ARM
   state a read of PC yields the address of the current instruction plus
   PC_OFFSET, and every instruction is INSN_SIZE bytes long.  */
#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10		/* Saved in the stack frame before use.  */

#ifndef USE_NEON
/* For bulk copies using GP registers.  A_l/A_h are the same registers as
   count/tmp1, so the loops that use them keep the byte count in tmp2.
   B_l..D_h (r4-r9) are stored to the stack frame before they are loaded
   with copy data and reloaded from it afterwards.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
	/* cpy_line_vfp vreg, base

	   Store one 64-byte line at [dst + base] from values that are
	   already in registers, refilling each register straight after it
	   has been stored.  The caller keeps src 32 bytes ahead of dst, so
	   every "[src, #x]" load fetches the data that will be stored at
	   "[dst, #x + 32]".

	   \vreg holds the first doubleword of the line on entry.  It is
	   stored twice (offsets 0 and 32); its second reload reads
	   prefetch_lines lines ahead, which leaves it holding the first
	   doubleword of the line this register will store on the next pass
	   and touches that line early in place of a PLD.

	   d0-d2 carry the remaining three doublewords of each half line.
	   Writes \vreg, d0, d1 and d2; leaves src, dst and the flags
	   untouched.  */
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

	/* cpy_tail_vfp vreg, base

	   Same store/reload sequence as cpy_line_vfp, but without the
	   look-ahead reload of \vreg after its second store: used for the
	   final lines of a long copy, where nothing prefetch_lines lines
	   ahead is loaded any more.  */
	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

/* __memcpy_arm: copy count bytes from src to dstin.

   In:   r0 = dstin, r1 = src, r2 = count (see "Call parameters").
   Out:  r0 is never written, so dstin is returned unchanged.
   Uses: r1-r3 and ip as scratch.  r10 (tmp2) and, in the bulk loops that
	 use general-purpose registers, r4-r9 are saved in a FRAME_SIZE
	 byte stack frame and restored before returning.  VFP/NEON builds
	 also overwrite d0-d7.

   Structure:
     count < 64                  -> L(tail63unaligned)
     src and dst agree modulo 8  -> align both, L(cpy_body_medium) or
				    L(cpy_body_long), then L(tail63aligned)
     otherwise                   -> L(cpy_notaligned): align dst only,
				    bulk loop, then L(tail63unaligned)

   The tail blocks are entered with a computed "add pc, pc, ..." jump, so
   the number of instructions between each such jump and the end of its
   load/store sequence must not change.  */
ENTRY (__memcpy_arm)

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bhs	L(cpy_not_short)
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

L(tail63unaligned):
	/* Copy the final (count & 63) bytes with no alignment assumptions.
	   Reached by fall-through for short copies and from
	   L(cpy_notaligned) after tmp2 has been restored, so no stack
	   frame is live here.  */
#ifdef USE_NEON
	/* tmp1 = bytes held in whole 8-byte chunks.  Each chunk is copied
	   by one vld1/vst1 pair (8 bytes of code) and PC reads as the
	   address of the add plus PC_OFFSET, so the add lands tmp1 bytes
	   before the end of the sequence and runs exactly tmp1 / 8 pairs.  */
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	/* Bit 2 of count: one more word.  */
	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	/* tmp1 = bytes held in whole words.  src and dst are advanced past
	   them first and the copies below use negative offsets.  Each word
	   needs an ldr/str pair, i.e. 8 bytes of code per 4 bytes of data,
	   hence the halved constants in the rsb and the "lsl #1".  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

	/* Shift bit 1 of count into C (copy a halfword) and leave bit 0 as
	   the only result bit, so NE means one last byte.  */
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

L(cpy_not_short):
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	/* Allocate the frame and save tmp2 at its base in one step.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	/* Compare the low three address bits of src and dst.  */
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	L(cpy_notaligned)

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
	/* tmp2 = (dst & 7) << 29; zero means dst is already aligned.
	   Negating it leaves the number of bytes needed to reach alignment
	   in the top three bits: N (bit 31) selects a word copy, and the
	   second shift moves the "2" bit into C and leaves the "1" bit as
	   the only result bit (NE).  */
	lsls	tmp2, dst, #29
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blo	L(tail63aligned)

	cmp	tmp2, #512
	bhs	L(cpy_body_long)

L(cpy_body_medium):			/* Count in tmp2.  */
	/* tmp2 holds (bytes remaining - 64).  Each pass copies 64 bytes and
	   repeats while the subtraction does not borrow.  */
#ifdef USE_VFP
1:
	vldr	d0, [src, #0]
	/* The flags set here are consumed by the bhs at the bottom; none
	   of the instructions in between alters them.  */
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bhs	1b
	/* Nothing left in the low six bits: skip the tail entirely.  */
	tst	tmp2, #0x3f
	beq	L(done)

L(tail63aligned):			/* Count in tmp2.  */
	/* Same computed jump as the NEON form of L(tail63unaligned): one
	   vldr/vstr pair (8 bytes of code) per 8 bytes of data, entered
	   tmp1 bytes before the end of the sequence.  src and dst are
	   advanced first and addressed with negative offsets.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	/* Bias src and dst by -8 so that the last pair of each pass can use
	   the "#64" writeback form to advance the pointers.  */
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bhs	1b
	tst	tmp2, #0x3f
	bne	1f
	/* Exact multiple of 64: restore tmp2, drop the frame, return.  */
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	/* Undo the -8 bias before entering the tail.  */
	add	src, src, #8
	add	dst, dst, #8

L(tail63aligned):			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	/* Remaining 0-7 bytes: bit 2 -> word, bit 1 -> halfword (via C),
	   bit 0 -> byte (via NE).  */
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead. */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

L(done):
	/* Restore the caller's tmp2 and release the frame.  */
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

L(cpy_body_long):			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

	/* d3-d7 take the first doubleword of each of the next five 64-byte
	   lines.  d0-d2 take the rest of the first half line.  */
	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	/* From here until the end of this path src runs 32 bytes ahead of
	   dst, as the cpy_*_vfp macros expect.  */
	add	src, src, #32

	/* Keep looping only while a full pass plus the closing pass at 2:
	   (prefetch_lines lines each) still fit.  */
	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blo	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bhs	1b

2:
	/* Closing pass: store the five lines whose leading doublewords are
	   already in d3-d7, without reading further ahead.  */
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	/* src was 32 bytes ahead, so advancing it by 96 where dst advances
	   by 128 brings the two pointers back in step.  */
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	/* Give back one of the two passes subtracted up front, so tmp2 is
	   again (bytes remaining - 64) for the medium loop.  */
	add	tmp2, tmp2, #prefetch_lines * 64
	b	L(cpy_body_medium)
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	/* Prime the pipeline with the first 32 bytes.  The caller's B, C
	   and D pairs go into the frame (offsets 8, 16, 24; tmp2 is at
	   offset 0) before they are overwritten.  */
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	/* Enter the loop at its second half.  */
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	/* Carry comes from the most recent subs; ldrd/strd leave the flags
	   alone.  */
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	L(tail63aligned)
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

L(cpy_notaligned):
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	/* Same flag scheme as the mutually aligned case: after negation
	   the top three bits of tmp2 give the bytes needed, N selects the
	   word copy, and the second shift yields C for the halfword and NE
	   for the byte.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	/* Fewer than 64 bytes left after aligning: restore tmp2 and release
	   the frame, since the unaligned tail returns with a bare bx lr.  */
	ldrlo	tmp2, [sp], #FRAME_SIZE
	blo	L(tail63unaligned)
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
	/* Loads run one 64-byte block ahead of the stores.  Only the
	   stores carry an alignment qualifier, as only dst was aligned.  */
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	blo	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bhs	1b
2:
	/* Drain the block that is still held in d0-d7.  */
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	/* Z is consumed by the bne after the frame is released.  */
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	/* src is read with single-word loads (it may be unaligned) and is
	   biased by -4; dst is 64-bit aligned, written with strd and biased
	   by -8.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	/* Save the caller's B, C and D pairs in the frame before they are
	   overwritten.  */
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	/* Enter the loop at its second half.  */
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	/* Carry comes from the most recent subs; the loads and stores
	   leave the flags alone.  */
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	/* Move the residual count back into count; Z is consumed by the
	   bne after the frame is released.  */
	ands	count, tmp2, #0x3f
#endif
	/* The ldr does not touch the flags set by the ands above.  */
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	L(tail63unaligned)
	bx	lr

END (__memcpy_arm)