/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-A if using NEON)
    ARM state
    Unaligned accesses are permitted

 */

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
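	/* Copy one 64-byte line with eight doubleword transfers, leapfrogging
	   each load past the store of the previous chunk.  \vreg carries data
	   that was read prefetch_lines * 64 bytes ahead of the current
	   position, so reloading it at the end acts as a software prefetch
	   of a line that will be copied later.  */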
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

def_fn memcpy p2align=6

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bge	.Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
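	/* Computed branch into the ladder of load/store pairs below: skip
	   the leading pairs so that exactly count & 0x38 bytes are copied
	   by the remaining ones.  PC reads as the address of the ADD plus
	   PC_OFFSET, hence the compensation in the RSB.  */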
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

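	/* The shift moves bit 1 of count into C and leaves Z clear iff
	   bit 0 was set, so a trailing halfword is copied when C is set
	   and a final byte when Z is clear.  */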
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	.Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
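	/* The low three bits of DST, negated, give the number of bytes
	   needed to reach alignment; the RSBS and LSLS below decompose
	   that count into the flags, selecting a conditional word (MI),
	   halfword (CS) and byte (NE) copy.  */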
	lsls	tmp2, dst, #29
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blt	.Ltail63aligned

	cmp	tmp2, #512
	bge	.Lcpy_body_long

.Lcpy_body_medium:			/* Count in tmp2.  */
#ifdef USE_VFP
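	/* Copy 64 bytes per iteration, ping-ponging between d0 and d1 so
	   each store issues while the next load is still in flight.  */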
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bge	1b
	tst	tmp2, #0x3f
	beq	.Ldone

.Ltail63aligned:			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bge	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	add	src, src, #8
	add	dst, dst, #8

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
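	/* At most 7 bytes remain: optionally a word, then the same
	   halfword/byte flag trick as in .Ltail63unaligned.  */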
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

.Ldone:
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

.Lcpy_body_long:			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

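	/* Bias the count down by two batches of prefetch_lines * 64: one
	   batch is consumed per loop iteration below, the other covers the
	   drain sequence at 2: (and is added back before falling into
	   .Lcpy_body_medium).  */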
	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blt	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bge	1b

2:
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	.Lcpy_body_medium
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
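	/* B, C and D are callee-saved; spill them into the frame allocated
	   at entry, interleaved with the first round of loads.  */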
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	.Ltail63aligned
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

.Lcpy_notaligned:
	pld	[src]
	pld	[src, #64]
	/* There are at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
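	/* Same flag-based decomposition as above: copy a word (MI), then a
	   byte (NE) and a halfword (CS) as needed to align DST.  */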
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrmi	tmp2, [sp], #FRAME_SIZE
	bmi	.Ltail63unaligned
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
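	/* DST is now 64-bit aligned, so the stores can carry the :64
	   alignment hint (via the ALIGN macro), while the loads stay
	   unaligned vld1.8 accesses.  */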
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bmi	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bpl	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
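	/* SRC may not be doubleword aligned, so load in single words
	   (unaligned LDR is permitted; unaligned LDRD is not) and store
	   to the aligned DST with STRD.  */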
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	.Ltail63unaligned
	bx	lr

	.size	memcpy, . - memcpy