/*
 * memcpy - copy memory area
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

#include "../asmdefs.h"

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE    32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4
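
/* The computed branches below add an offset to PC.  In ARM state, reading PC
   yields the address of the current instruction plus PC_OFFSET bytes, and
   each unrolled copy step is a multiple of INSN_SIZE, hence the compensation
   terms in the RSB instructions that form the jump offsets.  */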

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
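/* Copy one 64-byte line: store the doublewords already held in \vreg and
   d0-d2, reloading each register as it is freed.  The second reload of \vreg
   fetches from prefetch_lines lines ahead, acting as a software prefetch.  */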
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

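/* As cpy_line_vfp, but without the look-ahead reload of \vreg; used for the
   final lines, where prefetching further would run past the end of the copy.  */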
	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

ENTRY (__memcpy_arm)

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bge	L(cpy_not_short)
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

L(tail63unaligned):
#ifdef USE_NEON
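	/* Computed branch into the unrolled sequence below.  Each VLD1/VST1
	   pair is two 4-byte instructions and copies 8 bytes, so jump forward
	   over 56 - (count & 0x38) bytes of code; the PC_OFFSET and INSN_SIZE
	   terms correct for the PC read-ahead relative to the first pair.  */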
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
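	/* Each LDR/STR pair below is two 4-byte instructions but copies only
	   4 bytes, so the code offset is twice the data offset: hence the
	   LSL #1 when adjusting PC.  SRC and DST are advanced first so the
	   pairs can use fixed negative offsets.  */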
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

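	/* Handle the last 0-3 bytes.  Shifting COUNT left by 31 moves bit 1
	   into the carry flag and bit 0 into N/Z, so the conditional halfword
	   and byte copies below execute only when those bits are set.  */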
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

L(cpy_not_short):
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	L(cpy_notaligned)

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
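	/* The low three bits of DST are shifted to the top of TMP2 and
	   negated; the resulting N and C flags select the word, halfword and
	   byte copies needed to reach 64-bit alignment without branching,
	   and COUNT is reduced by the number of bytes pre-copied.  */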
	lsls	tmp2, dst, #29
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blt	L(tail63aligned)

	cmp	tmp2, #512
	bge	L(cpy_body_long)

L(cpy_body_medium):			/* Count in tmp2.  */
#ifdef USE_VFP
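	/* Medium-sized aligned copy: move 64 bytes per iteration through two
	   VFP doubleword registers, alternating loads and stores so that each
	   store uses data loaded earlier in the iteration.  */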
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bge	1b
	tst	tmp2, #0x3f
	beq	L(done)

L(tail63aligned):			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
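	/* Medium-sized aligned copy without VFP: pre-bias SRC and DST by 8 so
	   the LDRD/STRD sequence can use fixed positive offsets, writing the
	   pointers back by 64 on the final pair of each iteration.  */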
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bge	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	add	src, src, #8
	add	dst, dst, #8

L(tail63aligned):			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead. */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

L(done):
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

L(cpy_body_long):			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */
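	/* D3-D7 each hold the first doubleword of one of the next five
	   64-byte lines and so provide the look-ahead; D0-D2 are the working
	   registers for the line currently being copied.  */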

	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blt	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bge	1b

2:
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	L(cpy_body_medium)
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
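	/* The loop is software pipelined: A-D hold the 32 bytes loaded on the
	   previous pass, so each pass stores them while loading the next 32.
	   B, C and D are callee-saved, so spill them into the stack frame
	   reserved above before use.  */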
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	L(tail63aligned)
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

L(cpy_notaligned):
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrmi	tmp2, [sp], #FRAME_SIZE
	bmi	L(tail63unaligned)
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
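	/* NEON loads handle the unaligned SRC directly; the stores can carry
	   64-bit alignment hints because DST was aligned above.  */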
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bmi	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bpl	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
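	/* SRC is only guaranteed byte alignment here, so the loads use single
	   LDRs (which tolerate unaligned addresses) while the stores can still
	   use STRD to the 64-bit aligned DST.  */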
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	L(tail63unaligned)
	bx	lr

END (__memcpy_arm)