xref: /openbsd/lib/libc/arch/arm/string/_memcpy.S (revision a6445c1d)
1/*	$OpenBSD: _memcpy.S,v 1.3 2008/06/26 05:42:04 ray Exp $	*/
2/*	$NetBSD: _memcpy.S,v 1.4 2003/04/05 23:08:52 bjh21 Exp $	*/
3
4/*-
5 * Copyright (c) 1997 The NetBSD Foundation, Inc.
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by Neil A. Carson and Mark Brinicombe
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33#include <machine/asm.h>
34
/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy(),
 * memmove() and bcopy().
 *
 * All local labels are prefixed with .Lmemcpy_
 * Following the prefix a label starting f is used in the forward copy code
 * while a label using b is used in the backwards copy code
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy.
 * unaligned source address
 * unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
 * Additional registers are preserved prior to use i.e. r4, r5 & lr
 *
 * Apologies for the state of the comments ;-)
 */
65
/*
 * _memcpy - overlap-tolerant copy core.
 *
 * In:	r0 = destination address
 *	r1 = source address
 *	r2 = length in bytes
 * Out:	r0 = destination address.  The forward path reloads it from the
 *	stack, the backward path ends with r0 walked back down to it, and
 *	the src == dest shortcut never modifies it.
 * Clobbers: r1-r3, r12 (ip) and the flags.  r4, r5 and lr are pushed
 *	before they are used as scratch and popped afterwards.
 *
 * Direction: src >= dest (unsigned) copies forwards from the start of
 * the buffers; src < dest copies backwards from their ends.
 *
 * Counter convention: r2 is kept biased below the true remaining count
 * (by 4, 12 or 32 depending on the stage) so that a single subs/adds
 * both updates it and sets the flags for the next size test.
 *
 * The unaligned-source loops build each output word from two adjacent
 * aligned source words; the shift directions used are the ones for
 * little-endian byte order.
 */
ENTRY(_memcpy)
	/* Determine copy direction */
	cmp	r1, r0
	bcc	.Lmemcpy_backwards	/* src < dest: copy from the top down */

	/*
	 * src == dest: nothing to move.  r0 is left alone so this path
	 * hands back the destination like every other path does.
	 */
	moveq	pc, lr

	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
	subs	r2, r2, #4		/* r2 = remaining - 4 */
	blt	.Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_fsrcul		/* oh unaligned source addr */

.Lmemcpy_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8		/* r2 = remaining - 12 */
	blt	.Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14		/* r2 = remaining - 32 */
	blt	.Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
	/* lr is free as scratch here: it was pushed on entry */
.Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_floop32

	cmn	r2, #0x10		/* ge <=> at least 16 bytes remain */
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_fl32:
	adds	r2, r2, #0x14		/* r2 = remaining - 12 */

	/* blat 12 bytes at a time */
.Lmemcpy_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemcpy_floop12

.Lmemcpy_fl12:
	adds	r2, r2, #8		/* r2 = remaining - 4 */
	blt	.Lmemcpy_fl4

	/* 4..11 bytes remain: move one word (lt) or two words (ge) */
	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* r2 = remaining (0..3) */
	ldmeqia	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2			/* always 1 byte; ge: 2nd; gt: 3rd */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemcpy_fdestul:
	rsb	r12, r12, #4		/* r12 = bytes up to next word boundary */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemcpy_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
	/*
	 * r12 = src & 3 (1..3).  r1 is rounded down to a word boundary and
	 * the word holding the first source bytes is preloaded into lr.
	 * Each output word is then the unused tail of the previous source
	 * word merged with the head of the next one.
	 */
.Lmemcpy_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_fsrcul3
	beq	.Lmemcpy_fsrcul2

	/* source is 1 byte past a word boundary */
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul1loop4
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

.Lmemcpy_fsrcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
	blt	.Lmemcpy_fsrcul1l4

.Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul1loop4

.Lmemcpy_fsrcul1l4:
	sub	r1, r1, #3		/* r1 = first source byte not yet copied */
	b	.Lmemcpy_fl4

	/* source is 2 bytes past a word boundary */
.Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul2loop4
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

.Lmemcpy_fsrcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
	blt	.Lmemcpy_fsrcul2l4

.Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul2loop4

.Lmemcpy_fsrcul2l4:
	sub	r1, r1, #2		/* r1 = first source byte not yet copied */
	b	.Lmemcpy_fl4

	/* source is 3 bytes past a word boundary */
.Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul3loop4
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

.Lmemcpy_fsrcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
	blt	.Lmemcpy_fsrcul3l4

.Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul3loop4

.Lmemcpy_fsrcul3l4:
	sub	r1, r1, #1		/* r1 = first source byte not yet copied */
	b	.Lmemcpy_fl4

	/*
	 * Backward copy: r0 and r1 are moved to one past the end of their
	 * buffers and every access pre-decrements.  Nothing is pushed on
	 * entry, so lr is saved around each stretch that uses it as
	 * scratch and the routine returns with mov pc, lr.
	 */
.Lmemcpy_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4		/* r2 = remaining - 4 */
	blt	.Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_bdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_bsrcul		/* oh unaligned source addr */

.Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8		/* r2 = remaining - 12 */
	blt	.Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	.Lmemcpy_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_bloop32

.Lmemcpy_bl32:
	cmn	r2, #0x10		/* ge <=> at least 16 bytes remain */
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14		/* r2 = remaining - 12 */
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, lr}

.Lmemcpy_bl12:
	adds	r2, r2, #8		/* r2 = remaining - 4 */
	blt	.Lmemcpy_bl4
	/* 4..11 bytes remain: move one word (lt) or two words (ge) */
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* r2 = remaining (0..3) */
	moveq	pc, lr			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2			/* always 1 byte; ge: 2nd; gt: 3rd */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	mov	pc, lr

	/* erg - unaligned destination */
.Lmemcpy_bdestul:
	cmp	r12, #2			/* r12 = bytes above the word boundary */

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	.Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	.Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
	/*
	 * r12 = src end & 3 (1..3).  r1 is rounded down to a word boundary
	 * and the word holding the last source bytes is preloaded into r3.
	 * Each output word is then the unused head of the previous source
	 * word merged with the tail of the next lower one.
	 */
.Lmemcpy_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	.Lmemcpy_bsrcul1
	beq	.Lmemcpy_bsrcul2

	/* source end is 3 bytes past a word boundary */
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul3loop4
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5, lr}

.Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
	blt	.Lmemcpy_bsrcul3l4

.Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul3loop4

.Lmemcpy_bsrcul3l4:
	add	r1, r1, #3		/* r1 = one past last uncopied source byte */
	b	.Lmemcpy_bl4

	/* source end is 2 bytes past a word boundary */
.Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul2loop4
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5, lr}

.Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
	blt	.Lmemcpy_bsrcul2l4

.Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul2loop4

.Lmemcpy_bsrcul2l4:
	add	r1, r1, #2		/* r1 = one past last uncopied source byte */
	b	.Lmemcpy_bl4

	/* source end is 1 byte past a word boundary */
.Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul1loop4
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5, lr}

.Lmemcpy_bsrcul1loop16:
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul1loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c		/* r2 = remaining - 4 */
	blt	.Lmemcpy_bsrcul1l4

.Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul1loop4

.Lmemcpy_bsrcul1l4:
	add	r1, r1, #1		/* r1 = one past last uncopied source byte */
	b	.Lmemcpy_bl4
463