xref: /openbsd/lib/libc/arch/arm/string/_memcpy.S (revision 8ead0783)
1/*	$OpenBSD: _memcpy.S,v 1.7 2017/10/29 02:21:33 guenther Exp $	*/
2/*	$NetBSD: _memcpy.S,v 1.4 2003/04/05 23:08:52 bjh21 Exp $	*/
3
4/*-
5 * Copyright (c) 1997 The NetBSD Foundation, Inc.
6 * All rights reserved.
7 *
8 * This code is derived from software contributed to The NetBSD Foundation
9 * by Neil A. Carson and Mark Brinicombe
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33#include "DEFS.h"
34
35/*
36 * This is one fun bit of code ...
37 * Some easy listening music is suggested while trying to understand this
38 * code e.g. Iron Maiden
39 *
40 * For anyone attempting to understand it :
41 *
42 * The core code is implemented here with simple stubs for memcpy()
43 * memmove() and bcopy().
44 *
45 * All local labels are prefixed with Lmemcpy_
46 * Following the prefix a label starting f is used in the forward copy code
47 * while a label using b is used in the backwards copy code
48 * The source and destination addresses determine whether a forward or
49 * backward copy is performed.
50 * Separate bits of code are used to deal with the following situations
51 * for both the forward and backwards copy.
52 * unaligned source address
53 * unaligned destination address
54 * Separate copy routines are used to produce an optimised result for each
55 * of these cases.
56 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
57 * a time where possible.
58 *
59 * Note: r12 (aka ip) can be trashed during the function along with
60 * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout.
61 * Additional registers are preserved prior to use i.e. r4, r5 & lr
62 *
63 * Apologies for the state of the comments ;-)
64 */
65
66.syntax unified
67
68.hidden _memcpy
69
70ENTRY(_memcpy)
	/* Copy core.  Register use as seen in the code below: */
	/*   r0 = destination pointer, r1 = source pointer, r2 = byte count. */
	/*   r3 and r12 are scratch; r4, r5 and lr are stacked before being borrowed. */
	/* Copies forwards (ascending addresses) unless src is below dest, in which */
	/* case both pointers are moved to the end and the copy runs downwards. */
	/* Throughout, r2 is kept biased below the real remaining count (by 4, 12 */
	/* or 32) so that the sign of a subs/adds result selects the next step. */
71	/* Determine copy direction */
72	cmp	r1, r0
73	bcc	.Lmemcpy_backwards	/* src below dest (unsigned compare): copy from the end downwards */
74
75	moveq	r0, #0			/* Quick abort for src == dest (EQ comes from the cmp above, not from len); r0 = 0 */
76	moveq	pc, lr
77
78	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
79	subs	r2, r2, #4		/* r2 = len - 4 */
80	blt	.Lmemcpy_fl4		/* less than 4 bytes */
81	ands	r12, r0, #3
82	bne	.Lmemcpy_fdestul	/* oh unaligned destination addr */
83	ands	r12, r1, #3
84	bne	.Lmemcpy_fsrcul		/* oh unaligned source addr */
85
86.Lmemcpy_ft8:
87	/* We have aligned source and destination */
88	subs	r2, r2, #8
89	blt	.Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
90	subs	r2, r2, #0x14
91	blt	.Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
92	stmdb	sp!, {r4}		/* borrow r4 */
93
94	/* blat 32 bytes at a time */
95	/* XXX for really big copies perhaps we should use more registers */
96.Lmemcpy_floop32:
97	ldmia	r1!, {r3, r4, r12, lr}
98	stmia	r0!, {r3, r4, r12, lr}
99	ldmia	r1!, {r3, r4, r12, lr}
100	stmia	r0!, {r3, r4, r12, lr}
101	subs	r2, r2, #0x20
102	bge	.Lmemcpy_floop32
103
104	cmn	r2, #0x10		/* GE when r2 + 16 >= 0, i.e. at least 16 bytes remain */
105	ldmiage	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
106	stmiage	r0!, {r3, r4, r12, lr}
107	subge	r2, r2, #0x10
108	ldmia	sp!, {r4}		/* return r4 */
109
110.Lmemcpy_fl32:
111	adds	r2, r2, #0x14		/* r2 = remaining - 12; flags gate the conditional copy below */
112
113	/* blat 12 bytes at a time */
114.Lmemcpy_floop12:
115	ldmiage	r1!, {r3, r12, lr}
116	stmiage	r0!, {r3, r12, lr}
117	subsge	r2, r2, #0x0c		/* only updates flags when a copy happened; otherwise LT persists and the loop exits */
118	bge	.Lmemcpy_floop12
119
120.Lmemcpy_fl12:
121	adds	r2, r2, #8		/* r2 = remaining - 4 */
122	blt	.Lmemcpy_fl4
123
124	subs	r2, r2, #4		/* r2 = remaining - 8: LT -> copy one word, GE -> copy two words */
125	ldrlt	r3, [r1], #4
126	strlt	r3, [r0], #4
127	ldmiage	r1!, {r3, r12}
128	stmiage	r0!, {r3, r12}
129	subge	r2, r2, #4		/* two words were copied: restore the "remaining - 4" bias */
130
131.Lmemcpy_fl4:
132	/* less than 4 bytes to go */
133	adds	r2, r2, #4		/* r2 = bytes remaining (0..3) */
134	ldmiaeq	sp!, {r0, pc}		/* done */
135
136	/* copy the crud byte at a time */
137	cmp	r2, #2			/* first byte always, second if r2 >= 2, third if r2 > 2 */
138	ldrb	r3, [r1], #1
139	strb	r3, [r0], #1
140	ldrbge	r3, [r1], #1
141	strbge	r3, [r0], #1
142	ldrbgt	r3, [r1], #1
143	strbgt	r3, [r0], #1
144	ldmia	sp!, {r0, pc}		/* restore dest into r0 and return */
145
146	/* erg - unaligned destination */
147.Lmemcpy_fdestul:
148	rsb	r12, r12, #4		/* r12 = 4 - (dest & 3) = bytes needed to word-align dest */
149	cmp	r12, #2
150
151	/* align destination with byte copies */
152	ldrb	r3, [r1], #1
153	strb	r3, [r0], #1
154	ldrbge	r3, [r1], #1
155	strbge	r3, [r0], #1
156	ldrbgt	r3, [r1], #1
157	strbgt	r3, [r0], #1
158	subs	r2, r2, r12		/* account for the alignment bytes just copied */
159	blt	.Lmemcpy_fl4		/* less than 4 bytes */
160
161	ands	r12, r1, #3
162	beq	.Lmemcpy_ft8		/* we have an aligned source */
163
164	/* erg - unaligned source */
165	/* This is where it gets nasty ... */
	/* dest is word aligned here and r12 = src & 3 (1..3).  Source words are */
	/* read aligned and each output word is built from two adjacent source */
	/* words with a shift pair.  NOTE(review): the shift directions look */
	/* little-endian specific - confirm before reusing on a big-endian target. */
166.Lmemcpy_fsrcul:
167	bic	r1, r1, #3		/* round src down to a word boundary */
168	ldr	lr, [r1], #4		/* lr = first (partially wanted) source word */
169	cmp	r12, #2			/* dispatch on src byte offset: 3, 2, or fall through for 1 */
170	bgt	.Lmemcpy_fsrcul3
171	beq	.Lmemcpy_fsrcul2
172	cmp	r2, #0x0c		/* r2 = remaining - 4: the 16-byte loop needs remaining >= 16 */
173	blt	.Lmemcpy_fsrcul1loop4
174	sub	r2, r2, #0x0c
175	stmdb	sp!, {r4, r5}		/* borrow r4, r5 */
176
177.Lmemcpy_fsrcul1loop16:		/* 16 bytes per pass; lr carries the leftover word into the next pass */
178	mov	r3, lr, lsr #8
179	ldmia	r1!, {r4, r5, r12, lr}
180	orr	r3, r3, r4, lsl #24
181	mov	r4, r4, lsr #8
182	orr	r4, r4, r5, lsl #24
183	mov	r5, r5, lsr #8
184	orr	r5, r5, r12, lsl #24
185	mov	r12, r12, lsr #8
186	orr	r12, r12, lr, lsl #24
187	stmia	r0!, {r3-r5, r12}
188	subs	r2, r2, #0x10
189	bge	.Lmemcpy_fsrcul1loop16
190	ldmia	sp!, {r4, r5}		/* return r4, r5 */
191	adds	r2, r2, #0x0c		/* back to the "remaining - 4" bias */
192	blt	.Lmemcpy_fsrcul1l4
193
194.Lmemcpy_fsrcul1loop4:		/* one word per pass, same shift-and-merge */
195	mov	r12, lr, lsr #8
196	ldr	lr, [r1], #4
197	orr	r12, r12, lr, lsl #24
198	str	r12, [r0], #4
199	subs	r2, r2, #4
200	bge	.Lmemcpy_fsrcul1loop4
201
202.Lmemcpy_fsrcul1l4:
203	sub	r1, r1, #3		/* step r1 back to the first byte not yet copied */
204	b	.Lmemcpy_fl4
205
206.Lmemcpy_fsrcul2:			/* src byte offset 2: same scheme with 16/16 bit shifts */
207	cmp	r2, #0x0c
208	blt	.Lmemcpy_fsrcul2loop4
209	sub	r2, r2, #0x0c
210	stmdb	sp!, {r4, r5}		/* borrow r4, r5 */
211
212.Lmemcpy_fsrcul2loop16:
213	mov	r3, lr, lsr #16
214	ldmia	r1!, {r4, r5, r12, lr}
215	orr	r3, r3, r4, lsl #16
216	mov	r4, r4, lsr #16
217	orr	r4, r4, r5, lsl #16
218	mov	r5, r5, lsr #16
219	orr	r5, r5, r12, lsl #16
220	mov	r12, r12, lsr #16
221	orr	r12, r12, lr, lsl #16
222	stmia	r0!, {r3-r5, r12}
223	subs	r2, r2, #0x10
224	bge	.Lmemcpy_fsrcul2loop16
225	ldmia	sp!, {r4, r5}		/* return r4, r5 */
226	adds	r2, r2, #0x0c
227	blt	.Lmemcpy_fsrcul2l4
228
229.Lmemcpy_fsrcul2loop4:
230	mov	r12, lr, lsr #16
231	ldr	lr, [r1], #4
232	orr	r12, r12, lr, lsl #16
233	str	r12, [r0], #4
234	subs	r2, r2, #4
235	bge	.Lmemcpy_fsrcul2loop4
236
237.Lmemcpy_fsrcul2l4:
238	sub	r1, r1, #2		/* step r1 back to the first byte not yet copied */
239	b	.Lmemcpy_fl4
240
241.Lmemcpy_fsrcul3:			/* src byte offset 3: same scheme with 24/8 bit shifts */
242	cmp	r2, #0x0c
243	blt	.Lmemcpy_fsrcul3loop4
244	sub	r2, r2, #0x0c
245	stmdb	sp!, {r4, r5}		/* borrow r4, r5 */
246
247.Lmemcpy_fsrcul3loop16:
248	mov	r3, lr, lsr #24
249	ldmia	r1!, {r4, r5, r12, lr}
250	orr	r3, r3, r4, lsl #8
251	mov	r4, r4, lsr #24
252	orr	r4, r4, r5, lsl #8
253	mov	r5, r5, lsr #24
254	orr	r5, r5, r12, lsl #8
255	mov	r12, r12, lsr #24
256	orr	r12, r12, lr, lsl #8
257	stmia	r0!, {r3-r5, r12}
258	subs	r2, r2, #0x10
259	bge	.Lmemcpy_fsrcul3loop16
260	ldmia	sp!, {r4, r5}		/* return r4, r5 */
261	adds	r2, r2, #0x0c
262	blt	.Lmemcpy_fsrcul3l4
263
264.Lmemcpy_fsrcul3loop4:
265	mov	r12, lr, lsr #24
266	ldr	lr, [r1], #4
267	orr	r12, r12, lr, lsl #8
268	str	r12, [r0], #4
269	subs	r2, r2, #4
270	bge	.Lmemcpy_fsrcul3loop4
271
272.Lmemcpy_fsrcul3l4:
273	sub	r1, r1, #1		/* step r1 back to the first byte not yet copied */
274	b	.Lmemcpy_fl4
275
276.Lmemcpy_backwards:
277	add	r1, r1, r2		/* r1 = one past the last source byte */
278	add	r0, r0, r2		/* r0 = one past the last destination byte */
279	subs	r2, r2, #4		/* r2 = len - 4 */
280	blt	.Lmemcpy_bl4		/* less than 4 bytes */
281	ands	r12, r0, #3
282	bne	.Lmemcpy_bdestul	/* oh unaligned destination addr */
283	ands	r12, r1, #3
284	bne	.Lmemcpy_bsrcul		/* oh unaligned source addr */
285
286.Lmemcpy_bt8:
287	/* We have aligned source and destination */
288	subs	r2, r2, #8
289	blt	.Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
290	stmdb	sp!, {r4, lr}		/* borrow r4 and lr (lr is not stacked at entry on this path) */
291	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
292	blt	.Lmemcpy_bl32
293
294	/* blat 32 bytes at a time */
295	/* XXX for really big copies perhaps we should use more registers */
296.Lmemcpy_bloop32:
297	ldmdb	r1!, {r3, r4, r12, lr}
298	stmdb	r0!, {r3, r4, r12, lr}
299	ldmdb	r1!, {r3, r4, r12, lr}
300	stmdb	r0!, {r3, r4, r12, lr}
301	subs	r2, r2, #0x20
302	bge	.Lmemcpy_bloop32
303
304.Lmemcpy_bl32:
305	cmn	r2, #0x10		/* GE when r2 + 16 >= 0, i.e. at least 16 bytes remain */
306	ldmdbge	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
307	stmdbge	r0!, {r3, r4, r12, lr}
308	subge	r2, r2, #0x10
309	adds	r2, r2, #0x14		/* r2 = remaining - 12 */
310	ldmdbge	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
311	stmdbge	r0!, {r3, r12, lr}
312	subge	r2, r2, #0x0c
313	ldmia	sp!, {r4, lr}		/* return r4 and lr */
314
315.Lmemcpy_bl12:
316	adds	r2, r2, #8		/* r2 = remaining - 4 */
317	blt	.Lmemcpy_bl4
318	subs	r2, r2, #4		/* r2 = remaining - 8: LT -> copy one word, GE -> copy two words */
319	ldrlt	r3, [r1, #-4]!
320	strlt	r3, [r0, #-4]!
321	ldmdbge	r1!, {r3, r12}
322	stmdbge	r0!, {r3, r12}
323	subge	r2, r2, #4		/* two words were copied: restore the "remaining - 4" bias */
324
325.Lmemcpy_bl4:
326	/* less than 4 bytes to go */
327	adds	r2, r2, #4		/* r2 = bytes remaining (0..3) */
328	moveq	pc, lr			/* done */
329
330	/* copy the crud byte at a time */
331	cmp	r2, #2			/* first byte always, second if r2 >= 2, third if r2 > 2 */
332	ldrb	r3, [r1, #-1]!
333	strb	r3, [r0, #-1]!
334	ldrbge	r3, [r1, #-1]!
335	strbge	r3, [r0, #-1]!
336	ldrbgt	r3, [r1, #-1]!
337	strbgt	r3, [r0, #-1]!
338	mov	pc, lr			/* return */
339
340	/* erg - unaligned destination */
341.Lmemcpy_bdestul:
342	cmp	r12, #2			/* r12 = (dest end) & 3 = bytes to copy to word-align dest going down */
343
344	/* align destination with byte copies */
345	ldrb	r3, [r1, #-1]!
346	strb	r3, [r0, #-1]!
347	ldrbge	r3, [r1, #-1]!
348	strbge	r3, [r0, #-1]!
349	ldrbgt	r3, [r1, #-1]!
350	strbgt	r3, [r0, #-1]!
351	subs	r2, r2, r12		/* account for the alignment bytes just copied */
352	blt	.Lmemcpy_bl4		/* less than 4 bytes to go */
353	ands	r12, r1, #3
354	beq	.Lmemcpy_bt8		/* we have an aligned source */
355
356	/* erg - unaligned source */
357	/* This is where it gets nasty ... */
	/* Mirror image of the forward unaligned-source code: dest is word aligned, */
	/* r12 = (src end) & 3, and r3 carries the partially used source word. */
358.Lmemcpy_bsrcul:
359	bic	r1, r1, #3		/* round src down to a word boundary */
360	ldr	r3, [r1, #0]		/* r3 = last (partially wanted) source word */
361	cmp	r12, #2			/* dispatch on src byte offset: 1, 2, or fall through for 3 */
362	blt	.Lmemcpy_bsrcul1
363	beq	.Lmemcpy_bsrcul2
364	cmp	r2, #0x0c		/* r2 = remaining - 4: the 16-byte loop needs remaining >= 16 */
365	blt	.Lmemcpy_bsrcul3loop4
366	sub	r2, r2, #0x0c
367	stmdb	sp!, {r4, r5, lr}	/* borrow r4, r5 and lr */
368
369.Lmemcpy_bsrcul3loop16:		/* 16 bytes per pass; r3 carries the leftover word into the next pass */
370	mov	lr, r3, lsl #8
371	ldmdb	r1!, {r3-r5, r12}
372	orr	lr, lr, r12, lsr #24
373	mov	r12, r12, lsl #8
374	orr	r12, r12, r5, lsr #24
375	mov	r5, r5, lsl #8
376	orr	r5, r5, r4, lsr #24
377	mov	r4, r4, lsl #8
378	orr	r4, r4, r3, lsr #24
379	stmdb	r0!, {r4, r5, r12, lr}
380	subs	r2, r2, #0x10
381	bge	.Lmemcpy_bsrcul3loop16
382	ldmia	sp!, {r4, r5, lr}	/* return r4, r5 and lr */
383	adds	r2, r2, #0x0c		/* back to the "remaining - 4" bias */
384	blt	.Lmemcpy_bsrcul3l4
385
386.Lmemcpy_bsrcul3loop4:		/* one word per pass, same shift-and-merge */
387	mov	r12, r3, lsl #8
388	ldr	r3, [r1, #-4]!
389	orr	r12, r12, r3, lsr #24
390	str	r12, [r0, #-4]!
391	subs	r2, r2, #4
392	bge	.Lmemcpy_bsrcul3loop4
393
394.Lmemcpy_bsrcul3l4:
395	add	r1, r1, #3		/* step r1 forward to just past the last byte not yet copied */
396	b	.Lmemcpy_bl4
397
398.Lmemcpy_bsrcul2:			/* src byte offset 2: same scheme with 16/16 bit shifts */
399	cmp	r2, #0x0c
400	blt	.Lmemcpy_bsrcul2loop4
401	sub	r2, r2, #0x0c
402	stmdb	sp!, {r4, r5, lr}	/* borrow r4, r5 and lr */
403
404.Lmemcpy_bsrcul2loop16:
405	mov	lr, r3, lsl #16
406	ldmdb	r1!, {r3-r5, r12}
407	orr	lr, lr, r12, lsr #16
408	mov	r12, r12, lsl #16
409	orr	r12, r12, r5, lsr #16
410	mov	r5, r5, lsl #16
411	orr	r5, r5, r4, lsr #16
412	mov	r4, r4, lsl #16
413	orr	r4, r4, r3, lsr #16
414	stmdb	r0!, {r4, r5, r12, lr}
415	subs	r2, r2, #0x10
416	bge	.Lmemcpy_bsrcul2loop16
417	ldmia	sp!, {r4, r5, lr}	/* return r4, r5 and lr */
418	adds	r2, r2, #0x0c
419	blt	.Lmemcpy_bsrcul2l4
420
421.Lmemcpy_bsrcul2loop4:
422	mov	r12, r3, lsl #16
423	ldr	r3, [r1, #-4]!
424	orr	r12, r12, r3, lsr #16
425	str	r12, [r0, #-4]!
426	subs	r2, r2, #4
427	bge	.Lmemcpy_bsrcul2loop4
428
429.Lmemcpy_bsrcul2l4:
430	add	r1, r1, #2		/* step r1 forward to just past the last byte not yet copied */
431	b	.Lmemcpy_bl4
432
433.Lmemcpy_bsrcul1:			/* src byte offset 1: same scheme with 24/8 bit shifts */
434	cmp	r2, #0x0c
435	blt	.Lmemcpy_bsrcul1loop4
436	sub	r2, r2, #0x0c
437	stmdb	sp!, {r4, r5, lr}	/* borrow r4, r5 and lr */
438
439.Lmemcpy_bsrcul1loop32:		/* despite the label name, each pass moves 16 bytes (four registers, r2 -= 0x10) */
440	mov	lr, r3, lsl #24
441	ldmdb	r1!, {r3-r5, r12}
442	orr	lr, lr, r12, lsr #8
443	mov	r12, r12, lsl #24
444	orr	r12, r12, r5, lsr #8
445	mov	r5, r5, lsl #24
446	orr	r5, r5, r4, lsr #8
447	mov	r4, r4, lsl #24
448	orr	r4, r4, r3, lsr #8
449	stmdb	r0!, {r4, r5, r12, lr}
450	subs	r2, r2, #0x10
451	bge	.Lmemcpy_bsrcul1loop32
452	ldmia	sp!, {r4, r5, lr}	/* return r4, r5 and lr */
453	adds	r2, r2, #0x0c
454	blt	.Lmemcpy_bsrcul1l4
455
456.Lmemcpy_bsrcul1loop4:
457	mov	r12, r3, lsl #24
458	ldr	r3, [r1, #-4]!
459	orr	r12, r12, r3, lsr #8
460	str	r12, [r0, #-4]!
461	subs	r2, r2, #4
462	bge	.Lmemcpy_bsrcul1loop4
463
464.Lmemcpy_bsrcul1l4:
465	add	r1, r1, #1		/* step r1 forward to just past the last byte not yet copied */
466	b	.Lmemcpy_bl4
467END(_memcpy)
468