xref: /freebsd/sys/arm/arm/support.S (revision 3157ba21)
/*-
 * Copyright (c) 2004 Olivier Houchard
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>
#include <machine/asmacros.h>
__FBSDID("$FreeBSD$");

#include "assym.s"

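/*
 * Literal pool: addresses of the optional platform bulk-copy hooks
 * (_arm_memcpy/_arm_bzero) and of the minimum sizes below which
 * calling them is not worthwhile.  A NULL hook pointer means none is
 * registered; a platform may install one (a DMA-assisted copy engine,
 * for example) at runtime.
 */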
.L_arm_memcpy:
	.word	_C_LABEL(_arm_memcpy)
.L_arm_bzero:
	.word	_C_LABEL(_arm_bzero)
.L_min_memcpy_size:
	.word	_C_LABEL(_min_memcpy_size)
.L_min_bzero_size:
	.word	_C_LABEL(_min_bzero_size)
/*
 * memset: Sets a block of memory to the specified value
 *
 * On entry:
 *   r0 - dest address
 *   r1 - byte to write
 *   r2 - number of bytes to write
 *
 * On exit:
 *   r0 - dest address
 */
/* LINTSTUB: Func: void bzero(void *, size_t) */
ENTRY(bzero)
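	/*
	 * If a platform _arm_bzero hook is registered and the length
	 * is at least _min_bzero_size, try the hook first; it returns
	 * zero when it has handled the whole request.
	 */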
	ldr	r3, .L_arm_bzero
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal0
	ldr	r2, .L_min_bzero_size
	ldr	r2, [r2]
	cmp	r1, r2
	blt	.Lnormal0
	stmfd	sp!, {r0, r1, lr}
	mov	r2, #0
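	/*
	 * Pre-ARMv5 indirect call: reading pc yields ".+8", so
	 * "mov lr, pc" sets the return address to the instruction
	 * just after "mov pc, r3" before branching to the hook.
	 */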
	mov	lr, pc
	mov	pc, r3
	cmp	r0, #0
	ldmfd	sp!, {r0, r1, lr}
	RETeq
.Lnormal0:
	mov	r3, #0x00
	b	do_memset

/* LINTSTUB: Func: void *memset(void *, int, size_t) */
ENTRY(memset)
	and	r3, r1, #0xff		/* We deal with bytes */
	mov	r1, r2
do_memset:
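	/*
	 * Common tail for bzero/memset.  At this point r3 holds the
	 * fill byte and r1 the length; r0 is kept intact as the
	 * return value, with ip serving as the store cursor.
	 */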
	cmp	r1, #0x04		/* Do we have less than 4 bytes */
	mov	ip, r0
	blt	.Lmemset_lessthanfour

	/* Ok first we will word align the address */
	ands	r2, ip, #0x03		/* Get the bottom two bits */
	bne	.Lmemset_wordunaligned	/* The address is not word aligned */

	/* We are now word aligned */
.Lmemset_wordaligned:
	orr	r3, r3, r3, lsl #8	/* Extend value to 16-bits */
#ifdef _ARM_ARCH_5E
	tst	ip, #0x04		/* Quad-align for armv5e */
#else
	cmp	r1, #0x10
#endif
	orr	r3, r3, r3, lsl #16	/* Extend value to 32-bits */
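	/* e.g. fill byte 0xab: 0x000000ab -> 0x0000abab -> 0xabababab */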
#ifdef _ARM_ARCH_5E
	subne	r1, r1, #0x04		/* Quad-align if necessary */
	strne	r3, [ip], #0x04
	cmp	r1, #0x10
#endif
	blt	.Lmemset_loop4		/* If less than 16 then use words */
	mov	r2, r3			/* Duplicate data */
	cmp	r1, #0x80		/* If < 128 then skip the big loop */
	blt	.Lmemset_loop32

	/* Do 128 bytes at a time */
.Lmemset_loop128:
	subs	r1, r1, #0x80
#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop128
	RETeq			/* Zero length so just exit */

	add	r1, r1, #0x80		/* Adjust for extra sub */

	/* Do 32 bytes at a time */
.Lmemset_loop32:
	subs	r1, r1, #0x20
#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop32
	RETeq			/* Zero length so just exit */

	adds	r1, r1, #0x10		/* Partially adjust for extra sub */

	/* Deal with 16 bytes or more */
#ifdef _ARM_ARCH_5E
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	RETeq			/* Zero length so just exit */

	addlt	r1, r1, #0x10		/* Possibly adjust for extra sub */

	/* We have at least 4 bytes so copy as words */
.Lmemset_loop4:
	subs	r1, r1, #0x04
	strge	r3, [ip], #0x04
	bgt	.Lmemset_loop4
	RETeq			/* Zero length so just exit */

#ifdef _ARM_ARCH_5E
	/* Compensate for 64-bit alignment check */
	adds	r1, r1, #0x04
	RETeq
	cmp	r1, #2
#else
	cmp	r1, #-2
#endif

	strb	r3, [ip], #0x01		/* Set 1 byte */
	strgeb	r3, [ip], #0x01		/* Set another byte */
	strgtb	r3, [ip]		/* and a third */
	RET			/* Exit */

.Lmemset_wordunaligned:
	rsb	r2, r2, #0x004
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r2, #0x02
	strgeb	r3, [ip], #0x01		/* Set another byte */
	sub	r1, r1, r2
	strgtb	r3, [ip], #0x01		/* and a third */
	cmp	r1, #0x04		/* More than 4 bytes left? */
	bge	.Lmemset_wordaligned	/* Yup */

.Lmemset_lessthanfour:
	cmp	r1, #0x00
	RETeq			/* Zero length so exit */
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r1, #0x02
	strgeb	r3, [ip], #0x01		/* Set another byte */
	strgtb	r3, [ip]		/* and a third */
	RET			/* Exit */

ENTRY(bcmp)
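	/*
	 * The .Lmemcmp_* labels suggest this routine started life as
	 * memcmp; returning the byte difference rather than just
	 * zero/non-zero also satisfies bcmp's weaker contract.
	 */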
	mov	ip, r0
	cmp	r2, #0x06
	beq	.Lmemcmp_6bytes
	mov	r0, #0x00

	/* Are both addresses aligned the same way? */
	cmp	r2, #0x00
	eornes	r3, ip, r1
	RETeq			/* len == 0, or same addresses! */
	tst	r3, #0x03
	subne	r2, r2, #0x01
	bne	.Lmemcmp_bytewise2	/* Badly aligned. Do it the slow way */

	/* Word-align the addresses, if necessary */
	sub	r3, r1, #0x05
	ands	r3, r3, #0x03
	add	r3, r3, r3, lsl #1
	addne	pc, pc, r3, lsl #3
	nop
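	/*
	 * Computed dispatch: r3 = 3 * ((r1 - 5) & 3), and pc reads as
	 * ".+8" (just past the nop), so "addne pc, pc, r3, lsl #3"
	 * skips ((r1 - 5) & 3) of the 24-byte (six-instruction)
	 * stanzas below, leaving exactly the number of leading byte
	 * compares needed to word-align the pointers.
	 */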

	/* Compare up to 3 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare up to 2 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 1 byte */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 4 bytes at a time, if possible */
	subs	r2, r2, #0x04
	bcc	.Lmemcmp_bytewise
.Lmemcmp_word_aligned:
	ldr	r0, [ip], #0x04
	ldr	r3, [r1], #0x04
	subs	r2, r2, #0x04
	cmpcs	r0, r3
	beq	.Lmemcmp_word_aligned
	sub	r0, r0, r3

	/* Correct for extra subtraction, and check if done */
	adds	r2, r2, #0x04
	cmpeq	r0, #0x00		/* If done, did all bytes match? */
	RETeq			/* Yup. Just return */

	/* Re-do the final word byte-wise */
	sub	ip, ip, #0x04
	sub	r1, r1, #0x04

.Lmemcmp_bytewise:
	add	r2, r2, #0x03
.Lmemcmp_bytewise2:
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r2, r2, #0x01
	cmpcs	r0, r3
	beq	.Lmemcmp_bytewise2
	sub	r0, r0, r3
	RET

	/*
	 * 6 byte compares are very common, thanks to the network stack.
	 * This code is hand-scheduled to reduce the number of stalls for
	 * load results. Everything else being equal, this will be ~32%
	 * faster than a byte-wise memcmp.
	 */
	.align	5
.Lmemcmp_6bytes:
	ldrb	r3, [r1, #0x00]		/* r3 = b2#0 */
	ldrb	r0, [ip, #0x00]		/* r0 = b1#0 */
	ldrb	r2, [r1, #0x01]		/* r2 = b2#1 */
	subs	r0, r0, r3		/* r0 = b1#0 - b2#0 */
	ldreqb	r3, [ip, #0x01]		/* r3 = b1#1 */
	RETne			/* Return if mismatch on #0 */
	subs	r0, r3, r2		/* r0 = b1#1 - b2#1 */
	ldreqb	r3, [r1, #0x02]		/* r3 = b2#2 */
	ldreqb	r0, [ip, #0x02]		/* r0 = b1#2 */
	RETne			/* Return if mismatch on #1 */
	ldrb	r2, [r1, #0x03]		/* r2 = b2#3 */
	subs	r0, r0, r3		/* r0 = b1#2 - b2#2 */
	ldreqb	r3, [ip, #0x03]		/* r3 = b1#3 */
	RETne			/* Return if mismatch on #2 */
	subs	r0, r3, r2		/* r0 = b1#3 - b2#3 */
	ldreqb	r3, [r1, #0x04]		/* r3 = b2#4 */
	ldreqb	r0, [ip, #0x04]		/* r0 = b1#4 */
	RETne			/* Return if mismatch on #3 */
	ldrb	r2, [r1, #0x05]		/* r2 = b2#5 */
	subs	r0, r0, r3		/* r0 = b1#4 - b2#4 */
	ldreqb	r3, [ip, #0x05]		/* r3 = b1#5 */
	RETne			/* Return if mismatch on #4 */
	sub	r0, r3, r2		/* r0 = b1#5 - b2#5 */
	RET

ENTRY(bcopy)
	/* switch the source and destination registers */
	eor     r0, r1, r0
	eor     r1, r0, r1
	eor     r0, r1, r0
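	/*
	 * Three-XOR swap: exchanges r0 and r1 without a scratch
	 * register, then falls straight through into memmove.
	 */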
ENTRY(memmove)
	/* Do the buffers overlap? */
	cmp	r0, r1
	RETeq		/* Bail now if src/dst are the same */
	subcc	r3, r0, r1	/* if (dst > src) r3 = dst - src */
	subcs	r3, r1, r0	/* if (src > dst) r3 = src - dst */
	cmp	r3, r2		/* if (r3 < len) we have an overlap */
	bcc	PIC_SYM(_C_LABEL(memcpy), PLT)

	/* Determine copy direction */
	cmp	r1, r0
	bcc	.Lmemmove_backwards

	moveq	r0, #0			/* Quick abort for len=0 */
	RETeq

	stmdb	sp!, {r0, lr}		/* memmove() returns dest addr */
	subs	r2, r2, #4
	blt	.Lmemmove_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_fsrcul		/* oh unaligned source addr */

.Lmemmove_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemmove_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_floop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemmove_fl32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemmove_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemmove_floop12

.Lmemmove_fl12:
	adds	r2, r2, #8
	blt	.Lmemmove_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	ldmeqia	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemmove_fdestul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemmove_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemmove_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
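	/*
	 * Technique: round the source pointer down to a word
	 * boundary, then build each destination word from two
	 * adjacent aligned loads, shifting by the misalignment
	 * (8, 16 or 24 bits) and OR-ing the pieces together; the
	 * main loops splice 16 bytes per iteration.
	 */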
.Lmemmove_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemmove_fsrcul3
	beq	.Lmemmove_fsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul1loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #8
#else
	mov	r3, lr, lsr #8
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, lr, lsr #24
#else
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul1l4

.Lmemmove_fsrcul1loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #8
#else
	mov	r12, lr, lsr #8
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #24
#else
	orr	r12, r12, lr, lsl #24
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul1loop4

.Lmemmove_fsrcul1l4:
	sub	r1, r1, #3
	b	.Lmemmove_fl4

.Lmemmove_fsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul2loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #16
#else
	mov	r3, lr, lsr #16
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, lr, lsr #16
#else
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul2l4

.Lmemmove_fsrcul2loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #16
#else
	mov	r12, lr, lsr #16
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #16
#else
	orr	r12, r12, lr, lsl #16
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul2loop4

.Lmemmove_fsrcul2l4:
	sub	r1, r1, #2
	b	.Lmemmove_fl4

.Lmemmove_fsrcul3:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul3loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #24
#else
	mov	r3, lr, lsr #24
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, lr, lsr #8
#else
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul3l4

.Lmemmove_fsrcul3loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #24
#else
	mov	r12, lr, lsr #24
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #8
#else
	orr	r12, r12, lr, lsl #8
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul3loop4

.Lmemmove_fsrcul3l4:
	sub	r1, r1, #1
	b	.Lmemmove_fl4

.Lmemmove_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	.Lmemmove_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_bdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_bsrcul		/* oh unaligned source addr */

.Lmemmove_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	.Lmemmove_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_bloop32

.Lmemmove_bl32:
	cmn	r2, #0x10
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, lr}

.Lmemmove_bl12:
	adds	r2, r2, #8
	blt	.Lmemmove_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	RETeq			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	RET

	/* erg - unaligned destination */
.Lmemmove_bdestul:
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	.Lmemmove_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	.Lmemmove_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
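	/*
	 * Backwards variant of the word-splicing above: words are
	 * loaded at descending addresses and the shift directions
	 * are reversed accordingly.
	 */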
.Lmemmove_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	.Lmemmove_bsrcul1
	beq	.Lmemmove_bsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul3loop16:
#ifdef __ARMEB__
	mov	lr, r3, lsr #8
#else
	mov	lr, r3, lsl #8
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r3, lsl #24
#else
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul3l4

.Lmemmove_bsrcul3loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #8
#else
	mov	r12, r3, lsl #8
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #24
#else
	orr	r12, r12, r3, lsr #24
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul3loop4

.Lmemmove_bsrcul3l4:
	add	r1, r1, #3
	b	.Lmemmove_bl4

.Lmemmove_bsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul2loop16:
#ifdef __ARMEB__
	mov	lr, r3, lsr #16
#else
	mov	lr, r3, lsl #16
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r3, lsl #16
#else
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul2l4

.Lmemmove_bsrcul2loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #16
#else
	mov	r12, r3, lsl #16
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #16
#else
	orr	r12, r12, r3, lsr #16
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul2loop4

.Lmemmove_bsrcul2l4:
	add	r1, r1, #2
	b	.Lmemmove_bl4

.Lmemmove_bsrcul1:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul1loop32:
#ifdef __ARMEB__
	mov	lr, r3, lsr #24
#else
	mov	lr, r3, lsl #24
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r3, lsl #8
#else
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul1l4

.Lmemmove_bsrcul1loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #24
#else
	mov	r12, r3, lsl #24
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #8
#else
	orr	r12, r12, r3, lsr #8
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul1loop4

.Lmemmove_bsrcul1l4:
	add	r1, r1, #1
	b	.Lmemmove_bl4

#if !defined(_ARM_ARCH_5E)
ENTRY(memcpy)
	/* save leaf functions having to store this away */
	/* Do not check arm_memcpy if we're running from flash */
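	/*
	 * As in bzero above: hand large copies to the optional
	 * _arm_memcpy hook when one is registered, falling back to
	 * the inline code if it is absent or returns non-zero.
	 */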
#ifdef FLASHADDR
#if FLASHADDR > PHYSADDR
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bls	.Lnormal
#else
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bhi	.Lnormal
#endif
#endif
	ldr	r3, .L_arm_memcpy
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal
	ldr	r3, .L_min_memcpy_size
	ldr	r3, [r3]
	cmp	r2, r3
	blt	.Lnormal
	stmfd	sp!, {r0-r2, r4, lr}
	mov	r3, #0
	ldr	r4, .L_arm_memcpy
	mov	lr, pc
	ldr	pc, [r4]
	cmp	r0, #0
	ldmfd	sp!, {r0-r2, r4, lr}
	RETeq

.Lnormal:
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	subs	r2, r2, #4
	blt	.Lmemcpy_l4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_srcul		/* oh unaligned source addr */

.Lmemcpy_t8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_loop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_loop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_l32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemcpy_loop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemcpy_loop12

.Lmemcpy_l12:
	adds	r2, r2, #8
	blt	.Lmemcpy_l4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_l4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
#ifdef __APCS_26__
	ldmeqia sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemcpy_destul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemcpy_l4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemcpy_t8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
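	/*
	 * Same aligned-load-and-splice scheme as .Lmemmove_fsrcul
	 * above.
	 */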
.Lmemcpy_srcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_srcul3
	beq	.Lmemcpy_srcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul1l4

.Lmemcpy_srcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul1loop4

.Lmemcpy_srcul1l4:
	sub	r1, r1, #3
	b	.Lmemcpy_l4

.Lmemcpy_srcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul2l4

.Lmemcpy_srcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul2loop4

.Lmemcpy_srcul2l4:
	sub	r1, r1, #2
	b	.Lmemcpy_l4

.Lmemcpy_srcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul3l4

.Lmemcpy_srcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul3loop4

.Lmemcpy_srcul3l4:
	sub	r1, r1, #1
	b	.Lmemcpy_l4
#else
/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
ENTRY(memcpy)
	pld	[r1]
	cmp	r2, #0x0c
	ble	.Lmemcpy_short		/* <= 12 bytes */
#ifdef FLASHADDR
#if FLASHADDR > PHYSADDR
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bls	.Lnormal
#else
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bhi	.Lnormal
#endif
#endif
	ldr	r3, .L_arm_memcpy
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal
	ldr	r3, .L_min_memcpy_size
	ldr	r3, [r3]
	cmp	r2, r3
	blt	.Lnormal
	stmfd	sp!, {r0-r2, r4, lr}
	mov	r3, #0
	ldr	r4, .L_arm_memcpy
	mov	lr, pc
	ldr	pc, [r4]
	cmp	r0, #0
	ldmfd	sp!, {r0-r2, r4, lr}
	RETeq
.Lnormal:
	mov	r3, r0			/* We must not clobber r0 */

	/* Word-align the destination buffer */
	ands	ip, r3, #0x03		/* Already word aligned? */
	beq	.Lmemcpy_wordaligned	/* Yup */
	cmp	ip, #0x02
	ldrb	ip, [r1], #0x01
	sub	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrleb	ip, [r1], #0x01
	suble	r2, r2, #0x01
	strleb	ip, [r3], #0x01
	ldrltb	ip, [r1], #0x01
	sublt	r2, r2, #0x01
	strltb	ip, [r3], #0x01

	/* Destination buffer is now word aligned */
.Lmemcpy_wordaligned:
	ands	ip, r1, #0x03		/* Is src also word-aligned? */
	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */

	/* Quad-align the destination buffer */
	tst	r3, #0x07		/* Already quad aligned? */
	ldrne	ip, [r1], #0x04
	stmfd	sp!, {r4-r9}		/* Free up some registers */
	subne	r2, r2, #0x04
	strne	ip, [r3], #0x04
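	/*
	 * strd needs a 64-bit aligned address, hence the quad-align
	 * step above; the even/odd register pairs (r4/r5, r6/r7,
	 * r8/r9) loaded below feed each strd.
	 */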

	/* Destination buffer quad aligned, source is at least word aligned */
	subs	r2, r2, #0x80
	blt	.Lmemcpy_w_lessthan128

	/* Copy 128 bytes at a time */
.Lmemcpy_w_loop128:
	ldr	r4, [r1], #0x04		/* LD:00-03 */
	ldr	r5, [r1], #0x04		/* LD:04-07 */
	pld	[r1, #0x18]		/* Prefetch 0x20 */
	ldr	r6, [r1], #0x04		/* LD:08-0b */
	ldr	r7, [r1], #0x04		/* LD:0c-0f */
	ldr	r8, [r1], #0x04		/* LD:10-13 */
	ldr	r9, [r1], #0x04		/* LD:14-17 */
	strd	r4, [r3], #0x08		/* ST:00-07 */
	ldr	r4, [r1], #0x04		/* LD:18-1b */
	ldr	r5, [r1], #0x04		/* LD:1c-1f */
	strd	r6, [r3], #0x08		/* ST:08-0f */
	ldr	r6, [r1], #0x04		/* LD:20-23 */
	ldr	r7, [r1], #0x04		/* LD:24-27 */
	pld	[r1, #0x18]		/* Prefetch 0x40 */
	strd	r8, [r3], #0x08		/* ST:10-17 */
	ldr	r8, [r1], #0x04		/* LD:28-2b */
	ldr	r9, [r1], #0x04		/* LD:2c-2f */
	strd	r4, [r3], #0x08		/* ST:18-1f */
	ldr	r4, [r1], #0x04		/* LD:30-33 */
	ldr	r5, [r1], #0x04		/* LD:34-37 */
	strd	r6, [r3], #0x08		/* ST:20-27 */
	ldr	r6, [r1], #0x04		/* LD:38-3b */
	ldr	r7, [r1], #0x04		/* LD:3c-3f */
	strd	r8, [r3], #0x08		/* ST:28-2f */
	ldr	r8, [r1], #0x04		/* LD:40-43 */
	ldr	r9, [r1], #0x04		/* LD:44-47 */
	pld	[r1, #0x18]		/* Prefetch 0x60 */
	strd	r4, [r3], #0x08		/* ST:30-37 */
	ldr	r4, [r1], #0x04		/* LD:48-4b */
	ldr	r5, [r1], #0x04		/* LD:4c-4f */
	strd	r6, [r3], #0x08		/* ST:38-3f */
	ldr	r6, [r1], #0x04		/* LD:50-53 */
	ldr	r7, [r1], #0x04		/* LD:54-57 */
	strd	r8, [r3], #0x08		/* ST:40-47 */
	ldr	r8, [r1], #0x04		/* LD:58-5b */
	ldr	r9, [r1], #0x04		/* LD:5c-5f */
	strd	r4, [r3], #0x08		/* ST:48-4f */
	ldr	r4, [r1], #0x04		/* LD:60-63 */
	ldr	r5, [r1], #0x04		/* LD:64-67 */
	pld	[r1, #0x18]		/* Prefetch 0x80 */
	strd	r6, [r3], #0x08		/* ST:50-57 */
	ldr	r6, [r1], #0x04		/* LD:68-6b */
	ldr	r7, [r1], #0x04		/* LD:6c-6f */
	strd	r8, [r3], #0x08		/* ST:58-5f */
	ldr	r8, [r1], #0x04		/* LD:70-73 */
	ldr	r9, [r1], #0x04		/* LD:74-77 */
	strd	r4, [r3], #0x08		/* ST:60-67 */
	ldr	r4, [r1], #0x04		/* LD:78-7b */
	ldr	r5, [r1], #0x04		/* LD:7c-7f */
	strd	r6, [r3], #0x08		/* ST:68-6f */
	strd	r8, [r3], #0x08		/* ST:70-77 */
	subs	r2, r2, #0x80
	strd	r4, [r3], #0x08		/* ST:78-7f */
	bge	.Lmemcpy_w_loop128

.Lmemcpy_w_lessthan128:
	adds	r2, r2, #0x80		/* Adjust for extra sub */
	ldmeqfd	sp!, {r4-r9}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x20
	blt	.Lmemcpy_w_lessthan32

	/* Copy 32 bytes at a time */
.Lmemcpy_w_loop32:
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	pld	[r1, #0x18]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	r8, [r1], #0x04
	ldr	r9, [r1], #0x04
	strd	r4, [r3], #0x08
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	strd	r6, [r3], #0x08
	strd	r8, [r3], #0x08
	subs	r2, r2, #0x20
	strd	r4, [r3], #0x08
	bge	.Lmemcpy_w_loop32

.Lmemcpy_w_lessthan32:
	adds	r2, r2, #0x20		/* Adjust for extra sub */
	ldmeqfd	sp!, {r4-r9}
	RETeq			/* Return now if done */

	and	r4, r2, #0x18
	rsbs	r4, r4, #0x18
	addne	pc, pc, r4, lsl #1
	nop
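	/*
	 * Computed skip: r4 = 0x18 - (remaining & 0x18), and each
	 * stanza below is four instructions (16 bytes) copying 8
	 * bytes, so with pc reading ".+8" the doubled r4 jumps past
	 * the stanzas that are not needed.
	 */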

	/* At least 24 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 16 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 8 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	subs	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* Less than 8 bytes remaining */
	ldmfd	sp!, {r4-r9}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	ldrge	ip, [r1], #0x04
	strge	ip, [r3], #0x04
	RETeq			/* Return now if done */
	addlt	r2, r2, #0x04
	ldrb	ip, [r1], #0x01
	cmp	r2, #0x02
	ldrgeb	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrgtb	ip, [r1]
	strgeb	r2, [r3], #0x01
	strgtb	ip, [r3]
	RET


/*
 * At this point, it has not been possible to word align both buffers.
 * The destination buffer is word aligned, but the source buffer is not.
 */
.Lmemcpy_bad_align:
	stmfd	sp!, {r4-r7}
	bic	r1, r1, #0x03
	cmp	ip, #2
	ldr	ip, [r1], #0x04
	bgt	.Lmemcpy_bad3
	beq	.Lmemcpy_bad2
	b	.Lmemcpy_bad1

.Lmemcpy_bad1_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #8
#else
	mov	r4, ip, lsr #8
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r6, lsr #24
	mov	r6, r6, lsl #8
	orr	r6, r6, r7, lsr #24
	mov	r7, r7, lsl #8
	orr	r7, r7, ip, lsr #24
#else
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r6, lsl #24
	mov	r6, r6, lsr #8
	orr	r6, r6, r7, lsl #24
	mov	r7, r7, lsr #8
	orr	r7, r7, ip, lsl #24
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad1:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad1_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x03
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad1_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #8
#else
	mov	r4, ip, lsr #8
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #24
#else
	orr	r4, r4, ip, lsl #24
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad1_loop4
	sub	r1, r1, #0x03
	b	.Lmemcpy_bad_done

.Lmemcpy_bad2_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r6, lsr #16
	mov	r6, r6, lsl #16
	orr	r6, r6, r7, lsr #16
	mov	r7, r7, lsl #16
	orr	r7, r7, ip, lsr #16
#else
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r6, lsl #16
	mov	r6, r6, lsr #16
	orr	r6, r6, r7, lsl #16
	mov	r7, r7, lsr #16
	orr	r7, r7, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad2:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad2_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x02
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad2_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #16
#else
	orr	r4, r4, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad2_loop4
	sub	r1, r1, #0x02
	b	.Lmemcpy_bad_done

.Lmemcpy_bad3_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r6, lsr #8
	mov	r6, r6, lsl #24
	orr	r6, r6, r7, lsr #8
	mov	r7, r7, lsl #24
	orr	r7, r7, ip, lsr #8
#else
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r6, lsl #8
	mov	r6, r6, lsr #24
	orr	r6, r6, r7, lsl #8
	mov	r7, r7, lsr #24
	orr	r7, r7, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad3:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad3_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x01
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad3_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #8
#else
	orr	r4, r4, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad3_loop4
	sub	r1, r1, #0x01

.Lmemcpy_bad_done:
	ldmfd	sp!, {r4-r7}
	adds	r2, r2, #0x04
	RETeq
	ldrb	ip, [r1], #0x01
	cmp	r2, #0x02
	ldrgeb	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrgtb	ip, [r1]
	strgeb	r2, [r3], #0x01
	strgtb	ip, [r3]
	RET


/*
 * Handle short copies (less than 16 bytes), possibly misaligned.
 * Some of these are *very* common, thanks to the network stack,
 * and so are handled specially.
 */
.Lmemcpy_short:
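	/*
	 * Jump table: pc reads as ".+8", so "add pc, pc, r2, lsl #2"
	 * lands on the entry below indexed by the byte count
	 * (entry 0 is the RET for a zero-length copy).
	 */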
	add	pc, pc, r2, lsl #2
	nop
	RET			/* 0x00 */
	b	.Lmemcpy_bytewise	/* 0x01 */
	b	.Lmemcpy_bytewise	/* 0x02 */
	b	.Lmemcpy_bytewise	/* 0x03 */
	b	.Lmemcpy_4		/* 0x04 */
	b	.Lmemcpy_bytewise	/* 0x05 */
	b	.Lmemcpy_6		/* 0x06 */
	b	.Lmemcpy_bytewise	/* 0x07 */
	b	.Lmemcpy_8		/* 0x08 */
	b	.Lmemcpy_bytewise	/* 0x09 */
	b	.Lmemcpy_bytewise	/* 0x0a */
	b	.Lmemcpy_bytewise	/* 0x0b */
	b	.Lmemcpy_c		/* 0x0c */
.Lmemcpy_bytewise:
	mov	r3, r0			/* We must not clobber r0 */
	ldrb	ip, [r1], #0x01
1:	subs	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrneb	ip, [r1], #0x01
	bne	1b
	RET

/******************************************************************************
 * Special case for 4 byte copies
 */
#define	LMEMCPY_4_LOG2	6	/* 64 bytes */
#define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
	LMEMCPY_4_PAD
.Lmemcpy_4:
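/*
 * Dispatch on the four alignment bits ((dst & 3) << 2 | (src & 3)):
 * "sub r3, pc, #0x14" rebases r3 to .Lmemcpy_4 (pc reads ".+8"), and
 * every stanza below is padded to 64 bytes, so the addne jumps
 * straight to the matching case; index 0 simply falls through.
 */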
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	str	r2, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = 012. */
	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
#endif
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1]
	ldrh	r2, [r1, #0x02]
#else
	ldrh	r3, [r1, #0x02]
	ldrh	r2, [r1]
#endif
	orr	r3, r2, r3, lsl #16
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #24		/* r3 = 0... */
	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #24		/* r3 = ...0 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
#endif
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0, #0x03]
#endif
	strh	r3, [r0, #0x01]
	RET
	LMEMCPY_4_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
#ifdef __ARMEB__
	mov	r1, r2, lsr #8		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r2, r2, lsl #8		/* r2 = .01. */
	orr	r2, r2, r3, lsr #8	/* r2 = .012 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	mov	r3, r3, lsr #8		/* r3 = ...3 */
#endif
	strh	r2, [r0, #0x01]
	strb	r3, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strh	r2, [r0, #0x02]
	mov	r3, r2, lsr #16
	strh	r3, [r0]
#else
	strh	r2, [r0]
	mov	r3, r2, lsr #16
	strh	r3, [r0, #0x02]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
#endif
	strh	r2, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldrh	r3, [r1, #0x02]
	strh	r2, [r0]
	strh	r3, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
	strh	r1, [r0, #0x02]
#ifdef __ARMEB__
	mov	r3, r3, lsr #24		/* r3 = ...1 */
	orr	r3, r3, r2, lsl #8	/* r3 = xx01 */
#else
	mov	r3, r3, lsl #8		/* r3 = 321. */
	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
#endif
	strh	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	strb	r3, [r0, #0x03]
	mov	r3, r3, lsr #8		/* r3 = ...2 */
	orr	r3, r3, r2, lsl #8	/* r3 = ..12 */
	strh	r3, [r0, #0x01]
	mov	r2, r2, lsr #8		/* r2 = ...0 */
	strb	r2, [r0]
#else
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsr #8		/* r3 = ...3 */
	strb	r3, [r0, #0x03]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD


/******************************************************************************
 * Special case for 6 byte copies
 */
#define	LMEMCPY_6_LOG2	6	/* 64 bytes */
#define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
	LMEMCPY_6_PAD
.Lmemcpy_6:
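/* Same 16-way alignment dispatch as .Lmemcpy_4 above. */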
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldrh	r3, [r1, #0x04]
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #8		/* r2 = .210 */
	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
#endif
	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
#ifdef __ARMEB__
	mov	r1, r3, lsr #16		/* r1 = ..23 */
	orr	r1, r1, r2, lsl #16	/* r1 = 0123 */
	str	r1, [r0]
	strh	r3, [r0, #0x04]
#else
	mov	r1, r3, lsr #16		/* r1 = ..54 */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	strh	r1, [r0, #0x04]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 0... */
	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
	mov	r3, r3, lsl #8		/* r3 = 234. */
	orr	r1, r3, r1, lsr #24	/* r1 = 2345 */
#else
	mov	r2, r2, lsr #24		/* r2 = ...0 */
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	mov	r1, r1, lsl #8		/* r1 = xx5. */
	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
#endif
	str	r2, [r0]
	strh	r1, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
#ifdef __ARMEB__
	mov	r1, r3, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r3, r3, lsl #8		/* r3 = 123. */
	orr	r3, r3, r2, lsr #8	/* r3 = 1234 */
#else
	strb	r3, [r0]
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
	mov	r2, r2, lsr #8		/* r2 = ...5 */
#endif
	strh	r3, [r0, #0x03]
	strb	r2, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #8		/* r3 = ...0 */
	strb	r3, [r0]
	strb	r1, [r0, #0x05]
	mov	r3, r1, lsr #8		/* r3 = .234 */
	strh	r3, [r0, #0x03]
	mov	r3, r2, lsl #8		/* r3 = .01. */
	orr	r3, r3, r1, lsr #24	/* r3 = .012 */
	strh	r3, [r0, #0x01]
#else
	strb	r2, [r0]
	mov	r3, r1, lsr #24
	strb	r3, [r0, #0x05]
	mov	r3, r1, lsr #8		/* r3 = .543 */
	strh	r3, [r0, #0x03]
	mov	r3, r2, lsr #8		/* r3 = ...1 */
	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
	strh	r3, [r0, #0x01]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
#ifdef __ARMEB__
	ldr	r2, [r1]		/* r2 = 0123 */
	ldrh	r3, [r1, #0x04]		/* r3 = ..45 */
	mov	r1, r2, lsr #16		/* r1 = ..01 */
	orr	r3, r3, r2, lsl#16	/* r3 = 2345 */
	strh	r1, [r0]
	str	r3, [r0, #0x02]
#else
	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
	ldr	r3, [r1]		/* r3 = 3210 */
	mov	r2, r2, lsl #16		/* r2 = 54.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
	strh	r3, [r0]
	str	r2, [r0, #0x02]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
#ifdef __ARMEB__
	mov	r2, r2, lsr #8		/* r2 = .345 */
	orr	r2, r2, r3, lsl #24	/* r2 = 2345 */
#else
	mov	r2, r2, lsl #8		/* r2 = 543. */
	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
#endif
	strh	r1, [r0]
	str	r2, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	r3, [r1, #0x02]
	strh	r2, [r0]
	str	r3, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = ..0. */
	orr	r3, r3, r2, lsr #24	/* r3 = ..01 */
	orr	r1, r1, r2, lsl #8	/* r1 = 2345 */
#else
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 5... */
	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
#endif
	strh	r3, [r0]
	str	r1, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #24		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
	mov	r1, r1, lsr #8		/* r1 = ...5 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #8		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #24		/* r2 = 1... */
	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
	mov	r1, r1, lsr #24		/* r1 = ...5 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	str	r3, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD


/******************************************************************************
 * Special case for 8 byte copies
 */
#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
	LMEMCPY_8_PAD
.Lmemcpy_8:
2169	and	r2, r1, #0x03		/* r2 = src & 3 */
2170	orr	r2, r2, r0, lsl #2	/* r2 = (dst << 2) | (src & 3) */
2171	ands	r2, r2, #0x0f		/* r2 = table index, 0000-1111 */
2172	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_8 (pc reads . + 8) */
2173	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2	/* index 0000 falls through */
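/*
 * Each of the 16 cases below is padded out to 1 << LMEMCPY_8_LOG2 (64)
 * bytes, so for a nonzero index N the add-to-pc above lands directly at
 * .Lmemcpy_8 + N * 64; index 0000 simply falls through.  Illustrative
 * example: dst & 3 == 2 and src & 3 == 1 give index 1001, the
 * "dst 16-bit aligned, src 8-bit aligned" fragment.
 */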
2174
2175/*
2176 * 0000: dst is 32-bit aligned, src is 32-bit aligned
2177 */
2178	ldr	r2, [r1]
2179	ldr	r3, [r1, #0x04]
2180	str	r2, [r0]
2181	str	r3, [r0, #0x04]
2182	RET
2183	LMEMCPY_8_PAD
2184
2185/*
2186 * 0001: dst is 32-bit aligned, src is 8-bit aligned
2187 */
2188	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
2189	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
2190	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
2191#ifdef __ARMEB__
2192	mov	r3, r3, lsl #8		/* r3 = 012. */
2193	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
2194	orr	r2, r1, r2, lsl #8	/* r2 = 4567 */
2195#else
2196	mov	r3, r3, lsr #8		/* r3 = .210 */
2197	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
2198	mov	r1, r1, lsl #24		/* r1 = 7... */
2199	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
2200#endif
2201	str	r3, [r0]
2202	str	r2, [r0, #0x04]
2203	RET
2204	LMEMCPY_8_PAD
2205
2206/*
2207 * 0010: dst is 32-bit aligned, src is 16-bit aligned
2208 */
2209	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2210	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2211	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
2212#ifdef __ARMEB__
2213	mov	r2, r2, lsl #16		/* r2 = 01.. */
2214	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
2215	orr	r3, r1, r3, lsl #16	/* r3 = 4567 */
2216#else
2217	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
2218	mov	r3, r3, lsr #16		/* r3 = ..54 */
2219	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
2220#endif
2221	str	r2, [r0]
2222	str	r3, [r0, #0x04]
2223	RET
2224	LMEMCPY_8_PAD
2225
2226/*
2227 * 0011: dst is 32-bit aligned, src is 8-bit aligned
2228 */
2229	ldrb	r3, [r1]		/* r3 = ...0 */
2230	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
2231	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
2232#ifdef __ARMEB__
2233	mov	r3, r3, lsl #24		/* r3 = 0... */
2234	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
2235	mov	r2, r2, lsl #24		/* r2 = 4... */
2236	orr	r2, r2, r1, lsr #8	/* r2 = 4567 */
2237#else
2238	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
2239	mov	r2, r2, lsr #24		/* r2 = ...4 */
2240	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
2241#endif
2242	str	r3, [r0]
2243	str	r2, [r0, #0x04]
2244	RET
2245	LMEMCPY_8_PAD
2246
2247/*
2248 * 0100: dst is 8-bit aligned, src is 32-bit aligned
2249 */
2250	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
2251	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
2252#ifdef __ARMEB__
2253	mov	r1, r3, lsr #24		/* r1 = ...0 */
2254	strb	r1, [r0]
2255	mov	r1, r3, lsr #8		/* r1 = .012 */
2256	strb	r2, [r0, #0x07]
2257	mov	r3, r3, lsl #24		/* r3 = 3... */
2258	orr	r3, r3, r2, lsr #8	/* r3 = 3456 */
2259#else
2260	strb	r3, [r0]
2261	mov	r1, r2, lsr #24		/* r1 = ...7 */
2262	strb	r1, [r0, #0x07]
2263	mov	r1, r3, lsr #8		/* r1 = .321 */
2264	mov	r3, r3, lsr #24		/* r3 = ...3 */
2265	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
2266#endif
2267	strh	r1, [r0, #0x01]
2268	str	r3, [r0, #0x03]
2269	RET
2270	LMEMCPY_8_PAD
2271
2272/*
2273 * 0101: dst is 8-bit aligned, src is 8-bit aligned
2274 */
2275	ldrb	r2, [r1]
2276	ldrh	r3, [r1, #0x01]
2277	ldr	ip, [r1, #0x03]
2278	ldrb	r1, [r1, #0x07]
2279	strb	r2, [r0]
2280	strh	r3, [r0, #0x01]
2281	str	ip, [r0, #0x03]
2282	strb	r1, [r0, #0x07]
2283	RET
2284	LMEMCPY_8_PAD
2285
2286/*
2287 * 0110: dst is 8-bit aligned, src is 16-bit aligned
2288 */
2289	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2290	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2291	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
2292#ifdef __ARMEB__
2293	mov	ip, r2, lsr #8		/* ip = ...0 */
2294	strb	ip, [r0]
2295	mov	ip, r2, lsl #8		/* ip = .01. */
2296	orr	ip, ip, r3, lsr #24	/* ip = .012 */
2297	strb	r1, [r0, #0x07]
2298	mov	r3, r3, lsl #8		/* r3 = 345. */
2299	orr	r3, r3, r1, lsr #8	/* r3 = 3456 */
2300#else
2301	strb	r2, [r0]		/* 0 */
2302	mov	ip, r1, lsr #8		/* ip = ...7 */
2303	strb	ip, [r0, #0x07]		/* 7 */
2304	mov	ip, r2, lsr #8		/* ip = ...1 */
2305	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
2306	mov	r3, r3, lsr #8		/* r3 = .543 */
2307	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
2308#endif
2309	strh	ip, [r0, #0x01]
2310	str	r3, [r0, #0x03]
2311	RET
2312	LMEMCPY_8_PAD
2313
2314/*
2315 * 0111: dst is 8-bit aligned, src is 8-bit aligned
2316 */
2317	ldrb	r3, [r1]		/* r3 = ...0 */
2318	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
2319	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
2320	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
2321	strb	r3, [r0]
2322	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
2323#ifdef __ARMEB__
2324	strh	r3, [r0, #0x01]
2325	orr	r2, r2, ip, lsl #16	/* r2 = 3456 */
2326#else
2327	strh	ip, [r0, #0x01]
2328	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
2329#endif
2330	str	r2, [r0, #0x03]
2331	strb	r1, [r0, #0x07]
2332	RET
2333	LMEMCPY_8_PAD
2334
2335/*
2336 * 1000: dst is 16-bit aligned, src is 32-bit aligned
2337 */
2338	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2339	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2340	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
2341#ifdef __ARMEB__
2342	strh	r1, [r0]
2343	mov	r1, r3, lsr #16		/* r1 = ..45 */
2344	orr	r2, r1, r2, lsl #16	/* r2 = 2345 */
2345#else
2346	strh	r2, [r0]
2347	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
2348	mov	r3, r3, lsr #16		/* r3 = ..76 */
2349#endif
2350	str	r2, [r0, #0x02]
2351	strh	r3, [r0, #0x06]
2352	RET
2353	LMEMCPY_8_PAD
2354
2355/*
2356 * 1001: dst is 16-bit aligned, src is 8-bit aligned
2357 */
2358	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
2359	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
2360	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
2361	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
2362	strh	r1, [r0]
2363#ifdef __ARMEB__
2364	mov	r1, r2, lsl #24		/* r1 = 2... */
2365	orr	r1, r1, r3, lsr #8	/* r1 = 2345 */
2366	orr	r3, ip, r3, lsl #8	/* r3 = 4567 */
2367#else
2368	mov	r1, r2, lsr #24		/* r1 = ...2 */
2369	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
2370	mov	r3, r3, lsr #24		/* r3 = ...6 */
2371	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
2372#endif
2373	str	r1, [r0, #0x02]
2374	strh	r3, [r0, #0x06]
2375	RET
2376	LMEMCPY_8_PAD
2377
2378/*
2379 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2380 */
2381	ldrh	r2, [r1]
2382	ldr	ip, [r1, #0x02]
2383	ldrh	r3, [r1, #0x06]
2384	strh	r2, [r0]
2385	str	ip, [r0, #0x02]
2386	strh	r3, [r0, #0x06]
2387	RET
2388	LMEMCPY_8_PAD
2389
2390/*
2391 * 1011: dst is 16-bit aligned, src is 8-bit aligned
2392 */
2393	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
2394	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
2395	ldrb	ip, [r1]		/* ip = ...0 */
2396	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
2397	strh	r1, [r0, #0x06]
2398#ifdef __ARMEB__
2399	mov	r3, r3, lsr #24		/* r3 = ...5 */
2400	orr	r3, r3, r2, lsl #8	/* r3 = 2345 */
2401	mov	r2, r2, lsr #24		/* r2 = ...1 */
2402	orr	r2, r2, ip, lsl #8	/* r2 = ..01 */
2403#else
2404	mov	r3, r3, lsl #24		/* r3 = 5... */
2405	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
2406	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
2407#endif
2408	str	r3, [r0, #0x02]
2409	strh	r2, [r0]
2410	RET
2411	LMEMCPY_8_PAD
2412
2413/*
2414 * 1100: dst is 8-bit aligned, src is 32-bit aligned
2415 */
2416	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2417	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2418	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
2419	strh	r1, [r0, #0x05]
2420#ifdef __ARMEB__
2421	strb	r3, [r0, #0x07]
2422	mov	r1, r2, lsr #24		/* r1 = ...0 */
2423	strb	r1, [r0]
2424	mov	r2, r2, lsl #8		/* r2 = 123. */
2425	orr	r2, r2, r3, lsr #24	/* r2 = 1234 */
2426	str	r2, [r0, #0x01]
2427#else
2428	strb	r2, [r0]
2429	mov	r1, r3, lsr #24		/* r1 = ...7 */
2430	strb	r1, [r0, #0x07]
2431	mov	r2, r2, lsr #8		/* r2 = .321 */
2432	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
2433	str	r2, [r0, #0x01]
2434#endif
2435	RET
2436	LMEMCPY_8_PAD
2437
2438/*
2439 * 1101: dst is 8-bit aligned, src is 8-bit aligned
2440 */
2441	ldrb	r3, [r1]		/* r3 = ...0 */
2442	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
2443	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
2444	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
2445	strb	r3, [r0]
2446	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
2447#ifdef __ARMEB__
2448	strh	ip, [r0, #0x05]
2449	orr	r2, r3, r2, lsl #16	/* r2 = 1234 */
2450#else
2451	strh	r3, [r0, #0x05]
2452	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
2453#endif
2454	str	r2, [r0, #0x01]
2455	strb	r1, [r0, #0x07]
2456	RET
2457	LMEMCPY_8_PAD
2458
2459/*
2460 * 1110: dst is 8-bit aligned, src is 16-bit aligned
2461 */
2462	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2463	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2464	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
2465#ifdef __ARMEB__
2466	mov	ip, r2, lsr #8		/* ip = ...0 */
2467	strb	ip, [r0]
2468	mov	ip, r2, lsl #24		/* ip = 1... */
2469	orr	ip, ip, r3, lsr #8	/* ip = 1234 */
2470	strb	r1, [r0, #0x07]
2471	mov	r1, r1, lsr #8		/* r1 = ...6 */
2472	orr	r1, r1, r3, lsl #8	/* r1 = 3456 */
2473#else
2474	strb	r2, [r0]
2475	mov	ip, r2, lsr #8		/* ip = ...1 */
2476	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
2477	mov	r2, r1, lsr #8		/* r2 = ...7 */
2478	strb	r2, [r0, #0x07]
2479	mov	r1, r1, lsl #8		/* r1 = .76. */
2480	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
2481#endif
2482	str	ip, [r0, #0x01]
2483	strh	r1, [r0, #0x05]
2484	RET
2485	LMEMCPY_8_PAD
2486
2487/*
2488 * 1111: dst is 8-bit aligned, src is 8-bit aligned
2489 */
2490	ldrb	r2, [r1]
2491	ldr	ip, [r1, #0x01]
2492	ldrh	r3, [r1, #0x05]
2493	ldrb	r1, [r1, #0x07]
2494	strb	r2, [r0]
2495	str	ip, [r0, #0x01]
2496	strh	r3, [r0, #0x05]
2497	strb	r1, [r0, #0x07]
2498	RET
2499	LMEMCPY_8_PAD
2500
2501/******************************************************************************
2502 * Special case for 12 byte copies
2503 */
2504#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
2505#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
2506	LMEMCPY_C_PAD
2507.Lmemcpy_c:
2508	and	r2, r1, #0x03		/* r2 = src & 3 */
2509	orr	r2, r2, r0, lsl #2	/* r2 = (dst << 2) | (src & 3) */
2510	ands	r2, r2, #0x0f		/* r2 = table index, 0000-1111 */
2511	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_c (pc reads . + 8) */
2512	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2	/* index 0000 falls through */
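/*
 * Same dispatch scheme as the 6- and 8-byte copies above; the 12-byte
 * cases need more instructions, so each slot is padded to
 * 1 << LMEMCPY_C_LOG2 (128) bytes.
 */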
2513
2514/*
2515 * 0000: dst is 32-bit aligned, src is 32-bit aligned
2516 */
2517	ldr	r2, [r1]
2518	ldr	r3, [r1, #0x04]
2519	ldr	r1, [r1, #0x08]
2520	str	r2, [r0]
2521	str	r3, [r0, #0x04]
2522	str	r1, [r0, #0x08]
2523	RET
2524	LMEMCPY_C_PAD
2525
2526/*
2527 * 0001: dst is 32-bit aligned, src is 8-bit aligned
2528 */
2529	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
2530	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
2531	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
2532	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
2533#ifdef __ARMEB__
2534	orr	r2, r2, ip, lsl #8	/* r2 = 89AB */
2535	str	r2, [r0, #0x08]
2536	mov	r2, ip, lsr #24		/* r2 = ...7 */
2537	orr	r2, r2, r3, lsl #8	/* r2 = 4567 */
2538	mov	r1, r1, lsl #8		/* r1 = 012. */
2539	orr	r1, r1, r3, lsr #24	/* r1 = 0123 */
2540#else
2541	mov	r2, r2, lsl #24		/* r2 = B... */
2542	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
2543	str	r2, [r0, #0x08]
2544	mov	r2, ip, lsl #24		/* r2 = 7... */
2545	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
2546	mov	r1, r1, lsr #8		/* r1 = .210 */
2547	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
2548#endif
2549	str	r2, [r0, #0x04]
2550	str	r1, [r0]
2551	RET
2552	LMEMCPY_C_PAD
2553
2554/*
2555 * 0010: dst is 32-bit aligned, src is 16-bit aligned
2556 */
2557	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2558	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2559	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
2560	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
2561#ifdef __ARMEB__
2562	mov	r2, r2, lsl #16		/* r2 = 01.. */
2563	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
2564	str	r2, [r0]
2565	mov	r3, r3, lsl #16		/* r3 = 45.. */
2566	orr	r3, r3, ip, lsr #16	/* r3 = 4567 */
2567	orr	r1, r1, ip, lsl #16	/* r1 = 89AB */
2568#else
2569	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
2570	str	r2, [r0]
2571	mov	r3, r3, lsr #16		/* r3 = ..54 */
2572	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
2573	mov	r1, r1, lsl #16		/* r1 = BA.. */
2574	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
2575#endif
2576	str	r3, [r0, #0x04]
2577	str	r1, [r0, #0x08]
2578	RET
2579	LMEMCPY_C_PAD
2580
2581/*
2582 * 0011: dst is 32-bit aligned, src is 8-bit aligned
2583 */
2584	ldrb	r2, [r1]		/* r2 = ...0 */
2585	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
2586	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
2587	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
2588#ifdef __ARMEB__
2589	mov	r2, r2, lsl #24		/* r2 = 0... */
2590	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
2591	str	r2, [r0]
2592	mov	r3, r3, lsl #24		/* r3 = 4... */
2593	orr	r3, r3, ip, lsr #8	/* r3 = 4567 */
2594	mov	r1, r1, lsr #8		/* r1 = .9AB */
2595	orr	r1, r1, ip, lsl #24	/* r1 = 89AB */
2596#else
2597	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
2598	str	r2, [r0]
2599	mov	r3, r3, lsr #24		/* r3 = ...4 */
2600	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
2601	mov	r1, r1, lsl #8		/* r1 = BA9. */
2602	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
2603#endif
2604	str	r3, [r0, #0x04]
2605	str	r1, [r0, #0x08]
2606	RET
2607	LMEMCPY_C_PAD
2608
2609/*
2610 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
2611 */
2612	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2613	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2614	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
2615	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
2616	strh	r1, [r0, #0x01]
2617#ifdef __ARMEB__
2618	mov	r1, r2, lsr #24		/* r1 = ...0 */
2619	strb	r1, [r0]
2620	mov	r1, r2, lsl #24		/* r1 = 3... */
2621	orr	r2, r1, r3, lsr #8	/* r2 = 3456 */
2622	mov	r1, r3, lsl #24		/* r1 = 7... */
2623	orr	r1, r1, ip, lsr #8	/* r1 = 789A */
2624#else
2625	strb	r2, [r0]
2626	mov	r1, r2, lsr #24		/* r1 = ...3 */
2627	orr	r2, r1, r3, lsl #8	/* r2 = 6543 */
2628	mov	r1, r3, lsr #24		/* r1 = ...7 */
2629	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
2630	mov	ip, ip, lsr #24		/* ip = ...B */
2631#endif
2632	str	r2, [r0, #0x03]
2633	str	r1, [r0, #0x07]
2634	strb	ip, [r0, #0x0b]
2635	RET
2636	LMEMCPY_C_PAD
2637
2638/*
2639 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
2640 */
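	/*
	 * Identical misalignment: copy with natural-width accesses.
	 * The first strb is interleaved between the loads, presumably
	 * to hide load latency (an assumption; the result is the same
	 * either way).
	 */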
2641	ldrb	r2, [r1]
2642	ldrh	r3, [r1, #0x01]
2643	ldr	ip, [r1, #0x03]
2644	strb	r2, [r0]
2645	ldr	r2, [r1, #0x07]
2646	ldrb	r1, [r1, #0x0b]
2647	strh	r3, [r0, #0x01]
2648	str	ip, [r0, #0x03]
2649	str	r2, [r0, #0x07]
2650	strb	r1, [r0, #0x0b]
2651	RET
2652	LMEMCPY_C_PAD
2653
2654/*
2655 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
2656 */
2657	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2658	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2659	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
2660	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
2661#ifdef __ARMEB__
2662	mov	r2, r2, ror #8		/* r2 = 1..0 */
2663	strb	r2, [r0]
2664	mov	r2, r2, lsr #16		/* r2 = ..1. */
2665	orr	r2, r2, r3, lsr #24	/* r2 = ..12 */
2666	strh	r2, [r0, #0x01]
2667	mov	r2, r3, lsl #8		/* r2 = 345. */
2668	orr	r3, r2, ip, lsr #24	/* r3 = 3456 */
2669	mov	r2, ip, lsl #8		/* r2 = 789. */
2670	orr	r2, r2, r1, lsr #8	/* r2 = 789A */
2671#else
2672	strb	r2, [r0]
2673	mov	r2, r2, lsr #8		/* r2 = ...1 */
2674	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
2675	strh	r2, [r0, #0x01]
2676	mov	r2, r3, lsr #8		/* r2 = .543 */
2677	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
2678	mov	r2, ip, lsr #8		/* r2 = .987 */
2679	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
2680	mov	r1, r1, lsr #8		/* r1 = ...B */
2681#endif
2682	str	r3, [r0, #0x03]
2683	str	r2, [r0, #0x07]
2684	strb	r1, [r0, #0x0b]
2685	RET
2686	LMEMCPY_C_PAD
2687
2688/*
2689 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
2690 */
2691	ldrb	r2, [r1]
2692	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
2693	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
2694	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
2695	strb	r2, [r0]
2696#ifdef __ARMEB__
2697	mov	r2, r3, lsr #16		/* r2 = ..12 */
2698	strh	r2, [r0, #0x01]
2699	mov	r3, r3, lsl #16		/* r3 = 34.. */
2700	orr	r3, r3, ip, lsr #16	/* r3 = 3456 */
2701	mov	ip, ip, lsl #16		/* ip = 78.. */
2702	orr	ip, ip, r1, lsr #16	/* ip = 789A */
2703	mov	r1, r1, lsr #8		/* r1 = .9AB */
2704#else
2705	strh	r3, [r0, #0x01]
2706	mov	r3, r3, lsr #16		/* r3 = ..43 */
2707	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
2708	mov	ip, ip, lsr #16		/* ip = ..87 */
2709	orr	ip, ip, r1, lsl #16	/* ip = A987 */
2710	mov	r1, r1, lsr #16		/* r1 = ..xB */
2711#endif
2712	str	r3, [r0, #0x03]
2713	str	ip, [r0, #0x07]
2714	strb	r1, [r0, #0x0b]
2715	RET
2716	LMEMCPY_C_PAD
2717
2718/*
2719 * 1000: dst is 16-bit aligned, src is 32-bit aligned
2720 */
2721	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
2722	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2723	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
2724	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
2725#ifdef __ARMEB__
2726	strh	r1, [r0]
2727	mov	r1, ip, lsl #16		/* r1 = 23.. */
2728	orr	r1, r1, r3, lsr #16	/* r1 = 2345 */
2729	mov	r3, r3, lsl #16		/* r3 = 67.. */
2730	orr	r3, r3, r2, lsr #16	/* r3 = 6789 */
2731#else
2732	strh	ip, [r0]
2733	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
2734	mov	r3, r3, lsr #16		/* r3 = ..76 */
2735	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
2736	mov	r2, r2, lsr #16		/* r2 = ..BA */
2737#endif
2738	str	r1, [r0, #0x02]
2739	str	r3, [r0, #0x06]
2740	strh	r2, [r0, #0x0a]
2741	RET
2742	LMEMCPY_C_PAD
2743
2744/*
2745 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
2746 */
2747	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
2748	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
2749	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
2750	strh	ip, [r0]
2751	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
2752	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
2753#ifdef __ARMEB__
2754	mov	r2, r2, lsl #24		/* r2 = 2... */
2755	orr	r2, r2, r3, lsr #8	/* r2 = 2345 */
2756	mov	r3, r3, lsl #24		/* r3 = 6... */
2757	orr	r3, r3, ip, lsr #8	/* r3 = 6789 */
2758	orr	r1, r1, ip, lsl #8	/* r1 = 89AB */
2759#else
2760	mov	r2, r2, lsr #24		/* r2 = ...2 */
2761	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
2762	mov	r3, r3, lsr #24		/* r3 = ...6 */
2763	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
2764	mov	r1, r1, lsl #8		/* r1 = ..B. */
2765	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
2766#endif
2767	str	r2, [r0, #0x02]
2768	str	r3, [r0, #0x06]
2769	strh	r1, [r0, #0x0a]
2770	RET
2771	LMEMCPY_C_PAD
2772
2773/*
2774 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2775 */
2776	ldrh	r2, [r1]
2777	ldr	r3, [r1, #0x02]
2778	ldr	ip, [r1, #0x06]
2779	ldrh	r1, [r1, #0x0a]
2780	strh	r2, [r0]
2781	str	r3, [r0, #0x02]
2782	str	ip, [r0, #0x06]
2783	strh	r1, [r0, #0x0a]
2784	RET
2785	LMEMCPY_C_PAD
2786
2787/*
2788 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2789 */
2790	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
2791	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
2792	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
2793	strh	ip, [r0, #0x0a]
2794	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
2795	ldrb	r1, [r1]		/* r1 = ...0 */
2796#ifdef __ARMEB__
2797	mov	r2, r2, lsr #24		/* r2 = ...9 */
2798	orr	r2, r2, r3, lsl #8	/* r2 = 6789 */
2799	mov	r3, r3, lsr #24		/* r3 = ...5 */
2800	orr	r3, r3, ip, lsl #8	/* r3 = 2345 */
2801	mov	r1, r1, lsl #8		/* r1 = ..0. */
2802	orr	r1, r1, ip, lsr #24	/* r1 = ..01 */
2803#else
2804	mov	r2, r2, lsl #24		/* r2 = 9... */
2805	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
2806	mov	r3, r3, lsl #24		/* r3 = 5... */
2807	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
2808	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
2809#endif
2810	str	r2, [r0, #0x06]
2811	str	r3, [r0, #0x02]
2812	strh	r1, [r0]
2813	RET
2814	LMEMCPY_C_PAD
2815
2816/*
2817 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2818 */
2819	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2820	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
2821	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
2822#ifdef __ARMEB__
2823	mov	r3, r2, lsr #24		/* r3 = ...0 */
2824	strb	r3, [r0]
2825	mov	r2, r2, lsl #8		/* r2 = 123. */
2826	orr	r2, r2, ip, lsr #24	/* r2 = 1234 */
2827	str	r2, [r0, #0x01]
2828	mov	r2, ip, lsl #8		/* r2 = 567. */
2829	orr	r2, r2, r1, lsr #24	/* r2 = 5678 */
2830	str	r2, [r0, #0x05]
2831	mov	r2, r1, lsr #8		/* r2 = .89A */
2832	strh	r2, [r0, #0x09]
2833	strb	r1, [r0, #0x0b]
2834#else
2835	strb	r2, [r0]
2836	mov	r3, r2, lsr #8		/* r3 = .321 */
2837	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
2838	str	r3, [r0, #0x01]
2839	mov	r3, ip, lsr #8		/* r3 = .765 */
2840	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
2841	str	r3, [r0, #0x05]
2842	mov	r1, r1, lsr #8		/* r1 = .BA9 */
2843	strh	r1, [r0, #0x09]
2844	mov	r1, r1, lsr #16		/* r1 = ...B */
2845	strb	r1, [r0, #0x0b]
2846#endif
2847	RET
2848	LMEMCPY_C_PAD
2849
2850/*
2851 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2852 */
2853	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
2854	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
2855	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
2856	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
2857	strb	r2, [r0, #0x0b]
2858#ifdef __ARMEB__
2859	strh	r3, [r0, #0x09]
2860	mov	r3, r3, lsr #16		/* r3 = ..78 */
2861	orr	r3, r3, ip, lsl #16	/* r3 = 5678 */
2862	mov	ip, ip, lsr #16		/* ip = ..34 */
2863	orr	ip, ip, r1, lsl #16	/* ip = 1234 */
2864	mov	r1, r1, lsr #16		/* r1 = ..x0 */
2865#else
2866	mov	r2, r3, lsr #16		/* r2 = ..A9 */
2867	strh	r2, [r0, #0x09]
2868	mov	r3, r3, lsl #16		/* r3 = 87.. */
2869	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
2870	mov	ip, ip, lsl #16		/* ip = 43.. */
2871	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
2872	mov	r1, r1, lsr #8		/* r1 = .210 */
2873#endif
2874	str	r3, [r0, #0x05]
2875	str	ip, [r0, #0x01]
2876	strb	r1, [r0]
2877	RET
2878	LMEMCPY_C_PAD
2879
2880/*
2881 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2882 */
2883#ifdef __ARMEB__
2884	ldrh	r2, [r1, #0x0a]		/* r2 = ..AB */
2885	ldr	ip, [r1, #0x06]		/* ip = 6789 */
2886	ldr	r3, [r1, #0x02]		/* r3 = 2345 */
2887	ldrh	r1, [r1]		/* r1 = ..01 */
2888	strb	r2, [r0, #0x0b]
2889	mov	r2, r2, lsr #8		/* r2 = ...A */
2890	orr	r2, r2, ip, lsl #8	/* r2 = 789A */
2891	mov	ip, ip, lsr #8		/* ip = .678 */
2892	orr	ip, ip, r3, lsl #24	/* ip = 5678 */
2893	mov	r3, r3, lsr #8		/* r3 = .234 */
2894	orr	r3, r3, r1, lsl #24	/* r3 = 1234 */
2895	mov	r1, r1, lsr #8		/* r1 = ...0 */
2896	strb	r1, [r0]
2897	str	r3, [r0, #0x01]
2898	str	ip, [r0, #0x05]
2899	strh	r2, [r0, #0x09]
2900#else
2901	ldrh	r2, [r1]		/* r2 = ..10 */
2902	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
2903	ldr	ip, [r1, #0x06]		/* ip = 9876 */
2904	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
2905	strb	r2, [r0]
2906	mov	r2, r2, lsr #8		/* r2 = ...1 */
2907	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
2908	mov	r3, r3, lsr #24		/* r3 = ...5 */
2909	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
2910	mov	ip, ip, lsr #24		/* ip = ...9 */
2911	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
2912	mov	r1, r1, lsr #8		/* r1 = ...B */
2913	str	r2, [r0, #0x01]
2914	str	r3, [r0, #0x05]
2915	strh	ip, [r0, #0x09]
2916	strb	r1, [r0, #0x0b]
2917#endif
2918	RET
2919	LMEMCPY_C_PAD
2920
2921/*
2922 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2923 */
2924	ldrb	r2, [r1]
2925	ldr	r3, [r1, #0x01]
2926	ldr	ip, [r1, #0x05]
2927	strb	r2, [r0]
2928	ldrh	r2, [r1, #0x09]
2929	ldrb	r1, [r1, #0x0b]
2930	str	r3, [r0, #0x01]
2931	str	ip, [r0, #0x05]
2932	strh	r2, [r0, #0x09]
2933	strb	r1, [r0, #0x0b]
2934	RET
2935#endif /* _ARM_ARCH_5E */
2936
2937#ifdef GPROF
2938
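/*
 * Placeholder entry points for kernel profiling: gprof-style kernel
 * profilers conventionally use the user, btrap/etrap and bintr/eintr
 * labels to bucket samples taken in user mode, in traps and in
 * interrupt handlers (assumed rationale; the labels are only ever
 * referenced, never called).
 */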
2939ENTRY(user)
2940	nop
2941ENTRY(btrap)
2942	nop
2943ENTRY(etrap)
2944	nop
2945ENTRY(bintr)
2946	nop
2947ENTRY(eintr)
2948	nop
2949
2950#endif
2951