xref: /freebsd/sys/arm/arm/support.S (revision 61e21613)
1/*-
2 * Copyright (c) 2004 Olivier Houchard
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26/*
27 * Copyright 2003 Wasabi Systems, Inc.
28 * All rights reserved.
29 *
30 * Written by Steve C. Woodford for Wasabi Systems, Inc.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 *    notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 *    notice, this list of conditions and the following disclaimer in the
39 *    documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 *    must display the following acknowledgement:
42 *      This product includes software developed for the NetBSD Project by
43 *      Wasabi Systems, Inc.
44 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
45 *    or promote products derived from this software without specific prior
46 *    written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
50 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
51 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
52 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
53 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
54 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
55 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
56 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
57 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
58 * POSSIBILITY OF SUCH DAMAGE.
59 */
60/*
61 * Copyright (c) 1997 The NetBSD Foundation, Inc.
62 * All rights reserved.
63 *
64 * This code is derived from software contributed to The NetBSD Foundation
65 * by Neil A. Carson and Mark Brinicombe
66 *
67 * Redistribution and use in source and binary forms, with or without
68 * modification, are permitted provided that the following conditions
69 * are met:
70 * 1. Redistributions of source code must retain the above copyright
71 *    notice, this list of conditions and the following disclaimer.
72 * 2. Redistributions in binary form must reproduce the above copyright
73 *    notice, this list of conditions and the following disclaimer in the
74 *    documentation and/or other materials provided with the distribution.
75 *
76 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
77 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
78 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
79 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
80 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
81 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
82 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
83 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
84 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
85 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
86 * POSSIBILITY OF SUCH DAMAGE.
87 */
88
89#include <machine/asm.h>
90#include "assym.inc"
91
92	.syntax	unified
93
94/*
95 * memset: Sets a block of memory to the specified value
96 *
97 * On entry:
98 *   r0 - dest address
99 *   r1 - byte to write
100 *   r2 - number of bytes to write
101 *
102 * On exit:
103 *   r0 - dest address
104 */
105/* LINTSTUB: Func: void *memset(void *, int, size_t) */
106ENTRY(memset)
	/*
	 * Register usage throughout:
	 *   r0 = original dest (preserved for return value)
	 *   r1 = bytes remaining (after the mov below)
	 *   r2 = scratch / second half of 64-bit fill pattern
	 *   r3 = fill byte, widened to a 32-bit pattern once aligned
	 *   ip = working destination pointer
	 */
107	and	r3, r1, #0xff		/* We deal with bytes */
108	mov	r1, r2
109do_memset:
110	cmp	r1, #0x04		/* Do we have less than 4 bytes */
111	mov	ip, r0
112	blt	.Lmemset_lessthanfour
113
114	/* Ok first we will word align the address */
115	ands	r2, ip, #0x03		/* Get the bottom two bits */
116	bne	.Lmemset_wordunaligned	/* The address is not word aligned */
117
118	/* We are now word aligned */
119.Lmemset_wordaligned:
120	orr	r3, r3, r3, lsl #8	/* Extend value to 16-bits */
121	tst	ip, #0x04		/* Quad-align for armv5e */
122	orr	r3, r3, r3, lsl #16	/* Extend value to 32-bits */
123	subne	r1, r1, #0x04		/* Quad-align if necessary */
124	strne	r3, [ip], #0x04
125	cmp	r1, #0x10
126	blt	.Lmemset_loop4		/* If less than 16 then use words */
127	mov	r2, r3			/* Duplicate data */
128	cmp	r1, #0x80		/* If < 128 then skip the big loop */
129	blt	.Lmemset_loop32
130
	/*
	 * Bulk loop: 16 strd's of the r2:r3 pair per iteration.
	 * strd needs the 8-byte alignment established above.
	 * Flags from the subs gate every store, so a final short
	 * iteration harmlessly executes no stores.
	 */
131	/* Do 128 bytes at a time */
132.Lmemset_loop128:
133	subs	r1, r1, #0x80
134	strdge	r2, [ip], #0x08
135	strdge	r2, [ip], #0x08
136	strdge	r2, [ip], #0x08
137	strdge	r2, [ip], #0x08
138	strdge	r2, [ip], #0x08
139	strdge	r2, [ip], #0x08
140	strdge	r2, [ip], #0x08
141	strdge	r2, [ip], #0x08
142	strdge	r2, [ip], #0x08
143	strdge	r2, [ip], #0x08
144	strdge	r2, [ip], #0x08
145	strdge	r2, [ip], #0x08
146	strdge	r2, [ip], #0x08
147	strdge	r2, [ip], #0x08
148	strdge	r2, [ip], #0x08
149	strdge	r2, [ip], #0x08
150	bgt	.Lmemset_loop128
151	RETeq			/* Zero length so just exit */
152
153	add	r1, r1, #0x80		/* Adjust for extra sub */
154
155	/* Do 32 bytes at a time */
156.Lmemset_loop32:
157	subs	r1, r1, #0x20
158	strdge	r2, [ip], #0x08
159	strdge	r2, [ip], #0x08
160	strdge	r2, [ip], #0x08
161	strdge	r2, [ip], #0x08
162	bgt	.Lmemset_loop32
163	RETeq			/* Zero length so just exit */
164
165	adds	r1, r1, #0x10		/* Partially adjust for extra sub */
166
167	/* Deal with 16 bytes or more */
168	strdge	r2, [ip], #0x08
169	strdge	r2, [ip], #0x08
170	RETeq			/* Zero length so just exit */
171
172	addlt	r1, r1, #0x10		/* Possibly adjust for extra sub */
173
174	/* We have at least 4 bytes so copy as words */
175.Lmemset_loop4:
176	subs	r1, r1, #0x04
177	strge	r3, [ip], #0x04
178	bgt	.Lmemset_loop4
179	RETeq			/* Zero length so just exit */
180
181	/* Compensate for 64-bit alignment check */
182	adds	r1, r1, #0x04
183	RETeq
	/* 1-3 trailing bytes: the cmp below gates the ge/gt stores */
184	cmp	r1, #2
185
186	strb	r3, [ip], #0x01		/* Set 1 byte */
187	strbge	r3, [ip], #0x01		/* Set another byte */
188	strbgt	r3, [ip]		/* and a third */
189	RET			/* Exit */
190
	/*
	 * Dest not word aligned: write 1-3 bytes (r2 = 4 - misalignment)
	 * to reach a word boundary, then retry the aligned path if at
	 * least 4 bytes remain.
	 */
191.Lmemset_wordunaligned:
192	rsb	r2, r2, #0x004
193	strb	r3, [ip], #0x01		/* Set 1 byte */
194	cmp	r2, #0x02
195	strbge	r3, [ip], #0x01		/* Set another byte */
196	sub	r1, r1, r2
197	strbgt	r3, [ip], #0x01		/* and a third */
198	cmp	r1, #0x04		/* More than 4 bytes left? */
199	bge	.Lmemset_wordaligned	/* Yup */
200
	/* Fewer than 4 bytes total: store 0-3 bytes and return */
201.Lmemset_lessthanfour:
202	cmp	r1, #0x00
203	RETeq			/* Zero length so exit */
204	strb	r3, [ip], #0x01		/* Set 1 byte */
205	cmp	r1, #0x02
206	strbge	r3, [ip], #0x01		/* Set another byte */
207	strbgt	r3, [ip]		/* and a third */
208	RET			/* Exit */
209END(memset)
210
211ENTRY(memcmp)
	/*
	 * int memcmp(const void *b1, const void *b2, size_t len)
	 * In:  r0 = b1, r1 = b2, r2 = len
	 * Out: r0 = difference of first mismatching bytes (0 if equal)
	 * b1 is moved into ip so r0 can accumulate the return value.
	 * len == 6 takes a dedicated hand-scheduled fast path below.
	 */
212	mov	ip, r0
213	cmp	r2, #0x06
214	beq	.Lmemcmp_6bytes
215	mov	r0, #0x00
216
217	/* Are both addresses aligned the same way? */
218	cmp	r2, #0x00
219	eorsne	r3, ip, r1
220	RETeq			/* len == 0, or same addresses! */
221	tst	r3, #0x03
222	subne	r2, r2, #0x01
223	bne	.Lmemcmp_bytewise2	/* Badly aligned. Do it the slow way */
224
	/*
	 * Same misalignment on both pointers.  Compute a jump into the
	 * 1-3 byte alignment stanzas below: r3 = ((b2 - 5) & 3) * 24,
	 * each stanza being 6 instructions = 24 bytes; pc reads as
	 * current instruction + 8 in ARM state, hence the trailing nop.
	 */
225	/* Word-align the addresses, if necessary */
226	sub	r3, r1, #0x05
227	ands	r3, r3, #0x03
228	add	r3, r3, r3, lsl #1
229	addne	pc, pc, r3, lsl #3
230	nop
231
232	/* Compare up to 3 bytes */
233	ldrb	r0, [ip], #0x01
234	ldrb	r3, [r1], #0x01
235	subs	r0, r0, r3
236	RETne
237	subs	r2, r2, #0x01
238	RETeq
239
240	/* Compare up to 2 bytes */
241	ldrb	r0, [ip], #0x01
242	ldrb	r3, [r1], #0x01
243	subs	r0, r0, r3
244	RETne
245	subs	r2, r2, #0x01
246	RETeq
247
248	/* Compare 1 byte */
249	ldrb	r0, [ip], #0x01
250	ldrb	r3, [r1], #0x01
251	subs	r0, r0, r3
252	RETne
253	subs	r2, r2, #0x01
254	RETeq
255
256	/* Compare 4 bytes at a time, if possible */
257	subs	r2, r2, #0x04
258	bcc	.Lmemcmp_bytewise
259.Lmemcmp_word_aligned:
260	ldr	r0, [ip], #0x04
261	ldr	r3, [r1], #0x04
262	subs	r2, r2, #0x04
263	cmpcs	r0, r3
264	beq	.Lmemcmp_word_aligned
265	sub	r0, r0, r3
266
267	/* Correct for extra subtraction, and check if done */
268	adds	r2, r2, #0x04
269	cmpeq	r0, #0x00		/* If done, did all bytes match? */
270	RETeq			/* Yup. Just return */
271
	/*
	 * Words differed (or a short tail remains): back up one word and
	 * redo it byte-wise so the return value is a proper byte
	 * difference rather than a word difference.
	 */
272	/* Re-do the final word byte-wise */
273	sub	ip, ip, #0x04
274	sub	r1, r1, #0x04
275
276.Lmemcmp_bytewise:
277	add	r2, r2, #0x03
278.Lmemcmp_bytewise2:
279	ldrb	r0, [ip], #0x01
280	ldrb	r3, [r1], #0x01
281	subs	r2, r2, #0x01
282	cmpcs	r0, r3
283	beq	.Lmemcmp_bytewise2
284	sub	r0, r0, r3
285	RET
286
287	/*
288	 * 6 byte compares are very common, thanks to the network stack.
289	 * This code is hand-scheduled to reduce the number of stalls for
290	 * load results. Everything else being equal, this will be ~32%
291	 * faster than a byte-wise memcmp.
292	 */
293	.align	5
294.Lmemcmp_6bytes:
295	ldrb	r3, [r1, #0x00]		/* r3 = b2#0 */
296	ldrb	r0, [ip, #0x00]		/* r0 = b1#0 */
297	ldrb	r2, [r1, #0x01]		/* r2 = b2#1 */
298	subs	r0, r0, r3		/* r0 = b1#0 - b2#0 */
299	ldrbeq	r3, [ip, #0x01]		/* r3 = b1#1 */
300	RETne			/* Return if mismatch on #0 */
301	subs	r0, r3, r2		/* r0 = b1#1 - b2#1 */
302	ldrbeq	r3, [r1, #0x02]		/* r3 = b2#2 */
303	ldrbeq	r0, [ip, #0x02]		/* r0 = b1#2 */
304	RETne			/* Return if mismatch on #1 */
305	ldrb	r2, [r1, #0x03]		/* r2 = b2#3 */
306	subs	r0, r0, r3		/* r0 = b1#2 - b2#2 */
307	ldrbeq	r3, [ip, #0x03]		/* r3 = b1#3 */
308	RETne			/* Return if mismatch on #2 */
309	subs	r0, r3, r2		/* r0 = b1#3 - b2#3 */
310	ldrbeq	r3, [r1, #0x04]		/* r3 = b2#4 */
311	ldrbeq	r0, [ip, #0x04]		/* r0 = b1#4 */
312	RETne			/* Return if mismatch on #3 */
313	ldrb	r2, [r1, #0x05]		/* r2 = b2#5 */
314	subs	r0, r0, r3		/* r0 = b1#4 - b2#4 */
315	ldrbeq	r3, [ip, #0x05]		/* r3 = b1#5 */
316	RETne			/* Return if mismatch on #4 */
317	sub	r0, r3, r2		/* r0 = b1#5 - b2#5 */
318	RET
319END(memcmp)
320
321ENTRY(memmove)
	/*
	 * void *memmove(void *dst, const void *src, size_t len)
	 * In:  r0 = dst, r1 = src, r2 = len.  Out: r0 = dst.
	 * Non-overlapping buffers are handed off to memcpy.  Otherwise
	 * the copy direction is chosen so the overlap is safe:
	 * src > dst copies forward, src < dst copies backward.
	 */
322	/* Do the buffers overlap? */
323	cmp	r0, r1
324	RETeq		/* Bail now if src/dst are the same */
325	subcc	r3, r0, r1	/* if (dst > src) r3 = dst - src */
326	subcs	r3, r1, r0	/* if (src > dst) r3 = src - dst */
327	cmp	r3, r2		/* if (r3 < len) we have an overlap */
328	bcc	PIC_SYM(_C_LABEL(memcpy), PLT)
329
330	/* Determine copy direction */
331	cmp	r1, r0
332	bcc	.Lmemmove_backwards
333
334	moveq	r0, #0			/* Quick abort for len=0 */
335	RETeq
336
	/*
	 * Forward copy.  r0/r1 advance; r2 counts down, kept biased by
	 * the pending subs at each stage so the adds at the tail labels
	 * restore it.  Original dst is stacked for the return value.
	 */
337	stmdb	sp!, {r0, lr}		/* memmove() returns dest addr */
338	subs	r2, r2, #4
339	blt	.Lmemmove_fl4		/* less than 4 bytes */
340	ands	r12, r0, #3
341	bne	.Lmemmove_fdestul	/* oh unaligned destination addr */
342	ands	r12, r1, #3
343	bne	.Lmemmove_fsrcul		/* oh unaligned source addr */
344
345.Lmemmove_ft8:
346	/* We have aligned source and destination */
347	subs	r2, r2, #8
348	blt	.Lmemmove_fl12		/* less than 12 bytes (4 from above) */
349	subs	r2, r2, #0x14
350	blt	.Lmemmove_fl32		/* less than 32 bytes (12 from above) */
351	stmdb	sp!, {r4}		/* borrow r4 */
352
353	/* blat 32 bytes at a time */
354	/* XXX for really big copies perhaps we should use more registers */
355.Lmemmove_floop32:
356	ldmia	r1!, {r3, r4, r12, lr}
357	stmia	r0!, {r3, r4, r12, lr}
358	ldmia	r1!, {r3, r4, r12, lr}
359	stmia	r0!, {r3, r4, r12, lr}
360	subs	r2, r2, #0x20
361	bge	.Lmemmove_floop32
362
363	cmn	r2, #0x10
364	ldmiage	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
365	stmiage	r0!, {r3, r4, r12, lr}
366	subge	r2, r2, #0x10
367	ldmia	sp!, {r4}		/* return r4 */
368
369.Lmemmove_fl32:
370	adds	r2, r2, #0x14
371
372	/* blat 12 bytes at a time */
373.Lmemmove_floop12:
374	ldmiage	r1!, {r3, r12, lr}
375	stmiage	r0!, {r3, r12, lr}
376	subsge	r2, r2, #0x0c
377	bge	.Lmemmove_floop12
378
379.Lmemmove_fl12:
380	adds	r2, r2, #8
381	blt	.Lmemmove_fl4
382
383	subs	r2, r2, #4
384	ldrlt	r3, [r1], #4
385	strlt	r3, [r0], #4
386	ldmiage	r1!, {r3, r12}
387	stmiage	r0!, {r3, r12}
388	subge	r2, r2, #4
389
390.Lmemmove_fl4:
391	/* less than 4 bytes to go */
392	adds	r2, r2, #4
393	ldmiaeq	sp!, {r0, pc}		/* done */
394
395	/* copy the crud byte at a time */
396	cmp	r2, #2
397	ldrb	r3, [r1], #1
398	strb	r3, [r0], #1
399	ldrbge	r3, [r1], #1
400	strbge	r3, [r0], #1
401	ldrbgt	r3, [r1], #1
402	strbgt	r3, [r0], #1
403	ldmia	sp!, {r0, pc}
404
405	/* erg - unaligned destination */
406.Lmemmove_fdestul:
407	rsb	r12, r12, #4
408	cmp	r12, #2
409
410	/* align destination with byte copies */
411	ldrb	r3, [r1], #1
412	strb	r3, [r0], #1
413	ldrbge	r3, [r1], #1
414	strbge	r3, [r0], #1
415	ldrbgt	r3, [r1], #1
416	strbgt	r3, [r0], #1
417	subs	r2, r2, r12
418	blt	.Lmemmove_fl4		/* less than 4 bytes */
419
420	ands	r12, r1, #3
421	beq	.Lmemmove_ft8		/* we have an aligned source */
422
	/*
	 * Forward copy with src misaligned by r12 (1, 2 or 3) relative
	 * to the word-aligned dst.  Words are read from the rounded-down
	 * src and stitched together with lsr/lsl shift-and-merge; lr
	 * always carries the partial word spanning the boundary.
	 */
423	/* erg - unaligned source */
424	/* This is where it gets nasty ... */
425.Lmemmove_fsrcul:
426	bic	r1, r1, #3
427	ldr	lr, [r1], #4
428	cmp	r12, #2
429	bgt	.Lmemmove_fsrcul3
430	beq	.Lmemmove_fsrcul2
431	cmp	r2, #0x0c
432	blt	.Lmemmove_fsrcul1loop4
433	sub	r2, r2, #0x0c
434	stmdb	sp!, {r4, r5}
435
	/* src offset 1: merge with lsr #8 / lsl #24, 16 bytes per pass */
436.Lmemmove_fsrcul1loop16:
437	mov	r3, lr, lsr #8
438	ldmia	r1!, {r4, r5, r12, lr}
439	orr	r3, r3, r4, lsl #24
440	mov	r4, r4, lsr #8
441	orr	r4, r4, r5, lsl #24
442	mov	r5, r5, lsr #8
443	orr	r5, r5, r12, lsl #24
444	mov	r12, r12, lsr #8
445	orr	r12, r12, lr, lsl #24
446	stmia	r0!, {r3-r5, r12}
447	subs	r2, r2, #0x10
448	bge	.Lmemmove_fsrcul1loop16
449	ldmia	sp!, {r4, r5}
450	adds	r2, r2, #0x0c
451	blt	.Lmemmove_fsrcul1l4
452
453.Lmemmove_fsrcul1loop4:
454	mov	r12, lr, lsr #8
455	ldr	lr, [r1], #4
456	orr	r12, r12, lr, lsl #24
457	str	r12, [r0], #4
458	subs	r2, r2, #4
459	bge	.Lmemmove_fsrcul1loop4
460
461.Lmemmove_fsrcul1l4:
462	sub	r1, r1, #3		/* restore true (unaligned) src */
463	b	.Lmemmove_fl4
464
465.Lmemmove_fsrcul2:
466	cmp	r2, #0x0c
467	blt	.Lmemmove_fsrcul2loop4
468	sub	r2, r2, #0x0c
469	stmdb	sp!, {r4, r5}
470
	/* src offset 2: merge with lsr #16 / lsl #16 */
471.Lmemmove_fsrcul2loop16:
472	mov	r3, lr, lsr #16
473	ldmia	r1!, {r4, r5, r12, lr}
474	orr	r3, r3, r4, lsl #16
475	mov	r4, r4, lsr #16
476	orr	r4, r4, r5, lsl #16
477	mov	r5, r5, lsr #16
478	orr	r5, r5, r12, lsl #16
479	mov	r12, r12, lsr #16
480	orr	r12, r12, lr, lsl #16
481	stmia	r0!, {r3-r5, r12}
482	subs	r2, r2, #0x10
483	bge	.Lmemmove_fsrcul2loop16
484	ldmia	sp!, {r4, r5}
485	adds	r2, r2, #0x0c
486	blt	.Lmemmove_fsrcul2l4
487
488.Lmemmove_fsrcul2loop4:
489	mov	r12, lr, lsr #16
490	ldr	lr, [r1], #4
491	orr	r12, r12, lr, lsl #16
492	str	r12, [r0], #4
493	subs	r2, r2, #4
494	bge	.Lmemmove_fsrcul2loop4
495
496.Lmemmove_fsrcul2l4:
497	sub	r1, r1, #2		/* restore true (unaligned) src */
498	b	.Lmemmove_fl4
499
500.Lmemmove_fsrcul3:
501	cmp	r2, #0x0c
502	blt	.Lmemmove_fsrcul3loop4
503	sub	r2, r2, #0x0c
504	stmdb	sp!, {r4, r5}
505
	/* src offset 3: merge with lsr #24 / lsl #8 */
506.Lmemmove_fsrcul3loop16:
507	mov	r3, lr, lsr #24
508	ldmia	r1!, {r4, r5, r12, lr}
509	orr	r3, r3, r4, lsl #8
510	mov	r4, r4, lsr #24
511	orr	r4, r4, r5, lsl #8
512	mov	r5, r5, lsr #24
513	orr	r5, r5, r12, lsl #8
514	mov	r12, r12, lsr #24
515	orr	r12, r12, lr, lsl #8
516	stmia	r0!, {r3-r5, r12}
517	subs	r2, r2, #0x10
518	bge	.Lmemmove_fsrcul3loop16
519	ldmia	sp!, {r4, r5}
520	adds	r2, r2, #0x0c
521	blt	.Lmemmove_fsrcul3l4
522
523.Lmemmove_fsrcul3loop4:
524	mov	r12, lr, lsr #24
525	ldr	lr, [r1], #4
526	orr	r12, r12, lr, lsl #8
527	str	r12, [r0], #4
528	subs	r2, r2, #4
529	bge	.Lmemmove_fsrcul3loop4
530
531.Lmemmove_fsrcul3l4:
532	sub	r1, r1, #1		/* restore true (unaligned) src */
533	b	.Lmemmove_fl4
534
	/*
	 * Backward copy (dst > src with overlap).  Both pointers are
	 * moved to one past the end, then everything runs in descending
	 * order (ldmdb/stmdb, pre-decrement byte ops).  r0 is never
	 * clobbered here, so no need to stack it for the return value.
	 */
535.Lmemmove_backwards:
536	add	r1, r1, r2
537	add	r0, r0, r2
538	subs	r2, r2, #4
539	blt	.Lmemmove_bl4		/* less than 4 bytes */
540	ands	r12, r0, #3
541	bne	.Lmemmove_bdestul	/* oh unaligned destination addr */
542	ands	r12, r1, #3
543	bne	.Lmemmove_bsrcul		/* oh unaligned source addr */
544
545.Lmemmove_bt8:
546	/* We have aligned source and destination */
547	subs	r2, r2, #8
548	blt	.Lmemmove_bl12		/* less than 12 bytes (4 from above) */
549	stmdb	sp!, {r4, lr}
550	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
551	blt	.Lmemmove_bl32
552
553	/* blat 32 bytes at a time */
554	/* XXX for really big copies perhaps we should use more registers */
555.Lmemmove_bloop32:
556	ldmdb	r1!, {r3, r4, r12, lr}
557	stmdb	r0!, {r3, r4, r12, lr}
558	ldmdb	r1!, {r3, r4, r12, lr}
559	stmdb	r0!, {r3, r4, r12, lr}
560	subs	r2, r2, #0x20
561	bge	.Lmemmove_bloop32
562
563.Lmemmove_bl32:
564	cmn	r2, #0x10
565	ldmdbge	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
566	stmdbge	r0!, {r3, r4, r12, lr}
567	subge	r2, r2, #0x10
568	adds	r2, r2, #0x14
569	ldmdbge	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
570	stmdbge	r0!, {r3, r12, lr}
571	subge	r2, r2, #0x0c
572	ldmia	sp!, {r4, lr}
573
574.Lmemmove_bl12:
575	adds	r2, r2, #8
576	blt	.Lmemmove_bl4
577	subs	r2, r2, #4
578	ldrlt	r3, [r1, #-4]!
579	strlt	r3, [r0, #-4]!
580	ldmdbge	r1!, {r3, r12}
581	stmdbge	r0!, {r3, r12}
582	subge	r2, r2, #4
583
584.Lmemmove_bl4:
585	/* less than 4 bytes to go */
586	adds	r2, r2, #4
587	RETeq			/* done */
588
589	/* copy the crud byte at a time */
590	cmp	r2, #2
591	ldrb	r3, [r1, #-1]!
592	strb	r3, [r0, #-1]!
593	ldrbge	r3, [r1, #-1]!
594	strbge	r3, [r0, #-1]!
595	ldrbgt	r3, [r1, #-1]!
596	strbgt	r3, [r0, #-1]!
597	RET
598
	/* r12 = dst & 3 = number of bytes needed to reach a boundary */
599	/* erg - unaligned destination */
600.Lmemmove_bdestul:
601	cmp	r12, #2
602
603	/* align destination with byte copies */
604	ldrb	r3, [r1, #-1]!
605	strb	r3, [r0, #-1]!
606	ldrbge	r3, [r1, #-1]!
607	strbge	r3, [r0, #-1]!
608	ldrbgt	r3, [r1, #-1]!
609	strbgt	r3, [r0, #-1]!
610	subs	r2, r2, r12
611	blt	.Lmemmove_bl4		/* less than 4 bytes to go */
612	ands	r12, r1, #3
613	beq	.Lmemmove_bt8		/* we have an aligned source */
614
	/*
	 * Backward copy, src misaligned by r12 (1, 2 or 3).  Mirror of
	 * the forward fsrcul cases: r3 carries the boundary-spanning
	 * partial word, merged with lsl/lsr as the pointers descend.
	 */
615	/* erg - unaligned source */
616	/* This is where it gets nasty ... */
617.Lmemmove_bsrcul:
618	bic	r1, r1, #3
619	ldr	r3, [r1, #0]
620	cmp	r12, #2
621	blt	.Lmemmove_bsrcul1
622	beq	.Lmemmove_bsrcul2
623	cmp	r2, #0x0c
624	blt	.Lmemmove_bsrcul3loop4
625	sub	r2, r2, #0x0c
626	stmdb	sp!, {r4, r5, lr}
627
	/* src offset 3, descending: merge with lsl #8 / lsr #24 */
628.Lmemmove_bsrcul3loop16:
629	mov	lr, r3, lsl #8
630	ldmdb	r1!, {r3-r5, r12}
631	orr	lr, lr, r12, lsr #24
632	mov	r12, r12, lsl #8
633	orr	r12, r12, r5, lsr #24
634	mov	r5, r5, lsl #8
635	orr	r5, r5, r4, lsr #24
636	mov	r4, r4, lsl #8
637	orr	r4, r4, r3, lsr #24
638	stmdb	r0!, {r4, r5, r12, lr}
639	subs	r2, r2, #0x10
640	bge	.Lmemmove_bsrcul3loop16
641	ldmia	sp!, {r4, r5, lr}
642	adds	r2, r2, #0x0c
643	blt	.Lmemmove_bsrcul3l4
644
645.Lmemmove_bsrcul3loop4:
646	mov	r12, r3, lsl #8
647	ldr	r3, [r1, #-4]!
648	orr	r12, r12, r3, lsr #24
649	str	r12, [r0, #-4]!
650	subs	r2, r2, #4
651	bge	.Lmemmove_bsrcul3loop4
652
653.Lmemmove_bsrcul3l4:
654	add	r1, r1, #3		/* restore true (unaligned) src */
655	b	.Lmemmove_bl4
656
657.Lmemmove_bsrcul2:
658	cmp	r2, #0x0c
659	blt	.Lmemmove_bsrcul2loop4
660	sub	r2, r2, #0x0c
661	stmdb	sp!, {r4, r5, lr}
662
	/* src offset 2, descending: merge with lsl #16 / lsr #16 */
663.Lmemmove_bsrcul2loop16:
664	mov	lr, r3, lsl #16
665	ldmdb	r1!, {r3-r5, r12}
666	orr	lr, lr, r12, lsr #16
667	mov	r12, r12, lsl #16
668	orr	r12, r12, r5, lsr #16
669	mov	r5, r5, lsl #16
670	orr	r5, r5, r4, lsr #16
671	mov	r4, r4, lsl #16
672	orr	r4, r4, r3, lsr #16
673	stmdb	r0!, {r4, r5, r12, lr}
674	subs	r2, r2, #0x10
675	bge	.Lmemmove_bsrcul2loop16
676	ldmia	sp!, {r4, r5, lr}
677	adds	r2, r2, #0x0c
678	blt	.Lmemmove_bsrcul2l4
679
680.Lmemmove_bsrcul2loop4:
681	mov	r12, r3, lsl #16
682	ldr	r3, [r1, #-4]!
683	orr	r12, r12, r3, lsr #16
684	str	r12, [r0, #-4]!
685	subs	r2, r2, #4
686	bge	.Lmemmove_bsrcul2loop4
687
688.Lmemmove_bsrcul2l4:
689	add	r1, r1, #2		/* restore true (unaligned) src */
690	b	.Lmemmove_bl4
691
692.Lmemmove_bsrcul1:
693	cmp	r2, #0x0c
694	blt	.Lmemmove_bsrcul1loop4
695	sub	r2, r2, #0x0c
696	stmdb	sp!, {r4, r5, lr}
697
	/*
	 * src offset 1, descending: merge with lsl #24 / lsr #8.
	 * NB: despite the "loop32" name this moves 16 bytes per
	 * iteration, like its siblings above.
	 */
698.Lmemmove_bsrcul1loop32:
699	mov	lr, r3, lsl #24
700	ldmdb	r1!, {r3-r5, r12}
701	orr	lr, lr, r12, lsr #8
702	mov	r12, r12, lsl #24
703	orr	r12, r12, r5, lsr #8
704	mov	r5, r5, lsl #24
705	orr	r5, r5, r4, lsr #8
706	mov	r4, r4, lsl #24
707	orr	r4, r4, r3, lsr #8
708	stmdb	r0!, {r4, r5, r12, lr}
709	subs	r2, r2, #0x10
710	bge	.Lmemmove_bsrcul1loop32
711	ldmia	sp!, {r4, r5, lr}
712	adds	r2, r2, #0x0c
713	blt	.Lmemmove_bsrcul1l4
714
715.Lmemmove_bsrcul1loop4:
716	mov	r12, r3, lsl #24
717	ldr	r3, [r1, #-4]!
718	orr	r12, r12, r3, lsr #8
719	str	r12, [r0, #-4]!
720	subs	r2, r2, #4
721	bge	.Lmemmove_bsrcul1loop4
722
723.Lmemmove_bsrcul1l4:
724	add	r1, r1, #1		/* restore true (unaligned) src */
725	b	.Lmemmove_bl4
726END(memmove)
727
728/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
729ENTRY(memcpy)
730	pld	[r1]
731	cmp	r2, #0x0c
732	ble	.Lmemcpy_short		/* <= 12 bytes */
733#ifdef FLASHADDR
734#if FLASHADDR > PHYSADDR
735	ldr	r3, =FLASHADDR
736	cmp	r3, pc
737	bls	.Lnormal
738#else
739	ldr	r3, =FLASHADDR
740	cmp	r3, pc
741	bhi	.Lnormal
742#endif
743#endif
744	mov	r3, r0			/* We must not clobber r0 */
745
746	/* Word-align the destination buffer */
747	ands	ip, r3, #0x03		/* Already word aligned? */
748	beq	.Lmemcpy_wordaligned	/* Yup */
749	cmp	ip, #0x02
750	ldrb	ip, [r1], #0x01
751	sub	r2, r2, #0x01
752	strb	ip, [r3], #0x01
753	ldrble	ip, [r1], #0x01
754	suble	r2, r2, #0x01
755	strble	ip, [r3], #0x01
756	ldrblt	ip, [r1], #0x01
757	sublt	r2, r2, #0x01
758	strblt	ip, [r3], #0x01
759
760	/* Destination buffer is now word aligned */
761.Lmemcpy_wordaligned:
762	ands	ip, r1, #0x03		/* Is src also word-aligned? */
763	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */
764
765	/* Quad-align the destination buffer */
766	tst	r3, #0x07		/* Already quad aligned? */
767	ldrne	ip, [r1], #0x04
768	stmfd	sp!, {r4-r9}		/* Free up some registers */
769	subne	r2, r2, #0x04
770	strne	ip, [r3], #0x04
771
772	/* Destination buffer quad aligned, source is at least word aligned */
773	subs	r2, r2, #0x80
774	blt	.Lmemcpy_w_lessthan128
775
776	/* Copy 128 bytes at a time */
777.Lmemcpy_w_loop128:
778	ldr	r4, [r1], #0x04		/* LD:00-03 */
779	ldr	r5, [r1], #0x04		/* LD:04-07 */
780	pld	[r1, #0x18]		/* Prefetch 0x20 */
781	ldr	r6, [r1], #0x04		/* LD:08-0b */
782	ldr	r7, [r1], #0x04		/* LD:0c-0f */
783	ldr	r8, [r1], #0x04		/* LD:10-13 */
784	ldr	r9, [r1], #0x04		/* LD:14-17 */
785	strd	r4, [r3], #0x08		/* ST:00-07 */
786	ldr	r4, [r1], #0x04		/* LD:18-1b */
787	ldr	r5, [r1], #0x04		/* LD:1c-1f */
788	strd	r6, [r3], #0x08		/* ST:08-0f */
789	ldr	r6, [r1], #0x04		/* LD:20-23 */
790	ldr	r7, [r1], #0x04		/* LD:24-27 */
791	pld	[r1, #0x18]		/* Prefetch 0x40 */
792	strd	r8, [r3], #0x08		/* ST:10-17 */
793	ldr	r8, [r1], #0x04		/* LD:28-2b */
794	ldr	r9, [r1], #0x04		/* LD:2c-2f */
795	strd	r4, [r3], #0x08		/* ST:18-1f */
796	ldr	r4, [r1], #0x04		/* LD:30-33 */
797	ldr	r5, [r1], #0x04		/* LD:34-37 */
798	strd	r6, [r3], #0x08		/* ST:20-27 */
799	ldr	r6, [r1], #0x04		/* LD:38-3b */
800	ldr	r7, [r1], #0x04		/* LD:3c-3f */
801	strd	r8, [r3], #0x08		/* ST:28-2f */
802	ldr	r8, [r1], #0x04		/* LD:40-43 */
803	ldr	r9, [r1], #0x04		/* LD:44-47 */
804	pld	[r1, #0x18]		/* Prefetch 0x60 */
805	strd	r4, [r3], #0x08		/* ST:30-37 */
806	ldr	r4, [r1], #0x04		/* LD:48-4b */
807	ldr	r5, [r1], #0x04		/* LD:4c-4f */
808	strd	r6, [r3], #0x08		/* ST:38-3f */
809	ldr	r6, [r1], #0x04		/* LD:50-53 */
810	ldr	r7, [r1], #0x04		/* LD:54-57 */
811	strd	r8, [r3], #0x08		/* ST:40-47 */
812	ldr	r8, [r1], #0x04		/* LD:58-5b */
813	ldr	r9, [r1], #0x04		/* LD:5c-5f */
814	strd	r4, [r3], #0x08		/* ST:48-4f */
815	ldr	r4, [r1], #0x04		/* LD:60-63 */
816	ldr	r5, [r1], #0x04		/* LD:64-67 */
817	pld	[r1, #0x18]		/* Prefetch 0x80 */
818	strd	r6, [r3], #0x08		/* ST:50-57 */
819	ldr	r6, [r1], #0x04		/* LD:68-6b */
820	ldr	r7, [r1], #0x04		/* LD:6c-6f */
821	strd	r8, [r3], #0x08		/* ST:58-5f */
822	ldr	r8, [r1], #0x04		/* LD:70-73 */
823	ldr	r9, [r1], #0x04		/* LD:74-77 */
824	strd	r4, [r3], #0x08		/* ST:60-67 */
825	ldr	r4, [r1], #0x04		/* LD:78-7b */
826	ldr	r5, [r1], #0x04		/* LD:7c-7f */
827	strd	r6, [r3], #0x08		/* ST:68-6f */
828	strd	r8, [r3], #0x08		/* ST:70-77 */
829	subs	r2, r2, #0x80
830	strd	r4, [r3], #0x08		/* ST:78-7f */
831	bge	.Lmemcpy_w_loop128
832
833.Lmemcpy_w_lessthan128:
834	adds	r2, r2, #0x80		/* Adjust for extra sub */
835	ldmfdeq	sp!, {r4-r9}
836	RETeq			/* Return now if done */
837	subs	r2, r2, #0x20
838	blt	.Lmemcpy_w_lessthan32
839
840	/* Copy 32 bytes at a time */
841.Lmemcpy_w_loop32:
842	ldr	r4, [r1], #0x04
843	ldr	r5, [r1], #0x04
844	pld	[r1, #0x18]
845	ldr	r6, [r1], #0x04
846	ldr	r7, [r1], #0x04
847	ldr	r8, [r1], #0x04
848	ldr	r9, [r1], #0x04
849	strd	r4, [r3], #0x08
850	ldr	r4, [r1], #0x04
851	ldr	r5, [r1], #0x04
852	strd	r6, [r3], #0x08
853	strd	r8, [r3], #0x08
854	subs	r2, r2, #0x20
855	strd	r4, [r3], #0x08
856	bge	.Lmemcpy_w_loop32
857
858.Lmemcpy_w_lessthan32:
859	adds	r2, r2, #0x20		/* Adjust for extra sub */
860	ldmfdeq	sp!, {r4-r9}
861	RETeq			/* Return now if done */
862
863	and	r4, r2, #0x18
864	rsbs	r4, r4, #0x18
865	addne	pc, pc, r4, lsl #1
866	nop
867
868	/* At least 24 bytes remaining */
869	ldr	r4, [r1], #0x04
870	ldr	r5, [r1], #0x04
871	sub	r2, r2, #0x08
872	strd	r4, [r3], #0x08
873
874	/* At least 16 bytes remaining */
875	ldr	r4, [r1], #0x04
876	ldr	r5, [r1], #0x04
877	sub	r2, r2, #0x08
878	strd	r4, [r3], #0x08
879
880	/* At least 8 bytes remaining */
881	ldr	r4, [r1], #0x04
882	ldr	r5, [r1], #0x04
883	subs	r2, r2, #0x08
884	strd	r4, [r3], #0x08
885
886	/* Less than 8 bytes remaining */
887	ldmfd	sp!, {r4-r9}
888	RETeq			/* Return now if done */
889	subs	r2, r2, #0x04
890	ldrge	ip, [r1], #0x04
891	strge	ip, [r3], #0x04
892	RETeq			/* Return now if done */
893	addlt	r2, r2, #0x04
894	ldrb	ip, [r1], #0x01
895	cmp	r2, #0x02
896	ldrbge	r2, [r1], #0x01
897	strb	ip, [r3], #0x01
898	ldrbgt	ip, [r1]
899	strbge	r2, [r3], #0x01
900	strbgt	ip, [r3]
901	RET
902/* Place a literal pool here for the above ldr instructions to use */
903.ltorg
904
905
906/*
907 * At this point, it has not been possible to word align both buffers.
908 * The destination buffer is word aligned, but the source buffer is not.
909 */
910.Lmemcpy_bad_align:
911	stmfd	sp!, {r4-r7}
912	bic	r1, r1, #0x03
913	cmp	ip, #2
914	ldr	ip, [r1], #0x04
915	bgt	.Lmemcpy_bad3
916	beq	.Lmemcpy_bad2
917	b	.Lmemcpy_bad1
918
919.Lmemcpy_bad1_loop16:
920	mov	r4, ip, lsr #8
921	ldr	r5, [r1], #0x04
922	pld	[r1, #0x018]
923	ldr	r6, [r1], #0x04
924	ldr	r7, [r1], #0x04
925	ldr	ip, [r1], #0x04
926	orr	r4, r4, r5, lsl #24
927	mov	r5, r5, lsr #8
928	orr	r5, r5, r6, lsl #24
929	mov	r6, r6, lsr #8
930	orr	r6, r6, r7, lsl #24
931	mov	r7, r7, lsr #8
932	orr	r7, r7, ip, lsl #24
933	str	r4, [r3], #0x04
934	str	r5, [r3], #0x04
935	str	r6, [r3], #0x04
936	str	r7, [r3], #0x04
937.Lmemcpy_bad1:
938	subs	r2, r2, #0x10
939	bge	.Lmemcpy_bad1_loop16
940
941	adds	r2, r2, #0x10
942	ldmfdeq	sp!, {r4-r7}
943	RETeq			/* Return now if done */
944	subs	r2, r2, #0x04
945	sublt	r1, r1, #0x03
946	blt	.Lmemcpy_bad_done
947
948.Lmemcpy_bad1_loop4:
949	mov	r4, ip, lsr #8
950	ldr	ip, [r1], #0x04
951	subs	r2, r2, #0x04
952	orr	r4, r4, ip, lsl #24
953	str	r4, [r3], #0x04
954	bge	.Lmemcpy_bad1_loop4
955	sub	r1, r1, #0x03
956	b	.Lmemcpy_bad_done
957
958.Lmemcpy_bad2_loop16:
959	mov	r4, ip, lsr #16
960	ldr	r5, [r1], #0x04
961	pld	[r1, #0x018]
962	ldr	r6, [r1], #0x04
963	ldr	r7, [r1], #0x04
964	ldr	ip, [r1], #0x04
965	orr	r4, r4, r5, lsl #16
966	mov	r5, r5, lsr #16
967	orr	r5, r5, r6, lsl #16
968	mov	r6, r6, lsr #16
969	orr	r6, r6, r7, lsl #16
970	mov	r7, r7, lsr #16
971	orr	r7, r7, ip, lsl #16
972	str	r4, [r3], #0x04
973	str	r5, [r3], #0x04
974	str	r6, [r3], #0x04
975	str	r7, [r3], #0x04
976.Lmemcpy_bad2:
977	subs	r2, r2, #0x10
978	bge	.Lmemcpy_bad2_loop16
979
980	adds	r2, r2, #0x10
981	ldmfdeq	sp!, {r4-r7}
982	RETeq			/* Return now if done */
983	subs	r2, r2, #0x04
984	sublt	r1, r1, #0x02
985	blt	.Lmemcpy_bad_done
986
987.Lmemcpy_bad2_loop4:
988	mov	r4, ip, lsr #16
989	ldr	ip, [r1], #0x04
990	subs	r2, r2, #0x04
991	orr	r4, r4, ip, lsl #16
992	str	r4, [r3], #0x04
993	bge	.Lmemcpy_bad2_loop4
994	sub	r1, r1, #0x02
995	b	.Lmemcpy_bad_done
996
997.Lmemcpy_bad3_loop16:
998	mov	r4, ip, lsr #24
999	ldr	r5, [r1], #0x04
1000	pld	[r1, #0x018]
1001	ldr	r6, [r1], #0x04
1002	ldr	r7, [r1], #0x04
1003	ldr	ip, [r1], #0x04
1004	orr	r4, r4, r5, lsl #8
1005	mov	r5, r5, lsr #24
1006	orr	r5, r5, r6, lsl #8
1007	mov	r6, r6, lsr #24
1008	orr	r6, r6, r7, lsl #8
1009	mov	r7, r7, lsr #24
1010	orr	r7, r7, ip, lsl #8
1011	str	r4, [r3], #0x04
1012	str	r5, [r3], #0x04
1013	str	r6, [r3], #0x04
1014	str	r7, [r3], #0x04
1015.Lmemcpy_bad3:
1016	subs	r2, r2, #0x10
1017	bge	.Lmemcpy_bad3_loop16
1018
1019	adds	r2, r2, #0x10
1020	ldmfdeq	sp!, {r4-r7}
1021	RETeq			/* Return now if done */
1022	subs	r2, r2, #0x04
1023	sublt	r1, r1, #0x01
1024	blt	.Lmemcpy_bad_done
1025
1026.Lmemcpy_bad3_loop4:
1027	mov	r4, ip, lsr #24
1028	ldr	ip, [r1], #0x04
1029	subs	r2, r2, #0x04
1030	orr	r4, r4, ip, lsl #8
1031	str	r4, [r3], #0x04
1032	bge	.Lmemcpy_bad3_loop4
1033	sub	r1, r1, #0x01
1034
1035.Lmemcpy_bad_done:
1036	ldmfd	sp!, {r4-r7}
1037	adds	r2, r2, #0x04
1038	RETeq
1039	ldrb	ip, [r1], #0x01
1040	cmp	r2, #0x02
1041	ldrbge	r2, [r1], #0x01
1042	strb	ip, [r3], #0x01
1043	ldrbgt	ip, [r1]
1044	strbge	r2, [r3], #0x01
1045	strbgt	ip, [r3]
1046	RET
1047
1048
1049/*
1050 * Handle short copies (less than 16 bytes), possibly misaligned.
1051 * Some of these are *very* common, thanks to the network stack,
1052 * and so are handled specially.
1053 */
1054.Lmemcpy_short:
1055	add	pc, pc, r2, lsl #2
1056	nop
1057	RET			/* 0x00 */
1058	b	.Lmemcpy_bytewise	/* 0x01 */
1059	b	.Lmemcpy_bytewise	/* 0x02 */
1060	b	.Lmemcpy_bytewise	/* 0x03 */
1061	b	.Lmemcpy_4		/* 0x04 */
1062	b	.Lmemcpy_bytewise	/* 0x05 */
1063	b	.Lmemcpy_6		/* 0x06 */
1064	b	.Lmemcpy_bytewise	/* 0x07 */
1065	b	.Lmemcpy_8		/* 0x08 */
1066	b	.Lmemcpy_bytewise	/* 0x09 */
1067	b	.Lmemcpy_bytewise	/* 0x0a */
1068	b	.Lmemcpy_bytewise	/* 0x0b */
1069	b	.Lmemcpy_c		/* 0x0c */
1070.Lmemcpy_bytewise:
1071	mov	r3, r0			/* We must not clobber r0 */
1072	ldrb	ip, [r1], #0x01
10731:	subs	r2, r2, #0x01
1074	strb	ip, [r3], #0x01
1075	ldrbne	ip, [r1], #0x01
1076	bne	1b
1077	RET
1078
1079/******************************************************************************
1080 * Special case for 4 byte copies
1081 */
#define	LMEMCPY_4_LOG2	6	/* 64 bytes */
#define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
	LMEMCPY_4_PAD
.Lmemcpy_4:
	/*
	 * Dispatch to one of 16 alignment-specific 4-byte copies.  The
	 * index is ((dst & 3) << 2) | (src & 3), and each case below is
	 * padded to exactly 64 bytes (LMEMCPY_4_PAD).  "pc - 0x14" is the
	 * address of the "and" above (pc reads as current insn + 8, and
	 * the sub sits 12 bytes past the "and"), i.e. r3 = .Lmemcpy_4.
	 * Case 0000 is the fall-through when the index is zero; case N
	 * (N != 0) lives at .Lmemcpy_4 + N*64.
	 */
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	str	r2, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r3, [r1, #0x02]
	ldrh	r2, [r1]
	orr	r3, r2, r3, lsl #16
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
	mov	r3, r3, lsr #24		/* r3 = ...0 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0, #0x03]
	strh	r3, [r0, #0x01]
	RET
	LMEMCPY_4_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	mov	r3, r3, lsr #8		/* r3 = ...3 */
	strh	r2, [r0, #0x01]
	strb	r3, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	strh	r2, [r0]
	mov	r3, r2, lsr #16
	strh	r3, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
	strh	r2, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldrh	r3, [r1, #0x02]
	strh	r2, [r0]
	strh	r3, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
	strh	r1, [r0, #0x02]
	mov	r3, r3, lsl #8		/* r3 = 321. */
	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
	strh	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsr #8		/* r3 = ...3 */
	strb	r3, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD
1277
1278
1279/******************************************************************************
1280 * Special case for 6 byte copies
1281 */
#define	LMEMCPY_6_LOG2	6	/* 64 bytes */
#define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
	LMEMCPY_6_PAD
.Lmemcpy_6:
	/*
	 * Dispatch to one of 16 alignment-specific 6-byte copies.  Index
	 * is ((dst & 3) << 2) | (src & 3); each case is padded to 64
	 * bytes.  "pc - 0x14" (pc reads as current insn + 8) is the
	 * address of the "and" above, so r3 = .Lmemcpy_6 and case N
	 * (N != 0) is at .Lmemcpy_6 + N*64; case 0000 falls through.
	 */
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldrh	r3, [r1, #0x04]
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
	mov	r2, r2, lsr #8		/* r2 = .210 */
	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	mov	r1, r3, lsr #16		/* r1 = ..54 */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	strh	r1, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
	mov	r2, r2, lsr #24		/* r2 = ...0 */
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	mov	r1, r1, lsl #8		/* r1 = xx5. */
	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
	str	r2, [r0]
	strh	r1, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
	strb	r3, [r0]
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
	mov	r2, r2, lsr #8		/* r2 = ...5 */
	strh	r3, [r0, #0x03]
	strb	r2, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
	strb	r2, [r0]
	mov	r3, r1, lsr #24
	strb	r3, [r0, #0x05]
	mov	r3, r1, lsr #8		/* r3 = .543 */
	strh	r3, [r0, #0x03]
	mov	r3, r2, lsr #8		/* r3 = ...1 */
	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
	strh	r3, [r0, #0x01]
	RET
	LMEMCPY_6_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
	ldr	r3, [r1]		/* r3 = 3210 */
	mov	r2, r2, lsl #16		/* r2 = 54.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
	strh	r3, [r0]
	str	r2, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	mov	r2, r2, lsl #8		/* r2 = 543. */
	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
	strh	r1, [r0]
	str	r2, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	r3, [r1, #0x02]
	strh	r2, [r0]
	str	r3, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 5... */
	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
	strh	r3, [r0]
	str	r1, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
	mov	r1, r1, lsr #8		/* r1 = ...5 */
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
	mov	r1, r1, lsr #24		/* r1 = ...5 */
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	str	r3, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD
1504
1505
1506/******************************************************************************
1507 * Special case for 8 byte copies
1508 */
#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
	LMEMCPY_8_PAD
.Lmemcpy_8:
	/*
	 * Dispatch to one of 16 alignment-specific 8-byte copies.  Index
	 * is ((dst & 3) << 2) | (src & 3); each case is padded to 64
	 * bytes.  "pc - 0x14" (pc reads as current insn + 8) is the
	 * address of the "and" above, so r3 = .Lmemcpy_8 and case N
	 * (N != 0) is at .Lmemcpy_8 + N*64; case 0000 falls through.
	 */
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldr	r3, [r1, #0x04]
	str	r2, [r0]
	str	r3, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 7... */
	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
	str	r3, [r0]
	str	r2, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
	str	r2, [r0]
	str	r3, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r2, r2, lsr #24		/* r2 = ...4 */
	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
	str	r3, [r0]
	str	r2, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
	strb	r3, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r1, r3, lsr #8		/* r1 = .321 */
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
	strh	r1, [r0, #0x01]
	str	r3, [r0, #0x03]
	RET
	LMEMCPY_8_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldr	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x07]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	str	ip, [r0, #0x03]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
	strb	r2, [r0]		/* 0 */
	mov	ip, r1, lsr #8		/* ip = ...7 */
	strb	ip, [r0, #0x07]		/* 7 */
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r3, r3, lsr #8		/* r3 = .543 */
	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
	strh	ip, [r0, #0x01]
	str	r3, [r0, #0x03]
	RET
	LMEMCPY_8_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	strb	r3, [r0]
	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
	strh	ip, [r0, #0x01]
	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
	str	r2, [r0, #0x03]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
	strh	r2, [r0]
	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
	str	r2, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...2 */
	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
	str	r1, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	ip, [r1, #0x02]
	ldrh	r3, [r1, #0x06]
	strh	r2, [r0]
	str	ip, [r0, #0x02]
	strh	r3, [r0, #0x06]
	RET
	LMEMCPY_8_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	ip, [r1]		/* ip = ...0 */
	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
	strh	r1, [r0, #0x06]
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
	str	r3, [r0, #0x02]
	strh	r2, [r0]
	RET
	LMEMCPY_8_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
	strh	r1, [r0, #0x05]
	strb	r2, [r0]
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
	str	r2, [r0, #0x01]
	RET
	LMEMCPY_8_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
	strb	r3, [r0]
	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
	strh	r3, [r0, #0x05]
	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
	strb	r2, [r0]
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r2, r1, lsr #8		/* r2 = ...7 */
	strb	r2, [r0, #0x07]
	mov	r1, r1, lsl #8		/* r1 = .76. */
	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
	str	ip, [r0, #0x01]
	strh	r1, [r0, #0x05]
	RET
	LMEMCPY_8_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldr	ip, [r1, #0x01]
	ldrh	r3, [r1, #0x05]
	ldrb	r1, [r1, #0x07]
	strb	r2, [r0]
	str	ip, [r0, #0x01]
	strh	r3, [r0, #0x05]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD
1758
1759/******************************************************************************
1760 * Special case for 12 byte copies
1761 */
#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
	LMEMCPY_C_PAD
.Lmemcpy_c:
	/*
	 * Dispatch to one of 16 alignment-specific 12-byte copies.  Index
	 * is ((dst & 3) << 2) | (src & 3); each case here needs more
	 * instructions, so the cases are padded to 128 bytes instead of
	 * 64.  "pc - 0x14" (pc reads as current insn + 8) is the address
	 * of the "and" above, so r3 = .Lmemcpy_c and case N (N != 0) is
	 * at .Lmemcpy_c + N*128; case 0000 falls through.
	 */
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldr	r3, [r1, #0x04]
	ldr	r1, [r1, #0x08]
	str	r2, [r0]
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1, #0xb]		/* r2 = ...B */
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
	mov	r2, r2, lsl #24		/* r2 = B... */
	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
	str	r2, [r0, #0x08]
	mov	r2, ip, lsl #24		/* r2 = 7... */
	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
	mov	r1, r1, lsr #8		/* r1 = .210 */
	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
	str	r2, [r0, #0x04]
	str	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
	mov	r1, r1, lsl #16		/* r1 = BA.. */
	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]		/* r2 = ...0 */
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	str	r2, [r0]
	mov	r3, r3, lsr #24		/* r3 = ...4 */
	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
	mov	r1, r1, lsl #8		/* r1 = BA9. */
	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
	str	r3, [r0, #0x04]
	str	r1, [r0, #0x08]
	RET
	LMEMCPY_C_PAD

/*
 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
	strb	r2, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...3 */
	orr	r2, r1, r3, lsl #8	/* r2 = 6543 */
	mov	r1, r3, lsr #24		/* r1 = ...7 */
	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
	mov	ip, ip, lsr #24		/* ip = ...B */
	str	r2, [r0, #0x03]
	str	r1, [r0, #0x07]
	strb	ip, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldr	ip, [r1, #0x03]
	strb	r2, [r0]
	ldr	r2, [r1, #0x07]
	ldrb	r1, [r1, #0x0b]
	strh	r3, [r0, #0x01]
	str	ip, [r0, #0x03]
	str	r2, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	strh	r2, [r0, #0x01]
	mov	r2, r3, lsr #8		/* r2 = .543 */
	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
	mov	r2, ip, lsr #8		/* r2 = .987 */
	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
	str	r3, [r0, #0x03]
	str	r2, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	mov	r3, r3, lsr #16		/* r3 = ..43 */
	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
	mov	ip, ip, lsr #16		/* ip = ..87 */
	orr	ip, ip, r1, lsl #16	/* ip = A987 */
	mov	r1, r1, lsr #16		/* r1 = ..xB */
	str	r3, [r0, #0x03]
	str	ip, [r0, #0x07]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
	strh	ip, [r0]
	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
	mov	r3, r3, lsr #16		/* r3 = ..76 */
	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
	mov	r2, r2, lsr #16		/* r2 = ..BA */
	str	r1, [r0, #0x02]
	str	r3, [r0, #0x06]
	strh	r2, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
	strh	ip, [r0]
	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
	mov	r3, r3, lsr #24		/* r3 = ...6 */
	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
	mov	r1, r1, lsl #8		/* r1 = ..B. */
	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
	str	r2, [r0, #0x02]
	str	r3, [r0, #0x06]
	strh	r1, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	r3, [r1, #0x02]
	ldr	ip, [r1, #0x06]
	ldrh	r1, [r1, #0x0a]
	strh	r2, [r0]
	str	r3, [r0, #0x02]
	str	ip, [r0, #0x06]
	strh	r1, [r0, #0x0a]
	RET
	LMEMCPY_C_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
 */
	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
	strh	ip, [r0, #0x0a]
	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
	ldrb	r1, [r1]		/* r1 = ...0 */
	mov	r2, r2, lsl #24		/* r2 = 9... */
	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
	mov	r3, r3, lsl #24		/* r3 = 5... */
	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
	str	r2, [r0, #0x06]
	str	r3, [r0, #0x02]
	strh	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
	strb	r2, [r0]
	mov	r3, r2, lsr #8		/* r3 = .321 */
	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
	str	r3, [r0, #0x01]
	mov	r3, ip, lsr #8		/* r3 = .765 */
	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
	str	r3, [r0, #0x05]
	mov	r1, r1, lsr #8		/* r1 = .BA9 */
	strh	r1, [r0, #0x09]
	mov	r1, r1, lsr #16		/* r1 = ...B */
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
 */
	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
	strb	r2, [r0, #0x0b]
	mov	r2, r3, lsr #16		/* r2 = ..A9 */
	strh	r2, [r0, #0x09]
	mov	r3, r3, lsl #16		/* r3 = 87.. */
	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
	mov	ip, ip, lsl #16		/* ip = 43.. */
	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
	mov	r1, r1, lsr #8		/* r1 = .210 */
	str	r3, [r0, #0x05]
	str	ip, [r0, #0x01]
	strb	r1, [r0]
	RET
	LMEMCPY_C_PAD

/*
 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
	ldr	ip, [r1, #0x06]		/* ip = 9876 */
	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
	mov	r3, r3, lsr #24		/* r3 = ...5 */
	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
	mov	ip, ip, lsr #24		/* ip = ...9 */
	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
	mov	r1, r1, lsr #8		/* r1 = ...B */
	str	r2, [r0, #0x01]
	str	r3, [r0, #0x05]
	strh	ip, [r0, #0x09]
	strb	r1, [r0, #0x0b]
	RET
	LMEMCPY_C_PAD

/*
 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldr	ip, [r1, #0x05]
	strb	r2, [r0]
	ldrh	r2, [r1, #0x09]
	ldrb	r1, [r1, #0x0b]
	str	r3, [r0, #0x01]
	str	ip, [r0, #0x05]
	strh	r2, [r0, #0x09]
	strb	r1, [r0, #0x0b]
	RET
2067END(memcpy)
2068