xref: /freebsd/sys/arm/arm/support.S (revision 9768746b)
1/*-
2 * Copyright (c) 2004 Olivier Houchard
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26/*
27 * Copyright 2003 Wasabi Systems, Inc.
28 * All rights reserved.
29 *
30 * Written by Steve C. Woodford for Wasabi Systems, Inc.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 *    notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 *    notice, this list of conditions and the following disclaimer in the
39 *    documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 *    must display the following acknowledgement:
42 *      This product includes software developed for the NetBSD Project by
43 *      Wasabi Systems, Inc.
44 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
45 *    or promote products derived from this software without specific prior
46 *    written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
50 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
51 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
52 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
53 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
54 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
55 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
56 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
57 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
58 * POSSIBILITY OF SUCH DAMAGE.
59 */
60/*
61 * Copyright (c) 1997 The NetBSD Foundation, Inc.
62 * All rights reserved.
63 *
64 * This code is derived from software contributed to The NetBSD Foundation
65 * by Neil A. Carson and Mark Brinicombe
66 *
67 * Redistribution and use in source and binary forms, with or without
68 * modification, are permitted provided that the following conditions
69 * are met:
70 * 1. Redistributions of source code must retain the above copyright
71 *    notice, this list of conditions and the following disclaimer.
72 * 2. Redistributions in binary form must reproduce the above copyright
73 *    notice, this list of conditions and the following disclaimer in the
74 *    documentation and/or other materials provided with the distribution.
75 *
76 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
77 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
78 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
79 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
80 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
81 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
82 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
83 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
84 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
85 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
86 * POSSIBILITY OF SUCH DAMAGE.
87 */
88
89#include <machine/asm.h>
90__FBSDID("$FreeBSD$");
91
92#include "assym.inc"
93
94	.syntax	unified
95
96/*
97 * memset: Sets a block of memory to the specified value
98 *
99 * On entry:
100 *   r0 - dest address
101 *   r1 - byte to write
102 *   r2 - number of bytes to write
103 *
104 * On exit:
105 *   r0 - dest address
106 */
107/* LINTSTUB: Func: void *memset(void *, int, size_t) */
ENTRY(memset)
	/*
	 * Register roles for the whole routine:
	 *   r0 - original dest pointer (preserved; it is the return value)
	 *   r1 - bytes remaining (count moved out of r2 below)
	 *   r2 - scratch (alignment bits, then high word of the strd pair)
	 *   r3 - fill byte, replicated to 16 then 32 bits as needed
	 *   ip - running dest pointer (so r0 stays untouched)
	 */
	and	r3, r1, #0xff		/* We deal with bytes */
	mov	r1, r2
do_memset:				/* secondary entry — presumably used by a
					 * zero-fill wrapper; confirm against callers */
	cmp	r1, #0x04		/* Do we have less than 4 bytes */
	mov	ip, r0
	blt	.Lmemset_lessthanfour

	/* Ok first we will word align the address */
	ands	r2, ip, #0x03		/* Get the bottom two bits */
	bne	.Lmemset_wordunaligned	/* The address is not word aligned */

	/* We are now word aligned */
.Lmemset_wordaligned:
	orr	r3, r3, r3, lsl #8	/* Extend value to 16-bits */
	tst	ip, #0x04		/* Quad-align for armv5e */
	orr	r3, r3, r3, lsl #16	/* Extend value to 32-bits */
	subne	r1, r1, #0x04		/* Quad-align if necessary */
	strne	r3, [ip], #0x04
	cmp	r1, #0x10
	blt	.Lmemset_loop4		/* If less than 16 then use words */
	mov	r2, r3			/* Duplicate data: r2/r3 form the
					 * 64-bit pair stored by strd below */
	cmp	r1, #0x80		/* If < 128 then skip the big loop */
	blt	.Lmemset_loop32

	/* Do 128 bytes at a time (16 x 8-byte strd, all predicated on
	 * the subs result so a final short pass falls through cleanly) */
.Lmemset_loop128:
	subs	r1, r1, #0x80
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	bgt	.Lmemset_loop128
	RETeq			/* Zero length so just exit */

	add	r1, r1, #0x80		/* Adjust for extra sub */

	/* Do 32 bytes at a time */
.Lmemset_loop32:
	subs	r1, r1, #0x20
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	bgt	.Lmemset_loop32
	RETeq			/* Zero length so just exit */

	adds	r1, r1, #0x10		/* Partially adjust for extra sub */

	/* Deal with 16 bytes or more */
	strdge	r2, [ip], #0x08
	strdge	r2, [ip], #0x08
	RETeq			/* Zero length so just exit */

	addlt	r1, r1, #0x10		/* Possibly adjust for extra sub */

	/* We have at least 4 bytes so copy as words */
.Lmemset_loop4:
	subs	r1, r1, #0x04
	strge	r3, [ip], #0x04
	bgt	.Lmemset_loop4
	RETeq			/* Zero length so just exit */

	/* Compensate for 64-bit alignment check; 1..3 tail bytes remain */
	adds	r1, r1, #0x04
	RETeq
	cmp	r1, #2

	strb	r3, [ip], #0x01		/* Set 1 byte */
	strbge	r3, [ip], #0x01		/* Set another byte */
	strbgt	r3, [ip]		/* and a third */
	RET			/* Exit */

.Lmemset_wordunaligned:
	rsb	r2, r2, #0x004		/* r2 = bytes needed to reach a word
					 * boundary (1..3) */
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r2, #0x02
	strbge	r3, [ip], #0x01		/* Set another byte */
	sub	r1, r1, r2
	strbgt	r3, [ip], #0x01		/* and a third */
	cmp	r1, #0x04		/* More than 4 bytes left? */
	bge	.Lmemset_wordaligned	/* Yup */

.Lmemset_lessthanfour:
	cmp	r1, #0x00
	RETeq			/* Zero length so exit */
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r1, #0x02
	strbge	r3, [ip], #0x01		/* Set another byte */
	strbgt	r3, [ip]		/* and a third */
	RET			/* Exit */
END(memset)
212
/*
 * int memcmp(const void *b1, const void *b2, size_t len)
 *
 * On entry:
 *   r0 - first buffer (moved to ip so r0 can hold the result)
 *   r1 - second buffer
 *   r2 - byte count
 *
 * On exit:
 *   r0 - 0 if equal, else (first differing byte of b1) - (byte of b2)
 */
ENTRY(memcmp)
	mov	ip, r0
	cmp	r2, #0x06
	beq	.Lmemcmp_6bytes		/* dedicated fast path, see below */
	mov	r0, #0x00

	/* Are both addresses aligned the same way? */
	cmp	r2, #0x00
	eorsne	r3, ip, r1		/* r3 = address bits that differ */
	RETeq			/* len == 0, or same addresses! */
	tst	r3, #0x03
	subne	r2, r2, #0x01		/* pre-bias count for bytewise2 loop */
	bne	.Lmemcmp_bytewise2	/* Badly aligned. Do it the slow way */

	/*
	 * Word-align the addresses, if necessary.
	 * (4 - (r1 & 3)) & 3 selects how many of the three compare
	 * stanzas below to run; each stanza is 6 instructions = 24
	 * bytes, hence the r3*3 then << 3 computed branch.  Do not
	 * reorder or resize the stanzas.
	 */
	sub	r3, r1, #0x05
	ands	r3, r3, #0x03
	add	r3, r3, r3, lsl #1	/* r3 *= 3 */
	addne	pc, pc, r3, lsl #3	/* jump into stanza table (ARM mode) */
	nop

	/* Compare up to 3 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare up to 2 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 1 byte */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 4 bytes at a time, if possible */
	subs	r2, r2, #0x04
	bcc	.Lmemcmp_bytewise
.Lmemcmp_word_aligned:
	ldr	r0, [ip], #0x04
	ldr	r3, [r1], #0x04
	subs	r2, r2, #0x04
	cmpcs	r0, r3			/* only compare while count remains */
	beq	.Lmemcmp_word_aligned
	sub	r0, r0, r3

	/* Correct for extra subtraction, and check if done */
	adds	r2, r2, #0x04
	cmpeq	r0, #0x00		/* If done, did all bytes match? */
	RETeq			/* Yup. Just return */

	/* Re-do the final word byte-wise (word diff alone cannot give
	 * the byte-ordered result memcmp must return) */
	sub	ip, ip, #0x04
	sub	r1, r1, #0x04

.Lmemcmp_bytewise:
	add	r2, r2, #0x03
.Lmemcmp_bytewise2:
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r2, r2, #0x01
	cmpcs	r0, r3
	beq	.Lmemcmp_bytewise2
	sub	r0, r0, r3
	RET

	/*
	 * 6 byte compares are very common, thanks to the network stack.
	 * This code is hand-scheduled to reduce the number of stalls for
	 * load results. Everything else being equal, this will be ~32%
	 * faster than a byte-wise memcmp.
	 */
	.align	5
.Lmemcmp_6bytes:
	ldrb	r3, [r1, #0x00]		/* r3 = b2#0 */
	ldrb	r0, [ip, #0x00]		/* r0 = b1#0 */
	ldrb	r2, [r1, #0x01]		/* r2 = b2#1 */
	subs	r0, r0, r3		/* r0 = b1#0 - b2#0 */
	ldrbeq	r3, [ip, #0x01]		/* r3 = b1#1 */
	RETne			/* Return if mismatch on #0 */
	subs	r0, r3, r2		/* r0 = b1#1 - b2#1 */
	ldrbeq	r3, [r1, #0x02]		/* r3 = b2#2 */
	ldrbeq	r0, [ip, #0x02]		/* r0 = b1#2 */
	RETne			/* Return if mismatch on #1 */
	ldrb	r2, [r1, #0x03]		/* r2 = b2#3 */
	subs	r0, r0, r3		/* r0 = b1#2 - b2#2 */
	ldrbeq	r3, [ip, #0x03]		/* r3 = b1#3 */
	RETne			/* Return if mismatch on #2 */
	subs	r0, r3, r2		/* r0 = b1#3 - b2#3 */
	ldrbeq	r3, [r1, #0x04]		/* r3 = b2#4 */
	ldrbeq	r0, [ip, #0x04]		/* r0 = b1#4 */
	RETne			/* Return if mismatch on #3 */
	ldrb	r2, [r1, #0x05]		/* r2 = b2#5 */
	subs	r0, r0, r3		/* r0 = b1#4 - b2#4 */
	ldrbeq	r3, [ip, #0x05]		/* r3 = b1#5 */
	RETne			/* Return if mismatch on #4 */
	sub	r0, r3, r2		/* r0 = b1#5 - b2#5 */
	RET
END(memcmp)
322
/*
 * void *memmove(void *dst, const void *src, size_t len)
 *
 * Overlap-safe copy.  Non-overlapping buffers are delegated to memcpy.
 * Otherwise copies forwards when dst < src and backwards when dst > src,
 * with separate paths for aligned, dest-unaligned and src-unaligned
 * buffers.  The unaligned-source paths read whole words from the
 * word-aligned-down source and merge byte lanes with lsr/lsl pairs.
 * NOTE(review): the lane-merge shift directions assume little-endian
 * byte order — confirm before reusing on a big-endian configuration.
 *
 * On entry:  r0 = dst, r1 = src, r2 = len
 * On exit:   r0 = dst
 */
ENTRY(memmove)
	/* Do the buffers overlap? */
	cmp	r0, r1
	RETeq		/* Bail now if src/dst are the same */
	subcc	r3, r0, r1	/* if (dst > src) r3 = dst - src */
	subcs	r3, r1, r0	/* if (src > dst) r3 = src - dst */
	cmp	r3, r2		/* if (r3 < len) we have an overlap */
	bcc	PIC_SYM(_C_LABEL(memcpy), PLT)

	/* Determine copy direction */
	cmp	r1, r0
	bcc	.Lmemmove_backwards

	moveq	r0, #0			/* Quick abort for len=0 */
	RETeq

	stmdb	sp!, {r0, lr}		/* memmove() returns dest addr */
	subs	r2, r2, #4
	blt	.Lmemmove_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_fsrcul		/* oh unaligned source addr */

.Lmemmove_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemmove_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_floop32

	cmn	r2, #0x10		/* at least 16 bytes still to do? */
	ldmiage	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmiage	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemmove_fl32:
	adds	r2, r2, #0x14		/* undo the 0x14 bias from above */

	/* blat 12 bytes at a time */
.Lmemmove_floop12:
	ldmiage	r1!, {r3, r12, lr}
	stmiage	r0!, {r3, r12, lr}
	subsge	r2, r2, #0x0c
	bge	.Lmemmove_floop12

.Lmemmove_fl12:
	adds	r2, r2, #8
	blt	.Lmemmove_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4		/* 4..7 bytes left: move one word */
	strlt	r3, [r0], #4
	ldmiage	r1!, {r3, r12}		/* 8..11 bytes left: move two words */
	stmiage	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	ldmiaeq	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrbge	r3, [r1], #1
	strbge	r3, [r0], #1
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemmove_fdestul:
	rsb	r12, r12, #4		/* r12 = bytes to reach alignment (1..3) */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrbge	r3, [r1], #1
	strbge	r3, [r0], #1
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemmove_fl4		/* less the 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemmove_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemmove_fsrcul:
	bic	r1, r1, #3		/* round src down to a word boundary */
	ldr	lr, [r1], #4		/* prime lr with the first partial word */
	cmp	r12, #2			/* dispatch on src offset 1/2/3 */
	bgt	.Lmemmove_fsrcul3
	beq	.Lmemmove_fsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* src offset 1: shift each word right 8, fill from the next word */
.Lmemmove_fsrcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul1l4

.Lmemmove_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul1loop4

.Lmemmove_fsrcul1l4:
	sub	r1, r1, #3		/* restore true (unaligned) src pointer */
	b	.Lmemmove_fl4

	/* src offset 2: same scheme with 16-bit shifts */
.Lmemmove_fsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul2l4

.Lmemmove_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul2loop4

.Lmemmove_fsrcul2l4:
	sub	r1, r1, #2		/* restore true (unaligned) src pointer */
	b	.Lmemmove_fl4

	/* src offset 3: same scheme with 24/8-bit shifts */
.Lmemmove_fsrcul3:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul3l4

.Lmemmove_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul3loop4

.Lmemmove_fsrcul3l4:
	sub	r1, r1, #1		/* restore true (unaligned) src pointer */
	b	.Lmemmove_fl4

	/* Backwards copy: start from the end of both buffers so an
	 * overlapping dst > src copy never reads clobbered data */
.Lmemmove_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	.Lmemmove_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_bdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_bsrcul		/* oh unaligned source addr */

.Lmemmove_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	.Lmemmove_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_bloop32

.Lmemmove_bl32:
	cmn	r2, #0x10		/* at least 16 bytes still to do? */
	ldmdbge	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmdbge	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmdbge	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmdbge	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, lr}

.Lmemmove_bl12:
	adds	r2, r2, #8
	blt	.Lmemmove_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!		/* 4..7 bytes left: move one word */
	strlt	r3, [r0, #-4]!
	ldmdbge	r1!, {r3, r12}		/* 8..11 bytes left: move two words */
	stmdbge	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	RETeq			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrbge	r3, [r1, #-1]!
	strbge	r3, [r0, #-1]!
	ldrbgt	r3, [r1, #-1]!
	strbgt	r3, [r0, #-1]!
	RET

	/* erg - unaligned destination */
.Lmemmove_bdestul:
	cmp	r12, #2			/* r12 = dst & 3 = bytes to align (1..3) */

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrbge	r3, [r1, #-1]!
	strbge	r3, [r0, #-1]!
	ldrbgt	r3, [r1, #-1]!
	strbgt	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	.Lmemmove_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	.Lmemmove_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemmove_bsrcul:
	bic	r1, r1, #3		/* round src down to a word boundary */
	ldr	r3, [r1, #0]		/* prime r3 with the first partial word */
	cmp	r12, #2			/* dispatch on src offset 1/2/3 */
	blt	.Lmemmove_bsrcul1
	beq	.Lmemmove_bsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

	/* src offset 3: mirror of fsrcul3 with shift directions swapped */
.Lmemmove_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul3l4

.Lmemmove_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul3loop4

.Lmemmove_bsrcul3l4:
	add	r1, r1, #3		/* restore true (unaligned) src pointer */
	b	.Lmemmove_bl4

	/* src offset 2: 16-bit lane shifts */
.Lmemmove_bsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul2l4

.Lmemmove_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul2loop4

.Lmemmove_bsrcul2l4:
	add	r1, r1, #2		/* restore true (unaligned) src pointer */
	b	.Lmemmove_bl4

	/* src offset 1: 24/8-bit lane shifts */
.Lmemmove_bsrcul1:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul1loop32:
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul1l4

.Lmemmove_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul1loop4

.Lmemmove_bsrcul1l4:
	add	r1, r1, #1		/* restore true (unaligned) src pointer */
	b	.Lmemmove_bl4
END(memmove)
729
730/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
731ENTRY(memcpy)
732	pld	[r1]
733	cmp	r2, #0x0c
734	ble	.Lmemcpy_short		/* <= 12 bytes */
735#ifdef FLASHADDR
736#if FLASHADDR > PHYSADDR
737	ldr	r3, =FLASHADDR
738	cmp	r3, pc
739	bls	.Lnormal
740#else
741	ldr	r3, =FLASHADDR
742	cmp	r3, pc
743	bhi	.Lnormal
744#endif
745#endif
746	mov	r3, r0			/* We must not clobber r0 */
747
748	/* Word-align the destination buffer */
749	ands	ip, r3, #0x03		/* Already word aligned? */
750	beq	.Lmemcpy_wordaligned	/* Yup */
751	cmp	ip, #0x02
752	ldrb	ip, [r1], #0x01
753	sub	r2, r2, #0x01
754	strb	ip, [r3], #0x01
755	ldrble	ip, [r1], #0x01
756	suble	r2, r2, #0x01
757	strble	ip, [r3], #0x01
758	ldrblt	ip, [r1], #0x01
759	sublt	r2, r2, #0x01
760	strblt	ip, [r3], #0x01
761
762	/* Destination buffer is now word aligned */
763.Lmemcpy_wordaligned:
764	ands	ip, r1, #0x03		/* Is src also word-aligned? */
765	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */
766
767	/* Quad-align the destination buffer */
768	tst	r3, #0x07		/* Already quad aligned? */
769	ldrne	ip, [r1], #0x04
770	stmfd	sp!, {r4-r9}		/* Free up some registers */
771	subne	r2, r2, #0x04
772	strne	ip, [r3], #0x04
773
774	/* Destination buffer quad aligned, source is at least word aligned */
775	subs	r2, r2, #0x80
776	blt	.Lmemcpy_w_lessthan128
777
778	/* Copy 128 bytes at a time */
779.Lmemcpy_w_loop128:
780	ldr	r4, [r1], #0x04		/* LD:00-03 */
781	ldr	r5, [r1], #0x04		/* LD:04-07 */
782	pld	[r1, #0x18]		/* Prefetch 0x20 */
783	ldr	r6, [r1], #0x04		/* LD:08-0b */
784	ldr	r7, [r1], #0x04		/* LD:0c-0f */
785	ldr	r8, [r1], #0x04		/* LD:10-13 */
786	ldr	r9, [r1], #0x04		/* LD:14-17 */
787	strd	r4, [r3], #0x08		/* ST:00-07 */
788	ldr	r4, [r1], #0x04		/* LD:18-1b */
789	ldr	r5, [r1], #0x04		/* LD:1c-1f */
790	strd	r6, [r3], #0x08		/* ST:08-0f */
791	ldr	r6, [r1], #0x04		/* LD:20-23 */
792	ldr	r7, [r1], #0x04		/* LD:24-27 */
793	pld	[r1, #0x18]		/* Prefetch 0x40 */
794	strd	r8, [r3], #0x08		/* ST:10-17 */
795	ldr	r8, [r1], #0x04		/* LD:28-2b */
796	ldr	r9, [r1], #0x04		/* LD:2c-2f */
797	strd	r4, [r3], #0x08		/* ST:18-1f */
798	ldr	r4, [r1], #0x04		/* LD:30-33 */
799	ldr	r5, [r1], #0x04		/* LD:34-37 */
800	strd	r6, [r3], #0x08		/* ST:20-27 */
801	ldr	r6, [r1], #0x04		/* LD:38-3b */
802	ldr	r7, [r1], #0x04		/* LD:3c-3f */
803	strd	r8, [r3], #0x08		/* ST:28-2f */
804	ldr	r8, [r1], #0x04		/* LD:40-43 */
805	ldr	r9, [r1], #0x04		/* LD:44-47 */
806	pld	[r1, #0x18]		/* Prefetch 0x60 */
807	strd	r4, [r3], #0x08		/* ST:30-37 */
808	ldr	r4, [r1], #0x04		/* LD:48-4b */
809	ldr	r5, [r1], #0x04		/* LD:4c-4f */
810	strd	r6, [r3], #0x08		/* ST:38-3f */
811	ldr	r6, [r1], #0x04		/* LD:50-53 */
812	ldr	r7, [r1], #0x04		/* LD:54-57 */
813	strd	r8, [r3], #0x08		/* ST:40-47 */
814	ldr	r8, [r1], #0x04		/* LD:58-5b */
815	ldr	r9, [r1], #0x04		/* LD:5c-5f */
816	strd	r4, [r3], #0x08		/* ST:48-4f */
817	ldr	r4, [r1], #0x04		/* LD:60-63 */
818	ldr	r5, [r1], #0x04		/* LD:64-67 */
819	pld	[r1, #0x18]		/* Prefetch 0x80 */
820	strd	r6, [r3], #0x08		/* ST:50-57 */
821	ldr	r6, [r1], #0x04		/* LD:68-6b */
822	ldr	r7, [r1], #0x04		/* LD:6c-6f */
823	strd	r8, [r3], #0x08		/* ST:58-5f */
824	ldr	r8, [r1], #0x04		/* LD:70-73 */
825	ldr	r9, [r1], #0x04		/* LD:74-77 */
826	strd	r4, [r3], #0x08		/* ST:60-67 */
827	ldr	r4, [r1], #0x04		/* LD:78-7b */
828	ldr	r5, [r1], #0x04		/* LD:7c-7f */
829	strd	r6, [r3], #0x08		/* ST:68-6f */
830	strd	r8, [r3], #0x08		/* ST:70-77 */
831	subs	r2, r2, #0x80
832	strd	r4, [r3], #0x08		/* ST:78-7f */
833	bge	.Lmemcpy_w_loop128
834
835.Lmemcpy_w_lessthan128:
836	adds	r2, r2, #0x80		/* Adjust for extra sub */
837	ldmfdeq	sp!, {r4-r9}
838	RETeq			/* Return now if done */
839	subs	r2, r2, #0x20
840	blt	.Lmemcpy_w_lessthan32
841
842	/* Copy 32 bytes at a time */
843.Lmemcpy_w_loop32:
844	ldr	r4, [r1], #0x04
845	ldr	r5, [r1], #0x04
846	pld	[r1, #0x18]
847	ldr	r6, [r1], #0x04
848	ldr	r7, [r1], #0x04
849	ldr	r8, [r1], #0x04
850	ldr	r9, [r1], #0x04
851	strd	r4, [r3], #0x08
852	ldr	r4, [r1], #0x04
853	ldr	r5, [r1], #0x04
854	strd	r6, [r3], #0x08
855	strd	r8, [r3], #0x08
856	subs	r2, r2, #0x20
857	strd	r4, [r3], #0x08
858	bge	.Lmemcpy_w_loop32
859
860.Lmemcpy_w_lessthan32:
861	adds	r2, r2, #0x20		/* Adjust for extra sub */
862	ldmfdeq	sp!, {r4-r9}
863	RETeq			/* Return now if done */
864
865	and	r4, r2, #0x18
866	rsbs	r4, r4, #0x18
867	addne	pc, pc, r4, lsl #1
868	nop
869
870	/* At least 24 bytes remaining */
871	ldr	r4, [r1], #0x04
872	ldr	r5, [r1], #0x04
873	sub	r2, r2, #0x08
874	strd	r4, [r3], #0x08
875
876	/* At least 16 bytes remaining */
877	ldr	r4, [r1], #0x04
878	ldr	r5, [r1], #0x04
879	sub	r2, r2, #0x08
880	strd	r4, [r3], #0x08
881
882	/* At least 8 bytes remaining */
883	ldr	r4, [r1], #0x04
884	ldr	r5, [r1], #0x04
885	subs	r2, r2, #0x08
886	strd	r4, [r3], #0x08
887
888	/* Less than 8 bytes remaining */
889	ldmfd	sp!, {r4-r9}
890	RETeq			/* Return now if done */
891	subs	r2, r2, #0x04
892	ldrge	ip, [r1], #0x04
893	strge	ip, [r3], #0x04
894	RETeq			/* Return now if done */
895	addlt	r2, r2, #0x04
896	ldrb	ip, [r1], #0x01
897	cmp	r2, #0x02
898	ldrbge	r2, [r1], #0x01
899	strb	ip, [r3], #0x01
900	ldrbgt	ip, [r1]
901	strbge	r2, [r3], #0x01
902	strbgt	ip, [r3]
903	RET
904/* Place a literal pool here for the above ldr instructions to use */
905.ltorg
906
907
908/*
909 * At this point, it has not been possible to word align both buffers.
910 * The destination buffer is word aligned, but the source buffer is not.
911 */
912.Lmemcpy_bad_align:
913	stmfd	sp!, {r4-r7}
914	bic	r1, r1, #0x03
915	cmp	ip, #2
916	ldr	ip, [r1], #0x04
917	bgt	.Lmemcpy_bad3
918	beq	.Lmemcpy_bad2
919	b	.Lmemcpy_bad1
920
921.Lmemcpy_bad1_loop16:
922	mov	r4, ip, lsr #8
923	ldr	r5, [r1], #0x04
924	pld	[r1, #0x018]
925	ldr	r6, [r1], #0x04
926	ldr	r7, [r1], #0x04
927	ldr	ip, [r1], #0x04
928	orr	r4, r4, r5, lsl #24
929	mov	r5, r5, lsr #8
930	orr	r5, r5, r6, lsl #24
931	mov	r6, r6, lsr #8
932	orr	r6, r6, r7, lsl #24
933	mov	r7, r7, lsr #8
934	orr	r7, r7, ip, lsl #24
935	str	r4, [r3], #0x04
936	str	r5, [r3], #0x04
937	str	r6, [r3], #0x04
938	str	r7, [r3], #0x04
939.Lmemcpy_bad1:
940	subs	r2, r2, #0x10
941	bge	.Lmemcpy_bad1_loop16
942
943	adds	r2, r2, #0x10
944	ldmfdeq	sp!, {r4-r7}
945	RETeq			/* Return now if done */
946	subs	r2, r2, #0x04
947	sublt	r1, r1, #0x03
948	blt	.Lmemcpy_bad_done
949
950.Lmemcpy_bad1_loop4:
951	mov	r4, ip, lsr #8
952	ldr	ip, [r1], #0x04
953	subs	r2, r2, #0x04
954	orr	r4, r4, ip, lsl #24
955	str	r4, [r3], #0x04
956	bge	.Lmemcpy_bad1_loop4
957	sub	r1, r1, #0x03
958	b	.Lmemcpy_bad_done
959
960.Lmemcpy_bad2_loop16:
961	mov	r4, ip, lsr #16
962	ldr	r5, [r1], #0x04
963	pld	[r1, #0x018]
964	ldr	r6, [r1], #0x04
965	ldr	r7, [r1], #0x04
966	ldr	ip, [r1], #0x04
967	orr	r4, r4, r5, lsl #16
968	mov	r5, r5, lsr #16
969	orr	r5, r5, r6, lsl #16
970	mov	r6, r6, lsr #16
971	orr	r6, r6, r7, lsl #16
972	mov	r7, r7, lsr #16
973	orr	r7, r7, ip, lsl #16
974	str	r4, [r3], #0x04
975	str	r5, [r3], #0x04
976	str	r6, [r3], #0x04
977	str	r7, [r3], #0x04
978.Lmemcpy_bad2:
979	subs	r2, r2, #0x10
980	bge	.Lmemcpy_bad2_loop16
981
982	adds	r2, r2, #0x10
983	ldmfdeq	sp!, {r4-r7}
984	RETeq			/* Return now if done */
985	subs	r2, r2, #0x04
986	sublt	r1, r1, #0x02
987	blt	.Lmemcpy_bad_done
988
989.Lmemcpy_bad2_loop4:
990	mov	r4, ip, lsr #16
991	ldr	ip, [r1], #0x04
992	subs	r2, r2, #0x04
993	orr	r4, r4, ip, lsl #16
994	str	r4, [r3], #0x04
995	bge	.Lmemcpy_bad2_loop4
996	sub	r1, r1, #0x02
997	b	.Lmemcpy_bad_done
998
999.Lmemcpy_bad3_loop16:
1000	mov	r4, ip, lsr #24
1001	ldr	r5, [r1], #0x04
1002	pld	[r1, #0x018]
1003	ldr	r6, [r1], #0x04
1004	ldr	r7, [r1], #0x04
1005	ldr	ip, [r1], #0x04
1006	orr	r4, r4, r5, lsl #8
1007	mov	r5, r5, lsr #24
1008	orr	r5, r5, r6, lsl #8
1009	mov	r6, r6, lsr #24
1010	orr	r6, r6, r7, lsl #8
1011	mov	r7, r7, lsr #24
1012	orr	r7, r7, ip, lsl #8
1013	str	r4, [r3], #0x04
1014	str	r5, [r3], #0x04
1015	str	r6, [r3], #0x04
1016	str	r7, [r3], #0x04
1017.Lmemcpy_bad3:
1018	subs	r2, r2, #0x10
1019	bge	.Lmemcpy_bad3_loop16
1020
1021	adds	r2, r2, #0x10
1022	ldmfdeq	sp!, {r4-r7}
1023	RETeq			/* Return now if done */
1024	subs	r2, r2, #0x04
1025	sublt	r1, r1, #0x01
1026	blt	.Lmemcpy_bad_done
1027
1028.Lmemcpy_bad3_loop4:
1029	mov	r4, ip, lsr #24
1030	ldr	ip, [r1], #0x04
1031	subs	r2, r2, #0x04
1032	orr	r4, r4, ip, lsl #8
1033	str	r4, [r3], #0x04
1034	bge	.Lmemcpy_bad3_loop4
1035	sub	r1, r1, #0x01
1036
1037.Lmemcpy_bad_done:
1038	ldmfd	sp!, {r4-r7}
1039	adds	r2, r2, #0x04
1040	RETeq
1041	ldrb	ip, [r1], #0x01
1042	cmp	r2, #0x02
1043	ldrbge	r2, [r1], #0x01
1044	strb	ip, [r3], #0x01
1045	ldrbgt	ip, [r1]
1046	strbge	r2, [r3], #0x01
1047	strbgt	ip, [r3]
1048	RET
1049
1050
1051/*
1052 * Handle short copies (less than 16 bytes), possibly misaligned.
1053 * Some of these are *very* common, thanks to the network stack,
1054 * and so are handled specially.
1055 */
1056.Lmemcpy_short:
1057	add	pc, pc, r2, lsl #2
1058	nop
1059	RET			/* 0x00 */
1060	b	.Lmemcpy_bytewise	/* 0x01 */
1061	b	.Lmemcpy_bytewise	/* 0x02 */
1062	b	.Lmemcpy_bytewise	/* 0x03 */
1063	b	.Lmemcpy_4		/* 0x04 */
1064	b	.Lmemcpy_bytewise	/* 0x05 */
1065	b	.Lmemcpy_6		/* 0x06 */
1066	b	.Lmemcpy_bytewise	/* 0x07 */
1067	b	.Lmemcpy_8		/* 0x08 */
1068	b	.Lmemcpy_bytewise	/* 0x09 */
1069	b	.Lmemcpy_bytewise	/* 0x0a */
1070	b	.Lmemcpy_bytewise	/* 0x0b */
1071	b	.Lmemcpy_c		/* 0x0c */
1072.Lmemcpy_bytewise:
1073	mov	r3, r0			/* We must not clobber r0 */
1074	ldrb	ip, [r1], #0x01
10751:	subs	r2, r2, #0x01
1076	strb	ip, [r3], #0x01
1077	ldrbne	ip, [r1], #0x01
1078	bne	1b
1079	RET
1080
1081/******************************************************************************
1082 * Special case for 4 byte copies
1083 */
1084#define	LMEMCPY_4_LOG2	6	/* 64 bytes */
1085#define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
1086	LMEMCPY_4_PAD
1087.Lmemcpy_4:
/*
 * Dispatch on the low two bits of src (r1) and dst (r0):
 * r2 = (dst & 3) << 2 | (src & 3).  Each of the sixteen handlers below
 * occupies one 64-byte (1 << LMEMCPY_4_LOG2) slot, padded with
 * LMEMCPY_4_PAD.  PC reads as '.' + 8 in ARM state, so the 'sub'
 * leaves r3 pointing at .Lmemcpy_4 itself (the aligned table base);
 * case 0000 falls through and every other case jumps to its slot.
 */
1088	and	r2, r1, #0x03		/* r2 = src & 3 */
1089	orr	r2, r2, r0, lsl #2	/* r2 |= (dst & 3) << 2 (plus high dst bits) */
1090	ands	r2, r2, #0x0f		/* keep the 4 alignment bits; Z set => case 0000 */
1091	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_4 */
1092	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2	/* jump to 64-byte slot r2 */
1093
1094/*
1095 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1096 */
1097	ldr	r2, [r1]
1098	str	r2, [r0]
1099	RET
1100	LMEMCPY_4_PAD
1101
1102/*
1103 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1104 */
1105	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
1106	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
1107	mov	r3, r3, lsr #8		/* r3 = .210 */
1108	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
1109	str	r3, [r0]
1110	RET
1111	LMEMCPY_4_PAD
1112
1113/*
1114 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1115 */
1116	ldrh	r3, [r1, #0x02]
1117	ldrh	r2, [r1]
1118	orr	r3, r2, r3, lsl #16
1119	str	r3, [r0]
1120	RET
1121	LMEMCPY_4_PAD
1122
1123/*
1124 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1125 */
1126	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
1127	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
1128	mov	r3, r3, lsr #24		/* r3 = ...0 */
1129	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
1130	str	r3, [r0]
1131	RET
1132	LMEMCPY_4_PAD
1133
1134/*
1135 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1136 */
1137	ldr	r2, [r1]
1138	strb	r2, [r0]
1139	mov	r3, r2, lsr #8
1140	mov	r1, r2, lsr #24
1141	strb	r1, [r0, #0x03]
1142	strh	r3, [r0, #0x01]
1143	RET
1144	LMEMCPY_4_PAD
1145
1146/*
1147 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1148 */
1149	ldrb	r2, [r1]
1150	ldrh	r3, [r1, #0x01]
1151	ldrb	r1, [r1, #0x03]
1152	strb	r2, [r0]
1153	strh	r3, [r0, #0x01]
1154	strb	r1, [r0, #0x03]
1155	RET
1156	LMEMCPY_4_PAD
1157
1158/*
1159 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1160 */
1161	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1162	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
1163	strb	r2, [r0]
1164	mov	r2, r2, lsr #8		/* r2 = ...1 */
1165	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
1166	mov	r3, r3, lsr #8		/* r3 = ...3 */
1167	strh	r2, [r0, #0x01]
1168	strb	r3, [r0, #0x03]
1169	RET
1170	LMEMCPY_4_PAD
1171
1172/*
1173 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1174 */
1175	ldrb	r2, [r1]
1176	ldrh	r3, [r1, #0x01]
1177	ldrb	r1, [r1, #0x03]
1178	strb	r2, [r0]
1179	strh	r3, [r0, #0x01]
1180	strb	r1, [r0, #0x03]
1181	RET
1182	LMEMCPY_4_PAD
1183
1184/*
1185 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1186 */
1187	ldr	r2, [r1]
1188	strh	r2, [r0]
1189	mov	r3, r2, lsr #16
1190	strh	r3, [r0, #0x02]
1191	RET
1192	LMEMCPY_4_PAD
1193
1194/*
1195 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1196 */
1197	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
1198	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
1199	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
1200	strh	r1, [r0]
1201	mov	r2, r2, lsr #24		/* r2 = ...2 */
1202	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
1203	strh	r2, [r0, #0x02]
1204	RET
1205	LMEMCPY_4_PAD
1206
1207/*
1208 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1209 */
1210	ldrh	r2, [r1]
1211	ldrh	r3, [r1, #0x02]
1212	strh	r2, [r0]
1213	strh	r3, [r0, #0x02]
1214	RET
1215	LMEMCPY_4_PAD
1216
1217/*
1218 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1219 */
1220	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
1221	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
1222	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
1223	strh	r1, [r0, #0x02]
1224	mov	r3, r3, lsl #8		/* r3 = 321. */
1225	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
1226	strh	r3, [r0]
1227	RET
1228	LMEMCPY_4_PAD
1229
1230/*
1231 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1232 */
1233	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1234	strb	r2, [r0]
1235	mov	r3, r2, lsr #8
1236	mov	r1, r2, lsr #24
1237	strh	r3, [r0, #0x01]
1238	strb	r1, [r0, #0x03]
1239	RET
1240	LMEMCPY_4_PAD
1241
1242/*
1243 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1244 */
1245	ldrb	r2, [r1]
1246	ldrh	r3, [r1, #0x01]
1247	ldrb	r1, [r1, #0x03]
1248	strb	r2, [r0]
1249	strh	r3, [r0, #0x01]
1250	strb	r1, [r0, #0x03]
1251	RET
1252	LMEMCPY_4_PAD
1253
1254/*
1255 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1256 */
1257	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1258	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
1259	strb	r2, [r0]
1260	mov	r2, r2, lsr #8		/* r2 = ...1 */
1261	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
1262	strh	r2, [r0, #0x01]
1263	mov	r3, r3, lsr #8		/* r3 = ...3 */
1264	strb	r3, [r0, #0x03]
1265	RET
1266	LMEMCPY_4_PAD
1267
1268/*
1269 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1270 */
1271	ldrb	r2, [r1]
1272	ldrh	r3, [r1, #0x01]
1273	ldrb	r1, [r1, #0x03]
1274	strb	r2, [r0]
1275	strh	r3, [r0, #0x01]
1276	strb	r1, [r0, #0x03]
1277	RET
1278	LMEMCPY_4_PAD
1279
1280
1281/******************************************************************************
1282 * Special case for 6 byte copies
1283 */
1284#define	LMEMCPY_6_LOG2	6	/* 64 bytes */
1285#define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
1286	LMEMCPY_6_PAD
1287.Lmemcpy_6:
/*
 * Same 16-way alignment dispatch as .Lmemcpy_4: r2 = (dst & 3) << 2 |
 * (src & 3); r3 = .Lmemcpy_6 (PC reads as '.' + 8); each handler fills
 * one 64-byte slot.  Case 0000 falls through.
 */
1288	and	r2, r1, #0x03		/* r2 = src & 3 */
1289	orr	r2, r2, r0, lsl #2	/* r2 |= (dst & 3) << 2 (plus high dst bits) */
1290	ands	r2, r2, #0x0f		/* keep the 4 alignment bits; Z set => case 0000 */
1291	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_6 */
1292	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2	/* jump to 64-byte slot r2 */
1293
1294/*
1295 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1296 */
1297	ldr	r2, [r1]
1298	ldrh	r3, [r1, #0x04]
1299	str	r2, [r0]
1300	strh	r3, [r0, #0x04]
1301	RET
1302	LMEMCPY_6_PAD
1303
1304/*
1305 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1306 */
1307	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
1308	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
1309	mov	r2, r2, lsr #8		/* r2 = .210 */
1310	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
1311	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
1312	str	r2, [r0]
1313	strh	r3, [r0, #0x04]
1314	RET
1315	LMEMCPY_6_PAD
1316
1317/*
1318 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1319 */
1320	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1321	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1322	mov	r1, r3, lsr #16		/* r1 = ..54 */
1323	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
1324	str	r2, [r0]
1325	strh	r1, [r0, #0x04]
1326	RET
1327	LMEMCPY_6_PAD
1328
1329/*
1330 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1331 */
1332	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
1333	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
1334	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
1335	mov	r2, r2, lsr #24		/* r2 = ...0 */
1336	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
1337	mov	r1, r1, lsl #8		/* r1 = xx5. */
1338	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
1339	str	r2, [r0]
1340	strh	r1, [r0, #0x04]
1341	RET
1342	LMEMCPY_6_PAD
1343
1344/*
1345 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1346 */
1347	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
1348	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
1349	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
1350	strh	r1, [r0, #0x01]
1351	strb	r3, [r0]
1352	mov	r3, r3, lsr #24		/* r3 = ...3 */
1353	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
1354	mov	r2, r2, lsr #8		/* r2 = ...5 */
1355	strh	r3, [r0, #0x03]
1356	strb	r2, [r0, #0x05]
1357	RET
1358	LMEMCPY_6_PAD
1359
1360/*
1361 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1362 */
1363	ldrb	r2, [r1]
1364	ldrh	r3, [r1, #0x01]
1365	ldrh	ip, [r1, #0x03]
1366	ldrb	r1, [r1, #0x05]
1367	strb	r2, [r0]
1368	strh	r3, [r0, #0x01]
1369	strh	ip, [r0, #0x03]
1370	strb	r1, [r0, #0x05]
1371	RET
1372	LMEMCPY_6_PAD
1373
1374/*
1375 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1376 */
1377	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1378	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
1379	strb	r2, [r0]
1380	mov	r3, r1, lsr #24
1381	strb	r3, [r0, #0x05]
1382	mov	r3, r1, lsr #8		/* r3 = .543 */
1383	strh	r3, [r0, #0x03]
1384	mov	r3, r2, lsr #8		/* r3 = ...1 */
1385	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
1386	strh	r3, [r0, #0x01]
1387	RET
1388	LMEMCPY_6_PAD
1389
1390/*
1391 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1392 */
1393	ldrb	r2, [r1]
1394	ldrh	r3, [r1, #0x01]
1395	ldrh	ip, [r1, #0x03]
1396	ldrb	r1, [r1, #0x05]
1397	strb	r2, [r0]
1398	strh	r3, [r0, #0x01]
1399	strh	ip, [r0, #0x03]
1400	strb	r1, [r0, #0x05]
1401	RET
1402	LMEMCPY_6_PAD
1403
1404/*
1405 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1406 */
1407	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
1408	ldr	r3, [r1]		/* r3 = 3210 */
1409	mov	r2, r2, lsl #16		/* r2 = 54.. */
1410	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
1411	strh	r3, [r0]
1412	str	r2, [r0, #0x02]
1413	RET
1414	LMEMCPY_6_PAD
1415
1416/*
1417 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1418 */
1419	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
1420	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
1421	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
1422	mov	r2, r2, lsl #8		/* r2 = 543. */
1423	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
1424	strh	r1, [r0]
1425	str	r2, [r0, #0x02]
1426	RET
1427	LMEMCPY_6_PAD
1428
1429/*
1430 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1431 */
1432	ldrh	r2, [r1]
1433	ldr	r3, [r1, #0x02]
1434	strh	r2, [r0]
1435	str	r3, [r0, #0x02]
1436	RET
1437	LMEMCPY_6_PAD
1438
1439/*
1440 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1441 */
1442	ldrb	r3, [r1]		/* r3 = ...0 */
1443	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
1444	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
1445	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
1446	mov	r1, r1, lsl #24		/* r1 = 5... */
1447	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
1448	strh	r3, [r0]
1449	str	r1, [r0, #0x02]
1450	RET
1451	LMEMCPY_6_PAD
1452
1453/*
1454 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1455 */
1456	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1457	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
1458	strb	r2, [r0]
1459	mov	r2, r2, lsr #8		/* r2 = .321 */
1460	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
1461	mov	r1, r1, lsr #8		/* r1 = ...5 */
1462	str	r2, [r0, #0x01]
1463	strb	r1, [r0, #0x05]
1464	RET
1465	LMEMCPY_6_PAD
1466
1467/*
1468 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1469 */
1470	ldrb	r2, [r1]
1471	ldrh	r3, [r1, #0x01]
1472	ldrh	ip, [r1, #0x03]
1473	ldrb	r1, [r1, #0x05]
1474	strb	r2, [r0]
1475	strh	r3, [r0, #0x01]
1476	strh	ip, [r0, #0x03]
1477	strb	r1, [r0, #0x05]
1478	RET
1479	LMEMCPY_6_PAD
1480
1481/*
1482 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1483 */
1484	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1485	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
1486	strb	r2, [r0]
1487	mov	r2, r2, lsr #8		/* r2 = ...1 */
1488	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
1489	mov	r1, r1, lsr #24		/* r1 = ...5 */
1490	str	r2, [r0, #0x01]
1491	strb	r1, [r0, #0x05]
1492	RET
1493	LMEMCPY_6_PAD
1494
1495/*
1496 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1497 * (src & 3) == 3, so the word at src + 1 is 32-bit aligned.
1498 */
1499	ldrb	r2, [r1]
1500	ldr	r3, [r1, #0x01]
1501	ldrb	r1, [r1, #0x05]
1502	strb	r2, [r0]
1503	str	r3, [r0, #0x01]
1504	strb	r1, [r0, #0x05]
1505	RET
1506	LMEMCPY_6_PAD
1506
1507
1508/******************************************************************************
1509 * Special case for 8 byte copies
1510 */
1511#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
1512#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
1513	LMEMCPY_8_PAD
1514.Lmemcpy_8:
/*
 * Same 16-way alignment dispatch as .Lmemcpy_4: r2 = (dst & 3) << 2 |
 * (src & 3); r3 = .Lmemcpy_8 (PC reads as '.' + 8); each handler fills
 * one 64-byte slot.  Case 0000 falls through.
 */
1515	and	r2, r1, #0x03		/* r2 = src & 3 */
1516	orr	r2, r2, r0, lsl #2	/* r2 |= (dst & 3) << 2 (plus high dst bits) */
1517	ands	r2, r2, #0x0f		/* keep the 4 alignment bits; Z set => case 0000 */
1518	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_8 */
1519	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2	/* jump to 64-byte slot r2 */
1520
1521/*
1522 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1523 */
1524	ldr	r2, [r1]
1525	ldr	r3, [r1, #0x04]
1526	str	r2, [r0]
1527	str	r3, [r0, #0x04]
1528	RET
1529	LMEMCPY_8_PAD
1530
1531/*
1532 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1533 */
1534	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
1535	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
1536	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
1537	mov	r3, r3, lsr #8		/* r3 = .210 */
1538	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
1539	mov	r1, r1, lsl #24		/* r1 = 7... */
1540	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
1541	str	r3, [r0]
1542	str	r2, [r0, #0x04]
1543	RET
1544	LMEMCPY_8_PAD
1545
1546/*
1547 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1548 */
1549	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1550	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1551	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
1552	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
1553	mov	r3, r3, lsr #16		/* r3 = ..54 */
1554	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
1555	str	r2, [r0]
1556	str	r3, [r0, #0x04]
1557	RET
1558	LMEMCPY_8_PAD
1559
1560/*
1561 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1562 */
1563	ldrb	r3, [r1]		/* r3 = ...0 */
1564	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
1565	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
1566	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
1567	mov	r2, r2, lsr #24		/* r2 = ...4 */
1568	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
1569	str	r3, [r0]
1570	str	r2, [r0, #0x04]
1571	RET
1572	LMEMCPY_8_PAD
1573
1574/*
1575 * 0100: dst is 8-bit aligned, src is 32-bit aligned
1576 */
1577	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
1578	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
1579	strb	r3, [r0]
1580	mov	r1, r2, lsr #24		/* r1 = ...7 */
1581	strb	r1, [r0, #0x07]
1582	mov	r1, r3, lsr #8		/* r1 = .321 */
1583	mov	r3, r3, lsr #24		/* r3 = ...3 */
1584	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
1585	strh	r1, [r0, #0x01]
1586	str	r3, [r0, #0x03]
1587	RET
1588	LMEMCPY_8_PAD
1589
1590/*
1591 * 0101: dst is 8-bit aligned, src is 8-bit aligned
1592 */
1593	ldrb	r2, [r1]
1594	ldrh	r3, [r1, #0x01]
1595	ldr	ip, [r1, #0x03]
1596	ldrb	r1, [r1, #0x07]
1597	strb	r2, [r0]
1598	strh	r3, [r0, #0x01]
1599	str	ip, [r0, #0x03]
1600	strb	r1, [r0, #0x07]
1601	RET
1602	LMEMCPY_8_PAD
1603
1604/*
1605 * 0110: dst is 8-bit aligned, src is 16-bit aligned
1606 */
1607	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1608	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1609	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
1610	strb	r2, [r0]		/* 0 */
1611	mov	ip, r1, lsr #8		/* ip = ...7 */
1612	strb	ip, [r0, #0x07]		/* 7 */
1613	mov	ip, r2, lsr #8		/* ip = ...1 */
1614	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
1615	mov	r3, r3, lsr #8		/* r3 = .543 */
1616	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
1617	strh	ip, [r0, #0x01]
1618	str	r3, [r0, #0x03]
1619	RET
1620	LMEMCPY_8_PAD
1621
1622/*
1623 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1624 */
1625	ldrb	r3, [r1]		/* r3 = ...0 */
1626	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
1627	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
1628	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
1629	strb	r3, [r0]
1630	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
1631	strh	ip, [r0, #0x01]
1632	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
1633	str	r2, [r0, #0x03]
1634	strb	r1, [r0, #0x07]
1635	RET
1636	LMEMCPY_8_PAD
1637
1638/*
1639 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1640 */
1641	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1642	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
1643	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
1644	strh	r2, [r0]
1645	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
1646	mov	r3, r3, lsr #16		/* r3 = ..76 */
1647	str	r2, [r0, #0x02]
1648	strh	r3, [r0, #0x06]
1649	RET
1650	LMEMCPY_8_PAD
1651
1652/*
1653 * 1001: dst is 16-bit aligned, src is 8-bit aligned
1654 */
1655	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
1656	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
1657	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
1658	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
1659	strh	r1, [r0]
1660	mov	r1, r2, lsr #24		/* r1 = ...2 */
1661	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
1662	mov	r3, r3, lsr #24		/* r3 = ...6 */
1663	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
1664	str	r1, [r0, #0x02]
1665	strh	r3, [r0, #0x06]
1666	RET
1667	LMEMCPY_8_PAD
1668
1669/*
1670 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1671 */
1672	ldrh	r2, [r1]
1673	ldr	ip, [r1, #0x02]
1674	ldrh	r3, [r1, #0x06]
1675	strh	r2, [r0]
1676	str	ip, [r0, #0x02]
1677	strh	r3, [r0, #0x06]
1678	RET
1679	LMEMCPY_8_PAD
1680
1681/*
1682 * 1011: dst is 16-bit aligned, src is 8-bit aligned
1683 */
1684	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
1685	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
1686	ldrb	ip, [r1]		/* ip = ...0 */
1687	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
1688	strh	r1, [r0, #0x06]
1689	mov	r3, r3, lsl #24		/* r3 = 5... */
1690	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
1691	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
1692	str	r3, [r0, #0x02]
1693	strh	r2, [r0]
1694	RET
1695	LMEMCPY_8_PAD
1696
1697/*
1698 * 1100: dst is 8-bit aligned, src is 32-bit aligned
1699 */
1700	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
1701	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1702	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
1703	strh	r1, [r0, #0x05]
1704	strb	r2, [r0]
1705	mov	r1, r3, lsr #24		/* r1 = ...7 */
1706	strb	r1, [r0, #0x07]
1707	mov	r2, r2, lsr #8		/* r2 = .321 */
1708	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
1709	str	r2, [r0, #0x01]
1710	RET
1711	LMEMCPY_8_PAD
1712
1713/*
1714 * 1101: dst is 8-bit aligned, src is 8-bit aligned
1715 */
1716	ldrb	r3, [r1]		/* r3 = ...0 */
1717	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
1718	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
1719	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
1720	strb	r3, [r0]
1721	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
1722	strh	r3, [r0, #0x05]
1723	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
1724	str	r2, [r0, #0x01]
1725	strb	r1, [r0, #0x07]
1726	RET
1727	LMEMCPY_8_PAD
1728
1729/*
1730 * 1110: dst is 8-bit aligned, src is 16-bit aligned
1731 */
1732	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1733	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1734	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
1735	strb	r2, [r0]
1736	mov	ip, r2, lsr #8		/* ip = ...1 */
1737	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
1738	mov	r2, r1, lsr #8		/* r2 = ...7 */
1739	strb	r2, [r0, #0x07]
1740	mov	r1, r1, lsl #8		/* r1 = .76. */
1741	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
1742	str	ip, [r0, #0x01]
1743	strh	r1, [r0, #0x05]
1744	RET
1745	LMEMCPY_8_PAD
1746
1747/*
1748 * 1111: dst is 8-bit aligned, src is 8-bit aligned
1749 * (src & 3) == 3, so the word at src + 1 is 32-bit aligned.
1750 */
1751	ldrb	r2, [r1]
1752	ldr	ip, [r1, #0x01]
1753	ldrh	r3, [r1, #0x05]
1754	ldrb	r1, [r1, #0x07]
1755	strb	r2, [r0]
1756	str	ip, [r0, #0x01]
1757	strh	r3, [r0, #0x05]
1758	strb	r1, [r0, #0x07]
1759	RET
1760	LMEMCPY_8_PAD
1760
1761/******************************************************************************
1762 * Special case for 12 byte copies
1763 */
1764#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
1765#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
1766	LMEMCPY_C_PAD
1767.Lmemcpy_c:
/*
 * Same 16-way alignment dispatch as .Lmemcpy_4: r2 = (dst & 3) << 2 |
 * (src & 3); r3 = .Lmemcpy_c (PC reads as '.' + 8).  The 12-byte
 * handlers are longer, so each slot here is 128 bytes
 * (1 << LMEMCPY_C_LOG2).  Case 0000 falls through.
 */
1768	and	r2, r1, #0x03		/* r2 = src & 3 */
1769	orr	r2, r2, r0, lsl #2	/* r2 |= (dst & 3) << 2 (plus high dst bits) */
1770	ands	r2, r2, #0x0f		/* keep the 4 alignment bits; Z set => case 0000 */
1771	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_c */
1772	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2	/* jump to 128-byte slot r2 */
1773
1774/*
1775 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1776 */
1777	ldr	r2, [r1]
1778	ldr	r3, [r1, #0x04]
1779	ldr	r1, [r1, #0x08]
1780	str	r2, [r0]
1781	str	r3, [r0, #0x04]
1782	str	r1, [r0, #0x08]
1783	RET
1784	LMEMCPY_C_PAD
1785
1786/*
1787 * 0001: dst is 32-bit aligned, src is 8-bit aligned
1788 */
1789	ldrb	r2, [r1, #0xb]		/* r2 = ...B */
1790	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
1791	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
1792	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
1793	mov	r2, r2, lsl #24		/* r2 = B... */
1794	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
1795	str	r2, [r0, #0x08]
1796	mov	r2, ip, lsl #24		/* r2 = 7... */
1797	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
1798	mov	r1, r1, lsr #8		/* r1 = .210 */
1799	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
1800	str	r2, [r0, #0x04]
1801	str	r1, [r0]
1802	RET
1803	LMEMCPY_C_PAD
1804
1805/*
1806 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1807 */
1808	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1809	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1810	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
1811	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
1812	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
1813	str	r2, [r0]
1814	mov	r3, r3, lsr #16		/* r3 = ..54 */
1815	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
1816	mov	r1, r1, lsl #16		/* r1 = BA.. */
1817	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
1818	str	r3, [r0, #0x04]
1819	str	r1, [r0, #0x08]
1820	RET
1821	LMEMCPY_C_PAD
1822
1823/*
1824 * 0011: dst is 32-bit aligned, src is 8-bit aligned
1825 */
1826	ldrb	r2, [r1]		/* r2 = ...0 */
1827	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
1828	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
1829	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
1830	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
1831	str	r2, [r0]
1832	mov	r3, r3, lsr #24		/* r3 = ...4 */
1833	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
1834	mov	r1, r1, lsl #8		/* r1 = BA9. */
1835	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
1836	str	r3, [r0, #0x04]
1837	str	r1, [r0, #0x08]
1838	RET
1839	LMEMCPY_C_PAD
1840
1841/*
1842 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
1843 */
1844	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1845	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
1846	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
1847	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
1848	strh	r1, [r0, #0x01]
1849	strb	r2, [r0]
1850	mov	r1, r2, lsr #24		/* r1 = ...3 */
1851	orr	r2, r1, r3, lsl #8	/* r1 = 6543 */
1852	mov	r1, r3, lsr #24		/* r1 = ...7 */
1853	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
1854	mov	ip, ip, lsr #24		/* ip = ...B */
1855	str	r2, [r0, #0x03]
1856	str	r1, [r0, #0x07]
1857	strb	ip, [r0, #0x0b]
1858	RET
1859	LMEMCPY_C_PAD
1860
1861/*
1862 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
1863 */
1864	ldrb	r2, [r1]
1865	ldrh	r3, [r1, #0x01]
1866	ldr	ip, [r1, #0x03]
1867	strb	r2, [r0]
1868	ldr	r2, [r1, #0x07]
1869	ldrb	r1, [r1, #0x0b]
1870	strh	r3, [r0, #0x01]
1871	str	ip, [r0, #0x03]
1872	str	r2, [r0, #0x07]
1873	strb	r1, [r0, #0x0b]
1874	RET
1875	LMEMCPY_C_PAD
1876
1877/*
1878 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
1879 */
1880	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1881	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1882	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
1883	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
1884	strb	r2, [r0]
1885	mov	r2, r2, lsr #8		/* r2 = ...1 */
1886	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
1887	strh	r2, [r0, #0x01]
1888	mov	r2, r3, lsr #8		/* r2 = .543 */
1889	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
1890	mov	r2, ip, lsr #8		/* r2 = .987 */
1891	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
1892	mov	r1, r1, lsr #8		/* r1 = ...B */
1893	str	r3, [r0, #0x03]
1894	str	r2, [r0, #0x07]
1895	strb	r1, [r0, #0x0b]
1896	RET
1897	LMEMCPY_C_PAD
1898
1899/*
1900 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
1901 */
1902	ldrb	r2, [r1]
1903	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
1904	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
1905	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
1906	strb	r2, [r0]
1907	strh	r3, [r0, #0x01]
1908	mov	r3, r3, lsr #16		/* r3 = ..43 */
1909	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
1910	mov	ip, ip, lsr #16		/* ip = ..87 */
1911	orr	ip, ip, r1, lsl #16	/* ip = A987 */
1912	mov	r1, r1, lsr #16		/* r1 = ..xB */
1913	str	r3, [r0, #0x03]
1914	str	ip, [r0, #0x07]
1915	strb	r1, [r0, #0x0b]
1916	RET
1917	LMEMCPY_C_PAD
1918
1919/*
1920 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1921 */
1922	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
1923	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
1924	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
1925	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
1926	strh	ip, [r0]
1927	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
1928	mov	r3, r3, lsr #16		/* r3 = ..76 */
1929	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
1930	mov	r2, r2, lsr #16		/* r2 = ..BA */
1931	str	r1, [r0, #0x02]
1932	str	r3, [r0, #0x06]
1933	strh	r2, [r0, #0x0a]
1934	RET
1935	LMEMCPY_C_PAD
1936
1937/*
1938 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
1939 */
1940	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
1941	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
1942	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
1943	strh	ip, [r0]
1944	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
1945	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
1946	mov	r2, r2, lsr #24		/* r2 = ...2 */
1947	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
1948	mov	r3, r3, lsr #24		/* r3 = ...6 */
1949	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
1950	mov	r1, r1, lsl #8		/* r1 = ..B. */
1951	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
1952	str	r2, [r0, #0x02]
1953	str	r3, [r0, #0x06]
1954	strh	r1, [r0, #0x0a]
1955	RET
1956	LMEMCPY_C_PAD
1957
1958/*
1959 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1960 */
1961	ldrh	r2, [r1]
1962	ldr	r3, [r1, #0x02]
1963	ldr	ip, [r1, #0x06]
1964	ldrh	r1, [r1, #0x0a]
1965	strh	r2, [r0]
1966	str	r3, [r0, #0x02]
1967	str	ip, [r0, #0x06]
1968	strh	r1, [r0, #0x0a]
1969	RET
1970	LMEMCPY_C_PAD
1971
1972/*
1973 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
1974 */
1975	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
1976	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
1977	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
1978	strh	ip, [r0, #0x0a]
1979	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
1980	ldrb	r1, [r1]		/* r1 = ...0 */
1981	mov	r2, r2, lsl #24		/* r2 = 9... */
1982	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
1983	mov	r3, r3, lsl #24		/* r3 = 5... */
1984	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
1985	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
1986	str	r2, [r0, #0x06]
1987	str	r3, [r0, #0x02]
1988	strh	r1, [r0]
1989	RET
1990	LMEMCPY_C_PAD
1991
1992/*
1993 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
1994 */
1995	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1996	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
1997	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
1998	strb	r2, [r0]
1999	mov	r3, r2, lsr #8		/* r3 = .321 */
2000	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
2001	str	r3, [r0, #0x01]
2002	mov	r3, ip, lsr #8		/* r3 = .765 */
2003	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
2004	str	r3, [r0, #0x05]
2005	mov	r1, r1, lsr #8		/* r1 = .BA9 */
2006	strh	r1, [r0, #0x09]
2007	mov	r1, r1, lsr #16		/* r1 = ...B */
2008	strb	r1, [r0, #0x0b]
2009	RET
2010	LMEMCPY_C_PAD
2011
2012/*
2013 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2014 */
2015	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
2016	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
2017	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
2018	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
2019	strb	r2, [r0, #0x0b]
2020	mov	r2, r3, lsr #16		/* r2 = ..A9 */
2021	strh	r2, [r0, #0x09]
2022	mov	r3, r3, lsl #16		/* r3 = 87.. */
2023	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
2024	mov	ip, ip, lsl #16		/* ip = 43.. */
2025	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
2026	mov	r1, r1, lsr #8		/* r1 = .210 */
2027	str	r3, [r0, #0x05]
2028	str	ip, [r0, #0x01]
2029	strb	r1, [r0]
2030	RET
2031	LMEMCPY_C_PAD
2032
2033/*
2034 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2035 */
2036	ldrh	r2, [r1]		/* r2 = ..10 */
2037	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
2038	ldr	ip, [r1, #0x06]		/* ip = 9876 */
2039	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
2040	strb	r2, [r0]
2041	mov	r2, r2, lsr #8		/* r2 = ...1 */
2042	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
2043	mov	r3, r3, lsr #24		/* r3 = ...5 */
2044	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
2045	mov	ip, ip, lsr #24		/* ip = ...9 */
2046	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
2047	mov	r1, r1, lsr #8		/* r1 = ...B */
2048	str	r2, [r0, #0x01]
2049	str	r3, [r0, #0x05]
2050	strh	ip, [r0, #0x09]
2051	strb	r1, [r0, #0x0b]
2052	RET
2053	LMEMCPY_C_PAD
2054
2055/*
2056 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2057 * (src & 3) == 3, so the words at src + 1 and src + 5 are 32-bit aligned.
2058 */
2059	ldrb	r2, [r1]
2060	ldr	r3, [r1, #0x01]
2061	ldr	ip, [r1, #0x05]
2062	strb	r2, [r0]
2063	ldrh	r2, [r1, #0x09]
2064	ldrb	r1, [r1, #0x0b]
2065	str	r3, [r0, #0x01]
2066	str	ip, [r0, #0x05]
2067	strh	r2, [r0, #0x09]
2068	strb	r1, [r0, #0x0b]
2069END(memcpy)
2070