/*-
 * Copyright (c) 2004 Olivier Houchard
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>
#include <machine/asmacros.h>
__FBSDID("$FreeBSD$");

#include "assym.s"

.L_arm_memcpy:
	.word	_C_LABEL(_arm_memcpy)
.L_arm_bzero:
	.word	_C_LABEL(_arm_bzero)
.L_min_memcpy_size:
	.word	_C_LABEL(_min_memcpy_size)
.L_min_bzero_size:
	.word	_C_LABEL(_min_bzero_size)
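/*
 * The .L_* words above hold addresses of kernel globals: two function
 * pointers and two size thresholds.  When platform code sets _arm_memcpy
 * or _arm_bzero non-NULL (presumably to hand large operations to, e.g.,
 * a DMA engine), bzero() and memcpy() below divert requests of at least
 * _min_bzero_size/_min_memcpy_size bytes through those hooks, falling
 * back to the inline code when a hook returns non-zero in r0.
 */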
/*
 * memset: Sets a block of memory to the specified value
 *
 * On entry:
 *   r0 - dest address
 *   r1 - byte to write
 *   r2 - number of bytes to write
 *
 * On exit:
 *   r0 - dest address
 */
/* LINTSTUB: Func: void bzero(void *, size_t) */
ENTRY(bzero)
	ldr	r3, .L_arm_bzero
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal0
	ldr	r2, .L_min_bzero_size
	ldr	r2, [r2]
	cmp	r1, r2
	blt	.Lnormal0
	stmfd	sp!, {r0, r1, lr}
	mov	r2, #0
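	/*
	 * Classic pre-BLX indirect call: pc reads as "this instruction
	 * + 8", so "mov lr, pc" points lr at the cmp below before the
	 * jump through r3.
	 */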
	mov	lr, pc
	mov	pc, r3
	cmp	r0, #0
	ldmfd	sp!, {r0, r1, lr}
	RETeq
.Lnormal0:
	mov	r3, #0x00
	b	do_memset

/* LINTSTUB: Func: void *memset(void *, int, size_t) */
ENTRY(memset)
	and	r3, r1, #0xff		/* We deal with bytes */
	mov	r1, r2
do_memset:
	cmp	r1, #0x04		/* Do we have less than 4 bytes */
	mov	ip, r0
	blt	.Lmemset_lessthanfour

	/* Ok first we will word align the address */
	ands	r2, ip, #0x03		/* Get the bottom two bits */
	bne	.Lmemset_wordunaligned	/* The address is not word aligned */

	/* We are now word aligned */
.Lmemset_wordaligned:
	orr	r3, r3, r3, lsl #8	/* Extend value to 16-bits */
#ifdef __XSCALE__
	tst	ip, #0x04		/* Quad-align for Xscale */
#else
	cmp	r1, #0x10
#endif
	orr	r3, r3, r3, lsl #16	/* Extend value to 32-bits */
#ifdef __XSCALE__
	subne	r1, r1, #0x04		/* Quad-align if necessary */
	strne	r3, [ip], #0x04
	cmp	r1, #0x10
#endif
	blt	.Lmemset_loop4		/* If less than 16 then use words */
	mov	r2, r3			/* Duplicate data */
	cmp	r1, #0x80		/* If < 128 then skip the big loop */
	blt	.Lmemset_loop32

	/* Do 128 bytes at a time */
.Lmemset_loop128:
	subs	r1, r1, #0x80
#ifdef __XSCALE__
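	/*
	 * "strged" is pre-UAL syntax for a ge-conditional strd: each one
	 * stores the 64-bit register pair r2/r3, so the sixteen below
	 * write 128 bytes per iteration.
	 */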
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop128
	RETeq			/* Zero length so just exit */

	add	r1, r1, #0x80		/* Adjust for extra sub */

	/* Do 32 bytes at a time */
.Lmemset_loop32:
	subs	r1, r1, #0x20
#ifdef __XSCALE__
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	bgt	.Lmemset_loop32
	RETeq			/* Zero length so just exit */

	adds	r1, r1, #0x10		/* Partially adjust for extra sub */

	/* Deal with 16 bytes or more */
#ifdef __XSCALE__
	strged	r2, [ip], #0x08
	strged	r2, [ip], #0x08
#else
	stmgeia	ip!, {r2-r3}
	stmgeia	ip!, {r2-r3}
#endif
	RETeq			/* Zero length so just exit */

	addlt	r1, r1, #0x10		/* Possibly adjust for extra sub */

	/* We have at least 4 bytes so copy as words */
.Lmemset_loop4:
	subs	r1, r1, #0x04
	strge	r3, [ip], #0x04
	bgt	.Lmemset_loop4
	RETeq			/* Zero length so just exit */

#ifdef __XSCALE__
	/* Compensate for 64-bit alignment check */
	adds	r1, r1, #0x04
	RETeq
	cmp	r1, #2
#else
	cmp	r1, #-2
#endif

	strb	r3, [ip], #0x01		/* Set 1 byte */
	strgeb	r3, [ip], #0x01		/* Set another byte */
	strgtb	r3, [ip]		/* and a third */
	RET			/* Exit */

.Lmemset_wordunaligned:
	rsb	r2, r2, #0x004
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r2, #0x02
	strgeb	r3, [ip], #0x01		/* Set another byte */
	sub	r1, r1, r2
	strgtb	r3, [ip], #0x01		/* and a third */
	cmp	r1, #0x04		/* More than 4 bytes left? */
	bge	.Lmemset_wordaligned	/* Yup */

.Lmemset_lessthanfour:
	cmp	r1, #0x00
	RETeq			/* Zero length so exit */
	strb	r3, [ip], #0x01		/* Set 1 byte */
	cmp	r1, #0x02
	strgeb	r3, [ip], #0x01		/* Set another byte */
	strgtb	r3, [ip]		/* and a third */
	RET			/* Exit */

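/*
 * Note: despite the bcmp name, this returns the difference of the first
 * mismatching bytes (memcmp semantics), hence the .Lmemcmp_* labels.
 */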
ENTRY(bcmp)
	mov	ip, r0
	cmp	r2, #0x06
	beq	.Lmemcmp_6bytes
	mov	r0, #0x00

	/* Are both addresses aligned the same way? */
	cmp	r2, #0x00
	eornes	r3, ip, r1
	RETeq			/* len == 0, or same addresses! */
	tst	r3, #0x03
	subne	r2, r2, #0x01
	bne	.Lmemcmp_bytewise2	/* Badly aligned. Do it the slow way */

	/* Word-align the addresses, if necessary */
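	/*
	 * r3 = ((r1 - 5) & 3) * 3, i.e. 3 * ((src - 1) & 3).  Since pc
	 * reads 8 bytes ahead, "addne pc, pc, r3, lsl #3" indexes the
	 * three 6-instruction blocks below, comparing 3, 2 or 1 leading
	 * byte(s); an already word-aligned source skips straight past
	 * them to the word loop.
	 */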
	sub	r3, r1, #0x05
	ands	r3, r3, #0x03
	add	r3, r3, r3, lsl #1
	addne	pc, pc, r3, lsl #3
	nop

	/* Compare up to 3 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare up to 2 bytes */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 1 byte */
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r0, r0, r3
	RETne
	subs	r2, r2, #0x01
	RETeq

	/* Compare 4 bytes at a time, if possible */
	subs	r2, r2, #0x04
	bcc	.Lmemcmp_bytewise
.Lmemcmp_word_aligned:
	ldr	r0, [ip], #0x04
	ldr	r3, [r1], #0x04
	subs	r2, r2, #0x04
	cmpcs	r0, r3
	beq	.Lmemcmp_word_aligned
	sub	r0, r0, r3

	/* Correct for extra subtraction, and check if done */
	adds	r2, r2, #0x04
	cmpeq	r0, #0x00		/* If done, did all bytes match? */
	RETeq			/* Yup. Just return */

	/* Re-do the final word byte-wise */
	sub	ip, ip, #0x04
	sub	r1, r1, #0x04

.Lmemcmp_bytewise:
	add	r2, r2, #0x03
.Lmemcmp_bytewise2:
	ldrb	r0, [ip], #0x01
	ldrb	r3, [r1], #0x01
	subs	r2, r2, #0x01
	cmpcs	r0, r3
	beq	.Lmemcmp_bytewise2
	sub	r0, r0, r3
	RET

	/*
	 * 6 byte compares are very common, thanks to the network stack.
	 * This code is hand-scheduled to reduce the number of stalls for
	 * load results. Everything else being equal, this will be ~32%
	 * faster than a byte-wise memcmp.
	 */
	.align	5
.Lmemcmp_6bytes:
	ldrb	r3, [r1, #0x00]		/* r3 = b2#0 */
	ldrb	r0, [ip, #0x00]		/* r0 = b1#0 */
	ldrb	r2, [r1, #0x01]		/* r2 = b2#1 */
	subs	r0, r0, r3		/* r0 = b1#0 - b2#0 */
	ldreqb	r3, [ip, #0x01]		/* r3 = b1#1 */
	RETne			/* Return if mismatch on #0 */
	subs	r0, r3, r2		/* r0 = b1#1 - b2#1 */
	ldreqb	r3, [r1, #0x02]		/* r3 = b2#2 */
	ldreqb	r0, [ip, #0x02]		/* r0 = b1#2 */
	RETne			/* Return if mismatch on #1 */
	ldrb	r2, [r1, #0x03]		/* r2 = b2#3 */
	subs	r0, r0, r3		/* r0 = b1#2 - b2#2 */
	ldreqb	r3, [ip, #0x03]		/* r3 = b1#3 */
	RETne			/* Return if mismatch on #2 */
	subs	r0, r3, r2		/* r0 = b1#3 - b2#3 */
	ldreqb	r3, [r1, #0x04]		/* r3 = b2#4 */
	ldreqb	r0, [ip, #0x04]		/* r0 = b1#4 */
	RETne			/* Return if mismatch on #3 */
	ldrb	r2, [r1, #0x05]		/* r2 = b2#5 */
	subs	r0, r0, r3		/* r0 = b1#4 - b2#4 */
	ldreqb	r3, [ip, #0x05]		/* r3 = b1#5 */
	RETne			/* Return if mismatch on #4 */
	sub	r0, r3, r2		/* r0 = b1#5 - b2#5 */
	RET

ENTRY(bcopy)
	/* switch the source and destination registers */
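	/* (three eors = in-place XOR swap, no scratch register needed) */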
	eor     r0, r1, r0
	eor     r1, r0, r1
	eor     r0, r1, r0
ENTRY(memmove)
	/* Do the buffers overlap? */
	cmp	r0, r1
	RETeq		/* Bail now if src/dst are the same */
	subcc	r3, r0, r1	/* if (dst > src) r3 = dst - src */
	subcs	r3, r1, r0	/* if (src > dst) r3 = src - dst */
	cmp	r3, r2		/* if (r3 < len) we have an overlap */
	bcc	PIC_SYM(_C_LABEL(memcpy), PLT)

	/* Determine copy direction */
	cmp	r1, r0
	bcc	.Lmemmove_backwards

	moveq	r0, #0			/* Quick abort for len=0 */
	RETeq

	stmdb	sp!, {r0, lr}		/* memmove() returns dest addr */
	subs	r2, r2, #4
	blt	.Lmemmove_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_fsrcul		/* oh unaligned source addr */

.Lmemmove_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemmove_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_floop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemmove_fl32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemmove_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemmove_floop12

.Lmemmove_fl12:
	adds	r2, r2, #8
	blt	.Lmemmove_fl4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	ldmeqia	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemmove_fdestul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemmove_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemmove_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemmove_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemmove_fsrcul3
	beq	.Lmemmove_fsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul1loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #8
#else
	mov	r3, lr, lsr #8
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, lr, lsr #24
#else
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul1l4

.Lmemmove_fsrcul1loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #8
#else
	mov	r12, lr, lsr #8
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #24
#else
	orr	r12, r12, lr, lsl #24
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul1loop4

.Lmemmove_fsrcul1l4:
	sub	r1, r1, #3
	b	.Lmemmove_fl4

.Lmemmove_fsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul2loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #16
#else
	mov	r3, lr, lsr #16
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, lr, lsr #16
#else
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul2l4

.Lmemmove_fsrcul2loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #16
#else
	mov	r12, lr, lsr #16
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #16
#else
	orr	r12, r12, lr, lsl #16
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul2loop4

.Lmemmove_fsrcul2l4:
	sub	r1, r1, #2
	b	.Lmemmove_fl4

.Lmemmove_fsrcul3:
	cmp	r2, #0x0c
	blt	.Lmemmove_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemmove_fsrcul3loop16:
#ifdef __ARMEB__
	mov	r3, lr, lsl #24
#else
	mov	r3, lr, lsr #24
#endif
	ldmia	r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
	orr	r3, r3, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, lr, lsr #8
#else
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemmove_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_fsrcul3l4

.Lmemmove_fsrcul3loop4:
#ifdef __ARMEB__
	mov	r12, lr, lsl #24
#else
	mov	r12, lr, lsr #24
#endif
	ldr	lr, [r1], #4
#ifdef __ARMEB__
	orr	r12, r12, lr, lsr #8
#else
	orr	r12, r12, lr, lsl #8
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemmove_fsrcul3loop4

.Lmemmove_fsrcul3l4:
	sub	r1, r1, #1
	b	.Lmemmove_fl4

.Lmemmove_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	.Lmemmove_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemmove_bdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemmove_bsrcul		/* oh unaligned source addr */

.Lmemmove_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemmove_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	.Lmemmove_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemmove_bloop32

.Lmemmove_bl32:
	cmn	r2, #0x10
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, lr}

.Lmemmove_bl12:
	adds	r2, r2, #8
	blt	.Lmemmove_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemmove_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	RETeq			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	RET

	/* erg - unaligned destination */
.Lmemmove_bdestul:
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	.Lmemmove_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	.Lmemmove_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemmove_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	.Lmemmove_bsrcul1
	beq	.Lmemmove_bsrcul2
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul3loop16:
#ifdef __ARMEB__
	mov	lr, r3, lsr #8
#else
	mov	lr, r3, lsl #8
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r3, lsl #24
#else
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul3l4

.Lmemmove_bsrcul3loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #8
#else
	mov	r12, r3, lsl #8
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #24
#else
	orr	r12, r12, r3, lsr #24
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul3loop4

.Lmemmove_bsrcul3l4:
	add	r1, r1, #3
	b	.Lmemmove_bl4

.Lmemmove_bsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul2loop16:
#ifdef __ARMEB__
	mov	lr, r3, lsr #16
#else
	mov	lr, r3, lsl #16
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r3, lsl #16
#else
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul2l4

.Lmemmove_bsrcul2loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #16
#else
	mov	r12, r3, lsl #16
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #16
#else
	orr	r12, r12, r3, lsr #16
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul2loop4

.Lmemmove_bsrcul2l4:
	add	r1, r1, #2
	b	.Lmemmove_bl4

.Lmemmove_bsrcul1:
	cmp	r2, #0x0c
	blt	.Lmemmove_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

.Lmemmove_bsrcul1loop32:
#ifdef __ARMEB__
	mov	lr, r3, lsr #24
#else
	mov	lr, r3, lsl #24
#endif
	ldmdb	r1!, {r3-r5, r12}
#ifdef __ARMEB__
	orr	lr, lr, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r3, lsl #8
#else
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemmove_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemmove_bsrcul1l4

.Lmemmove_bsrcul1loop4:
#ifdef __ARMEB__
	mov	r12, r3, lsr #24
#else
	mov	r12, r3, lsl #24
#endif
	ldr	r3, [r1, #-4]!
#ifdef __ARMEB__
	orr	r12, r12, r3, lsl #8
#else
	orr	r12, r12, r3, lsr #8
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemmove_bsrcul1loop4

.Lmemmove_bsrcul1l4:
	add	r1, r1, #1
	b	.Lmemmove_bl4

#if !defined(__XSCALE__)
ENTRY(memcpy)
	/* save leaf functions having to store this away */
	/* Do not check arm_memcpy if we're running from flash */
#ifdef FLASHADDR
#if FLASHADDR > PHYSADDR
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bls	.Lnormal
#else
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bhi	.Lnormal
#endif
#endif
	ldr	r3, .L_arm_memcpy
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal
	ldr	r3, .L_min_memcpy_size
	ldr	r3, [r3]
	cmp	r2, r3
	blt	.Lnormal
	stmfd	sp!, {r0-r2, r4, lr}
	mov	r3, #0
	ldr	r4, .L_arm_memcpy
	mov	lr, pc
	ldr	pc, [r4]
	cmp	r0, #0
	ldmfd	sp!, {r0-r2, r4, lr}
	RETeq

.Lnormal:
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	subs	r2, r2, #4
	blt	.Lmemcpy_l4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_srcul		/* oh unaligned source addr */

.Lmemcpy_t8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_loop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_loop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_l32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemcpy_loop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemcpy_loop12

.Lmemcpy_l12:
	adds	r2, r2, #8
	blt	.Lmemcpy_l4

	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_l4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
#ifdef __APCS_26__
	ldmeqia sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemcpy_destul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemcpy_l4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemcpy_t8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemcpy_srcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_srcul3
	beq	.Lmemcpy_srcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul1l4

.Lmemcpy_srcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul1loop4

.Lmemcpy_srcul1l4:
	sub	r1, r1, #3
	b	.Lmemcpy_l4

.Lmemcpy_srcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul2l4

.Lmemcpy_srcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul2loop4

.Lmemcpy_srcul2l4:
	sub	r1, r1, #2
	b	.Lmemcpy_l4

.Lmemcpy_srcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_srcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

.Lmemcpy_srcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_srcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_srcul3l4

.Lmemcpy_srcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_srcul3loop4

.Lmemcpy_srcul3l4:
	sub	r1, r1, #1
	b	.Lmemcpy_l4
#else
/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
ENTRY(memcpy)
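	/* pld is the ARMv5TE cache-preload hint; start fetching src early */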
	pld	[r1]
	cmp	r2, #0x0c
	ble	.Lmemcpy_short		/* <= 12 bytes */
#ifdef FLASHADDR
#if FLASHADDR > PHYSADDR
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bls	.Lnormal
#else
	ldr	r3, =FLASHADDR
	cmp	r3, pc
	bhi	.Lnormal
#endif
#endif
	ldr	r3, .L_arm_memcpy
	ldr	r3, [r3]
	cmp	r3, #0
	beq	.Lnormal
	ldr	r3, .L_min_memcpy_size
	ldr	r3, [r3]
	cmp	r2, r3
	blt	.Lnormal
	stmfd	sp!, {r0-r2, r4, lr}
	mov	r3, #0
	ldr	r4, .L_arm_memcpy
	mov	lr, pc
	ldr	pc, [r4]
	cmp	r0, #0
	ldmfd	sp!, {r0-r2, r4, lr}
	RETeq
.Lnormal:
	mov	r3, r0			/* We must not clobber r0 */

	/* Word-align the destination buffer */
	ands	ip, r3, #0x03		/* Already word aligned? */
	beq	.Lmemcpy_wordaligned	/* Yup */
	cmp	ip, #0x02
	ldrb	ip, [r1], #0x01
	sub	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrleb	ip, [r1], #0x01
	suble	r2, r2, #0x01
	strleb	ip, [r3], #0x01
	ldrltb	ip, [r1], #0x01
	sublt	r2, r2, #0x01
	strltb	ip, [r3], #0x01

	/* Destination buffer is now word aligned */
.Lmemcpy_wordaligned:
	ands	ip, r1, #0x03		/* Is src also word-aligned? */
	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */

	/* Quad-align the destination buffer */
	tst	r3, #0x07		/* Already quad aligned? */
	ldrne	ip, [r1], #0x04
	stmfd	sp!, {r4-r9}		/* Free up some registers */
	subne	r2, r2, #0x04
	strne	ip, [r3], #0x04

	/* Destination buffer quad aligned, source is at least word aligned */
	subs	r2, r2, #0x80
	blt	.Lmemcpy_w_lessthan128

	/* Copy 128 bytes at a time */
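	/*
	 * strd stores an even/odd register pair (r4/r5 etc.) and wants a
	 * 64-bit aligned address, hence the quad-alignment above.  Loads
	 * are scheduled well ahead of the matching stores to hide the
	 * XScale load-use latency.
	 */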
.Lmemcpy_w_loop128:
	ldr	r4, [r1], #0x04		/* LD:00-03 */
	ldr	r5, [r1], #0x04		/* LD:04-07 */
	pld	[r1, #0x18]		/* Prefetch 0x20 */
	ldr	r6, [r1], #0x04		/* LD:08-0b */
	ldr	r7, [r1], #0x04		/* LD:0c-0f */
	ldr	r8, [r1], #0x04		/* LD:10-13 */
	ldr	r9, [r1], #0x04		/* LD:14-17 */
	strd	r4, [r3], #0x08		/* ST:00-07 */
	ldr	r4, [r1], #0x04		/* LD:18-1b */
	ldr	r5, [r1], #0x04		/* LD:1c-1f */
	strd	r6, [r3], #0x08		/* ST:08-0f */
	ldr	r6, [r1], #0x04		/* LD:20-23 */
	ldr	r7, [r1], #0x04		/* LD:24-27 */
	pld	[r1, #0x18]		/* Prefetch 0x40 */
	strd	r8, [r3], #0x08		/* ST:10-17 */
	ldr	r8, [r1], #0x04		/* LD:28-2b */
	ldr	r9, [r1], #0x04		/* LD:2c-2f */
	strd	r4, [r3], #0x08		/* ST:18-1f */
	ldr	r4, [r1], #0x04		/* LD:30-33 */
	ldr	r5, [r1], #0x04		/* LD:34-37 */
	strd	r6, [r3], #0x08		/* ST:20-27 */
	ldr	r6, [r1], #0x04		/* LD:38-3b */
	ldr	r7, [r1], #0x04		/* LD:3c-3f */
	strd	r8, [r3], #0x08		/* ST:28-2f */
	ldr	r8, [r1], #0x04		/* LD:40-43 */
	ldr	r9, [r1], #0x04		/* LD:44-47 */
	pld	[r1, #0x18]		/* Prefetch 0x60 */
	strd	r4, [r3], #0x08		/* ST:30-37 */
	ldr	r4, [r1], #0x04		/* LD:48-4b */
	ldr	r5, [r1], #0x04		/* LD:4c-4f */
	strd	r6, [r3], #0x08		/* ST:38-3f */
	ldr	r6, [r1], #0x04		/* LD:50-53 */
	ldr	r7, [r1], #0x04		/* LD:54-57 */
	strd	r8, [r3], #0x08		/* ST:40-47 */
	ldr	r8, [r1], #0x04		/* LD:58-5b */
	ldr	r9, [r1], #0x04		/* LD:5c-5f */
	strd	r4, [r3], #0x08		/* ST:48-4f */
	ldr	r4, [r1], #0x04		/* LD:60-63 */
	ldr	r5, [r1], #0x04		/* LD:64-67 */
	pld	[r1, #0x18]		/* Prefetch 0x80 */
	strd	r6, [r3], #0x08		/* ST:50-57 */
	ldr	r6, [r1], #0x04		/* LD:68-6b */
	ldr	r7, [r1], #0x04		/* LD:6c-6f */
	strd	r8, [r3], #0x08		/* ST:58-5f */
	ldr	r8, [r1], #0x04		/* LD:70-73 */
	ldr	r9, [r1], #0x04		/* LD:74-77 */
	strd	r4, [r3], #0x08		/* ST:60-67 */
	ldr	r4, [r1], #0x04		/* LD:78-7b */
	ldr	r5, [r1], #0x04		/* LD:7c-7f */
	strd	r6, [r3], #0x08		/* ST:68-6f */
	strd	r8, [r3], #0x08		/* ST:70-77 */
	subs	r2, r2, #0x80
	strd	r4, [r3], #0x08		/* ST:78-7f */
	bge	.Lmemcpy_w_loop128

.Lmemcpy_w_lessthan128:
	adds	r2, r2, #0x80		/* Adjust for extra sub */
	ldmeqfd	sp!, {r4-r9}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x20
	blt	.Lmemcpy_w_lessthan32

	/* Copy 32 bytes at a time */
.Lmemcpy_w_loop32:
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	pld	[r1, #0x18]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	r8, [r1], #0x04
	ldr	r9, [r1], #0x04
	strd	r4, [r3], #0x08
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	strd	r6, [r3], #0x08
	strd	r8, [r3], #0x08
	subs	r2, r2, #0x20
	strd	r4, [r3], #0x08
	bge	.Lmemcpy_w_loop32

.Lmemcpy_w_lessthan32:
	adds	r2, r2, #0x20		/* Adjust for extra sub */
	ldmeqfd	sp!, {r4-r9}
	RETeq			/* Return now if done */

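	/*
	 * Computed jump: each 4-instruction block below moves 8 bytes
	 * (16 bytes of code), so skipping (0x18 - (len & 0x18)) * 2
	 * bytes of code runs exactly the blocks still needed.
	 */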
	and	r4, r2, #0x18
	rsbs	r4, r4, #0x18
	addne	pc, pc, r4, lsl #1
	nop

	/* At least 24 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 16 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* At least 8 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	subs	r2, r2, #0x08
	strd	r4, [r3], #0x08

	/* Less than 8 bytes remaining */
	ldmfd	sp!, {r4-r9}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	ldrge	ip, [r1], #0x04
	strge	ip, [r3], #0x04
	RETeq			/* Return now if done */
	addlt	r2, r2, #0x04
	ldrb	ip, [r1], #0x01
	cmp	r2, #0x02
	ldrgeb	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrgtb	ip, [r1]
	strgeb	r2, [r3], #0x01
	strgtb	ip, [r3]
	RET


/*
 * At this point, it has not been possible to word align both buffers.
 * The destination buffer is word aligned, but the source buffer is not.
 */
.Lmemcpy_bad_align:
	stmfd	sp!, {r4-r7}
	bic	r1, r1, #0x03
	cmp	ip, #2
	ldr	ip, [r1], #0x04
	bgt	.Lmemcpy_bad3
	beq	.Lmemcpy_bad2
	b	.Lmemcpy_bad1

.Lmemcpy_bad1_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #8
#else
	mov	r4, ip, lsr #8
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r6, lsr #24
	mov	r6, r6, lsl #8
	orr	r6, r6, r7, lsr #24
	mov	r7, r7, lsl #8
	orr	r7, r7, ip, lsr #24
#else
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r6, lsl #24
	mov	r6, r6, lsr #8
	orr	r6, r6, r7, lsl #24
	mov	r7, r7, lsr #8
	orr	r7, r7, ip, lsl #24
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad1:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad1_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x03
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad1_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #8
#else
	mov	r4, ip, lsr #8
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #24
#else
	orr	r4, r4, ip, lsl #24
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad1_loop4
	sub	r1, r1, #0x03
	b	.Lmemcpy_bad_done

.Lmemcpy_bad2_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r6, lsr #16
	mov	r6, r6, lsl #16
	orr	r6, r6, r7, lsr #16
	mov	r7, r7, lsl #16
	orr	r7, r7, ip, lsr #16
#else
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r6, lsl #16
	mov	r6, r6, lsr #16
	orr	r6, r6, r7, lsl #16
	mov	r7, r7, lsr #16
	orr	r7, r7, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad2:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad2_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x02
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad2_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #16
#else
	orr	r4, r4, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad2_loop4
	sub	r1, r1, #0x02
	b	.Lmemcpy_bad_done

.Lmemcpy_bad3_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r6, lsr #8
	mov	r6, r6, lsl #24
	orr	r6, r6, r7, lsr #8
	mov	r7, r7, lsl #24
	orr	r7, r7, ip, lsr #8
#else
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r6, lsl #8
	mov	r6, r6, lsr #24
	orr	r6, r6, r7, lsl #8
	mov	r7, r7, lsr #24
	orr	r7, r7, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
.Lmemcpy_bad3:
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bad3_loop16

	adds	r2, r2, #0x10
	ldmeqfd	sp!, {r4-r7}
	RETeq			/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x01
	blt	.Lmemcpy_bad_done

.Lmemcpy_bad3_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #8
#else
	orr	r4, r4, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad3_loop4
	sub	r1, r1, #0x01

.Lmemcpy_bad_done:
	ldmfd	sp!, {r4-r7}
	adds	r2, r2, #0x04
	RETeq
	ldrb	ip, [r1], #0x01
	cmp	r2, #0x02
	ldrgeb	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrgtb	ip, [r1]
	strgeb	r2, [r3], #0x01
	strgtb	ip, [r3]
	RET


/*
 * Handle short copies (less than 16 bytes), possibly misaligned.
 * Some of these are *very* common, thanks to the network stack,
 * and so are handled specially.
 */
.Lmemcpy_short:
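	/*
	 * Branch table: pc reads as the address of the RET below (this
	 * add + 8), so entry r2 (the length, 0x00-0x0c) of the 4-byte
	 * slots that follow is taken.
	 */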
	add	pc, pc, r2, lsl #2
	nop
	RET			/* 0x00 */
	b	.Lmemcpy_bytewise	/* 0x01 */
	b	.Lmemcpy_bytewise	/* 0x02 */
	b	.Lmemcpy_bytewise	/* 0x03 */
	b	.Lmemcpy_4		/* 0x04 */
	b	.Lmemcpy_bytewise	/* 0x05 */
	b	.Lmemcpy_6		/* 0x06 */
	b	.Lmemcpy_bytewise	/* 0x07 */
	b	.Lmemcpy_8		/* 0x08 */
	b	.Lmemcpy_bytewise	/* 0x09 */
	b	.Lmemcpy_bytewise	/* 0x0a */
	b	.Lmemcpy_bytewise	/* 0x0b */
	b	.Lmemcpy_c		/* 0x0c */
.Lmemcpy_bytewise:
	mov	r3, r0			/* We must not clobber r0 */
	ldrb	ip, [r1], #0x01
1:	subs	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrneb	ip, [r1], #0x01
	bne	1b
	RET

/******************************************************************************
 * Special case for 4 byte copies
 */
#define	LMEMCPY_4_LOG2	6	/* 64 bytes */
#define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
	LMEMCPY_4_PAD
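/*
 * The dispatcher below builds a 4-bit case index from the low two bits
 * of dst and src: r2 = ((dst & 3) << 2) | (src & 3).  Each of the 16
 * alignment cases is padded to 64 bytes (LMEMCPY_4_LOG2); "sub r3, pc,
 * #0x14" recovers the address of .Lmemcpy_4 itself (pc reads 8 bytes
 * ahead of the sub, which sits 0x0c bytes in), so "addne pc, ..." lands
 * on case r2, while case 0 simply falls through.
 */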
.Lmemcpy_4:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	str	r2, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = 012. */
	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
#endif
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1]
	ldrh	r2, [r1, #0x02]
#else
	ldrh	r3, [r1, #0x02]
	ldrh	r2, [r1]
#endif
	orr	r3, r2, r3, lsl #16
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #24		/* r3 = 0... */
	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #24		/* r3 = ...0 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
#endif
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0, #0x03]
#endif
	strh	r3, [r0, #0x01]
	RET
	LMEMCPY_4_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
#ifdef __ARMEB__
	mov	r1, r2, lsr #8		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r2, r2, lsl #8		/* r2 = .01. */
	orr	r2, r2, r3, lsr #8	/* r2 = .012 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	mov	r3, r3, lsr #8		/* r3 = ...3 */
#endif
	strh	r2, [r0, #0x01]
	strb	r3, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strh	r2, [r0, #0x02]
	mov	r3, r2, lsr #16
	strh	r3, [r0]
#else
	strh	r2, [r0]
	mov	r3, r2, lsr #16
	strh	r3, [r0, #0x02]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
#endif
	strh	r2, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldrh	r3, [r1, #0x02]
	strh	r2, [r0]
	strh	r3, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
	strh	r1, [r0, #0x02]
#ifdef __ARMEB__
	mov	r3, r3, lsr #24		/* r3 = ...1 */
	orr	r3, r3, r2, lsl #8	/* r3 = xx01 */
#else
	mov	r3, r3, lsl #8		/* r3 = 321. */
	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
#endif
	strh	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	strb	r3, [r0, #0x03]
	mov	r3, r3, lsr #8		/* r3 = ...2 */
	orr	r3, r3, r2, lsl #8	/* r3 = ..12 */
	strh	r3, [r0, #0x01]
	mov	r2, r2, lsr #8		/* r2 = ...0 */
	strb	r2, [r0]
#else
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsr #8		/* r3 = ...3 */
	strb	r3, [r0, #0x03]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD


/******************************************************************************
 * Special case for 6 byte copies
 */
#define	LMEMCPY_6_LOG2	6	/* 64 bytes */
#define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
	LMEMCPY_6_PAD
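/* Same alignment-case dispatch as .Lmemcpy_4: 16 cases, 64 bytes apart */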
.Lmemcpy_6:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldrh	r3, [r1, #0x04]
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #8		/* r2 = .210 */
	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
#endif
	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
#ifdef __ARMEB__
	mov	r1, r3, lsr #16		/* r1 = ..23 */
	orr	r1, r1, r2, lsl #16	/* r1 = 0123 */
	str	r1, [r0]
	strh	r3, [r0, #0x04]
#else
	mov	r1, r3, lsr #16		/* r1 = ..54 */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	strh	r1, [r0, #0x04]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 0... */
	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
	mov	r3, r3, lsl #8		/* r3 = 234. */
	orr	r1, r3, r1, lsr #24	/* r1 = 2345 */
#else
	mov	r2, r2, lsr #24		/* r2 = ...0 */
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	mov	r1, r1, lsl #8		/* r1 = xx5. */
	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
#endif
	str	r2, [r0]
	strh	r1, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
#ifdef __ARMEB__
	mov	r1, r3, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r3, r3, lsl #8		/* r3 = 123. */
	orr	r3, r3, r2, lsr #8	/* r3 = 1234 */
#else
	strb	r3, [r0]
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
	mov	r2, r2, lsr #8		/* r2 = ...5 */
#endif
	strh	r3, [r0, #0x03]
	strb	r2, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #8		/* r3 = ...0 */
	strb	r3, [r0]
	strb	r1, [r0, #0x05]
	mov	r3, r1, lsr #8		/* r3 = .234 */
	strh	r3, [r0, #0x03]
	mov	r3, r2, lsl #8		/* r3 = .01. */
	orr	r3, r3, r1, lsr #24	/* r3 = .012 */
	strh	r3, [r0, #0x01]
#else
	strb	r2, [r0]
	mov	r3, r1, lsr #24
	strb	r3, [r0, #0x05]
	mov	r3, r1, lsr #8		/* r3 = .543 */
	strh	r3, [r0, #0x03]
	mov	r3, r2, lsr #8		/* r3 = ...1 */
	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
	strh	r3, [r0, #0x01]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
#ifdef __ARMEB__
	ldr	r2, [r1]		/* r2 = 0123 */
	ldrh	r3, [r1, #0x04]		/* r3 = ..45 */
	mov	r1, r2, lsr #16		/* r1 = ..01 */
	orr	r3, r3, r2, lsl #16	/* r3 = 2345 */
	strh	r1, [r0]
	str	r3, [r0, #0x02]
#else
	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
	ldr	r3, [r1]		/* r3 = 3210 */
	mov	r2, r2, lsl #16		/* r2 = 54.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
	strh	r3, [r0]
	str	r2, [r0, #0x02]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
#ifdef __ARMEB__
	mov	r2, r2, lsr #8		/* r2 = .345 */
	orr	r2, r2, r3, lsl #24	/* r2 = 2345 */
#else
	mov	r2, r2, lsl #8		/* r2 = 543. */
	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
#endif
	strh	r1, [r0]
	str	r2, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	r3, [r1, #0x02]
	strh	r2, [r0]
	str	r3, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = ..0. */
	orr	r3, r3, r2, lsr #24	/* r3 = ..01 */
	orr	r1, r1, r2, lsl #8	/* r1 = 2345 */
#else
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 5... */
	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
#endif
	strh	r3, [r0]
	str	r1, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #24		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
	mov	r1, r1, lsr #8		/* r1 = ...5 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #8		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #24		/* r2 = 1... */
	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
	mov	r1, r1, lsr #24		/* r1 = ...5 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	str	r3, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD


/******************************************************************************
 * Special case for 8 byte copies
 */
#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
	LMEMCPY_8_PAD
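/* Same alignment-case dispatch as .Lmemcpy_4: 16 cases, 64 bytes apart */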
2106.Lmemcpy_8:
2107	and	r2, r1, #0x03
2108	orr	r2, r2, r0, lsl #2
2109	ands	r2, r2, #0x0f
2110	sub	r3, pc, #0x14
2111	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2
2112
2113/*
2114 * 0000: dst is 32-bit aligned, src is 32-bit aligned
2115 */
2116	ldr	r2, [r1]
2117	ldr	r3, [r1, #0x04]
2118	str	r2, [r0]
2119	str	r3, [r0, #0x04]
2120	RET
2121	LMEMCPY_8_PAD
2122
2123/*
2124 * 0001: dst is 32-bit aligned, src is 8-bit aligned
2125 */
2126	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
2127	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
2128	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
2129#ifdef __ARMEB__
2130	mov	r3, r3, lsl #8		/* r3 = 012. */
2131	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
2132	orr	r2, r1, r2, lsl #8	/* r2 = 4567 */
2133#else
2134	mov	r3, r3, lsr #8		/* r3 = .210 */
2135	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
2136	mov	r1, r1, lsl #24		/* r1 = 7... */
2137	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
2138#endif
2139	str	r3, [r0]
2140	str	r2, [r0, #0x04]
2141	RET
2142	LMEMCPY_8_PAD
2143
2144/*
2145 * 0010: dst is 32-bit aligned, src is 16-bit aligned
2146 */
2147	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2148	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2149	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
2150#ifdef __ARMEB__
2151	mov	r2, r2, lsl #16		/* r2 = 01.. */
2152	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
2153	orr	r3, r1, r3, lsl #16	/* r3 = 4567 */
2154#else
2155	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
2156	mov	r3, r3, lsr #16		/* r3 = ..54 */
2157	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
2158#endif
2159	str	r2, [r0]
2160	str	r3, [r0, #0x04]
2161	RET
2162	LMEMCPY_8_PAD
2163
2164/*
2165 * 0011: dst is 32-bit aligned, src is 8-bit aligned
2166 */
2167	ldrb	r3, [r1]		/* r3 = ...0 */
2168	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
2169	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
2170#ifdef __ARMEB__
2171	mov	r3, r3, lsl #24		/* r3 = 0... */
2172	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
2173	mov	r2, r2, lsl #24		/* r2 = 4... */
2174	orr	r2, r2, r1, lsr #8	/* r2 = 4567 */
2175#else
2176	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
2177	mov	r2, r2, lsr #24		/* r2 = ...4 */
2178	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
2179#endif
2180	str	r3, [r0]
2181	str	r2, [r0, #0x04]
2182	RET
2183	LMEMCPY_8_PAD
2184
2185/*
2186 * 0100: dst is 8-bit aligned, src is 32-bit aligned
2187 */
2188	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
2189	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
2190#ifdef __ARMEB__
2191	mov	r1, r3, lsr #24		/* r1 = ...0 */
2192	strb	r1, [r0]
2193	mov	r1, r3, lsr #8		/* r1 = .012 */
2194	strb	r2, [r0, #0x07]
2195	mov	r3, r3, lsl #24		/* r3 = 3... */
2196	orr	r3, r3, r2, lsr #8	/* r3 = 3456 */
2197#else
2198	strb	r3, [r0]
2199	mov	r1, r2, lsr #24		/* r1 = ...7 */
2200	strb	r1, [r0, #0x07]
2201	mov	r1, r3, lsr #8		/* r1 = .321 */
2202	mov	r3, r3, lsr #24		/* r3 = ...3 */
2203	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
2204#endif
2205	strh	r1, [r0, #0x01]
2206	str	r3, [r0, #0x03]
2207	RET
2208	LMEMCPY_8_PAD
2209
2210/*
2211 * 0101: dst is 8-bit aligned, src is 8-bit aligned
2212 */
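/*
 * ip (r12) is the APCS intra-procedure scratch register, so it may be
 * used here without being saved.
 */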
2213	ldrb	r2, [r1]
2214	ldrh	r3, [r1, #0x01]
2215	ldr	ip, [r1, #0x03]
2216	ldrb	r1, [r1, #0x07]
2217	strb	r2, [r0]
2218	strh	r3, [r0, #0x01]
2219	str	ip, [r0, #0x03]
2220	strb	r1, [r0, #0x07]
2221	RET
2222	LMEMCPY_8_PAD
2223
2224/*
2225 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
2226 */
2227	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2228	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2229	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
2230#ifdef __ARMEB__
2231	mov	ip, r2, lsr #8		/* ip = ...0 */
2232	strb	ip, [r0]
2233	mov	ip, r2, lsl #8		/* ip = .01. */
2234	orr	ip, ip, r3, lsr #24	/* ip = .012 */
2235	strb	r1, [r0, #0x07]
2236	mov	r3, r3, lsl #8		/* r3 = 345. */
2237	orr	r3, r3, r1, lsr #8	/* r3 = 3456 */
2238#else
2239	strb	r2, [r0]		/* 0 */
2240	mov	ip, r1, lsr #8		/* ip = ...7 */
2241	strb	ip, [r0, #0x07]		/* 7 */
2242	mov	ip, r2, lsr #8		/* ip = ...1 */
2243	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
2244	mov	r3, r3, lsr #8		/* r3 = .543 */
2245	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
2246#endif
2247	strh	ip, [r0, #0x01]
2248	str	r3, [r0, #0x03]
2249	RET
2250	LMEMCPY_8_PAD
2251
2252/*
2253 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
2254 */
2255	ldrb	r3, [r1]		/* r3 = ...0 */
2256	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
2257	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
2258	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
2259	strb	r3, [r0]
2260	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
2261#ifdef __ARMEB__
2262	strh	r3, [r0, #0x01]
2263	orr	r2, r2, ip, lsl #16	/* r2 = 3456 */
2264#else
2265	strh	ip, [r0, #0x01]
2266	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
2267#endif
2268	str	r2, [r0, #0x03]
2269	strb	r1, [r0, #0x07]
2270	RET
2271	LMEMCPY_8_PAD
2272
2273/*
2274 * 1000: dst is 16-bit aligned, src is 32-bit aligned
2275 */
2276	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2277	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2278	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
2279#ifdef __ARMEB__
2280	strh	r1, [r0]
2281	mov	r1, r3, lsr #16		/* r1 = ..45 */
2282	orr	r2, r1, r2, lsl #16	/* r2 = 2345 */
2283#else
2284	strh	r2, [r0]
2285	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
2286	mov	r3, r3, lsr #16		/* r3 = ..76 */
2287#endif
2288	str	r2, [r0, #0x02]
2289	strh	r3, [r0, #0x06]
2290	RET
2291	LMEMCPY_8_PAD
2292
2293/*
2294 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
2295 */
2296	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
2297	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
2298	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
2299	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
2300	strh	r1, [r0]
2301#ifdef __ARMEB__
2302	mov	r1, r2, lsl #24		/* r1 = 2... */
2303	orr	r1, r1, r3, lsr #8	/* r1 = 2345 */
2304	orr	r3, ip, r3, lsl #8	/* r3 = 4567 */
2305#else
2306	mov	r1, r2, lsr #24		/* r1 = ...2 */
2307	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
2308	mov	r3, r3, lsr #24		/* r3 = ...6 */
2309	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
2310#endif
2311	str	r1, [r0, #0x02]
2312	strh	r3, [r0, #0x06]
2313	RET
2314	LMEMCPY_8_PAD
2315
2316/*
2317 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2318 */
2319	ldrh	r2, [r1]
2320	ldr	ip, [r1, #0x02]
2321	ldrh	r3, [r1, #0x06]
2322	strh	r2, [r0]
2323	str	ip, [r0, #0x02]
2324	strh	r3, [r0, #0x06]
2325	RET
2326	LMEMCPY_8_PAD
2327
2328/*
2329 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2330 */
2331	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
2332	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
2333	ldrb	ip, [r1]		/* ip = ...0 */
2334	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
2335	strh	r1, [r0, #0x06]
2336#ifdef __ARMEB__
2337	mov	r3, r3, lsr #24		/* r3 = ...5 */
2338	orr	r3, r3, r2, lsl #8	/* r3 = 2345 */
2339	mov	r2, r2, lsr #24		/* r2 = ...1 */
2340	orr	r2, r2, ip, lsl #8	/* r2 = ..01 */
2341#else
2342	mov	r3, r3, lsl #24		/* r3 = 5... */
2343	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
2344	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
2345#endif
2346	str	r3, [r0, #0x02]
2347	strh	r2, [r0]
2348	RET
2349	LMEMCPY_8_PAD
2350
2351/*
2352 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2353 */
2354	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2355	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2356	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
2357	strh	r1, [r0, #0x05]
2358#ifdef __ARMEB__
2359	strb	r3, [r0, #0x07]
2360	mov	r1, r2, lsr #24		/* r1 = ...0 */
2361	strb	r1, [r0]
2362	mov	r2, r2, lsl #8		/* r2 = 123. */
2363	orr	r2, r2, r3, lsr #24	/* r2 = 1234 */
2364	str	r2, [r0, #0x01]
2365#else
2366	strb	r2, [r0]
2367	mov	r1, r3, lsr #24		/* r1 = ...7 */
2368	strb	r1, [r0, #0x07]
2369	mov	r2, r2, lsr #8		/* r2 = .321 */
2370	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
2371	str	r2, [r0, #0x01]
2372#endif
2373	RET
2374	LMEMCPY_8_PAD
2375
2376/*
2377 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2378 */
2379	ldrb	r3, [r1]		/* r3 = ...0 */
2380	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
2381	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
2382	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
2383	strb	r3, [r0]
2384	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
2385#ifdef __ARMEB__
2386	strh	ip, [r0, #0x05]
2387	orr	r2, r3, r2, lsl #16	/* r2 = 1234 */
2388#else
2389	strh	r3, [r0, #0x05]
2390	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
2391#endif
2392	str	r2, [r0, #0x01]
2393	strb	r1, [r0, #0x07]
2394	RET
2395	LMEMCPY_8_PAD
2396
2397/*
2398 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2399 */
2400	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2401	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2402	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
2403#ifdef __ARMEB__
2404	mov	ip, r2, lsr #8		/* ip = ...0 */
2405	strb	ip, [r0]
2406	mov	ip, r2, lsl #24		/* ip = 1... */
2407	orr	ip, ip, r3, lsr #8	/* ip = 1234 */
2408	strb	r1, [r0, #0x07]
2409	mov	r1, r1, lsr #8		/* r1 = ...6 */
2410	orr	r1, r1, r3, lsl #8	/* r1 = 3456 */
2411#else
2412	strb	r2, [r0]
2413	mov	ip, r2, lsr #8		/* ip = ...1 */
2414	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
2415	mov	r2, r1, lsr #8		/* r2 = ...7 */
2416	strb	r2, [r0, #0x07]
2417	mov	r1, r1, lsl #8		/* r1 = .76. */
2418	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
2419#endif
2420	str	ip, [r0, #0x01]
2421	strh	r1, [r0, #0x05]
2422	RET
2423	LMEMCPY_8_PAD
2424
2425/*
2426 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2427 */
2428	ldrb	r2, [r1]
2429	ldr	ip, [r1, #0x01]
2430	ldrh	r3, [r1, #0x05]
2431	ldrb	r1, [r1, #0x07]
2432	strb	r2, [r0]
2433	str	ip, [r0, #0x01]
2434	strh	r3, [r0, #0x05]
2435	strb	r1, [r0, #0x07]
2436	RET
2437	LMEMCPY_8_PAD
2438
2439/******************************************************************************
2440 * Special case for 12 byte copies
2441 */
2442#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
2443#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
2444	LMEMCPY_C_PAD
2445.Lmemcpy_c:
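/*
 * Same computed-branch dispatch as .Lmemcpy_8, but with 128-byte
 * (32-instruction) slots: pc - 0x14 in the sub below recovers
 * .Lmemcpy_c, and the addne jumps to slot ((dst & 3) << 2) | (src & 3).
 */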
2446	and	r2, r1, #0x03
2447	orr	r2, r2, r0, lsl #2
2448	ands	r2, r2, #0x0f
2449	sub	r3, pc, #0x14
2450	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2
2451
2452/*
2453 * 0000: dst is 32-bit aligned, src is 32-bit aligned
2454 */
2455	ldr	r2, [r1]
2456	ldr	r3, [r1, #0x04]
2457	ldr	r1, [r1, #0x08]
2458	str	r2, [r0]
2459	str	r3, [r0, #0x04]
2460	str	r1, [r0, #0x08]
2461	RET
2462	LMEMCPY_C_PAD
2463
2464/*
2465 * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
2466 */
2467	ldrb	r2, [r1, #0xb]		/* r2 = ...B */
2468	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
2469	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
2470	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
2471#ifdef __ARMEB__
2472	orr	r2, r2, ip, lsl #8	/* r2 = 89AB */
2473	str	r2, [r0, #0x08]
2474	mov	r2, ip, lsr #24		/* r2 = ...7 */
2475	orr	r2, r2, r3, lsl #8	/* r2 = 4567 */
2476	mov	r1, r1, lsl #8		/* r1 = 012. */
2477	orr	r1, r1, r3, lsr #24	/* r1 = 0123 */
2478#else
2479	mov	r2, r2, lsl #24		/* r2 = B... */
2480	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
2481	str	r2, [r0, #0x08]
2482	mov	r2, ip, lsl #24		/* r2 = 7... */
2483	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
2484	mov	r1, r1, lsr #8		/* r1 = .210 */
2485	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
2486#endif
2487	str	r2, [r0, #0x04]
2488	str	r1, [r0]
2489	RET
2490	LMEMCPY_C_PAD
2491
2492/*
2493 * 0010: dst is 32-bit aligned, src is 16-bit aligned
2494 */
2495	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2496	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2497	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
2498	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
2499#ifdef __ARMEB__
2500	mov	r2, r2, lsl #16		/* r2 = 01.. */
2501	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
2502	str	r2, [r0]
2503	mov	r3, r3, lsl #16		/* r3 = 45.. */
2504	orr	r3, r3, ip, lsr #16	/* r3 = 4567 */
2505	orr	r1, r1, ip, lsl #16	/* r1 = 89AB */
2506#else
2507	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
2508	str	r2, [r0]
2509	mov	r3, r3, lsr #16		/* r3 = ..54 */
2510	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
2511	mov	r1, r1, lsl #16		/* r1 = BA.. */
2512	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
2513#endif
2514	str	r3, [r0, #0x04]
2515	str	r1, [r0, #0x08]
2516	RET
2517	LMEMCPY_C_PAD
2518
2519/*
2520 * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
2521 */
2522	ldrb	r2, [r1]		/* r2 = ...0 */
2523	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
2524	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
2525	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
2526#ifdef __ARMEB__
2527	mov	r2, r2, lsl #24		/* r2 = 0... */
2528	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
2529	str	r2, [r0]
2530	mov	r3, r3, lsl #24		/* r3 = 4... */
2531	orr	r3, r3, ip, lsr #8	/* r3 = 4567 */
2532	mov	r1, r1, lsr #8		/* r1 = .9AB */
2533	orr	r1, r1, ip, lsl #24	/* r1 = 89AB */
2534#else
2535	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
2536	str	r2, [r0]
2537	mov	r3, r3, lsr #24		/* r3 = ...4 */
2538	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
2539	mov	r1, r1, lsl #8		/* r1 = BA9. */
2540	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
2541#endif
2542	str	r3, [r0, #0x04]
2543	str	r1, [r0, #0x08]
2544	RET
2545	LMEMCPY_C_PAD
2546
2547/*
2548 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
2549 */
2550	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2551	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2552	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
2553	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
2554	strh	r1, [r0, #0x01]
2555#ifdef __ARMEB__
2556	mov	r1, r2, lsr #24		/* r1 = ...0 */
2557	strb	r1, [r0]
2558	mov	r1, r2, lsl #24		/* r1 = 3... */
2559	orr	r2, r1, r3, lsr #8	/* r2 = 3456 */
2560	mov	r1, r3, lsl #24		/* r1 = 7... */
2561	orr	r1, r1, ip, lsr #8	/* r1 = 789A */
2562#else
2563	strb	r2, [r0]
2564	mov	r1, r2, lsr #24		/* r1 = ...3 */
2565	orr	r2, r1, r3, lsl #8	/* r2 = 6543 */
2566	mov	r1, r3, lsr #24		/* r1 = ...7 */
2567	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
2568	mov	ip, ip, lsr #24		/* ip = ...B */
2569#endif
2570	str	r2, [r0, #0x03]
2571	str	r1, [r0, #0x07]
2572	strb	ip, [r0, #0x0b]
2573	RET
2574	LMEMCPY_C_PAD
2575
2576/*
2577 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
2578 */
2579	ldrb	r2, [r1]
2580	ldrh	r3, [r1, #0x01]
2581	ldr	ip, [r1, #0x03]
2582	strb	r2, [r0]
2583	ldr	r2, [r1, #0x07]
2584	ldrb	r1, [r1, #0x0b]
2585	strh	r3, [r0, #0x01]
2586	str	ip, [r0, #0x03]
2587	str	r2, [r0, #0x07]
2588	strb	r1, [r0, #0x0b]
2589	RET
2590	LMEMCPY_C_PAD
2591
2592/*
2593 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
2594 */
2595	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
2596	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
2597	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
2598	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
2599#ifdef __ARMEB__
2600	mov	r2, r2, ror #8		/* r2 = 1..0 */
2601	strb	r2, [r0]
2602	mov	r2, r2, lsr #16		/* r2 = ..1. */
2603	orr	r2, r2, r3, lsr #24	/* r2 = ..12 */
2604	strh	r2, [r0, #0x01]
2605	mov	r2, r3, lsl #8		/* r2 = 345. */
2606	orr	r3, r2, ip, lsr #24	/* r3 = 3456 */
2607	mov	r2, ip, lsl #8		/* r2 = 789. */
2608	orr	r2, r2, r1, lsr #8	/* r2 = 789A */
2609#else
2610	strb	r2, [r0]
2611	mov	r2, r2, lsr #8		/* r2 = ...1 */
2612	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
2613	strh	r2, [r0, #0x01]
2614	mov	r2, r3, lsr #8		/* r2 = .543 */
2615	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
2616	mov	r2, ip, lsr #8		/* r2 = .987 */
2617	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
2618	mov	r1, r1, lsr #8		/* r1 = ...B */
2619#endif
2620	str	r3, [r0, #0x03]
2621	str	r2, [r0, #0x07]
2622	strb	r1, [r0, #0x0b]
2623	RET
2624	LMEMCPY_C_PAD
2625
2626/*
2627 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
2628 */
2629	ldrb	r2, [r1]
2630	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
2631	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
2632	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
2633	strb	r2, [r0]
2634#ifdef __ARMEB__
2635	mov	r2, r3, lsr #16		/* r2 = ..12 */
2636	strh	r2, [r0, #0x01]
2637	mov	r3, r3, lsl #16		/* r3 = 34.. */
2638	orr	r3, r3, ip, lsr #16	/* r3 = 3456 */
2639	mov	ip, ip, lsl #16		/* ip = 78.. */
2640	orr	ip, ip, r1, lsr #16	/* ip = 789A */
2641	mov	r1, r1, lsr #8		/* r1 = .9AB */
2642#else
2643	strh	r3, [r0, #0x01]
2644	mov	r3, r3, lsr #16		/* r3 = ..43 */
2645	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
2646	mov	ip, ip, lsr #16		/* ip = ..87 */
2647	orr	ip, ip, r1, lsl #16	/* ip = A987 */
2648	mov	r1, r1, lsr #16		/* r1 = ..xB */
2649#endif
2650	str	r3, [r0, #0x03]
2651	str	ip, [r0, #0x07]
2652	strb	r1, [r0, #0x0b]
2653	RET
2654	LMEMCPY_C_PAD
2655
2656/*
2657 * 1000: dst is 16-bit aligned, src is 32-bit aligned
2658 */
2659	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
2660	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
2661	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
2662	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
2663#ifdef __ARMEB__
2664	strh	r1, [r0]
2665	mov	r1, ip, lsl #16		/* r1 = 23.. */
2666	orr	r1, r1, r3, lsr #16	/* r1 = 2345 */
2667	mov	r3, r3, lsl #16		/* r3 = 67.. */
2668	orr	r3, r3, r2, lsr #16	/* r3 = 6789 */
2669#else
2670	strh	ip, [r0]
2671	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
2672	mov	r3, r3, lsr #16		/* r3 = ..76 */
2673	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
2674	mov	r2, r2, lsr #16		/* r2 = ..BA */
2675#endif
2676	str	r1, [r0, #0x02]
2677	str	r3, [r0, #0x06]
2678	strh	r2, [r0, #0x0a]
2679	RET
2680	LMEMCPY_C_PAD
2681
2682/*
2683 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
2684 */
2685	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
2686	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
2687	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
2688	strh	ip, [r0]
2689	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
2690	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
2691#ifdef __ARMEB__
2692	mov	r2, r2, lsl #24		/* r2 = 2... */
2693	orr	r2, r2, r3, lsr #8	/* r2 = 2345 */
2694	mov	r3, r3, lsl #24		/* r3 = 6... */
2695	orr	r3, r3, ip, lsr #8	/* r3 = 6789 */
2696	orr	r1, r1, ip, lsl #8	/* r1 = 89AB */
2697#else
2698	mov	r2, r2, lsr #24		/* r2 = ...2 */
2699	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
2700	mov	r3, r3, lsr #24		/* r3 = ...6 */
2701	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
2702	mov	r1, r1, lsl #8		/* r1 = ..B. */
2703	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
2704#endif
2705	str	r2, [r0, #0x02]
2706	str	r3, [r0, #0x06]
2707	strh	r1, [r0, #0x0a]
2708	RET
2709	LMEMCPY_C_PAD
2710
2711/*
2712 * 1010: dst is 16-bit aligned, src is 16-bit aligned
2713 */
2714	ldrh	r2, [r1]
2715	ldr	r3, [r1, #0x02]
2716	ldr	ip, [r1, #0x06]
2717	ldrh	r1, [r1, #0x0a]
2718	strh	r2, [r0]
2719	str	r3, [r0, #0x02]
2720	str	ip, [r0, #0x06]
2721	strh	r1, [r0, #0x0a]
2722	RET
2723	LMEMCPY_C_PAD
2724
2725/*
2726 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
2727 */
2728	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
2729	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
2730	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
2731	strh	ip, [r0, #0x0a]
2732	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
2733	ldrb	r1, [r1]		/* r1 = ...0 */
2734#ifdef __ARMEB__
2735	mov	r2, r2, lsr #24		/* r2 = ...9 */
2736	orr	r2, r2, r3, lsl #8	/* r2 = 6789 */
2737	mov	r3, r3, lsr #24		/* r3 = ...5 */
2738	orr	r3, r3, ip, lsl #8	/* r3 = 2345 */
2739	mov	r1, r1, lsl #8		/* r1 = ..0. */
2740	orr	r1, r1, ip, lsr #24	/* r1 = ..01 */
2741#else
2742	mov	r2, r2, lsl #24		/* r2 = 9... */
2743	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
2744	mov	r3, r3, lsl #24		/* r3 = 5... */
2745	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
2746	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
2747#endif
2748	str	r2, [r0, #0x06]
2749	str	r3, [r0, #0x02]
2750	strh	r1, [r0]
2751	RET
2752	LMEMCPY_C_PAD
2753
2754/*
2755 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
2756 */
2757	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
2758	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
2759	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
2760#ifdef __ARMEB__
2761	mov	r3, r2, lsr #24		/* r3 = ...0 */
2762	strb	r3, [r0]
2763	mov	r2, r2, lsl #8		/* r2 = 123. */
2764	orr	r2, r2, ip, lsr #24	/* r2 = 1234 */
2765	str	r2, [r0, #0x01]
2766	mov	r2, ip, lsl #8		/* r2 = 567. */
2767	orr	r2, r2, r1, lsr #24	/* r2 = 5678 */
2768	str	r2, [r0, #0x05]
2769	mov	r2, r1, lsr #8		/* r2 = .89A */
2770	strh	r2, [r0, #0x09]
2771	strb	r1, [r0, #0x0b]
2772#else
2773	strb	r2, [r0]
2774	mov	r3, r2, lsr #8		/* r3 = .321 */
2775	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
2776	str	r3, [r0, #0x01]
2777	mov	r3, ip, lsr #8		/* r3 = .765 */
2778	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
2779	str	r3, [r0, #0x05]
2780	mov	r1, r1, lsr #8		/* r1 = .BA9 */
2781	strh	r1, [r0, #0x09]
2782	mov	r1, r1, lsr #16		/* r1 = ...B */
2783	strb	r1, [r0, #0x0b]
2784#endif
2785	RET
2786	LMEMCPY_C_PAD
2787
2788/*
2789 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
2790 */
2791	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
2792	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
2793	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
2794	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
2795	strb	r2, [r0, #0x0b]
2796#ifdef __ARMEB__
2797	strh	r3, [r0, #0x09]
2798	mov	r3, r3, lsr #16		/* r3 = ..78 */
2799	orr	r3, r3, ip, lsl #16	/* r3 = 5678 */
2800	mov	ip, ip, lsr #16		/* ip = ..34 */
2801	orr	ip, ip, r1, lsl #16	/* ip = 1234 */
2802	mov	r1, r1, lsr #16		/* r1 = ..x0 */
2803#else
2804	mov	r2, r3, lsr #16		/* r2 = ..A9 */
2805	strh	r2, [r0, #0x09]
2806	mov	r3, r3, lsl #16		/* r3 = 87.. */
2807	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
2808	mov	ip, ip, lsl #16		/* ip = 43.. */
2809	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
2810	mov	r1, r1, lsr #8		/* r1 = .210 */
2811#endif
2812	str	r3, [r0, #0x05]
2813	str	ip, [r0, #0x01]
2814	strb	r1, [r0]
2815	RET
2816	LMEMCPY_C_PAD
2817
2818/*
2819 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
2820 */
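/*
 * Unlike the preceding cases, the load order here differs between
 * endiannesses as well, so the whole body, loads included, is
 * duplicated under __ARMEB__ instead of sharing the loads.
 */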
2821#ifdef __ARMEB__
2822	ldrh	r2, [r1, #0x0a]		/* r2 = ..AB */
2823	ldr	ip, [r1, #0x06]		/* ip = 6789 */
2824	ldr	r3, [r1, #0x02]		/* r3 = 2345 */
2825	ldrh	r1, [r1]		/* r1 = ..01 */
2826	strb	r2, [r0, #0x0b]
2827	mov	r2, r2, lsr #8		/* r2 = ...A */
2828	orr	r2, r2, ip, lsl #8	/* r2 = 789A */
2829	mov	ip, ip, lsr #8		/* ip = .678 */
2830	orr	ip, ip, r3, lsl #24	/* ip = 5678 */
2831	mov	r3, r3, lsr #8		/* r3 = .234 */
2832	orr	r3, r3, r1, lsl #24	/* r3 = 1234 */
2833	mov	r1, r1, lsr #8		/* r1 = ...0 */
2834	strb	r1, [r0]
2835	str	r3, [r0, #0x01]
2836	str	ip, [r0, #0x05]
2837	strh	r2, [r0, #0x09]
2838#else
2839	ldrh	r2, [r1]		/* r2 = ..10 */
2840	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
2841	ldr	ip, [r1, #0x06]		/* ip = 9876 */
2842	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
2843	strb	r2, [r0]
2844	mov	r2, r2, lsr #8		/* r2 = ...1 */
2845	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
2846	mov	r3, r3, lsr #24		/* r3 = ...5 */
2847	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
2848	mov	ip, ip, lsr #24		/* ip = ...9 */
2849	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
2850	mov	r1, r1, lsr #8		/* r1 = ...B */
2851	str	r2, [r0, #0x01]
2852	str	r3, [r0, #0x05]
2853	strh	ip, [r0, #0x09]
2854	strb	r1, [r0, #0x0b]
2855#endif
2856	RET
2857	LMEMCPY_C_PAD
2858
2859/*
2860 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
2861 */
2862	ldrb	r2, [r1]
2863	ldr	r3, [r1, #0x01]
2864	ldr	ip, [r1, #0x05]
2865	strb	r2, [r0]
2866	ldrh	r2, [r1, #0x09]
2867	ldrb	r1, [r1, #0x0b]
2868	str	r3, [r0, #0x01]
2869	str	ip, [r0, #0x05]
2870	strh	r2, [r0, #0x09]
2871	strb	r1, [r0, #0x0b]
2872	RET
2873#endif /* __XSCALE__ */
2874
2875#ifdef GPROF
2876
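/*
 * Placeholder entry points for GPROF kernel profiling.  Only the
 * symbols themselves are needed, apparently as boundary markers for
 * attributing time spent in user mode, traps, and interrupts; the
 * nop bodies are not meant to do any work.
 */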
2877ENTRY(user)
2878	nop
2879ENTRY(btrap)
2880	nop
2881ENTRY(etrap)
2882	nop
2883ENTRY(bintr)
2884	nop
2885ENTRY(eintr)
2886	nop
2887
2888#endif
2889