1/*	$NetBSD: memcpy_xscale.S,v 1.5 2013/12/17 01:27:21 joerg Exp $	*/
2
3/*
4 * Copyright 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Steve C. Woodford for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 *    must display the following acknowledgement:
19 *      This product includes software developed for the NetBSD Project by
20 *      Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 *    or promote products derived from this software without specific prior
23 *    written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 */
37
38#include <machine/asm.h>
39
/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
/*
 * APCS: r0 = dst, r1 = src, r2 = len; returns original dst in r0.
 * ip (r12) is scratch; r4-r9 are callee-saved and pushed only on the
 * large-copy paths that need them.
 */
ENTRY(memcpy)
	pld	[r1]			/* Start prefetching the first src line */
	cmp	r2, #0x0c
	ble	.Lmemcpy_short		/* <= 12 bytes */
	mov	r3, r0			/* We must not clobber r0 */

	/*
	 * Word-align the destination buffer.
	 * ip = dst & 3; we must copy (4 - ip) bytes.  "cmp ip, #0x02"
	 * sets the flags once so the conditional chain below copies:
	 * byte 1 always, byte 2 if ip <= 2, byte 3 if ip < 2 (ip == 1).
	 */
	ands	ip, r3, #0x03		/* Already word aligned? */
	beq	.Lmemcpy_wordaligned	/* Yup */
	cmp	ip, #0x02
	ldrb	ip, [r1], #0x01
	sub	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrble	ip, [r1], #0x01
	suble	r2, r2, #0x01
	strble	ip, [r3], #0x01
	ldrblt	ip, [r1], #0x01
	sublt	r2, r2, #0x01
	strblt	ip, [r3], #0x01

	/* Destination buffer is now word aligned */
.Lmemcpy_wordaligned:
	ands	ip, r1, #0x03		/* Is src also word-aligned? */
	bne	.Lmemcpy_bad_align	/* Nope. Things just got bad */

	/*
	 * Quad-align the destination buffer (strd needs 8-byte dst
	 * alignment on XScale).  The push is unconditional and is
	 * scheduled between the conditional transfer instructions.
	 */
	tst	r3, #0x07		/* Already quad aligned? */
	ldrne	ip, [r1], #0x04
	push	{r4-r9}		/* Free up some registers */
	subne	r2, r2, #0x04
	strne	ip, [r3], #0x04

	/* Destination buffer quad aligned, source is at least word aligned */
	subs	r2, r2, #0x80
	blt	.Lmemcpy_w_lessthan128

	/* Copy 128 bytes at a time */
/*
 * Main word-aligned copy loop: 128 bytes per iteration.  Loads and
 * stores are interleaved (load-use distance kept long) to hide XScale
 * load latency, with a pld every 32 source bytes.  On entry r2 has
 * already had 0x80 subtracted; the loop runs while r2 >= 0.
 */
.Lmemcpy_w_loop128:
	ldr	r4, [r1], #0x04		/* LD:00-03 */
	ldr	r5, [r1], #0x04		/* LD:04-07 */
	pld	[r1, #0x18]		/* Prefetch 0x20 */
	ldr	r6, [r1], #0x04		/* LD:08-0b */
	ldr	r7, [r1], #0x04		/* LD:0c-0f */
	ldr	r8, [r1], #0x04		/* LD:10-13 */
	ldr	r9, [r1], #0x04		/* LD:14-17 */
	strd	r4, r5, [r3], #0x08	/* ST:00-07 */
	ldr	r4, [r1], #0x04		/* LD:18-1b */
	ldr	r5, [r1], #0x04		/* LD:1c-1f */
	strd	r6, r7, [r3], #0x08	/* ST:08-0f */
	ldr	r6, [r1], #0x04		/* LD:20-23 */
	ldr	r7, [r1], #0x04		/* LD:24-27 */
	pld	[r1, #0x18]		/* Prefetch 0x40 */
	strd	r8, r9, [r3], #0x08	/* ST:10-17 */
	ldr	r8, [r1], #0x04		/* LD:28-2b */
	ldr	r9, [r1], #0x04		/* LD:2c-2f */
	strd	r4, r5, [r3], #0x08	/* ST:18-1f */
	ldr	r4, [r1], #0x04		/* LD:30-33 */
	ldr	r5, [r1], #0x04		/* LD:34-37 */
	strd	r6, r7, [r3], #0x08	/* ST:20-27 */
	ldr	r6, [r1], #0x04		/* LD:38-3b */
	ldr	r7, [r1], #0x04		/* LD:3c-3f */
	strd	r8, r9, [r3], #0x08	/* ST:28-2f */
	ldr	r8, [r1], #0x04		/* LD:40-43 */
	ldr	r9, [r1], #0x04		/* LD:44-47 */
	pld	[r1, #0x18]		/* Prefetch 0x60 */
	strd	r4, r5, [r3], #0x08	/* ST:30-37 */
	ldr	r4, [r1], #0x04		/* LD:48-4b */
	ldr	r5, [r1], #0x04		/* LD:4c-4f */
	strd	r6, r7, [r3], #0x08	/* ST:38-3f */
	ldr	r6, [r1], #0x04		/* LD:50-53 */
	ldr	r7, [r1], #0x04		/* LD:54-57 */
	strd	r8, r9, [r3], #0x08	/* ST:40-47 */
	ldr	r8, [r1], #0x04		/* LD:58-5b */
	ldr	r9, [r1], #0x04		/* LD:5c-5f */
	strd	r4, r5, [r3], #0x08	/* ST:48-4f */
	ldr	r4, [r1], #0x04		/* LD:60-63 */
	ldr	r5, [r1], #0x04		/* LD:64-67 */
	pld	[r1, #0x18]		/* Prefetch 0x80 */
	strd	r6, r7, [r3], #0x08	/* ST:50-57 */
	ldr	r6, [r1], #0x04		/* LD:68-6b */
	ldr	r7, [r1], #0x04		/* LD:6c-6f */
	strd	r8, r9, [r3], #0x08	/* ST:58-5f */
	ldr	r8, [r1], #0x04		/* LD:70-73 */
	ldr	r9, [r1], #0x04		/* LD:74-77 */
	strd	r4, r5, [r3], #0x08	/* ST:60-67 */
	ldr	r4, [r1], #0x04		/* LD:78-7b */
	ldr	r5, [r1], #0x04		/* LD:7c-7f */
	strd	r6, r7, [r3], #0x08	/* ST:68-6f */
	strd	r8, r9, [r3], #0x08	/* ST:70-77 */
	subs	r2, r2, #0x80
	strd	r4, r5, [r3], #0x08	/* ST:78-7f */
	bge	.Lmemcpy_w_loop128
133
/*
 * Word-aligned tail handling: fewer than 128 bytes remain.
 * First drain 32-byte chunks, then dispatch on the remaining
 * 8-byte chunk count, then mop up the final 0-7 bytes.
 */
.Lmemcpy_w_lessthan128:
	adds	r2, r2, #0x80		/* Adjust for extra sub */
	popeq	{r4-r9}
	RETc(eq)			/* Return now if done */
	subs	r2, r2, #0x20
	blt	.Lmemcpy_w_lessthan32

	/* Copy 32 bytes at a time */
.Lmemcpy_w_loop32:
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	pld	[r1, #0x18]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	r8, [r1], #0x04
	ldr	r9, [r1], #0x04
	strd	r4, r5, [r3], #0x08
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	strd	r6, r7, [r3], #0x08
	strd	r8, r9, [r3], #0x08
	subs	r2, r2, #0x20
	strd	r4, r5, [r3], #0x08
	bge	.Lmemcpy_w_loop32

.Lmemcpy_w_lessthan32:
	adds	r2, r2, #0x20		/* Adjust for extra sub */
	popeq	{r4-r9}
	RETc(eq)			/* Return now if done */

	/*
	 * Computed branch over the 8-byte copy chunks below.
	 * r4 = 0x18 - (len & 0x18) is the number of bytes of chunks to
	 * skip, scaled by 2 since each 4-instruction chunk is 16 bytes.
	 * pc reads as (this insn + 8), i.e. the first chunk, thanks to
	 * the nop.  r4 == 0 (24+ bytes left) falls through to all three.
	 */
	and	r4, r2, #0x18
	rsbs	r4, r4, #0x18
	addne	pc, pc, r4, lsl #1
	nop

	/* At least 24 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, r5, [r3], #0x08

	/* At least 16 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	sub	r2, r2, #0x08
	strd	r4, r5, [r3], #0x08

	/* At least 8 bytes remaining */
	ldr	r4, [r1], #0x04
	ldr	r5, [r1], #0x04
	subs	r2, r2, #0x08
	strd	r4, r5, [r3], #0x08

	/* Less than 8 bytes remaining */
	pop	{r4-r9}
	RETc(eq)			/* Return now if done */
	subs	r2, r2, #0x04
	ldrge	ip, [r1], #0x04
	strge	ip, [r3], #0x04
	RETc(eq)			/* Return now if done */
	addlt	r2, r2, #0x04
	/*
	 * 1-3 bytes left.  The count in r2 is no longer needed after the
	 * cmp, so r2 is reused as a data register for the second byte.
	 * ge: >= 2 bytes (copy second byte), gt: 3 bytes (copy third).
	 */
	ldrb	ip, [r1], #0x01
	cmp	r2, #0x02
	ldrbge	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrbgt	ip, [r1]
	strbge	r2, [r3], #0x01
	strbgt	ip, [r3]
	RET
203
204
/*
 * At this point, it has not been possible to word align both buffers.
 * The destination buffer is word aligned, but the source buffer is not.
 * ip still holds (src & 3) from the "ands" at .Lmemcpy_wordaligned.
 * Round src down to a word boundary, preload the first aligned word
 * into ip, and dispatch on the byte offset (1, 2 or 3); each path
 * reassembles whole destination words with shift/or pairs.
 */
.Lmemcpy_bad_align:
	push	{r4-r7}
	bic	r1, r1, #0x03
	cmp	ip, #2
	ldr	ip, [r1], #0x04
	bgt	.Lmemcpy_bad3
	beq	.Lmemcpy_bad2
	b	.Lmemcpy_bad1
217
/*
 * Source is at offset 1 within its word (src & 3 == 1).  ip always
 * carries the current aligned source word; each output word combines
 * 3 bytes of ip with 1 byte of the next word (8/24-bit shift pairs,
 * mirrored for big-endian).  16 bytes are produced per iteration.
 */
.Lmemcpy_bad1_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #8
#else
	mov	r4, ip, lsr #8
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r6, lsr #24
	mov	r6, r6, lsl #8
	orr	r6, r6, r7, lsr #24
	mov	r7, r7, lsl #8
	orr	r7, r7, ip, lsr #24
#else
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r6, lsl #24
	mov	r6, r6, lsr #8
	orr	r6, r6, r7, lsl #24
	mov	r7, r7, lsr #8
	orr	r7, r7, ip, lsl #24
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
	sub	r2, r2, #0x10

.Lmemcpy_bad1:
	cmp	r2, #0x20
	bge	.Lmemcpy_bad1_loop16
	cmp	r2, #0x10
	blt	.Lmemcpy_bad1_loop16_short

	/* copy last 16 bytes (without preload) */
#ifdef __ARMEB__
	mov	r4, ip, lsl #8
#else
	mov	r4, ip, lsr #8
#endif
	ldr	r5, [r1], #0x04
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r6, lsr #24
	mov	r6, r6, lsl #8
	orr	r6, r6, r7, lsr #24
	mov	r7, r7, lsl #8
	orr	r7, r7, ip, lsr #24
#else
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r6, lsl #24
	mov	r6, r6, lsr #8
	orr	r6, r6, r7, lsl #24
	mov	r7, r7, lsr #8
	orr	r7, r7, ip, lsl #24
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
	subs	r2, r2, #0x10
	popeq	{r4-r7}
	RETc(eq)			/* Return now if done */

.Lmemcpy_bad1_loop16_short:
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x03		/* Rewind to next unconsumed byte */
	blt	.Lmemcpy_bad_done

	/* One output word at a time while >= 4 bytes remain */
.Lmemcpy_bad1_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #8
#else
	mov	r4, ip, lsr #8
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #24
#else
	orr	r4, r4, ip, lsl #24
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad1_loop4
	sub	r1, r1, #0x03		/* Rewind to next unconsumed byte */
	b	.Lmemcpy_bad_done
315
/*
 * Source is at offset 2 within its word (src & 3 == 2).  Same scheme
 * as .Lmemcpy_bad1 but each output word is half of ip and half of the
 * next word (16-bit shift pairs).
 */
.Lmemcpy_bad2_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r6, lsr #16
	mov	r6, r6, lsl #16
	orr	r6, r6, r7, lsr #16
	mov	r7, r7, lsl #16
	orr	r7, r7, ip, lsr #16
#else
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r6, lsl #16
	mov	r6, r6, lsr #16
	orr	r6, r6, r7, lsl #16
	mov	r7, r7, lsr #16
	orr	r7, r7, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
	sub	r2, r2, #0x10

.Lmemcpy_bad2:
	cmp	r2, #0x20
	bge	.Lmemcpy_bad2_loop16
	cmp	r2, #0x10
	blt	.Lmemcpy_bad2_loop16_short

	/* copy last 16 bytes (without preload) */
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	r5, [r1], #0x04
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r6, lsr #16
	mov	r6, r6, lsl #16
	orr	r6, r6, r7, lsr #16
	mov	r7, r7, lsl #16
	orr	r7, r7, ip, lsr #16
#else
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r6, lsl #16
	mov	r6, r6, lsr #16
	orr	r6, r6, r7, lsl #16
	mov	r7, r7, lsr #16
	orr	r7, r7, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
	subs	r2, r2, #0x10
	popeq	{r4-r7}
	RETc(eq)			/* Return now if done */

.Lmemcpy_bad2_loop16_short:
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x02		/* Rewind to next unconsumed byte */
	blt	.Lmemcpy_bad_done

	/* One output word at a time while >= 4 bytes remain */
.Lmemcpy_bad2_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #16
#else
	mov	r4, ip, lsr #16
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #16
#else
	orr	r4, r4, ip, lsl #16
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad2_loop4
	sub	r1, r1, #0x02		/* Rewind to next unconsumed byte */
	b	.Lmemcpy_bad_done
413
/*
 * Source is at offset 3 within its word (src & 3 == 3).  Each output
 * word takes 1 byte of ip and 3 bytes of the next word (24/8-bit
 * shift pairs, mirrored for big-endian).
 */
.Lmemcpy_bad3_loop16:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	r5, [r1], #0x04
	pld	[r1, #0x018]
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r6, lsr #8
	mov	r6, r6, lsl #24
	orr	r6, r6, r7, lsr #8
	mov	r7, r7, lsl #24
	orr	r7, r7, ip, lsr #8
#else
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r6, lsl #8
	mov	r6, r6, lsr #24
	orr	r6, r6, r7, lsl #8
	mov	r7, r7, lsr #24
	orr	r7, r7, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
	sub	r2, r2, #0x10

.Lmemcpy_bad3:
	cmp	r2, #0x20
	bge	.Lmemcpy_bad3_loop16
	cmp	r2, #0x10
	blt	.Lmemcpy_bad3_loop16_short

	/* copy last 16 bytes (without preload) */
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	r5, [r1], #0x04
	ldr	r6, [r1], #0x04
	ldr	r7, [r1], #0x04
	ldr	ip, [r1], #0x04
#ifdef __ARMEB__
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r6, lsr #8
	mov	r6, r6, lsl #24
	orr	r6, r6, r7, lsr #8
	mov	r7, r7, lsl #24
	orr	r7, r7, ip, lsr #8
#else
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r6, lsl #8
	mov	r6, r6, lsr #24
	orr	r6, r6, r7, lsl #8
	mov	r7, r7, lsr #24
	orr	r7, r7, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	str	r5, [r3], #0x04
	str	r6, [r3], #0x04
	str	r7, [r3], #0x04
	subs	r2, r2, #0x10
	popeq	{r4-r7}
	RETc(eq)			/* Return now if done */

.Lmemcpy_bad3_loop16_short:
	subs	r2, r2, #0x04
	sublt	r1, r1, #0x01		/* Rewind to next unconsumed byte */
	blt	.Lmemcpy_bad_done

	/* One output word at a time while >= 4 bytes remain */
.Lmemcpy_bad3_loop4:
#ifdef __ARMEB__
	mov	r4, ip, lsl #24
#else
	mov	r4, ip, lsr #24
#endif
	ldr	ip, [r1], #0x04
	subs	r2, r2, #0x04
#ifdef __ARMEB__
	orr	r4, r4, ip, lsr #8
#else
	orr	r4, r4, ip, lsl #8
#endif
	str	r4, [r3], #0x04
	bge	.Lmemcpy_bad3_loop4
	sub	r1, r1, #0x01		/* Rewind to next unconsumed byte */
510
/*
 * Common tail for the misaligned paths: r2 holds (remaining - 4), so
 * add 4 back; 0-3 bytes are left and r1 points at the next source
 * byte.  As in the aligned tail, r2 is reused as a data register.
 */
.Lmemcpy_bad_done:
	pop	{r4-r7}
	adds	r2, r2, #0x04
	RETc(eq)
	ldrb	ip, [r1], #0x01
	cmp	r2, #0x02		/* ge: >= 2 bytes, gt: 3 bytes */
	ldrbge	r2, [r1], #0x01
	strb	ip, [r3], #0x01
	ldrbgt	ip, [r1]
	strbge	r2, [r3], #0x01
	strbgt	ip, [r3]
	RET
523
524
/*
 * Handle short copies (less than 16 bytes), possibly misaligned.
 * Some of these are *very* common, thanks to the network stack,
 * and so are handled specially.
 */
.Lmemcpy_short:
#ifndef _STANDALONE
	/*
	 * Branch-table dispatch on len (0-12).  pc reads as this
	 * instruction + 8, i.e. the RET below, so entry 0 (len == 0)
	 * is the RET and entry N is N words further on.
	 */
	add	pc, pc, r2, lsl #2
	nop
	RET				/* 0x00 */
	b	.Lmemcpy_bytewise	/* 0x01 */
	b	.Lmemcpy_bytewise	/* 0x02 */
	b	.Lmemcpy_bytewise	/* 0x03 */
	b	.Lmemcpy_4		/* 0x04 */
	b	.Lmemcpy_bytewise	/* 0x05 */
	b	.Lmemcpy_6		/* 0x06 */
	b	.Lmemcpy_bytewise	/* 0x07 */
	b	.Lmemcpy_8		/* 0x08 */
	b	.Lmemcpy_bytewise	/* 0x09 */
	b	.Lmemcpy_bytewise	/* 0x0a */
	b	.Lmemcpy_bytewise	/* 0x0b */
	b	.Lmemcpy_c		/* 0x0c */
#endif
	/* Simple byte loop; entered only with len >= 1 */
.Lmemcpy_bytewise:
	mov	r3, r0			/* We must not clobber r0 */
	ldrb	ip, [r1], #0x01
1:	subs	r2, r2, #0x01
	strb	ip, [r3], #0x01
	ldrbne	ip, [r1], #0x01
	bne	1b
	RET
556
#ifndef _STANDALONE
/******************************************************************************
 * Special case for 4 byte copies
 *
 * Dispatch: r2 = ((dst & 3) << 2) | (src & 3), giving 16 alignment
 * combinations.  "sub r3, pc, #0x14" recovers the address of
 * .Lmemcpy_4 (pc reads as that insn + 8, which is label + 0x14), and
 * each case is padded to 64 bytes so case N lives at label + N*64.
 * Case 0 (both word aligned) falls through.  Unaligned word loads
 * such as "ldr [r1, #-1]" are safe: the offset makes the effective
 * address word aligned, and unwanted bytes are shifted out.
 */
#define	LMEMCPY_4_LOG2	6	/* 64 bytes */
#define	LMEMCPY_4_PAD	.align LMEMCPY_4_LOG2
	LMEMCPY_4_PAD
.Lmemcpy_4:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_4_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	str	r2, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 3xxx  LE:r2 = xxx3 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = 012. */
	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
#endif
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1]
	ldrh	r2, [r1, #0x02]
#else
	ldrh	r3, [r1, #0x02]
	ldrh	r2, [r1]
#endif
	orr	r3, r2, r3, lsl #16
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-3]		/* BE:r3 = xxx0  LE:r3 = 0xxx */
	ldr	r2, [r1, #1]		/* BE:r2 = 123x  LE:r2 = x321 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #24		/* r3 = 0... */
	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
#else
	mov	r3, r3, lsr #24		/* r3 = ...0 */
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
#endif
	str	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strb	r1, [r0, #0x03]
#endif
	strh	r3, [r0, #0x01]
	RET
	LMEMCPY_4_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
#ifdef __ARMEB__
	mov	r1, r2, lsr #8		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r2, r2, lsl #8		/* r2 = .01. */
	orr	r2, r2, r3, lsr #8	/* r2 = .012 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	mov	r3, r3, lsr #8		/* r3 = ...3 */
#endif
	strh	r2, [r0, #0x01]
	strb	r3, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
#ifdef __ARMEB__
	strh	r2, [r0, #0x02]
	mov	r3, r2, lsr #16
	strh	r3, [r0]
#else
	strh	r2, [r0]
	mov	r3, r2, lsr #16
	strh	r3, [r0, #0x02]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #3]		/* BE:r3 = 3xxx  LE:r3 = xxx3 */
	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
	strh	r1, [r0]
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #24		/* r2 = ...2 */
	orr	r2, r2, r3, lsl #8	/* r2 = xx32 */
#endif
	strh	r2, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldrh	r3, [r1, #0x02]
	strh	r2, [r0]
	strh	r3, [r0, #0x02]
	RET
	LMEMCPY_4_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #1]		/* BE:r3 = 123x  LE:r3 = x321 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	mov	r1, r3, lsr #8		/* BE:r1 = .123  LE:r1 = .x32 */
	strh	r1, [r0, #0x02]
#ifdef __ARMEB__
	mov	r3, r3, lsr #24		/* r3 = ...1 */
	orr	r3, r3, r2, lsl #8	/* r3 = xx01 */
#else
	mov	r3, r3, lsl #8		/* r3 = 321. */
	orr	r3, r3, r2, lsr #24	/* r3 = 3210 */
#endif
	strh	r3, [r0]
	RET
	LMEMCPY_4_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
#ifdef __ARMEB__
	strb	r2, [r0, #0x03]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0]
#else
	strb	r2, [r0]
	mov	r3, r2, lsr #8
	mov	r1, r2, lsr #24
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
#ifdef __ARMEB__
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	strb	r3, [r0, #0x03]
	mov	r3, r3, lsr #8		/* r3 = ...2 */
	orr	r3, r3, r2, lsl #8	/* r3 = ..12 */
	strh	r3, [r0, #0x01]
	mov	r2, r2, lsr #8		/* r2 = ...0 */
	strb	r2, [r0]
#else
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldrh	r3, [r1, #0x02]		/* BE:r3 = ..23  LE:r3 = ..32 */
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r3, lsl #8	/* r2 = .321 */
	strh	r2, [r0, #0x01]
	mov	r3, r3, lsr #8		/* r3 = ...3 */
	strb	r3, [r0, #0x03]
#endif
	RET
	LMEMCPY_4_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x03]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strb	r1, [r0, #0x03]
	RET
	LMEMCPY_4_PAD
820
821
/******************************************************************************
 * Special case for 6 byte copies
 *
 * Same dispatch scheme as .Lmemcpy_4: r2 = ((dst & 3) << 2) | (src & 3),
 * r3 = label address recovered from pc, 64-byte padded cases.
 */
#define	LMEMCPY_6_LOG2	6	/* 64 bytes */
#define	LMEMCPY_6_PAD	.align LMEMCPY_6_LOG2
	LMEMCPY_6_PAD
.Lmemcpy_6:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_6_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldrh	r3, [r1, #0x04]
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
	ldr	r3, [r1, #0x03]		/* BE:r3 = 345x  LE:r3 = x543 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #8		/* r2 = 012. */
	orr	r2, r2, r3, lsr #24	/* r2 = 0123 */
#else
	mov	r2, r2, lsr #8		/* r2 = .210 */
	orr	r2, r2, r3, lsl #24	/* r2 = 3210 */
#endif
	mov	r3, r3, lsr #8		/* BE:r3 = .345  LE:r3 = .x54 */
	str	r2, [r0]
	strh	r3, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
#ifdef __ARMEB__
	mov	r1, r3, lsr #16		/* r1 = ..23 */
	orr	r1, r1, r2, lsl #16	/* r1 = 0123 */
	str	r1, [r0]
	strh	r3, [r0, #0x04]
#else
	mov	r1, r3, lsr #16		/* r1 = ..54 */
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	str	r2, [r0]
	strh	r1, [r0, #0x04]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r2, [r1, #-3]		/* BE:r2 = xxx0  LE:r2 = 0xxx */
	ldr	r3, [r1, #1]		/* BE:r3 = 1234  LE:r3 = 4321 */
	ldr	r1, [r1, #5]		/* BE:r1 = 5xxx  LE:r1 = xxx5 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #24		/* r2 = 0... */
	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
	mov	r3, r3, lsl #8		/* r3 = 234. */
	orr	r1, r3, r1, lsr #24	/* r1 = 2345 */
#else
	mov	r2, r2, lsr #24		/* r2 = ...0 */
	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
	mov	r1, r1, lsl #8		/* r1 = xx5. */
	orr	r1, r1, r3, lsr #24	/* r1 = xx54 */
#endif
	str	r2, [r0]
	strh	r1, [r0, #0x04]
	RET
	LMEMCPY_6_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldrh	r2, [r1, #0x04]		/* BE:r2 = ..45  LE:r2 = ..54 */
	mov	r1, r3, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
	strh	r1, [r0, #0x01]
#ifdef __ARMEB__
	mov	r1, r3, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r3, r3, lsl #8		/* r3 = 123. */
	orr	r3, r3, r2, lsr #8	/* r3 = 1234 */
#else
	strb	r3, [r0]
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = .543 */
	mov	r2, r2, lsr #8		/* r2 = ...5 */
#endif
	strh	r3, [r0, #0x03]
	strb	r2, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #8		/* r3 = ...0 */
	strb	r3, [r0]
	strb	r1, [r0, #0x05]
	mov	r3, r1, lsr #8		/* r3 = .234 */
	strh	r3, [r0, #0x03]
	mov	r3, r2, lsl #8		/* r3 = .01. */
	orr	r3, r3, r1, lsr #24	/* r3 = .012 */
	strh	r3, [r0, #0x01]
#else
	strb	r2, [r0]
	mov	r3, r1, lsr #24
	strb	r3, [r0, #0x05]
	mov	r3, r1, lsr #8		/* r3 = .543 */
	strh	r3, [r0, #0x03]
	mov	r3, r2, lsr #8		/* r3 = ...1 */
	orr	r3, r3, r1, lsl #8	/* r3 = 4321 */
	strh	r3, [r0, #0x01]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 0111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1000: dst is 16-bit aligned, src is 32-bit aligned
 */
#ifdef __ARMEB__
	ldr	r2, [r1]		/* r2 = 0123 */
	ldrh	r3, [r1, #0x04]		/* r3 = ..45 */
	mov	r1, r2, lsr #16		/* r1 = ..01 */
	orr	r3, r3, r2, lsl#16	/* r3 = 2345 */
	strh	r1, [r0]
	str	r3, [r0, #0x02]
#else
	ldrh	r2, [r1, #0x04]		/* r2 = ..54 */
	ldr	r3, [r1]		/* r3 = 3210 */
	mov	r2, r2, lsl #16		/* r2 = 54.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 5432 */
	strh	r3, [r0]
	str	r2, [r0, #0x02]
#endif
	RET
	LMEMCPY_6_PAD

/*
 * 1001: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #3]		/* BE:r2 = 345x  LE:r2 = x543 */
	mov	r1, r3, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
#ifdef __ARMEB__
	mov	r2, r2, lsr #8		/* r2 = .345 */
	orr	r2, r2, r3, lsl #24	/* r2 = 2345 */
#else
	mov	r2, r2, lsl #8		/* r2 = 543. */
	orr	r2, r2, r3, lsr #24	/* r2 = 5432 */
#endif
	strh	r1, [r0]
	str	r2, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1010: dst is 16-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]
	ldr	r3, [r1, #0x02]
	strh	r2, [r0]
	str	r3, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1011: dst is 16-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldrb	r1, [r1, #0x05]		/* r1 = ...5 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = ..0. */
	orr	r3, r3, r2, lsr #24	/* r3 = ..01 */
	orr	r1, r1, r2, lsl #8	/* r1 = 2345 */
#else
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 5... */
	orr	r1, r1, r2, lsr #8	/* r1 = 5432 */
#endif
	strh	r3, [r0]
	str	r1, [r0, #0x02]
	RET
	LMEMCPY_6_PAD

/*
 * 1100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
	ldrh	r1, [r1, #0x04]		/* BE:r1 = ..45  LE:r1 = ..54 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #24		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #8		/* r2 = 123. */
	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = .321 */
	orr	r2, r2, r1, lsl #24	/* r2 = 4321 */
	mov	r1, r1, lsr #8		/* r1 = ...5 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldrh	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	strh	ip, [r0, #0x03]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r1, [r1, #0x02]		/* BE:r1 = 2345  LE:r1 = 5432 */
#ifdef __ARMEB__
	mov	r3, r2, lsr #8		/* r3 = ...0 */
	strb	r3, [r0]
	mov	r2, r2, lsl #24		/* r2 = 1... */
	orr	r2, r2, r1, lsr #8	/* r2 = 1234 */
#else
	strb	r2, [r0]
	mov	r2, r2, lsr #8		/* r2 = ...1 */
	orr	r2, r2, r1, lsl #8	/* r2 = 4321 */
	mov	r1, r1, lsr #24		/* r1 = ...5 */
#endif
	str	r2, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD

/*
 * 1111: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldr	r3, [r1, #0x01]
	ldrb	r1, [r1, #0x05]
	strb	r2, [r0]
	str	r3, [r0, #0x01]
	strb	r1, [r0, #0x05]
	RET
	LMEMCPY_6_PAD
1118
1119
/******************************************************************************
 * Special case for 8 byte copies
 *
 * Same dispatch scheme as .Lmemcpy_4: r2 = ((dst & 3) << 2) | (src & 3),
 * r3 = label address recovered from pc, 64-byte padded cases.
 */
#define	LMEMCPY_8_LOG2	6	/* 64 bytes */
#define	LMEMCPY_8_PAD	.align LMEMCPY_8_LOG2
	LMEMCPY_8_PAD
.Lmemcpy_8:
	and	r2, r1, #0x03
	orr	r2, r2, r0, lsl #2
	ands	r2, r2, #0x0f
	sub	r3, pc, #0x14
	addne	pc, r3, r2, lsl #LMEMCPY_8_LOG2

/*
 * 0000: dst is 32-bit aligned, src is 32-bit aligned
 */
	ldr	r2, [r1]
	ldr	r3, [r1, #0x04]
	str	r2, [r0]
	str	r3, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0001: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldr	r3, [r1, #-1]		/* BE:r3 = x012  LE:r3 = 210x */
	ldr	r2, [r1, #0x03]		/* BE:r2 = 3456  LE:r2 = 6543 */
	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #8		/* r3 = 012. */
	orr	r3, r3, r2, lsr #24	/* r3 = 0123 */
	orr	r2, r1, r2, lsl #8	/* r2 = 4567 */
#else
	mov	r3, r3, lsr #8		/* r3 = .210 */
	orr	r3, r3, r2, lsl #24	/* r3 = 3210 */
	mov	r1, r1, lsl #24		/* r1 = 7... */
	orr	r2, r1, r2, lsr #8	/* r2 = 7654 */
#endif
	str	r3, [r0]
	str	r2, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0010: dst is 32-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	r2, r2, lsl #16		/* r2 = 01.. */
	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
	orr	r3, r1, r3, lsl #16	/* r3 = 4567 */
#else
	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
	mov	r3, r3, lsr #16		/* r3 = ..54 */
	orr	r3, r3, r1, lsl #16	/* r3 = 7654 */
#endif
	str	r2, [r0]
	str	r3, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0011: dst is 32-bit aligned, src is 8-bit aligned
 */
	ldrb	r3, [r1]		/* r3 = ...0 */
	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
	ldr	r1, [r1, #0x05]		/* BE:r1 = 567x  LE:r1 = x765 */
#ifdef __ARMEB__
	mov	r3, r3, lsl #24		/* r3 = 0... */
	orr	r3, r3, r2, lsr #8	/* r3 = 0123 */
	mov	r2, r2, lsl #24		/* r2 = 4... */
	orr	r2, r2, r1, lsr #8	/* r2 = 4567 */
#else
	orr	r3, r3, r2, lsl #8	/* r3 = 3210 */
	mov	r2, r2, lsr #24		/* r2 = ...4 */
	orr	r2, r2, r1, lsl #8	/* r2 = 7654 */
#endif
	str	r3, [r0]
	str	r2, [r0, #0x04]
	RET
	LMEMCPY_8_PAD

/*
 * 0100: dst is 8-bit aligned, src is 32-bit aligned
 */
	ldr	r3, [r1]		/* BE:r3 = 0123  LE:r3 = 3210 */
	ldr	r2, [r1, #0x04]		/* BE:r2 = 4567  LE:r2 = 7654 */
#ifdef __ARMEB__
	mov	r1, r3, lsr #24		/* r1 = ...0 */
	strb	r1, [r0]
	mov	r1, r3, lsr #8		/* r1 = .012 */
	strb	r2, [r0, #0x07]
	mov	r3, r3, lsl #24		/* r3 = 3... */
	orr	r3, r3, r2, lsr #8	/* r3 = 3456 */
#else
	strb	r3, [r0]
	mov	r1, r2, lsr #24		/* r1 = ...7 */
	strb	r1, [r0, #0x07]
	mov	r1, r3, lsr #8		/* r1 = .321 */
	mov	r3, r3, lsr #24		/* r3 = ...3 */
	orr	r3, r3, r2, lsl #8	/* r3 = 6543 */
#endif
	strh	r1, [r0, #0x01]
	str	r3, [r0, #0x03]
	RET
	LMEMCPY_8_PAD

/*
 * 0101: dst is 8-bit aligned, src is 8-bit aligned
 */
	ldrb	r2, [r1]
	ldrh	r3, [r1, #0x01]
	ldr	ip, [r1, #0x03]
	ldrb	r1, [r1, #0x07]
	strb	r2, [r0]
	strh	r3, [r0, #0x01]
	str	ip, [r0, #0x03]
	strb	r1, [r0, #0x07]
	RET
	LMEMCPY_8_PAD

/*
 * 0110: dst is 8-bit aligned, src is 16-bit aligned
 */
	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
#ifdef __ARMEB__
	mov	ip, r2, lsr #8		/* ip = ...0 */
	strb	ip, [r0]
	mov	ip, r2, lsl #8		/* ip = .01. */
	orr	ip, ip, r3, lsr #24	/* ip = .012 */
	strb	r1, [r0, #0x07]
	mov	r3, r3, lsl #8		/* r3 = 345. */
	orr	r3, r3, r1, lsr #8	/* r3 = 3456 */
#else
	strb	r2, [r0]		/* 0 */
	mov	ip, r1, lsr #8		/* ip = ...7 */
	strb	ip, [r0, #0x07]		/* 7 */
	mov	ip, r2, lsr #8		/* ip = ...1 */
	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
	mov	r3, r3, lsr #8		/* r3 = .543 */
	orr	r3, r3, r1, lsl #24	/* r3 = 6543 */
#endif
	strh	ip, [r0, #0x01]
	str	r3, [r0, #0x03]
	RET
	LMEMCPY_8_PAD
1271
1272/*
1273 * 0111: dst is 8-bit aligned, src is 8-bit aligned
1274 */
1275	ldrb	r3, [r1]		/* r3 = ...0 */
1276	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
1277	ldrh	r2, [r1, #0x05]		/* BE:r2 = ..56  LE:r2 = ..65 */
1278	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
1279	strb	r3, [r0]
1280	mov	r3, ip, lsr #16		/* BE:r3 = ..12  LE:r3 = ..43 */
1281#ifdef __ARMEB__
1282	strh	r3, [r0, #0x01]
1283	orr	r2, r2, ip, lsl #16	/* r2 = 3456 */
1284#else
1285	strh	ip, [r0, #0x01]
1286	orr	r2, r3, r2, lsl #16	/* r2 = 6543 */
1287#endif
1288	str	r2, [r0, #0x03]
1289	strb	r1, [r0, #0x07]
1290	RET
1291	LMEMCPY_8_PAD
1292
1293/*
1294 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1295 */
1296	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1297	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
1298	mov	r1, r2, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
1299#ifdef __ARMEB__
1300	strh	r1, [r0]
1301	mov	r1, r3, lsr #16		/* r1 = ..45 */
1302	orr	r2, r1 ,r2, lsl #16	/* r2 = 2345 */
1303#else
1304	strh	r2, [r0]
1305	orr	r2, r1, r3, lsl #16	/* r2 = 5432 */
1306	mov	r3, r3, lsr #16		/* r3 = ..76 */
1307#endif
1308	str	r2, [r0, #0x02]
1309	strh	r3, [r0, #0x06]
1310	RET
1311	LMEMCPY_8_PAD
1312
1313/*
1314 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
1315 */
1316	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
1317	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
1318	ldrb	ip, [r1, #0x07]		/* ip = ...7 */
1319	mov	r1, r2, lsr #8		/* BE:r1 = .x01  LE:r1 = .210 */
1320	strh	r1, [r0]
1321#ifdef __ARMEB__
1322	mov	r1, r2, lsl #24		/* r1 = 2... */
1323	orr	r1, r1, r3, lsr #8	/* r1 = 2345 */
1324	orr	r3, ip, r3, lsl #8	/* r3 = 4567 */
1325#else
1326	mov	r1, r2, lsr #24		/* r1 = ...2 */
1327	orr	r1, r1, r3, lsl #8	/* r1 = 5432 */
1328	mov	r3, r3, lsr #24		/* r3 = ...6 */
1329	orr	r3, r3, ip, lsl #8	/* r3 = ..76 */
1330#endif
1331	str	r1, [r0, #0x02]
1332	strh	r3, [r0, #0x06]
1333	RET
1334	LMEMCPY_8_PAD
1335
1336/*
1337 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1338 *	 All pieces naturally aligned: halfword 0-1, word 2-5, halfword 6-7.
1339 */
1339	ldrh	r2, [r1]		/* 01 */
1340	ldr	ip, [r1, #0x02]		/* 2345 */
1341	ldrh	r3, [r1, #0x06]		/* 67 */
1342	strh	r2, [r0]
1343	str	ip, [r0, #0x02]
1344	strh	r3, [r0, #0x06]
1345	RET
1346	LMEMCPY_8_PAD
1347
1348/*
1349 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
1350 */
1351	ldr	r3, [r1, #0x05]		/* BE:r3 = 567x  LE:r3 = x765 */
1352	ldr	r2, [r1, #0x01]		/* BE:r2 = 1234  LE:r2 = 4321 */
1353	ldrb	ip, [r1]		/* ip = ...0 */
1354	mov	r1, r3, lsr #8		/* BE:r1 = .567  LE:r1 = .x76 */
1355	strh	r1, [r0, #0x06]
1356#ifdef __ARMEB__
1357	mov	r3, r3, lsr #24		/* r3 = ...5 */
1358	orr	r3, r3, r2, lsl #8	/* r3 = 2345 */
1359	mov	r2, r2, lsr #24		/* r2 = ...1 */
1360	orr	r2, r2, ip, lsl #8	/* r2 = ..01 */
1361#else
1362	mov	r3, r3, lsl #24		/* r3 = 5... */
1363	orr	r3, r3, r2, lsr #8	/* r3 = 5432 */
1364	orr	r2, ip, r2, lsl #8	/* r2 = 3210 */
1365#endif
1366	str	r3, [r0, #0x02]
1367	strh	r2, [r0]
1368	RET
1369	LMEMCPY_8_PAD
1370
1371/*
1372 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
1373 */
1374	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
1375	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1376	mov	r1, r3, lsr #8		/* BE:r1 = .456  LE:r1 = .765 */
1377	strh	r1, [r0, #0x05]
1378#ifdef __ARMEB__
1379	strb	r3, [r0, #0x07]
1380	mov	r1, r2, lsr #24		/* r1 = ...0 */
1381	strb	r1, [r0]
1382	mov	r2, r2, lsl #8		/* r2 = 123. */
1383	orr	r2, r2, r3, lsr #24	/* r2 = 1234 */
1384	str	r2, [r0, #0x01]
1385#else
1386	strb	r2, [r0]
1387	mov	r1, r3, lsr #24		/* r1 = ...7 */
1388	strb	r1, [r0, #0x07]
1389	mov	r2, r2, lsr #8		/* r2 = .321 */
1390	orr	r2, r2, r3, lsl #24	/* r2 = 4321 */
1391	str	r2, [r0, #0x01]
1392#endif
1393	RET
1394	LMEMCPY_8_PAD
1395
1396/*
1397 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
1398 */
1399	ldrb	r3, [r1]		/* r3 = ...0 */
1400	ldrh	r2, [r1, #0x01]		/* BE:r2 = ..12  LE:r2 = ..21 */
1401	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
1402	ldrb	r1, [r1, #0x07]		/* r1 = ...7 */
1403	strb	r3, [r0]
1404	mov	r3, ip, lsr #16		/* BE:r3 = ..34  LE:r3 = ..65 */
1405#ifdef __ARMEB__
1406	strh	ip, [r0, #0x05]
1407	orr	r2, r3, r2, lsl #16	/* r2 = 1234 */
1408#else
1409	strh	r3, [r0, #0x05]
1410	orr	r2, r2, ip, lsl #16	/* r2 = 4321 */
1411#endif
1412	str	r2, [r0, #0x01]
1413	strb	r1, [r0, #0x07]
1414	RET
1415	LMEMCPY_8_PAD
1416
1417/*
1418 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
1419 */
1420	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1421	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1422	ldrh	r1, [r1, #0x06]		/* BE:r1 = ..67  LE:r1 = ..76 */
1423#ifdef __ARMEB__
1424	mov	ip, r2, lsr #8		/* ip = ...0 */
1425	strb	ip, [r0]
1426	mov	ip, r2, lsl #24		/* ip = 1... */
1427	orr	ip, ip, r3, lsr #8	/* ip = 1234 */
1428	strb	r1, [r0, #0x07]
1429	mov	r1, r1, lsr #8		/* r1 = ...6 */
1430	orr	r1, r1, r3, lsl #8	/* r1 = 3456 */
1431#else
1432	strb	r2, [r0]
1433	mov	ip, r2, lsr #8		/* ip = ...1 */
1434	orr	ip, ip, r3, lsl #8	/* ip = 4321 */
1435	mov	r2, r1, lsr #8		/* r2 = ...7 */
1436	strb	r2, [r0, #0x07]
1437	mov	r1, r1, lsl #8		/* r1 = .76. */
1438	orr	r1, r1, r3, lsr #24	/* r1 = .765 */
1439#endif
1440	str	ip, [r0, #0x01]
1441	strh	r1, [r0, #0x05]
1442	RET
1443	LMEMCPY_8_PAD
1444
1445/*
1446 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
1447 *	 All pieces naturally aligned: byte 0, word 1-4, halfword 5-6, byte 7.
1448 */
1448	ldrb	r2, [r1]		/* 0 */
1449	ldr	ip, [r1, #0x01]		/* 1234 */
1450	ldrh	r3, [r1, #0x05]		/* 56 */
1451	ldrb	r1, [r1, #0x07]		/* 7 */
1452	strb	r2, [r0]
1453	str	ip, [r0, #0x01]
1454	strh	r3, [r0, #0x05]
1455	strb	r1, [r0, #0x07]
1456	RET
1457	LMEMCPY_8_PAD
1458
1459/******************************************************************************
1460 * Special case for 12 byte copies
1461 *
1462 * Dispatch on the 4-bit index ((dst & 3) << 2) | (src & 3).  Each case
1462 * below is padded to 2^LMEMCPY_C_LOG2 bytes, so case i begins at
1462 * .Lmemcpy_c + i * 128 (case 0 falls through directly).
1462 */
1462#define	LMEMCPY_C_LOG2	7	/* 128 bytes */
1463#define	LMEMCPY_C_PAD	.align LMEMCPY_C_LOG2
1464	LMEMCPY_C_PAD
1465.Lmemcpy_c:
1466	and	r2, r1, #0x03		/* r2 = src & 3 */
1467	orr	r2, r2, r0, lsl #2	/* r2 = (dst << 2) | (src & 3) */
1468	ands	r2, r2, #0x0f		/* r2 = case index; Z set for 0000 */
1469	sub	r3, pc, #0x14		/* r3 = .Lmemcpy_c (pc reads as . + 8) */
1470	addne	pc, r3, r2, lsl #LMEMCPY_C_LOG2	/* jump to 128-byte-spaced case */
1471
1472/*
1473 * 0000: dst is 32-bit aligned, src is 32-bit aligned
1474 *	 Straight three-word copy.
1475 */
1475	ldr	r2, [r1]		/* 0123 */
1476	ldr	r3, [r1, #0x04]		/* 4567 */
1477	ldr	r1, [r1, #0x08]		/* 89AB */
1478	str	r2, [r0]
1479	str	r3, [r0, #0x04]
1480	str	r1, [r0, #0x08]
1481	RET
1482	LMEMCPY_C_PAD
1483
1484/*
1485 * 0001: dst is 32-bit aligned, src is 8-bit aligned (byte 1)
1486 */
1487	ldrb	r2, [r1, #0xb]		/* r2 = ...B */
1488	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
1489	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
1490	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
1491#ifdef __ARMEB__
1492	orr	r2, r2, ip, lsl #8	/* r2 = 89AB */
1493	str	r2, [r0, #0x08]
1494	mov	r2, ip, lsr #24		/* r2 = ...7 */
1495	orr	r2, r2, r3, lsl #8	/* r2 = 4567 */
1496	mov	r1, r1, lsl #8		/* r1 = 012. */
1497	orr	r1, r1, r3, lsr #24	/* r1 = 0123 */
1498#else
1499	mov	r2, r2, lsl #24		/* r2 = B... */
1500	orr	r2, r2, ip, lsr #8	/* r2 = BA98 */
1501	str	r2, [r0, #0x08]
1502	mov	r2, ip, lsl #24		/* r2 = 7... */
1503	orr	r2, r2, r3, lsr #8	/* r2 = 7654 */
1504	mov	r1, r1, lsr #8		/* r1 = .210 */
1505	orr	r1, r1, r3, lsl #24	/* r1 = 3210 */
1506#endif
1507	str	r2, [r0, #0x04]
1508	str	r1, [r0]
1509	RET
1510	LMEMCPY_C_PAD
1511
1512/*
1513 * 0010: dst is 32-bit aligned, src is 16-bit aligned
1514 */
1515	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1516	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1517	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
1518	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
1519#ifdef __ARMEB__
1520	mov	r2, r2, lsl #16		/* r2 = 01.. */
1521	orr	r2, r2, r3, lsr #16	/* r2 = 0123 */
1522	str	r2, [r0]
1523	mov	r3, r3, lsl #16		/* r3 = 45.. */
1524	orr	r3, r3, ip, lsr #16	/* r3 = 4567 */
1525	orr	r1, r1, ip, lsl #16	/* r1 = 89AB */
1526#else
1527	orr	r2, r2, r3, lsl #16	/* r2 = 3210 */
1528	str	r2, [r0]
1529	mov	r3, r3, lsr #16		/* r3 = ..54 */
1530	orr	r3, r3, ip, lsl #16	/* r3 = 7654 */
1531	mov	r1, r1, lsl #16		/* r1 = BA.. */
1532	orr	r1, r1, ip, lsr #16	/* r1 = BA98 */
1533#endif
1534	str	r3, [r0, #0x04]
1535	str	r1, [r0, #0x08]
1536	RET
1537	LMEMCPY_C_PAD
1538
1539/*
1540 * 0011: dst is 32-bit aligned, src is 8-bit aligned (byte 3)
1541 */
1542	ldrb	r2, [r1]		/* r2 = ...0 */
1543	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
1544	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
1545	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
1546#ifdef __ARMEB__
1547	mov	r2, r2, lsl #24		/* r2 = 0... */
1548	orr	r2, r2, r3, lsr #8	/* r2 = 0123 */
1549	str	r2, [r0]
1550	mov	r3, r3, lsl #24		/* r3 = 4... */
1551	orr	r3, r3, ip, lsr #8	/* r3 = 4567 */
1552	mov	r1, r1, lsr #8		/* r1 = .9AB */
1553	orr	r1, r1, ip, lsl #24	/* r1 = 89AB */
1554#else
1555	orr	r2, r2, r3, lsl #8	/* r2 = 3210 */
1556	str	r2, [r0]
1557	mov	r3, r3, lsr #24		/* r3 = ...4 */
1558	orr	r3, r3, ip, lsl #8	/* r3 = 7654 */
1559	mov	r1, r1, lsl #8		/* r1 = BA9. */
1560	orr	r1, r1, ip, lsr #24	/* r1 = BA98 */
1561#endif
1562	str	r3, [r0, #0x04]
1563	str	r1, [r0, #0x08]
1564	RET
1565	LMEMCPY_C_PAD
1566
1567/*
1568 * 0100: dst is 8-bit aligned (byte 1), src is 32-bit aligned
1569 */
1570	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1571	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
1572	ldr	ip, [r1, #0x08]		/* BE:ip = 89AB  LE:ip = BA98 */
1573	mov	r1, r2, lsr #8		/* BE:r1 = .012  LE:r1 = .321 */
1574	strh	r1, [r0, #0x01]
1575#ifdef __ARMEB__
1576	mov	r1, r2, lsr #24		/* r1 = ...0 */
1577	strb	r1, [r0]
1578	mov	r1, r2, lsl #24		/* r1 = 3... */
1579	orr	r2, r1, r3, lsr #8	/* r2 = 3456 */
1580	mov	r1, r3, lsl #24		/* r1 = 7... */
1581	orr	r1, r1, ip, lsr #8	/* r1 = 789A */
1582#else
1583	strb	r2, [r0]
1584	mov	r1, r2, lsr #24		/* r1 = ...3 */
1585	orr	r2, r1, r3, lsl #8	/* r2 = 6543 */
1586	mov	r1, r3, lsr #24		/* r1 = ...7 */
1587	orr	r1, r1, ip, lsl #8	/* r1 = A987 */
1588	mov	ip, ip, lsr #24		/* ip = ...B */
1589#endif
1590	str	r2, [r0, #0x03]
1591	str	r1, [r0, #0x07]
1592	strb	ip, [r0, #0x0b]
1593	RET
1594	LMEMCPY_C_PAD
1595
1596/*
1597 * 0101: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 1)
1598 *	 All pieces naturally aligned: byte 0, halfword 1-2, words 3-6/7-A, byte B.
1599 */
1599	ldrb	r2, [r1]		/* 0 */
1600	ldrh	r3, [r1, #0x01]		/* 12 */
1601	ldr	ip, [r1, #0x03]		/* 3456 */
1602	strb	r2, [r0]
1603	ldr	r2, [r1, #0x07]		/* 789A */
1604	ldrb	r1, [r1, #0x0b]		/* B */
1605	strh	r3, [r0, #0x01]
1606	str	ip, [r0, #0x03]
1607	str	r2, [r0, #0x07]
1608	strb	r1, [r0, #0x0b]
1609	RET
1610	LMEMCPY_C_PAD
1611
1612/*
1613 * 0110: dst is 8-bit aligned (byte 1), src is 16-bit aligned
1614 */
1615	ldrh	r2, [r1]		/* BE:r2 = ..01  LE:r2 = ..10 */
1616	ldr	r3, [r1, #0x02]		/* BE:r3 = 2345  LE:r3 = 5432 */
1617	ldr	ip, [r1, #0x06]		/* BE:ip = 6789  LE:ip = 9876 */
1618	ldrh	r1, [r1, #0x0a]		/* BE:r1 = ..AB  LE:r1 = ..BA */
1619#ifdef __ARMEB__
1620	mov	r2, r2, ror #8		/* r2 = 1..0 */
1621	strb	r2, [r0]
1622	mov	r2, r2, lsr #16		/* r2 = ..1. */
1623	orr	r2, r2, r3, lsr #24	/* r2 = ..12 */
1624	strh	r2, [r0, #0x01]
1625	mov	r2, r3, lsl #8		/* r2 = 345. */
1626	orr	r3, r2, ip, lsr #24	/* r3 = 3456 */
1627	mov	r2, ip, lsl #8		/* r2 = 789. */
1628	orr	r2, r2, r1, lsr #8	/* r2 = 789A */
1629#else
1630	strb	r2, [r0]
1631	mov	r2, r2, lsr #8		/* r2 = ...1 */
1632	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
1633	strh	r2, [r0, #0x01]
1634	mov	r2, r3, lsr #8		/* r2 = .543 */
1635	orr	r3, r2, ip, lsl #24	/* r3 = 6543 */
1636	mov	r2, ip, lsr #8		/* r2 = .987 */
1637	orr	r2, r2, r1, lsl #24	/* r2 = A987 */
1638	mov	r1, r1, lsr #8		/* r1 = ...B */
1639#endif
1640	str	r3, [r0, #0x03]
1641	str	r2, [r0, #0x07]
1642	strb	r1, [r0, #0x0b]
1643	RET
1644	LMEMCPY_C_PAD
1645
1646/*
1647 * 0111: dst is 8-bit aligned (byte 1), src is 8-bit aligned (byte 3)
1648 */
1649	ldrb	r2, [r1]		/* r2 = ...0 */
1650	ldr	r3, [r1, #0x01]		/* BE:r3 = 1234  LE:r3 = 4321 */
1651	ldr	ip, [r1, #0x05]		/* BE:ip = 5678  LE:ip = 8765 */
1652	ldr	r1, [r1, #0x09]		/* BE:r1 = 9ABx  LE:r1 = xBA9 */
1653	strb	r2, [r0]
1654#ifdef __ARMEB__
1655	mov	r2, r3, lsr #16		/* r2 = ..12 */
1656	strh	r2, [r0, #0x01]
1657	mov	r3, r3, lsl #16		/* r3 = 34.. */
1658	orr	r3, r3, ip, lsr #16	/* r3 = 3456 */
1659	mov	ip, ip, lsl #16		/* ip = 78.. */
1660	orr	ip, ip, r1, lsr #16	/* ip = 789A */
1661	mov	r1, r1, lsr #8		/* r1 = .9AB */
1662#else
1663	strh	r3, [r0, #0x01]
1664	mov	r3, r3, lsr #16		/* r3 = ..43 */
1665	orr	r3, r3, ip, lsl #16	/* r3 = 6543 */
1666	mov	ip, ip, lsr #16		/* ip = ..87 */
1667	orr	ip, ip, r1, lsl #16	/* ip = A987 */
1668	mov	r1, r1, lsr #16		/* r1 = ..xB */
1669#endif
1670	str	r3, [r0, #0x03]
1671	str	ip, [r0, #0x07]
1672	strb	r1, [r0, #0x0b]
1673	RET
1674	LMEMCPY_C_PAD
1675
1676/*
1677 * 1000: dst is 16-bit aligned, src is 32-bit aligned
1678 */
1679	ldr	ip, [r1]		/* BE:ip = 0123  LE:ip = 3210 */
1680	ldr	r3, [r1, #0x04]		/* BE:r3 = 4567  LE:r3 = 7654 */
1681	ldr	r2, [r1, #0x08]		/* BE:r2 = 89AB  LE:r2 = BA98 */
1682	mov	r1, ip, lsr #16		/* BE:r1 = ..01  LE:r1 = ..32 */
1683#ifdef __ARMEB__
1684	strh	r1, [r0]
1685	mov	r1, ip, lsl #16		/* r1 = 23.. */
1686	orr	r1, r1, r3, lsr #16	/* r1 = 2345 */
1687	mov	r3, r3, lsl #16		/* r3 = 67.. */
1688	orr	r3, r3, r2, lsr #16	/* r3 = 6789 */
1689#else
1690	strh	ip, [r0]
1691	orr	r1, r1, r3, lsl #16	/* r1 = 5432 */
1692	mov	r3, r3, lsr #16		/* r3 = ..76 */
1693	orr	r3, r3, r2, lsl #16	/* r3 = 9876 */
1694	mov	r2, r2, lsr #16		/* r2 = ..BA */
1695#endif
1696	str	r1, [r0, #0x02]
1697	str	r3, [r0, #0x06]
1698	strh	r2, [r0, #0x0a]
1699	RET
1700	LMEMCPY_C_PAD
1701
1702/*
1703 * 1001: dst is 16-bit aligned, src is 8-bit aligned (byte 1)
1704 */
1705	ldr	r2, [r1, #-1]		/* BE:r2 = x012  LE:r2 = 210x */
1706	ldr	r3, [r1, #0x03]		/* BE:r3 = 3456  LE:r3 = 6543 */
1707	mov	ip, r2, lsr #8		/* BE:ip = .x01  LE:ip = .210 */
1708	strh	ip, [r0]
1709	ldr	ip, [r1, #0x07]		/* BE:ip = 789A  LE:ip = A987 */
1710	ldrb	r1, [r1, #0x0b]		/* r1 = ...B */
1711#ifdef __ARMEB__
1712	mov	r2, r2, lsl #24		/* r2 = 2... */
1713	orr	r2, r2, r3, lsr #8	/* r2 = 2345 */
1714	mov	r3, r3, lsl #24		/* r3 = 6... */
1715	orr	r3, r3, ip, lsr #8	/* r3 = 6789 */
1716	orr	r1, r1, ip, lsl #8	/* r1 = 89AB */
1717#else
1718	mov	r2, r2, lsr #24		/* r2 = ...2 */
1719	orr	r2, r2, r3, lsl #8	/* r2 = 5432 */
1720	mov	r3, r3, lsr #24		/* r3 = ...6 */
1721	orr	r3, r3, ip, lsl #8	/* r3 = 9876 */
1722	mov	r1, r1, lsl #8		/* r1 = ..B. */
1723	orr	r1, r1, ip, lsr #24	/* r1 = ..BA */
1724#endif
1725	str	r2, [r0, #0x02]
1726	str	r3, [r0, #0x06]
1727	strh	r1, [r0, #0x0a]
1728	RET
1729	LMEMCPY_C_PAD
1730
1731/*
1732 * 1010: dst is 16-bit aligned, src is 16-bit aligned
1733 *	 All pieces naturally aligned: halfword 0-1, words 2-5/6-9, halfword A-B.
1734 */
1734	ldrh	r2, [r1]		/* 01 */
1735	ldr	r3, [r1, #0x02]		/* 2345 */
1736	ldr	ip, [r1, #0x06]		/* 6789 */
1737	ldrh	r1, [r1, #0x0a]		/* AB */
1738	strh	r2, [r0]
1739	str	r3, [r0, #0x02]
1740	str	ip, [r0, #0x06]
1741	strh	r1, [r0, #0x0a]
1742	RET
1743	LMEMCPY_C_PAD
1744
1745/*
1746 * 1011: dst is 16-bit aligned, src is 8-bit aligned (byte 3)
1747 */
1748	ldr	r2, [r1, #0x09]		/* BE:r2 = 9ABx  LE:r2 = xBA9 */
1749	ldr	r3, [r1, #0x05]		/* BE:r3 = 5678  LE:r3 = 8765 */
1750	mov	ip, r2, lsr #8		/* BE:ip = .9AB  LE:ip = .xBA */
1751	strh	ip, [r0, #0x0a]
1752	ldr	ip, [r1, #0x01]		/* BE:ip = 1234  LE:ip = 4321 */
1753	ldrb	r1, [r1]		/* r1 = ...0 */
1754#ifdef __ARMEB__
1755	mov	r2, r2, lsr #24		/* r2 = ...9 */
1756	orr	r2, r2, r3, lsl #8	/* r2 = 6789 */
1757	mov	r3, r3, lsr #24		/* r3 = ...5 */
1758	orr	r3, r3, ip, lsl #8	/* r3 = 2345 */
1759	mov	r1, r1, lsl #8		/* r1 = ..0. */
1760	orr	r1, r1, ip, lsr #24	/* r1 = ..01 */
1761#else
1762	mov	r2, r2, lsl #24		/* r2 = 9... */
1763	orr	r2, r2, r3, lsr #8	/* r2 = 9876 */
1764	mov	r3, r3, lsl #24		/* r3 = 5... */
1765	orr	r3, r3, ip, lsr #8	/* r3 = 5432 */
1766	orr	r1, r1, ip, lsl #8	/* r1 = 3210 */
1767#endif
1768	str	r2, [r0, #0x06]
1769	str	r3, [r0, #0x02]
1770	strh	r1, [r0]
1771	RET
1772	LMEMCPY_C_PAD
1773
1774/*
1775 * 1100: dst is 8-bit aligned (byte 3), src is 32-bit aligned
1776 */
1777	ldr	r2, [r1]		/* BE:r2 = 0123  LE:r2 = 3210 */
1778	ldr	ip, [r1, #0x04]		/* BE:ip = 4567  LE:ip = 7654 */
1779	ldr	r1, [r1, #0x08]		/* BE:r1 = 89AB  LE:r1 = BA98 */
1780#ifdef __ARMEB__
1781	mov	r3, r2, lsr #24		/* r3 = ...0 */
1782	strb	r3, [r0]
1783	mov	r2, r2, lsl #8		/* r2 = 123. */
1784	orr	r2, r2, ip, lsr #24	/* r2 = 1234 */
1785	str	r2, [r0, #0x01]
1786	mov	r2, ip, lsl #8		/* r2 = 567. */
1787	orr	r2, r2, r1, lsr #24	/* r2 = 5678 */
1788	str	r2, [r0, #0x05]
1789	mov	r2, r1, lsr #8		/* r2 = .89A */
1790	strh	r2, [r0, #0x09]
1791	strb	r1, [r0, #0x0b]
1792#else
1793	strb	r2, [r0]
1794	mov	r3, r2, lsr #8		/* r3 = .321 */
1795	orr	r3, r3, ip, lsl #24	/* r3 = 4321 */
1796	str	r3, [r0, #0x01]
1797	mov	r3, ip, lsr #8		/* r3 = .765 */
1798	orr	r3, r3, r1, lsl #24	/* r3 = 8765 */
1799	str	r3, [r0, #0x05]
1800	mov	r1, r1, lsr #8		/* r1 = .BA9 */
1801	strh	r1, [r0, #0x09]
1802	mov	r1, r1, lsr #16		/* r1 = ...B */
1803	strb	r1, [r0, #0x0b]
1804#endif
1805	RET
1806	LMEMCPY_C_PAD
1807
1808/*
1809 * 1101: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 1)
1810 */
1811	ldrb	r2, [r1, #0x0b]		/* r2 = ...B */
1812	ldr	r3, [r1, #0x07]		/* BE:r3 = 789A  LE:r3 = A987 */
1813	ldr	ip, [r1, #0x03]		/* BE:ip = 3456  LE:ip = 6543 */
1814	ldr	r1, [r1, #-1]		/* BE:r1 = x012  LE:r1 = 210x */
1815	strb	r2, [r0, #0x0b]
1816#ifdef __ARMEB__
1817	strh	r3, [r0, #0x09]
1818	mov	r3, r3, lsr #16		/* r3 = ..78 */
1819	orr	r3, r3, ip, lsl #16	/* r3 = 5678 */
1820	mov	ip, ip, lsr #16		/* ip = ..34 */
1821	orr	ip, ip, r1, lsl #16	/* ip = 1234 */
1822	mov	r1, r1, lsr #16		/* r1 = ..x0 */
1823#else
1824	mov	r2, r3, lsr #16		/* r2 = ..A9 */
1825	strh	r2, [r0, #0x09]
1826	mov	r3, r3, lsl #16		/* r3 = 87.. */
1827	orr	r3, r3, ip, lsr #16	/* r3 = 8765 */
1828	mov	ip, ip, lsl #16		/* ip = 43.. */
1829	orr	ip, ip, r1, lsr #16	/* ip = 4321 */
1830	mov	r1, r1, lsr #8		/* r1 = .210 */
1831#endif
1832	str	r3, [r0, #0x05]
1833	str	ip, [r0, #0x01]
1834	strb	r1, [r0]
1835	RET
1836	LMEMCPY_C_PAD
1837
1838/*
1839 * 1110: dst is 8-bit aligned (byte 3), src is 16-bit aligned
1840 */
1841#ifdef __ARMEB__
1842	ldrh	r2, [r1, #0x0a]		/* r2 = ..AB */
1843	ldr	ip, [r1, #0x06]		/* ip = 6789 */
1844	ldr	r3, [r1, #0x02]		/* r3 = 2345 */
1845	ldrh	r1, [r1]		/* r1 = ..01 */
1846	strb	r2, [r0, #0x0b]
1847	mov	r2, r2, lsr #8		/* r2 = ...A */
1848	orr	r2, r2, ip, lsl #8	/* r2 = 789A */
1849	mov	ip, ip, lsr #8		/* ip = .678 */
1850	orr	ip, ip, r3, lsl #24	/* ip = 5678 */
1851	mov	r3, r3, lsr #8		/* r3 = .234 */
1852	orr	r3, r3, r1, lsl #24	/* r3 = 1234 */
1853	mov	r1, r1, lsr #8		/* r1 = ...0 */
1854	strb	r1, [r0]
1855	str	r3, [r0, #0x01]
1856	str	ip, [r0, #0x05]
1857	strh	r2, [r0, #0x09]
1858#else
1859	ldrh	r2, [r1]		/* r2 = ..10 */
1860	ldr	r3, [r1, #0x02]		/* r3 = 5432 */
1861	ldr	ip, [r1, #0x06]		/* ip = 9876 */
1862	ldrh	r1, [r1, #0x0a]		/* r1 = ..BA */
1863	strb	r2, [r0]
1864	mov	r2, r2, lsr #8		/* r2 = ...1 */
1865	orr	r2, r2, r3, lsl #8	/* r2 = 4321 */
1866	mov	r3, r3, lsr #24		/* r3 = ...5 */
1867	orr	r3, r3, ip, lsl #8	/* r3 = 8765 */
1868	mov	ip, ip, lsr #24		/* ip = ...9 */
1869	orr	ip, ip, r1, lsl #8	/* ip = .BA9 */
1870	mov	r1, r1, lsr #8		/* r1 = ...B */
1871	str	r2, [r0, #0x01]
1872	str	r3, [r0, #0x05]
1873	strh	ip, [r0, #0x09]
1874	strb	r1, [r0, #0x0b]
1875#endif
1876	RET
1877	LMEMCPY_C_PAD
1878
1879/*
1880 * 1111: dst is 8-bit aligned (byte 3), src is 8-bit aligned (byte 3)
1881 *	 All pieces naturally aligned: byte 0, words 1-4/5-8, halfword 9-A, byte B.
1882 */
1882	ldrb	r2, [r1]		/* 0 */
1883	ldr	r3, [r1, #0x01]		/* 1234 */
1884	ldr	ip, [r1, #0x05]		/* 5678 */
1885	strb	r2, [r0]
1886	ldrh	r2, [r1, #0x09]		/* 9A */
1887	ldrb	r1, [r1, #0x0b]		/* B */
1888	str	r3, [r0, #0x01]
1889	str	ip, [r0, #0x05]
1890	strh	r2, [r0, #0x09]
1891	strb	r1, [r0, #0x0b]
1892	RET
1893END(memcpy)
1894#endif	/* !_STANDALONE */
1895