/* xref: /freebsd/sys/arm/arm/bcopyinout_xscale.S (revision 4d846d26) */
/*	$NetBSD: bcopyinout_xscale.S,v 1.3 2003/12/15 09:27:18 scw Exp $	*/

/*-
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>
__FBSDID("$FreeBSD$");

	.syntax	unified
	.text
	.align	2

/*
 * GET_PCB(tmp)
 *
 * Reads CP15 register c13, c0, opcode2 4 into tmp and adds TD_PCB to it.
 * The result is an address, not the PCB itself: both users in this file
 * follow the macro with "ldr tmp, [tmp]" to fetch the PCB pointer.
 *
 * NOTE(review): presumably that CP15 register holds the current thread
 * pointer and TD_PCB is the offset of its PCB pointer field -- confirm
 * against the machine headers.
 *
 * Clobbers: tmp only.  Condition flags are left untouched.
 */
#define GET_PCB(tmp) \
	mrc p15, 0, tmp, c13, c0, 4; \
	add	tmp, tmp, #(TD_PCB)

/*
 * copyin(uaddr, kaddr, len)
 *
 * r0 = user space address
 * r1 = kernel space address
 * r2 = length
 *
 * Copies bytes from user space to kernel space.
 *
 * Returns (in r0):
 *	0	on success, and also when the length is <= 0 as a signed
 *		value (nothing is copied in that case).
 *	EFAULT	when r0 + r2 wraps around 32 bits, when r0 + r2 is greater
 *		than VM_MAXUSER_ADDRESS, or when .Lcopyin_fault is entered
 *		through the PCB_ONFAULT hook while the copy is running.
 *
 * Register roles while copying:
 *	r3	tells .Lcopyin_fault what .Lcopyin_guts still has pushed:
 *		0 = nothing, > 0 = r4-r7, < 0 = r4-r9.  It has to match the
 *		stack at every instruction that touches user memory.
 *	r10	PCB pointer obtained through GET_PCB.
 *	r11	PCB_ONFAULT value found on entry; written back before
 *		returning from the copy.
 *	ip	scratch.
 *
 * User memory is only ever read with ldrt/ldrbt, which perform the access
 * with user-mode permissions.
 *
 * Several places below branch by adding to pc.  They depend on the exact
 * number of instructions that follow them (including the nops), so do not
 * insert or remove instructions in those runs.
 */
ENTRY(copyin)
	cmp	r2, #0x00
	movle	r0, #0x00
	RETc(le)			/* Bail early if length is <= 0 */

	adds	r3, r0, r2		/* r3 = first address past the range */
	movcs	r0, #EFAULT		/* Carry: range wraps around 32 bits */
	RETc(cs)

	ldr	r12, =(VM_MAXUSER_ADDRESS + 1)
	cmp	r3, r12
	movcs	r0, #EFAULT		/* End is above VM_MAXUSER_ADDRESS */
	RETc(cs)

	stmfd	sp!, {r10-r11, lr}

	GET_PCB(r10)
	ldr	r10, [r10]		/* r10 = PCB pointer */

	mov	r3, #0x00		/* Nothing extra pushed yet */
	adr	ip, .Lcopyin_fault
	ldr	r11, [r10, #PCB_ONFAULT]	/* Remember previous hook */
	str	ip, [r10, #PCB_ONFAULT]	/* Install ours */
	bl	.Lcopyin_guts
	str	r11, [r10, #PCB_ONFAULT]	/* Put previous hook back */
	mov	r0, #0x00
	ldmfd	sp!, {r10-r11, pc}

	/*
	 * Fault path.  Undo whatever .Lcopyin_guts had pushed (as recorded
	 * in r3), put the previous hook back and return EFAULT straight to
	 * the caller of copyin.
	 */
.Lcopyin_fault:
	mov	r0, #EFAULT
	str	r11, [r10, #PCB_ONFAULT]
	cmp	r3, #0x00
	ldmfdgt	sp!, {r4-r7}		/* r3 > 0 Restore r4-r7 */
	ldmfdlt	sp!, {r4-r9}		/* r3 < 0 Restore r4-r9 */
	ldmfd	sp!, {r10-r11, pc}

	/*
	 * The copy proper.  Entered with bl, r2 > 0 and r3 == 0; returns
	 * through lr with the stack balanced.
	 */
.Lcopyin_guts:
	pld	[r0]
	/* Word-align the destination buffer */
	ands	ip, r1, #0x03		/* Already word aligned? */
	beq	.Lcopyin_wordaligned	/* Yup */
	rsb	ip, ip, #0x04		/* ip = bytes needed to align (1..3) */
	cmp	r2, ip			/* Enough bytes left to align it? */
	blt	.Lcopyin_l4_2		/* Nope. Just copy bytewise */
	sub	r2, r2, ip
	/*
	 * ip = 3 - ip = number of byte copies to skip.  pc reads as the
	 * first ldrbt (this instruction + 8) and each ldrbt/strb pair is
	 * 8 bytes, hence the lsl #3.  When nothing is skipped the nop is
	 * simply executed.
	 */
	rsbs	ip, ip, #0x03
	addne	pc, pc, ip, lsl #3
	nop
	ldrbt	ip, [r0], #0x01
	strb	ip, [r1], #0x01
	ldrbt	ip, [r0], #0x01
	strb	ip, [r1], #0x01
	ldrbt	ip, [r0], #0x01
	strb	ip, [r1], #0x01
	cmp	r2, #0x00		/* All done? */
	RETeq

	/* Destination buffer is now word aligned */
.Lcopyin_wordaligned:
	ands	ip, r0, #0x03		/* Is src also word-aligned? */
	bne	.Lcopyin_bad_align	/* Nope. Things just got bad */
	cmp	r2, #0x08		/* Less than 8 bytes remaining? */
	blt	.Lcopyin_w_less_than8

	/* Quad-align the destination buffer (required by strd below) */
	tst	r1, #0x07		/* Already quad aligned? */
	ldrtne	ip, [r0], #0x04
	strne	ip, [r1], #0x04
	subne	r2, r2, #0x04
	stmfd	sp!, {r4-r9}		/* Free up some registers */
	mov	r3, #-1			/* Signal restore r4-r9 */

	/* Destination buffer quad aligned, source is word aligned */
	subs	r2, r2, #0x80
	blt	.Lcopyin_w_lessthan128

	/* Copy 128 bytes at a time */
.Lcopyin_w_loop128:
	ldrt	r4, [r0], #0x04		/* LD:00-03 */
	ldrt	r5, [r0], #0x04		/* LD:04-07 */
	pld	[r0, #0x18]		/* Prefetch 0x20 */
	ldrt	r6, [r0], #0x04		/* LD:08-0b */
	ldrt	r7, [r0], #0x04		/* LD:0c-0f */
	ldrt	r8, [r0], #0x04		/* LD:10-13 */
	ldrt	r9, [r0], #0x04		/* LD:14-17 */
	strd	r4, [r1], #0x08		/* ST:00-07 */
	ldrt	r4, [r0], #0x04		/* LD:18-1b */
	ldrt	r5, [r0], #0x04		/* LD:1c-1f */
	strd	r6, [r1], #0x08		/* ST:08-0f */
	ldrt	r6, [r0], #0x04		/* LD:20-23 */
	ldrt	r7, [r0], #0x04		/* LD:24-27 */
	pld	[r0, #0x18]		/* Prefetch 0x40 */
	strd	r8, [r1], #0x08		/* ST:10-17 */
	ldrt	r8, [r0], #0x04		/* LD:28-2b */
	ldrt	r9, [r0], #0x04		/* LD:2c-2f */
	strd	r4, [r1], #0x08		/* ST:18-1f */
	ldrt	r4, [r0], #0x04		/* LD:30-33 */
	ldrt	r5, [r0], #0x04		/* LD:34-37 */
	strd	r6, [r1], #0x08		/* ST:20-27 */
	ldrt	r6, [r0], #0x04		/* LD:38-3b */
	ldrt	r7, [r0], #0x04		/* LD:3c-3f */
	strd	r8, [r1], #0x08		/* ST:28-2f */
	ldrt	r8, [r0], #0x04		/* LD:40-43 */
	ldrt	r9, [r0], #0x04		/* LD:44-47 */
	pld	[r0, #0x18]		/* Prefetch 0x60 */
	strd	r4, [r1], #0x08		/* ST:30-37 */
	ldrt	r4, [r0], #0x04		/* LD:48-4b */
	ldrt	r5, [r0], #0x04		/* LD:4c-4f */
	strd	r6, [r1], #0x08		/* ST:38-3f */
	ldrt	r6, [r0], #0x04		/* LD:50-53 */
	ldrt	r7, [r0], #0x04		/* LD:54-57 */
	strd	r8, [r1], #0x08		/* ST:40-47 */
	ldrt	r8, [r0], #0x04		/* LD:58-5b */
	ldrt	r9, [r0], #0x04		/* LD:5c-5f */
	strd	r4, [r1], #0x08		/* ST:48-4f */
	ldrt	r4, [r0], #0x04		/* LD:60-63 */
	ldrt	r5, [r0], #0x04		/* LD:64-67 */
	pld	[r0, #0x18]		/* Prefetch 0x80 */
	strd	r6, [r1], #0x08		/* ST:50-57 */
	ldrt	r6, [r0], #0x04		/* LD:68-6b */
	ldrt	r7, [r0], #0x04		/* LD:6c-6f */
	strd	r8, [r1], #0x08		/* ST:58-5f */
	ldrt	r8, [r0], #0x04		/* LD:70-73 */
	ldrt	r9, [r0], #0x04		/* LD:74-77 */
	strd	r4, [r1], #0x08		/* ST:60-67 */
	ldrt	r4, [r0], #0x04		/* LD:78-7b */
	ldrt	r5, [r0], #0x04		/* LD:7c-7f */
	strd	r6, [r1], #0x08		/* ST:68-6f */
	strd	r8, [r1], #0x08		/* ST:70-77 */
	subs	r2, r2, #0x80
	strd	r4, [r1], #0x08		/* ST:78-7f */
	bge	.Lcopyin_w_loop128

.Lcopyin_w_lessthan128:
	adds	r2, r2, #0x80		/* Adjust for extra sub */
	ldmfdeq	sp!, {r4-r9}
	RETeq				/* Return now if done */
	subs	r2, r2, #0x20
	blt	.Lcopyin_w_lessthan32

	/* Copy 32 bytes at a time */
.Lcopyin_w_loop32:
	ldrt	r4, [r0], #0x04
	ldrt	r5, [r0], #0x04
	pld	[r0, #0x18]
	ldrt	r6, [r0], #0x04
	ldrt	r7, [r0], #0x04
	ldrt	r8, [r0], #0x04
	ldrt	r9, [r0], #0x04
	strd	r4, [r1], #0x08
	ldrt	r4, [r0], #0x04
	ldrt	r5, [r0], #0x04
	strd	r6, [r1], #0x08
	strd	r8, [r1], #0x08
	subs	r2, r2, #0x20
	strd	r4, [r1], #0x08
	bge	.Lcopyin_w_loop32

.Lcopyin_w_lessthan32:
	adds	r2, r2, #0x20		/* Adjust for extra sub */
	ldmfdeq	sp!, {r4-r9}
	RETeq				/* Return now if done */

	/*
	 * 1..31 bytes left.  r4 = the part that can go as whole 8-byte
	 * units (0, 8, 16 or 24) and r2 = what remains after that (0..7).
	 * Each 8-byte stanza below is four instructions (16 bytes) and pc
	 * reads as the first stanza, so adding (24 - r4) * 2 skips the
	 * stanzas that are not needed.  The nops keep every stanza at four
	 * instructions.  Nothing after the subs changes the flags, so the
	 * RETeq following the stanzas tests r2 == 0.
	 */
	and	r4, r2, #0x18
	rsb	r5, r4, #0x18
	subs	r2, r2, r4
	add	pc, pc, r5, lsl #1
	nop

	/* At least 24 bytes remaining */
	ldrt	r4, [r0], #0x04
	ldrt	r5, [r0], #0x04
	nop
	strd	r4, [r1], #0x08

	/* At least 16 bytes remaining */
	ldrt	r4, [r0], #0x04
	ldrt	r5, [r0], #0x04
	nop
	strd	r4, [r1], #0x08

	/* At least 8 bytes remaining */
	ldrt	r4, [r0], #0x04
	ldrt	r5, [r0], #0x04
	nop
	strd	r4, [r1], #0x08

	/* Less than 8 bytes remaining */
	ldmfd	sp!, {r4-r9}
	RETeq				/* Return now if done */
	mov	r3, #0x00		/* r4-r9 are no longer on the stack */

	/*
	 * 1..7 bytes left, both buffers word aligned, nothing pushed.
	 * Copy one word if there is room for it, then up to three bytes.
	 * r2 is reused as a data register once the cmp has captured the
	 * byte count in the flags.
	 */
.Lcopyin_w_less_than8:
	subs	r2, r2, #0x04
	ldrtge	ip, [r0], #0x04
	strge	ip, [r1], #0x04
	RETeq				/* Return now if done */
	addlt	r2, r2, #0x04
	ldrbt	ip, [r0], #0x01
	cmp	r2, #0x02
	ldrbtge	r2, [r0], #0x01
	strb	ip, [r1], #0x01
	ldrbtgt	ip, [r0]
	strbge	r2, [r1], #0x01
	strbgt	ip, [r1]
	RET

/*
 * At this point, it has not been possible to word align both buffers.
 * The destination buffer (r1) is word aligned, but the source buffer
 * (r0) is not.
 *
 * r0 is rounded down to a word boundary and whole words are read from
 * there; each destination word is assembled from two neighbouring source
 * words with a shift pair chosen by the source offset held in ip
 * (1 -> bad1, 2 -> bad2, 3 -> bad3).  ip always carries the most recently
 * loaded source word, whose upper bytes have not been stored yet.  The
 * shift directions correspond to a little-endian byte layout.
 */
.Lcopyin_bad_align:
	stmfd	sp!, {r4-r7}
	mov	r3, #0x01		/* Signal restore r4-r7 */
	bic	r0, r0, #0x03
	cmp	ip, #2			/* Flags survive the load below */
	ldrt	ip, [r0], #0x04
	bgt	.Lcopyin_bad3
	beq	.Lcopyin_bad2
	b	.Lcopyin_bad1

	/* Source is 1 byte past a word boundary: 16 bytes per pass */
.Lcopyin_bad1_loop16:
	mov	r4, ip, lsr #8
	ldrt	r5, [r0], #0x04
	pld	[r0, #0x018]
	ldrt	r6, [r0], #0x04
	ldrt	r7, [r0], #0x04
	ldrt	ip, [r0], #0x04
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r6, lsl #24
	mov	r6, r6, lsr #8
	orr	r6, r6, r7, lsl #24
	mov	r7, r7, lsr #8
	orr	r7, r7, ip, lsl #24
	str	r4, [r1], #0x04
	str	r5, [r1], #0x04
	str	r6, [r1], #0x04
	str	r7, [r1], #0x04
.Lcopyin_bad1:
	subs	r2, r2, #0x10
	bge	.Lcopyin_bad1_loop16

	adds	r2, r2, #0x10
	ldmfdeq	sp!, {r4-r7}
	RETeq				/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r0, r0, #0x03		/* Back to the next uncopied byte */
	blt	.Lcopyin_l4

	/* Same offset, one word per pass */
.Lcopyin_bad1_loop4:
	mov	r4, ip, lsr #8
	ldrt	ip, [r0], #0x04
	subs	r2, r2, #0x04
	orr	r4, r4, ip, lsl #24
	str	r4, [r1], #0x04
	bge	.Lcopyin_bad1_loop4
	sub	r0, r0, #0x03		/* Back to the next uncopied byte */
	b	.Lcopyin_l4

	/* Source is 2 bytes past a word boundary: 16 bytes per pass */
.Lcopyin_bad2_loop16:
	mov	r4, ip, lsr #16
	ldrt	r5, [r0], #0x04
	pld	[r0, #0x018]
	ldrt	r6, [r0], #0x04
	ldrt	r7, [r0], #0x04
	ldrt	ip, [r0], #0x04
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r6, lsl #16
	mov	r6, r6, lsr #16
	orr	r6, r6, r7, lsl #16
	mov	r7, r7, lsr #16
	orr	r7, r7, ip, lsl #16
	str	r4, [r1], #0x04
	str	r5, [r1], #0x04
	str	r6, [r1], #0x04
	str	r7, [r1], #0x04
.Lcopyin_bad2:
	subs	r2, r2, #0x10
	bge	.Lcopyin_bad2_loop16

	adds	r2, r2, #0x10
	ldmfdeq	sp!, {r4-r7}
	RETeq				/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r0, r0, #0x02		/* Back to the next uncopied byte */
	blt	.Lcopyin_l4

	/* Same offset, one word per pass */
.Lcopyin_bad2_loop4:
	mov	r4, ip, lsr #16
	ldrt	ip, [r0], #0x04
	subs	r2, r2, #0x04
	orr	r4, r4, ip, lsl #16
	str	r4, [r1], #0x04
	bge	.Lcopyin_bad2_loop4
	sub	r0, r0, #0x02		/* Back to the next uncopied byte */
	b	.Lcopyin_l4

	/* Source is 3 bytes past a word boundary: 16 bytes per pass */
.Lcopyin_bad3_loop16:
	mov	r4, ip, lsr #24
	ldrt	r5, [r0], #0x04
	pld	[r0, #0x018]
	ldrt	r6, [r0], #0x04
	ldrt	r7, [r0], #0x04
	ldrt	ip, [r0], #0x04
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r6, lsl #8
	mov	r6, r6, lsr #24
	orr	r6, r6, r7, lsl #8
	mov	r7, r7, lsr #24
	orr	r7, r7, ip, lsl #8
	str	r4, [r1], #0x04
	str	r5, [r1], #0x04
	str	r6, [r1], #0x04
	str	r7, [r1], #0x04
.Lcopyin_bad3:
	subs	r2, r2, #0x10
	bge	.Lcopyin_bad3_loop16

	adds	r2, r2, #0x10
	ldmfdeq	sp!, {r4-r7}
	RETeq				/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r0, r0, #0x01		/* Back to the next uncopied byte */
	blt	.Lcopyin_l4

	/* Same offset, one word per pass */
.Lcopyin_bad3_loop4:
	mov	r4, ip, lsr #24
	ldrt	ip, [r0], #0x04
	subs	r2, r2, #0x04
	orr	r4, r4, ip, lsl #8
	str	r4, [r1], #0x04
	bge	.Lcopyin_bad3_loop4
	sub	r0, r0, #0x01		/* Back to the next uncopied byte */

	/*
	 * Tail of the unaligned-source paths: r2 is 4 less than the number
	 * of bytes still to copy and r0 points at the next uncopied byte.
	 */
.Lcopyin_l4:
	ldmfd	sp!, {r4-r7}
	mov	r3, #0x00		/* r4-r7 are no longer on the stack */
	adds	r2, r2, #0x04
	RETeq
	/*
	 * Copy the last r2 (1..3) bytes.  r2 = 3 - r2 = byte copies to
	 * skip; same pc-relative dispatch as the alignment code at the top
	 * of .Lcopyin_guts (8 bytes per ldrbt/strb pair).
	 */
.Lcopyin_l4_2:
	rsbs	r2, r2, #0x03
	addne	pc, pc, r2, lsl #3
	nop
	ldrbt	ip, [r0], #0x01
	strb	ip, [r1], #0x01
	ldrbt	ip, [r0], #0x01
	strb	ip, [r1], #0x01
	ldrbt	ip, [r0]
	strb	ip, [r1]
	RET
END(copyin)

/*
 * copyout(kaddr, uaddr, len)
 *
 * r0 = kernel space address
 * r1 = user space address
 * r2 = length
 *
 * Copies bytes from kernel space to user space.
 *
 * Returns (in r0):
 *	0	on success, and also when the length is <= 0 as a signed
 *		value (nothing is copied in that case).
 *	EFAULT	when r1 + r2 wraps around 32 bits, when r1 + r2 is greater
 *		than VM_MAXUSER_ADDRESS, or when .Lcopyout_fault is entered
 *		through the PCB_ONFAULT hook while the copy is running.
 *
 * Register roles while copying:
 *	r3	tells .Lcopyout_fault what .Lcopyout_guts still has pushed:
 *		0 = nothing, > 0 = r4-r7, < 0 = r4-r9.  It has to match the
 *		stack at every instruction that touches user memory.
 *	r10	PCB pointer obtained through GET_PCB.
 *	r11	PCB_ONFAULT value found on entry; written back before
 *		returning from the copy.
 *	ip	scratch.
 *
 * User memory is only ever written with strt/strbt, which perform the
 * access with user-mode permissions.
 *
 * Several places below branch by adding to pc.  They depend on the exact
 * number of instructions that follow them (including the nops), so do not
 * insert or remove instructions in those runs.
 */
ENTRY(copyout)
	cmp	r2, #0x00
	movle	r0, #0x00
	RETc(le)			/* Bail early if length is <= 0 */

	adds	r3, r1, r2		/* r3 = first address past the range */
	movcs	r0, #EFAULT		/* Carry: range wraps around 32 bits */
	RETc(cs)

	ldr	r12, =(VM_MAXUSER_ADDRESS + 1)
	cmp	r3, r12
	movcs	r0, #EFAULT		/* End is above VM_MAXUSER_ADDRESS */
	RETc(cs)

	stmfd	sp!, {r10-r11, lr}

	GET_PCB(r10)
	ldr	r10, [r10]		/* r10 = PCB pointer */

	mov	r3, #0x00		/* Nothing extra pushed yet */
	adr	ip, .Lcopyout_fault
	ldr	r11, [r10, #PCB_ONFAULT]	/* Remember previous hook */
	str	ip, [r10, #PCB_ONFAULT]	/* Install ours */
	bl	.Lcopyout_guts
	str	r11, [r10, #PCB_ONFAULT]	/* Put previous hook back */
	mov	r0, #0x00
	ldmfd	sp!, {r10-r11, pc}

	/*
	 * Fault path.  Undo whatever .Lcopyout_guts had pushed (as recorded
	 * in r3), put the previous hook back and return EFAULT straight to
	 * the caller of copyout.
	 */
.Lcopyout_fault:
	mov	r0, #EFAULT
	str	r11, [r10, #PCB_ONFAULT]
	cmp	r3, #0x00
	ldmfdgt	sp!, {r4-r7}		/* r3 > 0 Restore r4-r7 */
	ldmfdlt	sp!, {r4-r9}		/* r3 < 0 Restore r4-r9 */
	ldmfd	sp!, {r10-r11, pc}

	/*
	 * The copy proper.  Entered with bl, r2 > 0 and r3 == 0; returns
	 * through lr with the stack balanced.
	 */
.Lcopyout_guts:
	pld	[r0]
	/* Word-align the destination buffer */
	ands	ip, r1, #0x03		/* Already word aligned? */
	beq	.Lcopyout_wordaligned	/* Yup */
	rsb	ip, ip, #0x04		/* ip = bytes needed to align (1..3) */
	cmp	r2, ip			/* Enough bytes left to align it? */
	blt	.Lcopyout_l4_2		/* Nope. Just copy bytewise */
	sub	r2, r2, ip
	/*
	 * ip = 3 - ip = number of byte copies to skip.  pc reads as the
	 * first ldrb (this instruction + 8) and each ldrb/strbt pair is
	 * 8 bytes, hence the lsl #3.  When nothing is skipped the nop is
	 * simply executed.
	 */
	rsbs	ip, ip, #0x03
	addne	pc, pc, ip, lsl #3
	nop
	ldrb	ip, [r0], #0x01
	strbt	ip, [r1], #0x01
	ldrb	ip, [r0], #0x01
	strbt	ip, [r1], #0x01
	ldrb	ip, [r0], #0x01
	strbt	ip, [r1], #0x01
	cmp	r2, #0x00		/* All done? */
	RETeq

	/* Destination buffer is now word aligned */
.Lcopyout_wordaligned:
	ands	ip, r0, #0x03		/* Is src also word-aligned? */
	bne	.Lcopyout_bad_align	/* Nope. Things just got bad */
	cmp	r2, #0x08		/* Less than 8 bytes remaining? */
	blt	.Lcopyout_w_less_than8

	/* Quad-align the source buffer (required by ldrd below) */
	tst	r0, #0x07		/* Already quad aligned? */
	ldrne	ip, [r0], #0x04
	subne	r2, r2, #0x04
	strtne	ip, [r1], #0x04

	stmfd	sp!, {r4-r9}		/* Free up some registers */
	mov	r3, #-1			/* Signal restore r4-r9 */

	/* Destination buffer word aligned, source is quad aligned */
	subs	r2, r2, #0x80
	blt	.Lcopyout_w_lessthan128

	/* Copy 128 bytes at a time */
.Lcopyout_w_loop128:
	ldrd	r4, [r0], #0x08		/* LD:00-07 */
	pld	[r0, #0x18]		/* Prefetch 0x20 */
	ldrd	r6, [r0], #0x08		/* LD:08-0f */
	ldrd	r8, [r0], #0x08		/* LD:10-17 */
	strt	r4, [r1], #0x04		/* ST:00-03 */
	strt	r5, [r1], #0x04		/* ST:04-07 */
	ldrd	r4, [r0], #0x08		/* LD:18-1f */
	strt	r6, [r1], #0x04		/* ST:08-0b */
	strt	r7, [r1], #0x04		/* ST:0c-0f */
	ldrd	r6, [r0], #0x08		/* LD:20-27 */
	pld	[r0, #0x18]		/* Prefetch 0x40 */
	strt	r8, [r1], #0x04		/* ST:10-13 */
	strt	r9, [r1], #0x04		/* ST:14-17 */
	ldrd	r8, [r0], #0x08		/* LD:28-2f */
	strt	r4, [r1], #0x04		/* ST:18-1b */
	strt	r5, [r1], #0x04		/* ST:1c-1f */
	ldrd	r4, [r0], #0x08		/* LD:30-37 */
	strt	r6, [r1], #0x04		/* ST:20-23 */
	strt	r7, [r1], #0x04		/* ST:24-27 */
	ldrd	r6, [r0], #0x08		/* LD:38-3f */
	strt	r8, [r1], #0x04		/* ST:28-2b */
	strt	r9, [r1], #0x04		/* ST:2c-2f */
	ldrd	r8, [r0], #0x08		/* LD:40-47 */
	pld	[r0, #0x18]		/* Prefetch 0x60 */
	strt	r4, [r1], #0x04		/* ST:30-33 */
	strt	r5, [r1], #0x04		/* ST:34-37 */
	ldrd	r4, [r0], #0x08		/* LD:48-4f */
	strt	r6, [r1], #0x04		/* ST:38-3b */
	strt	r7, [r1], #0x04		/* ST:3c-3f */
	ldrd	r6, [r0], #0x08		/* LD:50-57 */
	strt	r8, [r1], #0x04		/* ST:40-43 */
	strt	r9, [r1], #0x04		/* ST:44-47 */
	ldrd	r8, [r0], #0x08		/* LD:58-5f */
	strt	r4, [r1], #0x04		/* ST:48-4b */
	strt	r5, [r1], #0x04		/* ST:4c-4f */
	ldrd	r4, [r0], #0x08		/* LD:60-67 */
	pld	[r0, #0x18]		/* Prefetch 0x80 */
	strt	r6, [r1], #0x04		/* ST:50-53 */
	strt	r7, [r1], #0x04		/* ST:54-57 */
	ldrd	r6, [r0], #0x08		/* LD:68-6f */
	strt	r8, [r1], #0x04		/* ST:58-5b */
	strt	r9, [r1], #0x04		/* ST:5c-5f */
	ldrd	r8, [r0], #0x08		/* LD:70-77 */
	strt	r4, [r1], #0x04		/* ST:60-63 */
	strt	r5, [r1], #0x04		/* ST:64-67 */
	ldrd	r4, [r0], #0x08		/* LD:78-7f */
	strt	r6, [r1], #0x04		/* ST:68-6b */
	strt	r7, [r1], #0x04		/* ST:6c-6f */
	strt	r8, [r1], #0x04		/* ST:70-73 */
	strt	r9, [r1], #0x04		/* ST:74-77 */
	subs	r2, r2, #0x80
	strt	r4, [r1], #0x04		/* ST:78-7b */
	strt	r5, [r1], #0x04		/* ST:7c-7f */
	bge	.Lcopyout_w_loop128

.Lcopyout_w_lessthan128:
	adds	r2, r2, #0x80		/* Adjust for extra sub */
	ldmfdeq	sp!, {r4-r9}
	RETeq				/* Return now if done */
	subs	r2, r2, #0x20
	blt	.Lcopyout_w_lessthan32

	/* Copy 32 bytes at a time */
.Lcopyout_w_loop32:
	ldrd	r4, [r0], #0x08
	pld	[r0, #0x18]
	ldrd	r6, [r0], #0x08
	ldrd	r8, [r0], #0x08
	strt	r4, [r1], #0x04
	strt	r5, [r1], #0x04
	ldrd	r4, [r0], #0x08
	strt	r6, [r1], #0x04
	strt	r7, [r1], #0x04
	strt	r8, [r1], #0x04
	strt	r9, [r1], #0x04
	subs	r2, r2, #0x20
	strt	r4, [r1], #0x04
	strt	r5, [r1], #0x04
	bge	.Lcopyout_w_loop32

.Lcopyout_w_lessthan32:
	adds	r2, r2, #0x20		/* Adjust for extra sub */
	ldmfdeq	sp!, {r4-r9}
	RETeq				/* Return now if done */

	/*
	 * 1..31 bytes left.  r4 = the part that can go as whole 8-byte
	 * units (0, 8, 16 or 24) and r2 = what remains after that (0..7).
	 * Each 8-byte stanza below is four instructions (16 bytes) and pc
	 * reads as the first stanza, so adding (24 - r4) * 2 skips the
	 * stanzas that are not needed.  The nops keep every stanza at four
	 * instructions.  Nothing after the subs changes the flags, so the
	 * RETeq following the stanzas tests r2 == 0.
	 */
	and	r4, r2, #0x18
	rsb	r5, r4, #0x18
	subs	r2, r2, r4
	add	pc, pc, r5, lsl #1
	nop

	/* At least 24 bytes remaining */
	ldrd	r4, [r0], #0x08
	strt	r4, [r1], #0x04
	strt	r5, [r1], #0x04
	nop

	/* At least 16 bytes remaining */
	ldrd	r4, [r0], #0x08
	strt	r4, [r1], #0x04
	strt	r5, [r1], #0x04
	nop

	/* At least 8 bytes remaining */
	ldrd	r4, [r0], #0x08
	strt	r4, [r1], #0x04
	strt	r5, [r1], #0x04
	nop

	/* Less than 8 bytes remaining */
	ldmfd	sp!, {r4-r9}
	RETeq				/* Return now if done */
	mov	r3, #0x00		/* r4-r9 are no longer on the stack */

	/*
	 * 1..7 bytes left, both buffers word aligned, nothing pushed.
	 * Copy one word if there is room for it, then up to three bytes.
	 * r2 is reused as a data register once the cmp has captured the
	 * byte count in the flags.
	 */
.Lcopyout_w_less_than8:
	subs	r2, r2, #0x04
	ldrge	ip, [r0], #0x04
	strtge	ip, [r1], #0x04
	RETeq				/* Return now if done */
	addlt	r2, r2, #0x04
	ldrb	ip, [r0], #0x01
	cmp	r2, #0x02
	ldrbge	r2, [r0], #0x01
	strbt	ip, [r1], #0x01
	ldrbgt	ip, [r0]
	strbtge	r2, [r1], #0x01
	strbtgt	ip, [r1]
	RET

/*
 * At this point, it has not been possible to word align both buffers.
 * The destination buffer (r1) is word aligned, but the source buffer
 * (r0) is not.
 *
 * r0 is rounded down to a word boundary and whole words are read from
 * there; each destination word is assembled from two neighbouring source
 * words with a shift pair chosen by the source offset held in ip
 * (1 -> bad1, 2 -> bad2, 3 -> bad3).  ip always carries the most recently
 * loaded source word, whose upper bytes have not been stored yet.  The
 * shift directions correspond to a little-endian byte layout.
 */
.Lcopyout_bad_align:
	stmfd	sp!, {r4-r7}
	mov	r3, #0x01		/* Signal restore r4-r7 */
	bic	r0, r0, #0x03
	cmp	ip, #2			/* Flags survive the load below */
	ldr	ip, [r0], #0x04
	bgt	.Lcopyout_bad3
	beq	.Lcopyout_bad2
	b	.Lcopyout_bad1

	/* Source is 1 byte past a word boundary: 16 bytes per pass */
.Lcopyout_bad1_loop16:
	mov	r4, ip, lsr #8
	ldr	r5, [r0], #0x04
	pld	[r0, #0x018]
	ldr	r6, [r0], #0x04
	ldr	r7, [r0], #0x04
	ldr	ip, [r0], #0x04
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r6, lsl #24
	mov	r6, r6, lsr #8
	orr	r6, r6, r7, lsl #24
	mov	r7, r7, lsr #8
	orr	r7, r7, ip, lsl #24
	strt	r4, [r1], #0x04
	strt	r5, [r1], #0x04
	strt	r6, [r1], #0x04
	strt	r7, [r1], #0x04
.Lcopyout_bad1:
	subs	r2, r2, #0x10
	bge	.Lcopyout_bad1_loop16

	adds	r2, r2, #0x10
	ldmfdeq	sp!, {r4-r7}
	RETeq				/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r0, r0, #0x03		/* Back to the next uncopied byte */
	blt	.Lcopyout_l4

	/* Same offset, one word per pass */
.Lcopyout_bad1_loop4:
	mov	r4, ip, lsr #8
	ldr	ip, [r0], #0x04
	subs	r2, r2, #0x04
	orr	r4, r4, ip, lsl #24
	strt	r4, [r1], #0x04
	bge	.Lcopyout_bad1_loop4
	sub	r0, r0, #0x03		/* Back to the next uncopied byte */
	b	.Lcopyout_l4

	/* Source is 2 bytes past a word boundary: 16 bytes per pass */
.Lcopyout_bad2_loop16:
	mov	r4, ip, lsr #16
	ldr	r5, [r0], #0x04
	pld	[r0, #0x018]
	ldr	r6, [r0], #0x04
	ldr	r7, [r0], #0x04
	ldr	ip, [r0], #0x04
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r6, lsl #16
	mov	r6, r6, lsr #16
	orr	r6, r6, r7, lsl #16
	mov	r7, r7, lsr #16
	orr	r7, r7, ip, lsl #16
	strt	r4, [r1], #0x04
	strt	r5, [r1], #0x04
	strt	r6, [r1], #0x04
	strt	r7, [r1], #0x04
.Lcopyout_bad2:
	subs	r2, r2, #0x10
	bge	.Lcopyout_bad2_loop16

	adds	r2, r2, #0x10
	ldmfdeq	sp!, {r4-r7}
	RETeq				/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r0, r0, #0x02		/* Back to the next uncopied byte */
	blt	.Lcopyout_l4

	/* Same offset, one word per pass */
.Lcopyout_bad2_loop4:
	mov	r4, ip, lsr #16
	ldr	ip, [r0], #0x04
	subs	r2, r2, #0x04
	orr	r4, r4, ip, lsl #16
	strt	r4, [r1], #0x04
	bge	.Lcopyout_bad2_loop4
	sub	r0, r0, #0x02		/* Back to the next uncopied byte */
	b	.Lcopyout_l4

	/* Source is 3 bytes past a word boundary: 16 bytes per pass */
.Lcopyout_bad3_loop16:
	mov	r4, ip, lsr #24
	ldr	r5, [r0], #0x04
	pld	[r0, #0x018]
	ldr	r6, [r0], #0x04
	ldr	r7, [r0], #0x04
	ldr	ip, [r0], #0x04
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r6, lsl #8
	mov	r6, r6, lsr #24
	orr	r6, r6, r7, lsl #8
	mov	r7, r7, lsr #24
	orr	r7, r7, ip, lsl #8
	strt	r4, [r1], #0x04
	strt	r5, [r1], #0x04
	strt	r6, [r1], #0x04
	strt	r7, [r1], #0x04
.Lcopyout_bad3:
	subs	r2, r2, #0x10
	bge	.Lcopyout_bad3_loop16

	adds	r2, r2, #0x10
	ldmfdeq	sp!, {r4-r7}
	RETeq				/* Return now if done */
	subs	r2, r2, #0x04
	sublt	r0, r0, #0x01		/* Back to the next uncopied byte */
	blt	.Lcopyout_l4

	/* Same offset, one word per pass */
.Lcopyout_bad3_loop4:
	mov	r4, ip, lsr #24
	ldr	ip, [r0], #0x04
	subs	r2, r2, #0x04
	orr	r4, r4, ip, lsl #8
	strt	r4, [r1], #0x04
	bge	.Lcopyout_bad3_loop4
	sub	r0, r0, #0x01		/* Back to the next uncopied byte */

	/*
	 * Tail of the unaligned-source paths: r2 is 4 less than the number
	 * of bytes still to copy and r0 points at the next uncopied byte.
	 */
.Lcopyout_l4:
	ldmfd	sp!, {r4-r7}
	mov	r3, #0x00		/* r4-r7 are no longer on the stack */
	adds	r2, r2, #0x04
	RETeq
	/*
	 * Copy the last r2 (1..3) bytes.  r2 = 3 - r2 = byte copies to
	 * skip; same pc-relative dispatch as the alignment code at the top
	 * of .Lcopyout_guts (8 bytes per ldrb/strbt pair).
	 */
.Lcopyout_l4_2:
	rsbs	r2, r2, #0x03
	addne	pc, pc, r2, lsl #3
	nop
	ldrb	ip, [r0], #0x01
	strbt	ip, [r1], #0x01
	ldrb	ip, [r0], #0x01
	strbt	ip, [r1], #0x01
	ldrb	ip, [r0]
	strbt	ip, [r1]
	RET
END(copyout)
