/*	$NetBSD: bcopyinout.S,v 1.9 2002/10/13 14:54:47 bjh21 Exp $	*/

/*
 * Copyright (c) 2002 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Allen Briggs for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "opt_multiprocessor.h"

#include "assym.h"

#include <machine/asm.h>

RCSID("$NetBSD: bcopyinout.S,v 1.9 2002/10/13 14:54:47 bjh21 Exp $")

	.text
	.align	0

#ifdef MULTIPROCESSOR
.Lcpu_info:
	.word	_C_LABEL(cpu_info)
#else
.Lcurpcb:
	.word _C_LABEL(curpcb)
#endif
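/*
 * The .word above is a PC-relative literal: it holds the address of the
 * cpu_info[] array (MULTIPROCESSOR) or of the global curpcb pointer,
 * from which the current process's PCB is located below.
 */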

#ifdef __PROG32
#define SAVE_REGS	stmfd	sp!, {r4-r11}
#define RESTORE_REGS	ldmfd	sp!, {r4-r11}
#else
/* Need to save R14_svc because it'll get trampled if we take a page fault. */
#define SAVE_REGS	stmfd	sp!, {r4-r11, r14}
#define RESTORE_REGS	ldmfd	sp!, {r4-r11, r14}
#endif

#if 0 && defined(__XSCALE__)
#define HELLOCPP #
#define PREFETCH(rx,o)	pld	[ rx , HELLOCPP (o) ]
#else
#define PREFETCH(rx,o)
#endif
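/*
 * The pld offset needs a literal "#" in the assembler output, but cpp
 * would mis-parse a bare "#" inside the macro body, so it is hidden
 * behind HELLOCPP, which simply expands to "#".  Note the prefetching
 * variant is disabled ("#if 0"), so PREFETCH currently expands to
 * nothing on all configurations.
 */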

/*
 * r0 = user space address
 * r1 = kernel space address
 * r2 = length
 *
 * Copies bytes from user space to kernel space
 *
 * r4-r11 are used as scratch and are therefore saved and restored.
 */
ENTRY(copyin)
	/* Quick exit if length is zero */
	teq	r2, #0
	moveq	r0, #0
	moveq	pc, lr

	SAVE_REGS
#ifdef MULTIPROCESSOR
	/* XXX Probably not appropriate for non-Hydra SMPs */
	stmfd	sp!, {r0-r2, r14}
	bl	_C_LABEL(cpu_number)
	ldr	r4, .Lcpu_info
	ldr	r4, [r4, r0, lsl #2]
	ldr	r4, [r4, #CI_CURPCB]
	ldmfd	sp!, {r0-r2, r14}
#else
	ldr	r4, .Lcurpcb
	ldr	r4, [r4]
#endif

	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]
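	/*
	 * The previous onfault handler is kept in r5 and restored on every
	 * exit path.  If a user-space access below faults and the fault
	 * cannot be resolved, the abort handler resumes at .Lcopyfault
	 * (with an error code such as EFAULT in r0), which restores the
	 * saved state and returns that error.
	 */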

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If there are only a few bytes to copy, take the slow path.
	 */
	cmp	r2, #0x08
	blt	.Licleanup

	/*
	 * Align destination to word boundary.
	 */
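	/*
	 * Computed branch: ARM's pc reads as the current instruction plus
	 * 8, so "ldr pc, [pc, r6, lsl #2]" indexes the word table that
	 * follows.  Index 0 (already aligned) selects .Lialend; index 1
	 * enters at .Lial3 to copy the 3 bytes needed to reach the next
	 * word boundary, and so on.  The "b .Lialend" is never executed.
	 */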
	and	r6, r1, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lialend
	.word	.Lialend
	.word	.Lial3
	.word	.Lial2
	.word	.Lial1
.Lial3:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lial2:	ldrbt	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lial1:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lialend:

	/*
	 * If only a few bytes are left, finish with the slow path.
	 */
	cmp	r2, #0x08
	blt	.Licleanup

	/*
	 * If the source is not word aligned, finish with the slow path.
	 */
	ands	r3, r0, #0x03
	bne	.Licleanup

	cmp	r2, #0x60	/* Must be > 0x5f for unrolled cacheline */
	blt	.Licleanup8

	/*
	 * Align destination to cacheline boundary.
	 * If source and destination are nicely aligned, this can be a big
	 * win.  If not, it's still cheaper to copy in groups of 32 even if
	 * we don't get the nice cacheline alignment.
	 */
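	/*
	 * The destination is already word aligned, so r6 is a multiple of
	 * 4 and indexes the 8-entry table below directly (no shift).  An
	 * offset of 4 within the cacheline enters at .Lical28 to copy the
	 * 28 bytes up to the next 32-byte boundary, 8 enters at .Lical24,
	 * and so on; 0 falls straight into the aligned loop.
	 */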
	and	r6, r1, #0x1f
	ldr	pc, [pc, r6]
	b	.Licaligned
	.word	.Licaligned
	.word	.Lical28
	.word	.Lical24
	.word	.Lical20
	.word	.Lical16
	.word	.Lical12
	.word	.Lical8
	.word	.Lical4
.Lical28:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lical24:ldrt	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lical20:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lical16:ldrt	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lical12:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lical8:ldrt	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lical4:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4

	/*
	 * We start with > 0x40 bytes to copy (>= 0x60 got us into this
	 * part of the code, and we may have knocked that down by as much
	 * as 0x1c getting aligned).
	 *
	 * This loop basically works out to:
	 * do {
	 * 	prefetch-next-cacheline(s)
	 *	bytes -= 0x20;
	 *	copy cacheline
	 * } while (bytes >= 0x40);
	 * bytes -= 0x20;
	 * copy cacheline
	 */
.Licaligned:
	PREFETCH(r0, 32)
	PREFETCH(r1, 32)

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	ldrt	r6, [r0], #4
	ldrt	r7, [r0], #4
	ldrt	r8, [r0], #4
	ldrt	r9, [r0], #4
	stmia	r1!, {r10-r11}
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	stmia	r1!, {r6-r11}
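	/*
	 * The loads above are deliberately interleaved with the stores:
	 * r10-r11 are written back before the last two words are loaded,
	 * which should help hide load-use latency on in-order pipelines.
	 */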

	cmp	r2, #0x40
	bge	.Licaligned

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	ldrt	r6, [r0], #4
	ldrt	r7, [r0], #4
	ldrt	r8, [r0], #4
	ldrt	r9, [r0], #4
	stmia	r1!, {r10-r11}
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	stmia	r1!, {r6-r11}

	cmp	r2, #0x08
	blt	.Liprecleanup

.Licleanup8:
	ldrt	r8, [r0], #4
	ldrt	r9, [r0], #4
	sub	r2, r2, #8
	stmia	r1!, {r8, r9}
	cmp	r2, #8
	bge	.Licleanup8

.Liprecleanup:
	/*
	 * If we're done, bail.
	 */
	cmp	r2, #0
	beq	.Liout

.Licleanup:
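	/*
	 * Byte-at-a-time tail copy, also used as the general slow path
	 * when the source is unaligned.  The dispatch indexes on r2 & 3:
	 * an index of 0 means a whole multiple of 4 remains, so it enters
	 * at .Lic4 and copies four bytes per pass; indexes 1-3 enter at
	 * .Lic1-.Lic3 to dispose of the odd remainder first.  The closing
	 * "subs"/"bne" pair loops until r2 reaches zero.
	 */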
	and	r6, r2, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Licend
	.word	.Lic4
	.word	.Lic1
	.word	.Lic2
	.word	.Lic3
.Lic4:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lic3:	ldrbt	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lic2:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lic1:	ldrbt	r7, [r0], #1
	subs	r2, r2, #1
	strb	r7, [r1], #1
.Licend:
	bne	.Licleanup

.Liout:
	mov	r0, #0

	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	mov	pc, lr

.Lcopyfault:
	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	mov	pc, lr

/*
 * r0 = kernel space address
 * r1 = user space address
 * r2 = length
 *
 * Copies bytes from kernel space to user space
 *
 * r4-r11 are used as scratch and are therefore saved and restored.
 */
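/*
 * copyout mirrors copyin with the privilege checks reversed: the
 * kernel source is read with plain ldr/ldrb/ldmia, and every store to
 * user space uses the strt/strbt forms so the access is made with
 * user-mode permissions, any fault again being caught via PCB_ONFAULT.
 */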

ENTRY(copyout)
	/* Quick exit if length is zero */
	teq	r2, #0
	moveq	r0, #0
	moveq	pc, lr

	SAVE_REGS
#ifdef MULTIPROCESSOR
	/* XXX Probably not appropriate for non-Hydra SMPs */
	stmfd	sp!, {r0-r2, r14}
	bl	_C_LABEL(cpu_number)
	ldr	r4, .Lcpu_info
	ldr	r4, [r4, r0, lsl #2]
	ldr	r4, [r4, #CI_CURPCB]
	ldmfd	sp!, {r0-r2, r14}
#else
	ldr	r4, .Lcurpcb
	ldr	r4, [r4]
#endif

	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If there are only a few bytes to copy, take the slow path.
	 */
	cmp	r2, #0x08
	blt	.Lcleanup

	/*
	 * Align destination to word boundary.
	 */
	and	r6, r1, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lalend
	.word	.Lalend
	.word	.Lal3
	.word	.Lal2
	.word	.Lal1
.Lal3:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lal2:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strbt	r7, [r1], #1
.Lal1:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lalend:

	/*
	 * If only a few bytes are left, finish with the slow path.
	 */
	cmp	r2, #0x08
	blt	.Lcleanup

	/*
	 * If the source is not word aligned, finish with the slow path.
	 */
	ands	r3, r0, #0x03
	bne	.Lcleanup

	cmp	r2, #0x60	/* Must be > 0x5f for unrolled cacheline */
	blt	.Lcleanup8

	/*
	 * Align source & destination to cacheline boundary.
	 */
	and	r6, r1, #0x1f
	ldr	pc, [pc, r6]
	b	.Lcaligned
	.word	.Lcaligned
	.word	.Lcal28
	.word	.Lcal24
	.word	.Lcal20
	.word	.Lcal16
	.word	.Lcal12
	.word	.Lcal8
	.word	.Lcal4
.Lcal28:ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4
.Lcal24:ldr	r7, [r0], #4
	sub	r2, r2, #4
	strt	r7, [r1], #4
.Lcal20:ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4
.Lcal16:ldr	r7, [r0], #4
	sub	r2, r2, #4
	strt	r7, [r1], #4
.Lcal12:ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4
.Lcal8:	ldr	r7, [r0], #4
	sub	r2, r2, #4
	strt	r7, [r1], #4
.Lcal4:	ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4

	/*
	 * We start with > 0x40 bytes to copy (>= 0x60 got us into this
	 * part of the code, and we may have knocked that down by as much
	 * as 0x1c getting aligned).
	 *
	 * This loop basically works out to:
	 * do {
	 * 	prefetch-next-cacheline(s)
	 *	bytes -= 0x20;
	 *	copy cacheline
	 * } while (bytes >= 0x40);
	 * bytes -= 0x20;
	 * copy cacheline
	 */
.Lcaligned:
	PREFETCH(r0, 32)
	PREFETCH(r1, 32)

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldmia	r0!, {r6-r11}
	strt	r6, [r1], #4
	strt	r7, [r1], #4
	ldmia	r0!, {r6-r7}
	strt	r8, [r1], #4
	strt	r9, [r1], #4
	strt	r10, [r1], #4
	strt	r11, [r1], #4
	strt	r6, [r1], #4
	strt	r7, [r1], #4
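	/*
	 * The kernel side can be read with a single ldmia, but the user
	 * side is written one word at a time: strt checks each store with
	 * user permissions, and the user-register form of stm does not
	 * allow base writeback, so it cannot drive a post-incrementing
	 * copy like this one.
	 */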

	cmp	r2, #0x40
	bge	.Lcaligned

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldmia	r0!, {r6-r11}
	strt	r6, [r1], #4
	strt	r7, [r1], #4
	ldmia	r0!, {r6-r7}
	strt	r8, [r1], #4
	strt	r9, [r1], #4
	strt	r10, [r1], #4
	strt	r11, [r1], #4
	strt	r6, [r1], #4
	strt	r7, [r1], #4

	cmp	r2, #0x08
	blt	.Lprecleanup

.Lcleanup8:
	ldmia	r0!, {r8-r9}
	sub	r2, r2, #8
	strt	r8, [r1], #4
	strt	r9, [r1], #4
	cmp	r2, #8
	bge	.Lcleanup8

.Lprecleanup:
	/*
	 * If we're done, bail.
	 */
	cmp	r2, #0
	beq	.Lout

.Lcleanup:
	and	r6, r2, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lcend
	.word	.Lc4
	.word	.Lc1
	.word	.Lc2
	.word	.Lc3
.Lc4:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lc3:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strbt	r7, [r1], #1
.Lc2:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lc1:	ldrb	r7, [r0], #1
	subs	r2, r2, #1
	strbt	r7, [r1], #1
.Lcend:
	bne	.Lcleanup

.Lout:
	mov	r0, #0

	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	mov	pc, lr

/*
 * r0 = kernel space source address
 * r1 = kernel space destination address
 * r2 = length
 *
 * Copies bytes from kernel space to kernel space, aborting on page fault
 *
 * Copy of copyout, but without the ldrt/strt instructions.
 */
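/*
 * Note that kcopy() still installs the PCB_ONFAULT handler, so a fault
 * on either address returns an error to the caller rather than
 * panicking the kernel.
 */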

ENTRY(kcopy)
	/* Quick exit if length is zero */
	teq	r2, #0
	moveq	r0, #0
	moveq	pc, lr

	SAVE_REGS
#ifdef MULTIPROCESSOR
	/* XXX Probably not appropriate for non-Hydra SMPs */
	stmfd	sp!, {r0-r2, r14}
	bl	_C_LABEL(cpu_number)
	ldr	r4, .Lcpu_info
	ldr	r4, [r4, r0, lsl #2]
	ldr	r4, [r4, #CI_CURPCB]
	ldmfd	sp!, {r0-r2, r14}
#else
	ldr	r4, .Lcurpcb
	ldr	r4, [r4]
#endif

	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If there are only a few bytes to copy, take the slow path.
	 */
	cmp	r2, #0x08
	blt	.Lkcleanup

	/*
	 * Align destination to word boundary.
	 */
	and	r6, r1, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lkalend
	.word	.Lkalend
	.word	.Lkal3
	.word	.Lkal2
	.word	.Lkal1
.Lkal3:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkal2:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lkal1:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkalend:

	/*
	 * If only a few bytes are left, finish with the slow path.
	 */
	cmp	r2, #0x08
	blt	.Lkcleanup

	/*
	 * If the source is not word aligned, finish with the slow path.
	 */
	ands	r3, r0, #0x03
	bne	.Lkcleanup

	cmp	r2, #0x60	/* Must be > 0x5f for unrolled cacheline */
	blt	.Lkcleanup8

	/*
	 * Align source & destination to cacheline boundary.
	 */
	and	r6, r1, #0x1f
	ldr	pc, [pc, r6]
	b	.Lkcaligned
	.word	.Lkcaligned
	.word	.Lkcal28
	.word	.Lkcal24
	.word	.Lkcal20
	.word	.Lkcal16
	.word	.Lkcal12
	.word	.Lkcal8
	.word	.Lkcal4
.Lkcal28:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lkcal24:ldr	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lkcal20:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lkcal16:ldr	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lkcal12:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lkcal8:ldr	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lkcal4:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4

	/*
	 * We start with > 0x40 bytes to copy (>= 0x60 got us into this
	 * part of the code, and we may have knocked that down by as much
	 * as 0x1c getting aligned).
	 *
	 * This loop basically works out to:
	 * do {
	 * 	prefetch-next-cacheline(s)
	 *	bytes -= 0x20;
	 *	copy cacheline
	 * } while (bytes >= 0x40);
	 * bytes -= 0x20;
	 * copy cacheline
	 */
.Lkcaligned:
	PREFETCH(r0, 32)
	PREFETCH(r1, 32)

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldmia	r0!, {r6-r11}
	stmia	r1!, {r6, r7}
	ldmia	r0!, {r6, r7}
	stmia	r1!, {r8-r11}
	stmia	r1!, {r6, r7}

	cmp	r2, #0x40
	bge	.Lkcaligned

	sub	r2, r2, #0x20

	/* Copy a cacheline */
	ldmia	r0!, {r6-r11}
	stmia	r1!, {r6-r7}
	ldmia	r0!, {r6-r7}
	stmia	r1!, {r8-r11}
	stmia	r1!, {r6-r7}

	cmp	r2, #0x08
	blt	.Lkprecleanup

.Lkcleanup8:
	ldmia	r0!, {r8-r9}
	sub	r2, r2, #8
	stmia	r1!, {r8-r9}
	cmp	r2, #8
	bge	.Lkcleanup8

.Lkprecleanup:
	/*
	 * If we're done, bail.
	 */
	cmp	r2, #0
	beq	.Lkout

.Lkcleanup:
	and	r6, r2, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lkcend
	.word	.Lkc4
	.word	.Lkc1
	.word	.Lkc2
	.word	.Lkc3
.Lkc4:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkc3:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lkc2:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkc1:	ldrb	r7, [r0], #1
	subs	r2, r2, #1
	strb	r7, [r1], #1
.Lkcend:
	bne	.Lkcleanup

.Lkout:
	mov	r0, #0

	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	mov	pc, lr