xref: /freebsd/sys/arm/arm/blockio.S (revision 069ac184)
1/*	$NetBSD: blockio.S,v 1.5 2002/08/15 01:38:16 briggs Exp $	*/
2
3/*-
4 * Copyright (c) 2001 Ben Harris.
5 * Copyright (c) 1994 Mark Brinicombe.
6 * Copyright (c) 1994 Brini.
7 * All rights reserved.
8 *
9 * This code is derived from software written for Brini by Mark Brinicombe
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 *    must display the following acknowledgement:
21 *	This product includes software developed by Brini.
22 * 4. The name of the company nor the name of the author may be used to
23 *    endorse or promote products derived from this software without specific
24 *    prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
27 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
28 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
29 * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
30 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
31 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
32 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * RiscBSD kernel project
39 *
40 * blockio.S
41 *
42 * optimised block read/write from/to IO routines.
43 *
44 * Created      : 08/10/94
45 * Modified	: 22/01/99  -- R.Earnshaw
46 *			       Faster, and small tweaks for StrongARM
47 */
48
49#include <machine/asm.h>
50	.syntax	unified
51
/*
 * read_multi_1: read bytes from one I/O address into a block of memory.
 *
 * r0 = address to read from (IO); never advanced, every byte comes from [r0]
 * r1 = address to write to (memory)
 * r2 = length in bytes
 *
 * Uses r3 and r12 (ip) as scratch.  On entry {fp, ip, lr, pc} are pushed and
 * fp is pointed at the saved pc slot; every exit path reloads fp, sp and pc
 * from that frame with a single ldmdb.
 */

/* This code will look very familiar if you've read _memcpy(). */
ENTRY(read_multi_1)
	mov	ip, sp
	stmfd	sp!, {fp, ip, lr, pc}
	sub	fp, ip, #4
	subs	r2, r2, #4		/* r2 = length - 4 */
	blt	.Lrm1_l4		/* less than 4 bytes */
	ands	r12, r1, #3
	beq	.Lrm1_main		/* aligned destination */

	/*
	 * Misaligned destination: copy r12 = 4 - (r1 & 3) bytes (1 to 3) so
	 * that r1 ends up word aligned.  The first byte is unconditional, the
	 * second is copied when r12 >= 2 and the third when r12 > 2.
	 */
	rsb	r12, r12, #4
	cmp	r12, #2
	ldrb	r3, [r0]
	strb	r3, [r1], #1
	ldrbge	r3, [r0]
	strbge	r3, [r1], #1
	ldrbgt	r3, [r0]
	strbgt	r3, [r1], #1
	subs	r2, r2, r12		/* discount the bytes just copied */
	blt	.Lrm1_l4		/* fewer than 4 bytes remain */

	/*
	 * Main loop: four byte reads from the port are packed into r3 (first
	 * byte in bits 7:0, last byte in bits 31:24) and stored as one word.
	 * r12 is reused as a temporary from here on.
	 */
.Lrm1_main:
	ldrb	r3, [r0]
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #8
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #16
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #24
	str	r3, [r1], #4
	subs	r2, r2, #4
	bge	.Lrm1_main

	/* Tail: undo the -4 bias and copy the remaining bytes one at a time. */
.Lrm1_l4:
	adds	r2, r2, #4		/* r2 = remaining byte count */
	ldmdbeq	fp, {fp, sp, pc}	/* nothing left: unwind and return */
	cmp	r2, #2
	ldrb	r3, [r0]		/* at least one byte remains */
	strb	r3, [r1], #1
	ldrbge	r3, [r0]		/* second byte when r2 >= 2 */
	strbge	r3, [r1], #1
	ldrbgt	r3, [r0]		/* third byte when r2 > 2 */
	strbgt	r3, [r1], #1
	ldmdb	fp, {fp, sp, pc}
END(read_multi_1)
104
/*
 * write_multi_1: write a block of memory, one byte at a time, to one I/O
 * address.
 *
 * r0 = address to write to (IO); never advanced, every byte goes to [r0]
 * r1 = address to read from (memory)
 * r2 = length in bytes
 *
 * Scratch registers: r3 and ip.  {fp, ip, lr, pc} are pushed on entry and
 * every exit path reloads fp, sp and pc from that frame.
 */

/* Anyone who has read _memcpy() will recognise the structure. */
ENTRY(write_multi_1)
	mov	ip, sp
	stmdb	sp!, {fp, ip, lr, pc}
	sub	fp, ip, #4
	subs	r2, r2, #4		/* bias the count: r2 = length - 4 */
	blt	.Lwm1_tail		/* under 4 bytes in total */
	ands	ip, r1, #3		/* ip = misalignment of the source */
	beq	.Lwm1_words		/* source already word aligned */

	/* Emit 4 - (r1 & 3) bytes so that r1 reaches a word boundary. */
	rsb	ip, ip, #4
	cmp	ip, #2
	ldrb	r3, [r1], #1		/* first byte: always */
	strb	r3, [r0]
	ldrbge	r3, [r1], #1		/* second byte: ip >= 2 */
	strbge	r3, [r0]
	ldrbgt	r3, [r1], #1		/* third byte: ip > 2 */
	strbgt	r3, [r0]
	subs	r2, r2, ip		/* discount what was just written */
	blt	.Lwm1_tail

	/*
	 * Word loop: load one word from memory and push it out to the port
	 * as four byte stores, least significant byte first.
	 */
.Lwm1_words:
	ldr	r3, [r1], #4
	strb	r3, [r0]
	lsr	r3, r3, #8
	strb	r3, [r0]
	lsr	r3, r3, #8
	strb	r3, [r0]
	lsr	r3, r3, #8
	strb	r3, [r0]
	subs	r2, r2, #4
	bge	.Lwm1_words

	/* Tail: remove the -4 bias, then write whatever bytes are left. */
.Lwm1_tail:
	adds	r2, r2, #4		/* r2 = remaining byte count */
	ldmdbeq	fp, {fp, sp, pc}	/* zero left: unwind and return */
	cmp	r2, #2
	ldrb	r3, [r1], #1		/* first byte: always */
	strb	r3, [r0]
	ldrbge	r3, [r1], #1		/* second byte: r2 >= 2 */
	strbge	r3, [r0]
	ldrbgt	r3, [r1], #1		/* third byte: r2 > 2 */
	strbgt	r3, [r0]
	ldmdb	fp, {fp, sp, pc}
END(write_multi_1)
156
/*
 * insw: read short ints (16 bits) from an I/O address into a block of memory.
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length, counted in 16-bit transfers
 *
 * Returns at once when r2 <= 0.  Scratch registers: r3 and ip.
 */

ENTRY(insw)
	/* A non-positive count means there is nothing to transfer. */
	cmp	r2, #0
	movle	pc, lr

	/*
	 * The paired path stores whole words, so it is only taken when the
	 * count is even and the destination is word aligned.
	 */
	tst	r2, #1
	tsteq	r1, #3
	beq	.Linsw_paired

	/* General path: one I/O read per pass, stored as two single bytes. */
.Linsw_single:
	ldr	r3, [r0]
	subs	r2, r2, #1		/* loop test sits in the load delay slot */
	strb	r3, [r1], #1		/* bits 7:0 */
	lsr	r3, r3, #8
	strb	r3, [r1], #1		/* bits 15:8 */
	bgt	.Linsw_single

	RET

	/*
	 * Paired path: two I/O reads per pass merged into one word store.
	 * Bits 31:16 of the value read at [r0, #2] become bits 15:0 of the
	 * result; bits 15:0 of the value read at [r0] become bits 31:16.
	 */
.Linsw_paired:
	ldr	r3, [r0, #2]		/* relies on nonaligned word accesses */
	ldr	ip, [r0]
	lsr	r3, r3, #16		/* first short into the low half */
	orr	r3, r3, ip, lsl #16	/* second short into the high half */
	str	r3, [r1], #4
	subs	r2, r2, #2		/* two shorts consumed */
	bgt	.Linsw_paired

	RET
END(insw)
204
/*
 * outsw: write short ints (16 bits) from a block of memory to an I/O address.
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length, counted in 16-bit transfers
 *
 * Returns at once when r2 <= 0.  Scratch registers: r3 and ip.
 * Every store to the port carries the 16-bit value in both halves of the
 * word.
 */

ENTRY(outsw)
	/* A non-positive count means there is nothing to transfer. */
	cmp	r2, #0
	movle	pc, lr

	/*
	 * The paired path loads whole words, so it is only taken when the
	 * count is even and the source pointer is word aligned.
	 */
	tst	r2, #1
	tsteq	r1, #3
	beq	.Loutsw_paired

	/* General path: build each short from two byte loads. */
.Loutsw_single:
	ldrb	r3, [r1], #1		/* low byte */
	ldrb	ip, [r1], #1		/* high byte */
	subs	r2, r2, #1		/* loop test sits in the load delay slot */
	orr	r3, r3, ip, lsl #8	/* r3 = the 16-bit value */
	orr	r3, r3, r3, lsl #16	/* copy it into the upper half too */
	str	r3, [r0]
	bgt	.Loutsw_single

	RET

	/*
	 * Paired path: one word load yields two shorts, (H)(L).  Three EORs
	 * turn that into (L)(L) in r3 and (H)(H) in ip without a spare
	 * register; (L)(L) is written first.
	 */
.Loutsw_paired:
	ldr	r3, [r1], #4		/* r3 = (H)(L) */
	subs	r2, r2, #2		/* loop test sits in the load delay slot */

	eor	ip, r3, r3, lsr #16	/* ip = (H)(H^L) */
	eor	r3, r3, ip, lsl #16	/* r3 = (H^H^L)(L) = (L)(L) */
	eor	ip, ip, r3, lsr #16	/* ip = (H)(H^L^L) = (H)(H) */

	str	r3, [r0]
	str	ip, [r0]

/*
 * Alternative formulation, kept for reference and not assembled:
 *
 *	mov	ip, r3, lsl #16
 *	orr	ip, ip, ip, lsr #16
 *	str	ip, [r0]
 *
 *	mov	ip, r3, lsr #16
 *	orr	ip, ip, ip, lsl #16
 *	str	ip, [r0]
 */

	bgt	.Loutsw_paired		/* flags still come from the subs above */

	RET
END(outsw)
265
/*
 * insw16: read short ints (16 bits) from an I/O address into a block of
 * memory, eight shorts (16 bytes) per loop pass.
 *
 * Intended for a length that is a multiple of 16 bytes and a word aligned
 * destination; when either condition fails the call is handed to insw.
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length, counted in 16-bit transfers
 *
 * Returns at once when r2 <= 0.  r4, r5 and lr are saved on the stack and
 * used as temporaries together with r3 and ip.
 */

ENTRY(insw16)
	/* A non-positive count means there is nothing to transfer. */
	cmp	r2, #0
	movle	pc, lr

	/*
	 * The unrolled loop needs a count that is a multiple of 8 and a word
	 * aligned destination; otherwise branch to the generic routine,
	 * which returns straight to our caller.
	 */
	tst	r2, #7
	tsteq	r1, #3

	bne	_C_LABEL(insw)

	push	{r4, r5, lr}

	/*
	 * Each group below performs two I/O reads and merges them into one
	 * word: the top half of the [r0, #2] read goes to bits 15:0, the
	 * bottom half of the [r0] read goes to bits 31:16.
	 */
.Linsw16_next8:
	ldr	r3, [r0, #2]		/* relies on nonaligned word accesses */
	ldr	lr, [r0]
	lsr	r3, r3, #16
	orr	r3, r3, lr, lsl #16	/* word 0 */

	ldr	r4, [r0, #2]
	ldr	lr, [r0]
	lsr	r4, r4, #16
	orr	r4, r4, lr, lsl #16	/* word 1 */

	ldr	r5, [r0, #2]
	ldr	lr, [r0]
	lsr	r5, r5, #16
	orr	r5, r5, lr, lsl #16	/* word 2 */

	ldr	ip, [r0, #2]
	ldr	lr, [r0]
	lsr	ip, ip, #16
	orr	ip, ip, lr, lsl #16	/* word 3 */

	stmia	r1!, {r3-r5, ip}	/* store all four words at once */
	subs	r2, r2, #8		/* eight shorts consumed */
	bgt	.Linsw16_next8

	pop	{r4, r5, pc}		/* restore registers and return */
END(insw16)
324
/*
 * outsw16: write short ints (16 bits) from a block of memory to an I/O
 * address, eight shorts (16 bytes) per loop pass.
 *
 * When the count is not a multiple of 8 or the source is not word aligned
 * the call is handed to outsw.
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length, counted in 16-bit transfers
 *
 * Returns at once when r2 <= 0.  r4, r5 and lr are saved on the stack and
 * used as temporaries together with r3 and ip.
 */

ENTRY(outsw16)
	/* A non-positive count means there is nothing to transfer. */
	cmp	r2, #0
	movle	pc, lr

	/*
	 * The unrolled loop needs a count that is a multiple of 8 and a word
	 * aligned source; otherwise branch to the generic routine, which
	 * returns straight to our caller.
	 */
	tst	r2, #7
	tsteq	r1, #3

	bne	_C_LABEL(outsw)

	push	{r4, r5, lr}

	/*
	 * Four words (A)(B) are loaded per pass.  For each one, three EORs
	 * produce (B)(B) in r3 and (A)(A) in the source register; (B)(B),
	 * the low short, is written to the port first.
	 */
.Loutsw16_next8:
	ldmia	r1!, {r4, r5, ip, lr}

	eor	r3, r4, r4, lsl #16	/* r3 = (A^B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	r4, [r0]

/*
 * Alternative formulation, kept for reference and not assembled:
 *
 *	mov	r3, r4, lsl #16
 *	orr	r3, r3, r3, lsr #16
 *	str	r3, [r0]
 *
 *	mov	r3, r4, lsr #16
 *	orr	r3, r3, r3, lsl #16
 *	str	r3, [r0]
 */

	eor	r3, r5, r5, lsl #16	/* r3 = (A^B)(B) */
	eor	r5, r5, r3, lsr #16	/* r5 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r5, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	r5, [r0]

	eor	r3, ip, ip, lsl #16	/* r3 = (A^B)(B) */
	eor	ip, ip, r3, lsr #16	/* ip = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, ip, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	ip, [r0]

	eor	r3, lr, lr, lsl #16	/* r3 = (A^B)(B) */
	eor	lr, lr, r3, lsr #16	/* lr = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, lr, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	lr, [r0]

	subs	r2, r2, #8		/* eight shorts consumed */
	bgt	.Loutsw16_next8

	pop	{r4, r5, pc}		/* restore registers and return */
END(outsw16)
391
/*
 * inswm8: read short ints (16 bits) from an I/O address into a block of
 * memory.  The I/O address is assumed to be mapped multiple times in a block
 * of 8 words, so several consecutive words starting at r0 can be fetched
 * with one load-multiple.
 * The destination address should be word aligned; if it is not, the call is
 * handed to insw.
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length, counted in 16-bit transfers
 *
 * Returns at once when r2 <= 0.  r4-r9 and lr are saved on the stack.
 */

ENTRY(inswm8)
	/* A non-positive count means there is nothing to transfer. */
	cmp	r2, #0
	movle	pc, lr

	/* Only the destination alignment is checked here, not the count. */
	tst	r1, #3

	bne	_C_LABEL(insw)

	push	{r4-r9, lr}

	/* lr = 0xffff0000: mask that clears the upper half of a word. */
	mov	lr, #0xff000000
	orr	lr, lr, #0xff0000

	/* Eight shorts at a time: eight words in, four packed words out. */
.Linswm8_by8:
	cmp	r2, #8
	bcc	.Linswm8_by4

	ldmia	r0, {r3-r9, ip}

	bic	r3, r3, lr		/* keep the low short of each even word */
	orr	r3, r3, r4, lsl #16	/* odd word supplies the high short */
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16
	bic	r7, r7, lr
	orr	r5, r7, r8, lsl #16
	bic	r9, r9, lr
	orr	r6, r9, ip, lsl #16

	stmia	r1!, {r3-r6}

	subs	r2, r2, #8
	bne	.Linswm8_by8
	beq	.Linswm8_done		/* Z is necessarily set here */

	/* Four shorts: four words in, two packed words out. */
.Linswm8_by4:
	cmp	r2, #4
	bcc	.Linswm8_by2

	ldmia	r0, {r3-r6}

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16

	stmia	r1!, {r3, r4}

	subs	r2, r2, #4
	beq	.Linswm8_done

	/* Two shorts: two words in, one packed word out. */
.Linswm8_by2:
	cmp	r2, #2
	bcc	.Linswm8_by1

	ldmia	r0, {r3, r4}

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	str	r3, [r1], #4

	subs	r2, r2, #2
	beq	.Linswm8_done

	/* Final single short, stored as two bytes. */
.Linswm8_by1:
	cmp	r2, #1
	bcc	.Linswm8_done

	ldr	r3, [r0]
	subs	r2, r2, #1		/* sits in the load delay slot */
					/* XXX the resulting flags are never read */

	strb	r3, [r1], #1		/* bits 7:0 */
	lsr	r3, r3, #8
	strb	r3, [r1], #1		/* bits 15:8 */


.Linswm8_done:
	pop	{r4-r9, pc}		/* restore registers and return */
END(inswm8)
488
/*
 * outswm8: write short ints (16 bits) to an I/O address from a block of
 * memory.  The I/O address is assumed to be mapped multiple times in a block
 * of 8 words, so several consecutive words starting at r0 can be written
 * with one store-multiple.
 * The source address should be word aligned; if it is not, the call is
 * handed to outsw.
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length, counted in 16-bit transfers
 *
 * Returns at once when r2 <= 0.  r4-r8 and lr are saved on the stack.
 * Every word sent to the port carries one short in both of its halves.
 */

ENTRY(outswm8)
	/* A non-positive count means there is nothing to transfer. */
	cmp	r2, #0
	movle	pc, lr

	/* Only the source alignment is checked here, not the count. */
	tst	r1, #3

	bne	_C_LABEL(outsw)

	push	{r4-r8, lr}

	/*
	 * Eight shorts at a time.  Each loaded word (A)(B) is split by three
	 * EORs into (B)(B), left in the loaded register, and (A)(A), placed
	 * in the next register up, so the store-multiple emits the low short
	 * first.
	 */
.Loutswm8_by8:
	cmp	r2, #8
	bcc	.Loutswm8_by4

	ldmia	r1!, {r3, r5, r7, ip}

	eor	r4, r3, r3, lsr #16	/* r4 = (A)(A^B) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^A^B)(B) = (B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(A^B^B) = (A)(A) */

	eor	r6, r5, r5, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r5, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(A^B^B) = (A)(A) */

	eor	r8, r7, r7, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r7, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(A^B^B) = (A)(A) */

	eor	lr, ip, ip, lsr #16	/* lr = (A)(A^B) */
	eor	ip, ip, lr, lsl #16	/* ip = (A^A^B)(B) = (B)(B) */
	eor	lr, lr, ip, lsr #16	/* lr = (A)(A^B^B) = (A)(A) */

	stmia	r0, {r3-r8, ip, lr}

	subs	r2, r2, #8
	bne	.Loutswm8_by8
	beq	.Loutswm8_done		/* Z is necessarily set here */

	/* Four shorts: two words in, four words out via r5-r8. */
.Loutswm8_by4:
	cmp	r2, #4
	bcc	.Loutswm8_by2

	ldmia	r1!, {r3, r4}

	eor	r6, r3, r3, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r3, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(A^B^B) = (A)(A) */

	eor	r8, r4, r4, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r4, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(A^B^B) = (A)(A) */

	stmia	r0, {r5-r8}

	subs	r2, r2, #4
	beq	.Loutswm8_done

	/* Two shorts: one word in, two words out via r4 and r5. */
.Loutswm8_by2:
	cmp	r2, #2
	bcc	.Loutswm8_by1

	ldr	r3, [r1], #4		/* r3 = (A)(B) */
	subs	r2, r2, #2		/* done test sits in the load delay slot */

	eor	r5, r3, r3, lsr #16	/* r5 = (A)(A^B) */
	eor	r4, r3, r5, lsl #16	/* r4 = (A^A^B)(B) = (B)(B) */
	eor	r5, r5, r4, lsr #16	/* r5 = (A)(A^B^B) = (A)(A) */

	stmia	r0, {r4, r5}

	beq	.Loutswm8_done		/* flags still come from the subs above */

	/* Final single short, assembled from two byte loads. */
.Loutswm8_by1:
	cmp	r2, #1
	bcc	.Loutswm8_done

	ldrb	r3, [r1], #1		/* low byte */
	ldrb	r4, [r1], #1		/* high byte */
	subs	r2, r2, #1		/* sits in the load delay slot */
					/* XXX the resulting flags are never read */
	orr	r3, r3, r4, lsl #8	/* r3 = the 16-bit value */
	orr	r3, r3, r3, lsl #16	/* copy it into the upper half too */
	str	r3, [r0]

.Loutswm8_done:
	pop	{r4-r8, pc}		/* restore registers and return */
END(outswm8)
593
594