xref: /netbsd/sys/arch/arm/arm/blockio.S (revision c4a72b64)
1/*	$NetBSD: blockio.S,v 1.5 2002/08/15 01:38:16 briggs Exp $	*/
2
3/*
4 * Copyright (c) 2001 Ben Harris.
5 * Copyright (c) 1994 Mark Brinicombe.
6 * Copyright (c) 1994 Brini.
7 * All rights reserved.
8 *
9 * This code is derived from software written for Brini by Mark Brinicombe
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 *    must display the following acknowledgement:
21 *	This product includes software developed by Brini.
22 * 4. The name of the company nor the name of the author may be used to
23 *    endorse or promote products derived from this software without specific
24 *    prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
27 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
28 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
29 * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
30 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
31 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
32 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * RiscBSD kernel project
39 *
40 * blockio.S
41 *
42 * optimised block read/write from/to IO routines.
43 *
44 * Created      : 08/10/94
45 * Modified	: 22/01/99  -- R.Earnshaw
46 *			       Faster, and small tweaks for StrongARM
47 */
48
49#include <machine/asm.h>
50
51RCSID("$NetBSD: blockio.S,v 1.5 2002/08/15 01:38:16 briggs Exp $")
52
/*
 * read_multi_1
 *
 * Read bytes from an I/O address into a block of memory.
 *
 * r0 = address to read from (IO); never advanced
 * r1 = address to write to (memory); post-incremented as data is stored
 * r2 = length in bytes
 *
 * Scratch registers: r3 and r12.  A {fp, ip, lr, pc} frame is pushed on
 * entry and every exit unwinds it with "ldmdb fp, {fp, sp, pc}".
 *
 * Requests shorter than 4 bytes go straight to the tail at .Lrm1_l4.
 * Otherwise 1-3 single bytes are copied until r1 is word aligned, the
 * main loop packs four successive byte reads into one word (first read
 * in bits 0-7, last read in bits 24-31) and stores it with a single str,
 * and the tail copies whatever is left one byte at a time.
 * NOTE(review): packing the first byte into the low bits matches memory
 * byte order only for little-endian word stores -- confirm for any
 * big-endian configuration.
 */

/* This code will look very familiar if you've read _memcpy(). */
ENTRY(read_multi_1)
	mov	ip, sp
	stmfd	sp!, {fp, ip, lr, pc}
	sub	fp, ip, #4
	subs	r2, r2, #4		/* r2 = length - 4 */
	blt	.Lrm1_l4		/* less than 4 bytes */
	ands	r12, r1, #3		/* r12 = destination misalignment */
	beq	.Lrm1_main		/* aligned destination */
	rsb	r12, r12, #4		/* r12 = 1..3 bytes needed to align r1 */
	cmp	r12, #2			/* GE/GT below select bytes 2 and 3 */
	ldrb	r3, [r0]		/* first byte is always needed */
	strb	r3, [r1], #1
	ldrgeb	r3, [r0]		/* second byte when r12 >= 2 */
	strgeb	r3, [r1], #1
	ldrgtb	r3, [r0]		/* third byte when r12 == 3 */
	strgtb	r3, [r1], #1
	subs	r2, r2, r12		/* account for the alignment bytes */
	blt	.Lrm1_l4		/* fewer than 4 bytes remain */
.Lrm1_main:
.Lrm1loop:
	ldrb	r3, [r0]		/* byte 0 -> bits 0-7 */
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #8	/* byte 1 -> bits 8-15 */
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #16	/* byte 2 -> bits 16-23 */
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #24	/* byte 3 -> bits 24-31 */
	str	r3, [r1], #4
	subs	r2, r2, #4
	bge	.Lrm1loop
.Lrm1_l4:
	adds	r2, r2, #4		/* undo the bias: r2 = bytes still to copy */
	ldmeqdb	fp, {fp, sp, pc}	/* nothing left: unwind frame and return */
	/*
	 * The original followed the conditional return above with
	 * "moveq pc, r14".  That instruction could never execute (the EQ
	 * case has already returned through the frame), so it was dropped.
	 */
	cmp	r2, #2			/* GE/GT below select bytes 2 and 3 */
	ldrb	r3, [r0]
	strb	r3, [r1], #1
	ldrgeb	r3, [r0]
	strgeb	r3, [r1], #1
	ldrgtb	r3, [r0]
	strgtb	r3, [r1], #1
	ldmdb	fp, {fp, sp, pc}	/* unwind frame and return */
104
/*
 * write_multi_1
 *
 * Write bytes to an I/O address from a block of memory.
 *
 * r0 = address to write to (IO); never advanced
 * r1 = address to read from (memory); post-incremented as data is fetched
 * r2 = length in bytes
 *
 * Scratch registers: r3 and ip.  A {fp, ip, lr, pc} frame is pushed on
 * entry and each return unwinds it with "ldmdb fp, {fp, sp, pc}".
 *
 * Shape of the routine: short requests (< 4 bytes) use only the tail;
 * longer ones first emit 1-3 bytes to bring r1 to a word boundary, then
 * fetch a word at a time and emit it as four byte stores starting with
 * bits 0-7, then finish any leftover bytes in the tail.
 */

/* Structured like _memcpy(), with a fixed I/O address as destination. */
ENTRY(write_multi_1)
	mov	ip, sp
	stmfd	sp!, {fp, ip, lr, pc}	/* build the stack frame */
	sub	fp, ip, #4
	subs	r2, r2, #4		/* bias the count: r2 = length - 4 */
	blt	.Lwm1_tail		/* under 4 bytes in total */
	ands	ip, r1, #3		/* ip = source misalignment */
	beq	.Lwm1_words		/* source is already word aligned */
	rsb	ip, ip, #4		/* ip = 1..3 bytes up to the boundary */
	cmp	ip, #2			/* flags gate the 2nd and 3rd byte */
	ldrb	r3, [r1], #1		/* 1st byte: unconditional */
	strb	r3, [r0]
	ldrgeb	r3, [r1], #1		/* 2nd byte: only if ip >= 2 */
	strgeb	r3, [r0]
	ldrgtb	r3, [r1], #1		/* 3rd byte: only if ip == 3 */
	strgtb	r3, [r0]
	subs	r2, r2, ip		/* charge the alignment bytes */
	blt	.Lwm1_tail		/* not enough left for a whole word */
.Lwm1_words:
	ldr	r3, [r1], #4		/* pick up four source bytes at once */
	strb	r3, [r0]		/* bits 0-7 */
	mov	r3, r3, lsr #8
	strb	r3, [r0]		/* bits 8-15 */
	mov	r3, r3, lsr #8
	strb	r3, [r0]		/* bits 16-23 */
	mov	r3, r3, lsr #8
	strb	r3, [r0]		/* bits 24-31 */
	subs	r2, r2, #4
	bge	.Lwm1_words
.Lwm1_tail:
	adds	r2, r2, #4		/* remove the bias: r2 = bytes left */
	ldmeqdb	fp, {fp, sp, pc}	/* none left: unwind and return */
	cmp	r2, #2			/* flags gate the 2nd and 3rd byte */
	ldrb	r3, [r1], #1
	strb	r3, [r0]
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0]
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0]
	ldmdb	fp, {fp, sp, pc}	/* unwind and return */
155
/*
 * insw
 *
 * Reads short ints (16 bits) from an I/O address into a block of memory.
 *
 * r0 = address to read from (IO); never advanced
 * r1 = address to write to (memory); post-incremented
 * r2 = length, counted in 16-bit units
 *
 * Scratch registers: r3 and ip.  No stack is used.
 */

ENTRY(insw)
	/* A zero or negative count means there is nothing to transfer. */
	cmp	r2, #0
	movle	pc, lr

	/*
	 * The paired path is taken only when the count is even (bit 0 of
	 * r2 clear) and the destination is word aligned (low two bits of
	 * r1 clear); EQ after this pair of tests means both hold.
	 */
	tst	r2, #1
	tsteq	r1, #3
	beq	.Linsw_pairs

	/* General path: one read per item, stored as two single bytes. */
.Linsw_single:
	ldr	r3, [r0]
	subs	r2, r2, #1		/* loop test fills the load delay slot */
	strb	r3, [r1], #1		/* bits 0-7 go to the lower address */
	mov	r3, r3, lsr #8
	strb	r3, [r1], #1		/* bits 8-15 follow */
	bgt	.Linsw_single

	mov	pc, lr

	/*
	 * Paired path: two reads are merged into one word store.  Bits
	 * 16-31 of the load from r0 + 2 become the low half of the word
	 * and bits 0-15 of the load from r0 become the high half.
	 * NOTE(review): the first load relies on how this CPU handles a
	 * word access at a non-word-aligned address -- confirm before
	 * reusing on a core with different unaligned-access behaviour.
	 */
.Linsw_pairs:
	ldr	r3, [r0, #2]		/* take advantage of nonaligned
					 * word accesses */
	ldr	ip, [r0]
	mov	r3, r3, lsr #16		/* first item -> low half */
	orr	r3, r3, ip, lsl #16	/* second item -> high half */
	str	r3, [r1], #4
	subs	r2, r2, #2		/* two items consumed */
	bgt	.Linsw_pairs

	mov	pc, lr
202
203
/*
 * outsw
 *
 * Writes short ints (16 bits) from a block of memory to an I/O address.
 *
 * r0 = address to write to (IO); never advanced
 * r1 = address to read from (memory); post-incremented
 * r2 = length, counted in 16-bit units
 *
 * Every value handed to the device is a full word holding the same
 * 16-bit item in both its low and its high half.
 *
 * Scratch registers: r3 and ip.  No stack is used.
 */

ENTRY(outsw)
	/* A zero or negative count means there is nothing to transfer. */
	cmp	r2, #0
	movle	pc, lr

	/*
	 * The paired path is taken only when the count is even and the
	 * source pointer is word aligned; EQ after this pair of tests
	 * means both hold.
	 */
	tst	r2, #1
	tsteq	r1, #3
	beq	.Loutsw_pairs

	/* General path: assemble each item from two single-byte loads. */
.Loutsw_single:
	ldrb	r3, [r1], #1		/* byte at the lower address */
	ldrb	ip, [r1], #1		/* byte at the higher address */
	subs	r2, r2, #1		/* loop test fills the load delay slot */
	orr	r3, r3, ip, lsl #8	/* r3 = 16-bit item, low byte first */
	orr	r3, r3, r3, lsl #16	/* copy it into the high half too */
	str	r3, [r0]
	bgt	.Loutsw_single

	mov	pc, lr

	/*
	 * Paired path: load two items as one word (H)(L) and turn it into
	 * (L)(L) and (H)(H) with three exclusive-ors.  A shift/orr sequence
	 * (mov ..., lsl #16 / orr ..., lsr #16 per half) would produce the
	 * same two words.
	 */
.Loutsw_pairs:
	ldr	r3, [r1], #4		/* r3 = (H)(L) */
	subs	r2, r2, #2		/* loop test fills the load delay slot */

	eor	ip, r3, r3, lsr #16	/* ip = (H)(H^L) */
	eor	r3, r3, ip, lsl #16	/* r3 = (H^H^L)(L) = (L)(L) */
	eor	ip, ip, r3, lsr #16	/* ip = (H)(H^L^L) = (H)(H) */

	str	r3, [r0]		/* low item goes out first */
	str	ip, [r0]		/* then the high item */

	bgt	.Loutsw_pairs

	mov	pc, lr
263
/*
 * insw16
 *
 * Reads short ints (16 bits) from an I/O address into a block of memory,
 * eight items (16 bytes) per loop iteration.
 *
 * r0 = address to read from (IO); never advanced
 * r1 = address to write to (memory); advanced by the stmia write-back
 * r2 = length, counted in 16-bit units
 *
 * The unrolled loop is used only when the length is a multiple of 8
 * items (16 bytes) and the destination address is word aligned.  Any
 * other non-empty request is handed to insw before anything is pushed.
 *
 * r4, r5 and lr are saved on the stack; lr is then used as a scratch
 * register.  r3 and ip are used as scratch without being saved.
 */

ENTRY(insw16)
	/* A zero or negative length returns immediately. */
	cmp	r2, #0
	movle	pc, lr

	/* NE after this pair: length % 8 != 0, or r1 not word aligned. */
	tst	r2, #7
	tsteq	r1, #3

	bne	_C_LABEL(insw)		/* let the generic routine handle it */

	stmfd	sp!, {r4, r5, lr}

.Linsw16_next:
	/* Items 0 and 1 -> r3 */
	ldr	r3, [r0, #2]		/* take advantage of nonaligned
					 * word accesses */
	ldr	lr, [r0]
	mov	r3, r3, lsr #16		/* first item -> low half */
	orr	r3, r3, lr, lsl #16	/* second item -> high half */

	/* Items 2 and 3 -> r4 */
	ldr	r4, [r0, #2]
	ldr	lr, [r0]
	mov	r4, r4, lsr #16
	orr	r4, r4, lr, lsl #16

	/* Items 4 and 5 -> r5 */
	ldr	r5, [r0, #2]
	ldr	lr, [r0]
	mov	r5, r5, lsr #16
	orr	r5, r5, lr, lsl #16

	/* Items 6 and 7 -> ip */
	ldr	ip, [r0, #2]
	ldr	lr, [r0]
	mov	ip, ip, lsr #16
	orr	ip, ip, lr, lsl #16

	stmia	r1!, {r3-r5, ip}	/* store all four words at once */
	subs	r2, r2, #8		/* eight items consumed */
	bgt	.Linsw16_next

	ldmfd	sp!, {r4, r5, pc}	/* restore registers and return */
321
322
/*
 * outsw16
 *
 * Writes short ints (16 bits) from a block of memory to an I/O address,
 * eight items (16 bytes) per loop iteration.
 *
 * r0 = address to write to (IO); never advanced
 * r1 = address to read from (memory); advanced by the ldmia write-back
 * r2 = length, counted in 16-bit units
 *
 * The unrolled loop is used only when the length is a multiple of 8
 * items and the source address is word aligned.  Any other non-empty
 * request is handed to outsw before anything is pushed.
 *
 * Each source word (A)(B) is split with three exclusive-ors into
 * (B)(B) and (A)(A), which are written to the device in that order.
 *
 * r4, r5 and lr are saved on the stack; lr is then used as a data
 * register.  r3 and ip are used without being saved.
 */

ENTRY(outsw16)
	/* A zero or negative length returns immediately. */
	cmp	r2, #0
	movle	pc, lr

	/* NE after this pair: length % 8 != 0, or r1 not word aligned. */
	tst	r2, #7
	tsteq	r1, #3

	bne	_C_LABEL(outsw)		/* let the generic routine handle it */

	stmfd	sp!, {r4, r5, lr}

.Loutsw16_next:
	ldmia	r1!, {r4, r5, ip, lr}	/* fetch eight items as four words */

	/* Word 0, held in r4 */
	eor	r3, r4, r4, lsl #16	/* r3 = (A^B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	r4, [r0]

	/* Word 1, held in r5 */
	eor	r3, r5, r5, lsl #16	/* r3 = (A^B)(B) */
	eor	r5, r5, r3, lsr #16	/* r5 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r5, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	r5, [r0]

	/* Word 2, held in ip */
	eor	r3, ip, ip, lsl #16	/* r3 = (A^B)(B) */
	eor	ip, ip, r3, lsr #16	/* ip = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, ip, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	ip, [r0]

	/* Word 3, held in lr */
	eor	r3, lr, lr, lsl #16	/* r3 = (A^B)(B) */
	eor	lr, lr, r3, lsr #16	/* lr = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, lr, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	lr, [r0]

	subs	r2, r2, #8		/* eight items consumed */
	bgt	.Loutsw16_next

	ldmfd	sp!, {r4, r5, pc}	/* restore registers and return */
388
/*
 * inswm8
 *
 * Reads short ints (16 bits) from an I/O address into a block of memory.
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words, so up to eight items are fetched with a single ldmia from r0.
 * The destination address should be word aligned; if it is not, the
 * request is handed to insw before anything is pushed.
 *
 * r0 = address to read from (IO); never advanced
 * r1 = address to write to (memory); post-incremented
 * r2 = length, counted in 16-bit units
 *
 * r4-r9 and lr are saved on the stack; lr then holds the 0xffff0000
 * mask used to strip the high half of each word that was read.  r3 and
 * ip are used as scratch without being saved.
 *
 * Items are moved in groups of 8, then at most one group of 4, one
 * group of 2 and one single item.  Within each merged word the earlier
 * item occupies bits 0-15 and the later item bits 16-31.
 */

ENTRY(inswm8)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the destination address is not word aligned, let insw do the work */

	tst	r1, #0x00000003

	bne	_C_LABEL(insw)

/* Word aligned insw */

	stmfd	sp!, {r4-r9,lr}

	mov	lr, #0xff000000
	orr	lr, lr, #0x00ff0000	/* lr = 0xffff0000 */

.Linswm8_loop8:
	cmp	r2, #8
	bcc	.Linswm8_l8		/* fewer than 8 items left */

	ldmia	r0, {r3-r9,ip}		/* eight reads, no write-back */

	bic	r3, r3, lr		/* keep bits 0-15 of item 0 */
	orr	r3, r3, r4, lsl #16	/* r3 = items 0,1 */
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16	/* r4 = items 2,3 */
	bic	r7, r7, lr
	orr	r5, r7, r8, lsl #16	/* r5 = items 4,5 */
	bic	r9, r9, lr
	orr	r6, r9, ip, lsl #16	/* r6 = items 6,7 */

	stmia	r1!, {r3-r6}

	subs	r2, r2, #0x00000008	/* Next */
	bne	.Linswm8_loop8
	b	.Linswm8_l1		/* count reached zero: all done
					 * (was a "beq" that was always
					 * taken at this point) */

.Linswm8_l8:
	cmp	r2, #4
	bcc	.Linswm8_l4		/* fewer than 4 items left */

	ldmia	r0, {r3-r6}

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16	/* r3 = items 0,1 */
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16	/* r4 = items 2,3 */

	stmia	r1!, {r3-r4}

	subs	r2, r2, #0x00000004
	beq	.Linswm8_l1

.Linswm8_l4:
	cmp	r2, #2
	bcc	.Linswm8_l2		/* fewer than 2 items left */

	ldmia	r0, {r3-r4}

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16	/* r3 = items 0,1 */
	str	r3, [r1], #0x0004

	subs	r2, r2, #0x00000002
	beq	.Linswm8_l1

.Linswm8_l2:
	cmp	r2, #1
	bcc	.Linswm8_l1		/* nothing left */

	/*
	 * Last single item.  The original decremented r2 here with a
	 * flag-setting subs whose result and flags were never read before
	 * the return (the old XXX comment asked why); it has been removed.
	 */
	ldr	r3, [r0]

	strb	r3, [r1], #0x0001	/* bits 0-7 */
	mov	r3, r3, lsr #8
	strb	r3, [r1], #0x0001	/* bits 8-15 */


.Linswm8_l1:
	ldmfd	sp!, {r4-r9,pc}		/* And go home */
484
/*
 * outswm8
 *
 * Writes short ints (16 bits) to an I/O address from a block of memory.
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words, so up to eight items are written with a single stmia to r0.
 * The source address should be word aligned; if it is not, the request
 * is handed to outsw before anything is pushed.
 *
 * r0 = address to write to (IO); never advanced
 * r1 = address to read from (memory); post-incremented
 * r2 = length, counted in 16-bit units
 *
 * r4-r8 and lr are saved on the stack; lr is then used as a data
 * register.  r3 and ip are used without being saved.
 *
 * Each source word (A)(B) is split with three exclusive-ors into
 * (B)(B) and (A)(A); the (B)(B) word is placed in the lower-numbered
 * register so that stmia writes it first.  Items are moved in groups
 * of 8, then at most one group of 4, one group of 2 and one single item.
 */

ENTRY(outswm8)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the source address is not word aligned, let outsw do the work */

	tst	r1, #0x00000003

	bne	_C_LABEL(outsw)

/* Word aligned outsw */

	stmfd	sp!, {r4-r8,lr}

.Loutswm8_loop8:
	cmp	r2, #8
	bcc	.Loutswm8_l8		/* fewer than 8 items left */

	ldmia	r1!, {r3,r5,r7,ip}	/* eight items as four words */

	eor	r4, r3, r3, lsr #16	/* r4 = (A)(A^B) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^A^B)(B) = (B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(B^A^B) = (A)(A) */

	eor	r6, r5, r5, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r5, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(B^A^B) = (A)(A) */

	eor	r8, r7, r7, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r7, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(B^A^B) = (A)(A) */

	eor	lr, ip, ip, lsr #16	/* lr = (A)(A^B) */
	eor	ip, ip, lr, lsl #16	/* ip = (A^A^B)(B) = (B)(B) */
	eor	lr, lr, ip, lsr #16	/* lr = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r3-r8,ip,lr}	/* eight writes, no write-back */

	subs	r2, r2, #0x00000008	/* Next */
	bne	.Loutswm8_loop8
	b	.Loutswm8_l1		/* count reached zero: all done
					 * (was a "beq" that was always
					 * taken at this point) */

.Loutswm8_l8:
	cmp	r2, #4
	bcc	.Loutswm8_l4		/* fewer than 4 items left */

	ldmia	r1!, {r3-r4}

	eor	r6, r3, r3, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r3, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(B^A^B) = (A)(A) */

	eor	r8, r4, r4, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r4, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r5-r8}

	subs	r2, r2, #0x00000004
	beq	.Loutswm8_l1

.Loutswm8_l4:
	cmp	r2, #2
	bcc	.Loutswm8_l2		/* fewer than 2 items left */

	ldr	r3, [r1], #0x0004	/* r3 = (A)(B) */
	subs	r2, r2, #0x00000002	/* Done test in Load delay slot */

	eor	r5, r3, r3, lsr #16	/* r5 = (A)(A^B)*/
	eor	r4, r3, r5, lsl #16	/* r4 = (A^A^B)(B) = (B)(B) */
	eor	r5, r5, r4, lsr #16	/* r5 = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r4, r5}

	beq	.Loutswm8_l1		/* flags still those of the subs above */

.Loutswm8_l2:
	cmp	r2, #1
	bcc	.Loutswm8_l1		/* nothing left */

	/*
	 * Last single item, assembled from two byte loads.  The original
	 * decremented r2 here with a flag-setting subs whose result and
	 * flags were never read before the return (the old XXX comment
	 * noted this); it has been removed.
	 */
	ldrb	r3, [r1], #0x0001
	ldrb	r4, [r1], #0x0001
	orr	r3, r3, r4, lsl #8	/* r3 = 16-bit item, low byte first */
	orr	r3, r3, r3, lsl #16	/* copy it into the high half too */
	str	r3, [r0]

.Loutswm8_l1:
	ldmfd	sp!, {r4-r8,pc}		/* And go home */
588