xref: /freebsd/sys/arm/arm/blockio.S (revision f05cddf9)
1/*	$NetBSD: blockio.S,v 1.5 2002/08/15 01:38:16 briggs Exp $	*/
2
3/*-
4 * Copyright (c) 2001 Ben Harris.
5 * Copyright (c) 1994 Mark Brinicombe.
6 * Copyright (c) 1994 Brini.
7 * All rights reserved.
8 *
9 * This code is derived from software written for Brini by Mark Brinicombe
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 *    must display the following acknowledgement:
21 *	This product includes software developed by Brini.
22 * 4. The name of the company nor the name of the author may be used to
23 *    endorse or promote products derived from this software without specific
24 *    prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
27 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
28 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
29 * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
30 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
31 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
32 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * RiscBSD kernel project
39 *
40 * blockio.S
41 *
42 * optimised block read/write from/to IO routines.
43 *
44 * Created      : 08/10/94
45 * Modified	: 22/01/99  -- R.Earnshaw
46 *			       Faster, and small tweaks for StrongARM
47 */
48
49#include <machine/asm.h>
50
51__FBSDID("$FreeBSD$");
52
/*
 * read_multi_1 - read bytes from an I/O address into a block of memory
 *
 * r0 = address to read from (IO).  Never advanced: every byte is loaded
 *      from this one location.
 * r1 = address to write to (memory).  Post-incremented as data is stored.
 * r2 = length, in bytes.
 *
 * r3 and r12 (ip) are used as scratch.  A frame {fp, ip, lr, pc} is pushed
 * on entry; each exit path unwinds it with "ldm..db fp, {fp, sp, pc}",
 * which restores fp and sp and returns by loading the saved lr into pc.
 */

/* This code will look very familiar if you've read _memcpy(). */
ENTRY(read_multi_1)
	mov	ip, sp
	stmfd	sp!, {fp, ip, lr, pc}
	sub	fp, ip, #4
	subs	r2, r2, #4		/* r2 = length - 4 */
	blt	.Lrm1_l4		/* less than 4 bytes */
	ands	r12, r1, #3
	beq	.Lrm1_main		/* aligned destination */

	/* Store 1-3 single bytes so that r1 becomes word aligned. */
	rsb	r12, r12, #4		/* r12 = bytes up to the next word */
	cmp	r12, #2			/* flags pick the 2nd/3rd byte below */
	ldrb	r3, [r0]
	strb	r3, [r1], #1
	ldrgeb	r3, [r0]		/* only when r12 >= 2 */
	strgeb	r3, [r1], #1
	ldrgtb	r3, [r0]		/* only when r12 == 3 */
	strgtb	r3, [r1], #1
	subs	r2, r2, r12		/* discount the alignment bytes */
	blt	.Lrm1_l4		/* fewer than 4 bytes left */

	/*
	 * Main loop: four byte reads are packed into r3 (first byte read in
	 * bits 7:0, fourth in bits 31:24) and written with one word store.
	 */
.Lrm1_main:
.Lrm1loop:
	ldrb	r3, [r0]
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #8
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #16
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #24
	str	r3, [r1], #4
	subs	r2, r2, #4
	bge	.Lrm1loop

.Lrm1_l4:
	adds	r2, r2, #4		/* r2 = length again */
	ldmeqdb	fp, {fp, sp, pc}	/* nothing left: unwind and return */

	/*
	 * r2 is non-zero here.  The first byte is copied unconditionally;
	 * the compare below enables a second (r2 >= 2) and third (r2 > 2).
	 */
	cmp	r2, #2
	ldrb	r3, [r0]
	strb	r3, [r1], #1
	ldrgeb	r3, [r0]
	strgeb	r3, [r1], #1
	ldrgtb	r3, [r0]
	strgtb	r3, [r1], #1
	ldmdb	fp, {fp, sp, pc}	/* unwind and return */
END(read_multi_1)
105
/*
 * write_multi_1 - write a block of memory, one byte at a time, to an I/O
 * address
 *
 * r0 = address to write to (IO); the same address is used for every store
 * r1 = address to read from (memory); advanced as data is consumed
 * r2 = length in bytes
 *
 * Scratch registers: r3, ip.  The frame pushed on entry is popped by the
 * "ldm..ea fp, {fp, sp, pc}" on each exit path, which also returns.
 */

/* The align / bulk / tail structure mirrors _memcpy(). */
ENTRY(write_multi_1)
	mov	ip, sp
	stmdb	sp!, {fp, ip, lr, pc}	/* build the frame */
	sub	fp, ip, #4
	subs	r2, r2, #4		/* bias the count by one word */
	blt	.Lwm1_tail		/* under 4 bytes: tail only */
	ands	ip, r1, #3		/* ip = source misalignment */
	beq	.Lwm1_words		/* source already word aligned */

	/* Emit 4 - misalignment bytes (1..3) to align the source pointer. */
	rsb	ip, ip, #4
	cmp	ip, #2
	ldrb	r3, [r1], #1		/* always at least one */
	strb	r3, [r0]
	ldrgeb	r3, [r1], #1		/* second byte when ip >= 2 */
	strgeb	r3, [r0]
	ldrgtb	r3, [r1], #1		/* third byte when ip == 3 */
	strgtb	r3, [r0]
	subs	r2, r2, ip		/* take them off the biased count */
	blt	.Lwm1_tail

	/*
	 * Bulk loop: fetch one word from memory and push its four bytes to
	 * the I/O address, bits 7:0 first.  The count is updated straight
	 * after the load; nothing before the bge alters the flags.
	 */
.Lwm1_words:
	ldr	r3, [r1], #4
	subs	r2, r2, #4
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	bge	.Lwm1_words

.Lwm1_tail:
	adds	r2, r2, #4		/* undo the bias: r2 = bytes left */
	ldmeqea	fp, {fp, sp, pc}	/* zero left: pop frame, return */

	/* One byte is always written here; up to two more follow. */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0]
	ldrgeb	r3, [r1], #1		/* r2 >= 2 */
	strgeb	r3, [r0]
	ldrgtb	r3, [r1], #1		/* r2 > 2 */
	strgtb	r3, [r0]
	ldmea	fp, {fp, sp, pc}	/* pop frame, return */
END(write_multi_1)
157
/*
 * insw - read 16-bit values from an I/O address into a block of memory
 *
 * r0 = address to read from (IO); not modified
 * r1 = address to write to (memory); advanced by two bytes per value
 * r2 = length, counted in 16-bit values
 *
 * Returns immediately when r2 <= 0.  Scratch registers: r3, r12.
 */

ENTRY(insw)
	cmp	r2, #0			/* nothing to do for a count <= 0 */
	movle	pc, lr

	/*
	 * The paired path needs an even count AND a word-aligned
	 * destination; tsteq only runs when the count test came out even.
	 */
	tst	r2, #1
	tsteq	r1, #3
	beq	.Linsw_paired

	/* General path: one value per pass, stored as two single bytes. */
.Linsw_single:
	ldr	r3, [r0]
	subs	r2, r2, #1		/* count down while the load completes */
	strb	r3, [r1], #1		/* bits 7:0 */
	mov	r3, r3, lsr #8
	strb	r3, [r1], #1		/* bits 15:8 */
	bgt	.Linsw_single

	RET

	/*
	 * Paired path: two values per pass, merged into one word store.
	 * The first load uses address r0 + 2 (the original author notes
	 * that this takes advantage of nonaligned word accesses); only its
	 * bits 31:16 are kept, and they become the low half of the result.
	 * The low 16 bits of the second load become the high half.
	 */
.Linsw_paired:
	ldr	r3, [r0, #2]
	ldr	r12, [r0]
	mov	r3, r3, lsr #16
	orr	r3, r3, r12, lsl #16
	str	r3, [r1], #4
	subs	r2, r2, #2
	bgt	.Linsw_paired

	RET
END(insw)
205
/*
 * outsw - write 16-bit values from a block of memory to an I/O address
 *
 * r0 = address to write to (IO); not modified
 * r1 = address to read from (memory); advanced by two bytes per value
 * r2 = length, counted in 16-bit values
 *
 * Returns immediately when r2 <= 0.  Scratch registers: r3, r12.
 * Every store to the I/O address is a full word holding the 16-bit value
 * replicated in both halves.
 */

ENTRY(outsw)
	cmp	r2, #0			/* nothing to do for a count <= 0 */
	movle	pc, lr

	/*
	 * The paired path needs an even count AND a word-aligned source;
	 * tsteq only runs when the count test came out even.
	 */
	tst	r2, #1
	tsteq	r1, #3
	beq	.Loutsw_paired

	/* General path: build each value from two single-byte loads. */
.Loutsw_single:
	ldrb	r3, [r1], #1		/* low byte */
	ldrb	r12, [r1], #1		/* high byte */
	subs	r2, r2, #1		/* count down while the loads complete */
	orr	r3, r3, r12, lsl #8	/* r3 = 16-bit value */
	orr	r3, r3, r3, lsl #16	/* copy it into the top half too */
	str	r3, [r0]
	bgt	.Loutsw_single

	RET

	/*
	 * Paired path: one word load supplies two values, r3 = (H)(L).
	 * Three exclusive-ors turn that into r3 = (L)(L) and r12 = (H)(H)
	 * without a further temporary; L is written first, then H.
	 * (A mov/orr pair per half would produce the same two words.)
	 */
.Loutsw_paired:
	ldr	r3, [r1], #4
	subs	r2, r2, #2		/* count down while the load completes */

	eor	r12, r3, r3, lsr #16	/* r12 = (H)(H^L) */
	eor	r3, r3, r12, lsl #16	/* r3  = (L)(L)   */
	eor	r12, r12, r3, lsr #16	/* r12 = (H)(H)   */

	str	r3, [r0]
	str	r12, [r0]

	bgt	.Loutsw_paired

	RET
END(outsw)
266
/*
 * insw16 - read 16-bit values from an I/O address into a block of memory,
 * eight values (16 bytes) per pass
 *
 * Intended for a length that is a multiple of 16 bytes and a word-aligned
 * destination.  If the count is not a multiple of 8 or the destination is
 * not word aligned, the work is handed to insw with the arguments intact.
 *
 * r0 = address to read from (IO); not modified
 * r1 = address to write to (memory); advanced by 16 bytes per pass
 * r2 = length, counted in 16-bit values
 *
 * Returns immediately when r2 <= 0.  r4 and r5 are saved and restored;
 * r3, r12 and r14 are used as scratch.
 */

ENTRY(insw16)
	cmp	r2, #0			/* nothing to do for a count <= 0 */
	movle	pc, lr

	/* Count must be a multiple of 8 and r1 word aligned, else use insw. */
	tst	r2, #7
	tsteq	r1, #3

	bne	_C_LABEL(insw)

	stmdb	sp!, {r4, r5, r14}

	/*
	 * Each group below produces one output word from two loads: bits
	 * 31:16 of the load at r0 + 2 become the low half, and the low 16
	 * bits of the load at r0 become the high half.  (The original author
	 * notes that the offset load takes advantage of nonaligned word
	 * accesses.)
	 */
.Linsw16_pass:
	ldr	r3, [r0, #2]		/* word 0 -> r3 */
	ldr	r14, [r0]
	mov	r3, r3, lsr #16
	orr	r3, r3, r14, lsl #16

	ldr	r4, [r0, #2]		/* word 1 -> r4 */
	ldr	r14, [r0]
	mov	r4, r4, lsr #16
	orr	r4, r4, r14, lsl #16

	ldr	r5, [r0, #2]		/* word 2 -> r5 */
	ldr	r14, [r0]
	mov	r5, r5, lsr #16
	orr	r5, r5, r14, lsl #16

	ldr	r12, [r0, #2]		/* word 3 -> r12 */
	ldr	r14, [r0]
	mov	r12, r12, lsr #16
	orr	r12, r12, r14, lsl #16

	stmia	r1!, {r3, r4, r5, r12}	/* store all four words at once */
	subs	r2, r2, #8
	bgt	.Linsw16_pass

	ldmia	sp!, {r4, r5, pc}	/* restore r4/r5 and return */
END(insw16)
325
/*
 * outsw16 - write 16-bit values from a block of memory to an I/O address,
 * eight values (16 bytes) per pass
 *
 * If the count is not a multiple of 8 or the source is not word aligned,
 * the work is handed to outsw with the arguments intact.
 *
 * r0 = address to write to (IO); not modified
 * r1 = address to read from (memory); advanced by 16 bytes per pass
 * r2 = length, counted in 16-bit values
 *
 * Returns immediately when r2 <= 0.  r4 and r5 are saved and restored;
 * r3, r12 and r14 are used as scratch.
 */

ENTRY(outsw16)
	cmp	r2, #0			/* nothing to do for a count <= 0 */
	movle	pc, lr

	/* Count must be a multiple of 8 and r1 word aligned, else use outsw. */
	tst	r2, #7
	tsteq	r1, #3

	bne	_C_LABEL(outsw)

	stmdb	sp!, {r4, r5, r14}

	/*
	 * Four source words are loaded per pass.  Each word W = (A)(B) is
	 * split with three exclusive-ors into r3 = (B)(B) and W = (A)(A);
	 * the low value (B)(B) is written first, then (A)(A).
	 * (A mov/orr pair per half would produce the same two words.)
	 * The count is updated right after the load; nothing before the bgt
	 * alters the flags.
	 */
.Loutsw16_pass:
	ldmia	r1!, {r4, r5, r12, r14}
	subs	r2, r2, #8

	eor	r3, r4, r4, lsl #16	/* r3 = (A^B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(A)   */
	eor	r3, r3, r4, lsl #16	/* r3 = (B)(B)   */
	str	r3, [r0]
	str	r4, [r0]

	eor	r3, r5, r5, lsl #16	/* r3 = (A^B)(B) */
	eor	r5, r5, r3, lsr #16	/* r5 = (A)(A)   */
	eor	r3, r3, r5, lsl #16	/* r3 = (B)(B)   */
	str	r3, [r0]
	str	r5, [r0]

	eor	r3, r12, r12, lsl #16	/* r3  = (A^B)(B) */
	eor	r12, r12, r3, lsr #16	/* r12 = (A)(A)   */
	eor	r3, r3, r12, lsl #16	/* r3  = (B)(B)   */
	str	r3, [r0]
	str	r12, [r0]

	eor	r3, r14, r14, lsl #16	/* r3  = (A^B)(B) */
	eor	r14, r14, r3, lsr #16	/* r14 = (A)(A)   */
	eor	r3, r3, r14, lsl #16	/* r3  = (B)(B)   */
	str	r3, [r0]
	str	r14, [r0]

	bgt	.Loutsw16_pass

	ldmia	sp!, {r4, r5, pc}	/* restore r4/r5 and return */
END(outsw16)
392
/*
 * inswm8 - read 16-bit values from an I/O address into a block of memory
 * using multi-word loads
 *
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words, so up to eight consecutive words starting at r0 are loaded in
 * one instruction.  Only the low 16 bits of each loaded word are used.
 * The destination should be word aligned; when it is not, the work is
 * handed to insw with the arguments intact.
 *
 * r0 = address to read from (IO); not modified
 * r1 = address to write to (memory); advanced as data is stored
 * r2 = length, counted in 16-bit values
 *
 * Returns immediately when r2 <= 0.  r4-r9 are saved and restored; r3,
 * r12 and r14 are used as scratch.
 */

ENTRY(inswm8)
	cmp	r2, #0			/* nothing to do for a count <= 0 */
	movle	pc, lr

	tst	r1, #3			/* unaligned destination: use insw */

	bne	_C_LABEL(insw)

	stmdb	sp!, {r4-r9, r14}

	mov	r14, #0xff000000	/* r14 = 0xffff0000: clears the top */
	orr	r14, r14, #0x00ff0000	/* half of a loaded word via bic    */

	/* Eight values per pass while at least eight remain. */
.Linswm8_by8:
	cmp	r2, #8
	bcc	.Linswm8_by4

	ldmia	r0, {r3-r9, r12}

	/* Pack pairs: even word -> low half, odd word -> high half. */
	bic	r3, r3, r14
	orr	r3, r3, r4, lsl #16
	bic	r5, r5, r14
	orr	r4, r5, r6, lsl #16
	bic	r7, r7, r14
	orr	r5, r7, r8, lsl #16
	bic	r9, r9, r14
	orr	r6, r9, r12, lsl #16

	stmia	r1!, {r3-r6}

	subs	r2, r2, #8
	bne	.Linswm8_by8
	b	.Linswm8_done		/* count reached exactly zero */

	/* Fewer than eight left: four values if at least four remain. */
.Linswm8_by4:
	cmp	r2, #4
	bcc	.Linswm8_by2

	ldmia	r0, {r3-r6}

	bic	r3, r3, r14
	orr	r3, r3, r4, lsl #16
	bic	r5, r5, r14
	orr	r4, r5, r6, lsl #16

	stmia	r1!, {r3, r4}

	subs	r2, r2, #4
	beq	.Linswm8_done

	/* Two values if at least two remain. */
.Linswm8_by2:
	cmp	r2, #2
	bcc	.Linswm8_by1

	ldmia	r0, {r3, r4}

	bic	r3, r3, r14
	orr	r3, r3, r4, lsl #16
	str	r3, [r1], #4

	subs	r2, r2, #2
	beq	.Linswm8_done

	/* A final single value, stored as two bytes (low byte first). */
.Linswm8_by1:
	cmp	r2, #1
	bcc	.Linswm8_done

	ldr	r3, [r0]
	subs	r2, r2, #1		/* flags from this are not consumed */

	strb	r3, [r1], #1
	mov	r3, r3, lsr #8
	strb	r3, [r1], #1

.Linswm8_done:
	ldmia	sp!, {r4-r9, pc}	/* restore r4-r9 and return */
END(inswm8)
489
/*
 * outswm8 - write 16-bit values from a block of memory to an I/O address
 * using multi-word stores
 *
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words, so up to eight consecutive words starting at r0 are stored in
 * one instruction.  Each stored word holds one 16-bit value replicated in
 * both halves.  The source should be word aligned; when it is not, the
 * work is handed to outsw with the arguments intact.
 *
 * r0 = address to write to (IO); not modified
 * r1 = address to read from (memory); advanced as data is consumed
 * r2 = length, counted in 16-bit values
 *
 * Returns immediately when r2 <= 0.  r4-r8 are saved and restored; r3,
 * r12 and lr are used as scratch.
 */

ENTRY(outswm8)
	cmp	r2, #0			/* nothing to do for a count <= 0 */
	movle	pc, lr

	tst	r1, #3			/* unaligned source: use outsw */

	bne	_C_LABEL(outsw)

	stmdb	sp!, {r4-r8, lr}

	/*
	 * Throughout: a source word (A)(B) is split by three exclusive-ors
	 * into a (B)(B) word and an (A)(A) word, with (B)(B) placed in the
	 * lower-numbered register so that it is stored first.
	 */

	/* Eight values (four source words) per pass while >= 8 remain. */
.Loutswm8_by8:
	cmp	r2, #8
	bcc	.Loutswm8_by4

	ldmia	r1!, {r3, r5, r7, r12}

	eor	r4, r3, r3, lsr #16	/* r4 = (A)(A^B) */
	eor	r3, r3, r4, lsl #16	/* r3 = (B)(B)   */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(A)   */

	eor	r6, r5, r5, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r5, r6, lsl #16	/* r5 = (B)(B)   */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(A)   */

	eor	r8, r7, r7, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r7, r8, lsl #16	/* r7 = (B)(B)   */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(A)   */

	eor	lr, r12, r12, lsr #16	/* lr  = (A)(A^B) */
	eor	r12, r12, lr, lsl #16	/* r12 = (B)(B)   */
	eor	lr, lr, r12, lsr #16	/* lr  = (A)(A)   */

	stmia	r0, {r3-r8, r12, lr}

	subs	r2, r2, #8
	bne	.Loutswm8_by8
	b	.Loutswm8_done		/* count reached exactly zero */

	/* Fewer than eight left: four values if at least four remain. */
.Loutswm8_by4:
	cmp	r2, #4
	bcc	.Loutswm8_by2

	ldmia	r1!, {r3, r4}

	eor	r6, r3, r3, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r3, r6, lsl #16	/* r5 = (B)(B)   */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(A)   */

	eor	r8, r4, r4, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r4, r8, lsl #16	/* r7 = (B)(B)   */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(A)   */

	stmia	r0, {r5-r8}

	subs	r2, r2, #4
	beq	.Loutswm8_done

	/* Two values if at least two remain. */
.Loutswm8_by2:
	cmp	r2, #2
	bcc	.Loutswm8_by1

	ldr	r3, [r1], #4		/* r3 = (A)(B) */
	subs	r2, r2, #2		/* flags are consumed by the beq below */

	eor	r5, r3, r3, lsr #16	/* r5 = (A)(A^B) */
	eor	r4, r3, r5, lsl #16	/* r4 = (B)(B)   */
	eor	r5, r5, r4, lsr #16	/* r5 = (A)(A)   */

	stmia	r0, {r4, r5}

	beq	.Loutswm8_done

	/* A final single value, assembled from two byte loads. */
.Loutswm8_by1:
	cmp	r2, #1
	bcc	.Loutswm8_done

	ldrb	r3, [r1], #1		/* low byte */
	ldrb	r4, [r1], #1		/* high byte */
	subs	r2, r2, #1		/* flags from this are not consumed */
	orr	r3, r3, r4, lsl #8	/* r3 = 16-bit value */
	orr	r3, r3, r3, lsl #16	/* copy it into the top half too */
	str	r3, [r0]

.Loutswm8_done:
	ldmia	sp!, {r4-r8, pc}	/* restore r4-r8 and return */
END(outswm8)
594
595