/*	$NetBSD: blockio.S,v 1.5 2002/08/15 01:38:16 briggs Exp $	*/

/*-
 * Copyright (c) 2001 Ben Harris.
 * Copyright (c) 1994 Mark Brinicombe.
 * Copyright (c) 1994 Brini.
 * All rights reserved.
 *
 * This code is derived from software written for Brini by Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Brini.
 * 4. The name of the company nor the name of the author may be used to
 *    endorse or promote products derived from this software without specific
 *    prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * RiscBSD kernel project
 *
 * blockio.S
 *
 * optimised block read/write from/to IO routines.
 *
 * Created      : 08/10/94
 * Modified	: 22/01/99  -- R.Earnshaw
 *		  Faster, and small tweaks for StrongARM
 */

#include <machine/asm.h>

__FBSDID("$FreeBSD$");

	.syntax	unified

/*
 * Conventions used by every routine in this file:
 *
 *  - r0 always holds the I/O address.  It is never advanced; every
 *    access goes to [r0] (or a fixed offset from it).
 *  - r1 holds the memory buffer pointer and is post-incremented.
 *  - r2 holds the remaining length (bytes for the *_multi_1 routines,
 *    16-bit items for the insw and outsw families).
 *  - In the comments, (X)(Y) denotes a 32-bit word whose upper halfword
 *    is X and whose lower halfword is Y.
 *  - The order of the I/O accesses is part of the behaviour; do not
 *    reorder loads/stores that target [r0].
 */

/*
 * Read bytes from an I/O address into a block of memory
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length (in bytes)
 *
 * Scratch: r3, r12 (ip).  A four word frame {fp, ip, lr, pc} is pushed
 * on entry and the routine returns by reloading fp, sp and pc from it.
 */

/* This code will look very familiar if you've read _memcpy(). */
ENTRY(read_multi_1)
	mov	ip, sp			/* ip = caller's sp */
	stmfd	sp!, {fp, ip, lr, pc}
	sub	fp, ip, #4		/* fp -> saved pc slot */
	subs	r2, r2, #4		/* r2 = length - 4 */
	blt	.Lrm1_l4		/* less than 4 bytes */
	ands	r12, r1, #3		/* caller's sp is in the frame, so r12 is free */
	beq	.Lrm1_main		/* aligned destination */
	rsb	r12, r12, #4		/* r12 = bytes (1..3) needed to align r1 */
	cmp	r12, #2
	ldrb	r3, [r0]		/* always copy one byte */
	strb	r3, [r1], #1
	ldrbge	r3, [r0]		/* second byte if r12 >= 2 */
	strbge	r3, [r1], #1
	ldrbgt	r3, [r0]		/* third byte if r12 == 3 */
	strbgt	r3, [r1], #1
	subs	r2, r2, r12		/* account for the alignment bytes */
	blt	.Lrm1_l4
.Lrm1_main:
.Lrm1loop:
	/*
	 * Four byte reads are packed into one word, first byte in
	 * bits 0-7, and written with a single word store.
	 */
	ldrb	r3, [r0]
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #8
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #16
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #24
	str	r3, [r1], #4
	subs	r2, r2, #4
	bge	.Lrm1loop
.Lrm1_l4:
	adds	r2, r2, #4			/* r2 = length again */
	ldmdbeq	fp, {fp, sp, pc}		/* nothing left: return */
	/*
	 * An unreachable "RETeq" used to follow here; the conditional
	 * ldmdb above has already returned whenever EQ holds.
	 */
	cmp	r2, #2				/* 1..3 bytes remain */
	ldrb	r3, [r0]
	strb	r3, [r1], #1
	ldrbge	r3, [r0]
	strbge	r3, [r1], #1
	ldrbgt	r3, [r0]
	strbgt	r3, [r1], #1
	ldmdb	fp, {fp, sp, pc}
END(read_multi_1)

/*
 * Write bytes to an I/O address from a block of memory
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length (in bytes)
 *
 * Scratch: r3, r12 (ip).  Uses the same frame layout and return
 * sequence as read_multi_1.
 */

/* This code will look very familiar if you've read _memcpy(). */
ENTRY(write_multi_1)
	mov	ip, sp			/* ip = caller's sp */
	stmfd	sp!, {fp, ip, lr, pc}
	sub	fp, ip, #4		/* fp -> saved pc slot */
	subs	r2, r2, #4		/* r2 = length - 4 */
	blt	.Lwm1_l4		/* less than 4 bytes */
	ands	r12, r1, #3
	beq	.Lwm1_main		/* aligned source */
	rsb	r12, r12, #4		/* r12 = bytes (1..3) needed to align r1 */
	cmp	r12, #2
	ldrb	r3, [r1], #1		/* always copy one byte */
	strb	r3, [r0]
	ldrbge	r3, [r1], #1		/* second byte if r12 >= 2 */
	strbge	r3, [r0]
	ldrbgt	r3, [r1], #1		/* third byte if r12 == 3 */
	strbgt	r3, [r0]
	subs	r2, r2, r12		/* account for the alignment bytes */
	blt	.Lwm1_l4
.Lwm1_main:
.Lwm1loop:
	/*
	 * One word load from memory is emitted as four byte writes,
	 * bits 0-7 first.
	 */
	ldr	r3, [r1], #4
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	subs	r2, r2, #4
	bge	.Lwm1loop
.Lwm1_l4:
	adds	r2, r2, #4			/* r2 = length again */
	ldmdbeq	fp, {fp, sp, pc}		/* nothing left: return */
	cmp	r2, #2				/* 1..3 bytes remain */
	ldrb	r3, [r1], #1
	strb	r3, [r0]
	ldrbge	r3, [r1], #1
	strbge	r3, [r0]
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0]
	ldmdb	fp, {fp, sp, pc}
END(write_multi_1)

/*
 * Reads short ints (16 bits) from an I/O address into a block of memory
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length (number of 16-bit items; returns at once if <= 0)
 *
 * Scratch: r3, ip.
 */

ENTRY(insw)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the destination address and the size is word aligned, do it fast */

	tst	r2, #0x00000001		/* even number of items ... */
	tsteq	r1, #0x00000003		/* ... and r1 word aligned? */
	beq	.Lfastinsw

/* Non aligned insw */

.Linswloop:
	ldr	r3, [r0]
	subs	r2, r2, #0x00000001	/* Loop test in load delay slot */
	strb	r3, [r1], #0x0001	/* bits 0-7 */
	mov	r3, r3, lsr #8
	strb	r3, [r1], #0x0001	/* bits 8-15 */
	bgt	.Linswloop

	RET

/* Word aligned insw */

.Lfastinsw:

.Lfastinswloop:
	/*
	 * Two reads produce one stored word:
	 *   r3 = (first read >> 16) | (second read << 16)
	 * NOTE(review): what the load at [r0, #2] returns depends on the
	 * platform's handling of that access -- confirm on the target.
	 */
	ldr	r3, [r0, #0x0002]	/* take advantage of nonaligned
					 * word accesses */
	ldr	ip, [r0]
	mov	r3, r3, lsr #16		/* Put the two shorts together */
	orr	r3, r3, ip, lsl #16
	str	r3, [r1], #0x0004	/* Store */
	subs	r2, r2, #0x00000002	/* Next */
	bgt	.Lfastinswloop

	RET
END(insw)

/*
 * Writes short ints (16 bits) from a block of memory to an I/O address
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length (number of 16-bit items; returns at once if <= 0)
 *
 * Every item is written as a full word with the 16-bit value
 * replicated in both halves.
 *
 * Scratch: r3, ip.
 */

ENTRY(outsw)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the source address and the size is word aligned, do it fast */

	tst	r2, #0x00000001		/* even number of items ... */
	tsteq	r1, #0x00000003		/* ... and r1 word aligned? */
	beq	.Lfastoutsw

/* Non aligned outsw */

.Loutswloop:
	ldrb	r3, [r1], #0x0001	/* low byte */
	ldrb	ip, [r1], #0x0001	/* high byte */
	subs	r2, r2, #0x00000001	/* Loop test in load delay slot */
	orr	r3, r3, ip, lsl #8	/* r3 = 16-bit item */
	orr	r3, r3, r3, lsl #16	/* replicate into the upper half */
	str	r3, [r0]
	bgt	.Loutswloop

	RET

/* Word aligned outsw */

.Lfastoutsw:

.Lfastoutswloop:
	ldr	r3, [r1], #0x0004	/* r3 = (H)(L) */
	subs	r2, r2, #0x00000002	/* Loop test in load delay slot */

	eor	ip, r3, r3, lsr #16	/* ip = (H)(H^L) */
	eor	r3, r3, ip, lsl #16	/* r3 = (H^H^L)(L) = (L)(L) */
	eor	ip, ip, r3, lsr #16	/* ip = (H)(H^L^L) = (H)(H) */

	str	r3, [r0]		/* low item first */
	str	ip, [r0]

/*	mov	ip, r3, lsl #16
 *	orr	ip, ip, ip, lsr #16
 *	str	ip, [r0]
 *
 *	mov	ip, r3, lsr #16
 *	orr	ip, ip, ip, lsl #16
 *	str	ip, [r0]
 */

	bgt	.Lfastoutswloop

	RET
END(outsw)

/*
 * reads short ints (16 bits) from an I/O address into a block of memory
 * with a length guaranteed to be a multiple of 16 bytes
 * with a word aligned destination address
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length (number of 16-bit items; returns at once if <= 0)
 *
 * If r2 is not a multiple of 8 or r1 is not word aligned, control is
 * handed to insw with the arguments untouched.
 *
 * Scratch: r3, ip; r4, r5 and lr are saved on the stack and restored.
 */

ENTRY(insw16)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the destination address is word aligned and the size suitably
   aligned, do it fast */

	tst	r2, #0x00000007		/* multiple of 8 items ... */
	tsteq	r1, #0x00000003		/* ... and r1 word aligned? */

	bne	_C_LABEL(insw)		/* no: let insw handle it */

/* Word aligned insw */

	stmfd	sp!, {r4,r5,lr}

.Linsw16loop:
	/* Four read pairs, each combined exactly as in insw's fast loop. */
	ldr	r3, [r0, #0x0002]	/* take advantage of nonaligned
					 * word accesses */
	ldr	lr, [r0]
	mov	r3, r3, lsr #16		/* Put the two shorts together */
	orr	r3, r3, lr, lsl #16

	ldr	r4, [r0, #0x0002]	/* take advantage of nonaligned
					 * word accesses */
	ldr	lr, [r0]
	mov	r4, r4, lsr #16		/* Put the two shorts together */
	orr	r4, r4, lr, lsl #16

	ldr	r5, [r0, #0x0002]	/* take advantage of nonaligned
					 * word accesses */
	ldr	lr, [r0]
	mov	r5, r5, lsr #16		/* Put the two shorts together */
	orr	r5, r5, lr, lsl #16

	ldr	ip, [r0, #0x0002]	/* take advantage of nonaligned
					 * word accesses */
	ldr	lr, [r0]
	mov	ip, ip, lsr #16		/* Put the two shorts together */
	orr	ip, ip, lr, lsl #16

	stmia	r1!, {r3-r5,ip}		/* store 4 words = 8 items */
	subs	r2, r2, #0x00000008	/* Next */
	bgt	.Linsw16loop

	ldmfd	sp!, {r4,r5,pc}		/* Restore regs and go home */
END(insw16)

/*
 * Writes short ints (16 bits) from a block of memory to an I/O address
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length (number of 16-bit items; returns at once if <= 0)
 *
 * If r2 is not a multiple of 8 or r1 is not word aligned, control is
 * handed to outsw with the arguments untouched.
 *
 * Scratch: r3, ip; r4, r5 and lr are saved on the stack and restored.
 */

ENTRY(outsw16)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the source address is word aligned and the size suitably
   aligned, do it fast */

	tst	r2, #0x00000007		/* multiple of 8 items ... */
	tsteq	r1, #0x00000003		/* ... and r1 word aligned? */

	bne	_C_LABEL(outsw)		/* no: let outsw handle it */

/* Word aligned outsw */

	stmfd	sp!, {r4,r5,lr}

.Loutsw16loop:
	ldmia	r1!, {r4,r5,ip,lr}	/* 4 words = 8 items, each (A)(B) */

	eor	r3, r4, r4, lsl #16	/* r3 = (A^B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]		/* low item first */
	str	r4, [r0]

/*	mov	r3, r4, lsl #16
 *	orr	r3, r3, r3, lsr #16
 *	str	r3, [r0]
 *
 *	mov	r3, r4, lsr #16
 *	orr	r3, r3, r3, lsl #16
 *	str	r3, [r0]
 */

	eor	r3, r5, r5, lsl #16	/* r3 = (A^B)(B) */
	eor	r5, r5, r3, lsr #16	/* r5 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r5, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	r5, [r0]

	eor	r3, ip, ip, lsl #16	/* r3 = (A^B)(B) */
	eor	ip, ip, r3, lsr #16	/* ip = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, ip, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	ip, [r0]

	eor	r3, lr, lr, lsl #16	/* r3 = (A^B)(B) */
	eor	lr, lr, r3, lsr #16	/* lr = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, lr, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	lr, [r0]

	subs	r2, r2, #0x00000008
	bgt	.Loutsw16loop

	ldmfd	sp!, {r4,r5,pc}		/* and go home */
END(outsw16)

/*
 * reads short ints (16 bits) from an I/O address into a block of memory
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words.
 * The destination address should be word aligned.
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length (number of 16-bit items; returns at once if <= 0)
 *
 * Each word read from the I/O block contributes its low 16 bits.
 * If r1 is not word aligned, control is handed to insw with the
 * arguments untouched.
 *
 * Scratch: r3, ip; r4-r9 and lr are saved on the stack and restored.
 */

ENTRY(inswm8)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the destination address is not word aligned, fall back to insw */

	tst	r1, #0x00000003

	bne	_C_LABEL(insw)

/* Word aligned insw */

	stmfd	sp!, {r4-r9,lr}

	mov	lr, #0xff000000
	orr	lr, lr, #0x00ff0000	/* lr = 0xffff0000: upper halfword mask */

.Linswm8_loop8:
	cmp	r2, #8
	bcc	.Linswm8_l8		/* fewer than 8 items left */

	ldmia	r0, {r3-r9,ip}		/* 8 reads, one item each */

	bic	r3, r3, lr		/* keep low half of 1st read */
	orr	r3, r3, r4, lsl #16	/* 2nd read goes in the upper half */
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16
	bic	r7, r7, lr
	orr	r5, r7, r8, lsl #16
	bic	r9, r9, lr
	orr	r6, r9, ip, lsl #16

	stmia	r1!, {r3-r6}		/* 4 words = 8 items */

	subs	r2, r2, #0x00000008	/* Next */
	bne	.Linswm8_loop8
	beq	.Linswm8_l1

.Linswm8_l8:
	cmp	r2, #4
	bcc	.Linswm8_l4		/* fewer than 4 items left */

	ldmia	r0, {r3-r6}		/* 4 reads, one item each */

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16

	stmia	r1!, {r3-r4}

	subs	r2, r2, #0x00000004
	beq	.Linswm8_l1

.Linswm8_l4:
	cmp	r2, #2
	bcc	.Linswm8_l2		/* fewer than 2 items left */

	ldmia	r0, {r3-r4}		/* 2 reads, one item each */

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	str	r3, [r1], #0x0004

	subs	r2, r2, #0x00000002
	beq	.Linswm8_l1

.Linswm8_l2:
	cmp	r2, #1
	bcc	.Linswm8_l1		/* nothing left */

	ldr	r3, [r0]		/* last single item */
	subs	r2, r2, #0x00000001	/* Test in load delay slot */
					/* XXX, why don't we use result?  */

	strb	r3, [r1], #0x0001	/* bits 0-7 */
	mov	r3, r3, lsr #8
	strb	r3, [r1], #0x0001	/* bits 8-15 */


.Linswm8_l1:
	ldmfd	sp!, {r4-r9,pc}		/* And go home */
END(inswm8)

/*
 * write short ints (16 bits) to an I/O address from a block of memory
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words.
 * The source address should be word aligned.
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length (number of 16-bit items; returns at once if <= 0)
 *
 * Every item is written as a full word with the 16-bit value
 * replicated in both halves.  If r1 is not word aligned, control is
 * handed to outsw with the arguments untouched.
 *
 * Scratch: r3, ip; r4-r8 and lr are saved on the stack and restored.
 */

ENTRY(outswm8)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the source address is not word aligned, fall back to outsw */

	tst	r1, #0x00000003

	bne	_C_LABEL(outsw)

/* Word aligned outsw */

	stmfd	sp!, {r4-r8,lr}

.Loutswm8_loop8:
	cmp	r2, #8
	bcc	.Loutswm8_l8		/* fewer than 8 items left */

	ldmia	r1!, {r3,r5,r7,ip}	/* 4 words = 8 items, each (A)(B) */

	eor	r4, r3, r3, lsr #16	/* r4 = (A)(A^B) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^A^B)(B) = (B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(B^A^B) = (A)(A) */

	eor	r6, r5, r5, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r5, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(B^A^B) = (A)(A) */

	eor	r8, r7, r7, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r7, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(B^A^B) = (A)(A) */

	eor	lr, ip, ip, lsr #16	/* lr = (A)(A^B) */
	eor	ip, ip, lr, lsl #16	/* ip = (A^A^B)(B) = (B)(B) */
	eor	lr, lr, ip, lsr #16	/* lr = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r3-r8,ip,lr}	/* 8 writes, low item of each pair first */

	subs	r2, r2, #0x00000008	/* Next */
	bne	.Loutswm8_loop8
	beq	.Loutswm8_l1

.Loutswm8_l8:
	cmp	r2, #4
	bcc	.Loutswm8_l4		/* fewer than 4 items left */

	ldmia	r1!, {r3-r4}		/* 2 words = 4 items */

	eor	r6, r3, r3, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r3, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(B^A^B) = (A)(A) */

	eor	r8, r4, r4, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r4, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r5-r8}		/* 4 writes */

	subs	r2, r2, #0x00000004
	beq	.Loutswm8_l1

.Loutswm8_l4:
	cmp	r2, #2
	bcc	.Loutswm8_l2		/* fewer than 2 items left */

	ldr	r3, [r1], #0x0004	/* r3 = (A)(B) */
	subs	r2, r2, #0x00000002	/* Done test in Load delay slot */

	eor	r5, r3, r3, lsr #16	/* r5 = (A)(A^B)*/
	eor	r4, r3, r5, lsl #16	/* r4 = (A^A^B)(B) = (B)(B) */
	eor	r5, r5, r4, lsr #16	/* r5 = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r4, r5}		/* 2 writes */

	beq	.Loutswm8_l1		/* flags still from the subs above */

.Loutswm8_l2:
	cmp	r2, #1
	bcc	.Loutswm8_l1		/* nothing left */

	ldrb	r3, [r1], #0x0001	/* low byte of the last item */
	ldrb	r4, [r1], #0x0001	/* high byte */
	subs	r2, r2, #0x00000001	/* Done test in load delay slot */
					/* XXX This test isn't used?  */
	orr	r3, r3, r4, lsl #8	/* r3 = 16-bit item */
	orr	r3, r3, r3, lsl #16	/* replicate into the upper half */
	str	r3, [r0]

.Loutswm8_l1:
	ldmfd	sp!, {r4-r8,pc}		/* And go home */
END(outswm8)