/*	$NetBSD: blockio.S,v 1.5 2002/08/15 01:38:16 briggs Exp $	*/

/*-
 * Copyright (c) 2001 Ben Harris.
 * Copyright (c) 1994 Mark Brinicombe.
 * Copyright (c) 1994 Brini.
 * All rights reserved.
 *
 * This code is derived from software written for Brini by Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Brini.
 * 4. The name of the company nor the name of the author may be used to
 *    endorse or promote products derived from this software without specific
 *    prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * RiscBSD kernel project
 *
 * blockio.S
 *
 * optimised block read/write from/to IO routines.
 *
 * Created      : 08/10/94
 * Modified	: 22/01/99  -- R.Earnshaw
 *				Faster, and small tweaks for StrongARM
 */

#include <machine/asm.h>
	.syntax	unified

/*
 * Read bytes from an I/O address into a block of memory
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length (bytes)
 *
 * Every byte is fetched from the same address in r0; r1 is advanced.
 * Once the destination is word aligned, four byte reads are packed into
 * one word store.
 * Scratch: r3, r12 (ip).  An fp/ip/lr/pc frame is pushed on entry and
 * unwound with ldmdb on exit.
 */

/* This code will look very familiar if you've read _memcpy(). */
ENTRY(read_multi_1)
	mov	ip, sp
	stmfd	sp!, {fp, ip, lr, pc}
	sub	fp, ip, #4
	subs	r2, r2, #4		/* r2 = length - 4 */
	blt	.Lrm1_l4		/* less than 4 bytes */
	ands	r12, r1, #3
	beq	.Lrm1_main		/* aligned destination */
	rsb	r12, r12, #4		/* r12 = 1..3 bytes up to alignment */
	cmp	r12, #2			/* always 1 byte; ge: 2nd; gt: 3rd */
	ldrb	r3, [r0]
	strb	r3, [r1], #1
	ldrbge	r3, [r0]
	strbge	r3, [r1], #1
	ldrbgt	r3, [r0]
	strbgt	r3, [r1], #1
	subs	r2, r2, r12		/* account for the alignment bytes */
	blt	.Lrm1_l4
.Lrm1_main:
.Lrm1loop:
	/* Pack four successive byte reads into r3, first byte lowest. */
	ldrb	r3, [r0]
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #8
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #16
	ldrb	r12, [r0]
	orr	r3, r3, r12, lsl #24
	str	r3, [r1], #4
	subs	r2, r2, #4
	bge	.Lrm1loop
.Lrm1_l4:
	adds	r2, r2, #4		/* r2 = length again */
	ldmdbeq	fp, {fp, sp, pc}	/* nothing left: unwind and return */
	cmp	r2, #2			/* 1..3 left: always 1; ge: 2; gt: 3 */
	ldrb	r3, [r0]
	strb	r3, [r1], #1
	ldrbge	r3, [r0]
	strbge	r3, [r1], #1
	ldrbgt	r3, [r0]
	strbgt	r3, [r1], #1
	ldmdb	fp, {fp, sp, pc}
END(read_multi_1)

/*
 * Write bytes to an I/O address from a block of memory
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length (bytes)
 *
 * Every byte is stored to the same address in r0; r1 is advanced.
 * Once the source is word aligned, one word load feeds four byte stores.
 * Scratch: r3, r12 (ip).  An fp/ip/lr/pc frame is pushed on entry and
 * unwound with ldmdb on exit.
 */

/* This code will look very familiar if you've read _memcpy(). */
ENTRY(write_multi_1)
	mov	ip, sp
	stmfd	sp!, {fp, ip, lr, pc}
	sub	fp, ip, #4
	subs	r2, r2, #4		/* r2 = length - 4 */
	blt	.Lwm1_l4		/* less than 4 bytes */
	ands	r12, r1, #3
	beq	.Lwm1_main		/* aligned source */
	rsb	r12, r12, #4		/* r12 = 1..3 bytes up to alignment */
	cmp	r12, #2			/* always 1 byte; ge: 2nd; gt: 3rd */
	ldrb	r3, [r1], #1
	strb	r3, [r0]
	ldrbge	r3, [r1], #1
	strbge	r3, [r0]
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0]
	subs	r2, r2, r12		/* account for the alignment bytes */
	blt	.Lwm1_l4
.Lwm1_main:
.Lwm1loop:
	/* Load one word, then emit its bytes lowest-first. */
	ldr	r3, [r1], #4
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	mov	r3, r3, lsr #8
	strb	r3, [r0]
	subs	r2, r2, #4
	bge	.Lwm1loop
.Lwm1_l4:
	adds	r2, r2, #4		/* r2 = length again */
	ldmdbeq	fp, {fp, sp, pc}	/* nothing left: unwind and return */
	cmp	r2, #2			/* 1..3 left: always 1; ge: 2; gt: 3 */
	ldrb	r3, [r1], #1
	strb	r3, [r0]
	ldrbge	r3, [r1], #1
	strbge	r3, [r0]
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0]
	ldmdb	fp, {fp, sp, pc}
END(write_multi_1)

/*
 * Reads short ints (16 bits) from an I/O address into a block of memory
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length (number of 16-bit transfers)
 *
 * Returns immediately if r2 <= 0 (signed).  Scratch: r3, ip.
 */

ENTRY(insw)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the destination address is word aligned and the count is even, do it fast */

	tst	r2, #0x00000001
	tsteq	r1, #0x00000003
	beq	.Lfastinsw

/* Non aligned insw: one halfword per iteration, stored as two bytes */

.Linswloop:
	ldr	r3, [r0]
	subs	r2, r2, #0x00000001	/* Loop test in load delay slot */
	strb	r3, [r1], #0x0001
	mov	r3, r3, lsr #8
	strb	r3, [r1], #0x0001
	bgt	.Linswloop

	RET

/* Word aligned insw: two halfwords per iteration, stored as one word */

.Lfastinsw:

.Lfastinswloop:
	/*
	 * NOTE(review): the [r0, #2] load depends on how the target core
	 * treats a word load from a halfword-aligned address - confirm.
	 */
	ldr	r3, [r0, #0x0002]	/* take advantage of nonaligned
					 * word accesses */
	ldr	ip, [r0]
	mov	r3, r3, lsr #16		/* Put the two shorts together */
	orr	r3, r3, ip, lsl #16
	str	r3, [r1], #0x0004	/* Store */
	subs	r2, r2, #0x00000002	/* Next */
	bgt	.Lfastinswloop

	RET
END(insw)

/*
 * Writes short ints (16 bits) from a block of memory to an I/O address
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length (number of 16-bit transfers)
 *
 * Returns immediately if r2 <= 0 (signed).  Each halfword is written as a
 * word with the halfword duplicated in both halves.  Scratch: r3, ip.
 */

ENTRY(outsw)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the source address is word aligned and the count is even, do it fast */

	tst	r2, #0x00000001
	tsteq	r1, #0x00000003
	beq	.Lfastoutsw

/* Non aligned outsw: build each halfword from two byte loads */

.Loutswloop:
	ldrb	r3, [r1], #0x0001
	ldrb	ip, [r1], #0x0001
	subs	r2, r2, #0x00000001	/* Loop test in load delay slot */
	orr	r3, r3, ip, lsl #8
	orr	r3, r3, r3, lsl #16	/* duplicate into the top half */
	str	r3, [r0]
	bgt	.Loutswloop

	RET

/* Word aligned outsw: one word load yields two I/O writes */

.Lfastoutsw:

.Lfastoutswloop:
	ldr	r3, [r1], #0x0004	/* r3 = (H)(L) */
	subs	r2, r2, #0x00000002	/* Loop test in load delay slot */

	eor	ip, r3, r3, lsr #16	/* ip = (H)(H^L) */
	eor	r3, r3, ip, lsl #16	/* r3 = (H^H^L)(L) = (L)(L) */
	eor	ip, ip, r3, lsr #16	/* ip = (H)(H^L^L) = (H)(H) */

	str	r3, [r0]
	str	ip, [r0]

/*	mov	ip, r3, lsl #16
 *	orr	ip, ip, ip, lsr #16
 *	str	ip, [r0]
 *
 *	mov	ip, r3, lsr #16
 *	orr	ip, ip, ip, lsl #16
 *	str	ip, [r0]
 */

	bgt	.Lfastoutswloop

	RET
END(outsw)

/*
 * reads short ints (16 bits) from an I/O address into a block of memory
 * with a length guaranteed to be a multiple of 16 bytes
 * with a word aligned destination address
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length (number of 16-bit transfers)
 *
 * If r2 is not a multiple of 8 or r1 is not word aligned, the request is
 * handed to insw instead.  Scratch: r3, ip; r4, r5 and lr are saved on
 * the stack and restored on return.
 */

ENTRY(insw16)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the destination address is word aligned and the size suitably
   aligned, do it fast */

	tst	r2, #0x00000007
	tsteq	r1, #0x00000003

	bne	_C_LABEL(insw)

/* Word aligned insw: eight halfwords (four words) per iteration */

	stmfd	sp!, {r4,r5,lr}

.Linsw16loop:
	ldr	r3, [r0, #0x0002]	/* take advantage of nonaligned
					 * word accesses */
	ldr	lr, [r0]
	mov	r3, r3, lsr #16		/* Put the two shorts together */
	orr	r3, r3, lr, lsl #16

	ldr	r4, [r0, #0x0002]	/* take advantage of nonaligned
					 * word accesses */
	ldr	lr, [r0]
	mov	r4, r4, lsr #16		/* Put the two shorts together */
	orr	r4, r4, lr, lsl #16

	ldr	r5, [r0, #0x0002]	/* take advantage of nonaligned
					 * word accesses */
	ldr	lr, [r0]
	mov	r5, r5, lsr #16		/* Put the two shorts together */
	orr	r5, r5, lr, lsl #16

	ldr	ip, [r0, #0x0002]	/* take advantage of nonaligned
					 * word accesses */
	ldr	lr, [r0]
	mov	ip, ip, lsr #16		/* Put the two shorts together */
	orr	ip, ip, lr, lsl #16

	stmia	r1!, {r3-r5,ip}
	subs	r2, r2, #0x00000008	/* Next */
	bgt	.Linsw16loop

	ldmfd	sp!, {r4,r5,pc}		/* Restore regs and go home */
END(insw16)

/*
 * Writes short ints (16 bits) from a block of memory to an I/O address
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length (number of 16-bit transfers)
 *
 * If r2 is not a multiple of 8 or r1 is not word aligned, the request is
 * handed to outsw instead.  Scratch: r3, ip; r4, r5 and lr are saved on
 * the stack and restored on return.
 */

ENTRY(outsw16)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the source address is word aligned and the size suitably
   aligned, do it fast */

	tst	r2, #0x00000007
	tsteq	r1, #0x00000003

	bne	_C_LABEL(outsw)

/* Word aligned outsw: four words in, eight I/O writes out per iteration */

	stmfd	sp!, {r4,r5,lr}

.Loutsw16loop:
	ldmia	r1!, {r4,r5,ip,lr}

	/* r4 = (A)(B) on entry to this group */
	eor	r3, r4, r4, lsl #16	/* r3 = (A^B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	r4, [r0]

/*	mov	r3, r4, lsl #16
 *	orr	r3, r3, r3, lsr #16
 *	str	r3, [r0]
 *
 *	mov	r3, r4, lsr #16
 *	orr	r3, r3, r3, lsl #16
 *	str	r3, [r0]
 */

	/* r5 = (A)(B) on entry to this group */
	eor	r3, r5, r5, lsl #16	/* r3 = (A^B)(B) */
	eor	r5, r5, r3, lsr #16	/* r5 = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, r5, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	r5, [r0]

	/* ip = (A)(B) on entry to this group */
	eor	r3, ip, ip, lsl #16	/* r3 = (A^B)(B) */
	eor	ip, ip, r3, lsr #16	/* ip = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, ip, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	ip, [r0]

	/* lr = (A)(B) on entry to this group */
	eor	r3, lr, lr, lsl #16	/* r3 = (A^B)(B) */
	eor	lr, lr, r3, lsr #16	/* lr = (A)(B^A^B) = (A)(A) */
	eor	r3, r3, lr, lsl #16	/* r3 = (A^B^A)(B) = (B)(B) */
	str	r3, [r0]
	str	lr, [r0]

	subs	r2, r2, #0x00000008
	bgt	.Loutsw16loop

	ldmfd	sp!, {r4,r5,pc}		/* and go home */
END(outsw16)

/*
 * reads short ints (16 bits) from an I/O address into a block of memory
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words.
 * The destination address should be word aligned.
 *
 * r0 = address to read from (IO)
 * r1 = address to write to (memory)
 * r2 = length (number of 16-bit transfers)
 *
 * If r1 is not word aligned the request is handed to insw instead.
 * Transfers run in groups of 8, then 4, then 2, then a final single
 * halfword.  Scratch: r3, ip; r4-r9 and lr are saved on the stack and
 * restored on return.
 */

ENTRY(inswm8)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the destination address is word aligned, do it fast */

	tst	r1, #0x00000003

	bne	_C_LABEL(insw)

/* Word aligned insw */

	stmfd	sp!, {r4-r9,lr}

	mov	lr, #0xff000000		/* lr = 0xffff0000: mask for the */
	orr	lr, lr, #0x00ff0000	/* top half of each word read */

.Linswm8_loop8:
	cmp	r2, #8
	bcc	.Linswm8_l8

	ldmia	r0, {r3-r9,ip}		/* 8 reads from consecutive words */

	/* Pair the low halves: earlier read low, later read high. */
	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16
	bic	r7, r7, lr
	orr	r5, r7, r8, lsl #16
	bic	r9, r9, lr
	orr	r6, r9, ip, lsl #16

	stmia	r1!, {r3-r6}

	subs	r2, r2, #0x00000008	/* Next */
	bne	.Linswm8_loop8
	beq	.Linswm8_l1

.Linswm8_l8:
	cmp	r2, #4
	bcc	.Linswm8_l4

	ldmia	r0, {r3-r6}

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	bic	r5, r5, lr
	orr	r4, r5, r6, lsl #16

	stmia	r1!, {r3-r4}

	subs	r2, r2, #0x00000004
	beq	.Linswm8_l1

.Linswm8_l4:
	cmp	r2, #2
	bcc	.Linswm8_l2

	ldmia	r0, {r3-r4}

	bic	r3, r3, lr
	orr	r3, r3, r4, lsl #16
	str	r3, [r1], #0x0004

	subs	r2, r2, #0x00000002
	beq	.Linswm8_l1

.Linswm8_l2:
	cmp	r2, #1
	bcc	.Linswm8_l1

	/*
	 * Exactly one halfword is left.  We return straight after storing
	 * it, so the count is not updated here.
	 */
	ldr	r3, [r0]

	strb	r3, [r1], #0x0001
	mov	r3, r3, lsr #8
	strb	r3, [r1], #0x0001


.Linswm8_l1:
	ldmfd	sp!, {r4-r9,pc}		/* And go home */
END(inswm8)

/*
 * write short ints (16 bits) to an I/O address from a block of memory
 * The I/O address is assumed to be mapped multiple times in a block of
 * 8 words.
 * The source address should be word aligned.
 *
 * r0 = address to write to (IO)
 * r1 = address to read from (memory)
 * r2 = length (number of 16-bit transfers)
 *
 * If r1 is not word aligned the request is handed to outsw instead.
 * Transfers run in groups of 8, then 4, then 2, then a final single
 * halfword.  Each halfword is written as a word with the halfword
 * duplicated in both halves.  Scratch: r3, ip; r4-r8 and lr are saved on
 * the stack and restored on return.
 */

ENTRY(outswm8)
/* Make sure that we have a positive length */
	cmp	r2, #0x00000000
	movle	pc, lr

/* If the source address is word aligned, do it fast */

	tst	r1, #0x00000003

	bne	_C_LABEL(outsw)

/* Word aligned outsw */

	stmfd	sp!, {r4-r8,lr}

.Loutswm8_loop8:
	cmp	r2, #8
	bcc	.Loutswm8_l8

	ldmia	r1!, {r3,r5,r7,ip}	/* each register = (A)(B) */

	eor	r4, r3, r3, lsr #16	/* r4 = (A)(A^B) */
	eor	r3, r3, r4, lsl #16	/* r3 = (A^A^B)(B) = (B)(B) */
	eor	r4, r4, r3, lsr #16	/* r4 = (A)(B^A^B) = (A)(A) */

	eor	r6, r5, r5, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r5, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(B^A^B) = (A)(A) */

	eor	r8, r7, r7, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r7, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(B^A^B) = (A)(A) */

	eor	lr, ip, ip, lsr #16	/* lr = (A)(A^B) */
	eor	ip, ip, lr, lsl #16	/* ip = (A^A^B)(B) = (B)(B) */
	eor	lr, lr, ip, lsr #16	/* lr = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r3-r8,ip,lr}	/* 8 writes to consecutive words */

	subs	r2, r2, #0x00000008	/* Next */
	bne	.Loutswm8_loop8
	beq	.Loutswm8_l1

.Loutswm8_l8:
	cmp	r2, #4
	bcc	.Loutswm8_l4

	ldmia	r1!, {r3-r4}

	eor	r6, r3, r3, lsr #16	/* r6 = (A)(A^B) */
	eor	r5, r3, r6, lsl #16	/* r5 = (A^A^B)(B) = (B)(B) */
	eor	r6, r6, r5, lsr #16	/* r6 = (A)(B^A^B) = (A)(A) */

	eor	r8, r4, r4, lsr #16	/* r8 = (A)(A^B) */
	eor	r7, r4, r8, lsl #16	/* r7 = (A^A^B)(B) = (B)(B) */
	eor	r8, r8, r7, lsr #16	/* r8 = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r5-r8}

	subs	r2, r2, #0x00000004
	beq	.Loutswm8_l1

.Loutswm8_l4:
	cmp	r2, #2
	bcc	.Loutswm8_l2

	ldr	r3, [r1], #0x0004	/* r3 = (A)(B) */
	subs	r2, r2, #0x00000002	/* Done test in Load delay slot */

	eor	r5, r3, r3, lsr #16	/* r5 = (A)(A^B)*/
	eor	r4, r3, r5, lsl #16	/* r4 = (A^A^B)(B) = (B)(B) */
	eor	r5, r5, r4, lsr #16	/* r5 = (A)(B^A^B) = (A)(A) */

	stmia	r0, {r4, r5}

	beq	.Loutswm8_l1		/* flags still from the subs above */

.Loutswm8_l2:
	cmp	r2, #1
	bcc	.Loutswm8_l1

	/*
	 * Exactly one halfword is left.  We return straight after writing
	 * it, so the count is not updated here.
	 */
	ldrb	r3, [r1], #0x0001
	ldrb	r4, [r1], #0x0001
	orr	r3, r3, r4, lsl #8
	orr	r3, r3, r3, lsl #16	/* duplicate into the top half */
	str	r3, [r0]

.Loutswm8_l1:
	ldmfd	sp!, {r4-r8,pc}		/* And go home */
END(outswm8)