1/* $NetBSD: bcopyinout.S,v 1.9 2002/10/13 14:54:47 bjh21 Exp $ */ 2 3/* 4 * Copyright (c) 2002 Wasabi Systems, Inc. 5 * All rights reserved. 6 * 7 * Written by Allen Briggs for Wasabi Systems, Inc. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. All advertising materials mentioning features or use of this software 18 * must display the following acknowledgement: 19 * This product includes software developed for the NetBSD Project by 20 * Wasabi Systems, Inc. 21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 22 * or promote products derived from this software without specific prior 23 * written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 35 * POSSIBILITY OF SUCH DAMAGE. 
 */

#include "opt_multiprocessor.h"

#include "assym.h"

#include <machine/asm.h>

RCSID("$NetBSD: bcopyinout.S,v 1.9 2002/10/13 14:54:47 bjh21 Exp $")

	.text
	.align	0

/*
 * Literal pool: pointer used to locate the current PCB, so the copy
 * routines can install a fault handler in pcb_onfault.
 */
#ifdef MULTIPROCESSOR
.Lcpu_info:
	.word	_C_LABEL(cpu_info)	/* array of cpu_info pointers, indexed by cpu number */
#else
.Lcurpcb:
	.word	_C_LABEL(curpcb)	/* address of the current PCB pointer */
#endif

/*
 * Callee-saved register save/restore for the copy routines.
 * In 26-bit mode (!__PROG32) R14_svc must also be preserved, since a
 * page fault taken mid-copy will trample it.
 */
#ifdef __PROG32
#define SAVE_REGS	stmfd sp!, {r4-r11}
#define RESTORE_REGS	ldmfd sp!, {r4-r11}
#else
/* Need to save R14_svc because it'll get trampled if we take a page fault. */
#define SAVE_REGS	stmfd sp!, {r4-r11, r14}
#define RESTORE_REGS	ldmfd sp!, {r4-r11, r14}
#endif

/*
 * Optional cache prefetch hint.  Currently compiled out ("#if 0");
 * when enabled on XScale it emits a "pld" for the given base+offset.
 * HELLOCPP is a trick to get a literal '#' through the C preprocessor.
 */
#if 0 && defined(__XSCALE__)
#define HELLOCPP #
#define PREFETCH(rx,o)	pld	[ rx , HELLOCPP (o) ]
#else
#define PREFETCH(rx,o)
#endif

/*
 * r0 = user space address
 * r1 = kernel space address
 * r2 = length
 *
 * Copies bytes from user space to kernel space.
 *
 * User-side accesses use ldrbt/ldrt so the loads are performed with
 * user-mode permissions; a faulting access vectors to .Lcopyfault via
 * pcb_onfault.
 *
 * We save/restore r4-r11:
 * r4-r11 are scratch
 *
 * Register roles while copying:
 *   r0 = src (user), r1 = dst (kernel), r2 = bytes remaining,
 *   r4 = current PCB, r5 = saved pcb_onfault, r6-r11 = data.
 */
ENTRY(copyin)
	/* Quick exit if length is zero */
	teq	r2, #0
	moveq	r0, #0
	moveq	pc, lr

	SAVE_REGS
#ifdef MULTIPROCESSOR
	/* XXX Probably not appropriate for non-Hydra SMPs */
	/* Find this CPU's PCB: cpu_info[cpu_number()]->ci_curpcb */
	stmfd	sp!, {r0-r2, r14}
	bl	_C_LABEL(cpu_number)
	ldr	r4, .Lcpu_info
	ldr	r4, [r4, r0, lsl #2]
	ldr	r4, [r4, #CI_CURPCB]
	ldmfd	sp!, {r0-r2, r14}
#else
	ldr	r4, .Lcurpcb
	ldr	r4, [r4]
#endif

	/*
	 * Save the previous onfault handler and install .Lcopyfault so a
	 * page fault during the copy aborts cleanly instead of panicking.
	 */
	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If not too many bytes, take the slow (bytewise) path.
	 */
	cmp	r2, #0x08
	blt	.Licleanup

	/*
	 * Align destination to word boundary.
	 *
	 * Jump table: pc reads as this-instruction+8, which is the first
	 * .word below, so index 0 (already aligned) selects .Lialend and
	 * index N (N = r1 & 3) falls into the entry that copies the 4-N
	 * bytes needed to reach alignment.  The "b .Lialend" is never
	 * executed; it only pads the slot between the ldr and the table.
	 */
	and	r6, r1, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lialend
	.word	.Lialend
	.word	.Lial3
	.word	.Lial2
	.word	.Lial1
.Lial3:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lial2:	ldrbt	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lial1:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lialend:

	/*
	 * If few bytes left, finish slow.
	 */
	cmp	r2, #0x08
	blt	.Licleanup

	/*
	 * If source is not word aligned, finish slow (ldrt needs an
	 * aligned address for word loads).
	 */
	ands	r3, r0, #0x03
	bne	.Licleanup

	cmp	r2, #0x60	/* Must be > 0x5f for unrolled cacheline */
	blt	.Licleanup8

	/*
	 * Align destination to cacheline boundary.
	 * If source and destination are nicely aligned, this can be a big
	 * win.  If not, it's still cheaper to copy in groups of 32 even if
	 * we don't get the nice cacheline alignment.
	 *
	 * r1 is already word aligned here, so r6 = r1 & 0x1f is a multiple
	 * of 4 and is used directly as a byte offset into the jump table:
	 * offset 4 means 28 bytes are needed to reach the boundary, etc.
	 */
	and	r6, r1, #0x1f
	ldr	pc, [pc, r6]
	b	.Licaligned
	.word	.Licaligned
	.word	.Lical28
	.word	.Lical24
	.word	.Lical20
	.word	.Lical16
	.word	.Lical12
	.word	.Lical8
	.word	.Lical4
.Lical28:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lical24:ldrt	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lical20:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lical16:ldrt	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lical12:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lical8:ldrt	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lical4:ldrt	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4

	/*
	 * We start with > 0x40 bytes to copy (>= 0x60 got us into this
	 * part of the code, and we may have knocked that down by as much
	 * as 0x1c getting aligned).
	 *
	 * This loop basically works out to:
	 * do {
	 *	prefetch-next-cacheline(s)
	 *	bytes -= 0x20;
	 *	copy cacheline
	 * } while (bytes >= 0x40);
	 * bytes -= 0x20;
	 * copy cacheline
	 */
.Licaligned:
	PREFETCH(r0, 32)
	PREFETCH(r1, 32)

	sub	r2, r2, #0x20

	/*
	 * Copy a cacheline (32 bytes): user-mode word loads, then burst
	 * stores to the kernel side with stmia.  The loads are interleaved
	 * with the first stmia so r10/r11 can be reused for the last pair.
	 */
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	ldrt	r6, [r0], #4
	ldrt	r7, [r0], #4
	ldrt	r8, [r0], #4
	ldrt	r9, [r0], #4
	stmia	r1!, {r10-r11}
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	stmia	r1!, {r6-r11}

	cmp	r2, #0x40
	bge	.Licaligned

	sub	r2, r2, #0x20

	/* Copy the final full cacheline of the unrolled section */
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	ldrt	r6, [r0], #4
	ldrt	r7, [r0], #4
	ldrt	r8, [r0], #4
	ldrt	r9, [r0], #4
	stmia	r1!, {r10-r11}
	ldrt	r10, [r0], #4
	ldrt	r11, [r0], #4
	stmia	r1!, {r6-r11}

	cmp	r2, #0x08
	blt	.Liprecleanup

	/* Copy remaining words two at a time while >= 8 bytes remain */
.Licleanup8:
	ldrt	r8, [r0], #4
	ldrt	r9, [r0], #4
	sub	r2, r2, #8
	stmia	r1!, {r8, r9}
	cmp	r2, #8
	bge	.Licleanup8

.Liprecleanup:
	/*
	 * If we're done, bail.
	 *
	 * NOTE(review): this branches to copyout's .Lout rather than our
	 * own .Liout; the two epilogues are instruction-for-instruction
	 * identical (r4/r5 hold the same things), so behavior is the same,
	 * but .Liout would be the consistent target.
	 */
	cmp	r2, #0
	beq	.Lout

	/*
	 * Bytewise tail: r6 = r2 & 3 indexes the table; index 0 (a
	 * multiple of 4 left) enters at .Lic4 and copies 4 bytes per
	 * pass, index N enters so that exactly N bytes are copied.
	 * The final subs + bne loops until r2 reaches zero.
	 */
.Licleanup:
	and	r6, r2, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Licend
	.word	.Lic4
	.word	.Lic1
	.word	.Lic2
	.word	.Lic3
.Lic4:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lic3:	ldrbt	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lic2:	ldrbt	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lic1:	ldrbt	r7, [r0], #1
	subs	r2, r2, #1
	strb	r7, [r1], #1
.Licend:
	bne	.Licleanup

.Liout:
	/* Success: restore the previous onfault handler, return 0 */
	mov	r0, #0

	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	mov	pc, lr

.Lcopyfault:
	/*
	 * Page-fault landing pad (shared by copyin/copyout/kcopy).
	 * The trap handler transfers here via pcb_onfault; r0 is
	 * presumably set to the error code (EFAULT) by the fault
	 * handler before resuming — confirm against the abort handler.
	 * Restore the previous onfault handler and return.
	 */
	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	mov	pc, lr

/*
 * r0 = kernel space address
 * r1 = user space address
 * r2 = length
 *
 * Copies bytes from kernel space to user space.
 *
 * Mirror image of copyin: kernel-side loads are plain ldr/ldrb/ldmia,
 * user-side stores use strbt/strt so they are checked with user-mode
 * permissions; faults vector to .Lcopyfault via pcb_onfault.
 *
 * We save/restore r4-r11:
 * r4-r11 are scratch
 */

ENTRY(copyout)
	/* Quick exit if length is zero */
	teq	r2, #0
	moveq	r0, #0
	moveq	pc, lr

	SAVE_REGS
#ifdef MULTIPROCESSOR
	/* XXX Probably not appropriate for non-Hydra SMPs */
	/* Find this CPU's PCB: cpu_info[cpu_number()]->ci_curpcb */
	stmfd	sp!, {r0-r2, r14}
	bl	_C_LABEL(cpu_number)
	ldr	r4, .Lcpu_info
	ldr	r4, [r4, r0, lsl #2]
	ldr	r4, [r4, #CI_CURPCB]
	ldmfd	sp!, {r0-r2, r14}
#else
	ldr	r4, .Lcurpcb
	ldr	r4, [r4]
#endif

	/* Install the fault handler, saving the previous one in r5 */
	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If not too many bytes, take the slow (bytewise) path.
	 */
	cmp	r2, #0x08
	blt	.Lcleanup

	/*
	 * Align destination to word boundary.
	 * Same jump-table idiom as copyin: index r1 & 3 selects how many
	 * single bytes to copy to reach word alignment.
	 */
	and	r6, r1, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lalend
	.word	.Lalend
	.word	.Lal3
	.word	.Lal2
	.word	.Lal1
.Lal3:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lal2:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strbt	r7, [r1], #1
.Lal1:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lalend:

	/*
	 * If few bytes left, finish slow.
	 */
	cmp	r2, #0x08
	blt	.Lcleanup

	/*
	 * If source is not word aligned, finish slow.
	 */
	ands	r3, r0, #0x03
	bne	.Lcleanup

	cmp	r2, #0x60	/* Must be > 0x5f for unrolled cacheline */
	blt	.Lcleanup8

	/*
	 * Align source & destination to cacheline boundary.
	 * r1 is word aligned here, so r6 = r1 & 0x1f is a multiple of 4
	 * used directly as a byte offset into the jump table.
	 */
	and	r6, r1, #0x1f
	ldr	pc, [pc, r6]
	b	.Lcaligned
	.word	.Lcaligned
	.word	.Lcal28
	.word	.Lcal24
	.word	.Lcal20
	.word	.Lcal16
	.word	.Lcal12
	.word	.Lcal8
	.word	.Lcal4
.Lcal28:ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4
.Lcal24:ldr	r7, [r0], #4
	sub	r2, r2, #4
	strt	r7, [r1], #4
.Lcal20:ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4
.Lcal16:ldr	r7, [r0], #4
	sub	r2, r2, #4
	strt	r7, [r1], #4
.Lcal12:ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4
.Lcal8:	ldr	r7, [r0], #4
	sub	r2, r2, #4
	strt	r7, [r1], #4
.Lcal4:	ldr	r6, [r0], #4
	sub	r2, r2, #4
	strt	r6, [r1], #4

	/*
	 * We start with > 0x40 bytes to copy (>= 0x60 got us into this
	 * part of the code, and we may have knocked that down by as much
	 * as 0x1c getting aligned).
	 *
	 * This loop basically works out to:
	 * do {
	 *	prefetch-next-cacheline(s)
	 *	bytes -= 0x20;
	 *	copy cacheline
	 * } while (bytes >= 0x40);
	 * bytes -= 0x20;
	 * copy cacheline
	 */
.Lcaligned:
	PREFETCH(r0, 32)
	PREFETCH(r1, 32)

	sub	r2, r2, #0x20

	/*
	 * Copy a cacheline (32 bytes): burst-load from the kernel side
	 * with ldmia, store word-by-word with strt (user permissions).
	 * r6/r7 are stored first so they can be reloaded for the tail.
	 */
	ldmia	r0!, {r6-r11}
	strt	r6, [r1], #4
	strt	r7, [r1], #4
	ldmia	r0!, {r6-r7}
	strt	r8, [r1], #4
	strt	r9, [r1], #4
	strt	r10, [r1], #4
	strt	r11, [r1], #4
	strt	r6, [r1], #4
	strt	r7, [r1], #4

	cmp	r2, #0x40
	bge	.Lcaligned

	sub	r2, r2, #0x20

	/* Copy the final full cacheline of the unrolled section */
	ldmia	r0!, {r6-r11}
	strt	r6, [r1], #4
	strt	r7, [r1], #4
	ldmia	r0!, {r6-r7}
	strt	r8, [r1], #4
	strt	r9, [r1], #4
	strt	r10, [r1], #4
	strt	r11, [r1], #4
	strt	r6, [r1], #4
	strt	r7, [r1], #4

	cmp	r2, #0x08
	blt	.Lprecleanup

	/* Copy remaining words two at a time while >= 8 bytes remain */
.Lcleanup8:
	ldmia	r0!, {r8-r9}
	sub	r2, r2, #8
	strt	r8, [r1], #4
	strt	r9, [r1], #4
	cmp	r2, #8
	bge	.Lcleanup8

.Lprecleanup:
	/*
	 * If we're done, bail.
	 */
	cmp	r2, #0
	beq	.Lout

	/*
	 * Bytewise tail; same table idiom as copyin's .Licleanup.
	 */
.Lcleanup:
	and	r6, r2, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lcend
	.word	.Lc4
	.word	.Lc1
	.word	.Lc2
	.word	.Lc3
.Lc4:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lc3:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strbt	r7, [r1], #1
.Lc2:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strbt	r6, [r1], #1
.Lc1:	ldrb	r7, [r0], #1
	subs	r2, r2, #1
	strbt	r7, [r1], #1
.Lcend:
	bne	.Lcleanup

.Lout:
	/* Success: restore the previous onfault handler, return 0 */
	mov	r0, #0

	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	mov	pc, lr

/*
 * r0 = kernel space source address
 * r1 = kernel space destination address
 * r2 = length
 *
 * Copies bytes from kernel space to kernel space, aborting on page fault
 *
 * Copy of copyout, but without the ldrt/strt instructions.
 */

ENTRY(kcopy)
	/* Quick exit if length is zero */
	teq	r2, #0
	moveq	r0, #0
	moveq	pc, lr

	SAVE_REGS
#ifdef MULTIPROCESSOR
	/* XXX Probably not appropriate for non-Hydra SMPs */
	/* Find this CPU's PCB: cpu_info[cpu_number()]->ci_curpcb */
	stmfd	sp!, {r0-r2, r14}
	bl	_C_LABEL(cpu_number)
	ldr	r4, .Lcpu_info
	ldr	r4, [r4, r0, lsl #2]
	ldr	r4, [r4, #CI_CURPCB]
	ldmfd	sp!, {r0-r2, r14}
#else
	ldr	r4, .Lcurpcb
	ldr	r4, [r4]
#endif

	/*
	 * Install the shared fault handler (.Lcopyfault, defined with
	 * copyin above), saving the previous pcb_onfault in r5, so a
	 * page fault mid-copy aborts cleanly instead of panicking.
	 */
	ldr	r5, [r4, #PCB_ONFAULT]
	adr	r3, .Lcopyfault
	str	r3, [r4, #PCB_ONFAULT]

	PREFETCH(r0, 0)
	PREFETCH(r1, 0)

	/*
	 * If not too many bytes, take the slow (bytewise) path.
	 */
	cmp	r2, #0x08
	blt	.Lkcleanup

	/*
	 * Align destination to word boundary.
	 * Jump table: pc reads as this-instruction+8 (the first .word),
	 * so index r1 & 3 selects how many single bytes to copy to reach
	 * alignment; 0 skips straight to .Lkalend.
	 */
	and	r6, r1, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lkalend
	.word	.Lkalend
	.word	.Lkal3
	.word	.Lkal2
	.word	.Lkal1
.Lkal3:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkal2:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lkal1:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkalend:

	/*
	 * If few bytes left, finish slow.
	 */
	cmp	r2, #0x08
	blt	.Lkcleanup

	/*
	 * If source is not word aligned, finish slow.
	 */
	ands	r3, r0, #0x03
	bne	.Lkcleanup

	cmp	r2, #0x60	/* Must be > 0x5f for unrolled cacheline */
	blt	.Lkcleanup8

	/*
	 * Align source & destination to cacheline boundary.
	 * r1 is word aligned here, so r6 = r1 & 0x1f is a multiple of 4
	 * used directly as a byte offset into the jump table (offset 4
	 * means 28 bytes are needed to reach the boundary, and so on).
	 */
	and	r6, r1, #0x1f
	ldr	pc, [pc, r6]
	b	.Lkcaligned
	.word	.Lkcaligned
	.word	.Lkcal28
	.word	.Lkcal24
	.word	.Lkcal20
	.word	.Lkcal16
	.word	.Lkcal12
	.word	.Lkcal8
	.word	.Lkcal4
.Lkcal28:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lkcal24:ldr	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lkcal20:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lkcal16:ldr	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lkcal12:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4
.Lkcal8:ldr	r7, [r0], #4
	sub	r2, r2, #4
	str	r7, [r1], #4
.Lkcal4:ldr	r6, [r0], #4
	sub	r2, r2, #4
	str	r6, [r1], #4

	/*
	 * We start with > 0x40 bytes to copy (>= 0x60 got us into this
	 * part of the code, and we may have knocked that down by as much
	 * as 0x1c getting aligned).
	 *
	 * This loop basically works out to:
	 * do {
	 *	prefetch-next-cacheline(s)
	 *	bytes -= 0x20;
	 *	copy cacheline
	 * } while (bytes >= 0x40);
	 * bytes -= 0x20;
	 * copy cacheline
	 */
.Lkcaligned:
	PREFETCH(r0, 32)
	PREFETCH(r1, 32)

	sub	r2, r2, #0x20

	/*
	 * Copy a cacheline (32 bytes) with burst ldmia/stmia on both
	 * sides; r6/r7 are stored first so they can be reloaded for the
	 * final pair of words.
	 */
	ldmia	r0!, {r6-r11}
	stmia	r1!, {r6, r7}
	ldmia	r0!, {r6, r7}
	stmia	r1!, {r8-r11}
	stmia	r1!, {r6, r7}

	cmp	r2, #0x40
	bge	.Lkcaligned

	sub	r2, r2, #0x20

	/* Copy the final full cacheline of the unrolled section */
	ldmia	r0!, {r6-r11}
	stmia	r1!, {r6-r7}
	ldmia	r0!, {r6-r7}
	stmia	r1!, {r8-r11}
	stmia	r1!, {r6-r7}

	cmp	r2, #0x08
	blt	.Lkprecleanup

	/* Copy remaining words two at a time while >= 8 bytes remain */
.Lkcleanup8:
	ldmia	r0!, {r8-r9}
	sub	r2, r2, #8
	stmia	r1!, {r8-r9}
	cmp	r2, #8
	bge	.Lkcleanup8

.Lkprecleanup:
	/*
	 * If we're done, bail.
	 */
	cmp	r2, #0
	beq	.Lkout

	/*
	 * Bytewise tail: r6 = r2 & 3 indexes the table; index 0 (a
	 * multiple of 4 left) enters at .Lkc4 and copies 4 bytes per
	 * pass, index N enters so that exactly N bytes are copied.
	 * The final subs + bne loops until r2 reaches zero.
	 */
.Lkcleanup:
	and	r6, r2, #0x3
	ldr	pc, [pc, r6, lsl #2]
	b	.Lkcend
	.word	.Lkc4
	.word	.Lkc1
	.word	.Lkc2
	.word	.Lkc3
.Lkc4:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkc3:	ldrb	r7, [r0], #1
	sub	r2, r2, #1
	strb	r7, [r1], #1
.Lkc2:	ldrb	r6, [r0], #1
	sub	r2, r2, #1
	strb	r6, [r1], #1
.Lkc1:	ldrb	r7, [r0], #1
	subs	r2, r2, #1
	strb	r7, [r1], #1
.Lkcend:
	bne	.Lkcleanup

.Lkout:
	/* Success: restore the previous onfault handler, return 0 */
	mov	r0, #0

	str	r5, [r4, #PCB_ONFAULT]
	RESTORE_REGS

	mov	pc, lr