/*	$NetBSD: memcpy.S,v 1.2 2013/03/17 02:13:10 christos Exp $	*/

/*
 * Copyright (c) 1996-2002 Eduardo Horvath
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR  ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR  BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
#include "strmacros.h"
#if defined(LIBC_SCCS) && !defined(lint)
RCSID("$NetBSD: memcpy.S,v 1.2 2013/03/17 02:13:10 christos Exp $")
#endif  /* LIBC_SCCS and not lint */

/*
 * memcpy
 *	Assumes regions do not overlap;
 *
 * Must not use %g7 (see copyin/copyout above).
 *
 * Entry points:
 *	memcpy(dest, src, size)	%o0 = dest, %o1 = src, %o2 = size
 *	bcopy(src, dest, size)	%o0 = src, %o1 = dest, %o2 = size
 *				(only outside the kernel, or in a rump kernel)
 *
 * memcpy swaps its first two arguments and falls into the bcopy body.
 * Every exit path hands the destination pointer back in %o0.
 *
 * Path selection:
 *	size <  BCOPY_SMALL	byte-at-a-time leaf loop
 *	size >= BCOPY_SMALL	Lmemcpy_fancy: align dest to 8 bytes, then
 *				copy 64-bit words, shifting/merging when the
 *				source is not 8-byte aligned
 *	size >= 256		with USE_BLOCK_STORE_LOAD: Lmemcpy_block, which
 *				falls back to Lmemcpy_fancy when block_disable
 *				is non-zero or %tba does not match trapbase
 *
 * An instruction indented by one extra space sits in the delay slot of
 * the branch/call/return immediately above it.
 */
ENTRY(memcpy) /* dest, src, size */
	/*
	 * Swap args for bcopy.  Gcc generates calls to memcpy for
	 * structure assignments.
	 */
	mov	%o0, %o3
	mov	%o1, %o0
	mov	%o3, %o1
#if !defined(_KERNEL) || defined(_RUMPKERNEL)
ENTRY(bcopy) /* src, dest, size */
#endif
#ifdef DEBUG
#if defined(_KERNEL) && !defined(_RUMPKERNEL)
	set	pmapdebug, %o4
	ld	[%o4], %o4
	btst	0x80, %o4	! PDB_COPY
	bz,pt	%icc, 3f
	 nop
#endif
	save	%sp, -CC64FSZ, %sp
	mov	%i0, %o1
	set	2f, %o0
	mov	%i1, %o2
	call	printf
	 mov	%i2, %o3
!	ta	1; nop
	restore
	.data
2:	.asciz	"memcpy(%p<-%p,%x)\n"
	_ALIGN
	.text
3:
#endif

	cmp	%o2, BCOPY_SMALL

Lmemcpy_start:
	bge,pt	CCCR, 2f	! if >= this many, go be fancy.
	 cmp	%o2, 256	! (delay slot) sets cc for the block test at 2:

	mov	%o1, %o5	! Save memcpy return value
	/*
	 * Not much to copy, just do it a byte at a time.
	 */
	deccc	%o2		! while (--len >= 0)
	bl	1f
	 .empty
0:
	inc	%o0
	ldsb	[%o0 - 1], %o4	! (++dst)[-1] = *src++;
	stb	%o4, [%o1]
	deccc	%o2
	bge	0b
	 inc	%o1
1:
	retl
	 mov	%o5, %o0
	NOTREACHED

	/*
	 * Plenty of data to copy, so try to do it optimally.
	 */
2:
#ifdef USE_BLOCK_STORE_LOAD
	! If it is big enough, use VIS instructions
	bge	Lmemcpy_block
	 nop
#endif /* USE_BLOCK_STORE_LOAD */
Lmemcpy_fancy:

	!!
	!! First align the output to a 8-byte entity
	!!
	!! After the save:
	!!	%l0 = src cursor, %l1 = dest cursor, %l2 = bytes remaining
	!!	%i1 = original dest (returned by the final restore)
	!!

	save	%sp, -CC64FSZ, %sp

	mov	%i0, %l0
	mov	%i1, %l1

	mov	%i2, %l2
	btst	1, %l1

	bz,pt	%icc, 4f
	 btst	2, %l1
	ldub	[%l0], %l4	! Load 1st byte

	deccc	1, %l2
	ble,pn	CCCR, Lmemcpy_finish	! XXXX
	 inc	1, %l0

	stb	%l4, [%l1]	! Store 1st byte
	inc	1, %l1		! Update address
	btst	2, %l1
4:
	bz,pt	%icc, 4f

	 btst	1, %l0
	bz,a	1f
	 lduh	[%l0], %l4	! Load short

	ldub	[%l0], %l4	! Load bytes

	ldub	[%l0+1], %l3
	sllx	%l4, 8, %l4
	or	%l3, %l4, %l4

1:
	deccc	2, %l2
	ble,pn	CCCR, Lmemcpy_finish	! XXXX
	 inc	2, %l0
	sth	%l4, [%l1]	! Store 1st short

	inc	2, %l1
4:
	btst	4, %l1
	bz,pt	CCCR, 4f

	 btst	3, %l0
	bz,a,pt	CCCR, 1f
	 lduw	[%l0], %l4	! Load word -1

	btst	1, %l0
	bz,a,pt	%icc, 2f
	 lduh	[%l0], %l4

	ldub	[%l0], %l4

	lduh	[%l0+1], %l3
	sllx	%l4, 16, %l4
	or	%l4, %l3, %l4

	ldub	[%l0+3], %l3
	sllx	%l4, 8, %l4
	ba,pt	%icc, 1f
	 or	%l4, %l3, %l4

2:
	lduh	[%l0+2], %l3
	sllx	%l4, 16, %l4
	or	%l4, %l3, %l4

1:
	deccc	4, %l2
	ble,pn	CCCR, Lmemcpy_finish	! XXXX
	 inc	4, %l0

	st	%l4, [%l1]	! Store word
	inc	4, %l1
4:
	!!
	!! We are now 64-bit aligned in the dest.
	!!
Lmemcpy_common:

	and	%l0, 7, %l4	! Shift amount
	andn	%l0, 7, %l0	! Source addr

	brz,pt	%l4, Lmemcpy_noshift8	! No shift version...

	 sllx	%l4, 3, %l4	! In bits
	mov	8<<3, %l3

	ldx	[%l0], %o0	! Load word -1
	sub	%l3, %l4, %l3	! Reverse shift
	deccc	12*8, %l2	! Have enough room?

	sllx	%o0, %l4, %o0
	bl,pn	CCCR, 2f
	 and	%l3, 0x38, %l3
Lmemcpy_unrolled8:

	/*
	 * This is about as close to optimal as you can get, since
	 * the shifts require EU0 and cannot be paired, and you have
	 * 3 dependent operations on the data.
	 */

!	ldx	[%l0+0*8], %o0	! Already done
!	sllx	%o0, %l4, %o0	! Already done
	ldx	[%l0+1*8], %o1
	ldx	[%l0+2*8], %o2
	ldx	[%l0+3*8], %o3
	ldx	[%l0+4*8], %o4
	ba,pt	%icc, 1f
	 ldx	[%l0+5*8], %o5
	.align	8
1:
	srlx	%o1, %l3, %g1
	inc	6*8, %l0

	sllx	%o1, %l4, %o1
	or	%g1, %o0, %g6
	ldx	[%l0+0*8], %o0

	stx	%g6, [%l1+0*8]
	srlx	%o2, %l3, %g1

	sllx	%o2, %l4, %o2
	or	%g1, %o1, %g6
	ldx	[%l0+1*8], %o1

	stx	%g6, [%l1+1*8]
	srlx	%o3, %l3, %g1

	sllx	%o3, %l4, %o3
	or	%g1, %o2, %g6
	ldx	[%l0+2*8], %o2

	stx	%g6, [%l1+2*8]
	srlx	%o4, %l3, %g1

	sllx	%o4, %l4, %o4
	or	%g1, %o3, %g6
	ldx	[%l0+3*8], %o3

	stx	%g6, [%l1+3*8]
	srlx	%o5, %l3, %g1

	sllx	%o5, %l4, %o5
	or	%g1, %o4, %g6
	ldx	[%l0+4*8], %o4

	stx	%g6, [%l1+4*8]
	srlx	%o0, %l3, %g1
	deccc	6*8, %l2	! Have enough room?

	sllx	%o0, %l4, %o0	! Next loop
	or	%g1, %o5, %g6
	ldx	[%l0+5*8], %o5

	stx	%g6, [%l1+5*8]
	bge,pt	CCCR, 1b
	 inc	6*8, %l1

Lmemcpy_unrolled8_cleanup:
	!!
	!! Finished 8 byte block, unload the regs.
	!!
	srlx	%o1, %l3, %g1
	inc	5*8, %l0

	sllx	%o1, %l4, %o1
	or	%g1, %o0, %g6

	stx	%g6, [%l1+0*8]
	srlx	%o2, %l3, %g1

	sllx	%o2, %l4, %o2
	or	%g1, %o1, %g6

	stx	%g6, [%l1+1*8]
	srlx	%o3, %l3, %g1

	sllx	%o3, %l4, %o3
	or	%g1, %o2, %g6

	stx	%g6, [%l1+2*8]
	srlx	%o4, %l3, %g1

	sllx	%o4, %l4, %o4
	or	%g1, %o3, %g6

	stx	%g6, [%l1+3*8]
	srlx	%o5, %l3, %g1

	sllx	%o5, %l4, %o5
	or	%g1, %o4, %g6

	stx	%g6, [%l1+4*8]
	inc	5*8, %l1

	mov	%o5, %o0	! Save our unused data
	dec	5*8, %l2
2:
	inccc	12*8, %l2
	bz,pn	%icc, Lmemcpy_complete

	!! Unrolled 8 times
Lmemcpy_aligned8:
!	ldx	[%l0], %o0	! Already done
!	sllx	%o0, %l4, %o0	! Shift high word

	 deccc	8, %l2		! Pre-decrement
	bl,pn	CCCR, Lmemcpy_finish
1:
	ldx	[%l0+8], %o1	! Load word 0
	inc	8, %l0

	srlx	%o1, %l3, %g6
	or	%g6, %o0, %g6	! Combine

	stx	%g6, [%l1]	! Store result
	inc	8, %l1

	deccc	8, %l2
	bge,pn	CCCR, 1b
	 sllx	%o1, %l4, %o0

	btst	7, %l2		! Done?
	bz,pt	CCCR, Lmemcpy_complete

	!!
	!! Loadup the last dregs into %o0 and shift it into place
	!!
	 srlx	%l3, 3, %g6	! # bytes in %o0
	dec	8, %g6		! - 8
	!! n-8 - (by - 8) -> n - by
	subcc	%l2, %g6, %g0	! # bytes we need
	ble,pt	%icc, Lmemcpy_finish
	 nop
	ldx	[%l0+8], %o1	! Need another word
	srlx	%o1, %l3, %o1
	ba,pt	%icc, Lmemcpy_finish
	 or	%o0, %o1, %o0	! All loaded up.

	!!
	!! Source and dest are both 8-byte aligned: plain 64-bit
	!! load/store, 6 words per iteration, then one word at a time.
	!!
Lmemcpy_noshift8:
	deccc	6*8, %l2	! Have enough room?
	bl,pn	CCCR, 2f
	 nop
	ba,pt	%icc, 1f
	 nop
	.align	32
1:
	ldx	[%l0+0*8], %o0
	ldx	[%l0+1*8], %o1
	ldx	[%l0+2*8], %o2
	stx	%o0, [%l1+0*8]
	stx	%o1, [%l1+1*8]
	stx	%o2, [%l1+2*8]


	ldx	[%l0+3*8], %o3
	ldx	[%l0+4*8], %o4
	ldx	[%l0+5*8], %o5
	inc	6*8, %l0
	stx	%o3, [%l1+3*8]
	deccc	6*8, %l2
	stx	%o4, [%l1+4*8]
	stx	%o5, [%l1+5*8]
	bge,pt	CCCR, 1b
	 inc	6*8, %l1
2:
	inc	6*8, %l2
1:
	deccc	8, %l2
	bl,pn	%icc, 1f	! < 0 --> sub word
	 nop
	ldx	[%l0], %g6
	inc	8, %l0
	stx	%g6, [%l1]
	bg,pt	%icc, 1b	! Exactly 0 --> done
	 inc	8, %l1
1:
	btst	7, %l2		! Done?
	bz,pt	CCCR, Lmemcpy_complete
	 clr	%l4
	ldx	[%l0], %o0

	!!
	!! Store the tail.  %o0 holds the remaining source bytes,
	!! most significant byte first; the low 3 bits of %l2 select
	!! which of a word, a short and a byte still get stored.
	!!
Lmemcpy_finish:

	brz,pn	%l2, 2f		! 100% complete?
	 cmp	%l2, 8		! Exactly 8 bytes?
	bz,a,pn	CCCR, 2f
	 stx	%o0, [%l1]

	btst	4, %l2		! Word store?
	bz	CCCR, 1f
	 srlx	%o0, 32, %g6	! Shift high word down
	stw	%g6, [%l1]
	inc	4, %l1
	mov	%o0, %g6	! Operate on the low bits
1:
	btst	2, %l2
	mov	%g6, %o0
	bz	1f
	 srlx	%o0, 16, %g6

	sth	%g6, [%l1]	! Store short
	inc	2, %l1
	mov	%o0, %g6	! Operate on low bytes
1:
	mov	%g6, %o0
	btst	1, %l2		! Byte aligned?
	bz	2f
	 srlx	%o0, 8, %g6

	stb	%g6, [%l1]	! Store last byte
	inc	1, %l1		! Update address
2:
Lmemcpy_complete:
#if 0
	!!
	!! verify copy success.
	!!

	mov	%i0, %o2
	mov	%i1, %o4
	mov	%i2, %l4
0:
	ldub	[%o2], %o1
	inc	%o2
	ldub	[%o4], %o3
	inc	%o4
	cmp	%o3, %o1
	bnz	1f
	 dec	%l4
	brnz	%l4, 0b
	 nop
	ba	2f
	 nop

1:
	set	0f, %o0
	call	printf
	 sub	%i2, %l4, %o5
	set	1f, %o0
	mov	%i0, %o2
	mov	%i1, %o1
	call	printf
	 mov	%i2, %o3
	ta	1
	.data
0:	.asciz	"memcpy failed: %x@%p != %x@%p byte %d\n"
1:	.asciz	"memcpy(%p, %p, %lx)\n"
	.align 8
	.text
2:
#endif
	ret
	 restore %i1, %g0, %o0	! return the original dest

#ifdef USE_BLOCK_STORE_LOAD

/*
 * Block copy.  Useful for >256 byte copies.
 *
 * Benchmarking has shown this always seems to be slower than
 * the integer version, so this is disabled.  Maybe someone will
 * figure out why sometime.
 */

Lmemcpy_block:
	sethi	%hi(block_disable), %o3
	ldx	[ %o3 + %lo(block_disable) ], %o3
	brnz,pn	%o3, Lmemcpy_fancy
	!! Make sure our trap table is installed
	!! (the start of this `set' lands in the delay slot of the brnz
	!! above; it only writes %o5, which Lmemcpy_fancy does not read)
	set	_C_LABEL(trapbase), %o5
	rdpr	%tba, %o3
	sub	%o3, %o5, %o3
	brnz,pn	%o3, Lmemcpy_fancy	! No, then don't use block load/store
	 nop
#if defined(_KERNEL) && !defined(_RUMPKERNEL)
/*
 * Kernel:
 *
 * Here we use VIS instructions to do a block copy.
 * But before we can do that we need to save and enable the FPU.
 * The last owner of the FPU registers is fplwp, and
 * fplwp->l_md.md_fpstate is the current fpstate.  If that's not
 * null, call savefpstate() with it to store our current fp state.
 *
 * Next, allocate an aligned fpstate on the stack.  We will properly
 * nest calls on a particular stack so this should not be a problem.
 *
 * Now we grab either curlwp (or if we're on the interrupt stack
 * lwp0).  We stash its existing fpstate in a local register and
 * put our new fpstate in curlwp->p_md.md_fpstate.  We point
 * fplwp at curlwp (or lwp0) and enable the FPU.
 *
 * If we are ever preempted, our FPU state will be saved in our
 * fpstate.  Then, when we're resumed and we take an FPDISABLED
 * trap, the trap handler will be able to fish our FPU state out
 * of curlwp (or lwp0).
 *
 * On exiting this routine we undo the damage: restore the original
 * pointer to curlwp->p_md.md_fpstate, clear our fplwp, and disable
 * the FPU.
 *
 *
 * Register usage, Kernel only (after save):
 *
 * %i0		src
 * %i1		dest
 * %i2		size
 *
 * %l0		XXXX DEBUG old fpstate
 * %l1		fplwp (hi bits only)
 * %l2		orig fplwp
 * %l3		orig fpstate
 * %l5		curlwp
 * %l6		old fpstate
 *
 * Register usage, Kernel and user:
 *
 * %g1		dest (retval for memcpy)
 *
 * %o0		src
 * %o1		dest
 * %o2		remaining length
 * %o5		last safe fetchable address
 */

	ENABLE_FPU(0)

	mov	%i0, %o0	! Src addr.
	mov	%i1, %o1	! Store our dest ptr here.
	mov	%i2, %o2	! Len counter
#endif	/* _KERNEL */

	!!
	!! First align the output to a 64-bit entity
	!!

	mov	%o1, %g1	! memcpy retval
	add	%o0, %o2, %o5	! End of source block

	andn	%o0, 7, %o3	! Start of block
	dec	%o5
	fzero	%f0

	andn	%o5, BLOCK_ALIGN, %o5	! Last safe addr.
	ldd	[%o3], %f2	! Load 1st word

	dec	8, %o3		! Move %o3 1 word back
	btst	1, %o1
	bz	4f

	 mov	-7, %o4		! Lowest src addr possible
	alignaddr %o0, %o4, %o4	! Base addr for load.

	cmp	%o3, %o4
	be,pt	CCCR, 1f	! Already loaded?
	 mov	%o4, %o3
	fmovd	%f2, %f0	! No. Shift
	ldd	[%o3+8], %f2	! And load
1:

	faligndata %f0, %f2, %f4	! Isolate 1st byte

	stda	%f4, [%o1] ASI_FL8_P	! Store 1st byte
	inc	1, %o1		! Update address
	inc	1, %o0
	dec	1, %o2
4:
	btst	2, %o1
	bz	4f

	 mov	-6, %o4		! Calculate src - 6
	alignaddr %o0, %o4, %o4	! calculate shift mask and dest.

	cmp	%o3, %o4	! Addresses same?
	be,pt	CCCR, 1f
	 mov	%o4, %o3
	fmovd	%f2, %f0	! Shuffle data
	ldd	[%o3+8], %f2	! Load word 0
1:
	faligndata %f0, %f2, %f4	! Move 1st short to low part of %f4

	stda	%f4, [%o1] ASI_FL16_P	! Store 1st short
	dec	2, %o2
	inc	2, %o1
	inc	2, %o0
4:
	brz,pn	%o2, Lmemcpy_blockfinish	! XXXX

	 btst	4, %o1
	bz	4f

	 mov	-4, %o4
	alignaddr %o0, %o4, %o4	! calculate shift mask and dest.

	cmp	%o3, %o4	! Addresses same?
	beq,pt	CCCR, 1f
	 mov	%o4, %o3
	fmovd	%f2, %f0	! Shuffle data
	ldd	[%o3+8], %f2	! Load word 0
1:
	faligndata %f0, %f2, %f4	! Move 1st word to low part of %f4

	st	%f5, [%o1]	! Store word
	dec	4, %o2
	inc	4, %o1
	inc	4, %o0
4:
	brz,pn	%o2, Lmemcpy_blockfinish	! XXXX
	!!
	!! We are now 64-bit aligned in the dest.
	!!
Lmemcpy_block_common:

	 mov	-0, %o4
	alignaddr %o0, %o4, %o4	! base - shift

	cmp	%o3, %o4	! Addresses same?
	beq,pt	CCCR, 1f
	 mov	%o4, %o3
	fmovd	%f2, %f0	! Shuffle data
	ldd	[%o3+8], %f2	! Load word 0
1:
	add	%o3, 8, %o0	! now use %o0 for src

	!!
	!! Continue until our dest is block aligned
	!!
Lmemcpy_block_aligned8:
1:
	brz	%o2, Lmemcpy_blockfinish
	 btst	BLOCK_ALIGN, %o1	! Block aligned?
	bz	1f

	 faligndata %f0, %f2, %f4	! Generate result
	deccc	8, %o2
	ble,pn	%icc, Lmemcpy_blockfinish	! Should never happen
	 fmovd	%f4, %f48

	std	%f4, [%o1]	! Store result
	inc	8, %o1

	fmovd	%f2, %f0
	inc	8, %o0
	ba,pt	%xcc, 1b	! Not yet.
	 ldd	[%o0], %f2	! Load next part
Lmemcpy_block_aligned64:
1:

/*
 * 64-byte aligned -- ready for block operations.
 *
 * Here we have the destination block aligned, but the
 * source pointer may not be.  Sub-word alignment will
 * be handled by faligndata instructions.  But the source
 * can still be potentially aligned to 8 different words
 * in our 64-bit block, so we have 8 different copy routines.
 *
 * Once we figure out our source alignment, we branch
 * to the appropriate copy routine, which sets up the
 * alignment for faligndata and loads (sets) the values
 * into the source registers and does the copy loop.
 *
 * When we're down to less than 1 block to store, we
 * exit the copy loop and execute cleanup code.
 *
 * Block loads and stores are not properly interlocked.
 * Stores save one reg/cycle, so you can start overwriting
 * registers the cycle after the store is issued.
 *
 * Block loads require a block load to a different register
 * block or a membar #Sync before accessing the loaded
 * data.
 *
 * Since the faligndata instructions may be offset as far
 * as 7 registers into a block (if you are shifting source
 * 7 -> dest 0), you need 3 source register blocks for full
 * performance: one you are copying, one you are loading,
 * and one for interlocking.  Otherwise, we would need to
 * sprinkle the code with membar #Sync and lose the advantage
 * of running faligndata in parallel with block stores.  This
 * means we are fetching a full 128 bytes ahead of the stores.
 * We need to make sure the prefetch does not inadvertently
 * cross a page boundary and fault on data that we will never
 * store.
 *
 */
#if 1
	and	%o0, BLOCK_ALIGN, %o3
	srax	%o3, 3, %o3	! Isolate the offset

	brz	%o3, L100	! 0->0
	 btst	4, %o3
	bnz	%xcc, 4f
	 btst	2, %o3
	bnz	%xcc, 2f
	 btst	1, %o3
	ba,pt	%xcc, L101	! 0->1
	 nop	/* XXX spitfire bug */
2:
	bz	%xcc, L102	! 0->2
	 nop
	ba,pt	%xcc, L103	! 0->3
	 nop	/* XXX spitfire bug */
4:
	bnz	%xcc, 2f
	 btst	1, %o3
	bz	%xcc, L104	! 0->4
	 nop
	ba,pt	%xcc, L105	! 0->5
	 nop	/* XXX spitfire bug */
2:
	bz	%xcc, L106	! 0->6
	 nop
	ba,pt	%xcc, L107	! 0->7
	 nop	/* XXX spitfire bug */
#else

	!!
	!! Isolate the word offset, which just happens to be
	!! the slot in our jump table.
	!!
	!! This is 6 insns, most of which cannot be paired,
	!! which is about the same as the above version.
	!!
	rd	%pc, %o4
1:
	and	%o0, 0x38, %o3	! byte offset of the 8-byte word in the block
	add	%o3, (Lmemcpy_block_jmp - 1b), %o3
	jmpl	%o4 + %o3, %g0
	 nop

	!!
	!! Jump table
	!!

Lmemcpy_block_jmp:
	ba,a,pt	%xcc, L100
	 nop
	ba,a,pt	%xcc, L101
	 nop
	ba,a,pt	%xcc, L102
	 nop
	ba,a,pt	%xcc, L103
	 nop
	ba,a,pt	%xcc, L104
	 nop
	ba,a,pt	%xcc, L105
	 nop
	ba,a,pt	%xcc, L106
	 nop
	ba,a,pt	%xcc, L107
	 nop
#endif

	!!
	!! Source is block aligned.
	!!
	!! Just load a block and go.
	!!
	!! In every L10x routine below, `cmp %o0, %o5' followed by
	!! `bleu,a' issues the next block load only while the next
	!! source block starts at or below the last safe address;
	!! otherwise the annulled load is skipped and a membar #Sync
	!! is executed instead.  Addresses are compared unsigned.
	!!
L100:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L100"
	.align	8
2:
#endif
	fmovd	%f0 , %f62
	ldda	[%o0] ASI_BLK_P, %f0
	inc	BLOCK_SIZE, %o0
	cmp	%o0, %o5
	bleu,a,pn	%icc, 3f
	 ldda	[%o0] ASI_BLK_P, %f16
	ba,pt	%icc, 3f
	 membar #Sync

	.align	32	! ICache align.
3:
	faligndata	%f62, %f0, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f0, %f2, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f2, %f4, %f36
	cmp	%o0, %o5
	faligndata	%f4, %f6, %f38
	faligndata	%f6, %f8, %f40
	faligndata	%f8, %f10, %f42
	faligndata	%f10, %f12, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f12, %f14, %f46

	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	stda	%f32, [%o1] ASI_STORE
	faligndata	%f14, %f16, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f16, %f18, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f18, %f20, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f20, %f22, %f38
	cmp	%o0, %o5
	faligndata	%f22, %f24, %f40
	faligndata	%f24, %f26, %f42
	faligndata	%f26, %f28, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f28, %f30, %f46

	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	stda	%f32, [%o1] ASI_STORE
	faligndata	%f30, %f48, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f48, %f50, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f50, %f52, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f52, %f54, %f38
	cmp	%o0, %o5
	faligndata	%f54, %f56, %f40
	faligndata	%f56, %f58, %f42
	faligndata	%f58, %f60, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f60, %f62, %f46
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16	! Increment is at top
	membar	#Sync
2:
	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1

	!!
	!! Source at BLOCK_ALIGN+8
	!!
	!! We need to load almost 1 complete block by hand.
	!!
L101:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L101"
	.align	8
2:
#endif
!	fmovd	%f0, %f0	! Hoist fmovd
	ldd	[%o0], %f2
	inc	8, %o0
	ldd	[%o0], %f4
	inc	8, %o0
	ldd	[%o0], %f6
	inc	8, %o0
	ldd	[%o0], %f8
	inc	8, %o0
	ldd	[%o0], %f10
	inc	8, %o0
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 3f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
3:
	faligndata	%f0, %f2, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f2, %f4, %f34
	cmp	%o0, %o5
	faligndata	%f4, %f6, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f6, %f8, %f38
	faligndata	%f8, %f10, %f40
	faligndata	%f10, %f12, %f42
	faligndata	%f12, %f14, %f44
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f14, %f16, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f16, %f18, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f20, %f22, %f36
	cmp	%o0, %o5
	faligndata	%f22, %f24, %f38
	dec	BLOCK_SIZE, %o2
	faligndata	%f24, %f26, %f40
	faligndata	%f26, %f28, %f42
	faligndata	%f28, %f30, %f44
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f30, %f48, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f48, %f50, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f50, %f52, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f52, %f54, %f36
	cmp	%o0, %o5
	faligndata	%f54, %f56, %f38
	dec	BLOCK_SIZE, %o2
	faligndata	%f56, %f58, %f40
	faligndata	%f58, %f60, %f42
	faligndata	%f60, %f62, %f44
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f62, %f0, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1

	!!
	!! Source at BLOCK_ALIGN+16
	!!
	!! We need to load 6 doubles by hand.
	!!
L102:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L102"
	.align	8
2:
#endif
	ldd	[%o0], %f4
	inc	8, %o0
	fmovd	%f0, %f2	! Hoist fmovd
	ldd	[%o0], %f6
	inc	8, %o0

	ldd	[%o0], %f8
	inc	8, %o0
	ldd	[%o0], %f10
	inc	8, %o0
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 3f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
3:
	faligndata	%f2, %f4, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f4, %f6, %f34
	cmp	%o0, %o5
	faligndata	%f6, %f8, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f8, %f10, %f38
	faligndata	%f10, %f12, %f40
	faligndata	%f12, %f14, %f42
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f44

	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f16, %f18, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f18, %f20, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f20, %f22, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f22, %f24, %f36
	cmp	%o0, %o5
	faligndata	%f24, %f26, %f38
	dec	BLOCK_SIZE, %o2
	faligndata	%f26, %f28, %f40
	faligndata	%f28, %f30, %f42
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f48, %f50, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f50, %f52, %f32
	inc	BLOCK_SIZE, %o0
	faligndata	%f52, %f54, %f34
	inc	BLOCK_SIZE, %o1
	faligndata	%f54, %f56, %f36
	cmp	%o0, %o5
	faligndata	%f56, %f58, %f38
	dec	BLOCK_SIZE, %o2
	faligndata	%f58, %f60, %f40
	faligndata	%f60, %f62, %f42
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f0, %f2, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1

	!!
	!! Source at BLOCK_ALIGN+24
	!!
	!! We need to load 5 doubles by hand.
	!!
L103:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L103"
	.align	8
2:
#endif
	fmovd	%f0, %f4
	ldd	[%o0], %f6
	inc	8, %o0
	ldd	[%o0], %f8
	inc	8, %o0
	ldd	[%o0], %f10
	inc	8, %o0
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
2:
	inc	BLOCK_SIZE, %o0
3:
	faligndata	%f4, %f6, %f32
	cmp	%o0, %o5
	faligndata	%f6, %f8, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f8, %f10, %f36
	faligndata	%f10, %f12, %f38
	faligndata	%f12, %f14, %f40
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f16, %f18, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f18, %f20, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f20, %f22, %f32
	cmp	%o0, %o5
	faligndata	%f22, %f24, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f24, %f26, %f36
	inc	BLOCK_SIZE, %o1
	faligndata	%f26, %f28, %f38
	faligndata	%f28, %f30, %f40
	bleu,a,pn	%icc, 2f	! unsigned address compare, like every other guard
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f48, %f50, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f50, %f52, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f52, %f54, %f32
	cmp	%o0, %o5
	faligndata	%f54, %f56, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f56, %f58, %f36
	faligndata	%f58, %f60, %f38
	inc	BLOCK_SIZE, %o1
	faligndata	%f60, %f62, %f40
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f0, %f2, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f2, %f4, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1

	!!
	!! Source at BLOCK_ALIGN+32
	!!
	!! We need to load 4 doubles by hand.
	!!
L104:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L104"
	.align	8
2:
#endif
	fmovd	%f0, %f6
	ldd	[%o0], %f8
	inc	8, %o0
	ldd	[%o0], %f10
	inc	8, %o0
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
2:
	inc	BLOCK_SIZE, %o0
3:
	faligndata	%f6, %f8, %f32
	cmp	%o0, %o5
	faligndata	%f8, %f10, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f10, %f12, %f36
	faligndata	%f12, %f14, %f38
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f40
	faligndata	%f16, %f18, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f20, %f22, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f22, %f24, %f32
	cmp	%o0, %o5
	faligndata	%f24, %f26, %f34
	faligndata	%f26, %f28, %f36
	inc	BLOCK_SIZE, %o1
	faligndata	%f28, %f30, %f38
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f40
	dec	BLOCK_SIZE, %o2
	faligndata	%f48, %f50, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f50, %f52, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f52, %f54, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f54, %f56, %f32
	cmp	%o0, %o5
	faligndata	%f56, %f58, %f34
	faligndata	%f58, %f60, %f36
	inc	BLOCK_SIZE, %o1
	faligndata	%f60, %f62, %f38
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f40
	dec	BLOCK_SIZE, %o2
	faligndata	%f0, %f2, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f2, %f4, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f4, %f6, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1

	!!
	!! Source at BLOCK_ALIGN+40
	!!
	!! We need to load 3 doubles by hand.
	!!
L105:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L105"
	.align	8
2:
#endif
	fmovd	%f0, %f8
	ldd	[%o0], %f10
	inc	8, %o0
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
2:
	inc	BLOCK_SIZE, %o0
3:
	faligndata	%f8, %f10, %f32
	cmp	%o0, %o5
	faligndata	%f10, %f12, %f34
	faligndata	%f12, %f14, %f36
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f38
	dec	BLOCK_SIZE, %o2
	faligndata	%f16, %f18, %f40
	inc	BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f42
	faligndata	%f20, %f22, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f22, %f24, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f24, %f26, %f32
	cmp	%o0, %o5
	faligndata	%f26, %f28, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f28, %f30, %f36
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f38
	inc	BLOCK_SIZE, %o1
	faligndata	%f48, %f50, %f40
	inc	BLOCK_SIZE, %o0
	faligndata	%f50, %f52, %f42
	faligndata	%f52, %f54, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f54, %f56, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f56, %f58, %f32
	cmp	%o0, %o5
	faligndata	%f58, %f60, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f60, %f62, %f36
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f38
	inc	BLOCK_SIZE, %o1
	faligndata	%f0, %f2, %f40
	inc	BLOCK_SIZE, %o0
	faligndata	%f2, %f4, %f42
	faligndata	%f4, %f6, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f6, %f8, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1


	!!
	!! Source at BLOCK_ALIGN+48
	!!
	!! We need to load 2 doubles by hand.
	!!
L106:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L106"
	.align	8
2:
#endif
	fmovd	%f0, %f10
	ldd	[%o0], %f12
	inc	8, %o0
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
2:
	inc	BLOCK_SIZE, %o0
3:
	faligndata	%f10, %f12, %f32
	cmp	%o0, %o5
	faligndata	%f12, %f14, %f34
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f16, %f18, %f38
	inc	BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f40
	faligndata	%f20, %f22, %f42
	faligndata	%f22, %f24, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f24, %f26, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f26, %f28, %f32
	cmp	%o0, %o5
	faligndata	%f28, %f30, %f34
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f48, %f50, %f38
	inc	BLOCK_SIZE, %o1
	faligndata	%f50, %f52, %f40
	faligndata	%f52, %f54, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f54, %f56, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f56, %f58, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f58, %f60, %f32
	cmp	%o0, %o5
	faligndata	%f60, %f62, %f34
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f36
	dec	BLOCK_SIZE, %o2
	faligndata	%f0, %f2, %f38
	inc	BLOCK_SIZE, %o1
	faligndata	%f2, %f4, %f40
	faligndata	%f4, %f6, %f42
	inc	BLOCK_SIZE, %o0
	faligndata	%f6, %f8, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f8, %f10, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1


	!!
	!! Source at BLOCK_ALIGN+56
	!!
	!! We need to load 1 double by hand.
	!!
L107:
#ifdef RETURN_NAME
	sethi	%hi(1f), %g1
	ba,pt	%icc, 2f
	 or	%g1, %lo(1f), %g1
1:
	.asciz	"L107"
	.align	8
2:
#endif
	fmovd	%f0, %f12
	ldd	[%o0], %f14
	inc	8, %o0

	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar #Sync
2:
	inc	BLOCK_SIZE, %o0
3:
	faligndata	%f12, %f14, %f32
	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f48
	membar	#Sync
2:
	faligndata	%f14, %f16, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f16, %f18, %f36
	inc	BLOCK_SIZE, %o0
	faligndata	%f18, %f20, %f38
	faligndata	%f20, %f22, %f40
	faligndata	%f22, %f24, %f42
	faligndata	%f24, %f26, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f26, %f28, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f28, %f30, %f32
	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f0
	membar	#Sync
2:
	faligndata	%f30, %f48, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f48, %f50, %f36
	inc	BLOCK_SIZE, %o1
	faligndata	%f50, %f52, %f38
	faligndata	%f52, %f54, %f40
	inc	BLOCK_SIZE, %o0
	faligndata	%f54, %f56, %f42
	faligndata	%f56, %f58, %f44
	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f58, %f60, %f46

	stda	%f32, [%o1] ASI_STORE

	faligndata	%f60, %f62, %f32
	cmp	%o0, %o5
	bleu,a,pn	%icc, 2f
	 ldda	[%o0] ASI_BLK_P, %f16
	membar	#Sync
2:
	faligndata	%f62, %f0, %f34
	dec	BLOCK_SIZE, %o2
	faligndata	%f0, %f2, %f36
	inc	BLOCK_SIZE, %o1
	faligndata	%f2, %f4, %f38
	faligndata	%f4, %f6, %f40
	inc	BLOCK_SIZE, %o0
	faligndata	%f6, %f8, %f42
	faligndata	%f8, %f10, %f44

	brlez,pn	%o2, Lmemcpy_blockdone
	 faligndata	%f10, %f12, %f46

	stda	%f32, [%o1] ASI_STORE
	ba	3b
	 inc	BLOCK_SIZE, %o1

	!!
	!! Fewer than BLOCK_SIZE bytes remain.  %f32-%f46 hold the
	!! last aligned block that was generated but not stored; store
	!! it one double at a time until the count runs out.
	!!
Lmemcpy_blockdone:
	inc	BLOCK_SIZE, %o2	! Fixup our overcommit
	membar	#Sync		! Finish any pending loads
#define	FINISH_REG(f)				\
	deccc	8, %o2;				\
	bl,a	Lmemcpy_blockfinish;		\
	 fmovd	f, %f48;			\
	std	f, [%o1];			\
	inc	8, %o1

	FINISH_REG(%f32)
	FINISH_REG(%f34)
	FINISH_REG(%f36)
	FINISH_REG(%f38)
	FINISH_REG(%f40)
	FINISH_REG(%f42)
	FINISH_REG(%f44)
	FINISH_REG(%f46)
	FINISH_REG(%f48)
#undef FINISH_REG
	!!
	!! The low 3 bits have the sub-word bits needed to be
	!! stored [because (x-8)&0x7 == x].
	!!
Lmemcpy_blockfinish:
	brz,pn	%o2, 2f		! 100% complete?
	 fmovd	%f48, %f4
	cmp	%o2, 8		! Exactly 8 bytes?
	bz,a,pn	CCCR, 2f
	 std	%f4, [%o1]

	btst	4, %o2		! Word store?
	bz	CCCR, 1f
	 nop
	st	%f4, [%o1]
	inc	4, %o1
1:
	btst	2, %o2
	fzero	%f0
	bz	1f

	 mov	-6, %o4
	alignaddr %o1, %o4, %g0

	faligndata %f0, %f4, %f8

	stda	%f8, [%o1] ASI_FL16_P	! Store short
	inc	2, %o1
1:
	btst	1, %o2		! Byte aligned?
	bz	2f

	 mov	-7, %o0		! Calculate dest - 7
	alignaddr %o1, %o0, %g0	! Calculate shift mask and dest.

	faligndata %f0, %f4, %f8	! Move 1st byte to low part of f8

	stda	%f8, [%o1] ASI_FL8_P	! Store 1st byte
	inc	1, %o1		! Update address
2:
	membar	#Sync
#if 0
	!!
	!! verify copy success.
	!!

	mov	%i0, %o2
	mov	%i1, %o4
	mov	%i2, %l4
0:
	ldub	[%o2], %o1
	inc	%o2
	ldub	[%o4], %o3
	inc	%o4
	cmp	%o3, %o1
	bnz	1f
	 dec	%l4
	brnz	%l4, 0b
	 nop
	ba	2f
	 nop

1:
	set	block_disable, %o0
	stx	%o0, [%o0]

	set	0f, %o0
	call	prom_printf
	 sub	%i2, %l4, %o5
	set	1f, %o0
	mov	%i0, %o2
	mov	%i1, %o1
	call	prom_printf
	 mov	%i2, %o3
	ta	1
	.data
	_ALIGN
0:	.asciz	"block memcpy failed: %x@%p != %x@%p byte %d\r\n"
1:	.asciz	"memcpy(%p, %p, %lx)\r\n"
	_ALIGN
	.text
2:
#endif
#if defined(_KERNEL) && !defined(_RUMPKERNEL)

/*
 * We've saved our possible fpstate, now disable the fpu
 * and continue with life.
 */
	RESTORE_FPU
	ret
	 restore	%g1, 0, %o0	! Return DEST for memcpy
#endif
	retl
	 mov	%g1, %o0
/*
 * Use block_disable to turn off block insns for
 * memcpy/memset
 */
	.data
	.align	8
	.globl	block_disable
block_disable:	.xword	1
	.text
#endif	/* USE_BLOCK_STORE_LOAD */