{
  Copyright (c) 2004, John O'Harrow (john@almcrest.demon.co.uk)

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the
use of this software.

Permission is granted to anyone to use this software for any purpose, including
commercial applications, and to alter it and redistribute it freely, subject to
the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim
   that you wrote the original software. If you use this software in a product,
   an acknowledgment in the product documentation would be appreciated but is
   not required.

2. Altered source versions must be plainly marked as such, and must not be
   misrepresented as being the original software.

3. This notice may not be removed or altered from any source distribution.

-------------------------------------------------------------------------------

Version: 1.40 - 16-SEP-2004
}

{ NOTE: this is an altered version of the original source (see clause 2 above).
  Among other changes, Move compares the source and destination addresses as
  unsigned values. }

{$ifdef USE_FASTMOVE}

{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}

{$asmmode intel}

{-------------------------------------------------------------------------}
(*
{Just to show that a good Pascal algorithm can beat the default BASM}
procedure MoveJOH_PAS_3(const Source; var Dest; Count : Integer);
var
  S, D       : PtrUInt;
  Temp, C, I : PtrInt;
  L          : PPtrInt;
begin
  S := Cardinal(@Source);
  D := Cardinal(@Dest);
  if S = D then
    Exit;
  if Count <= 4 then
    case Count of
      1 : PByte(@Dest)^ := PByte(S)^;
      2 : PWord(@Dest)^ := PWord(S)^;
      3 : if D > S then
            begin
              PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
              PWord(@Dest)^ := PWord(S)^;
            end
          else
            begin
              PWord(@Dest)^ := PWord(S)^;
              PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
            end;
      4 : PInteger(@Dest)^ := PInteger(S)^
      else Exit; {Count <= 0}
    end
  else
    if D > S then
      begin
        Temp := PInteger(S)^;
        I := Integer(@Dest);
        C := Count - 4;
        L := PInteger(Integer(@Dest) + C);
        Inc(S, C);
        repeat
          L^ := PInteger(S)^;
          if Count <= 8 then
            Break;
          Dec(Count, 4);
          Dec(S, 4);
          Dec(L);
        until False;
        PInteger(I)^ := Temp;
      end
    else
      begin
        C := Count - 4;
        Temp := PInteger(S + Cardinal(C))^;
        I := Integer(@Dest) + C;
        L := @Dest;
        repeat
          L^ := PInteger(S)^;
          if Count <= 8 then
            Break;
          Dec(Count, 4);
          Inc(S, 4);
          Inc(L);
        until False;
        PInteger(I)^ := Temp;
      end;
end; {MoveJOH_PAS}
*)

const
  { Largest byte count handled directly by the two jump-table routines below }
  SMALLMOVESIZE = 36;

{-------------------------------------------------------------------------}
{Perform Forward Move of 0..36 Bytes}
{On Entry, ECX = Count, EAX = Source+Count, EDX = Dest+Count.  Destroys ECX}
{ECX is used unchecked as an index into a 37 entry jump table, so the caller
 must guarantee 0 <= ECX <= 36.  Each entry point copies from the lowest
 address upwards, mostly one DWORD at a time, falling through the entry
 points for the smaller sizes that share the same remainder modulo 4.}
procedure SmallForwardMove_3;assembler;nostackframe;
asm
  jmp     dword ptr @@FwdJumpTable[ecx*4]
  align   16
@@FwdJumpTable:
  dd      @@Done {Removes need to test for zero size move}
  dd      @@Fwd01,@@Fwd02,@@Fwd03,@@Fwd04,@@Fwd05,@@Fwd06,@@Fwd07,@@Fwd08
  dd      @@Fwd09,@@Fwd10,@@Fwd11,@@Fwd12,@@Fwd13,@@Fwd14,@@Fwd15,@@Fwd16
  dd      @@Fwd17,@@Fwd18,@@Fwd19,@@Fwd20,@@Fwd21,@@Fwd22,@@Fwd23,@@Fwd24
  dd      @@Fwd25,@@Fwd26,@@Fwd27,@@Fwd28,@@Fwd29,@@Fwd30,@@Fwd31,@@Fwd32
  dd      @@Fwd33,@@Fwd34,@@Fwd35,@@Fwd36
@@Fwd36:
  mov     ecx,[eax-36]
  mov     [edx-36],ecx
@@Fwd32:
  mov     ecx,[eax-32]
  mov     [edx-32],ecx
@@Fwd28:
  mov     ecx,[eax-28]
  mov     [edx-28],ecx
@@Fwd24:
  mov     ecx,[eax-24]
  mov     [edx-24],ecx
@@Fwd20:
  mov     ecx,[eax-20]
  mov     [edx-20],ecx
@@Fwd16:
  mov     ecx,[eax-16]
  mov     [edx-16],ecx
@@Fwd12:
  mov     ecx,[eax-12]
  mov     [edx-12],ecx
@@Fwd08:
  mov     ecx,[eax-8]
  mov     [edx-8],ecx
@@Fwd04:
  mov     ecx,[eax-4]
  mov     [edx-4],ecx
  ret
@@Fwd35:
  mov     ecx,[eax-35]
  mov     [edx-35],ecx
@@Fwd31:
  mov     ecx,[eax-31]
  mov     [edx-31],ecx
@@Fwd27:
  mov     ecx,[eax-27]
  mov     [edx-27],ecx
@@Fwd23:
  mov     ecx,[eax-23]
  mov     [edx-23],ecx
@@Fwd19:
  mov     ecx,[eax-19]
  mov     [edx-19],ecx
@@Fwd15:
  mov     ecx,[eax-15]
  mov     [edx-15],ecx
@@Fwd11:
  mov     ecx,[eax-11]
  mov     [edx-11],ecx
@@Fwd07:
  mov     ecx,[eax-7]
  mov     [edx-7],ecx
  mov     ecx,[eax-4]  {Last DWORD overlaps the previous one by a byte}
  mov     [edx-4],ecx
  ret
@@Fwd03:
  movzx   ecx, word ptr [eax-3]
  mov     [edx-3],cx
  movzx   ecx, byte ptr [eax-1]
  mov     [edx-1],cl
  ret
@@Fwd34:
  mov     ecx,[eax-34]
  mov     [edx-34],ecx
@@Fwd30:
  mov     ecx,[eax-30]
  mov     [edx-30],ecx
@@Fwd26:
  mov     ecx,[eax-26]
  mov     [edx-26],ecx
@@Fwd22:
  mov     ecx,[eax-22]
  mov     [edx-22],ecx
@@Fwd18:
  mov     ecx,[eax-18]
  mov     [edx-18],ecx
@@Fwd14:
  mov     ecx,[eax-14]
  mov     [edx-14],ecx
@@Fwd10:
  mov     ecx,[eax-10]
  mov     [edx-10],ecx
@@Fwd06:
  mov     ecx,[eax-6]
  mov     [edx-6],ecx
@@Fwd02:
  movzx   ecx, word ptr [eax-2]
  mov     [edx-2],cx
  ret
@@Fwd33:
  mov     ecx,[eax-33]
  mov     [edx-33],ecx
@@Fwd29:
  mov     ecx,[eax-29]
  mov     [edx-29],ecx
@@Fwd25:
  mov     ecx,[eax-25]
  mov     [edx-25],ecx
@@Fwd21:
  mov     ecx,[eax-21]
  mov     [edx-21],ecx
@@Fwd17:
  mov     ecx,[eax-17]
  mov     [edx-17],ecx
@@Fwd13:
  mov     ecx,[eax-13]
  mov     [edx-13],ecx
@@Fwd09:
  mov     ecx,[eax-9]
  mov     [edx-9],ecx
@@Fwd05:
  mov     ecx,[eax-5]
  mov     [edx-5],ecx
@@Fwd01:
  movzx   ecx, byte ptr [eax-1]
  mov     [edx-1],cl
@@Done:
end; {SmallForwardMove}

{-------------------------------------------------------------------------}
{Perform Backward Move of 0..36 Bytes}
{On Entry, ECX = Count, EAX = Source, EDX = Dest.  Destroys ECX}
{ECX is used unchecked as an index into a 37 entry jump table, so the caller
 must guarantee 0 <= ECX <= 36.  Each entry point copies from the highest
 address downwards, mostly one DWORD at a time, falling through the entry
 points for the smaller sizes that share the same remainder modulo 4.}
procedure SmallBackwardMove_3;assembler;nostackframe;
asm
  jmp     dword ptr @@BwdJumpTable[ecx*4]
  align   16
@@BwdJumpTable:
  dd      @@Done {Removes need to test for zero size move}
  dd      @@Bwd01,@@Bwd02,@@Bwd03,@@Bwd04,@@Bwd05,@@Bwd06,@@Bwd07,@@Bwd08
  dd      @@Bwd09,@@Bwd10,@@Bwd11,@@Bwd12,@@Bwd13,@@Bwd14,@@Bwd15,@@Bwd16
  dd      @@Bwd17,@@Bwd18,@@Bwd19,@@Bwd20,@@Bwd21,@@Bwd22,@@Bwd23,@@Bwd24
  dd      @@Bwd25,@@Bwd26,@@Bwd27,@@Bwd28,@@Bwd29,@@Bwd30,@@Bwd31,@@Bwd32
  dd      @@Bwd33,@@Bwd34,@@Bwd35,@@Bwd36
@@Bwd36:
  mov     ecx,[eax+32]
  mov     [edx+32],ecx
@@Bwd32:
  mov     ecx,[eax+28]
  mov     [edx+28],ecx
@@Bwd28:
  mov     ecx,[eax+24]
  mov     [edx+24],ecx
@@Bwd24:
  mov     ecx,[eax+20]
  mov     [edx+20],ecx
@@Bwd20:
  mov     ecx,[eax+16]
  mov     [edx+16],ecx
@@Bwd16:
  mov     ecx,[eax+12]
  mov     [edx+12],ecx
@@Bwd12:
  mov     ecx,[eax+8]
  mov     [edx+8],ecx
@@Bwd08:
  mov     ecx,[eax+4]
  mov     [edx+4],ecx
@@Bwd04:
  mov     ecx,[eax]
  mov     [edx],ecx
  ret
@@Bwd35:
  mov     ecx,[eax+31]
  mov     [edx+31],ecx
@@Bwd31:
  mov     ecx,[eax+27]
  mov     [edx+27],ecx
@@Bwd27:
  mov     ecx,[eax+23]
  mov     [edx+23],ecx
@@Bwd23:
  mov     ecx,[eax+19]
  mov     [edx+19],ecx
@@Bwd19:
  mov     ecx,[eax+15]
  mov     [edx+15],ecx
@@Bwd15:
  mov     ecx,[eax+11]
  mov     [edx+11],ecx
@@Bwd11:
  mov     ecx,[eax+7]
  mov     [edx+7],ecx
@@Bwd07:
  mov     ecx,[eax+3]
  mov     [edx+3],ecx
  mov     ecx,[eax]    {First DWORD overlaps the previous one by a byte}
  mov     [edx],ecx
  ret
@@Bwd03:
  movzx   ecx, word ptr [eax+1]
  mov     [edx+1],cx
  movzx   ecx, byte ptr [eax]
  mov     [edx],cl
  ret
@@Bwd34:
  mov     ecx,[eax+30]
  mov     [edx+30],ecx
@@Bwd30:
  mov     ecx,[eax+26]
  mov     [edx+26],ecx
@@Bwd26:
  mov     ecx,[eax+22]
  mov     [edx+22],ecx
@@Bwd22:
  mov     ecx,[eax+18]
  mov     [edx+18],ecx
@@Bwd18:
  mov     ecx,[eax+14]
  mov     [edx+14],ecx
@@Bwd14:
  mov     ecx,[eax+10]
  mov     [edx+10],ecx
@@Bwd10:
  mov     ecx,[eax+6]
  mov     [edx+6],ecx
@@Bwd06:
  mov     ecx,[eax+2]
  mov     [edx+2],ecx
@@Bwd02:
  movzx   ecx, word ptr [eax]
  mov     [edx],cx
  ret
@@Bwd33:
  mov     ecx,[eax+29]
  mov     [edx+29],ecx
@@Bwd29:
  mov     ecx,[eax+25]
  mov     [edx+25],ecx
@@Bwd25:
  mov     ecx,[eax+21]
  mov     [edx+21],ecx
@@Bwd21:
  mov     ecx,[eax+17]
  mov     [edx+17],ecx
@@Bwd17:
  mov     ecx,[eax+13]
  mov     [edx+13],ecx
@@Bwd13:
  mov     ecx,[eax+9]
  mov     [edx+9],ecx
@@Bwd09:
  mov     ecx,[eax+5]
  mov     [edx+5],ecx
@@Bwd05:
  mov     ecx,[eax+1]
  mov     [edx+1],ecx
@@Bwd01:
  movzx   ecx, byte ptr [eax]
  mov     [edx],cl
@@Done:
end; {SmallBackwardMove}


{ At least valgrind up to 3.3 has a bug which prevents the default code from
  working, so a rather simple implementation is used here.
  Copies ECX bytes from EAX to EDX in ascending address order with rep movsb.
  ESI and EDI are saved and restored.
}
procedure Forwards_Valgrind;assembler;nostackframe;
asm
{$ifdef FPC_ENABLED_CLD}
  cld
{$endif FPC_ENABLED_CLD}
  push    esi
  push    edi
  mov     esi,eax
  mov     edi,edx
  rep     movsb
  pop     edi
  pop     esi
end;

{ At least valgrind up to 3.3 has a bug which prevents the default code from
  working, so a rather simple implementation is used here.
  Copies ECX bytes from EAX to EDX one byte at a time, starting at the last
  byte and working down.  ECX must be non-zero on entry because the counter is
  decremented before it is tested.  EAX is destroyed (AL is the transfer
  register); ESI and EDI are saved and restored.
}
procedure Backwards_Valgrind;assembler;nostackframe;
asm
  push    esi
  push    edi
  lea     esi,[eax+ecx-1]
  lea     edi,[edx+ecx-1]
@@repeat:
  mov     al,[esi]
  mov     [edi],al
  dec     esi
  dec     edi
  dec     ecx
  jnz     @@repeat
  pop     edi
  pop     esi
end;

{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
{The first QWORD is loaded up front and stored last, the middle is copied
 16 bytes per iteration with QWORD aligned writes, and the tail is handed to
 SmallForwardMove_3.  All QWORD transfers go through the x87 stack.}
procedure Forwards_IA32_3;assembler;nostackframe;
asm
  push    ebx
  mov     ebx,edx             {EBX = Original Dest}
  fild    qword ptr [eax]     {First 8 Bytes, stored after the loop}
  add     eax,ecx {QWORD Align Writes}
  add     ecx,edx
  add     edx,7
  and     edx,-8
  sub     ecx,edx
  add     edx,ecx {Now QWORD Aligned}
  sub     ecx,16
  neg     ecx
@FwdLoop:
  fild    qword ptr [eax+ecx-16]
  fistp   qword ptr [edx+ecx-16]
  fild    qword ptr [eax+ecx-8]
  fistp   qword ptr [edx+ecx-8]
  add     ecx,16
  jle     @FwdLoop
  fistp   qword ptr [ebx]     {Store the First 8 Bytes}
  neg     ecx
  add     ecx,16              {ECX = Bytes left for the small move}
  pop     ebx
  jmp     SmallForwardMove_3
end; {Forwards_IA32}

{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
{The last QWORD is loaded up front and stored last, the middle is copied
 16 bytes per iteration from the top down with QWORD aligned writes, and the
 head is handed to SmallBackwardMove_3.}
procedure Backwards_IA32_3;assembler;nostackframe;
asm
  push    ebx
  fild    qword ptr [eax+ecx-8] {Last 8 Bytes, stored after the loop}
  lea     ebx,[edx+ecx] {QWORD Align Writes}
  and     ebx,7
  sub     ecx,ebx
  add     ebx,ecx {Now QWORD Aligned, EBX = Original Length}
  sub     ecx,16
@BwdLoop:
  fild    qword ptr [eax+ecx]
  fild    qword ptr [eax+ecx+8]
  fistp   qword ptr [edx+ecx+8]
  fistp   qword ptr [edx+ecx]
  sub     ecx,16
  jge     @BwdLoop
  fistp   qword ptr [edx+ebx-8] {Store the Last 8 Bytes}
  add     ecx,16                {ECX = Bytes left for the small move}
  pop     ebx
  jmp     SmallBackwardMove_3
end; {Backwards_IA32}

{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
{Counts below 72 are forwarded to Forwards_IA32_3.  Counts below LARGESIZE
 use a 32 bytes per iteration MMX loop with QWORD aligned writes.  Larger
 counts first align the destination to 16 bytes, then copy 64 bytes per
 iteration and finish with rep movsd / rep movsb.}
procedure Forwards_MMX_3;assembler;nostackframe;
const
  LARGESIZE = 1024;
asm
  cmp     ecx,LARGESIZE
  jge     @FwdLargeMove
  cmp     ecx,72 {Size at which using MMX becomes worthwhile}
  jl      Forwards_IA32_3
  push    ebx
  mov     ebx,edx
  movq    mm0,[eax] {First 8 Characters}
  {QWORD Align Writes}
  add     eax,ecx
  add     ecx,edx
  add     edx,7
  and     edx,-8
  sub     ecx,edx
  add     edx,ecx
  {Now QWORD Aligned}
  sub     ecx,32
  neg     ecx
@FwdLoopMMX:
  movq    mm1,[eax+ecx-32]
  movq    mm2,[eax+ecx-24]
  movq    mm3,[eax+ecx-16]
  movq    mm4,[eax+ecx-8]
  movq    [edx+ecx-32],mm1
  movq    [edx+ecx-24],mm2
  movq    [edx+ecx-16],mm3
  movq    [edx+ecx-8],mm4
  add     ecx,32
  jle     @FwdLoopMMX
  movq    [ebx],mm0 {First 8 Characters}
  emms
  pop     ebx
  neg     ecx
  add     ecx,32
  jmp     SmallForwardMove_3
@FwdLargeMove:
  push    ebx
  mov     ebx,ecx
  test    edx,15
  jz      @FwdAligned
  {16 byte Align Destination}
  mov     ecx,edx
  add     ecx,15
  and     ecx,-16
  sub     ecx,edx
  add     eax,ecx
  add     edx,ecx
  sub     ebx,ecx
  {Destination now 16 Byte Aligned}
  call    SmallForwardMove_3
@FwdAligned:
  mov     ecx,ebx
  and     ecx,-16
  sub     ebx,ecx {EBX = Remainder}
  push    esi
  push    edi
  mov     esi,eax          {ESI = Source}
  mov     edi,edx          {EDI = Dest}
  mov     eax,ecx          {EAX = Count}
  and     eax,-64          {EAX = No of Bytes to Blocks Moves}
  and     ecx,$3F          {ECX = Remaining Bytes to Move (0..63)}
  add     esi,eax
  add     edi,eax
  shr     eax,3            {EAX = No of QWORD's to Block Move}
  neg     eax
@MMXcopyloop:
  movq    mm0,[esi+eax*8]
  movq    mm1,[esi+eax*8+8]
  movq    mm2,[esi+eax*8+16]
  movq    mm3,[esi+eax*8+24]
  movq    mm4,[esi+eax*8+32]
  movq    mm5,[esi+eax*8+40]
  movq    mm6,[esi+eax*8+48]
  movq    mm7,[esi+eax*8+56]
  movq    [edi+eax*8],mm0
  movq    [edi+eax*8+8],mm1
  movq    [edi+eax*8+16],mm2
  movq    [edi+eax*8+24],mm3
  movq    [edi+eax*8+32],mm4
  movq    [edi+eax*8+40],mm5
  movq    [edi+eax*8+48],mm6
  movq    [edi+eax*8+56],mm7
  add     eax,8
  jnz     @MMXcopyloop
  emms                   {Empty MMX State}
{$ifdef FPC_ENABLED_CLD}
  cld
{$endif FPC_ENABLED_CLD}
  add     ecx,ebx        {Bytes still to move after the 64 byte blocks}
  shr     ecx,2
  rep     movsd
  mov     ecx,ebx
  and     ecx,3
  rep     movsb
  pop     edi
  pop     esi
  pop     ebx
end; {Forwards_MMX}

{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
{Counts below 72 are forwarded to Backwards_IA32_3.  Otherwise the last QWORD
 is loaded up front and stored last, the middle is copied 32 bytes per
 iteration from the top down with QWORD aligned writes, and the head is
 handed to SmallBackwardMove_3.}
procedure Backwards_MMX_3;assembler;nostackframe;
asm
  cmp     ecx,72 {Size at which using MMX becomes worthwhile}
  jl      Backwards_IA32_3
  push    ebx
  movq    mm0,[eax+ecx-8] {Get Last QWORD}
  {QWORD Align Writes}
  lea     ebx,[edx+ecx]
  and     ebx,7
  sub     ecx,ebx
  add     ebx,ecx
  {Now QWORD Aligned}
  sub     ecx,32
@BwdLoopMMX:
  movq    mm1,[eax+ecx]
  movq    mm2,[eax+ecx+8]
  movq    mm3,[eax+ecx+16]
  movq    mm4,[eax+ecx+24]
  movq    [edx+ecx+24],mm4
  movq    [edx+ecx+16],mm3
  movq    [edx+ecx+8],mm2
  movq    [edx+ecx],mm1
  sub     ecx,32
  jge     @BwdLoopMMX
  movq    [edx+ebx-8], mm0 {Last QWORD}
  emms
  add     ecx,32
  pop     ebx
  jmp     SmallBackwardMove_3
end; {Backwards_MMX}

{$ifndef FASTMOVE_DISABLE_SSE3}
{-------------------------------------------------------------------------}
{Dest MUST be 16-Bytes Aligned, Count MUST be multiple of 16 }
{Copies 128 bytes per iteration and then the remainder 16 bytes at a time.
 The block loop always runs at least once, so Count must be at least 128.
 Block counts above 256K use prefetchnta and non-temporal stores followed
 by sfence.  Aligned loads are used when the source is 16 byte aligned too,
 unaligned loads otherwise.  EAX, ECX and EDX are destroyed; ESI is saved
 and restored.}
procedure AlignedFwdMoveSSE_3(const Source; var Dest; Count: Integer);assembler;nostackframe;
const
  Prefetch = 512;
asm
  push    esi
  mov     esi,eax             {ESI = Source}
  mov     eax,ecx             {EAX = Count}
  and     eax,-128            {EAX = No of Bytes to Block Move}
  add     esi,eax
  add     edx,eax
  shr     eax,3               {EAX = No of QWORD's to Block Move}
  neg     eax
  cmp     eax, -(32*1024)     {Count > 256K}
  jl      @Large
@Small: {Count<=256K}
  test    esi,15              {Check if Both Source/Dest Aligned}
  jnz     @SmallUnaligned
@SmallAligned:                {Both Source and Dest 16-Byte Aligned}
@SmallAlignedLoop:
  movaps  xmm0,[esi+8*eax]
  movaps  xmm1,[esi+8*eax+16]
  movaps  xmm2,[esi+8*eax+32]
  movaps  xmm3,[esi+8*eax+48]
  movaps  [edx+8*eax],xmm0
  movaps  [edx+8*eax+16],xmm1
  movaps  [edx+8*eax+32],xmm2
  movaps  [edx+8*eax+48],xmm3
  movaps  xmm4,[esi+8*eax+64]
  movaps  xmm5,[esi+8*eax+80]
  movaps  xmm6,[esi+8*eax+96]
  movaps  xmm7,[esi+8*eax+112]
  movaps  [edx+8*eax+64],xmm4
  movaps  [edx+8*eax+80],xmm5
  movaps  [edx+8*eax+96],xmm6
  movaps  [edx+8*eax+112],xmm7
  add     eax,16
  js      @SmallAlignedLoop
  jmp     @Remainder
@SmallUnaligned:              {Source Not 16-Byte Aligned}
@SmallUnalignedLoop:
  movups  xmm0,[esi+8*eax]
  movups  xmm1,[esi+8*eax+16]
  movups  xmm2,[esi+8*eax+32]
  movups  xmm3,[esi+8*eax+48]
  movaps  [edx+8*eax],xmm0
  movaps  [edx+8*eax+16],xmm1
  movaps  [edx+8*eax+32],xmm2
  movaps  [edx+8*eax+48],xmm3
  movups  xmm4,[esi+8*eax+64]
  movups  xmm5,[esi+8*eax+80]
  movups  xmm6,[esi+8*eax+96]
  movups  xmm7,[esi+8*eax+112]
  movaps  [edx+8*eax+64],xmm4
  movaps  [edx+8*eax+80],xmm5
  movaps  [edx+8*eax+96],xmm6
  movaps  [edx+8*eax+112],xmm7
  add     eax,16
  js      @SmallUnalignedLoop
  jmp     @Remainder
@Large: {Count>256K}
  test    esi,15              {Check if Both Source/Dest Aligned}
  jnz     @LargeUnaligned
@LargeAligned:                {Both Source and Dest 16-Byte Aligned}
@LargeAlignedLoop:
  prefetchnta  [esi+8*eax+Prefetch]
  prefetchnta  [esi+8*eax+Prefetch+64]
  movaps  xmm0,[esi+8*eax]
  movaps  xmm1,[esi+8*eax+16]
  movaps  xmm2,[esi+8*eax+32]
  movaps  xmm3,[esi+8*eax+48]
  movntps [edx+8*eax],xmm0
  movntps [edx+8*eax+16],xmm1
  movntps [edx+8*eax+32],xmm2
  movntps [edx+8*eax+48],xmm3
  movaps  xmm4,[esi+8*eax+64]
  movaps  xmm5,[esi+8*eax+80]
  movaps  xmm6,[esi+8*eax+96]
  movaps  xmm7,[esi+8*eax+112]
  movntps [edx+8*eax+64],xmm4
  movntps [edx+8*eax+80],xmm5
  movntps [edx+8*eax+96],xmm6
  movntps [edx+8*eax+112],xmm7
  add     eax,16
  js      @LargeAlignedLoop
  sfence
  jmp     @Remainder
@LargeUnaligned:              {Source Not 16-Byte Aligned}
@LargeUnalignedLoop:
  prefetchnta  [esi+8*eax+Prefetch]
  prefetchnta  [esi+8*eax+Prefetch+64]
  movups  xmm0,[esi+8*eax]
  movups  xmm1,[esi+8*eax+16]
  movups  xmm2,[esi+8*eax+32]
  movups  xmm3,[esi+8*eax+48]
  movntps [edx+8*eax],xmm0
  movntps [edx+8*eax+16],xmm1
  movntps [edx+8*eax+32],xmm2
  movntps [edx+8*eax+48],xmm3
  movups  xmm4,[esi+8*eax+64]
  movups  xmm5,[esi+8*eax+80]
  movups  xmm6,[esi+8*eax+96]
  movups  xmm7,[esi+8*eax+112]
  movntps [edx+8*eax+64],xmm4
  movntps [edx+8*eax+80],xmm5
  movntps [edx+8*eax+96],xmm6
  movntps [edx+8*eax+112],xmm7
  add     eax,16
  js      @LargeUnalignedLoop
  sfence
@Remainder:
  and     ecx,$7F {ECX = Remainder (0..112 - Multiple of 16)}
  jz      @Done
  add     esi,ecx
  add     edx,ecx
  neg     ecx
@RemainderLoop:
  movups  xmm0,[esi+ecx]
  movaps  [edx+ecx],xmm0
  add     ecx,16
  jnz     @RemainderLoop
@Done:
  pop     esi
end; {AlignedFwdMoveSSE}

{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)}
{Counts up to SMALLMOVESIZE+32 copy the first 32 bytes through XMM0/XMM1 and
 leave the rest to SmallForwardMove_3.  Counts below LARGESIZE use a 32 bytes
 per iteration loop with 16 byte aligned writes.  Larger counts align the
 destination, call AlignedFwdMoveSSE_3 for the multiple-of-16 part and finish
 with SmallForwardMove_3.}
procedure Forwards_SSE_3;assembler;nostackframe;
const
  LARGESIZE = 2048;
asm
  cmp     ecx,LARGESIZE
  jge     @FwdLargeMove
  cmp     ecx,SMALLMOVESIZE+32
  movups  xmm0,[eax]          {First 16 Bytes; flags from the cmp are kept}
  jg      @FwdMoveSSE
  movups  xmm1,[eax+16]
  movups  [edx],xmm0
  movups  [edx+16],xmm1
  add     eax,ecx
  add     edx,ecx
  sub     ecx,32
  jmp     SmallForwardMove_3
@FwdMoveSSE:
  push    ebx
  mov     ebx,edx
  {Align Writes}
  add     eax,ecx
  add     ecx,edx
  add     edx,15
  and     edx,-16
  sub     ecx,edx
  add     edx,ecx
  {Now Aligned}
  sub     ecx,32
  neg     ecx
@FwdLoopSSE:
  movups  xmm1,[eax+ecx-32]
  movups  xmm2,[eax+ecx-16]
  movaps  [edx+ecx-32],xmm1
  movaps  [edx+ecx-16],xmm2
  add     ecx,32
  jle     @FwdLoopSSE
  movups  [ebx],xmm0 {First 16 Bytes}
  neg     ecx
  add     ecx,32
  pop     ebx
  jmp     SmallForwardMove_3
@FwdLargeMove:
  push    ebx
  mov     ebx,ecx
  test    edx,15
  jz      @FwdLargeAligned
  {16 byte Align Destination}
  mov     ecx,edx
  add     ecx,15
  and     ecx,-16
  sub     ecx,edx
  add     eax,ecx
  add     edx,ecx
  sub     ebx,ecx
  {Destination now 16 Byte Aligned}
  call    SmallForwardMove_3
  mov     ecx,ebx
@FwdLargeAligned:
  and     ecx,-16
  sub     ebx,ecx {EBX = Remainder}
  push    edx                 {Preserved across the call below}
  push    eax
  push    ecx
  call    AlignedFwdMoveSSE_3
  pop     ecx
  pop     eax
  pop     edx
  add     ecx,ebx
  add     eax,ecx             {EAX = Source + Count for the small move}
  add     edx,ecx             {EDX = Dest + Count for the small move}
  mov     ecx,ebx
  pop     ebx
  jmp     SmallForwardMove_3
end; {Forwards_SSE}

{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
{Counts up to SMALLMOVESIZE+32 copy the last 32 bytes through XMM1/XMM2 and
 leave the rest to SmallBackwardMove_3.  Otherwise the last 16 bytes are
 loaded up front and stored last, the middle is copied 32 bytes per iteration
 from the top down with 16 byte aligned writes, and the head is handed to
 SmallBackwardMove_3.}
procedure Backwards_SSE_3;assembler;nostackframe;
asm
  cmp     ecx,SMALLMOVESIZE+32
  jg      @BwdMoveSSE
  sub     ecx,32
  movups  xmm1,[eax+ecx]
  movups  xmm2,[eax+ecx+16]
  movups  [edx+ecx],xmm1
  movups  [edx+ecx+16],xmm2
  jmp     SmallBackwardMove_3
@BwdMoveSSE:
  push    ebx
  movups  xmm0,[eax+ecx-16] {Last 16 Bytes}
  {Align Writes}
  lea     ebx,[edx+ecx]
  and     ebx,15
  sub     ecx,ebx
  add     ebx,ecx
  {Now Aligned}
  sub     ecx,32
@BwdLoop:
  movups  xmm1,[eax+ecx]
  movups  xmm2,[eax+ecx+16]
  movaps  [edx+ecx],xmm1
  movaps  [edx+ecx+16],xmm2
  sub     ecx,32
  jge     @BwdLoop
  movups  [edx+ebx-16],xmm0  {Last 16 Bytes}
  add     ecx,32
  pop     ebx
  jmp     SmallBackwardMove_3
end; {Backwards_SSE}
{$endif ndef FASTMOVE_DISABLE_SSE3}

{ Routines that Move jumps to for counts above SMALLMOVESIZE.  They start out
  as the plain IA32 versions and are reassigned by setup_fastmove. }
const
  fastmoveproc_forward : pointer = @Forwards_IA32_3;
  fastmoveproc_backward : pointer = @Backwards_IA32_3;

{ Copies count bytes from source to dest; overlapping ranges are handled by
  choosing the copy direction.
  On entry EAX = address of source, EDX = address of dest, ECX = count.
  Nothing is copied when count <= 0 or when source and dest are the same
  address.  Counts up to SMALLMOVESIZE go straight to the jump-table
  routines; larger counts are dispatched through fastmoveproc_forward and
  fastmoveproc_backward.
  The address comparisons use unsigned conditions (jbe/ja).  Signed
  conditions would order an address at or above $80000000 below one that is
  under $80000000 and so pick the wrong direction for an overlapping move
  that straddles that boundary.  The count test at @Large stays signed on
  purpose, so that negative counts are rejected. }
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
asm
  cmp     ecx,SMALLMOVESIZE
  ja      @Large              {Unsigned, so negative counts also end up at @Large}
  cmp     eax,edx
  lea     eax,[eax+ecx]       {EAX = Source+Count; lea leaves the flags alone}
  jbe     @SmallCheck         {Source <= Dest (unsigned)}
@SmallForward:
  add     edx,ecx
  jmp     SmallForwardMove_3
@SmallCheck:
  je      @Done {For Compatibility with Delphi's move for Source = Dest}
  sub     eax,ecx             {Restore EAX = Source}
  jmp     SmallBackwardMove_3
@Large:
  jng     @Done {For Compatibility with Delphi's move for Count < 0}
  cmp     eax,edx
  ja      @moveforward        {Source > Dest (unsigned)}
  je      @Done {For Compatibility with Delphi's move for Source = Dest}
  push    eax
  add     eax,ecx
  cmp     eax,edx
  pop     eax                 {pop leaves the flags alone}
  ja      @movebackward       {Source+Count > Dest (unsigned)}
@moveforward:
  jmp     dword ptr fastmoveproc_forward
@movebackward:
  jmp     dword ptr fastmoveproc_backward {Source/Dest Overlap}
@Done:
end;

{$asmmode att}
{$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
var
  valgrind_used : boolean;external name '__fpc_valgrind';
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}

{ Selects the routines used by Move for counts above SMALLMOVESIZE.
  The valgrind flag takes priority, then has_sse_support (unless
  FASTMOVE_DISABLE_SSE3 is defined), then has_mmx_support.  When none of
  them is set, the IA32 defaults stay in place. }
procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    { workaround valgrind bug }
{$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
    if EntryInformation.valgrind_used then
{$else FPC_HAS_INDIRECT_ENTRY_INFORMATION}
    if valgrind_used then
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
      begin
        fastmoveproc_forward:=@Forwards_Valgrind;
        fastmoveproc_backward:=@Backwards_Valgrind;
      end
{$ifndef FASTMOVE_DISABLE_SSE3}
    else if has_sse_support then
      begin
        fastmoveproc_forward:=@Forwards_SSE_3;
        fastmoveproc_backward:=@Backwards_SSE_3;
      end
{$endif ndef FASTMOVE_DISABLE_SSE3}
    else if has_mmx_support then
      begin
        fastmoveproc_forward:=@Forwards_MMX_3;
        fastmoveproc_backward:=@Backwards_MMX_3;
      end;
  end;

{$endif FPC_SYSTEM_HAS_MOVE}

{$endif}