1{
2  Copyright (c) 2004, John O'Harrow (john@almcrest.demon.co.uk)
3
4This software is provided 'as-is', without any express or implied warranty.
5In no event will the authors be held liable for any damages arising from the
6use of this software.
7
8Permission is granted to anyone to use this software for any purpose, including
9commercial applications, and to alter it and redistribute it freely, subject to
10the following restrictions:
11
121. The origin of this software must not be misrepresented; you must not claim
13   that you wrote the original software. If you use this software in a product,
14   an acknowledgment in the product documentation would be appreciated but is
15   not required.
16
172. Altered source versions must be plainly marked as such, and must not be
18   misrepresented as being the original software.
19
203. This notice may not be removed or altered from any source distribution.
21
22-------------------------------------------------------------------------------
23
24Version: 1.40 - 16-SEP-2004
25}
26
27{$ifdef USE_FASTMOVE}
28
29{$ifndef FPC_SYSTEM_HAS_MOVE}
30{$define FPC_SYSTEM_HAS_MOVE}
31
32{$asmmode intel}
33
34{-------------------------------------------------------------------------}
35(*
36{Just to show that a good Pascal algorithm can beat the default BASM}
37procedure MoveJOH_PAS_3(const Source; var Dest; Count : Integer);
38var
39  S, D       : PtrUInt;
40  Temp, C, I : PtrInt;
41  L          : PPtrInt;
42begin
43  S := Cardinal(@Source);
44  D := Cardinal(@Dest);
45  if S = D then
46    Exit;
47  if Count <= 4 then
48    case Count of
49      1 : PByte(@Dest)^ := PByte(S)^;
50      2 : PWord(@Dest)^ := PWord(S)^;
51      3 : if D > S then
52            begin
53              PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
54              PWord(@Dest)^ := PWord(S)^;
55            end
56          else
57            begin
58              PWord(@Dest)^ := PWord(S)^;
59              PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
60            end;
61      4 : PInteger(@Dest)^ := PInteger(S)^
62      else Exit; {Count <= 0}
63    end
64  else
65    if D > S then
66      begin
67        Temp := PInteger(S)^;
68        I := Integer(@Dest);
69        C := Count - 4;
70        L := PInteger(Integer(@Dest) + C);
71        Inc(S, C);
72        repeat
73          L^ := PInteger(S)^;
74          if Count <= 8 then
75            Break;
76          Dec(Count, 4);
77          Dec(S, 4);
78          Dec(L);
79        until False;
80        PInteger(I)^ := Temp;
81      end
82    else
83      begin
84        C := Count - 4;
85        Temp := PInteger(S + Cardinal(C))^;
86        I := Integer(@Dest) + C;
87        L := @Dest;
88        repeat
89          L^ := PInteger(S)^;
90          if Count <= 8 then
91            Break;
92          Dec(Count, 4);
93          Inc(S, 4);
94          Inc(L);
95        until False;
96        PInteger(I)^ := Temp;
97      end;
98end; {MoveJOH_PAS}
99*)
100
const
  { Largest byte count dispatched straight to the jump-table copy routines;
    their tables hold entries for counts 0..36. }
  SMALLMOVESIZE = 36;
103
104{-------------------------------------------------------------------------}
105{Perform Forward Move of 0..36 Bytes}
106{On Entry, ECX = Count, EAX = Source+Count, EDX = Dest+Count.  Destroys ECX}
procedure SmallForwardMove_3;assembler;nostackframe;
{ Copies the ECX bytes that end at EAX to the ECX bytes that end at EDX,
  lowest address first.
  In:       ECX = byte count, EAX = Source+Count, EDX = Dest+Count.
  Clobbers: ECX (used as the transfer register). EAX and EDX are not modified.
  NOTE(review): ECX indexes the dispatch table without a range check, so the
  caller has to guarantee ECX <= 36. }
asm
  jmp     dword ptr @@Dispatch[ecx*4]
  align   16
@@Dispatch:
  dd      @@Exit {Count = 0: nothing to copy, no separate zero test needed}
  dd      @@Tail01, @@Tail02, @@Tail03, @@Tail04, @@Tail05, @@Tail06
  dd      @@Tail07, @@Tail08, @@Tail09, @@Tail10, @@Tail11, @@Tail12
  dd      @@Tail13, @@Tail14, @@Tail15, @@Tail16, @@Tail17, @@Tail18
  dd      @@Tail19, @@Tail20, @@Tail21, @@Tail22, @@Tail23, @@Tail24
  dd      @@Tail25, @@Tail26, @@Tail27, @@Tail28, @@Tail29, @@Tail30
  dd      @@Tail31, @@Tail32, @@Tail33, @@Tail34, @@Tail35, @@Tail36

  {Counts that are a multiple of 4: DWORD steps only}
@@Tail36:
  mov     ecx, [eax-36]
  mov     [edx-36], ecx
@@Tail32:
  mov     ecx, [eax-32]
  mov     [edx-32], ecx
@@Tail28:
  mov     ecx, [eax-28]
  mov     [edx-28], ecx
@@Tail24:
  mov     ecx, [eax-24]
  mov     [edx-24], ecx
@@Tail20:
  mov     ecx, [eax-20]
  mov     [edx-20], ecx
@@Tail16:
  mov     ecx, [eax-16]
  mov     [edx-16], ecx
@@Tail12:
  mov     ecx, [eax-12]
  mov     [edx-12], ecx
@@Tail08:
  mov     ecx, [eax-8]
  mov     [edx-8], ecx
@@Tail04:
  mov     ecx, [eax-4]
  mov     [edx-4], ecx
  ret

  {Counts of the form 4k+3: DWORD steps, the final two DWORDs overlap by one byte}
@@Tail35:
  mov     ecx, [eax-35]
  mov     [edx-35], ecx
@@Tail31:
  mov     ecx, [eax-31]
  mov     [edx-31], ecx
@@Tail27:
  mov     ecx, [eax-27]
  mov     [edx-27], ecx
@@Tail23:
  mov     ecx, [eax-23]
  mov     [edx-23], ecx
@@Tail19:
  mov     ecx, [eax-19]
  mov     [edx-19], ecx
@@Tail15:
  mov     ecx, [eax-15]
  mov     [edx-15], ecx
@@Tail11:
  mov     ecx, [eax-11]
  mov     [edx-11], ecx
@@Tail07:
  mov     ecx, [eax-7]
  mov     [edx-7], ecx
  mov     ecx, [eax-4]
  mov     [edx-4], ecx
  ret

  {Exactly 3 bytes: one WORD followed by one BYTE}
@@Tail03:
  movzx   ecx, word ptr [eax-3]
  mov     [edx-3], cx
  movzx   ecx, byte ptr [eax-1]
  mov     [edx-1], cl
  ret

  {Counts of the form 4k+2: DWORD steps, then a closing WORD}
@@Tail34:
  mov     ecx, [eax-34]
  mov     [edx-34], ecx
@@Tail30:
  mov     ecx, [eax-30]
  mov     [edx-30], ecx
@@Tail26:
  mov     ecx, [eax-26]
  mov     [edx-26], ecx
@@Tail22:
  mov     ecx, [eax-22]
  mov     [edx-22], ecx
@@Tail18:
  mov     ecx, [eax-18]
  mov     [edx-18], ecx
@@Tail14:
  mov     ecx, [eax-14]
  mov     [edx-14], ecx
@@Tail10:
  mov     ecx, [eax-10]
  mov     [edx-10], ecx
@@Tail06:
  mov     ecx, [eax-6]
  mov     [edx-6], ecx
@@Tail02:
  movzx   ecx, word ptr [eax-2]
  mov     [edx-2], cx
  ret

  {Counts of the form 4k+1: DWORD steps, then a closing BYTE}
@@Tail33:
  mov     ecx, [eax-33]
  mov     [edx-33], ecx
@@Tail29:
  mov     ecx, [eax-29]
  mov     [edx-29], ecx
@@Tail25:
  mov     ecx, [eax-25]
  mov     [edx-25], ecx
@@Tail21:
  mov     ecx, [eax-21]
  mov     [edx-21], ecx
@@Tail17:
  mov     ecx, [eax-17]
  mov     [edx-17], ecx
@@Tail13:
  mov     ecx, [eax-13]
  mov     [edx-13], ecx
@@Tail09:
  mov     ecx, [eax-9]
  mov     [edx-9], ecx
@@Tail05:
  mov     ecx, [eax-5]
  mov     [edx-5], ecx
@@Tail01:
  movzx   ecx, byte ptr [eax-1]
  mov     [edx-1], cl
@@Exit:
end; {SmallForwardMove}
236
237{-------------------------------------------------------------------------}
238{Perform Backward Move of 0..36 Bytes}
239{On Entry, ECX = Count, EAX = Source, EDX = Dest.  Destroys ECX}
procedure SmallBackwardMove_3;assembler;nostackframe;
{ Copies ECX bytes starting at EAX to the ECX bytes starting at EDX,
  highest address first.
  In:       ECX = byte count, EAX = Source, EDX = Dest.
  Clobbers: ECX (used as the transfer register). EAX and EDX are not modified.
  NOTE(review): ECX indexes the dispatch table without a range check, so the
  caller has to guarantee ECX <= 36. }
asm
  jmp     dword ptr @@Dispatch[ecx*4]
  align   16
@@Dispatch:
  dd      @@Exit {Count = 0: nothing to copy, no separate zero test needed}
  dd      @@Copy01, @@Copy02, @@Copy03, @@Copy04, @@Copy05, @@Copy06
  dd      @@Copy07, @@Copy08, @@Copy09, @@Copy10, @@Copy11, @@Copy12
  dd      @@Copy13, @@Copy14, @@Copy15, @@Copy16, @@Copy17, @@Copy18
  dd      @@Copy19, @@Copy20, @@Copy21, @@Copy22, @@Copy23, @@Copy24
  dd      @@Copy25, @@Copy26, @@Copy27, @@Copy28, @@Copy29, @@Copy30
  dd      @@Copy31, @@Copy32, @@Copy33, @@Copy34, @@Copy35, @@Copy36

  {Counts that are a multiple of 4: DWORD steps only}
@@Copy36:
  mov     ecx, [eax+32]
  mov     [edx+32], ecx
@@Copy32:
  mov     ecx, [eax+28]
  mov     [edx+28], ecx
@@Copy28:
  mov     ecx, [eax+24]
  mov     [edx+24], ecx
@@Copy24:
  mov     ecx, [eax+20]
  mov     [edx+20], ecx
@@Copy20:
  mov     ecx, [eax+16]
  mov     [edx+16], ecx
@@Copy16:
  mov     ecx, [eax+12]
  mov     [edx+12], ecx
@@Copy12:
  mov     ecx, [eax+8]
  mov     [edx+8], ecx
@@Copy08:
  mov     ecx, [eax+4]
  mov     [edx+4], ecx
@@Copy04:
  mov     ecx, [eax]
  mov     [edx], ecx
  ret

  {Counts of the form 4k+3: DWORD steps, the final two DWORDs overlap by one byte}
@@Copy35:
  mov     ecx, [eax+31]
  mov     [edx+31], ecx
@@Copy31:
  mov     ecx, [eax+27]
  mov     [edx+27], ecx
@@Copy27:
  mov     ecx, [eax+23]
  mov     [edx+23], ecx
@@Copy23:
  mov     ecx, [eax+19]
  mov     [edx+19], ecx
@@Copy19:
  mov     ecx, [eax+15]
  mov     [edx+15], ecx
@@Copy15:
  mov     ecx, [eax+11]
  mov     [edx+11], ecx
@@Copy11:
  mov     ecx, [eax+7]
  mov     [edx+7], ecx
@@Copy07:
  mov     ecx, [eax+3]
  mov     [edx+3], ecx
  mov     ecx, [eax]
  mov     [edx], ecx
  ret

  {Exactly 3 bytes: the upper WORD followed by the first BYTE}
@@Copy03:
  movzx   ecx, word ptr [eax+1]
  mov     [edx+1], cx
  movzx   ecx, byte ptr [eax]
  mov     [edx], cl
  ret

  {Counts of the form 4k+2: DWORD steps, then the leading WORD}
@@Copy34:
  mov     ecx, [eax+30]
  mov     [edx+30], ecx
@@Copy30:
  mov     ecx, [eax+26]
  mov     [edx+26], ecx
@@Copy26:
  mov     ecx, [eax+22]
  mov     [edx+22], ecx
@@Copy22:
  mov     ecx, [eax+18]
  mov     [edx+18], ecx
@@Copy18:
  mov     ecx, [eax+14]
  mov     [edx+14], ecx
@@Copy14:
  mov     ecx, [eax+10]
  mov     [edx+10], ecx
@@Copy10:
  mov     ecx, [eax+6]
  mov     [edx+6], ecx
@@Copy06:
  mov     ecx, [eax+2]
  mov     [edx+2], ecx
@@Copy02:
  movzx   ecx, word ptr [eax]
  mov     [edx], cx
  ret

  {Counts of the form 4k+1: DWORD steps, then the leading BYTE}
@@Copy33:
  mov     ecx, [eax+29]
  mov     [edx+29], ecx
@@Copy29:
  mov     ecx, [eax+25]
  mov     [edx+25], ecx
@@Copy25:
  mov     ecx, [eax+21]
  mov     [edx+21], ecx
@@Copy21:
  mov     ecx, [eax+17]
  mov     [edx+17], ecx
@@Copy17:
  mov     ecx, [eax+13]
  mov     [edx+13], ecx
@@Copy13:
  mov     ecx, [eax+9]
  mov     [edx+9], ecx
@@Copy09:
  mov     ecx, [eax+5]
  mov     [edx+5], ecx
@@Copy05:
  mov     ecx, [eax+1]
  mov     [edx+1], ecx
@@Copy01:
  movzx   ecx, byte ptr [eax]
  mov     [edx], cl
@@Exit:
end; {SmallBackwardMove}
369
370
{ At least valgrind up to 3.3 has a bug which prevents the default code from
  working, so we use a rather simple implementation here.
}
procedure Forwards_Valgrind;assembler;nostackframe;
{ Plain ascending byte copy: ECX bytes from [EAX] to [EDX] with REP MOVSB.
  ESI and EDI are saved on the stack and restored before returning. }
asm
  push    edi
  push    esi
{$ifdef FPC_ENABLED_CLD}
  cld                        {make the string instruction count upwards}
{$endif FPC_ENABLED_CLD}
  mov     edi, edx           {EDI = Dest}
  mov     esi, eax           {ESI = Source}
  rep     movsb
  pop     esi
  pop     edi
end;
387
{ At least valgrind up to 3.3 has a bug which prevents the default code from
  working, so we use a rather simple implementation here.
}
procedure Backwards_Valgrind;assembler;nostackframe;
{ Plain descending byte copy: ECX bytes from [EAX] to [EDX], last byte first.
  ESI and EDI are saved and restored; AL and ECX are overwritten.
  The loop copies a byte before it tests ECX, so ECX must be non-zero on entry. }
asm
  push    edi
  push    esi
  lea     edi, [edx+ecx-1]   {EDI = last byte of Dest}
  lea     esi, [eax+ecx-1]   {ESI = last byte of Source}
@@NextByte:
  mov     al, [esi]
  dec     esi
  mov     [edi], al
  dec     edi
  dec     ecx                {sets ZF for the loop test below}
  jnz     @@NextByte
  pop     esi
  pop     edi
end;
407
408{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX or the regions do not overlap, and ECX > 36 (SMALLMOVESIZE)}
procedure Forwards_IA32_3;assembler;nostackframe;
{ Ascending copy of ECX bytes from [EAX] to [EDX] that moves 8 bytes at a time
  through the x87 stack (FILD/FISTP on QWORDs).
  The first 8 source bytes are loaded up front and stored last, which lets the
  main loop start at the first 8-byte aligned destination address.
  The bytes left after the loop (fewer than 16) are handed to
  SmallForwardMove_3. EBX is saved and restored. }
asm
  push    ebx
  fild    qword ptr [eax]          {first 8 source bytes wait on the FPU stack}
  mov     ebx, edx                 {EBX = original Dest}
  add     eax, ecx                 {EAX = Source+Count}
  add     ecx, edx                 {ECX = Dest+Count}
  add     edx, 7
  and     edx, -8                  {EDX = Dest rounded up to a multiple of 8}
  sub     ecx, edx                 {ECX = bytes from the aligned Dest to the end}
  add     edx, ecx                 {EDX = Dest+Count}
  sub     ecx, 16
  neg     ecx                      {ECX = 16 - bytes left; loop runs while <= 0}
@Copy16:
  fild    qword ptr [eax+ecx-16]
  fistp   qword ptr [edx+ecx-16]
  fild    qword ptr [eax+ecx-8]
  fistp   qword ptr [edx+ecx-8]
  add     ecx, 16
  jle     @Copy16
  fistp   qword ptr [ebx]          {now write the saved first 8 bytes}
  pop     ebx
  neg     ecx
  add     ecx, 16                  {ECX = bytes still to copy}
  jmp     SmallForwardMove_3
end; {Forwards_IA32}
436
437{-------------------------------------------------------------------------}
438{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
procedure Backwards_IA32_3;assembler;nostackframe;
{ Descending copy of ECX bytes from [EAX] to [EDX] that moves 8 bytes at a time
  through the x87 stack (FILD/FISTP on QWORDs).
  The last 8 source bytes are loaded up front and stored after the loop, which
  lets the main loop end on an 8-byte aligned destination address.
  The bytes left at the start (fewer than 16) are handed to
  SmallBackwardMove_3. EBX is saved and restored. }
asm
  push    ebx
  fild    qword ptr [eax+ecx-8]    {last 8 source bytes wait on the FPU stack}
  lea     ebx, [edx+ecx]           {EBX = Dest+Count}
  and     ebx, 7                   {EBX = bytes above the last aligned QWORD}
  sub     ecx, ebx                 {ECX = length up to that aligned boundary}
  add     ebx, ecx                 {EBX = original Count}
  sub     ecx, 16
@Copy16:
  fild    qword ptr [eax+ecx]
  fild    qword ptr [eax+ecx+8]
  fistp   qword ptr [edx+ecx+8]
  fistp   qword ptr [edx+ecx]
  sub     ecx, 16
  jge     @Copy16
  fistp   qword ptr [edx+ebx-8]    {now write the saved last 8 bytes}
  pop     ebx
  add     ecx, 16                  {ECX = bytes still to copy at the start}
  jmp     SmallBackwardMove_3
end; {Backwards_IA32}
460
461{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX or the regions do not overlap, and ECX > 36 (SMALLMOVESIZE)}
procedure Forwards_MMX_3;assembler;nostackframe;
{ Ascending copy of ECX bytes from [EAX] to [EDX] using MMX registers.
    Count < 72          : delegated to Forwards_IA32_3.
    72 <= Count < 1024  : 32 bytes per iteration with QWORD-aligned writes,
                          remainder through SmallForwardMove_3.
    Count >= LARGESIZE  : Dest is first brought to a 16-byte boundary, then
                          64 bytes per iteration, remainder through
                          REP MOVSD / REP MOVSB.
  EBX, ESI and EDI are saved and restored where they are used. }
const
  LARGESIZE = 1024;
asm
  cmp     ecx, LARGESIZE
  jge     @BigMove
  cmp     ecx, 72                  {below this size MMX is not worthwhile}
  jl      Forwards_IA32_3
  push    ebx
  movq    mm0, [eax]               {keep the first 8 source bytes}
  mov     ebx, edx                 {EBX = original Dest}
  {Align the writes on a QWORD boundary}
  add     eax, ecx                 {EAX = Source+Count}
  add     ecx, edx                 {ECX = Dest+Count}
  add     edx, 7
  and     edx, -8                  {EDX = Dest rounded up to a multiple of 8}
  sub     ecx, edx                 {ECX = bytes from the aligned Dest to the end}
  add     edx, ecx                 {EDX = Dest+Count}
  sub     ecx, 32
  neg     ecx                      {ECX = 32 - bytes left; loop runs while <= 0}
@Copy32:
  movq    mm1, [eax+ecx-32]
  movq    mm2, [eax+ecx-24]
  movq    mm3, [eax+ecx-16]
  movq    mm4, [eax+ecx-8]
  movq    [edx+ecx-32], mm1
  movq    [edx+ecx-24], mm2
  movq    [edx+ecx-16], mm3
  movq    [edx+ecx-8], mm4
  add     ecx, 32
  jle     @Copy32
  movq    [ebx], mm0               {now write the saved first 8 bytes}
  emms
  pop     ebx
  neg     ecx
  add     ecx, 32                  {ECX = bytes still to copy}
  jmp     SmallForwardMove_3

@BigMove:
  push    ebx
  mov     ebx, ecx                 {EBX = Count}
  test    edx, 15
  jz      @DestAligned
  {Copy the few bytes needed to bring Dest onto a 16-byte boundary}
  mov     ecx, edx
  neg     ecx
  and     ecx, 15                  {ECX = distance from Dest to the next boundary}
  add     eax, ecx
  add     edx, ecx
  sub     ebx, ecx
  call    SmallForwardMove_3       {takes end pointers; destroys ECX}
@DestAligned:
  mov     ecx, ebx
  and     ecx, -16
  sub     ebx, ecx                 {EBX = remainder below 16}
  push    esi
  push    edi
  mov     esi, eax                 {ESI = Source}
  mov     edi, edx                 {EDI = Dest}
  mov     eax, ecx
  and     eax, -64                 {EAX = bytes moved by the 64-byte loop}
  and     ecx, $3F                 {ECX = bytes left over from that loop}
  add     esi, eax
  add     edi, eax
  shr     eax, 3                   {EAX = QWORD count ...}
  neg     eax                      {... negated, counting up to zero}
@Copy64:
  movq    mm0, [esi+eax*8]
  movq    mm1, [esi+eax*8+8]
  movq    mm2, [esi+eax*8+16]
  movq    mm3, [esi+eax*8+24]
  movq    mm4, [esi+eax*8+32]
  movq    mm5, [esi+eax*8+40]
  movq    mm6, [esi+eax*8+48]
  movq    mm7, [esi+eax*8+56]
  movq    [edi+eax*8], mm0
  movq    [edi+eax*8+8], mm1
  movq    [edi+eax*8+16], mm2
  movq    [edi+eax*8+24], mm3
  movq    [edi+eax*8+32], mm4
  movq    [edi+eax*8+40], mm5
  movq    [edi+eax*8+48], mm6
  movq    [edi+eax*8+56], mm7
  add     eax, 8
  jnz     @Copy64
  emms                             {leave MMX state}
{$ifdef FPC_ENABLED_CLD}
  cld
{$endif FPC_ENABLED_CLD}
  add     ecx, ebx                 {ECX = all bytes that are still left}
  shr     ecx, 2
  rep     movsd                    {whole DWORDs}
  mov     ecx, ebx
  and     ecx, 3
  rep     movsb                    {last 0..3 bytes}
  pop     edi
  pop     esi
  pop     ebx
end; {Forwards_MMX}
564
565{-------------------------------------------------------------------------}
566{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
procedure Backwards_MMX_3;assembler;nostackframe;
{ Descending copy of ECX bytes from [EAX] to [EDX] using MMX registers.
  Counts below 72 are delegated to Backwards_IA32_3. Otherwise the last
  8 source bytes are saved first, 32 bytes are moved per iteration with
  QWORD-aligned writes, and the bytes left at the start go through
  SmallBackwardMove_3. EBX is saved and restored. }
asm
  cmp     ecx, 72                  {below this size MMX is not worthwhile}
  jl      Backwards_IA32_3
  push    ebx
  movq    mm0, [eax+ecx-8]         {keep the last 8 source bytes}
  {Align the writes on a QWORD boundary}
  lea     ebx, [edx+ecx]           {EBX = Dest+Count}
  and     ebx, 7                   {EBX = bytes above the last aligned QWORD}
  sub     ecx, ebx                 {ECX = length up to that aligned boundary}
  add     ebx, ecx                 {EBX = original Count}
  sub     ecx, 32
@Copy32:
  movq    mm1, [eax+ecx]
  movq    mm2, [eax+ecx+8]
  movq    mm3, [eax+ecx+16]
  movq    mm4, [eax+ecx+24]
  movq    [edx+ecx+24], mm4
  movq    [edx+ecx+16], mm3
  movq    [edx+ecx+8], mm2
  movq    [edx+ecx], mm1
  sub     ecx, 32
  jge     @Copy32
  movq    [edx+ebx-8], mm0         {now write the saved last 8 bytes}
  emms
  pop     ebx
  add     ecx, 32                  {ECX = bytes still to copy at the start}
  jmp     SmallBackwardMove_3
end; {Backwards_MMX}
597
598{$ifndef FASTMOVE_DISABLE_SSE3}
599{-------------------------------------------------------------------------}
{Dest MUST be 16-Byte Aligned, Count MUST be a multiple of 16}
procedure AlignedFwdMoveSSE_3(const Source; var Dest; Count: Integer);assembler;nostackframe;
{ Ascending SSE copy in 128-byte blocks, followed by a 16-byte-per-step loop
  for what is left.
  In: EAX = @Source, EDX = @Dest, ECX = Count.
  Every store is an aligned store (MOVAPS/MOVNTPS), so Dest has to be 16-byte
  aligned. Loads use MOVAPS when Source is aligned too, MOVUPS otherwise.
  When the block part exceeds 256K the copy prefetches ahead and uses
  non-temporal stores, finished with SFENCE.
  ESI is saved and restored; EAX, ECX and EDX are modified.
  NOTE(review): each block loop runs its body before testing the counter, so
  Count is expected to be at least 128 - confirm against callers. }
const
  PREFETCH_AHEAD = 512;
asm
  push    esi
  mov     esi, eax                 {ESI = Source}
  mov     eax, ecx
  and     eax, -128                {EAX = bytes handled as 128-byte blocks}
  add     esi, eax
  add     edx, eax
  shr     eax, 3                   {EAX = QWORD count ...}
  neg     eax                      {... negated, counting up to zero}
  cmp     eax, -(32*1024)          {more than 256K of blocks?}
  jl      @Streaming

  {Up to 256K: ordinary stores}
  test    esi, 15
  jnz     @PlainUnalignedLoop
@PlainAlignedLoop:                 {Source and Dest both 16-byte aligned}
  movaps  xmm0, [esi+8*eax]
  movaps  xmm1, [esi+8*eax+16]
  movaps  xmm2, [esi+8*eax+32]
  movaps  xmm3, [esi+8*eax+48]
  movaps  [edx+8*eax], xmm0
  movaps  [edx+8*eax+16], xmm1
  movaps  [edx+8*eax+32], xmm2
  movaps  [edx+8*eax+48], xmm3
  movaps  xmm4, [esi+8*eax+64]
  movaps  xmm5, [esi+8*eax+80]
  movaps  xmm6, [esi+8*eax+96]
  movaps  xmm7, [esi+8*eax+112]
  movaps  [edx+8*eax+64], xmm4
  movaps  [edx+8*eax+80], xmm5
  movaps  [edx+8*eax+96], xmm6
  movaps  [edx+8*eax+112], xmm7
  add     eax, 16
  js      @PlainAlignedLoop
  jmp     @Leftover
@PlainUnalignedLoop:               {Source not 16-byte aligned}
  movups  xmm0, [esi+8*eax]
  movups  xmm1, [esi+8*eax+16]
  movups  xmm2, [esi+8*eax+32]
  movups  xmm3, [esi+8*eax+48]
  movaps  [edx+8*eax], xmm0
  movaps  [edx+8*eax+16], xmm1
  movaps  [edx+8*eax+32], xmm2
  movaps  [edx+8*eax+48], xmm3
  movups  xmm4, [esi+8*eax+64]
  movups  xmm5, [esi+8*eax+80]
  movups  xmm6, [esi+8*eax+96]
  movups  xmm7, [esi+8*eax+112]
  movaps  [edx+8*eax+64], xmm4
  movaps  [edx+8*eax+80], xmm5
  movaps  [edx+8*eax+96], xmm6
  movaps  [edx+8*eax+112], xmm7
  add     eax, 16
  js      @PlainUnalignedLoop
  jmp     @Leftover

@Streaming:                        {More than 256K: prefetch + non-temporal stores}
  test    esi, 15
  jnz     @StreamUnalignedLoop
@StreamAlignedLoop:                {Source and Dest both 16-byte aligned}
  prefetchnta  [esi+8*eax+PREFETCH_AHEAD]
  prefetchnta  [esi+8*eax+PREFETCH_AHEAD+64]
  movaps  xmm0, [esi+8*eax]
  movaps  xmm1, [esi+8*eax+16]
  movaps  xmm2, [esi+8*eax+32]
  movaps  xmm3, [esi+8*eax+48]
  movntps [edx+8*eax], xmm0
  movntps [edx+8*eax+16], xmm1
  movntps [edx+8*eax+32], xmm2
  movntps [edx+8*eax+48], xmm3
  movaps  xmm4, [esi+8*eax+64]
  movaps  xmm5, [esi+8*eax+80]
  movaps  xmm6, [esi+8*eax+96]
  movaps  xmm7, [esi+8*eax+112]
  movntps [edx+8*eax+64], xmm4
  movntps [edx+8*eax+80], xmm5
  movntps [edx+8*eax+96], xmm6
  movntps [edx+8*eax+112], xmm7
  add     eax, 16
  js      @StreamAlignedLoop
  sfence
  jmp     @Leftover
@StreamUnalignedLoop:              {Source not 16-byte aligned}
  prefetchnta  [esi+8*eax+PREFETCH_AHEAD]
  prefetchnta  [esi+8*eax+PREFETCH_AHEAD+64]
  movups  xmm0, [esi+8*eax]
  movups  xmm1, [esi+8*eax+16]
  movups  xmm2, [esi+8*eax+32]
  movups  xmm3, [esi+8*eax+48]
  movntps [edx+8*eax], xmm0
  movntps [edx+8*eax+16], xmm1
  movntps [edx+8*eax+32], xmm2
  movntps [edx+8*eax+48], xmm3
  movups  xmm4, [esi+8*eax+64]
  movups  xmm5, [esi+8*eax+80]
  movups  xmm6, [esi+8*eax+96]
  movups  xmm7, [esi+8*eax+112]
  movntps [edx+8*eax+64], xmm4
  movntps [edx+8*eax+80], xmm5
  movntps [edx+8*eax+96], xmm6
  movntps [edx+8*eax+112], xmm7
  add     eax, 16
  js      @StreamUnalignedLoop
  sfence

@Leftover:
  and     ecx, $7F                 {ECX = bytes after the last full 128-byte block}
  jz      @Finished
  add     esi, ecx
  add     edx, ecx
  neg     ecx                      {count up to zero in steps of 16}
@LeftoverLoop:
  movups  xmm0, [esi+ecx]
  movaps  [edx+ecx], xmm0
  add     ecx, 16
  jnz     @LeftoverLoop
@Finished:
  pop     esi
end; {AlignedFwdMoveSSE}
724
725{-------------------------------------------------------------------------}
{Move ECX Bytes from EAX to EDX, where EAX > EDX or the regions do not overlap, and ECX > 36 (SMALLMOVESIZE)}
procedure Forwards_SSE_3;assembler;nostackframe;
{ Ascending copy of ECX bytes from [EAX] to [EDX] using SSE registers.
    Count <= SMALLMOVESIZE+32 : first 32 bytes via two XMM registers, the rest
                                through SmallForwardMove_3.
    Count < LARGESIZE         : first 16 bytes saved, then 32 bytes per
                                iteration with 16-byte aligned writes,
                                remainder through SmallForwardMove_3.
    Count >= LARGESIZE        : Dest is brought to a 16-byte boundary, the
                                multiple-of-16 part goes to
                                AlignedFwdMoveSSE_3, the remainder through
                                SmallForwardMove_3.
  EBX is saved and restored where it is used. }
const
  LARGESIZE = 2048;
asm
  cmp     ecx, LARGESIZE
  jge     @BigMove
  cmp     ecx, SMALLMOVESIZE+32
  movups  xmm0, [eax]              {first 16 source bytes; MOVUPS keeps the flags}
  jg      @MidMove
  {At most SMALLMOVESIZE+32 bytes}
  movups  xmm1, [eax+16]
  movups  [edx], xmm0
  movups  [edx+16], xmm1
  add     eax, ecx                 {EAX = Source+Count}
  add     edx, ecx                 {EDX = Dest+Count}
  sub     ecx, 32                  {ECX = bytes after the first 32}
  jmp     SmallForwardMove_3

@MidMove:
  push    ebx
  mov     ebx, edx                 {EBX = original Dest}
  {Align the writes on a 16-byte boundary}
  add     eax, ecx                 {EAX = Source+Count}
  add     ecx, edx                 {ECX = Dest+Count}
  add     edx, 15
  and     edx, -16                 {EDX = Dest rounded up to a multiple of 16}
  sub     ecx, edx                 {ECX = bytes from the aligned Dest to the end}
  add     edx, ecx                 {EDX = Dest+Count}
  sub     ecx, 32
  neg     ecx                      {ECX = 32 - bytes left; loop runs while <= 0}
@Copy32:
  movups  xmm1, [eax+ecx-32]
  movups  xmm2, [eax+ecx-16]
  movaps  [edx+ecx-32], xmm1
  movaps  [edx+ecx-16], xmm2
  add     ecx, 32
  jle     @Copy32
  movups  [ebx], xmm0              {now write the saved first 16 bytes}
  pop     ebx
  neg     ecx
  add     ecx, 32                  {ECX = bytes still to copy}
  jmp     SmallForwardMove_3

@BigMove:
  push    ebx
  mov     ebx, ecx                 {EBX = Count}
  test    edx, 15
  jz      @DestAligned             {ECX still holds Count on this path}
  {Copy the few bytes needed to bring Dest onto a 16-byte boundary}
  mov     ecx, edx
  neg     ecx
  and     ecx, 15                  {ECX = distance from Dest to the next boundary}
  add     eax, ecx
  add     edx, ecx
  sub     ebx, ecx
  call    SmallForwardMove_3       {takes end pointers; destroys ECX}
  mov     ecx, ebx
@DestAligned:
  and     ecx, -16                 {ECX = multiple-of-16 part}
  sub     ebx, ecx                 {EBX = remainder below 16}
  push    edx                      {AlignedFwdMoveSSE_3 modifies EAX, EDX, ECX}
  push    eax
  push    ecx
  call    AlignedFwdMoveSSE_3
  pop     ecx
  pop     eax
  pop     edx
  add     ecx, ebx
  add     eax, ecx                 {EAX = end of Source}
  add     edx, ecx                 {EDX = end of Dest}
  mov     ecx, ebx                 {ECX = remainder}
  pop     ebx
  jmp     SmallForwardMove_3
end; {Forwards_SSE}
801
802{-------------------------------------------------------------------------}
803{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)}
procedure Backwards_SSE_3;assembler;nostackframe;
{ Descending copy of ECX bytes from [EAX] to [EDX] using SSE registers.
    Count <= SMALLMOVESIZE+32 : last 32 bytes via two XMM registers, the rest
                                through SmallBackwardMove_3.
    otherwise                 : last 16 bytes saved, then 32 bytes per
                                iteration with 16-byte aligned writes, the
                                bytes left at the start through
                                SmallBackwardMove_3.
  EBX is saved and restored where it is used. }
asm
  cmp     ecx, SMALLMOVESIZE+32
  jg      @MidMove
  {At most SMALLMOVESIZE+32 bytes}
  sub     ecx, 32                  {ECX = bytes before the last 32}
  movups  xmm1, [eax+ecx]
  movups  xmm2, [eax+ecx+16]
  movups  [edx+ecx], xmm1
  movups  [edx+ecx+16], xmm2
  jmp     SmallBackwardMove_3

@MidMove:
  push    ebx
  movups  xmm0, [eax+ecx-16]       {keep the last 16 source bytes}
  {Align the writes on a 16-byte boundary}
  lea     ebx, [edx+ecx]           {EBX = Dest+Count}
  and     ebx, 15                  {EBX = bytes above the last aligned boundary}
  sub     ecx, ebx                 {ECX = length up to that boundary}
  add     ebx, ecx                 {EBX = original Count}
  sub     ecx, 32
@Copy32:
  movups  xmm1, [eax+ecx]
  movups  xmm2, [eax+ecx+16]
  movaps  [edx+ecx], xmm1
  movaps  [edx+ecx+16], xmm2
  sub     ecx, 32
  jge     @Copy32
  movups  [edx+ebx-16], xmm0       {now write the saved last 16 bytes}
  pop     ebx
  add     ecx, 32                  {ECX = bytes still to copy at the start}
  jmp     SmallBackwardMove_3
end; {Backwards_SSE}
836{$endif ndef FASTMOVE_DISABLE_SSE3}
837
const
  { Routines Move jumps to for counts above SMALLMOVESIZE.
    They start out as the plain IA32 versions. }
  fastmoveproc_forward  : pointer = @Forwards_IA32_3;
  fastmoveproc_backward : pointer = @Backwards_IA32_3;
841
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
{ Overlap-safe copy of count bytes from source to dest.
  In: EAX = @source, EDX = @dest, ECX = count.
  Nothing is copied when count <= 0 or when source and dest are the same address.
  Counts up to SMALLMOVESIZE are handled by SmallForwardMove_3 /
  SmallBackwardMove_3; larger counts jump through fastmoveproc_forward /
  fastmoveproc_backward.
  Addresses are compared as UNSIGNED values. A signed comparison chooses the
  wrong copy direction whenever one address is at or above 80000000h and the
  other one is below it, which corrupts overlapping moves. }
asm
  cmp     ecx,SMALLMOVESIZE
  ja      @Large               {unsigned compare: a negative Count lands here too}
  cmp     eax,edx
  lea     eax,[eax+ecx]        {EAX = Source+Count; LEA leaves the flags alone}
  jbe     @SmallCheck          {Source <= Dest (unsigned)}
@SmallForward:
  add     edx,ecx              {EDX = Dest+Count}
  jmp     SmallForwardMove_3
@SmallCheck:
  je      @Done {For Compatibility with Delphi's move for Source = Dest}
  sub     eax,ecx              {back to EAX = Source}
  jmp     SmallBackwardMove_3
@Large:
  jng     @Done {For Compatibility with Delphi's move for Count < 0}
  cmp     eax,edx
  ja      @moveforward         {Source > Dest (unsigned): ascending copy is safe}
  je      @Done {For Compatibility with Delphi's move for Source = Dest}
  {Source < Dest: the regions overlap exactly when Dest-Source < Count.
   Working on the difference needs no scratch register and cannot wrap.}
  sub     edx,eax              {EDX = Dest-Source}
  cmp     edx,ecx
  lea     edx,[edx+eax]        {restore EDX = Dest; LEA leaves the flags alone}
  jb      @movebackward
@moveforward:
  jmp     dword ptr fastmoveproc_forward
@movebackward:
  jmp     dword ptr fastmoveproc_backward {Source/Dest Overlap}
@Done:
end;
872
873{$asmmode att}
{$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
var
  { External flag (symbol '__fpc_valgrind') read by setup_fastmove. }
  valgrind_used : boolean;external name '__fpc_valgrind';
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}

{ Selects the routines behind fastmoveproc_forward / fastmoveproc_backward:
  the valgrind-safe pair when the valgrind flag is set, otherwise the SSE pair
  when has_sse_support is true (unless FASTMOVE_DISABLE_SSE3 is defined),
  otherwise the MMX pair when has_mmx_support is true. If none applies the
  pointers are left unchanged. }
procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
  var
    under_valgrind : boolean;
  begin
    { workaround valgrind bug }
{$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
    under_valgrind:=EntryInformation.valgrind_used;
{$else FPC_HAS_INDIRECT_ENTRY_INFORMATION}
    under_valgrind:=valgrind_used;
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
    if under_valgrind then
      begin
        fastmoveproc_forward:=@Forwards_Valgrind;
        fastmoveproc_backward:=@Backwards_Valgrind;
      end
{$ifndef FASTMOVE_DISABLE_SSE3}
    else if has_sse_support then
      begin
        fastmoveproc_forward:=@Forwards_SSE_3;
        fastmoveproc_backward:=@Backwards_SSE_3;
      end
{$endif ndef FASTMOVE_DISABLE_SSE3}
    else if has_mmx_support then
      begin
        fastmoveproc_forward:=@Forwards_MMX_3;
        fastmoveproc_backward:=@Backwards_MMX_3;
      end;
  end;
904
905{$endif  FPC_SYSTEM_HAS_MOVE}
906
907{$endif}
908