bits 32

;-----------------------------------------------------------------------
; mmx_memcpy -- forward memory copy using 8-byte MMX moves
;
; C prototype:
;   extern "C" int mmx_memcpy
;     (unsigned char *dest, unsigned char *src, int len);
;
; Syntax: NASM, 32-bit protected-mode code.
; ABI:    arguments are read from the stack above the return address
;         and the routine ends in a plain `ret` (caller pops the args).
; In:     [esp+4]  = dest
;         [esp+8]  = src
;         [esp+12] = len, in bytes
; Out:    nothing meaningful.  popad restores EAX to the value it had
;         on entry, so the `int` result seen by a C caller is whatever
;         EAX held before the call.
; Clobb:  mm0 and EFLAGS.  Every general-purpose register is preserved
;         by the pushad/popad pair.
; Notes:  * len is treated as an unsigned 32-bit count (logical shift),
;           so a negative `int` is interpreted as a very large length.
;         * The copy runs from low to high addresses.
;         * The `rep movsb` tail depends on the direction flag being
;           clear; the routine does not clear it itself
;           (NOTE(review): the usual 32-bit C ABIs guarantee DF = 0 at
;           function entry -- confirm for hand-written asm callers).
;         * emms is executed on every path, including len < 8.
;-----------------------------------------------------------------------

PUSHAD_BYTES    equ     32                      ; 8 registers saved by pushad
RETADDR_BYTES   equ     4                       ; return address pushed by call
ARG_BASE        equ     PUSHAD_BYTES + RETADDR_BYTES
ARG_DEST        equ     ARG_BASE + 0            ; unsigned char *dest
ARG_SRC         equ     ARG_BASE + 4            ; unsigned char *src
ARG_LEN         equ     ARG_BASE + 8            ; int len
CHUNK_BYTES     equ     8                       ; bytes moved per movq
CHUNK_SHIFT     equ     3                       ; log2(CHUNK_BYTES)

section .text

global mmx_memcpy

        align   4

mmx_memcpy:
        pushad                                  ; preserve all GPRs for the caller
        mov     edi, [esp + ARG_DEST]           ; edi = dest
        mov     esi, [esp + ARG_SRC]            ; esi = src
        mov     ecx, [esp + ARG_LEN]            ; ecx = len

        mov     edx, ecx
        and     edx, byte CHUNK_BYTES - 1       ; edx = len % 8 (tail bytes)
        shr     ecx, CHUNK_SHIFT                ; ecx = len / 8; ZF set when zero
        jz      .tail                           ; fewer than 8 bytes: tail only

.chunk_loop:                                    ; invariant: ecx = chunks left, > 0
        movq    mm0, qword [esi]
        movq    qword [edi], mm0
        add     esi, byte CHUNK_BYTES
        add     edi, byte CHUNK_BYTES
        dec     ecx                             ; exit depends on the count only,
        jnz     .chunk_loop                     ; never on ZF from the pointer adds

.tail:
        mov     ecx, edx                        ; 0..7 bytes remain
        rep movsb                               ; esi/edi already point past the chunks

        emms                                    ; release MMX state for x87 FPU code

        popad                                   ; restores EAX as well (see header)
        ret

; --------------------------------------

; When the build defines NASM_STACK_NOEXEC, emit an empty .note.GNU-stack
; section.  GNU toolchains read it as a declaration that this object does
; not need an executable stack.
%ifdef NASM_STACK_NOEXEC
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
47