1;------------------------------------------------------------------------------
2;
3; Copyright (c) 2006, Intel Corporation. All rights reserved.<BR>
4; SPDX-License-Identifier: BSD-2-Clause-Patent
5;
6; Module Name:
7;
8;   CopyMem.nasm
9;
10; Abstract:
11;
12;   CopyMem function
13;
14; Notes:
15;
16;------------------------------------------------------------------------------
17
18    DEFAULT REL
19    SECTION .text
20
21;------------------------------------------------------------------------------
22;  VOID *
23;  EFIAPI
24;  InternalMemCopyMem (
25;    IN VOID   *Destination,
26;    IN VOID   *Source,
27;    IN UINTN  Count
28;    );
29;------------------------------------------------------------------------------
30global ASM_PFX(InternalMemCopyMem)
31ASM_PFX(InternalMemCopyMem):
32    push    rsi
33    push    rdi
34    mov     rsi, rdx                    ; rsi <- Source
35    mov     rdi, rcx                    ; rdi <- Destination
36    lea     r9, [rsi + r8 - 1]          ; r9 <- Last byte of Source
37    cmp     rsi, rdi
38    mov     rax, rdi                    ; rax <- Destination as return value
39    jae     .0                          ; Copy forward if Source > Destination
40    cmp     r9, rdi                     ; Overlapped?
41    jae     @CopyBackward               ; Copy backward if overlapped
42.0:
43    xor     rcx, rcx
44    sub     rcx, rdi                    ; rcx <- -rdi
45    and     rcx, 15                     ; rcx + rsi should be 16 bytes aligned
46    jz      .1                          ; skip if rcx == 0
47    cmp     rcx, r8
48    cmova   rcx, r8
49    sub     r8, rcx
50    rep     movsb
51.1:
52    mov     rcx, r8
53    and     r8, 15
54    shr     rcx, 4                      ; rcx <- # of DQwords to copy
55    jz      @CopyBytes
56    movdqa  [rsp + 0x18], xmm0           ; save xmm0 on stack
57.2:
58    movdqu  xmm0, [rsi]                 ; rsi may not be 16-byte aligned
59    movntdq [rdi], xmm0                 ; rdi should be 16-byte aligned
60    add     rsi, 16
61    add     rdi, 16
62    loop    .2
63    mfence
64    movdqa  xmm0, [rsp + 0x18]           ; restore xmm0
65    jmp     @CopyBytes                  ; copy remaining bytes
66@CopyBackward:
67    mov     rsi, r9                     ; rsi <- Last byte of Source
68    lea     rdi, [rdi + r8 - 1]         ; rdi <- Last byte of Destination
69    std
70@CopyBytes:
71    mov     rcx, r8
72    rep     movsb
73    cld
74    pop     rdi
75    pop     rsi
76    ret
77
78