/*-
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from locore.s.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>

#ifdef LIBC_SCCS
	RCSID("$NetBSD: bcopy.S,v 1.4 2009/11/22 17:25:47 dsl Exp $")
#endif

	/*
	 * (ov)bcopy (src,dst,cnt)
	 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
	 *
	 * Hacked about by dsl@netbsd.org
	 */

/*
 * One source file, three entry points, chosen by the preprocessor:
 *
 *   MEMCOPY defined   memcpy   forward copy only (defines NO_OVERLAP)
 *   MEMMOVE defined   memmove  also copes with overlapping buffers
 *   neither           bcopy    as memmove, but with the two pointer
 *                              arguments in the opposite order
 *
 * Syntax:   AT&T (GAS), passed through the C preprocessor.
 * In:       %rdi, %rsi = buffer pointers, %rdx = length in bytes.
 *           memcpy/memmove receive the destination in %rdi and the
 *           source in %rsi; bcopy receives them the other way round.
 * Out:      memcpy/memmove: %rax = destination pointer.
 *           bcopy: %rax is not written.
 * Clobbers: %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r10, %r11, flags.
 * Stack:    not used.
 * DF:       the forward copies issue no cld, so they rely on the
 *           direction flag being clear on entry.  The backward copies
 *           set it with std and clear it again before returning.
 *
 * Copies of 8 bytes or more are done with 'rep movsq'.  The final
 * 8 source bytes are loaded before anything is stored and are stored
 * last, at dst + len - 8, which takes care of the len % 8 tail.
 */

#if defined(MEMCOPY)
ENTRY(memcpy)
#define NO_OVERLAP
#elif defined(MEMMOVE)
ENTRY(memmove)
#else
ENTRY(bcopy)
#endif
	movq	%rdx,%rcx		/* %rcx = length; turned into a word count below */
#if !defined(MEMCOPY) && !defined(MEMMOVE)
	movq	%rsi,%r11		/* bcopy: destination arrives in %rsi; keep for alignment test */
	xchgq	%rdi,%rsi		/* bcopy: swap so that %rsi = source, %rdi = destination */
#else
	movq	%rdi,%rax		/* return value is the destination pointer */
	movq	%rdi,%r11		/* destination, kept for the alignment test */
#endif

#ifndef NO_OVERLAP
	movq	%rdi,%r8
	subq	%rsi,%r8		/* %r8 = dst - src, compared unsigned with the length later */
#endif

	shrq	$3,%rcx			/* %rcx = number of whole 8-byte words */
	jz	.Lcopy_bytes		/* fewer than 8 bytes in total */

	leaq	-8(%rdi,%rdx),%r9	/* %r9 = where the final 8 bytes are stored */
	movq	-8(%rsi,%rdx),%r10	/* %r10 = final 8 source bytes, read before any store */
#ifndef NO_OVERLAP
	cmpq	%rdx,%r8		/* unsigned (dst - src) < len: dst starts inside the source */
	jb	.Lbackward_words
#endif

/*
 * Forward copy (buffers do not overlap, or the destination is below
 * the source).
 * Original author's notes: newer Intel cpus (Nehalem) will do 16 byte
 * read/write transfers if %ecx is more than 76; AMD might do something
 * similar some day.
 */
	andq	$7,%r11			/* %r11 = dst & 7 */
	jnz	.Lalign_dst		/* destination is not 8-byte aligned */
	rep movsq
	movq	%r10,(%r9)		/* store the saved final word */
	ret

/*
 * Forward copy with a misaligned destination.
 * The first 8 source bytes are saved, both pointers are advanced by
 * 8 - (dst & 7) so that the destination becomes 8-byte aligned, the
 * middle is copied by words, and the saved first and final words are
 * stored afterwards.
 * Original author's notes: AMD say it is better to align the
 * destination (not the source).  This also re-aligns copies whose
 * source and destination are misaligned by the same amount.  (The
 * author believed Nehalem uses its accelerated copy when source and
 * destination have the same alignment.)
 */
.Lalign_dst:
	/*
	 * With a = dst & 7, len - 8 + a bytes remain after the advance and
	 * the last 8 of those belong to the final word, so the number of
	 * words for movsq is ceil((len - 16 + a) / 8) = (len + a - 9) >> 3.
	 */
	leaq	-9(%r11,%rdx),%rcx	/* %rcx = len + a - 9; shifted down to words below */
	negq	%r11			/* %r11 = -a, i.e. -1 .. -7 */
	movq	(%rsi),%rdx		/* %rdx = first 8 source bytes (length is dead now) */
	movq	%rdi,%r8		/* %r8 = original destination, for the first word */
	leaq	8(%rsi,%r11),%rsi	/* src += 8 - a */
	leaq	8(%rdi,%r11),%rdi	/* dst += 8 - a, now 8-byte aligned */
	shrq	$3,%rcx
	rep movsq
	movq	%rdx,(%r8)		/* store the saved first word */
	movq	%r10,(%r9)		/* store the saved final word */
	ret

#ifndef NO_OVERLAP
/*
 * Backward copy by words, used when the destination starts inside the
 * source buffer.
 * The words are counted from the start of the buffers (more likely to
 * be a multiple of 8 than the end); the saved final word supplies the
 * remaining tail bytes.
 * Original author's notes: a hand-coded reverse copy would probably be
 * faster than the string instruction, which needs (IIRC) an extra
 * clock every 3 iterations on AMD, but nothing is expected to care
 * that much.  The big cost is the std/cld pair - reputedly 50+ cycles
 * on Netburst P4.
 */
.Lbackward_words:
	leaq	-8(%rsi,%rcx,8),%rsi	/* last whole source word, counted from the start */
	leaq	-8(%rdi,%rcx,8),%rdi	/* matching destination word */
	std				/* copy downwards */
	rep movsq
	cld				/* restore the direction flag */
	movq	%r10,(%r9)		/* store the saved final 8 bytes */
	ret
#endif

/*
 * Fewer than 8 bytes: copy one byte at a time.
 * Original author's notes: Intel Nehalem optimises 'rep movsb' for
 * <= 7 bytes (9-15 clocks); for longer transfers it is 50+.
 */
.Lcopy_bytes:
	movq	%rdx,%rcx		/* %rcx = byte count, 0 .. 7 */

#ifndef NO_OVERLAP
	cmpq	%rdx,%r8		/* unsigned (dst - src) < len: must copy backwards */
	jb	.Lbackward_bytes
#endif

	/* No harmful overlap: copy forwards. */
	rep movsb
	ret

#ifndef NO_OVERLAP
/* Backward byte copy, starting from the last byte of each buffer. */
.Lbackward_bytes:
	leaq	-1(%rsi,%rcx),%rsi	/* last source byte */
	leaq	-1(%rdi,%rcx),%rdi	/* last destination byte */
	std				/* copy downwards */
	rep movsb
	cld				/* restore the direction flag */
	ret
#endif