/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * Copyright (c) 2015 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>
__FBSDID("$FreeBSD$");

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

/* Register roles.  dstin, src and count are the three incoming arguments.
   dstin (x0) is never used as a destination register anywhere below, so
   it still holds the caller's destination pointer at every ret.  All
   other names are scratch.  */
#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
/* E and F reuse src, count, srcend and dst.  They are only loaded once
   the aliased pointer or length is no longer needed on that path.  */
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
/* tmp1 shares x9 with B_h.  On every path the last use of tmp1 comes
   before the first load into B_h.  */
#define tmp1	x9

#define L(l) .L ## l

/*
 * void *memcpy(void *dstin, const void *src, size_t count)
 *
 * In:       x0 = dstin, x1 = src, x2 = count (bytes)
 * Out:      x0 = dstin (x0 is never written by this routine)
 * Clobbers: x1-x13 and the condition flags
 * Stack:    none; leaf routine that makes no calls
 *
 * ENTRY/END come from <machine/asm.h>, whose definition is not visible
 * in this file.
 */

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled.  Large copies
   of more than 96 bytes align the destination and use an unrolled loop
   processing 64 bytes per iteration.
   Small and medium copies read all data before writing, allowing any
   kind of overlap, and memmove tailcalls memcpy for these cases as
   well as non-overlapping copies.
*/

ENTRY(memcpy)
	prfm	PLDL1KEEP, [src]
	add	srcend, src, count	/* One past the last source byte.  */
	add	dstend, dstin, count	/* One past the last destination byte.  */
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  tmp1 = count - 1 lies in 16..95, so
	   bit 6 is set exactly when count >= 65.  Once that case has branched
	   away, bit 5 is set exactly when count >= 33.  */
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f		/* 17..32: first 16 + last 16 cover it.  */
	/* 33..64: also copy the second 16 and the second-to-last 16 bytes.  */
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
	cmp	count, 8
	b.lo	1f
	/* 8..16 bytes: one 8-byte word from each end; the two words overlap
	   when count < 16.  */
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	/* count < 8 here, so bit 2 set means 4..7 bytes: one 4-byte word
	   from each end.  */
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
1:
	cbz	count, 2f
	lsr	tmp1, count, 1		/* Middle index: 0 for 1 byte, 1 for 2 or 3.  */
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

	.p2align 4
	/* Copy 65..96 bytes (count == 64 is handled by the 33..64 path
	   above).  Copy 64 bytes from the start and 32 bytes from the end.
	   A_l/A_h were already loaded before the branch here.  */
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	/* E overwrites src and count, and F overwrites srcend and dst.  These
	   are the last two loads, and neither LDP uses writeback.  */
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are more than 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */

	.p2align 4
L(copy_long):
	and	tmp1, dstin, 15		/* tmp1 = dstin mod 16.  */
	bic	dst, dstin, 15		/* dst = dstin rounded down to 16.  */
	ldp	D_l, D_h, [src]		/* First 16 bytes, stored unaligned below.  */
	sub	src, src, tmp1		/* Keeps src - dst equal to the original src - dstin.  */
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!	/* Pre-index: src += 64, then load.  */
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	2f
	/* Each pass stores the 64 bytes already held in A..D at the
	   16-byte-aligned addresses dst+16..dst+79 and loads the next 64.  */
1:
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!	/* Pre-index: dst += 64, then store.  */
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
2:
	/* E overwrites src and count; neither is read again.  A, B and C are
	   each reloaded only after their pending store has been issued.  */
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret
END(memcpy)