/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include <machine/asm.h>

#define L(l) .L ## l

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.  */
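/* Illustrative outline (not part of the original source): the dispatch
   below behaves like the following C-style sketch, with memmove and
   memcpy sharing one body and the overlap test confined to the large
   path.  Helper names are shorthand for the labelled sections below.
   The small and medium paths need no overlap check because they perform
   all loads before any store.

	void *memcpy(void *dstin, const void *src, size_t count)
	{
		if (count > 128) {		// large copies
			if (dstin == src)
				return dstin;
			if ((uintptr_t)dstin - (uintptr_t)src < count)
				copy_long_backwards();	// forward copy would clobber
							// unread source bytes
			else
				copy_long_forwards();	// 64 bytes per iteration
		} else if (count > 32) {
			copy32_128();		// four to eight 16-byte pairs
		} else {
			copy0_32();		// branchless size ladder
		}
		return dstin;
	}

   The unsigned compare (dstin - src) < count decides the direction in
   one test: a destination above and clear of the source yields a
   difference >= count, and a destination below the source wraps around
   to a huge value, so both safe cases take the forward path.  */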
EENTRY(memmove)
ENTRY(memcpy)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dstend to 16-byte alignment.  */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret
END(memcpy)
EEND(memmove)
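/* Worked examples (illustrative notes, not part of the original source).

   Branchless 0..3 byte copy at L(copy4): with tmp1 = count / 2, the
   three byte accesses touch src[0], src[count/2] and src[count - 1] and
   the mirrored destination offsets, so the smaller sizes simply overlap:

	count = 1: tmp1 = 0; all three accesses hit byte 0
	count = 2: tmp1 = 1; the accesses hit bytes 0, 1 and 1
	count = 3: tmp1 = 1; the accesses hit bytes 0, 1 and 2

   Redundant stores rewrite the same byte, so every size is handled
   without a per-size branch (count == 0 is caught by the cbz).

   Forward alignment in L(copy_long): for example, dstin = 0x1003 gives
   tmp1 = 3 and dst = 0x1000; src and count are biased by the same 3
   bytes, and after the initial 16-byte store at dstin the loop stores
   only at [dst, 16] and beyond, i.e. at 16-byte-aligned addresses.  The
   biased count equals dstend - dst, which overstates the loop's
   remaining work by exactly the 16 bytes already covered, hence the
   "Count is now 16 too large" comment.  */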