/*
 * memcpy - copy memory area
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "../asmdefs.h"

/* Register roles.  Arguments follow AAPCS64: x0 = destination, x1 = source,
   x2 = byte count.  x0 must survive as the return value, so the forward loop
   advances through the scratch pointer "dst" (x3) instead of x0.  */
#define dstin	x0	/* Original destination; also the return value.  */
#define src	x1
#define count	x2
#define dst	x3	/* Working dst pointer; 16-byte aligned in the loops.  */
#define srcend	x4	/* src + count (one past the last source byte).  */
#define dstend	x5	/* dstin + count.  */
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
/* The G and H pairs deliberately alias argument registers.  At every point
   where G/H are written (the 97..128 byte path and the backward-copy tail)
   the aliased values (count, dst, src, srcend) are dead or have already been
   fully consumed, so no extra registers are needed.  */
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14	/* Aliases E_l; E is only loaded after tmp1's last use.  */

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.  */

ENTRY (__memcpy_aarch64)
ENTRY_ALIAS (__memmove_aarch64)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  Here count is 16..32: copy 16 bytes
	   from each end; the two stores overlap when count < 32, which is
	   harmless.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes: count is 0..15 on entry; bit 3 set means 8..15,
	   handled with two (possibly overlapping) 8-byte accesses.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes: bit 2 set means 4..7, handled with two
	   (possibly overlapping) 4-byte accesses.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence: copy the first byte,
	   the middle byte (offset count/2) and the last byte.  For count 1
	   or 2 some of the three accesses coincide, which is harmless.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1		/* tmp1 = count / 2 (middle offset).  */
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  Load 32 bytes from each end up
	   front; for 33..64 bytes these four pairs (overlapping in the
	   middle) cover everything.  */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes: add 32 more bytes from the start (E, F), and
	   for 97..128 bytes a further 32 from the end (G, H).  G/H reuse
	   count/dst/src/srcend, all of which are dead here (note H is loaded
	   last, after the final read of srcend).  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap: an unsigned
	   (dstin - src) < count means dst lies inside [src, src + count),
	   so a forward copy would clobber not-yet-read source bytes.
	   dst == src is a no-op; return immediately.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  dst is
	   rounded DOWN (the D store below covers the bytes before the first
	   aligned boundary); src is biased by the same amount so that
	   [dst, 16] and [src, 16] stay in lockstep.  */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

	/* Software-pipelined: each iteration stores the 64 bytes loaded on
	   the previous one (A..D) while loading the next 64.  */
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  The
	   tail loads (E, A, B, C from srcend) may overlap the in-flight
	   A..D stores' range; loading before those stores is safe because
	   this is the forward (non-overlapping) path.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dstend to 16-byte alignment
	   (mirror image of the forward setup: the D store covers the
	   trailing unaligned bytes, srcend is biased to stay in lockstep).  */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

	/* Same software pipeline as L(loop64), descending: store the
	   previously loaded 64 bytes while loading the next 64 below.  */
L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.
	   G_l reuses count/dst, which are dead once the loop exits.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (__memcpy_aarch64)