/*
 * memcpy - copy memory area
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#include "../asmdefs.h"

/* Register roles.  x0-x2 are the AAPCS64 argument registers:
   dstin/src are preserved unmodified so the tail stores and the
   return value can use them; dst/src are the moving cursors.  */
#define dstin	x0	/* original dst (also the return value)  */
#define src	x1	/* current source pointer  */
#define count	x2	/* byte count (adjusted in the long paths)  */
#define dst	x3	/* current destination pointer  */
#define srcend	x4	/* src + count  */
#define dstend	x5	/* dstin + count  */
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

/* SIMD registers used as 16-byte copy buffers; all are caller-saved
   under AAPCS64, so no save/restore is needed.  */
#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.  */

ENTRY (__memcpy_aarch64_simd)
ENTRY_ALIAS (__memmove_aarch64_simd)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  For 16..32 bytes, two possibly
	   overlapping 16-byte moves (head from src, tail from srcend-16)
	   cover the whole range branchlessly.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  Bit 3 of count set means count >= 8 here,
	   so overlapping head/tail 8-byte moves cover it.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes using the same overlapping head/tail trick
	   with 4-byte moves.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence: write the first,
	   middle (src + count/2) and last bytes.  For counts 1-3 these
	   three (possibly coincident) bytes cover the whole range.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  Load the first and last 32 bytes
	   up front; for <= 64 bytes those two (overlapping) stores are
	   the whole copy.  */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  The middle 32 bytes (E/F) are always
	   needed; G/H from the end are only needed above 96 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  The unsigned
	   compare of (dstin - src) against count is true only when dst
	   lies inside [src, src + count), i.e. a forward copy would
	   clobber unread source bytes.  */
	sub	tmp1, dstin, src
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align src to 16-byte alignment.
	   dst is adjusted by the same offset so src-dst stays constant;
	   the pipelined loop below then issues only aligned loads.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
	/* Software-pipelined loop: each iteration stores the 64 bytes
	   loaded by the previous one while prefetching the next 64.  */
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.
	   The tail stores may overlap the loop's stores; loading E/F and
	   the final A/B before storing keeps this safe for the
	   non-overlapping (forward) case.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.
	   tmp1 == 0 here means src == dst, so nothing needs copying.  */
L(copy_long_backwards):
	cbz	tmp1, L(copy0)
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

	/* Mirror of L(loop64), walking down from the end so that in the
	   overlapping case each store lands on bytes already read.  */
L(loop64_backwards):
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	stp	C_q, D_q, [dstend, -64]
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	sub	dstend, dstend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
	ret

END (__memcpy_aarch64_simd)