/*
 * Copyright (c) 2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "arm_asm.h"

/* NOTE: This ifdef MUST match the one in aeabi_memcpy.c.  */
#if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \
    (defined (__ARM_NEON__) || !defined (__SOFTFP__))

        .syntax unified
        .global __aeabi_memcpy
        .type   __aeabi_memcpy, %function
__aeabi_memcpy:
        /* Assumes that n >= 0 and that dst and src are valid pointers.
           If there are at least 8 bytes to copy, use LDRD/STRD.
           If src and dst are misaligned with different offsets,
           first copy byte by byte until dst is aligned,
           and then copy using LDRD/STRD and shift if needed.
           When less than 8 bytes are left, copy a word and then the
           remainder byte by byte.  */
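        /* For orientation, a rough C-level sketch of the strategy above.
           This is editorial and illustrative only: it lives in a comment,
           is never assembled, and omits the 64-byte unrolling and
           register-level details.  The name memcpy_sketch is hypothetical,
           and __builtin_memcpy (GCC/Clang) stands in for the unaligned
           LDR/LDRD accesses this file is allowed to make because it is
           guarded by __ARM_FEATURE_UNALIGNED.

               #include <stddef.h>
               #include <stdint.h>

               void *memcpy_sketch(void *dst0, const void *src0, size_t n)
               {
                   unsigned char *dst = dst0;
                   const unsigned char *src = src0;
                   while (n >= 4 && ((uintptr_t)dst & 3))  // align dst to a word
                       { *dst++ = *src++; n--; }
                   while (n >= 8)                          // LDRD/STRD-sized chunks
                       { __builtin_memcpy(dst, src, 8); dst += 8; src += 8; n -= 8; }
                   while (n >= 4)                          // word-sized chunk (LDR/STR)
                       { __builtin_memcpy(dst, src, 4); dst += 4; src += 4; n -= 4; }
                   while (n--)                             // byte tail
                       *dst++ = *src++;
                   return dst0;
               }
        */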
        /* Save registers (r0 holds the return value):
           optimized push {r0, r4, r5, lr}.
           To try and improve performance, the stack layout is changed,
           i.e., the stack does not look the way users expect
           (highest-numbered register at highest address).  */
        push {r0, lr}
        strd r4, r5, [sp, #-8]!

        /* Get copying of tiny blocks out of the way first.  */
        /* Is there at least 4 bytes to copy?  */
        subs r2, r2, #4
        blt copy_less_than_4        /* If n < 4.  */

        /* Check word alignment.  */
        ands ip, r0, #3             /* ip = last 2 bits of dst.  */
        bne dst_not_word_aligned    /* If dst is not word-aligned.  */

        /* Get here if dst is word-aligned.  */
        ands ip, r1, #3             /* ip = last 2 bits of src.  */
        bne src_not_word_aligned    /* If src is not word-aligned.  */
word_aligned:
        /* Get here if both src and dst are word-aligned.
           The number of bytes remaining to copy is r2 + 4.  */

        /* Is there at least 64 bytes to copy?  */
        subs r2, r2, #60
        blt copy_less_than_64       /* If r2 + 4 < 64.  */

        /* First, align the destination buffer to 8 bytes,
           to make sure double loads and stores don't cross a cache line
           boundary, as they are then more expensive even if the data is
           in the cache (they require two load/store issue cycles instead
           of one).  If only one of the buffers is not 8-byte aligned,
           then it's more important to align dst than src,
           because there is more penalty for stores
           than loads that cross a cache line boundary.
           This check and realignment are only worth doing
           if there is a lot to copy.  */

        /* Get here if dst is word-aligned,
           i.e., the 2 least significant bits are 0.
           If dst is not 2w-aligned (i.e., the 3rd bit of dst is set),
           then copy 1 word (4 bytes) to make it so.  */
        ands r3, r0, #4
        beq two_word_aligned        /* If dst is already two-word aligned.  */
        ldr r3, [r1], #4
        str r3, [r0], #4
        subs r2, r2, #4
        blt copy_less_than_64

two_word_aligned:
        /* TODO: Align to cacheline (useful for PLD optimization).  */

        /* Every loop iteration copies 64 bytes.  */
1:
        .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
        ldrd r4, r5, [r1, \offset]
        strd r4, r5, [r0, \offset]
        .endr

        add r0, r0, #64
        add r1, r1, #64
        subs r2, r2, #64
        bge 1b                      /* If there is more to copy.  */

copy_less_than_64:

        /* Get here if less than 64 bytes remain to copy, -64 <= r2 < 0.
           Restore the count if there are more than 7 bytes to copy.  */
        adds r2, r2, #56
        blt copy_less_than_8

        /* Copy 8 bytes at a time.  */
2:
        ldrd r4, r5, [r1], #8
        strd r4, r5, [r0], #8
        subs r2, r2, #8
        bge 2b                      /* If there is more to copy.  */

copy_less_than_8:

        /* Get here if less than 8 bytes remain to copy, -8 <= r2 < 0.
           Check if there is more to copy.  */
        cmn r2, #8
        beq return                  /* If r2 + 8 == 0.  */

        /* Restore the count if there are more than 3 bytes to copy.  */
        adds r2, r2, #4
        blt copy_less_than_4

        /* Copy 4 bytes.  */
        ldr r3, [r1], #4
        str r3, [r0], #4

copy_less_than_4:
        /* Get here if less than 4 bytes remain to copy, -4 <= r2 < 0.  */

        /* Restore the count, check if there is more to copy.  */
        adds r2, r2, #4
        beq return                  /* If r2 == 0.  */

        /* Get here with r2 in {1, 2, 3} = {01, 10, 11}.  */
        /* Logical shift left r2 by 31, inserting 0s and updating flags.  */
        lsls r2, r2, #31

        /* Copy byte by byte.
           Condition ne means bit 0 of r2 is set, i.e., r2 is 1 or 3.
           Condition cs means bit 1 of r2 is set, i.e., r2 is 2 or 3.  */
        itt ne
        ldrbne r3, [r1], #1
        strbne r3, [r0], #1

        itttt cs
        ldrbcs r4, [r1], #1
        ldrbcs r5, [r1]
        strbcs r4, [r0], #1
        strbcs r5, [r0]
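        /* Editorial worked example of the LSLS trick above, assuming the
           standard ARM flag semantics of LSLS (N and Z set from the result,
           C set from the last bit shifted out):
               r2 = 1 (0b01): result = 0x80000000, Z = 0, C = 0
                              -> ne copies 1 byte, cs copies 0: 1 byte total.
               r2 = 2 (0b10): result = 0,          Z = 1, C = 1
                              -> ne copies 0 bytes, cs copies 2: 2 bytes total.
               r2 = 3 (0b11): result = 0x80000000, Z = 0, C = 1
                              -> ne copies 1 byte, cs copies 2: 3 bytes total.
           In every case exactly r2 bytes are copied without a branch.  */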
return:
        /* Restore registers: optimized pop {r0, r4, r5, pc}.  */
        ldrd r4, r5, [sp], #8
        pop {r0, pc}                /* This is the only return point of memcpy.  */

dst_not_word_aligned:

        /* Get here when dst is not word-aligned and ip holds the last 2
           bits of dst, i.e., ip is the offset of dst from a word boundary.
           The number of bytes remaining to copy is r2 + 4,
           i.e., there are at least 4 bytes to copy.
           Write a partial word (1 to 3 bytes), such that dst becomes
           word-aligned.  */

        /* If dst is at an offset of ip bytes from a word (with 0 < ip < 4),
           then there are (4 - ip) bytes to fill to align dst to the next
           word.  */
        rsb ip, ip, #4              /* ip = 4 - ip.  */
        cmp ip, #2

        /* Copy byte by byte with conditionals.  */
        itt gt
        ldrbgt r3, [r1], #1
        strbgt r3, [r0], #1

        itt ge
        ldrbge r4, [r1], #1
        strbge r4, [r0], #1

        ldrb lr, [r1], #1
        strb lr, [r0], #1

        /* Update the count.
           ip holds the number of bytes we have just copied.  */
        subs r2, r2, ip             /* r2 = r2 - ip.  */
        blt copy_less_than_4        /* If r2 < ip.  */

        /* Get here if there are at least 4 bytes to copy.
           Check if src is aligned.  If beforehand src and dst were not
           word-aligned but congruent (same offset), then now they are both
           word-aligned, and we can copy the rest efficiently (without
           shifting).  */
        ands ip, r1, #3             /* ip = last 2 bits of src.  */
        beq word_aligned            /* If r1 is word-aligned.  */

src_not_word_aligned:
        /* Get here when src is not word-aligned, but dst is word-aligned.
           The number of bytes remaining to copy is r2 + 4.  */

        /* Copy word by word using LDR, relying on the hardware to handle
           the misalignment, i.e., SCTLR.A is clear (alignment checking
           disabled), so unaligned accesses in LDR and STR are supported.  */
        subs r2, r2, #60
        blt 8f

7:
        /* Copy 64 bytes in every loop iteration.  */
        .irp offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
        ldr r3, [r1, \offset]
        str r3, [r0, \offset]
        .endr

        add r0, r0, #64
        add r1, r1, #64
        subs r2, r2, #64
        bge 7b

8:
        /* Get here if less than 64 bytes remain to copy, -64 <= r2 < 0.
           Check if there are more than 3 bytes to copy.  */
        adds r2, r2, #60
        blt copy_less_than_4

9:
        /* Get here if there are less than 64 but at least 4 bytes to copy,
           where the number of bytes to copy is r2 + 4.  */
        ldr r3, [r1], #4
        str r3, [r0], #4
        subs r2, r2, #4
        bge 9b

        b copy_less_than_4


        .syntax unified
        .global __aeabi_memcpy4
        .type   __aeabi_memcpy4, %function
__aeabi_memcpy4:
        /* Assumes that both of its arguments are 4-byte aligned.  */

        push {r0, lr}
        strd r4, r5, [sp, #-8]!

        /* Is there at least 4 bytes to copy?  */
        subs r2, r2, #4
        blt copy_less_than_4        /* If n < 4.  */

        bl word_aligned

        .syntax unified
        .global __aeabi_memcpy8
        .type   __aeabi_memcpy8, %function
__aeabi_memcpy8:
        /* Assumes that both of its arguments are 8-byte aligned.  */

        push {r0, lr}
        strd r4, r5, [sp, #-8]!

        /* Is there at least 4 bytes to copy?  */
        subs r2, r2, #4
        blt copy_less_than_4        /* If n < 4.  */

        /* Is there at least 8 bytes to copy?  */
        subs r2, r2, #4
        blt copy_less_than_8        /* If n < 8.  */

        /* Is there at least 64 bytes to copy?  */
        subs r2, r2, #56
        blt copy_less_than_64       /* If n < 64.  */

        bl two_word_aligned

#endif
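/* Editorial usage note: __aeabi_memcpy, __aeabi_memcpy4 and __aeabi_memcpy8
   are the memcpy-like helpers defined by the Run-time ABI for the ARM
   Architecture (RTABI).  Compilers may call them directly, e.g. for struct
   assignment, using the 4/8 variants when both pointers are known to be
   suitably aligned.  A hypothetical C fragment such as:

       typedef struct { int a, b, c, d; } quad;   // naturally 4-byte aligned
       void copy_quad(quad *d, const quad *s) { *d = *s; }

   may therefore be lowered to a call to __aeabi_memcpy4 rather than to
   memcpy.  */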