/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Optimized memcpy() for ARM.
 *
 * Note that memcpy() always returns the destination pointer,
 * so we have to preserve R0.
 */

/*
 * This file has been modified from the original for use in musl libc.
 * The main changes are: addition of .type memcpy,%function to make the
 * code safely callable from thumb mode, adjusting the return
 * instructions to be compatible with pre-thumb ARM cpus, removal of
 * prefetch code that is not compatible with older cpus, and support for
 * building as thumb 2 and big-endian.
 */

.syntax unified

.global memcpy
.type memcpy,%function
memcpy:
        /* The stack must always be 64-bit aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4,
         * which we can use for better pipelining of the reads below.
         */
        .fnstart
        .save {r0, r4, lr}
        stmfd   sp!, {r0, r4, lr}
        /* Make room for r5-r11, which will be spilled later. */
        .pad #28
        sub     sp, sp, #28

        /* It simplifies things to take care of len < 4 early. */
        cmp     r2, #4
        blo     copy_last_3_and_return

        /* Compute the offset to align the source:
         * offset = (4 - (src & 3)) & 3 = -src & 3
         */
        rsb     r3, r1, #0
        ands    r3, r3, #3
        beq     src_aligned

        /* Align the source to 32 bits. We need to insert 2 instructions
         * between a ldr[b|h] and str[b|h] because byte and half-word
         * instructions stall 2 cycles.
         */
        movs    r12, r3, lsl #31
        sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
        ldrbmi  r3, [r1], #1
        ldrbcs  r4, [r1], #1
        ldrbcs  r12, [r1], #1
        strbmi  r3, [r0], #1
        strbcs  r4, [r0], #1
        strbcs  r12, [r0], #1

src_aligned:

        /* See if src and dst are aligned together (congruent). */
        eor     r12, r0, r1
        tst     r12, #3
        bne     non_congruent
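        /* Congruent path: from here on, src and dst have the same alignment
         * modulo 4, so whole words can be copied without any shifting.
         */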
        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* Align the destination to a cache-line. */
        rsb     r3, r0, #0
        ands    r3, r3, #0x1C
        beq     congruent_aligned32
        cmp     r3, r2
        andhi   r3, r2, #0x1C

        /* Conditionally copy 0 to 7 words (length in r3). */
        movs    r12, r3, lsl #28
        ldmcs   r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmi   r1!, {r8, r9}           /*  8 bytes */
        stmcs   r0!, {r4, r5, r6, r7}
        stmmi   r0!, {r8, r9}
        tst     r3, #0x4
        ldrne   r10, [r1], #4           /*  4 bytes */
        strne   r10, [r0], #4
        sub     r2, r2, r3

congruent_aligned32:
        /*
         * Here the source is aligned to 32 bytes.
         */

cached_aligned32:
        subs    r2, r2, #32
        blo     less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going on, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the write-buffer will
         * start dumping its content into memory.
         *
         * While all this is going on, we then load a full cache line into
         * 8 registers; this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         */

        /* Align the preload register to a cache-line because the cpu does
         * "critical word first" (the first word requested is loaded first).
         */
        @ bic   r12, r1, #0x1F
        @ add   r12, r12, #64

1:      ldmia   r1!, {r4-r11}
        subs    r2, r2, #32

        /*
         * NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
         * for ARM9 preload will not be safely guarded by the preceding subs.
         * When it is safely guarded, the only way to get a SIGSEGV here
         * is if the caller overstates the length.
         */
        @ ldrhi r3, [r12], #32          /* cheap ARM9 preload */
        stmia   r0!, {r4-r11}
        bhs     1b

        add     r2, r2, #32

less_than_32_left:
        /*
         * Less than 32 bytes left at this point (length in r2).
         */

        /* Skip all this if there is nothing to do, which should
         * be a common case (if not executed, the code below takes
         * about 16 cycles).
         */
        tst     r2, #0x1F
        beq     1f

        /* Conditionally copy 0 to 31 bytes. */
        movs    r12, r2, lsl #28
        ldmcs   r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmi   r1!, {r8, r9}           /*  8 bytes */
        stmcs   r0!, {r4, r5, r6, r7}
        stmmi   r0!, {r8, r9}
        movs    r12, r2, lsl #30
        ldrcs   r3, [r1], #4            /*  4 bytes */
        ldrhmi  r4, [r1], #2            /*  2 bytes */
        strcs   r3, [r0], #4
        strhmi  r4, [r0], #2
        tst     r2, #0x1
        ldrbne  r3, [r1]                /* last byte */
        strbne  r3, [r0]

        /* We're done! Restore everything and return. */
1:      ldmfd   sp!, {r5-r11}
        ldmfd   sp!, {r0, r4, lr}
        bx      lr

        /********************************************************************/

non_congruent:
        /*
         * Here the source is aligned to 4 bytes
         * but the destination is not.
         *
         * In the code below, r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue).
         */
        cmp     r2, #4
        blo     copy_last_3_and_return
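        /* Roughly, in C-like pseudocode, each output word on this path is
         * assembled from two adjacent input words (little-endian shown):
         *
         *     out   = carry | (next << left);    // left  = 32 - 8*offset
         *     carry = next >> right;             // right = 8*offset
         *
         * where "offset" is the number of bytes needed to align dst.
         */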
        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* Compute the shifts needed to align src to dest. */
        rsb     r5, r0, #0
        and     r5, r5, #3              /* r5 = # bytes in partial words */
        mov     r12, r5, lsl #3         /* r12 = right */
        rsb     lr, r12, #32            /* lr = left */

        /* Read the first word. */
        ldr     r3, [r1], #4
        sub     r2, r2, #4

        /* Write a partial word (0 to 3 bytes), such that the destination
         * becomes aligned to 32 bits (r5 = nb of bytes to copy for alignment).
         */
        movs    r5, r5, lsl #31

#if __ARMEB__
        movmi   r3, r3, ror #24
        strbmi  r3, [r0], #1
        movcs   r3, r3, ror #24
        strbcs  r3, [r0], #1
        movcs   r3, r3, ror #24
        strbcs  r3, [r0], #1
#else
        strbmi  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strbcs  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strbcs  r3, [r0], #1
        movcs   r3, r3, lsr #8
#endif

        cmp     r2, #4
        blo     partial_word_tail

#if __ARMEB__
        mov     r3, r3, lsr r12
        mov     r3, r3, lsl r12
#endif

        /* Align the destination to 32 bytes (cache line boundary). */
1:      tst     r0, #0x1c
        beq     2f
        ldr     r5, [r1], #4
        sub     r2, r2, #4
#if __ARMEB__
        mov     r4, r5, lsr lr
        orr     r4, r4, r3
        mov     r3, r5, lsl r12
#else
        mov     r4, r5, lsl lr
        orr     r4, r4, r3
        mov     r3, r5, lsr r12
#endif
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b
        blo     partial_word_tail

        /* Copy 32 bytes at a time. */
2:      subs    r2, r2, #32
        blo     less_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for up to 50% of the
         * performance hit.
         */

        cmp     r12, #24
        beq     loop24
        cmp     r12, #8
        beq     loop8

loop16:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, {r5, r6, r7, r8, r9, r10, r11}
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
#if __ARMEB__
        orr     r3, r3, r4, lsr #16
        mov     r4, r4, lsl #16
        orr     r4, r4, r5, lsr #16
        mov     r5, r5, lsl #16
        orr     r5, r5, r6, lsr #16
        mov     r6, r6, lsl #16
        orr     r6, r6, r7, lsr #16
        mov     r7, r7, lsl #16
        orr     r7, r7, r8, lsr #16
        mov     r8, r8, lsl #16
        orr     r8, r8, r9, lsr #16
        mov     r9, r9, lsl #16
        orr     r9, r9, r10, lsr #16
        mov     r10, r10, lsl #16
        orr     r10, r10, r11, lsr #16
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsl #16
#else
        orr     r3, r3, r4, lsl #16
        mov     r4, r4, lsr #16
        orr     r4, r4, r5, lsl #16
        mov     r5, r5, lsr #16
        orr     r5, r5, r6, lsl #16
        mov     r6, r6, lsr #16
        orr     r6, r6, r7, lsl #16
        mov     r7, r7, lsr #16
        orr     r7, r7, r8, lsl #16
        mov     r8, r8, lsr #16
        orr     r8, r8, r9, lsl #16
        mov     r9, r9, lsr #16
        orr     r9, r9, r10, lsl #16
        mov     r10, r10, lsr #16
        orr     r10, r10, r11, lsl #16
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #16
#endif
        bhs     1b
        b       less_than_thirtytwo
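        /* loop8 and loop24 below are the same merge loop as loop16, with the
         * immediate shift amounts specialized for the other two possible
         * values of r12 (8 and 24).
         */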
loop8:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, {r5, r6, r7, r8, r9, r10, r11}
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
#if __ARMEB__
        orr     r3, r3, r4, lsr #24
        mov     r4, r4, lsl #8
        orr     r4, r4, r5, lsr #24
        mov     r5, r5, lsl #8
        orr     r5, r5, r6, lsr #24
        mov     r6, r6, lsl #8
        orr     r6, r6, r7, lsr #24
        mov     r7, r7, lsl #8
        orr     r7, r7, r8, lsr #24
        mov     r8, r8, lsl #8
        orr     r8, r8, r9, lsr #24
        mov     r9, r9, lsl #8
        orr     r9, r9, r10, lsr #24
        mov     r10, r10, lsl #8
        orr     r10, r10, r11, lsr #24
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsl #8
#else
        orr     r3, r3, r4, lsl #24
        mov     r4, r4, lsr #8
        orr     r4, r4, r5, lsl #24
        mov     r5, r5, lsr #8
        orr     r5, r5, r6, lsl #24
        mov     r6, r6, lsr #8
        orr     r6, r6, r7, lsl #24
        mov     r7, r7, lsr #8
        orr     r7, r7, r8, lsl #24
        mov     r8, r8, lsr #8
        orr     r8, r8, r9, lsl #24
        mov     r9, r9, lsr #8
        orr     r9, r9, r10, lsl #24
        mov     r10, r10, lsr #8
        orr     r10, r10, r11, lsl #24
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #8
#endif
        bhs     1b
        b       less_than_thirtytwo

loop24:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, {r5, r6, r7, r8, r9, r10, r11}
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
#if __ARMEB__
        orr     r3, r3, r4, lsr #8
        mov     r4, r4, lsl #24
        orr     r4, r4, r5, lsr #8
        mov     r5, r5, lsl #24
        orr     r5, r5, r6, lsr #8
        mov     r6, r6, lsl #24
        orr     r6, r6, r7, lsr #8
        mov     r7, r7, lsl #24
        orr     r7, r7, r8, lsr #8
        mov     r8, r8, lsl #24
        orr     r8, r8, r9, lsr #8
        mov     r9, r9, lsl #24
        orr     r9, r9, r10, lsr #8
        mov     r10, r10, lsl #24
        orr     r10, r10, r11, lsr #8
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsl #24
#else
        orr     r3, r3, r4, lsl #8
        mov     r4, r4, lsr #24
        orr     r4, r4, r5, lsl #8
        mov     r5, r5, lsr #24
        orr     r5, r5, r6, lsl #8
        mov     r6, r6, lsr #24
        orr     r6, r6, r7, lsl #8
        mov     r7, r7, lsr #24
        orr     r7, r7, r8, lsl #8
        mov     r8, r8, lsr #24
        orr     r8, r8, r9, lsl #8
        mov     r9, r9, lsr #24
        orr     r9, r9, r10, lsl #8
        mov     r10, r10, lsr #24
        orr     r10, r10, r11, lsl #8
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #24
#endif
        bhs     1b

less_than_thirtytwo:
        /* Copy the last 0 to 31 bytes of the source. */
        rsb     r12, lr, #32            /* we corrupted r12, recompute it */
        add     r2, r2, #32
        cmp     r2, #4
        blo     partial_word_tail

1:      ldr     r5, [r1], #4
        sub     r2, r2, #4
#if __ARMEB__
        mov     r4, r5, lsr lr
        orr     r4, r4, r3
        mov     r3, r5, lsl r12
#else
        mov     r4, r5, lsl lr
        orr     r4, r4, r3
        mov     r3, r5, lsr r12
#endif
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b

partial_word_tail:
        /* We have a partial word in the input buffer. */
        movs    r5, lr, lsl #(31-3)
#if __ARMEB__
        movmi   r3, r3, ror #24
        strbmi  r3, [r0], #1
        movcs   r3, r3, ror #24
        strbcs  r3, [r0], #1
        movcs   r3, r3, ror #24
        strbcs  r3, [r0], #1
#else
        strbmi  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strbcs  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strbcs  r3, [r0], #1
#endif

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd   sp, {r5-r11}

copy_last_3_and_return:
        movs    r2, r2, lsl #31         /* copy remaining 0, 1, 2 or 3 bytes */
        ldrbmi  r2, [r1], #1
        ldrbcs  r3, [r1], #1
        ldrbcs  r12, [r1]
        strbmi  r2, [r0], #1
        strbcs  r3, [r0], #1
        strbcs  r12, [r0]

        /* We're done! Restore sp and the spilled registers and return. */
        add     sp, sp, #28
        ldmfd   sp!, {r0, r4, lr}
        bx      lr