/*
 * memcpy - copy memory area
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

#include "../asmdefs.h"

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

ENTRY (__memcpy_arm)

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bge	L(cpy_not_short)
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

L(tail63unaligned):
#ifdef USE_NEON
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]	/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

L(cpy_not_short):
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	L(cpy_notaligned)

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
	lsls	tmp2, dst, #29
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blt	L(tail63aligned)

	cmp	tmp2, #512
	bge	L(cpy_body_long)

L(cpy_body_medium):			/* Count in tmp2.  */
#ifdef USE_VFP
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bge	1b
	tst	tmp2, #0x3f
	beq	L(done)

L(tail63aligned):			/* Count in tmp2.  */
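	/* Note: the computed branch below (add pc, pc, tmp1) drops into
	   the vldr/vstr ladder part-way through: each pair is two 4-byte
	   instructions copying 8 bytes, and PC_OFFSET compensates for PC
	   reading as the current instruction's address plus 8 in ARM
	   state.  */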
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bge	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	add	src, src, #8
	add	dst, dst, #8

L(tail63aligned):			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

L(done):
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

L(cpy_body_long):			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */
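	/* D3-D7 are seeded with the first doubleword of each of the next
	   five (prefetch_lines) 64-byte lines.  cpy_line_vfp then reloads
	   each one from prefetch_lines * 64 bytes ahead, so the leading
	   load of every line doubles as the prefetch.  */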
	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blt	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bge	1b

2:
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	L(cpy_body_medium)
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	L(tail63aligned)
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

L(cpy_notaligned):
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrmi	tmp2, [sp], #FRAME_SIZE
	bmi	L(tail63unaligned)
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bmi	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
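	/* The ALIGN qualifier on the stores is safe because dst was
	   brought to 64-bit alignment above; the loads stay unqualified
	   since src may still be unaligned.  */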
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bpl	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	L(tail63unaligned)
	bx	lr

END (__memcpy_arm)