/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

#include "asmdefs.h"

        .syntax unified
        /* This implementation requires ARM state.  */
        .arm

/* Build configuration.  Three variants are selected by predefined macros:
     NEON        - FRAME_SIZE 4 (only tmp2 is spilled), USE_VFP and USE_NEON.
     VFP only    - FRAME_SIZE 32, USE_VFP.
     soft-float  - FRAME_SIZE 32, general-purpose registers only.  */
#ifdef __ARM_NEON__

        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE     4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE     32
# define USE_VFP

#else
        .arch   armv6
# define FRAME_SIZE     32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET       8       /* PC pipeline compensation.  */
#define INSN_SIZE       4

/* Call parameters.  */
#define dstin   r0
#define src     r1
#define count   r2

/* Locals.  */
#define tmp1    r3
#define dst     ip
#define tmp2    r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define A_l     r2              /* Call-clobbered.  */
#define A_h     r3              /* Call-clobbered.  */
#define B_l     r4
#define B_h     r5
#define C_l     r6
#define C_h     r7
#define D_l     r8
#define D_h     r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines  5

#ifdef USE_VFP
        /* cpy_line_vfp vreg, base

           Store one 64-byte line to [dst + base] from registers and refill
           those registers from src.  On entry \vreg holds bytes 0-7 of the
           line and d0-d2 hold bytes 8-31; the caller keeps src 32 bytes
           ahead of dst, so each load at the same offset fetches the data
           that is stored 32 bytes later.  The second load into \vreg reads
           the first 8 bytes of the line prefetch_lines lines ahead, which
           takes the place of a PLD.  Overwrites \vreg and d0-d2.  */
        .macro  cpy_line_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm

        /* cpy_tail_vfp vreg, base

           Same as cpy_line_vfp, except that \vreg is not reloaded with
           look-ahead data after its second store: it is used to drain the
           lines that were already fetched ahead.  Overwrites \vreg and
           d0-d2.  */
        .macro  cpy_tail_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
#endif

/* __memcpy_arm

   Register interface, as established by the definitions above and the
   code below:

     In:      r0 = dstin (destination), r1 = src (source),
              r2 = count (number of bytes).
     Out:     r0 is never written, so it still holds dstin on return.
     Scratch: r1, r2, r3 (tmp1), ip (dst) and the flags are overwritten;
              d0-d7 may be overwritten in VFP/NEON builds.
     Saved:   r10 (tmp2) is pushed with "str tmp2, [sp, #-FRAME_SIZE]!"
              once count >= 64 and reloaded on every exit from that path.
              Non-NEON builds also spill r4-r9 (B_l..D_h) into that frame
              at [sp, #8] .. [sp, #31] around the GP-register bulk loops.

   Paths:
     count < 64                         -> L(tail63unaligned)
     src and dst agree modulo 8         -> align, then L(cpy_body_medium)
                                           or L(cpy_body_long), finishing
                                           in L(tail63aligned)
     otherwise                          -> L(cpy_notaligned), finishing
                                           in L(tail63unaligned)  */
ENTRY (__memcpy_arm)

        mov     dst, dstin      /* Preserve dstin, we need to return it.  */
        cmp     count, #64
        bhs     L(cpy_not_short)
        /* Deal with small copies quickly by dropping straight into the
           exit block.  */

L(tail63unaligned):
#ifdef USE_NEON
        /* Computed jump: every 8 bytes still to copy are handled by one
           two-instruction (8-byte) pair below, so skip (56 - bytes) bytes
           of code.  PC_OFFSET accounts for pc reading 8 bytes ahead and
           INSN_SIZE for the add itself.  Nothing may be inserted between
           the add and the sequence.  */
        and     tmp1, count, #0x38
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        vld1.8  {d0}, [src]!    /* 14 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 12 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 10 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 8 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 6 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 4 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 2 words to go.  */
        vst1.8  {d0}, [dst]!

        tst     count, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data.  May not be aligned.  */
        /* Cannot use VFP for unaligned data.  */
        and     tmp1, count, #0x3c
        add     dst, dst, tmp1
        add     src, src, tmp1
        /* Here each 4 bytes to copy take 8 bytes of code, hence the halved
           constants and the "lsl #1" on the jump below.  */
        rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
        /* Jump directly into the sequence below at the correct offset.  */
        add     pc, pc, tmp1, lsl #1

        ldr     tmp1, [src, #-60]       /* 15 words to go.  */
        str     tmp1, [dst, #-60]

        ldr     tmp1, [src, #-56]       /* 14 words to go.  */
        str     tmp1, [dst, #-56]
        ldr     tmp1, [src, #-52]
        str     tmp1, [dst, #-52]

        ldr     tmp1, [src, #-48]       /* 12 words to go.  */
        str     tmp1, [dst, #-48]
        ldr     tmp1, [src, #-44]
        str     tmp1, [dst, #-44]

        ldr     tmp1, [src, #-40]       /* 10 words to go.  */
        str     tmp1, [dst, #-40]
        ldr     tmp1, [src, #-36]
        str     tmp1, [dst, #-36]

        ldr     tmp1, [src, #-32]       /* 8 words to go.  */
        str     tmp1, [dst, #-32]
        ldr     tmp1, [src, #-28]
        str     tmp1, [dst, #-28]

        ldr     tmp1, [src, #-24]       /* 6 words to go.  */
        str     tmp1, [dst, #-24]
        ldr     tmp1, [src, #-20]
        str     tmp1, [dst, #-20]

        ldr     tmp1, [src, #-16]       /* 4 words to go.  */
        str     tmp1, [dst, #-16]
        ldr     tmp1, [src, #-12]
        str     tmp1, [dst, #-12]

        ldr     tmp1, [src, #-8]        /* 2 words to go.  */
        str     tmp1, [dst, #-8]
        ldr     tmp1, [src, #-4]
        str     tmp1, [dst, #-4]
#endif

        /* Final 0-3 bytes: the shift moves bit 1 of count into C (copy a
           halfword) and leaves Z clear when bit 0 is set (copy a byte).  */
        lsls    count, count, #31
        ldrhcs  tmp1, [src], #2
        ldrbne  src, [src]              /* Src is dead, use as a scratch.  */
        strhcs  tmp1, [dst], #2
        strbne  src, [dst]
        bx      lr

L(cpy_not_short):
        /* At least 64 bytes to copy, but don't know the alignment yet.  */
        str     tmp2, [sp, #-FRAME_SIZE]!
        and     tmp2, src, #7
        and     tmp1, dst, #7
        cmp     tmp1, tmp2
        bne     L(cpy_notaligned)

#ifdef USE_VFP
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */
        vmov.f32        s0, s0
#endif

        /* SRC and DST have the same mutual 64-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring SRC and DST into full 64-bit alignment.  */
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src], #1
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst], #1

1:
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        blo     L(tail63aligned)

        cmp     tmp2, #512
        bhs     L(cpy_body_long)

L(cpy_body_medium):                     /* Count in tmp2.  */
#ifdef USE_VFP
1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
        vldr    d1, [src, #8]
        vstr    d0, [dst, #0]
        vldr    d0, [src, #16]
        vstr    d1, [dst, #8]
        vldr    d1, [src, #24]
        vstr    d0, [dst, #16]
        vldr    d0, [src, #32]
        vstr    d1, [dst, #24]
        vldr    d1, [src, #40]
        vstr    d0, [dst, #32]
        vldr    d0, [src, #48]
        vstr    d1, [dst, #40]
        vldr    d1, [src, #56]
        vstr    d0, [dst, #48]
        add     src, src, #64
        vstr    d1, [dst, #56]
        add     dst, dst, #64
        bhs     1b
        tst     tmp2, #0x3f
        beq     L(done)

L(tail63aligned):                       /* Count in tmp2.  */
        /* Computed jump into the 8-bytes-per-pair sequence below; see
           L(tail63unaligned) for how the offset is formed.  */
        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1

        vldr    d0, [src, #-56]         /* 14 words to go.  */
        vstr    d0, [dst, #-56]
        vldr    d0, [src, #-48]         /* 12 words to go.  */
        vstr    d0, [dst, #-48]
        vldr    d0, [src, #-40]         /* 10 words to go.  */
        vstr    d0, [dst, #-40]
        vldr    d0, [src, #-32]         /* 8 words to go.  */
        vstr    d0, [dst, #-32]
        vldr    d0, [src, #-24]         /* 6 words to go.  */
        vstr    d0, [dst, #-24]
        vldr    d0, [src, #-16]         /* 4 words to go.  */
        vstr    d0, [dst, #-16]
        vldr    d0, [src, #-8]          /* 2 words to go.  */
        vstr    d0, [dst, #-8]
#else
        /* Pre-bias src and dst by -8 so the last pair of each iteration
           can advance both pointers with writeback.  */
        sub     src, src, #8
        sub     dst, dst, #8
1:
        ldrd    A_l, A_h, [src, #8]
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #16]
        strd    A_l, A_h, [dst, #16]
        ldrd    A_l, A_h, [src, #24]
        strd    A_l, A_h, [dst, #24]
        ldrd    A_l, A_h, [src, #32]
        strd    A_l, A_h, [dst, #32]
        ldrd    A_l, A_h, [src, #40]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #48]
        strd    A_l, A_h, [dst, #48]
        ldrd    A_l, A_h, [src, #56]
        strd    A_l, A_h, [dst, #56]
        ldrd    A_l, A_h, [src, #64]!
        strd    A_l, A_h, [dst, #64]!
        subs    tmp2, tmp2, #64
        bhs     1b
        tst     tmp2, #0x3f
        bne     1f
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
1:
        /* Undo the -8 bias before entering the tail.  */
        add     src, src, #8
        add     dst, dst, #8

L(tail63aligned):                       /* Count in tmp2.  */
        /* Copy up to 7 d-words of data.  Similar to L(tail63unaligned), but
           we know that the src and dest are 64-bit aligned so we can use
           LDRD/STRD to improve efficiency.  */
        /* TMP2 is now negative, but we don't care about that.  The bottom
           six bits still tell us how many bytes are left to copy.  */

        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        ldrd    A_l, A_h, [src, #-56]   /* 14 words to go.  */
        strd    A_l, A_h, [dst, #-56]
        ldrd    A_l, A_h, [src, #-48]   /* 12 words to go.  */
        strd    A_l, A_h, [dst, #-48]
        ldrd    A_l, A_h, [src, #-40]   /* 10 words to go.  */
        strd    A_l, A_h, [dst, #-40]
        ldrd    A_l, A_h, [src, #-32]   /* 8 words to go.  */
        strd    A_l, A_h, [dst, #-32]
        ldrd    A_l, A_h, [src, #-24]   /* 6 words to go.  */
        strd    A_l, A_h, [dst, #-24]
        ldrd    A_l, A_h, [src, #-16]   /* 4 words to go.  */
        strd    A_l, A_h, [dst, #-16]
        ldrd    A_l, A_h, [src, #-8]    /* 2 words to go.  */
        strd    A_l, A_h, [dst, #-8]

#endif
        /* Remaining 0-7 bytes: a word if bit 2 is set, then a halfword
           and/or a byte selected by bits 1 and 0.  */
        tst     tmp2, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
        lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead.  */
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src]
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst]

L(done):
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr

L(cpy_body_long):                       /* Count in tmp2.  */

        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go.  */
#ifdef USE_VFP
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */

        vldr    d3, [src, #0]
        vldr    d4, [src, #64]
        vldr    d5, [src, #128]
        vldr    d6, [src, #192]
        vldr    d7, [src, #256]

        vldr    d0, [src, #8]
        vldr    d1, [src, #16]
        vldr    d2, [src, #24]
        /* From here on src runs 32 bytes ahead of dst; the cpy_*_vfp
           macros rely on this.  */
        add     src, src, #32

        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
        blo     2f
1:
        cpy_line_vfp    d3, 0
        cpy_line_vfp    d4, 64
        cpy_line_vfp    d5, 128
        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        cpy_line_vfp    d6, 0
        cpy_line_vfp    d7, 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64
        bhs     1b

2:
        /* Drain the prefetch_lines lines already held in d3-d7.  */
        cpy_tail_vfp    d3, 0
        cpy_tail_vfp    d4, 64
        cpy_tail_vfp    d5, 128
        add     src, src, #3 * 64
        add     dst, dst, #3 * 64
        cpy_tail_vfp    d6, 0
        vstr    d7, [dst, #64]
        vldr    d7, [src, #64]
        vstr    d0, [dst, #64 + 8]
        vldr    d0, [src, #64 + 8]
        vstr    d1, [dst, #64 + 16]
        vldr    d1, [src, #64 + 16]
        vstr    d2, [dst, #64 + 24]
        vldr    d2, [src, #64 + 24]
        vstr    d7, [dst, #64 + 32]
        /* 128 bytes consumed, minus the 32-byte bias that is dropped
           here: src and dst are level again afterwards.  */
        add     src, src, #96
        vstr    d0, [dst, #64 + 40]
        vstr    d1, [dst, #64 + 48]
        vstr    d2, [dst, #64 + 56]
        add     dst, dst, #128
        add     tmp2, tmp2, #prefetch_lines * 64
        b       L(cpy_body_medium)
#else
        /* Long copy.  Use an SMS style loop to maximize the I/O
           bandwidth of the core.  We don't have enough spare registers
           to synthesise prefetching, so use PLD operations.  */
        /* Pre-bias src and dst.  */
        sub     src, src, #8
        sub     dst, dst, #8
        pld     [src, #8]
        pld     [src, #72]
        subs    tmp2, tmp2, #64
        pld     [src, #136]
        /* B, C and D are spilled into the frame reserved at
           L(cpy_not_short) before being used as copy registers.  */
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        pld     [src, #200]
        ldrd    D_l, D_h, [src, #32]!
        b       1f
        .p2align        6
2:
        pld     [src, #232]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldrd    D_l, D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldrd    D_l, D_h, [src, #32]
        bcs     2b
        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #40
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        tst     tmp2, #0x3f
        bne     L(tail63aligned)
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
#endif

L(cpy_notaligned):
        pld     [src]
        pld     [src, #64]
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment.  */
        /* Bring DST to 64-bit alignment.  */
        lsls    tmp2, dst, #29
        pld     [src, #(2 * 64)]
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrbne  tmp1, [src], #1
        ldrhcs  tmp2, [src], #2
        strbne  tmp1, [dst], #1
        strhcs  tmp2, [dst], #2
1:
        pld     [src, #(3 * 64)]
        subs    count, count, #64
        /* Fewer than 64 bytes left after alignment: pop the frame and
           finish in the short-copy tail, which returns with a bare
           "bx lr".  */
        ldrlo   tmp2, [sp], #FRAME_SIZE
        blo     L(tail63unaligned)
        pld     [src, #(4 * 64)]

#ifdef USE_NEON
        vld1.8  {d0-d3}, [src]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        blo     2f
1:
        pld     [src, #(4 * 64)]
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vld1.8  {d0-d3}, [src]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bhs     1b
2:
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        ands    count, count, #0x3f
#else
        /* Use an SMS style loop to maximize the I/O bandwidth.  */
        /* src is only biased by -4 and read with single-word LDRs, while
           dst (now 64-bit aligned) is biased by -8 and written with
           STRD.  */
        sub     src, src, #4
        sub     dst, dst, #8
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]!
        b       1f
        .p2align        6
2:
        pld     [src, #(5 * 64) - (32 - 4)]
        strd    A_l, A_h, [dst, #40]
        ldr     A_l, [src, #36]
        ldr     A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldr     B_l, [src, #44]
        ldr     B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldr     C_l, [src, #52]
        ldr     C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldr     D_l, [src, #60]
        ldr     D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]
        bcs     2b

        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #36
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        ands    count, tmp2, #0x3f
#endif
        /* The LDR does not alter the flags, so the BNE below still tests
           the result of the ANDS above.  */
        ldr     tmp2, [sp], #FRAME_SIZE
        bne     L(tail63unaligned)
        bx      lr

END (__memcpy_arm)