/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

/*
   void *memcpy (void *dst, const void *src, size_t count)

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

/* Select the architecture, FPU, and local stack-frame size according to
   the build flags.  FRAME_SIZE is the amount pushed below SP on entry to
   the bulk-copy paths: 4 bytes to spill tmp2 (r10) when NEON carries the
   data, or 32 bytes when the GP-register pipeline also needs slots at
   [sp, #8]/[sp, #16]/[sp, #24] for the callee-saved B/C/D register pairs.  */
#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

/* Constants used by the computed "add pc, pc, ..." jump tables below:
   in ARM state the PC reads as the current instruction address + 8, and
   each instruction is 4 bytes.  */
#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0		/* Original dst; also the return value.  */
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip		/* Working copy of dst (ip is call-clobbered).  */
#define tmp2	r10		/* Callee-saved; spilled to the frame before use.  */

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
	/* Copy one 64-byte cache line from src+base to dst+base using
	   \vreg and d0-d2 as the data pipeline, while reloading \vreg
	   from prefetch_lines lines ahead so it acts as a software
	   prefetch for a later iteration.  */
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

	/* As cpy_line_vfp, but for the drain phase: \vreg is not reloaded
	   from ahead of the copy position, so no reads run past the data
	   already prefetched.  */
	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

	/* Emit the standard prologue boilerplate for a global function
	   \f, aligned to 2^\p2align bytes.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

def_fn memcpy p2align=6

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bge	.Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
	/* Fewer than 64 bytes remain; count's low 6 bits say how many.
	   Compute a jump into the unrolled 8-byte copy sequence so that
	   exactly (count & 0x38) bytes are copied.  */
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

	/* Shift count's bit 1 into C and bit 0 into N, then copy the
	   trailing halfword and/or byte conditionally.  */
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]	/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	.Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
	lsls	tmp2, dst, #29		/* Bottom 3 bits of dst into flags.  */
	beq	1f
	rsbs	tmp2, tmp2, #0		/* Bytes needed to reach alignment.  */
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blt	.Ltail63aligned

	cmp	tmp2, #512
	bge	.Lcpy_body_long

.Lcpy_body_medium:			/* Count in tmp2.  */
#ifdef USE_VFP
	/* Copy 64 bytes per iteration through d0/d1, interleaving loads
	   and stores to keep the FP pipeline busy.  */
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bge	1b
	tst	tmp2, #0x3f
	beq	.Ldone

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 doublewords via a computed jump into the unrolled
	   sequence, then fall through for the sub-doubleword tail.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	/* GP-register variant: pre-bias the pointers by 8 so the loop can
	   use offset addressing with a single writeback at #64.  */
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bge	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE		/* Restore tmp2, pop frame.  */
	bx	lr
1:
	add	src, src, #8		/* Undo the pre-bias.  */
	add	dst, dst, #8

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	/* Final 0-7 bytes: word, then halfword/byte via the flag trick.  */
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

.Ldone:
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

.Lcpy_body_long:			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blt	2f
1:
	/* Steady state: copy 5 cache lines (one per prefetched d-reg)
	   per iteration.  */
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bge	1b

2:
	/* Drain the prefetched data without reading further ahead.  */
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	.Lcpy_body_medium
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	/* Prologue: spill callee-saved B/C/D pairs into the frame, then
	   prime the software pipeline with the first 32 bytes.  */
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	.Ltail63aligned
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

.Lcpy_notaligned:
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrmi	tmp2, [sp], #FRAME_SIZE	/* < 64 left: pop frame first.  */
	bmi	.Ltail63unaligned
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
	/* NEON handles the unaligned loads; stores go to the now
	   64-byte-aligned dst (ALIGN hint).  */
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bmi	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bpl	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	/* src is word-aligned only, so loads are word-sized LDRs; stores
	   to the doubleword-aligned dst still use STRD.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]	/* Spill callee-saved pairs.  */
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	.Ltail63unaligned
	bx	lr

	.size	memcpy, . - memcpy