/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * Copyright (c) 2015 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
*/

/* Register aliases, expanded by the C preprocessor.

   dstin, src and count are the registers memcpy is entered with (x0, x1,
   x2).  dstin is only ever used as a base address and is never written.
   Everything else is scratch.

   Several names deliberately share a register, so the order in which the
   code reads and writes them matters:
     - E_l/E_h reuse src/count: loading the E pair destroys src and count.
     - F_l/F_h reuse dst/srcend: loading the F pair destroys dst and srcend.
     - tmp1 and B_h are both x9: tmp1 is dead once the B pair is loaded.  */
#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
/* Data registers.  X_l/X_h are the low/high halves of a 16-byte pair moved
   with ldp/stp; the *w names are the 32-bit views of the same registers.  */
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
/* Extra pairs that overlay pointer/length registers (see above).  */
#define E_l	src
#define E_h	count
#define F_l	dst
#define F_h	srcend
/* Scratch; shares x9 with B_h.  */
#define tmp1	x9

/* L(name) pastes a ".L" prefix onto NAME, giving an assembler-local label
   that is not exported in the symbol table.  */
#define L(l) .L ## l

/* def_fn f [p2align=N]
   Open function F: switch to .text, align to 2^N bytes (N defaults to 0),
   mark F as a global symbol of type %function and emit its entry label.
   The matching .size directive is written by hand after the body.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled. Large copies
   of more than 96 bytes align the destination and use an unrolled loop
   processing 64 bytes per iteration.
   Small and medium copies read all data before writing, allowing any
   kind of overlap, and memmove tailcalls memcpy for these cases as
   well as non-overlapping copies.
*/

/* memcpy entry point, aligned to 64 bytes (p2align=6).

   On entry: dstin (x0) = destination, src (x1) = source, count (x2) =
   number of bytes to copy.  x0 is never written, so it still holds dstin
   on return.  Only x1-x13 and the condition flags are modified; the stack
   is not touched.

   Every path copies from both ends of the buffer with accesses that are
   allowed to overlap in the middle, which is why no per-byte tail loop is
   needed.  */
def_fn memcpy p2align=6
	prfm	PLDL1KEEP, [src]	/* Prefetch hint for the first source bytes.  */
	add	srcend, src, count	/* One past the last source byte.  */
	add	dstend, dstin, count	/* One past the last destination byte.  */
	cmp	count, 16
	b.ls	L(copy16)		/* count <= 16 (unsigned).  */
	cmp	count, 96
	b.hi	L(copy_long)		/* count > 96 (unsigned).  */

	/* Medium copies: 17..96 bytes.
	   tmp1 = count - 1 lies in 16..95, so its bits 6 and 5 select the
	   size class: bit 6 set -> 65..96 bytes, else bit 5 set -> 33..64
	   bytes, else 17..32 bytes.  */
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	/* 33..64 bytes: also move the second 16 bytes from the start and the
	   second-to-last 16 bytes from the end.  Loading B_h overwrites tmp1
	   (same register), which has already been tested.  */
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	/* First and last 16 bytes; for 17..32 bytes these two stores are the
	   whole copy.  */
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
	cmp	count, 8
	b.lo	1f
	/* 8..16 bytes: first and last 8 bytes, possibly overlapping.  */
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	/* count < 8 here.  Bit 2 clear means 0..3 bytes.  */
	tbz	count, 2, 1f
	/* 4..7 bytes: first and last 4 bytes, possibly overlapping.  */
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
1:
	cbz	count, 2f		/* Nothing to do for count == 0.  */
	lsr	tmp1, count, 1		/* Middle index: 0 for count 1, else 1.  */
	ldrb	A_lw, [src]		/* First byte.  */
	ldrb	A_hw, [srcend, -1]	/* Last byte.  */
	ldrb	B_lw, [src, tmp1]	/* Middle byte.  */
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

	.p2align 4
	/* Copy 65..96 bytes (bit 6 of count - 1 is set).  Copy 64 bytes from
	   the start and 32 bytes from the end.  A_l/A_h were already loaded
	   by the caller path.  The E and F pairs overlay src/count and
	   dst/srcend, so they are loaded last; every load in this block is
	   issued before the first store.  */
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]	/* Overwrites src and count.  */
	ldp	F_l, F_h, [srcend, -16]	/* Overwrites dst and srcend.  */
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are more than 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */

	.p2align 4
L(copy_long):
	and	tmp1, dstin, 15		/* Misalignment of the destination.  */
	bic	dst, dstin, 15		/* dst = dstin rounded down to 16.  */
	ldp	D_l, D_h, [src]		/* First 16 bytes, stored unaligned below.  */
	sub	src, src, tmp1		/* Move src back by the same amount as dst.  */
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]	/* Overwrites tmp1 (x9); no longer needed.  */
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!	/* Pre-index writeback: src += 64.  */
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	2f			/* At most 128 bytes left: go to the tail.  */
1:
	/* A..D hold 64 bytes loaded on the previous pass.  Store them while
	   loading the next 64; the writeback on the D pair advances dst and
	   src by 64 each.  */
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  Loading the E pair overwrites src and
	   count, neither of which is read again.  */
2:
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.size	memcpy, . - memcpy