/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 Regents of the University of California
 */

#include <linux/linkage.h>
#include <asm/asm.h>

/* void *memcpy(void *, const void *, size_t) */
ENTRY(__memcpy)
WEAK(memcpy)
	/* Save a0 for the return value */
	mv	t6, a0

	/*
	 * Register allocation for the code below:
	 * a0 - start of uncopied dst
	 * a1 - start of uncopied src
	 * t0 - end of uncopied dst
	 */
	add	t0, a0, a2

	/*
	 * Use bytewise copy if the size is too small.
	 *
	 * The threshold must be at least 2*SZREG to ensure that at least one
	 * wordwise copy is performed. It is chosen to be 16 because that
	 * saves at least 7 iterations of bytewise copy, which pays off the
	 * fixed overhead.
	 */
	li	a3, 16
	bltu	a2, a3, .Lbyte_copy_tail

	/*
	 * Copy bytewise first to align a0 to a word boundary.
	 */
	addi	a2, a0, SZREG-1
	andi	a2, a2, ~(SZREG-1)
	beq	a0, a2, 2f
1:
	lb	a5, 0(a1)
	addi	a1, a1, 1
	sb	a5, 0(a0)
	addi	a0, a0, 1
	bne	a0, a2, 1b
2:

	/*
	 * Now a0 is word-aligned. If a1 is also word-aligned, we can perform
	 * an aligned wordwise copy. Otherwise we need to perform a misaligned
	 * wordwise copy.
	 */
	andi	a3, a1, SZREG-1
	bnez	a3, .Lmisaligned_word_copy

	/* Unrolled wordwise copy */
	addi	t0, t0, -(16*SZREG-1)
	bgeu	a0, t0, 2f
1:
	REG_L	a2, 0(a1)
	REG_L	a3, SZREG(a1)
	REG_L	a4, 2*SZREG(a1)
	REG_L	a5, 3*SZREG(a1)
	REG_L	a6, 4*SZREG(a1)
	REG_L	a7, 5*SZREG(a1)
	REG_L	t1, 6*SZREG(a1)
	REG_L	t2, 7*SZREG(a1)
	REG_L	t3, 8*SZREG(a1)
	REG_L	t4, 9*SZREG(a1)
	REG_L	t5, 10*SZREG(a1)
	REG_S	a2, 0(a0)
	REG_S	a3, SZREG(a0)
	REG_S	a4, 2*SZREG(a0)
	REG_S	a5, 3*SZREG(a0)
	REG_S	a6, 4*SZREG(a0)
	REG_S	a7, 5*SZREG(a0)
	REG_S	t1, 6*SZREG(a0)
	REG_S	t2, 7*SZREG(a0)
	REG_S	t3, 8*SZREG(a0)
	REG_S	t4, 9*SZREG(a0)
	REG_S	t5, 10*SZREG(a0)
	REG_L	a2, 11*SZREG(a1)
	REG_L	a3, 12*SZREG(a1)
	REG_L	a4, 13*SZREG(a1)
	REG_L	a5, 14*SZREG(a1)
	REG_L	a6, 15*SZREG(a1)
	addi	a1, a1, 16*SZREG
	REG_S	a2, 11*SZREG(a0)
	REG_S	a3, 12*SZREG(a0)
	REG_S	a4, 13*SZREG(a0)
	REG_S	a5, 14*SZREG(a0)
	REG_S	a6, 15*SZREG(a0)
	addi	a0, a0, 16*SZREG
	bltu	a0, t0, 1b
2:
	/*
	 * Combine the post-loop increment of 16*SZREG-1 (undoing the bias
	 * applied before the unrolled loop) with the pre-loop decrement of
	 * SZREG-1 for the wordwise loop below:
	 * 16*SZREG-1 - (SZREG-1) = 15*SZREG.
	 */
	addi	t0, t0, 15*SZREG

	/* Wordwise copy */
	bgeu	a0, t0, 2f
1:
	REG_L	a5, 0(a1)
	addi	a1, a1, SZREG
	REG_S	a5, 0(a0)
	addi	a0, a0, SZREG
	bltu	a0, t0, 1b
2:
	addi	t0, t0, SZREG-1

.Lbyte_copy_tail:
	/*
	 * Copy bytewise anything that is left.
	 */
	beq	a0, t0, 2f
1:
	lb	a5, 0(a1)
	addi	a1, a1, 1
	sb	a5, 0(a0)
	addi	a0, a0, 1
	bne	a0, t0, 1b
2:

	mv	a0, t6
	ret

.Lmisaligned_word_copy:
	/*
	 * Misaligned wordwise copy.
	 * For a misaligned copy we still perform wordwise loads and stores,
	 * but each stored word combines the value fetched in the previous
	 * iteration with the newly fetched one using shifts. This is safe
	 * because every word we touch contains at least one byte of the
	 * source buffer, so we never access a page that the buffer does not
	 * occupy.
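	 *
	 * For example, on RV64 with a1 misaligned by one byte (a3 = 1) the
	 * code below computes t3 = 8 and t4 = -8, which sll treats as 56
	 * since only the low bits of the shift amount are used. Each stored
	 * word is then
	 *   (cur >> 8) | (next << 56)
	 * i.e. the upper seven bytes of the current source word with the low
	 * byte of the next one placed on top (little-endian byte order).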
	 */

	/* Calculate the shift amounts from the misalignment in a3 */
	slli	t3, a3, 3
	sub	t4, x0, t3 /* negating is okay: shifts only look at the LSBs */

	/* Load the initial value and round a1 down to a word boundary */
	andi	a1, a1, ~(SZREG-1)
	REG_L	a5, 0(a1)

	addi	t0, t0, -(SZREG-1)
	/* Size >= 16 guarantees at least one iteration, so no entry check */
1:
	srl	a4, a5, t3
	REG_L	a5, SZREG(a1)
	addi	a1, a1, SZREG
	sll	a2, a5, t4
	or	a2, a2, a4
	REG_S	a2, 0(a0)
	addi	a0, a0, SZREG
	bltu	a0, t0, 1b

	/*
	 * Update the pointers to their correct values: restore the true end
	 * of dst in t0 and undo the earlier round-down of a1.
	 */
	addi	t0, t0, SZREG-1
	add	a1, a1, a3

	j	.Lbyte_copy_tail
END(__memcpy)