1/* $OpenBSD: _memcpy.S,v 1.7 2017/10/29 02:21:33 guenther Exp $ */ 2/* $NetBSD: _memcpy.S,v 1.4 2003/04/05 23:08:52 bjh21 Exp $ */ 3 4/*- 5 * Copyright (c) 1997 The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Neil A. Carson and Mark Brinicombe 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33#include "DEFS.h" 34 35/* 36 * This is one fun bit of code ... 37 * Some easy listening music is suggested while trying to understand this 38 * code e.g. 
Iron Maiden 39 * 40 * For anyone attempting to understand it : 41 * 42 * The core code is implemented here with simple stubs for memcpy() 43 * memmove() and bcopy(). 44 * 45 * All local labels are prefixed with .Lmemcpy_ 46 * Following the prefix a label starting f is used in the forward copy code 47 * while a label using b is used in the backwards copy code 48 * The source and destination addresses determine whether a forward or 49 * backward copy is performed. 50 * Separate bits of code are used to deal with the following situations 51 * for both the forward and backwards copy. 52 * unaligned source address 53 * unaligned destination address 54 * Separate copy routines are used to produce an optimised result for each 55 * of these cases. 56 * The copy code will use LDM/STM instructions to copy up to 32 bytes at 57 * a time where possible. 58 * 59 * Note: r12 (aka ip) can be trashed during the function along with 60 * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout. 61 * Additional registers are preserved prior to use i.e. 
r4, r5 & lr
 *
 * Apologies for the state of the comments ;-)
 */

.syntax unified

.hidden _memcpy

/*
 * _memcpy - the copy core.
 *
 * Registers as used by the code below:
 *	r0 = destination address (written through)
 *	r1 = source address (read through)
 *	r2 = number of bytes to copy
 *	r3, r12 = scratch; r4, r5 and lr are pushed before being used as
 *	          scratch and popped afterwards.
 *
 * Direction:
 *	r1 == r0  returns immediately with r0 = 0.
 *	r1 >  r0  (unsigned) copies forwards, lowest address first.
 *	r1 <  r0  (unsigned) copies backwards, starting one past the end
 *	          of both buffers.
 *
 * Return:
 *	The forward path pushes {r0, lr} on entry and every forward exit
 *	pops {r0, pc}, so r0 holds the original destination on return.
 *	The backward path returns with "mov pc, lr" and does not reload r0.
 *
 * Byte counting:
 *	r2 is kept biased.  It holds (bytes remaining - N), where N is the
 *	block size being tested, so that one subs/adds both updates the
 *	count and sets the flags consumed by the conditional instructions
 *	that follow.  The bias in force is noted at each label.
 *
 * Byte order:
 *	The lsr/lsl merging in the unaligned-source loops puts bytes from
 *	lower addresses into the low-order bits of each stored word, i.e.
 *	it corresponds to little-endian byte order.
 */
ENTRY(_memcpy)
	/* Determine copy direction */
	cmp	r1, r0
	bcc	.Lmemcpy_backwards	/* src below dest (unsigned) */

	/* Flags are still those of the cmp above: eq means src == dest. */
	moveq	r0, #0			/* src == dest: nothing to copy, return 0 */
	moveq	pc, lr

	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
	subs	r2, r2, #4		/* r2 = remaining - 4 */
	blt	.Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_fsrcul		/* oh unaligned source addr */

.Lmemcpy_ft8:
	/* We have aligned source and destination */
	/* On entry r2 = remaining - 4. */
	subs	r2, r2, #8		/* r2 = remaining - 12 */
	blt	.Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14		/* r2 = remaining - 32 */
	blt	.Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
	/* lr is free to use as data here: it was pushed on entry. */
.Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_floop32

	/* r2 = remaining - 32 and is negative; ge below means >= 16 left. */
	cmn	r2, #0x10
	ldmiage	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmiage	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_fl32:
	adds	r2, r2, #0x14		/* r2 = remaining - 12 */

	/* blat 12 bytes at a time */
	/*
	 * Entered with the flags from the adds above (or from the subsge on
	 * later passes); when fewer than 12 bytes remain, the three
	 * conditional instructions are skipped and the branch is not taken.
	 */
.Lmemcpy_floop12:
	ldmiage	r1!, {r3, r12, lr}
	stmiage	r0!, {r3, r12, lr}
	subsge	r2, r2, #0x0c
	bge	.Lmemcpy_floop12

.Lmemcpy_fl12:
	/* On entry r2 = remaining - 12. */
	adds	r2, r2, #8		/* r2 = remaining - 4 */
	blt	.Lmemcpy_fl4

	subs	r2, r2, #4		/* r2 = remaining - 8 */
	ldrlt	r3, [r1], #4		/* 4..7 left: copy one word */
	strlt	r3, [r0], #4
	ldmiage	r1!, {r3, r12}		/* 8..11 left: copy two words */
	stmiage	r0!, {r3, r12}
	subge	r2, r2, #4		/* either way r2 = remaining - 4 again */

.Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* r2 = remaining (0..3) */
	ldmiaeq	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	/* One byte always, a second if r2 >= 2, a third if r2 > 2. */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrbge	r3, [r1], #1
	strbge	r3, [r0], #1
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
	/* On entry r12 = r0 & 3 (non-zero) and r2 = remaining - 4. */
.Lmemcpy_fdestul:
	rsb	r12, r12, #4		/* r12 = bytes up to the next word boundary */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrbge	r3, [r1], #1
	strbge	r3, [r0], #1
	ldrbgt	r3, [r1], #1
	strbgt	r3, [r0], #1
	subs	r2, r2, r12		/* account for the bytes just copied */
	blt	.Lmemcpy_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
	/*
	 * On entry r12 = r1 & 3 (1..3), the destination is word aligned and
	 * r2 = remaining - 4.  r1 is rounded down to a word boundary and the
	 * word there is loaded into lr; from here on lr always carries the
	 * most recently loaded source word, whose not-yet-stored bytes are
	 * merged with the next word loaded.  lr was pushed on entry, so only
	 * r4 and r5 need saving around the 16-byte loops.
	 */
.Lmemcpy_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_fsrcul3	/* source offset 3 */
	beq	.Lmemcpy_fsrcul2	/* source offset 2 */
	/* fall through: source offset 1 */
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul1loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

	/* 16 bytes per pass: 3 bytes from each word joined to 1 from the next */
.Lmemcpy_fsrcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* back to r2 = remaining - 4 */
	blt	.Lmemcpy_fsrcul1l4

	/* 4 bytes per pass */
.Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul1loop4

.Lmemcpy_fsrcul1l4:
	/* r1 is one word past the last word loaded; 3 of its bytes are unused */
	sub	r1, r1, #3
	b	.Lmemcpy_fl4

	/* Source offset 2: same scheme, two bytes taken from each word. */
.Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul2loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

.Lmemcpy_fsrcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* back to r2 = remaining - 4 */
	blt	.Lmemcpy_fsrcul2l4

.Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul2loop4

.Lmemcpy_fsrcul2l4:
	/* 2 bytes of the last word loaded are still unused */
	sub	r1, r1, #2
	b	.Lmemcpy_fl4

	/* Source offset 3: one byte from each word joined to 3 from the next. */
.Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul3loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5}

.Lmemcpy_fsrcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c		/* back to r2 = remaining - 4 */
	blt	.Lmemcpy_fsrcul3l4

.Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul3loop4

.Lmemcpy_fsrcul3l4:
	/* 1 byte of the last word loaded is still unused */
	sub	r1, r1, #1
	b	.Lmemcpy_fl4

	/*
	 * Backward copy.  Nothing has been pushed at this point: r0 and lr
	 * are not saved on entry to this path, so every exit below returns
	 * with "mov pc, lr", and lr is pushed wherever it is borrowed as a
	 * data register.
	 */
.Lmemcpy_backwards:
	add	r1, r1, r2		/* r1 = one past the end of the source */
	add	r0, r0, r2		/* r0 = one past the end of the destination */
	subs	r2, r2, #4		/* r2 = remaining - 4 */
	blt	.Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_bdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_bsrcul		/* oh unaligned source addr */

.Lmemcpy_bt8:
	/* We have aligned source and destination */
	/* On entry r2 = remaining - 4. */
	subs	r2, r2, #8		/* r2 = remaining - 12 */
	blt	.Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, lr}		/* both are used as data registers below */
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	.Lmemcpy_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_bloop32

.Lmemcpy_bl32:
	/* r2 = remaining - 32 and is negative; ge below means >= 16 left. */
	cmn	r2, #0x10
	ldmdbge	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmdbge	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14		/* r2 = remaining - 12 */
	ldmdbge	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmdbge	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, lr}

.Lmemcpy_bl12:
	/* On entry r2 = remaining - 12. */
	adds	r2, r2, #8		/* r2 = remaining - 4 */
	blt	.Lmemcpy_bl4
	subs	r2, r2, #4		/* r2 = remaining - 8 */
	ldrlt	r3, [r1, #-4]!		/* 4..7 left: copy one word */
	strlt	r3, [r0, #-4]!
	ldmdbge	r1!, {r3, r12}		/* 8..11 left: copy two words */
	stmdbge	r0!, {r3, r12}
	subge	r2, r2, #4		/* either way r2 = remaining - 4 again */

.Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* r2 = remaining (0..3) */
	moveq	pc, lr			/* done */

	/* copy the crud byte at a time */
	/* One byte always, a second if r2 >= 2, a third if r2 > 2. */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrbge	r3, [r1, #-1]!
	strbge	r3, [r0, #-1]!
	ldrbgt	r3, [r1, #-1]!
	strbgt	r3, [r0, #-1]!
	mov	pc, lr

	/* erg - unaligned destination */
	/*
	 * On entry r12 = r0 & 3 (non-zero), which is also the number of bytes
	 * to copy to bring r0 down to a word boundary; r2 = remaining - 4.
	 */
.Lmemcpy_bdestul:
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrbge	r3, [r1, #-1]!
	strbge	r3, [r0, #-1]!
	ldrbgt	r3, [r1, #-1]!
	strbgt	r3, [r0, #-1]!
	subs	r2, r2, r12		/* account for the bytes just copied */
	blt	.Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	.Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
	/*
	 * On entry r12 = r1 & 3 (1..3), the destination is word aligned and
	 * r2 = remaining - 4.  r1 is rounded down to a word boundary and the
	 * word there is loaded into r3; from here on r3 always carries the
	 * most recently loaded (lowest-addressed) source word, whose
	 * not-yet-stored bytes are merged with the next word loaded below
	 * it.  lr is pushed together with r4 and r5 around the 16-byte loops
	 * because it is used there as a data register.
	 */
.Lmemcpy_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	.Lmemcpy_bsrcul1	/* source offset 1 */
	beq	.Lmemcpy_bsrcul2	/* source offset 2 */
	/* fall through: source offset 3 */
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul3loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5, lr}

	/* 16 bytes per pass */
.Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c		/* back to r2 = remaining - 4 */
	blt	.Lmemcpy_bsrcul3l4

	/* 4 bytes per pass */
.Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul3loop4

.Lmemcpy_bsrcul3l4:
	/* r1 addresses the last word loaded; 3 of its bytes are unused */
	add	r1, r1, #3
	b	.Lmemcpy_bl4

	/* Source offset 2: same scheme, two bytes taken from each word. */
.Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul2loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5, lr}

.Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c		/* back to r2 = remaining - 4 */
	blt	.Lmemcpy_bsrcul2l4

.Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul2loop4

.Lmemcpy_bsrcul2l4:
	/* 2 bytes of the last word loaded are still unused */
	add	r1, r1, #2
	b	.Lmemcpy_bl4

	/* Source offset 1: one byte from each word joined to 3 from the next. */
.Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul1loop4	/* fewer than 16 bytes left */
	sub	r2, r2, #0x0c		/* r2 = remaining - 16 */
	stmdb	sp!, {r4, r5, lr}

	/*
	 * Despite the "32" in the label, each pass loads four words, stores
	 * four words and subtracts 0x10 from r2, i.e. it moves 16 bytes like
	 * the other unaligned loops.
	 */
.Lmemcpy_bsrcul1loop32:
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul1loop32
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c		/* back to r2 = remaining - 4 */
	blt	.Lmemcpy_bsrcul1l4

.Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul1loop4

.Lmemcpy_bsrcul1l4:
	/* 1 byte of the last word loaded is still unused */
	add	r1, r1, #1
	b	.Lmemcpy_bl4
END(_memcpy)