/*	$OpenBSD: _memcpy.S,v 1.3 2008/06/26 05:42:04 ray Exp $	*/
/*	$NetBSD: _memcpy.S,v 1.4 2003/04/05 23:08:52 bjh21 Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy()
 * memmove() and bcopy().
 *
 * All local labels are prefixed with .Lmemcpy_
 * Following the prefix a label starting f is used in the forward copy code
 * while a label using b is used in the backwards copy code
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy.
 * unaligned source address
 * unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3 although r0-r2 have defined uses i.e. dest, src, len through out.
 * Additional registers are preserved prior to use i.e. r4, r5 & lr
 *
 * Apologies for the state of the comments ;-)
 */

/*
 * _memcpy - overlap-tolerant byte copy core.
 *
 * In:	r0 = destination address
 *	r1 = source address
 *	r2 = length in bytes
 * Out:	r0 = destination address on every exit path:
 *	  - src == dst : r0 is never touched
 *	  - forward    : the entry value is reloaded from the stack
 *	  - backward   : r0 starts at dest+len and is pre-decremented by
 *	                 exactly len bytes of stores
 * Clobbers: r1, r2, r3, r12 and the condition flags.
 * Preserved: r4, r5 and lr are pushed before use and popped afterwards.
 * Stack: at most 16 bytes (forward: {r0, lr} plus {r4} or {r4, r5};
 *	  backward: {r4, lr} or {r4, r5, lr}).
 *
 * Direction: when src < dst (unsigned) the copy runs from the top of the
 * buffers downwards, otherwise from the bottom upwards, so a load from any
 * location always happens before that location is overwritten.
 *
 * The unaligned-source loops rebuild each destination word from two
 * neighbouring aligned source words using "lsr #N / lsl #(32-N)" going
 * forwards and the mirror image going backwards; those shift directions
 * match little-endian byte order.
 *
 * Throughout, r2 is kept *biased*: it holds "bytes remaining minus the
 * size of the next chunk that would be attempted", so the sign of r2
 * after a subs/adds directly answers "is there room for another chunk?".
 */
ENTRY(_memcpy)
	/* Determine copy direction */
	cmp	r1, r0
	bcc	.Lmemcpy_backwards	/* src < dst: copy from the top down */

	/*
	 * Flags are still those of "cmp r1, r0": eq means src == dst, so
	 * there is nothing to move.  r0 is deliberately left alone so that
	 * this exit returns the destination address like every other one.
	 * (A zero length is handled at .Lmemcpy_fl4 / .Lmemcpy_bl4.)
	 */
	moveq	pc, lr

	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
	subs	r2, r2, #4
	blt	.Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_fsrcul		/* oh unaligned source addr */

.Lmemcpy_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_floop32

	cmn	r2, #0x10		/* ge <=> at least 16 bytes left */
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */

.Lmemcpy_fl32:
	adds	r2, r2, #0x14		/* r2 = remaining - 12 */

	/* blat 12 bytes at a time */
.Lmemcpy_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemcpy_floop12

.Lmemcpy_fl12:
	adds	r2, r2, #8		/* r2 = remaining - 4 */
	blt	.Lmemcpy_fl4

	/* 4..11 bytes left: move one word (lt) or two words (ge) */
	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* undo the bias: r2 = 0..3 */
	ldmeqia	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2			/* 1: always, 2: ge, 3: gt */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}

	/* erg - unaligned destination */
.Lmemcpy_fdestul:
	rsb	r12, r12, #4		/* r12 = 1..3 bytes up to alignment */
	cmp	r12, #2			/* 1: always, 2: ge, 3: gt */

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemcpy_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	.Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemcpy_fsrcul:
	/*
	 * r12 = src & 3 (1..3).  Round src down to a word boundary and
	 * preload the word holding the first source bytes into lr; from
	 * here on lr always carries the most recently loaded source word.
	 */
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_fsrcul3
	beq	.Lmemcpy_fsrcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* source is 1 byte past a word boundary: 16 bytes per pass */
.Lmemcpy_fsrcul1loop16:
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_fsrcul1l4

.Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul1loop4

.Lmemcpy_fsrcul1l4:
	/* r1 is a word past lr's address; rewind to the first uncopied byte */
	sub	r1, r1, #3
	b	.Lmemcpy_fl4

.Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* source is 2 bytes past a word boundary: 16 bytes per pass */
.Lmemcpy_fsrcul2loop16:
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_fsrcul2l4

.Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul2loop4

.Lmemcpy_fsrcul2l4:
	/* rewind r1 to the first uncopied byte inside lr's word */
	sub	r1, r1, #2
	b	.Lmemcpy_fl4

.Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

	/* source is 3 bytes past a word boundary: 16 bytes per pass */
.Lmemcpy_fsrcul3loop16:
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_fsrcul3l4

.Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul3loop4

.Lmemcpy_fsrcul3l4:
	/* rewind r1 to the first uncopied byte inside lr's word */
	sub	r1, r1, #1
	b	.Lmemcpy_fl4

.Lmemcpy_backwards:
	/*
	 * Point both registers one past the end of their buffers; every
	 * access below uses pre-decrement.  Nothing has been pushed on this
	 * path yet, so the plain exits use "mov pc, lr".
	 */
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	.Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_bdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_bsrcul		/* oh unaligned source addr */

.Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	.Lmemcpy_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_bloop32

.Lmemcpy_bl32:
	cmn	r2, #0x10		/* ge <=> at least 16 bytes left */
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14		/* r2 = remaining - 12 */
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, lr}

.Lmemcpy_bl12:
	adds	r2, r2, #8		/* r2 = remaining - 4 */
	blt	.Lmemcpy_bl4
	/* 4..11 bytes left: move one word (lt) or two words (ge) */
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* undo the bias: r2 = 0..3 */
	moveq	pc, lr			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2			/* 1: always, 2: ge, 3: gt */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	mov	pc, lr

	/* erg - unaligned destination */
.Lmemcpy_bdestul:
	/* r12 = (end of dest) & 3 = 1..3 bytes down to alignment */
	cmp	r12, #2			/* 1: always, 2: ge, 3: gt */

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	.Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	.Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
.Lmemcpy_bsrcul:
	/*
	 * r12 = (end of src) & 3 (1..3).  Round the source pointer down to
	 * a word boundary and preload that word into r3; from here on r3
	 * always carries the most recently loaded (lowest) source word.
	 */
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	.Lmemcpy_bsrcul1
	beq	.Lmemcpy_bsrcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

	/* source end is 3 bytes past a word boundary: 16 bytes per pass */
.Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_bsrcul3l4

.Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul3loop4

.Lmemcpy_bsrcul3l4:
	/* r1 is at the base of r3's word; step past its uncopied bytes */
	add	r1, r1, #3
	b	.Lmemcpy_bl4

.Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

	/* source end is 2 bytes past a word boundary: 16 bytes per pass */
.Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_bsrcul2l4

.Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul2loop4

.Lmemcpy_bsrcul2l4:
	/* r1 is at the base of r3's word; step past its uncopied bytes */
	add	r1, r1, #2
	b	.Lmemcpy_bl4

.Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}

	/* source end is 1 byte past a word boundary: 16 bytes per pass */
.Lmemcpy_bsrcul1loop16:
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul1loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_bsrcul1l4

.Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul1loop4

.Lmemcpy_bsrcul1l4:
	/* r1 is at the base of r3's word; step past its uncopied byte */
	add	r1, r1, #1
	b	.Lmemcpy_bl4