/*-
 * Copyright (c) 2018-2019 The FreeBSD Foundation
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1993 The Regents of the University of California.
 * All rights reserved.
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Primarily rewritten and redeveloped by Mateusz Guzik
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * Macros to help implement memcmp(), bcmp(), bzero(), memset(),
 * memcpy(), bcopy(), memmove()
 */

/*
 * memcmp(b1, b2, len)
 *         rdi, rsi, rdx
 */
.macro MEMCMP end
	xorl	%eax,%eax
10:
	cmpq	$16,%rdx
	ja	101632f

100816:
	cmpb	$8,%dl
	jl	100408f
	movq	(%rdi),%r8
	movq	(%rsi),%r9
	cmpq	%r8,%r9
	jne	80f
	movq	-8(%rdi,%rdx),%r8
	movq	-8(%rsi,%rdx),%r9
	cmpq	%r8,%r9
	jne	10081608f
	\end
100408:
	cmpb	$4,%dl
	jl	100204f
	movl	(%rdi),%r8d
	movl	(%rsi),%r9d
	cmpl	%r8d,%r9d
	jne	80f
	movl	-4(%rdi,%rdx),%r8d
	movl	-4(%rsi,%rdx),%r9d
	cmpl	%r8d,%r9d
	jne	10040804f
	\end
100204:
	cmpb	$2,%dl
	jl	100001f
	movzwl	(%rdi),%r8d
	movzwl	(%rsi),%r9d
	cmpl	%r8d,%r9d
	jne	1f
	movzwl	-2(%rdi,%rdx),%r8d
	movzwl	-2(%rsi,%rdx),%r9d
	cmpl	%r8d,%r9d
	jne	1f
	\end
100001:
	cmpb	$1,%dl
	jl	100000f
	movzbl	(%rdi),%eax
	movzbl	(%rsi),%r8d
	subl	%r8d,%eax
100000:
	\end
	ALIGN_TEXT
101632:
	cmpq	$32,%rdx
	ja	103200f
	movq	(%rdi),%r8
	movq	(%rsi),%r9
	cmpq	%r8,%r9
	jne	80f
	movq	8(%rdi),%r8
	movq	8(%rsi),%r9
	cmpq	%r8,%r9
	jne	10163208f
	movq	-16(%rdi,%rdx),%r8
	movq	-16(%rsi,%rdx),%r9
	cmpq	%r8,%r9
	jne	10163216f
	movq	-8(%rdi,%rdx),%r8
	movq	-8(%rsi,%rdx),%r9
	cmpq	%r8,%r9
	jne	10163224f
	\end
	ALIGN_TEXT
103200:
	movq	(%rdi),%r8
	movq	8(%rdi),%r9
	subq	(%rsi),%r8
	subq	8(%rsi),%r9
	orq	%r8,%r9
	jnz	10320000f

	movq	16(%rdi),%r8
	movq	24(%rdi),%r9
	subq	16(%rsi),%r8
	subq	24(%rsi),%r9
	orq	%r8,%r9
	jnz	10320016f

	leaq	32(%rdi),%rdi
	leaq	32(%rsi),%rsi
	subq	$32,%rdx
	cmpq	$32,%rdx
	jae	103200b
	cmpb	$0,%dl
	jne	10b
	\end

	/*
	 * Mismatch was found.
	 *
	 * Before we compute it we narrow down the range (16 -> 8 -> 4 bytes).
	 */
	ALIGN_TEXT
10320016:
	leaq	16(%rdi),%rdi
	leaq	16(%rsi),%rsi
10320000:
	movq	(%rdi),%r8
	movq	(%rsi),%r9
	cmpq	%r8,%r9
	jne	80f
	leaq	8(%rdi),%rdi
	leaq	8(%rsi),%rsi
	jmp	80f
	ALIGN_TEXT
10081608:
10163224:
	leaq	-8(%rdi,%rdx),%rdi
	leaq	-8(%rsi,%rdx),%rsi
	jmp	80f
	ALIGN_TEXT
10163216:
	leaq	-16(%rdi,%rdx),%rdi
	leaq	-16(%rsi,%rdx),%rsi
	jmp	80f
	ALIGN_TEXT
10163208:
	leaq	8(%rdi),%rdi
	leaq	8(%rsi),%rsi
	jmp	80f
	ALIGN_TEXT
10040804:
	leaq	-4(%rdi,%rdx),%rdi
	leaq	-4(%rsi,%rdx),%rsi
	jmp	1f

	ALIGN_TEXT
80:
	movl	(%rdi),%r8d
	movl	(%rsi),%r9d
	cmpl	%r8d,%r9d
	jne	1f
	leaq	4(%rdi),%rdi
	leaq	4(%rsi),%rsi

	/*
	 * We have up to 4 bytes to inspect.
	 */
1:
	movzbl	(%rdi),%eax
	movzbl	(%rsi),%r8d
	cmpb	%r8b,%al
	jne	2f

	movzbl	1(%rdi),%eax
	movzbl	1(%rsi),%r8d
	cmpb	%r8b,%al
	jne	2f

	movzbl	2(%rdi),%eax
	movzbl	2(%rsi),%r8d
	cmpb	%r8b,%al
	jne	2f

	movzbl	3(%rdi),%eax
	movzbl	3(%rsi),%r8d
2:
	subl	%r8d,%eax
	\end
.endm
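/*
 * A note on the scheme above (inferred from the code, not from an
 * authoritative description): the numeric local labels encode the size
 * range they serve, e.g. 100816 handles lengths 8..16 and 101632 handles
 * 16..32, while longer labels such as 10163208 name the spot a mismatch
 * was found (range 16..32, quadword at offset 8).
 *
 * The key trick for a length between N and 2N is to compare the first N
 * bytes and the last N bytes; the two windows overlap, so together they
 * cover the whole buffer with no byte loop.  A minimal C sketch of the
 * 8..16 case (differ8to16 is an illustrative name, assuming <stdint.h>
 * and <string.h>):
 *
 *	// Nonzero iff the buffers differ; valid for 8 <= len <= 16.
 *	static int
 *	differ8to16(const void *b1, const void *b2, size_t len)
 *	{
 *		uint64_t a, b, c, d;
 *
 *		memcpy(&a, b1, 8);	// head window
 *		memcpy(&b, b2, 8);
 *		memcpy(&c, (const char *)b1 + len - 8, 8); // tail window
 *		memcpy(&d, (const char *)b2 + len - 8, 8);
 *		return (a != b || c != d);
 *	}
 *
 * The assembly additionally narrows a mismatching quadword down to the
 * first differing byte (labels 80, 1 and 2) to produce the signed
 * memcmp() return value.
 */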

/*
 * memmove(dst, src, cnt)
 *         rdi, rsi, rdx
 */

/*
 * Register state at entry is supposed to be as follows:
 * rdi - destination
 * rsi - source
 * rdx - count
 *
 * The macro possibly clobbers the above and: rcx, r8, r9, r10
 * It does not clobber rax nor r11.
 */
.macro MEMMOVE erms overlap end
	/*
	 * For sizes 0..32 all data is read before it is written, so there
	 * is no correctness issue with direction of copying.
	 */
	movq	%rdx,%rcx
	cmpq	$32,%rdx
	jbe	101632f

.if \overlap == 1
	movq	%rdi,%r8
	subq	%rsi,%r8
	cmpq	%rcx,%r8	/* overlapping && src < dst? */
	jb	2f
.endif

	/*
	 * AMD's movsq gets better at around 1024 bytes, Intel's gets
	 * better at around 256 bytes (Zen 2, 9900K era).
	 */
	cmpq	$1024,%rcx
	ja	1256f

103200:
	movq	(%rsi),%rdx
	movq	%rdx,(%rdi)
	movq	8(%rsi),%rdx
	movq	%rdx,8(%rdi)
	movq	16(%rsi),%rdx
	movq	%rdx,16(%rdi)
	movq	24(%rsi),%rdx
	movq	%rdx,24(%rdi)
	leaq	32(%rsi),%rsi
	leaq	32(%rdi),%rdi
	subq	$32,%rcx
	cmpq	$32,%rcx
	jae	103200b
	cmpb	$0,%cl
	jne	101632f
	\end
	ALIGN_TEXT
101632:
	cmpb	$16,%cl
	jl	100816f
	movq	(%rsi),%rdx
	movq	8(%rsi),%r8
	movq	-16(%rsi,%rcx),%r9
	movq	-8(%rsi,%rcx),%r10
	movq	%rdx,(%rdi)
	movq	%r8,8(%rdi)
	movq	%r9,-16(%rdi,%rcx)
	movq	%r10,-8(%rdi,%rcx)
	\end
	ALIGN_TEXT
100816:
	cmpb	$8,%cl
	jl	100408f
	movq	(%rsi),%rdx
	movq	-8(%rsi,%rcx),%r8
	movq	%rdx,(%rdi)
	movq	%r8,-8(%rdi,%rcx)
	\end
	ALIGN_TEXT
100408:
	cmpb	$4,%cl
	jl	100204f
	movl	(%rsi),%edx
	movl	-4(%rsi,%rcx),%r8d
	movl	%edx,(%rdi)
	movl	%r8d,-4(%rdi,%rcx)
	\end
	ALIGN_TEXT
100204:
	cmpb	$2,%cl
	jl	100001f
	movzwl	(%rsi),%edx
	movzwl	-2(%rsi,%rcx),%r8d
	movw	%dx,(%rdi)
	movw	%r8w,-2(%rdi,%rcx)
	\end
	ALIGN_TEXT
100001:
	cmpb	$1,%cl
	jl	100000f
	movb	(%rsi),%dl
	movb	%dl,(%rdi)
100000:
	\end

	/*
	 * More than 1024 bytes
	 */
	ALIGN_TEXT
1256:
	testb	$15,%dil
	jnz	100f
.if \erms == 1
	rep
	movsb
.else
	shrq	$3,%rcx			/* copy by 64-bit words */
	rep
	movsq
	movq	%rdx,%rcx
	andl	$7,%ecx			/* any bytes left? */
	jne	100408b
.endif
	\end
100:
	movq	(%rsi),%r8
	movq	8(%rsi),%r9
	movq	%rdi,%r10
	movq	%rdi,%rcx
	andq	$15,%rcx
	leaq	-16(%rdx,%rcx),%rdx
	neg	%rcx
	leaq	16(%rdi,%rcx),%rdi
	leaq	16(%rsi,%rcx),%rsi
	movq	%rdx,%rcx
.if \erms == 1
	rep
	movsb
	movq	%r8,(%r10)
	movq	%r9,8(%r10)
.else
	shrq	$3,%rcx			/* copy by 64-bit words */
	rep
	movsq
	movq	%r8,(%r10)
	movq	%r9,8(%r10)
	movq	%rdx,%rcx
	andl	$7,%ecx			/* any bytes left? */
	jne	100408b
.endif
	\end
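	/*
	 * The backwards path below (label 2) is entered from the overlap
	 * test near the top of the macro.  Computing dst - src in unsigned
	 * arithmetic folds "src < dst && dst < src + cnt" into one compare:
	 * when dst < src the subtraction wraps to a huge value that can
	 * never be below cnt.  A C sketch of the same test (illustrative
	 * names, not code from this file):
	 *
	 *	if ((uintptr_t)dst - (uintptr_t)src < cnt)
	 *		copy_backwards(dst, src, cnt);	// dst aliases src's tail
	 *	else
	 *		copy_forwards(dst, src, cnt);
	 */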
.if \overlap == 1
	/*
	 * Copy backwards.
	 */
	ALIGN_TEXT
2:
	cmpq	$256,%rcx
	ja	2256f

	leaq	-8(%rdi,%rcx),%rdi
	leaq	-8(%rsi,%rcx),%rsi

	cmpq	$32,%rcx
	jb	2016f

2032:
	movq	(%rsi),%rdx
	movq	%rdx,(%rdi)
	movq	-8(%rsi),%rdx
	movq	%rdx,-8(%rdi)
	movq	-16(%rsi),%rdx
	movq	%rdx,-16(%rdi)
	movq	-24(%rsi),%rdx
	movq	%rdx,-24(%rdi)
	leaq	-32(%rsi),%rsi
	leaq	-32(%rdi),%rdi
	subq	$32,%rcx
	cmpq	$32,%rcx
	jae	2032b
	cmpb	$0,%cl
	jne	2016f
	\end
	ALIGN_TEXT
2016:
	cmpb	$16,%cl
	jl	2008f
	movq	(%rsi),%rdx
	movq	%rdx,(%rdi)
	movq	-8(%rsi),%rdx
	movq	%rdx,-8(%rdi)
	subb	$16,%cl
	jz	2000f
	leaq	-16(%rsi),%rsi
	leaq	-16(%rdi),%rdi
2008:
	cmpb	$8,%cl
	jl	2004f
	movq	(%rsi),%rdx
	movq	%rdx,(%rdi)
	subb	$8,%cl
	jz	2000f
	leaq	-8(%rsi),%rsi
	leaq	-8(%rdi),%rdi
2004:
	cmpb	$4,%cl
	jl	2002f
	movl	4(%rsi),%edx
	movl	%edx,4(%rdi)
	subb	$4,%cl
	jz	2000f
	leaq	-4(%rsi),%rsi
	leaq	-4(%rdi),%rdi
2002:
	cmpb	$2,%cl
	jl	2001f
	movw	6(%rsi),%dx
	movw	%dx,6(%rdi)
	subb	$2,%cl
	jz	2000f
	leaq	-2(%rsi),%rsi
	leaq	-2(%rdi),%rdi
2001:
	cmpb	$1,%cl
	jl	2000f
	movb	7(%rsi),%dl
	movb	%dl,7(%rdi)
2000:
	\end
	ALIGN_TEXT
2256:
	std
.if \erms == 1
	leaq	-1(%rdi,%rcx),%rdi
	leaq	-1(%rsi,%rcx),%rsi
	rep
	movsb
	cld
.else
	leaq	-8(%rdi,%rcx),%rdi
	leaq	-8(%rsi,%rcx),%rsi
	shrq	$3,%rcx
	rep
	movsq
	cld
	movq	%rdx,%rcx
	andb	$7,%cl
	jne	2004b
.endif
	\end
.endif
.endm

/*
 * memset(dst, c, len)
 *        rdi, r10, rdx
 */
.macro MEMSET erms end
	movq	%rdi,%rax
	movq	%rdx,%rcx

	cmpq	$32,%rcx
	jbe	101632f

	cmpq	$256,%rcx
	ja	1256f

103200:
	movq	%r10,(%rdi)
	movq	%r10,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r10,24(%rdi)
	leaq	32(%rdi),%rdi
	subq	$32,%rcx
	cmpq	$32,%rcx
	ja	103200b
	cmpb	$16,%cl
	ja	201632f
	movq	%r10,-16(%rdi,%rcx)
	movq	%r10,-8(%rdi,%rcx)
	\end
	ALIGN_TEXT
101632:
	cmpb	$16,%cl
	jl	100816f
201632:
	movq	%r10,(%rdi)
	movq	%r10,8(%rdi)
	movq	%r10,-16(%rdi,%rcx)
	movq	%r10,-8(%rdi,%rcx)
	\end
	ALIGN_TEXT
100816:
	cmpb	$8,%cl
	jl	100408f
	movq	%r10,(%rdi)
	movq	%r10,-8(%rdi,%rcx)
	\end
	ALIGN_TEXT
100408:
	cmpb	$4,%cl
	jl	100204f
	movl	%r10d,(%rdi)
	movl	%r10d,-4(%rdi,%rcx)
	\end
	ALIGN_TEXT
100204:
	cmpb	$2,%cl
	jl	100001f
	movw	%r10w,(%rdi)
	movw	%r10w,-2(%rdi,%rcx)
	\end
	ALIGN_TEXT
100001:
	cmpb	$0,%cl
	je	100000f
	movb	%r10b,(%rdi)
100000:
	\end
	ALIGN_TEXT
1256:
	movq	%rdi,%r9
	movq	%r10,%rax
	testl	$15,%edi
	jnz	3f
1:
.if \erms == 1
	rep
	stosb
	movq	%r9,%rax
.else
	movq	%rcx,%rdx
	shrq	$3,%rcx
	rep
	stosq
	movq	%r9,%rax
	andl	$7,%edx
	jnz	2f
	\end
2:
	movq	%r10,-8(%rdi,%rdx)
.endif
	\end
	ALIGN_TEXT
3:
	movq	%r10,(%rdi)
	movq	%r10,8(%rdi)
	movq	%rdi,%r8
	andq	$15,%r8
	leaq	-16(%rcx,%r8),%rcx
	neg	%r8
	leaq	16(%rdi,%r8),%rdi
	jmp	1b
.endm

.macro DUMMYARG
.endm
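/*
 * A minimal usage sketch (assumed, not part of this file): the MEMSET
 * macro stores %r10 as a full 8-byte pattern, so an entry point built on
 * it must broadcast the fill byte across %r10 first.  ENTRY/END are the
 * usual <machine/asm.h> helpers and memset_plain is an illustrative
 * name; DUMMYARG above can presumably be passed wherever a macro
 * argument should expand to nothing.
 *
 *	ENTRY(memset_plain)
 *		movzbq	%sil,%r10		// zero-extend the fill byte
 *		movabs	$0x0101010101010101,%r8
 *		imulq	%r8,%r10		// replicate it into all 8 bytes
 *		MEMSET erms=0 end=ret
 *	END(memset_plain)
 */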