1// Copyright 2016 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// +build !appengine 6// +build gc 7// +build !noasm 8 9#include "textflag.h" 10 11// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a 12// Go toolchain regression. See https://github.com/golang/go/issues/15426 and 13// https://github.com/golang/snappy/issues/29 14// 15// As a workaround, the package was built with a known good assembler, and 16// those instructions were disassembled by "objdump -d" to yield the 17// 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 18// style comments, in AT&T asm syntax. Note that rsp here is a physical 19// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm). 20// The instructions were then encoded as "BYTE $0x.." sequences, which assemble 21// fine on Go 1.6. 22 23// The asm code generally follows the pure Go code in encode_other.go, except 24// where marked with a "!!!". 25 26// ---------------------------------------------------------------------------- 27 28// func emitLiteral(dst, lit []byte) int 29// 30// All local variables fit into registers. The register allocation: 31// - AX len(lit) 32// - BX n 33// - DX return value 34// - DI &dst[i] 35// - R10 &lit[0] 36// 37// The 24 bytes of stack space is to call runtime·memmove. 38// 39// The unusual register allocation of local variables, such as R10 for the 40// source pointer, matches the allocation used at the call site in encodeBlock, 41// which makes it easier to manually inline this function. 
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
	MOVQ dst_base+0(FP), DI
	MOVQ lit_base+24(FP), R10
	MOVQ lit_len+32(FP), AX
	MOVQ AX, DX
	MOVL AX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT  oneByte
	CMPL BX, $256
	JLT  twoBytes

threeBytes:
	// Literal tag 0xf4 == 61<<2: len(lit)-1 follows as a little-endian uint16.
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	ADDQ $3, DX
	JMP  memmove

twoBytes:
	// Literal tag 0xf0 == 60<<2: len(lit)-1 follows as a single byte.
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	ADDQ $2, DX
	JMP  memmove

oneByte:
	// Short literal: len(lit)-1 fits in the tag byte itself, as (len(lit)-1)<<2.
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI
	ADDQ $1, DX

memmove:
	// The return value (tag size + len(lit)) is stored before the CALL below
	// clobbers registers.
	MOVQ DX, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	CALL runtime·memmove(SB)
	RET

// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- AX	length
//	- SI	&dst[0]
//	- DI	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ DI, SI
	MOVQ offset+24(FP), R11
	MOVQ length+32(FP), AX

loop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT  step1

	// Emit a length 64 copy, encoded as 3 bytes.
	// Tag 0xfe == (64-1)<<2 | tagCopy2, followed by the uint16 offset.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  loop0

step1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  step2

	// Emit a length 60 copy, encoded as 3 bytes.
	// Tag 0xee == (60-1)<<2 | tagCopy2, leaving a remainder length of 5..12,
	// which a 2-byte copy below can always encode.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMPL AX, $12
	JGE  step3
	CMPL R11, $2048
	JGE  step3

	// Emit the remaining copy, encoded as 2 bytes:
	// tag = (offset>>8)<<5 | (length-4)<<2 | 1, then the low byte of offset.
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes:
	// tag = (length-1)<<2 | 2, then the little-endian uint16 offset.
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
//	- DX	&src[0]
//	- SI	&src[j]
//	- R13	&src[len(src) - 8]
//	- R14	&src[len(src)]
//	- R15	&src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVQ src_base+0(FP), DX
	MOVQ src_len+8(FP), R14
	MOVQ i+24(FP), R15
	MOVQ j+32(FP), SI
	ADDQ DX, R14
	ADDQ DX, R15
	ADDQ DX, SI
	MOVQ R14, R13
	SUBQ $8, R13

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA   cmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, R15
	ADDQ $8, SI
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI

	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE  extendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE  extendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP  cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- AX	.	.
//	- BX	.	.
//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
//	- DX	64	&src[0], tableSize
//	- SI	72	&src[s]
//	- DI	80	&dst[d]
//	- R9	88	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	96	prevHash, currHash, nextHash, offset
//	- R12	104	&src[base], skip
//	- R13	.	&src[nextS], &src[len(src) - 8]
//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
//	- R15	112	candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
TEXT ·encodeBlock(SB), 0, $32888-56
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVQ $24, CX
	MOVQ $256, DX

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	CMPQ DX, $16384
	JGE  varTable
	CMPQ DX, R14
	JGE  varTable
	SUBQ $1, CX
	SHLQ $1, DX
	JMP  calcShift

varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
	// 2048 writes that would zero-initialize all of table's 32768 bytes.
	SHRQ $3, DX
	LEAQ table-32768(SP), BX
	PXOR X0, X0

memclr:
	MOVOU X0, 0(BX)
	ADDQ  $16, BX
	SUBQ  $1, DX
	JNZ   memclr

	// !!! DX = &src[0]
	MOVQ SI, DX

	// sLimit := len(src) - inputMargin
	MOVQ R14, R9
	SUBQ $15, R9

	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVQ CX, 56(SP)
	MOVQ DX, 64(SP)
	MOVQ R9, 88(SP)

	// nextEmit := 0
	MOVQ DX, R10

	// s := 1
	ADDQ $1, SI

	// nextHash := hash(load32(src, s), shift)
	MOVL  0(SI), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

outer:
	// for { etc }

	// skip := 32
	MOVQ $32, R12

	// nextS := s
	MOVQ SI, R13

	// candidate := 0
	MOVQ $0, R15

inner0:
	// for { etc }

	// s := nextS
	MOVQ R13, SI

	// bytesBetweenHashLookups := skip >> 5
	MOVQ R12, R14
	SHRQ $5, R14

	// nextS = s + bytesBetweenHashLookups
	ADDQ R14, R13

	// skip += bytesBetweenHashLookups
	ADDQ R14, R12

	// if nextS > sLimit { goto emitRemainder }
	MOVQ R13, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JA   emitRemainder

	// candidate = int(table[nextHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[nextHash] = uint16(s)
	MOVQ SI, AX
	SUBQ DX, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// nextHash = hash(load32(src, nextS), shift)
	MOVL  0(R13), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVL 0(SI), AX
	MOVL (DX)(R15*1), BX
	CMPL AX, BX
	JNE  inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	MOVQ SI, AX
	SUBQ R10, AX
	CMPQ AX, $16
	JLE  emitLiteralFastPath

	// ----------------------------------------
	// Begin inline of the emitLiteral call.
	//
	// d += emitLiteral(dst[d:], src[nextEmit:s])

	MOVL AX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT  inlineEmitLiteralOneByte
	CMPL BX, $256
	JLT  inlineEmitLiteralTwoBytes

inlineEmitLiteralThreeBytes:
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	JMP  inlineEmitLiteralMemmove

inlineEmitLiteralTwoBytes:
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	JMP  inlineEmitLiteralMemmove

inlineEmitLiteralOneByte:
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI

inlineEmitLiteralMemmove:
	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
	MOVQ SI, 72(SP)
	MOVQ DI, 80(SP)
	MOVQ R15, 112(SP)
	CALL runtime·memmove(SB)
	MOVQ 56(SP), CX
	MOVQ 64(SP), DX
	MOVQ 72(SP), SI
	MOVQ 80(SP), DI
	MOVQ 88(SP), R9
	MOVQ 112(SP), R15
	JMP  inner1

inlineEmitLiteralEnd:
	// End inline of the emitLiteral call.
	// ----------------------------------------

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB AX, BX
	SUBB $1, BX
	SHLB $2, BX
	MOVB BX, (DI)
	ADDQ $1, DI

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(R10), X0
	MOVOU X0, 0(DI)
	ADDQ  AX, DI

inner1:
	// for { etc }

	// base := s
	MOVQ SI, R12

	// !!! offset := base - candidate
	MOVQ R12, R11
	SUBQ R15, R11
	SUBQ DX, R11

	// ----------------------------------------
	// Begin inline of the extendMatch call.
	//
	// s = extendMatch(src, candidate+4, s+4)

	// !!! R14 = &src[len(src)]
	MOVQ src_len+32(FP), R14
	ADDQ DX, R14

	// !!! R13 = &src[len(src) - 8]
	MOVQ R14, R13
	SUBQ $8, R13

	// !!! R15 = &src[candidate + 4]
	ADDQ $4, R15
	ADDQ DX, R15

	// !!! s += 4
	ADDQ $4, SI

inlineExtendMatchCmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA   inlineExtendMatchCmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE  inlineExtendMatchBSF
	ADDQ $8, R15
	ADDQ $8, SI
	JMP  inlineExtendMatchCmp8

inlineExtendMatchBSF:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI
	JMP  inlineExtendMatchEnd

inlineExtendMatchCmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE  inlineExtendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE  inlineExtendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP  inlineExtendMatchCmp1

inlineExtendMatchEnd:
	// End inline of the extendMatch call.
	// ----------------------------------------

	// ----------------------------------------
	// Begin inline of the emitCopy call.
	//
	// d += emitCopy(dst[d:], base-candidate, s-base)

	// !!! length := s - base
	MOVQ SI, AX
	SUBQ R12, AX

inlineEmitCopyLoop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT  inlineEmitCopyStep1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  inlineEmitCopyLoop0

inlineEmitCopyStep1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  inlineEmitCopyStep2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

inlineEmitCopyStep2:
	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
	CMPL AX, $12
	JGE  inlineEmitCopyStep3
	CMPL R11, $2048
	JGE  inlineEmitCopyStep3

	// Emit the remaining copy, encoded as 2 bytes:
	// tag = (offset>>8)<<5 | (length-4)<<2 | 1, then the low byte of offset.
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI
	JMP  inlineEmitCopyEnd

inlineEmitCopyStep3:
	// Emit the remaining copy, encoded as 3 bytes:
	// tag = (length-1)<<2 | 2, then the little-endian uint16 offset.
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

inlineEmitCopyEnd:
	// End inline of the emitCopy call.
	// ----------------------------------------

	// nextEmit = s
	MOVQ SI, R10

	// if s >= sLimit { goto emitRemainder }
	MOVQ SI, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JAE  emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.

	// x := load64(src, s-1)
	MOVQ -1(SI), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// table[prevHash] = uint16(s-1)
	MOVQ SI, AX
	SUBQ DX, AX
	SUBQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// currHash := hash(uint32(x>>8), shift)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// candidate = int(table[currHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[currHash] = uint16(s)
	ADDQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVL (DX)(R15*1), BX
	CMPL R14, BX
	JEQ  inner1

	// nextHash = hash(uint32(x>>16), shift)
	// (R14 already holds x>>8 here, so one more 8-bit shift yields x>>16.)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// s++
	ADDQ $1, SI

	// break out of the inner1 for loop, i.e. continue the outer loop.
	JMP outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVQ src_len+32(FP), AX
	ADDQ DX, AX
	CMPQ R10, AX
	JEQ  encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R10, 24(SP)
	SUBQ R10, AX
	MOVQ AX, 32(SP)
	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ DI, 80(SP)
	CALL ·emitLiteral(SB)
	MOVQ 80(SP), DI

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADDQ 48(SP), DI

encodeBlockEnd:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, DI
	MOVQ DI, d+48(FP)
	RET