// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".

// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
//	- R3	len(lit)
//	- R4	n
//	- R6	return value
//	- R8	&dst[i]
//	- R10	&lit[0]
//
// The 32 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
TEXT ·emitLiteral(SB), NOSPLIT, $32-56
	MOVD dst_base+0(FP), R8
	MOVD lit_base+24(FP), R10
	MOVD lit_len+32(FP), R3
	MOVD R3, R6
	// n = len(lit) - 1; only the low 32 bits are needed for the tag.
	MOVW R3, R4
	SUBW $1, R4, R4

	// Plan 9 asm note: "CMPW $60, R4" sets flags from R4-60, so BLT branches
	// when n < 60.
	CMPW $60, R4
	BLT  oneByte
	CMPW $256, R4
	BLT  twoBytes

threeBytes:
	// Literal tag with a 2-byte length: 0xf4 == 61<<2.
	MOVD $0xf4, R2
	MOVB R2, 0(R8)
	// MOVW stores 4 bytes but only 3 are accounted for below (ADD $3);
	// NOTE(review): the extra byte appears to rely on later output
	// overwriting it — confirm dst always has that slack at the call sites.
	MOVW R4, 1(R8)
	ADD  $3, R8, R8
	ADD  $3, R6, R6
	B    memmove

twoBytes:
	// Literal tag with a 1-byte length: 0xf0 == 60<<2.
	MOVD $0xf0, R2
	MOVB R2, 0(R8)
	MOVB R4, 1(R8)
	ADD  $2, R8, R8
	ADD  $2, R6, R6
	B    memmove

oneByte:
	// Short literal: the length is packed into the tag byte itself, n<<2.
	LSLW $2, R4, R4
	MOVB R4, 0(R8)
	ADD  $1, R8, R8
	ADD  $1, R6, R6

memmove:
	// The return value (tag bytes + len(lit)) is stored before the call, as
	// R6 is not preserved across it.
	MOVD R6, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// R8, R10 and R3 as arguments.
	MOVD R8, 8(RSP)
	MOVD R10, 16(RSP)
	MOVD R3, 24(RSP)
	CALL runtime·memmove(SB)
	RET

// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- R3	length
//	- R7	&dst[0]
//	- R8	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVD dst_base+0(FP), R8
	// Keep &dst[0] in R7 so the byte count can be computed at the end.
	MOVD R8, R7
	MOVD offset+24(FP), R11
	MOVD length+32(FP), R3

loop0:
	// for length >= 68 { etc }
	CMPW $68, R3
	BLT  step1

	// Emit a length 64 copy, encoded as 3 bytes: tag 0xfe == 63<<2 | 2,
	// followed by the 2-byte little-endian offset.
	MOVD $0xfe, R2
	MOVB R2, 0(R8)
	// MOVW stores 4 bytes but only 3 are accounted for; the extra byte is
	// overwritten by subsequent output.
	MOVW R11, 1(R8)
	ADD  $3, R8, R8
	SUB  $64, R3, R3
	B    loop0

step1:
	// if length > 64 { etc }
	CMP $64, R3
	BLE step2

	// Emit a length 60 copy, encoded as 3 bytes: tag 0xee == 59<<2 | 2.
	// This leaves 4 <= length <= 64-60+4, small enough for the final tag.
	MOVD $0xee, R2
	MOVB R2, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8
	SUB  $60, R3, R3

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMP  $12, R3
	BGE  step3
	CMPW $2048, R11
	BGE  step3

	// Emit the remaining copy, encoded as 2 bytes:
	//	dst[1] = uint8(offset)
	//	dst[0] = ((offset>>8)&7)<<5 | (length-4)<<2 | 1
	// The low offset byte must be stored first, before R11 is clobbered.
	MOVB R11, 1(R8)
	// offset>>3 & 0xe0 == ((offset>>8)&7)<<5.
	LSRW $3, R11, R11
	AND  $0xe0, R11, R11
	SUB  $4, R3, R3
	LSLW $2, R3
	// length-4 < 8 here, so this mask is a no-op kept for safety.
	AND  $0xff, R3, R3
	ORRW R3, R11, R11
	ORRW $1, R11, R11
	MOVB R11, 0(R8)
	ADD  $2, R8, R8

	// Return the number of bytes written.
	SUB  R7, R8, R8
	MOVD R8, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes:
	//	dst[0] = (length-1)<<2 | 2, then the 2-byte little-endian offset.
	SUB  $1, R3, R3
	AND  $0xff, R3, R3
	LSLW $2, R3, R3
	ORRW $2, R3, R3
	MOVB R3, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8

	// Return the number of bytes written.
	SUB  R7, R8, R8
	MOVD R8, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
//	- R6	&src[0]
//	- R7	&src[j]
//	- R13	&src[len(src) - 8]
//	- R14	&src[len(src)]
//	- R15	&src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVD src_base+0(FP), R6
	MOVD src_len+8(FP), R14
	MOVD i+24(FP), R15
	MOVD j+32(FP), R7
	// Turn the indices i, j and len(src) into absolute pointers.
	ADD  R6, R14, R14
	ADD  R6, R15, R15
	ADD  R6, R7, R7
	MOVD R14, R13
	SUB  $8, R13, R13

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	// BHI is an unsigned comparison: branch when R7 > R13.
	CMP  R13, R7
	BHI  cmp1
	MOVD (R15), R3
	MOVD (R7), R4
	CMP  R4, R3
	BNE  bsf
	ADD  $8, R15, R15
	ADD  $8, R7, R7
	B    cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs.
	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
	// combination of which finds the least significant bit which is set.
	// The arm64 architecture is little-endian, and the shift by 3 converts
	// a bit index to a byte index.
	EOR  R3, R4, R4
	RBIT R4, R4
	CLZ  R4, R4
	ADD  R4>>3, R7, R7

	// Convert from &src[ret] to ret.
	SUB  R6, R7, R7
	MOVD R7, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	// BLS: branch when R14 <= R7, i.e. j has reached len(src).
	CMP  R7, R14
	BLS  extendMatchEnd
	MOVB (R15), R3
	MOVB (R7), R4
	CMP  R4, R3
	BNE  extendMatchEnd
	ADD  $1, R15, R15
	ADD  $1, R7, R7
	B    cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUB  R6, R7, R7
	MOVD R7, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- R3	.	.
//	- R4	.	.
//	- R5	64	shift
//	- R6	72	&src[0], tableSize
//	- R7	80	&src[s]
//	- R8	88	&dst[d]
//	- R9	96	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	104	prevHash, currHash, nextHash, offset
//	- R12	112	&src[base], skip
//	- R13	.	&src[nextS], &src[len(src) - 8]
//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
//	- R15	120	candidate
//	- R16	.	hash constant, 0x1e35a7bd
//	- R17	.	&table
//	- .	128	table
//
// The second column (64, 72, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 64 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 64 + 64 = 32896.
TEXT ·encodeBlock(SB), 0, $32896-56
	MOVD dst_base+0(FP), R8
	MOVD src_base+24(FP), R7
	MOVD src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVD $24, R5
	MOVD $256, R6
	// Build the 32-bit hash multiplier 0x1e35a7bd in R16:
	// MOVW loads the low half, MOVKW inserts the high half, keeping the low.
	MOVW  $0xa7bd, R16
	MOVKW $(0x1e35<<16), R16

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	// maxTableSize == 16384 entries.
	MOVD $16384, R2
	CMP  R2, R6
	BGE  varTable
	CMP  R14, R6
	BGE  varTable
	SUB  $1, R5, R5
	LSL  $1, R6, R6
	B    calcShift

varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements.
	// Each uint16 element is 2 bytes and each iteration writes 64 bytes, so
	// we can do only tableSize/32 writes instead of the 2048 writes that
	// would zero-initialize all of table's 32768 bytes. This clear could
	// overrun the first tableSize elements, but it won't overrun the
	// allocated stack size.
	// table lives at 128(RSP); see the spill map above.
	ADD  $128, RSP, R17
	MOVD R17, R4

	// !!! R6 = &src[tableSize]
	ADD R6<<1, R17, R6

memclr:
	// Zero 64 bytes per iteration: STP.P stores a zero pair at R4 and
	// post-increments R4 by 64; the three plain STPs fill in the rest of
	// that 64-byte chunk at negative offsets from the new R4.
	STP.P (ZR, ZR), 64(R4)
	STP   (ZR, ZR), -48(R4)
	STP   (ZR, ZR), -32(R4)
	STP   (ZR, ZR), -16(R4)
	CMP   R4, R6
	BHI   memclr

	// !!! R6 = &src[0]
	MOVD R7, R6

	// sLimit := len(src) - inputMargin
	// (inputMargin == 15 here; see the fast-path comment below.)
	MOVD R14, R9
	SUB  $15, R9, R9

	// !!! Pre-emptively spill R5, R6 and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVD R5, 64(RSP)
	MOVD R6, 72(RSP)
	MOVD R9, 96(RSP)

	// nextEmit := 0
	MOVD R6, R10

	// s := 1
	ADD $1, R7, R7

	// nextHash := hash(load32(src, s), shift)
	// hash(u) == (u * 0x1e35a7bd) >> shift, all in 32 bits.
	MOVW 0(R7), R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

outer:
	// for { etc }

	// skip := 32
	MOVD $32, R12

	// nextS := s
	MOVD R7, R13

	// candidate := 0
	MOVD $0, R15

inner0:
	// for { etc }

	// s := nextS
	MOVD R13, R7

	// bytesBetweenHashLookups := skip >> 5
	// skip grows each miss, so probing accelerates over incompressible data.
	MOVD R12, R14
	LSR  $5, R14, R14

	// nextS = s + bytesBetweenHashLookups
	ADD R14, R13, R13

	// skip += bytesBetweenHashLookups
	ADD R14, R12, R12

	// if nextS > sLimit { goto emitRemainder }
	// Compare as indices: R3 = nextS - &src[0].
	MOVD R13, R3
	SUB  R6, R3, R3
	CMP  R9, R3
	BHI  emitRemainder

	// candidate = int(table[nextHash])
	MOVHU 0(R17)(R11<<1), R15

	// table[nextHash] = uint16(s)
	MOVD R7, R3
	SUB  R6, R3, R3

	MOVH R3, 0(R17)(R11<<1)

	// nextHash = hash(load32(src, nextS), shift)
	MOVW 0(R13), R11
	MULW R16, R11
	LSRW R5, R11, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVW 0(R7), R3
	MOVW (R6)(R15*1), R4
	CMPW R4, R3
	BNE  inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	// R3 = s - nextEmit, the pending literal length.
	MOVD R7, R3
	SUB  R10, R3, R3
	CMP  $16, R3
	BLE  emitLiteralFastPath

	// ----------------------------------------
	// Begin inline of the emitLiteral call.
	//
	// d += emitLiteral(dst[d:], src[nextEmit:s])

	// n = len(lit) - 1, as in the standalone emitLiteral above.
	MOVW R3, R4
	SUBW $1, R4, R4

	MOVW $60, R2
	CMPW R2, R4
	BLT  inlineEmitLiteralOneByte
	MOVW $256, R2
	CMPW R2, R4
	BLT  inlineEmitLiteralTwoBytes

inlineEmitLiteralThreeBytes:
	// Literal tag with 2-byte length: 0xf4 == 61<<2.
	MOVD $0xf4, R1
	MOVB R1, 0(R8)
	MOVW R4, 1(R8)
	ADD  $3, R8, R8
	B    inlineEmitLiteralMemmove

inlineEmitLiteralTwoBytes:
	// Literal tag with 1-byte length: 0xf0 == 60<<2.
	MOVD $0xf0, R1
	MOVB R1, 0(R8)
	MOVB R4, 1(R8)
	ADD  $2, R8, R8
	B    inlineEmitLiteralMemmove

inlineEmitLiteralOneByte:
	// Short literal: length packed into the tag byte, n<<2.
	LSLW $2, R4, R4
	MOVB R4, 0(R8)
	ADD  $1, R8, R8

inlineEmitLiteralMemmove:
	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// R8, R10 and R3 as arguments.
	MOVD R8, 8(RSP)
	MOVD R10, 16(RSP)
	MOVD R3, 24(RSP)

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADD  R3, R8, R8
	// Spill to the dedicated offsets in the spill map (see the table above).
	MOVD R7, 80(RSP)
	MOVD R8, 88(RSP)
	MOVD R15, 120(RSP)
	CALL runtime·memmove(SB)
	// Unspill, and rebuild the call-invariant registers R17 (&table) and
	// R16 (hash constant) that memmove may have clobbered.
	MOVD 64(RSP), R5
	MOVD 72(RSP), R6
	MOVD 80(RSP), R7
	MOVD 88(RSP), R8
	MOVD 96(RSP), R9
	MOVD 120(RSP), R15
	ADD  $128, RSP, R17
	MOVW  $0xa7bd, R16
	MOVKW $(0x1e35<<16), R16
	B    inner1

inlineEmitLiteralEnd:
	// End inline of the emitLiteral call.
	// ----------------------------------------

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB R3, R4
	SUBW $1, R4, R4
	AND  $0xff, R4, R4
	LSLW $2, R4, R4
	MOVB R4, (R8)
	ADD  $1, R8, R8

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	LDP 0(R10), (R0, R1)
	STP (R0, R1), 0(R8)
	// Advance d by the true literal length, not the 16 bytes stored.
	ADD R3, R8, R8

inner1:
	// for { etc }

	// base := s
	MOVD R7, R12

	// !!! offset := base - candidate
	// Both are pointers here; subtracting &src[0] yields the index delta.
	MOVD R12, R11
	SUB  R15, R11, R11
	SUB  R6, R11, R11

	// ----------------------------------------
	// Begin inline of the extendMatch call.
	//
	// s = extendMatch(src, candidate+4, s+4)

	// !!! R14 = &src[len(src)]
	MOVD src_len+32(FP), R14
	ADD  R6, R14, R14

	// !!! R13 = &src[len(src) - 8]
	MOVD R14, R13
	SUB  $8, R13, R13

	// !!! R15 = &src[candidate + 4]
	ADD $4, R15, R15
	ADD R6, R15, R15

	// !!! s += 4
	ADD $4, R7, R7

inlineExtendMatchCmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMP  R13, R7
	BHI  inlineExtendMatchCmp1
	MOVD (R15), R3
	MOVD (R7), R4
	CMP  R4, R3
	BNE  inlineExtendMatchBSF
	ADD  $8, R15, R15
	ADD  $8, R7, R7
	B    inlineExtendMatchCmp8

inlineExtendMatchBSF:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs.
	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
	// combination of which finds the least significant bit which is set.
	// The arm64 architecture is little-endian, and the shift by 3 converts
	// a bit index to a byte index.
	EOR  R3, R4, R4
	RBIT R4, R4
	CLZ  R4, R4
	ADD  R4>>3, R7, R7
	B    inlineExtendMatchEnd

inlineExtendMatchCmp1:
	// In src's tail, compare 1 byte at a time.
	CMP  R7, R14
	BLS  inlineExtendMatchEnd
	MOVB (R15), R3
	MOVB (R7), R4
	CMP  R4, R3
	BNE  inlineExtendMatchEnd
	ADD  $1, R15, R15
	ADD  $1, R7, R7
	B    inlineExtendMatchCmp1

inlineExtendMatchEnd:
	// End inline of the extendMatch call.
	// ----------------------------------------

	// ----------------------------------------
	// Begin inline of the emitCopy call.
	//
	// d += emitCopy(dst[d:], base-candidate, s-base)

	// !!! length := s - base
	MOVD R7, R3
	SUB  R12, R3, R3

inlineEmitCopyLoop0:
	// for length >= 68 { etc }
	MOVW $68, R2
	CMPW R2, R3
	BLT  inlineEmitCopyStep1

	// Emit a length 64 copy, encoded as 3 bytes: tag 0xfe == 63<<2 | 2.
	MOVD $0xfe, R1
	MOVB R1, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8
	SUBW $64, R3, R3
	B    inlineEmitCopyLoop0

inlineEmitCopyStep1:
	// if length > 64 { etc }
	MOVW $64, R2
	CMPW R2, R3
	BLE  inlineEmitCopyStep2

	// Emit a length 60 copy, encoded as 3 bytes: tag 0xee == 59<<2 | 2.
	MOVD $0xee, R1
	MOVB R1, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8
	SUBW $60, R3, R3

inlineEmitCopyStep2:
	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
	MOVW $12, R2
	CMPW R2, R3
	BGE  inlineEmitCopyStep3
	MOVW $2048, R2
	CMPW R2, R11
	BGE  inlineEmitCopyStep3

	// Emit the remaining copy, encoded as 2 bytes.
	// dst[1] = uint8(offset); dst[0] = (offset>>8)<<5 | (length-4)<<2 | 1.
	// The low offset byte is stored before R11 is clobbered.
	MOVB R11, 1(R8)
	LSRW $8, R11, R11
	LSLW $5, R11, R11
	SUBW $4, R3, R3
	// length-4 < 8 here, so this mask is a no-op kept for safety.
	AND  $0xff, R3, R3
	LSLW $2, R3, R3
	ORRW R3, R11, R11
	ORRW $1, R11, R11
	MOVB R11, 0(R8)
	ADD  $2, R8, R8
	B    inlineEmitCopyEnd

inlineEmitCopyStep3:
	// Emit the remaining copy, encoded as 3 bytes:
	// dst[0] = (length-1)<<2 | 2, then the 2-byte little-endian offset.
	SUBW $1, R3, R3
	LSLW $2, R3, R3
	ORRW $2, R3, R3
	MOVB R3, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8

inlineEmitCopyEnd:
	// End inline of the emitCopy call.
	// ----------------------------------------

	// nextEmit = s
	MOVD R7, R10

	// if s >= sLimit { goto emitRemainder }
	// Compare as indices: R3 = s - &src[0]; BLS branches when sLimit <= s.
	MOVD R7, R3
	SUB  R6, R3, R3
	CMP  R3, R9
	BLS  emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.

	// x := load64(src, s-1)
	MOVD -1(R7), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVW R14, R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// table[prevHash] = uint16(s-1)
	MOVD R7, R3
	SUB  R6, R3, R3
	SUB  $1, R3, R3

	MOVHU R3, 0(R17)(R11<<1)

	// currHash := hash(uint32(x>>8), shift)
	// x is shifted in place; subsequent uses see x>>8, then x>>16.
	LSR  $8, R14, R14
	MOVW R14, R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// candidate = int(table[currHash])
	MOVHU 0(R17)(R11<<1), R15

	// table[currHash] = uint16(s)
	// R3 still holds s-1 from above, so s == R3+1.
	ADD   $1, R3, R3
	MOVHU R3, 0(R17)(R11<<1)

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVW (R6)(R15*1), R4
	CMPW R4, R14
	BEQ  inner1

	// nextHash = hash(uint32(x>>16), shift)
	LSR  $8, R14, R14
	MOVW R14, R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// s++
	ADD $1, R7, R7

	// break out of the inner1 for loop, i.e. continue the outer loop.
	B outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	// R3 = &src[len(src)]; compare against the nextEmit pointer in R10.
	MOVD src_len+32(FP), R3
	ADD  R6, R3, R3
	CMP  R3, R10
	BEQ  encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args.
	// Argument layout matches func emitLiteral(dst, lit []byte) int:
	// dst slice header at 8..31(RSP), lit slice header at 32..55(RSP).
	MOVD R8, 8(RSP)
	MOVD $0, 16(RSP)   // Unnecessary, as the callee ignores it, but conservative.
	MOVD $0, 24(RSP)   // Unnecessary, as the callee ignores it, but conservative.
	MOVD R10, 32(RSP)
	// R3 = len(src) - nextEmit, the remaining literal length.
	SUB  R10, R3, R3
	MOVD R3, 40(RSP)
	MOVD R3, 48(RSP)   // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	// Only R8 (&dst[d]) is still needed after this call.
	MOVD R8, 88(RSP)
	CALL ·emitLiteral(SB)
	MOVD 88(RSP), R8

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	// 56(RSP) is emitLiteral's return value slot.
	MOVD 56(RSP), R1
	ADD  R1, R8, R8

encodeBlockEnd:
	// d = &dst[d] - &dst[0]
	MOVD dst_base+0(FP), R3
	SUB  R3, R8, R8
	MOVD R8, d+48(FP)
	RET