1package main 2 3//go:generate go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2 4 5import ( 6 "fmt" 7 "math" 8 "runtime" 9 10 . "github.com/mmcloughlin/avo/build" 11 "github.com/mmcloughlin/avo/buildtags" 12 . "github.com/mmcloughlin/avo/operand" 13 "github.com/mmcloughlin/avo/reg" 14) 15 16// insert extra checks here and there. 17const debug = false 18 19const ( 20 limit14B = math.MaxUint32 21 // Use 12 bit table when no more than... 22 limit12B = 16<<10 - 1 23 // Use 10 bit table when no more than... 24 limit10B = 4<<10 - 1 25 // Use 8 bit table when no more than... 26 limit8B = 512 - 1 27) 28 29func main() { 30 Constraint(buildtags.Not("appengine").ToConstraint()) 31 Constraint(buildtags.Not("noasm").ToConstraint()) 32 Constraint(buildtags.Term("gc").ToConstraint()) 33 34 o := options{ 35 snappy: false, 36 } 37 o.genEncodeBlockAsm("encodeBlockAsm", 14, 6, 6, limit14B) 38 o.genEncodeBlockAsm("encodeBlockAsm12B", 12, 5, 5, limit12B) 39 o.genEncodeBlockAsm("encodeBlockAsm10B", 10, 5, 4, limit10B) 40 o.genEncodeBlockAsm("encodeBlockAsm8B", 8, 4, 4, limit8B) 41 42 // Snappy compatible 43 o.snappy = true 44 o.genEncodeBlockAsm("encodeSnappyBlockAsm", 14, 6, 6, limit14B) 45 o.genEncodeBlockAsm("encodeSnappyBlockAsm12B", 12, 5, 5, limit12B) 46 o.genEncodeBlockAsm("encodeSnappyBlockAsm10B", 10, 5, 4, limit10B) 47 o.genEncodeBlockAsm("encodeSnappyBlockAsm8B", 8, 4, 4, limit8B) 48 49 o.snappy = false 50 o.maxLen = math.MaxUint32 51 o.genEmitLiteral() 52 o.genEmitRepeat() 53 o.genEmitCopy() 54 o.snappy = true 55 o.genEmitCopyNoRepeat() 56 o.snappy = false 57 o.genMatchLen() 58 Generate() 59} 60 61func debugval(v Op) { 62 value := reg.R15 63 MOVQ(v, value) 64 INT(Imm(3)) 65} 66 67func debugval32(v Op) { 68 value := reg.R15L 69 MOVL(v, value) 70 INT(Imm(3)) 71} 72 73var assertCounter int 74 75// assert will insert code if debug is enabled. 76// The code should jump to 'ok' is assertion is success. 77func assert(fn func(ok LabelRef)) { 78 if debug { 79 caller := [100]uintptr{0} 80 runtime.Callers(2, caller[:]) 81 frame, _ := runtime.CallersFrames(caller[:]).Next() 82 83 ok := fmt.Sprintf("assert_check_%d_ok_srcline_%d", assertCounter, frame.Line) 84 fn(LabelRef(ok)) 85 // Emit several since delve is imprecise. 86 INT(Imm(3)) 87 INT(Imm(3)) 88 Label(ok) 89 assertCounter++ 90 } 91} 92 93type options struct { 94 snappy bool 95 vmbi2 bool 96 maxLen int 97} 98 99func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, maxLen int) { 100 TEXT(name, 0, "func(dst, src []byte) int") 101 Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.", 102 fmt.Sprintf("Maximum input %d bytes.", maxLen), 103 "It assumes that the varint-encoded length of the decompressed bytes has already been written.", "") 104 Pragma("noescape") 105 106 const literalMaxOverhead = 4 107 o.maxLen = maxLen 108 109 var tableSize = 4 * (1 << tableBits) 110 // Memzero needs at least 128 bytes. 111 if tableSize < 128 { 112 panic("tableSize must be at least 128 bytes") 113 } 114 115 lenSrcBasic, err := Param("src").Len().Resolve() 116 if err != nil { 117 panic(err) 118 } 119 lenSrcQ := lenSrcBasic.Addr 120 121 lenDstBasic, err := Param("dst").Len().Resolve() 122 if err != nil { 123 panic(err) 124 } 125 lenDstQ := lenDstBasic.Addr 126 127 // Bail if we can't compress to at least this. 128 dstLimitPtrQ := AllocLocal(8) 129 130 // sLimitL is when to stop looking for offset/length copies. 131 sLimitL := AllocLocal(4) 132 133 // nextEmitL keeps track of the point we have emitted to. 
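	// The stack locals set up here mirror the loop state of the pure-Go block
	// encoder. As a rough, illustrative sketch of the code this function emits
	// (not the exact reference implementation):
	//
	//	s, nextEmit, repeat := 1, 0, 1
	//	for s < sLimit {
	//		// hash the bytes at s, probe the table for candidates,
	//		// check for a repeat match, extend any match back/forward,
	//		// emit pending literals plus a copy/repeat tag, update the table.
	//	}
	//	// Emit any remaining literals and return the compressed size.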
134 nextEmitL := AllocLocal(4) 135 136 // Repeat stores the last match offset. 137 repeatL := AllocLocal(4) 138 139 // nextSTempL keeps nextS while other functions are being called. 140 nextSTempL := AllocLocal(4) 141 142 // Alloc table last 143 table := AllocLocal(tableSize) 144 145 dst := GP64() 146 { 147 dstBaseBasic, err := Param("dst").Base().Resolve() 148 if err != nil { 149 panic(err) 150 } 151 dstBaseQ := dstBaseBasic.Addr 152 MOVQ(dstBaseQ, dst) 153 } 154 155 srcBaseBasic, err := Param("src").Base().Resolve() 156 if err != nil { 157 panic(err) 158 } 159 srcBaseQ := srcBaseBasic.Addr 160 161 // Zero table 162 { 163 iReg := GP64() 164 MOVQ(U32(tableSize/8/16), iReg) 165 tablePtr := GP64() 166 LEAQ(table, tablePtr) 167 zeroXmm := XMM() 168 PXOR(zeroXmm, zeroXmm) 169 170 Label("zero_loop_" + name) 171 for i := 0; i < 8; i++ { 172 MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16}) 173 } 174 ADDQ(U8(16*8), tablePtr) 175 DECQ(iReg) 176 JNZ(LabelRef("zero_loop_" + name)) 177 } 178 179 { 180 // nextEmit is offset n src where the next emitLiteral should start from. 181 MOVL(U32(0), nextEmitL) 182 183 const inputMargin = 8 184 tmp, tmp2, tmp3 := GP64(), GP64(), GP64() 185 MOVQ(lenSrcQ, tmp) 186 LEAQ(Mem{Base: tmp, Disp: -5}, tmp2) 187 // sLimitL := len(src) - inputMargin 188 LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3) 189 190 assert(func(ok LabelRef) { 191 CMPQ(tmp3, lenSrcQ) 192 JL(ok) 193 }) 194 195 MOVL(tmp3.As32(), sLimitL) 196 197 // dstLimit := (len(src) - 5 ) - len(src)>>5 198 SHRQ(U8(5), tmp) 199 SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp 200 201 assert(func(ok LabelRef) { 202 // if len(src) > len(src) - len(src)>>5 - 5: ok 203 CMPQ(lenSrcQ, tmp2) 204 JGE(ok) 205 }) 206 207 LEAQ(Mem{Base: dst, Index: tmp2, Scale: 1}, tmp2) 208 MOVQ(tmp2, dstLimitPtrQ) 209 } 210 211 // s = 1 212 s := GP32() 213 MOVL(U32(1), s) 214 // repeatL = 1 215 MOVL(s, repeatL) 216 217 src := GP64() 218 Load(Param("src").Base(), src) 219 220 // Load cv 221 Label("search_loop_" + name) 222 candidate := GP32() 223 { 224 assert(func(ok LabelRef) { 225 // Check if somebody changed src 226 tmp := GP64() 227 MOVQ(srcBaseQ, tmp) 228 CMPQ(tmp, src) 229 JEQ(ok) 230 }) 231 232 cv := GP64() 233 MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv) 234 nextS := GP32() 235 // nextS := s + (s-nextEmit)>>6 + 4 236 { 237 tmp := GP64() 238 MOVL(s, tmp.As32()) // tmp = s 239 SUBL(nextEmitL, tmp.As32()) // tmp = s - nextEmit 240 SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog 241 LEAL(Mem{Base: s, Disp: 4, Index: tmp, Scale: 1}, nextS) 242 } 243 // if nextS > sLimit {goto emitRemainder} 244 { 245 CMPL(nextS.As32(), sLimitL) 246 JGE(LabelRef("emit_remainder_" + name)) 247 } 248 assert(func(ok LabelRef) { 249 // Check if s is valid (we should have jumped above if not) 250 tmp := GP64() 251 MOVQ(lenSrcQ, tmp) 252 CMPQ(tmp, s.As64()) 253 JG(ok) 254 }) 255 // move nextS to stack. 
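		// The step size computed above implements the usual Snappy/S2 skipping
		// heuristic: the longer we go without finding a match, the faster we
		// advance. Illustrative Go equivalent (using this generator's names):
		//
		//	nextS := s + (s-nextEmit)>>skipLog + 4
		//
		// e.g. with skipLog=6 (the 14-bit table variant) the step stays at 4
		// for the first 64 unmatched bytes, becomes 5 for the next 64, and so on.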
256 MOVL(nextS.As32(), nextSTempL) 257 258 candidate2 := GP32() 259 hasher := hashN(hashBytes, tableBits) 260 { 261 hash0, hash1 := GP64(), GP64() 262 MOVQ(cv, hash0) 263 MOVQ(cv, hash1) 264 SHRQ(U8(8), hash1) 265 hasher.hash(hash0) 266 hasher.hash(hash1) 267 MOVL(table.Idx(hash0, 4), candidate) 268 MOVL(table.Idx(hash1, 4), candidate2) 269 assert(func(ok LabelRef) { 270 CMPQ(hash0, U32(tableSize)) 271 JL(ok) 272 }) 273 assert(func(ok LabelRef) { 274 CMPQ(hash1, U32(tableSize)) 275 JL(ok) 276 }) 277 278 MOVL(s, table.Idx(hash0, 4)) 279 tmp := GP32() 280 LEAL(Mem{Base: s, Disp: 1}, tmp) 281 MOVL(tmp, table.Idx(hash1, 4)) 282 } 283 284 // Can be moved up if registers are available. 285 hash2 := GP64() 286 { 287 // hash2 := hash6(cv>>16, tableBits) 288 // hasher = hash6(tableBits) 289 MOVQ(cv, hash2) 290 SHRQ(U8(16), hash2) 291 hasher.hash(hash2) 292 assert(func(ok LabelRef) { 293 CMPQ(hash2, U32(tableSize)) 294 JL(ok) 295 }) 296 } 297 298 // En/disable repeat matching. 299 if true { 300 // Check repeat at offset checkRep 301 const checkRep = 1 302 { 303 // rep = s - repeat 304 rep := GP32() 305 MOVL(s, rep) 306 SUBL(repeatL, rep) // rep = s - repeat 307 308 // if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { 309 left, right := GP64(), GP64() 310 MOVL(Mem{Base: src, Index: rep, Disp: checkRep, Scale: 1}, right.As32()) 311 MOVQ(cv, left) 312 SHRQ(U8(checkRep*8), left) 313 CMPL(left.As32(), right.As32()) 314 // BAIL, no repeat. 315 JNE(LabelRef("no_repeat_found_" + name)) 316 } 317 // base = s + checkRep 318 base := GP32() 319 LEAL(Mem{Base: s, Disp: checkRep}, base) 320 321 // nextEmit before repeat. 322 nextEmit := GP32() 323 MOVL(nextEmitL, nextEmit) 324 325 // Extend back 326 if true { 327 i := GP32() 328 MOVL(base, i) 329 SUBL(repeatL, i) 330 JZ(LabelRef("repeat_extend_back_end_" + name)) 331 332 Label("repeat_extend_back_loop_" + name) 333 // if base <= nextemit {exit} 334 CMPL(base.As32(), nextEmit) 335 JLE(LabelRef("repeat_extend_back_end_" + name)) 336 // if src[i-1] == src[base-1] 337 tmp, tmp2 := GP64(), GP64() 338 MOVB(Mem{Base: src, Index: i, Scale: 1, Disp: -1}, tmp.As8()) 339 MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8()) 340 CMPB(tmp.As8(), tmp2.As8()) 341 JNE(LabelRef("repeat_extend_back_end_" + name)) 342 LEAL(Mem{Base: base, Disp: -1}, base) 343 DECL(i) 344 JNZ(LabelRef("repeat_extend_back_loop_" + name)) 345 } 346 Label("repeat_extend_back_end_" + name) 347 348 // Base is now at start. Emit until base. 
349 // d += emitLiteral(dst[d:], src[nextEmit:base]) 350 if true { 351 o.emitLiteralsDstP(nextEmitL, base, src, dst, "repeat_emit_"+name) 352 } 353 354 // Extend forward 355 { 356 // s += 4 + checkRep 357 ADDL(U8(4+checkRep), s) 358 359 if true { 360 // candidate := s - repeat + 4 + checkRep 361 MOVL(s, candidate) 362 SUBL(repeatL, candidate) // candidate = s - repeat 363 364 // srcLeft = len(src) - s 365 srcLeft := GP64() 366 MOVQ(lenSrcQ, srcLeft) 367 SUBL(s, srcLeft.As32()) 368 assert(func(ok LabelRef) { 369 // if srcleft < maxint32: ok 370 CMPQ(srcLeft, U32(0x7fffffff)) 371 JL(ok) 372 }) 373 // Forward address 374 forwardStart := GP64() 375 LEAQ(Mem{Base: src, Index: s, Scale: 1}, forwardStart) 376 // End address 377 backStart := GP64() 378 LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, backStart) 379 380 length := o.matchLen("repeat_extend_"+name, forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name)) 381 forwardStart, backStart, srcLeft = nil, nil, nil 382 Label("repeat_extend_forward_end_" + name) 383 // s+= length 384 ADDL(length.As32(), s) 385 } 386 } 387 // Emit 388 if true { 389 // length = s-base 390 length := GP32() 391 MOVL(s, length) 392 SUBL(base.As32(), length) // length = s - base 393 394 offsetVal := GP32() 395 MOVL(repeatL, offsetVal) 396 397 if !o.snappy { 398 // if nextEmit == 0 {do copy instead...} 399 TESTL(nextEmit, nextEmit) 400 JZ(LabelRef("repeat_as_copy_" + name)) 401 402 // Emit as repeat... 403 o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name)) 404 405 // Emit as copy instead... 406 Label("repeat_as_copy_" + name) 407 } 408 o.emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name)) 409 410 Label("repeat_end_emit_" + name) 411 // Store new dst and nextEmit 412 MOVL(s, nextEmitL) 413 } 414 // if s >= sLimit is picked up on next loop. 415 if false { 416 CMPL(s.As32(), sLimitL) 417 JGE(LabelRef("emit_remainder_" + name)) 418 } 419 JMP(LabelRef("search_loop_" + name)) 420 } 421 Label("no_repeat_found_" + name) 422 { 423 // Check candidates are ok. All must be < s and < len(src) 424 assert(func(ok LabelRef) { 425 tmp := GP64() 426 MOVQ(lenSrcQ, tmp) 427 CMPL(tmp.As32(), candidate) 428 JG(ok) 429 }) 430 assert(func(ok LabelRef) { 431 CMPL(s, candidate) 432 JG(ok) 433 }) 434 assert(func(ok LabelRef) { 435 tmp := GP64() 436 MOVQ(lenSrcQ, tmp) 437 CMPL(tmp.As32(), candidate2) 438 JG(ok) 439 }) 440 assert(func(ok LabelRef) { 441 CMPL(s, candidate2) 442 JG(ok) 443 }) 444 445 CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) 446 JEQ(LabelRef("candidate_match_" + name)) 447 448 tmp := GP32() 449 // cv >>= 8 450 SHRQ(U8(8), cv) 451 452 // candidate = int(table[hash2]) - load early. 
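		// Taken together, the candidate checks in this block correspond roughly
		// to the following Go logic (illustrative only):
		//
		//	if load32(src, candidate) == uint32(cv) {
		//		// match at s
		//	} else if load32(src, candidate2) == uint32(cv>>8) {
		//		s++ // match at s+1
		//	} else if load32(src, int(table[hash2])) == uint32(cv>>16) {
		//		s += 2 // match at s+2
		//	} else {
		//		s = nextS
		//		continue
		//	}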
453 MOVL(table.Idx(hash2, 4), candidate) 454 assert(func(ok LabelRef) { 455 tmp := GP64() 456 MOVQ(lenSrcQ, tmp) 457 CMPL(tmp.As32(), candidate) 458 JG(ok) 459 }) 460 assert(func(ok LabelRef) { 461 // We may get s and s+1 462 tmp := GP32() 463 LEAL(Mem{Base: s, Disp: 2}, tmp) 464 CMPL(tmp, candidate) 465 JG(ok) 466 }) 467 468 LEAL(Mem{Base: s, Disp: 2}, tmp) 469 470 //if uint32(cv>>8) == load32(src, candidate2) 471 CMPL(Mem{Base: src, Index: candidate2, Scale: 1}, cv.As32()) 472 JEQ(LabelRef("candidate2_match_" + name)) 473 474 // table[hash2] = uint32(s + 2) 475 MOVL(tmp, table.Idx(hash2, 4)) 476 477 // cv >>= 8 (>> 16 total) 478 SHRQ(U8(8), cv) 479 480 // if uint32(cv>>16) == load32(src, candidate) 481 CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) 482 JEQ(LabelRef("candidate3_match_" + name)) 483 484 // No match found, next loop 485 // s = nextS 486 MOVL(nextSTempL, s) 487 JMP(LabelRef("search_loop_" + name)) 488 489 // Matches candidate at s + 2 (3rd check) 490 Label("candidate3_match_" + name) 491 ADDL(U8(2), s) 492 JMP(LabelRef("candidate_match_" + name)) 493 494 // Match at s + 1 (we calculated the hash, lets store it) 495 Label("candidate2_match_" + name) 496 // table[hash2] = uint32(s + 2) 497 MOVL(tmp, table.Idx(hash2, 4)) 498 // s++ 499 INCL(s) 500 MOVL(candidate2, candidate) 501 } 502 } 503 504 Label("candidate_match_" + name) 505 // We have a match at 's' with src offset in "candidate" that matches at least 4 bytes. 506 // Extend backwards 507 if true { 508 ne := GP32() 509 MOVL(nextEmitL, ne) 510 TESTL(candidate, candidate) 511 JZ(LabelRef("match_extend_back_end_" + name)) 512 513 // candidate is tested when decremented, so we loop back here. 514 Label("match_extend_back_loop_" + name) 515 // if s <= nextEmit {exit} 516 CMPL(s, ne) 517 JLE(LabelRef("match_extend_back_end_" + name)) 518 // if src[candidate-1] == src[s-1] 519 tmp, tmp2 := GP64(), GP64() 520 MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8()) 521 MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8()) 522 CMPB(tmp.As8(), tmp2.As8()) 523 JNE(LabelRef("match_extend_back_end_" + name)) 524 LEAL(Mem{Base: s, Disp: -1}, s) 525 DECL(candidate) 526 JZ(LabelRef("match_extend_back_end_" + name)) 527 JMP(LabelRef("match_extend_back_loop_" + name)) 528 } 529 Label("match_extend_back_end_" + name) 530 531 // Bail if we exceed the maximum size. 532 if true { 533 // tmp = s-nextEmit 534 tmp := GP64() 535 MOVL(s, tmp.As32()) 536 SUBL(nextEmitL, tmp.As32()) 537 // tmp = &dst + s-nextEmit 538 LEAQ(Mem{Base: dst, Index: tmp, Scale: 1, Disp: literalMaxOverhead}, tmp) 539 CMPQ(tmp, dstLimitPtrQ) 540 JL(LabelRef("match_dst_size_check_" + name)) 541 ri, err := ReturnIndex(0).Resolve() 542 if err != nil { 543 panic(err) 544 } 545 MOVQ(U32(0), ri.Addr) 546 RET() 547 } 548 Label("match_dst_size_check_" + name) 549 { 550 base := GP32() 551 MOVL(s, base.As32()) 552 o.emitLiteralsDstP(nextEmitL, base, src, dst, "match_emit_"+name) 553 } 554 cv := GP64() 555 Label("match_nolit_loop_" + name) 556 { 557 // Update repeat 558 { 559 // repeat = base - candidate 560 repeatVal := GP64().As32() 561 MOVL(s, repeatVal) 562 SUBL(candidate, repeatVal) 563 MOVL(repeatVal, repeatL) 564 } 565 // s+=4, candidate+=4 566 ADDL(U8(4), s) 567 ADDL(U8(4), candidate) 568 // Extend the 4-byte match as long as possible and emit copy. 569 { 570 assert(func(ok LabelRef) { 571 // s must be > candidate cannot be equal. 
572 CMPL(s, candidate) 573 JG(ok) 574 }) 575 // srcLeft = len(src) - s 576 srcLeft := GP64() 577 MOVQ(lenSrcQ, srcLeft) 578 SUBL(s, srcLeft.As32()) 579 assert(func(ok LabelRef) { 580 // if srcleft < maxint32: ok 581 CMPQ(srcLeft, U32(0x7fffffff)) 582 JL(ok) 583 }) 584 585 a, b := GP64(), GP64() 586 LEAQ(Mem{Base: src, Index: s, Scale: 1}, a) 587 LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, b) 588 length := o.matchLen("match_nolit_"+name, 589 a, b, 590 srcLeft, 591 LabelRef("match_nolit_end_"+name), 592 ) 593 Label("match_nolit_end_" + name) 594 assert(func(ok LabelRef) { 595 CMPL(length.As32(), U32(math.MaxInt32)) 596 JL(ok) 597 }) 598 a, b, srcLeft = nil, nil, nil 599 600 // s += length (length is destroyed, use it now) 601 ADDL(length.As32(), s) 602 603 // Load offset from repeat value. 604 offset := GP64() 605 MOVL(repeatL, offset.As32()) 606 607 // length += 4 608 ADDL(U8(4), length.As32()) 609 MOVL(s, nextEmitL) // nextEmit = s 610 o.emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name)) 611 Label("match_nolit_emitcopy_end_" + name) 612 613 // if s >= sLimit { end } 614 { 615 CMPL(s.As32(), sLimitL) 616 JGE(LabelRef("emit_remainder_" + name)) 617 } 618 // Start load s-2 as early as possible... 619 MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, cv) 620 // Bail if we exceed the maximum size. 621 { 622 CMPQ(dst, dstLimitPtrQ) 623 JL(LabelRef("match_nolit_dst_ok_" + name)) 624 ri, err := ReturnIndex(0).Resolve() 625 if err != nil { 626 panic(err) 627 } 628 MOVQ(U32(0), ri.Addr) 629 RET() 630 Label("match_nolit_dst_ok_" + name) 631 } 632 } 633 // cv must be set to value at s-2 before arriving here 634 { 635 // Check for an immediate match, otherwise start search at s+1 636 // Index s-2 637 hasher := hashN(hashBytes, tableBits) 638 hash0, hash1 := GP64(), GP64() 639 MOVQ(cv, hash0) // src[s-2] 640 SHRQ(U8(16), cv) 641 MOVQ(cv, hash1) // src[s] 642 hasher.hash(hash0) 643 hasher.hash(hash1) 644 sm2 := GP32() // s - 2 645 LEAL(Mem{Base: s, Disp: -2}, sm2) 646 assert(func(ok LabelRef) { 647 CMPQ(hash0, U32(tableSize)) 648 JL(ok) 649 }) 650 assert(func(ok LabelRef) { 651 CMPQ(hash1, U32(tableSize)) 652 JL(ok) 653 }) 654 addr := GP64() 655 LEAQ(table.Idx(hash1, 4), addr) 656 MOVL(Mem{Base: addr}, candidate) 657 MOVL(sm2, table.Idx(hash0, 4)) 658 MOVL(s, Mem{Base: addr}) 659 CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) 660 JEQ(LabelRef("match_nolit_loop_" + name)) 661 INCL(s) 662 } 663 JMP(LabelRef("search_loop_" + name)) 664 } 665 666 Label("emit_remainder_" + name) 667 // Bail if we exceed the maximum size. 668 // if d+len(src)-nextEmitL > dstLimitPtrQ { return 0 669 { 670 // remain = len(src) - nextEmit 671 remain := GP64() 672 MOVQ(lenSrcQ, remain) 673 SUBL(nextEmitL, remain.As32()) 674 675 dstExpect := GP64() 676 // dst := dst + (len(src)-nextEmitL) 677 678 LEAQ(Mem{Base: dst, Index: remain, Scale: 1, Disp: literalMaxOverhead}, dstExpect) 679 CMPQ(dstExpect, dstLimitPtrQ) 680 JL(LabelRef("emit_remainder_ok_" + name)) 681 ri, err := ReturnIndex(0).Resolve() 682 if err != nil { 683 panic(err) 684 } 685 MOVQ(U32(0), ri.Addr) 686 RET() 687 Label("emit_remainder_ok_" + name) 688 } 689 // emitLiteral(dst[d:], src[nextEmitL:]) 690 emitEnd := GP64() 691 MOVQ(lenSrcQ, emitEnd) 692 693 // Emit final literals. 
694 o.emitLiteralsDstP(nextEmitL, emitEnd, src, dst, "emit_remainder_"+name) 695 696 // Assert size is < limit 697 assert(func(ok LabelRef) { 698 // if dstBaseQ < dstLimitPtrQ: ok 699 CMPQ(dst, dstLimitPtrQ) 700 JL(ok) 701 }) 702 703 // length := start - base (ptr arithmetic) 704 length := GP64() 705 base := Load(Param("dst").Base(), GP64()) 706 MOVQ(dst, length) 707 SUBQ(base, length) 708 709 // Assert size is < len(src) 710 assert(func(ok LabelRef) { 711 // if len(src) >= length: ok 712 CMPQ(lenSrcQ, length) 713 JGE(ok) 714 }) 715 // Assert size is < len(dst) 716 assert(func(ok LabelRef) { 717 // if len(dst) >= length: ok 718 CMPQ(lenDstQ, length) 719 JGE(ok) 720 }) 721 Store(length, ReturnIndex(0)) 722 RET() 723} 724 725// emitLiterals emits literals from nextEmit to base, updates nextEmit, dstBase. 726// Checks if base == nextemit. 727// src & base are untouched. 728func (o options) emitLiterals(nextEmitL Mem, base reg.GPVirtual, src reg.GPVirtual, dstBase Mem, name string) { 729 nextEmit, litLen, dstBaseTmp, litBase := GP32(), GP32(), GP64(), GP64() 730 MOVL(nextEmitL, nextEmit) 731 CMPL(nextEmit, base.As32()) 732 JEQ(LabelRef("emit_literal_skip_" + name)) 733 MOVL(base.As32(), litLen.As32()) 734 735 // Base is now next emit. 736 MOVL(base.As32(), nextEmitL) 737 738 // litBase = src[nextEmitL:] 739 LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase) 740 SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit 741 742 // Load (and store when we return) 743 MOVQ(dstBase, dstBaseTmp) 744 o.emitLiteral(name, litLen, nil, dstBaseTmp, litBase, LabelRef("emit_literal_done_"+name), true) 745 Label("emit_literal_done_" + name) 746 747 // Emitted length must be > litlen. 748 // We have already checked for len(0) above. 749 assert(func(ok LabelRef) { 750 tmp := GP64() 751 MOVQ(dstBaseTmp, tmp) 752 SUBQ(dstBase, tmp) // tmp = dstBaseTmp - dstBase 753 // if tmp > litLen: ok 754 CMPQ(tmp, litLen.As64()) 755 JG(ok) 756 }) 757 // Store updated dstBase 758 MOVQ(dstBaseTmp, dstBase) 759 Label("emit_literal_skip_" + name) 760} 761 762// emitLiterals emits literals from nextEmit to base, updates nextEmit, dstBase. 763// Checks if base == nextemit. 764// src & base are untouched. 765func (o options) emitLiteralsDstP(nextEmitL Mem, base reg.GPVirtual, src, dst reg.GPVirtual, name string) { 766 nextEmit, litLen, litBase := GP32(), GP32(), GP64() 767 MOVL(nextEmitL, nextEmit) 768 CMPL(nextEmit, base.As32()) 769 JEQ(LabelRef("emit_literal_done_" + name)) 770 MOVL(base.As32(), litLen.As32()) 771 772 // Base is now next emit. 773 MOVL(base.As32(), nextEmitL) 774 775 // litBase = src[nextEmitL:] 776 LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase) 777 SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit 778 779 // Load (and store when we return) 780 o.emitLiteral(name, litLen, nil, dst, litBase, LabelRef("emit_literal_done_"+name), true) 781 Label("emit_literal_done_" + name) 782} 783 784type hashGen struct { 785 bytes int 786 tablebits int 787 mulreg reg.GPVirtual 788} 789 790// hashN uses multiply to get a 'output' hash on the hash of the lowest 'bytes' bytes in value. 
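// The emitted code is a multiply-shift hash. An illustrative Go equivalent of
// what hashGen.hash generates for a given 'bytes' and 'tablebits', with the
// primes selected below:
//
//	func hash(v uint64) uint32 {
//		return uint32(((v << (64 - 8*bytes)) * prime) >> (64 - tablebits))
//	}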
791func hashN(hashBytes, tablebits int) hashGen { 792 h := hashGen{ 793 bytes: hashBytes, 794 tablebits: tablebits, 795 mulreg: GP64(), 796 } 797 primebytes := uint64(0) 798 switch hashBytes { 799 case 3: 800 primebytes = 506832829 801 case 4: 802 primebytes = 2654435761 803 case 5: 804 primebytes = 889523592379 805 case 6: 806 primebytes = 227718039650203 807 case 7: 808 primebytes = 58295818150454627 809 case 8: 810 primebytes = 0xcf1bbcdcb7a56463 811 default: 812 panic("invalid hash length") 813 } 814 MOVQ(Imm(primebytes), h.mulreg) 815 return h 816} 817 818// hash uses multiply to get hash of the value. 819func (h hashGen) hash(val reg.GPVirtual) { 820 // Move value to top of register. 821 SHLQ(U8(64-8*h.bytes), val) 822 IMULQ(h.mulreg, val) 823 // Move value to bottom 824 SHRQ(U8(64-h.tablebits), val) 825} 826 827func (o options) genEmitLiteral() { 828 TEXT("emitLiteral", NOSPLIT, "func(dst, lit []byte) int") 829 Doc("emitLiteral writes a literal chunk and returns the number of bytes written.", "", 830 "It assumes that:", 831 " dst is long enough to hold the encoded bytes", 832 " 0 <= len(lit) && len(lit) <= math.MaxUint32", "") 833 Pragma("noescape") 834 835 dstBase, litBase, litLen, retval := GP64(), GP64(), GP64(), GP64() 836 Load(Param("lit").Len(), litLen) 837 Load(Param("dst").Base(), dstBase) 838 Load(Param("lit").Base(), litBase) 839 TESTQ(litLen, litLen) 840 JZ(LabelRef("emit_literal_end_standalone_skip")) 841 o.emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_standalone", false) 842 843 Label("emit_literal_end_standalone_skip") 844 XORQ(retval, retval) 845 846 Label("emit_literal_end_standalone") 847 Store(retval, ReturnIndex(0)) 848 RET() 849 850} 851 852// emitLiteral can be used for inlining an emitLiteral call. 853// litLen must be > 0. 854// stack must have at least 32 bytes. 855// retval will contain emitted bytes, but can be nil if this is not interesting. 856// dstBase and litBase are updated. 857// Uses 2 GP registers. With AVX 4 registers. 858// If updateDst is true dstBase will have the updated end pointer and an additional register will be used. 859func (o options) emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, updateDst bool) { 860 n := GP32() 861 n16 := GP32() 862 863 // litLen must be > 0 864 assert(func(ok LabelRef) { 865 TESTL(litLen.As32(), litLen.As32()) 866 JNZ(ok) 867 }) 868 869 // We always add litLen bytes 870 if retval != nil { 871 MOVL(litLen.As32(), retval.As32()) 872 } 873 // n = litlen - 1 874 LEAL(Mem{Base: litLen.As32(), Disp: -1}, n) 875 876 // Find number of bytes to emit for tag. 
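	// Snappy/S2 literal headers: with n = len(lit)-1, the tag size is chosen
	// roughly as follows (illustrative Go-style summary):
	//
	//	switch {
	//	case n < 60:    // 1 byte:  n<<2 | tagLiteral
	//	case n < 1<<8:  // 2 bytes: 60<<2 | tagLiteral (0xf0), then n
	//	case n < 1<<16: // 3 bytes: 61<<2 | tagLiteral (0xf4), then n as uint16
	//	case n < 1<<24: // 4 bytes: 62<<2 | tagLiteral (0xf8), then n as 3 bytes
	//	default:        // 5 bytes: 63<<2 | tagLiteral (0xfc), then n as uint32
	//	}
	//
	// Branches that cannot occur for o.maxLen are skipped when generating.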
877 CMPL(n.As32(), U8(60)) 878 JLT(LabelRef("one_byte_" + name)) 879 CMPL(n.As32(), U32(1<<8)) 880 JLT(LabelRef("two_bytes_" + name)) 881 if o.maxLen >= 1<<16 { 882 CMPL(n.As32(), U32(1<<16)) 883 JLT(LabelRef("three_bytes_" + name)) 884 } else { 885 JMP(LabelRef("three_bytes_" + name)) 886 } 887 if o.maxLen >= 1<<16 { 888 if o.maxLen >= 1<<24 { 889 CMPL(n.As32(), U32(1<<24)) 890 JLT(LabelRef("four_bytes_" + name)) 891 } else { 892 JMP(LabelRef("four_bytes_" + name)) 893 } 894 } 895 if o.maxLen >= 1<<24 { 896 Label("five_bytes_" + name) 897 MOVB(U8(252), Mem{Base: dstBase}) 898 MOVL(n.As32(), Mem{Base: dstBase, Disp: 1}) 899 if retval != nil { 900 ADDQ(U8(5), retval) 901 } 902 ADDQ(U8(5), dstBase) 903 JMP(LabelRef("memmove_long_" + name)) 904 } 905 if o.maxLen >= 1<<16 { 906 Label("four_bytes_" + name) 907 MOVL(n, n16) 908 SHRL(U8(16), n16.As32()) 909 MOVB(U8(248), Mem{Base: dstBase}) 910 MOVW(n.As16(), Mem{Base: dstBase, Disp: 1}) 911 MOVB(n16.As8(), Mem{Base: dstBase, Disp: 3}) 912 if retval != nil { 913 ADDQ(U8(4), retval) 914 } 915 ADDQ(U8(4), dstBase) 916 JMP(LabelRef("memmove_long_" + name)) 917 } 918 Label("three_bytes_" + name) 919 MOVB(U8(0xf4), Mem{Base: dstBase}) 920 MOVW(n.As16(), Mem{Base: dstBase, Disp: 1}) 921 if retval != nil { 922 ADDQ(U8(3), retval) 923 } 924 ADDQ(U8(3), dstBase) 925 JMP(LabelRef("memmove_long_" + name)) 926 927 Label("two_bytes_" + name) 928 MOVB(U8(0xf0), Mem{Base: dstBase}) 929 MOVB(n.As8(), Mem{Base: dstBase, Disp: 1}) 930 if retval != nil { 931 ADDQ(U8(2), retval) 932 } 933 ADDQ(U8(2), dstBase) 934 CMPL(n.As32(), U8(64)) 935 JL(LabelRef("memmove_" + name)) 936 JMP(LabelRef("memmove_long_" + name)) 937 938 Label("one_byte_" + name) 939 SHLB(U8(2), n.As8()) 940 MOVB(n.As8(), Mem{Base: dstBase}) 941 if retval != nil { 942 ADDQ(U8(1), retval) 943 } 944 ADDQ(U8(1), dstBase) 945 // Fallthrough 946 947 Label("memmove_" + name) 948 949 // copy(dst[i:], lit) 950 dstEnd := GP64() 951 copyEnd := end 952 if updateDst { 953 copyEnd = LabelRef("memmove_end_copy_" + name) 954 LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd) 955 } 956 length := GP64() 957 MOVL(litLen.As32(), length.As32()) 958 959 // updates litBase. 960 o.genMemMoveShort("emit_lit_memmove_"+name, dstBase, litBase, length, copyEnd) 961 962 if updateDst { 963 Label("memmove_end_copy_" + name) 964 MOVQ(dstEnd, dstBase) 965 } 966 JMP(end) 967 968 // > 32 bytes 969 Label("memmove_long_" + name) 970 971 // copy(dst[i:], lit) 972 dstEnd = GP64() 973 copyEnd = end 974 if updateDst { 975 copyEnd = LabelRef("memmove_end_copy_long_" + name) 976 LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd) 977 } 978 length = GP64() 979 MOVL(litLen.As32(), length.As32()) 980 981 // updates litBase. 982 o.genMemMoveLong("emit_lit_memmove_long_"+name, dstBase, litBase, length, copyEnd) 983 984 if updateDst { 985 Label("memmove_end_copy_long_" + name) 986 MOVQ(dstEnd, dstBase) 987 } 988 JMP(end) 989 // Should be unreachable 990 if debug { 991 INT(Imm(3)) 992 } 993 return 994} 995 996// genEmitRepeat generates a standlone emitRepeat. 
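// The generated code picks the smallest repeat encoding that fits. Summarized
// from the branches in emitRepeat below (lengths are what remains after the
// 4-byte minimum has been subtracted):
//
//	repeat_two:        2 bytes, remaining length <= 4
//	repeat_two_offset: 2 bytes, emitted as a plain short copy (remaining length < 8, offset < 2048)
//	repeat_three:      3 bytes, remaining length < (1<<8)+4
//	repeat_four:       4 bytes, remaining length < (1<<16)+(1<<8)
//	repeat_five:       5 bytes, remaining length < maxRepeat
//
// Longer lengths emit a maximum-size repeat and loop on the remainder.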
func (o options) genEmitRepeat() {
	TEXT("emitRepeat", NOSPLIT, "func(dst []byte, offset, length int) int")
	Doc("emitRepeat writes a repeat chunk and returns the number of bytes written.",
		"Length must be at least 4 and < 1<<32", "")
	Pragma("noescape")

	dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()

	// retval = 0
	XORQ(retval, retval)

	Load(Param("dst").Base(), dstBase)
	Load(Param("offset"), offset)
	Load(Param("length"), length)
	o.emitRepeat("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_repeat_end"))
	Label("gen_emit_repeat_end")
	Store(retval, ReturnIndex(0))
	RET()
}

// emitRepeat can be used for inlining an emitRepeat call.
// length >= 4 and < 1<<32
// length is modified. dstBase is updated. retval is added to input.
// retval can be nil.
// Will jump to end label when finished.
// Uses 1 GP register.
func (o options) emitRepeat(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) {
	Label("emit_repeat_again_" + name)
	tmp := GP32()
	MOVL(length.As32(), tmp) // Copy length
	// length -= 4
	LEAL(Mem{Base: length, Disp: -4}, length.As32())

	// if length <= 4 (use copied value)
	CMPL(tmp.As32(), U8(8))
	JLE(LabelRef("repeat_two_" + name))

	// length < 8 && offset < 2048
	CMPL(tmp.As32(), U8(12))
	JGE(LabelRef("cant_repeat_two_offset_" + name))
	if o.maxLen >= 2048 {
		CMPL(offset.As32(), U32(2048))
		JLT(LabelRef("repeat_two_offset_" + name))
	}

	const maxRepeat = ((1 << 24) - 1) + 65536
	Label("cant_repeat_two_offset_" + name)
	CMPL(length.As32(), U32((1<<8)+4))
	JLT(LabelRef("repeat_three_" + name)) // if length < (1<<8)+4
	if o.maxLen >= (1<<16)+(1<<8) {
		CMPL(length.As32(), U32((1<<16)+(1<<8)))
		JLT(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8)
	} else {
		// Not needed, we should skip to it when generating.
		// JMP(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8)
	}
	if o.maxLen >= maxRepeat {
		CMPL(length.As32(), U32(maxRepeat))
		JLT(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent.

		// We have more than 24 bits to represent.
		// Emit so we have at least 4 bytes left.
		LEAL(Mem{Base: length, Disp: -(maxRepeat - 4)}, length.As32()) // length -= (maxRepeat - 4)
		MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase})                   // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
		MOVW(U16(65531), Mem{Base: dstBase, Disp: 2})                  // 0xfffb
		MOVB(U8(255), Mem{Base: dstBase, Disp: 4})
		ADDQ(U8(5), dstBase)
		if retval != nil {
			ADDQ(U8(5), retval)
		}
		JMP(LabelRef("emit_repeat_again_" + name))
	} else {
		// Not needed.
		// JMP(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent.
	}

	// Must be able to be within 5 bytes.
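	// At this point the remaining length is below maxRepeat (either checked
	// above or implied by o.maxLen), so it always fits the 5-byte repeat
	// encoding emitted at repeat_five.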
1074 if o.maxLen >= (1<<16)+(1<<8) { 1075 Label("repeat_five_" + name) 1076 LEAL(Mem{Base: length, Disp: -65536}, length.As32()) // length -= 65536 1077 MOVL(length.As32(), offset.As32()) 1078 MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0 1079 MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8) 1080 SARL(U8(16), offset.As32()) // offset = length >> 16 1081 MOVB(offset.As8(), Mem{Base: dstBase, Disp: 4}) // dst[4] = length >> 16 1082 if retval != nil { 1083 ADDQ(U8(5), retval) // i += 5 1084 } 1085 ADDQ(U8(5), dstBase) // dst += 5 1086 JMP(end) 1087 } 1088 Label("repeat_four_" + name) 1089 LEAL(Mem{Base: length, Disp: -256}, length.As32()) // length -= 256 1090 MOVW(U16(6<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 6<<2 | tagCopy1, dst[1] = 0 1091 MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8) 1092 if retval != nil { 1093 ADDQ(U8(4), retval) // i += 4 1094 } 1095 ADDQ(U8(4), dstBase) // dst += 4 1096 JMP(end) 1097 1098 Label("repeat_three_" + name) 1099 LEAL(Mem{Base: length, Disp: -4}, length.As32()) // length -= 4 1100 MOVW(U16(5<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 5<<2 | tagCopy1, dst[1] = 0 1101 MOVB(length.As8(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length) 1102 if retval != nil { 1103 ADDQ(U8(3), retval) // i += 3 1104 } 1105 ADDQ(U8(3), dstBase) // dst += 3 1106 JMP(end) 1107 1108 Label("repeat_two_" + name) 1109 // dst[0] = uint8(length)<<2 | tagCopy1, dst[1] = 0 1110 SHLL(U8(2), length.As32()) 1111 ORL(U8(tagCopy1), length.As32()) 1112 MOVW(length.As16(), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0 1113 if retval != nil { 1114 ADDQ(U8(2), retval) // i += 2 1115 } 1116 ADDQ(U8(2), dstBase) // dst += 2 1117 JMP(end) 1118 1119 Label("repeat_two_offset_" + name) 1120 // Emit the remaining copy, encoded as 2 bytes. 1121 // dst[1] = uint8(offset) 1122 // dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 1123 tmp = GP64() 1124 XORQ(tmp, tmp) 1125 // Use scale and displacement to shift and subtract values from length. 1126 LEAL(Mem{Base: tmp, Index: length, Scale: 4, Disp: tagCopy1}, length.As32()) 1127 MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte 1128 SARL(U8(8), offset.As32()) // Remove lower 1129 SHLL(U8(5), offset.As32()) // Shift back up 1130 ORL(offset.As32(), length.As32()) // OR result 1131 MOVB(length.As8(), Mem{Base: dstBase, Disp: 0}) 1132 if retval != nil { 1133 ADDQ(U8(2), retval) // i += 2 1134 } 1135 ADDQ(U8(2), dstBase) // dst += 2 1136 1137 JMP(end) 1138} 1139 1140// emitCopy writes a copy chunk and returns the number of bytes written. 
1141// 1142// It assumes that: 1143// dst is long enough to hold the encoded bytes 1144// 1 <= offset && offset <= math.MaxUint32 1145// 4 <= length && length <= 1 << 24 1146 1147// genEmitCopy generates a standlone emitCopy 1148func (o options) genEmitCopy() { 1149 TEXT("emitCopy", NOSPLIT, "func(dst []byte, offset, length int) int") 1150 Doc("emitCopy writes a copy chunk and returns the number of bytes written.", "", 1151 "It assumes that:", 1152 " dst is long enough to hold the encoded bytes", 1153 " 1 <= offset && offset <= math.MaxUint32", 1154 " 4 <= length && length <= 1 << 24", "") 1155 Pragma("noescape") 1156 1157 dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64() 1158 1159 // i := 0 1160 XORQ(retval, retval) 1161 1162 Load(Param("dst").Base(), dstBase) 1163 Load(Param("offset"), offset) 1164 Load(Param("length"), length) 1165 o.emitCopy("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_copy_end")) 1166 Label("gen_emit_copy_end") 1167 Store(retval, ReturnIndex(0)) 1168 RET() 1169} 1170 1171// emitCopy writes a copy chunk and returns the number of bytes written. 1172// 1173// It assumes that: 1174// dst is long enough to hold the encoded bytes 1175// 1 <= offset && offset <= math.MaxUint32 1176// 4 <= length && length <= 1 << 24 1177 1178// genEmitCopy generates a standlone emitCopy 1179func (o options) genEmitCopyNoRepeat() { 1180 TEXT("emitCopyNoRepeat", NOSPLIT, "func(dst []byte, offset, length int) int") 1181 Doc("emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.", "", 1182 "It assumes that:", 1183 " dst is long enough to hold the encoded bytes", 1184 " 1 <= offset && offset <= math.MaxUint32", 1185 " 4 <= length && length <= 1 << 24", "") 1186 Pragma("noescape") 1187 1188 dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64() 1189 1190 // i := 0 1191 XORQ(retval, retval) 1192 1193 Load(Param("dst").Base(), dstBase) 1194 Load(Param("offset"), offset) 1195 Load(Param("length"), length) 1196 o.emitCopy("standalone_snappy", length, offset, retval, dstBase, "gen_emit_copy_end_snappy") 1197 Label("gen_emit_copy_end_snappy") 1198 Store(retval, ReturnIndex(0)) 1199 RET() 1200} 1201 1202const ( 1203 tagLiteral = 0x00 1204 tagCopy1 = 0x01 1205 tagCopy2 = 0x02 1206 tagCopy4 = 0x03 1207) 1208 1209// emitCopy can be used for inlining an emitCopy call. 1210// length is modified (and junk). dstBase is updated. retval is added to input. 1211// retval can be nil. 1212// Will jump to end label when finished. 1213// Uses 2 GP registers. 1214func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) { 1215 if o.maxLen >= 65536 { 1216 //if offset >= 65536 { 1217 CMPL(offset.As32(), U32(65536)) 1218 JL(LabelRef("two_byte_offset_" + name)) 1219 1220 // offset is >= 65536 1221 // if length <= 64 goto four_bytes_remain_ 1222 Label("four_bytes_loop_back_" + name) 1223 CMPL(length.As32(), U8(64)) 1224 JLE(LabelRef("four_bytes_remain_" + name)) 1225 1226 // Emit a length 64 copy, encoded as 5 bytes. 
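		// For offsets >= 65536 the loop below emits 64-byte tagCopy4 chunks,
		// 5 bytes each (tag byte + 4 offset bytes). With repeat codes enabled
		// (!o.snappy), any remaining length >= 4 is handed to emitRepeat, which
		// does not re-encode the offset.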
1227 // dst[0] = 63<<2 | tagCopy4 1228 MOVB(U8(63<<2|tagCopy4), Mem{Base: dstBase}) 1229 // dst[4] = uint8(offset >> 24) 1230 // dst[3] = uint8(offset >> 16) 1231 // dst[2] = uint8(offset >> 8) 1232 // dst[1] = uint8(offset) 1233 MOVL(offset.As32(), Mem{Base: dstBase, Disp: 1}) 1234 // length -= 64 1235 LEAL(Mem{Base: length, Disp: -64}, length.As32()) 1236 if retval != nil { 1237 ADDQ(U8(5), retval) // i+=5 1238 } 1239 ADDQ(U8(5), dstBase) // dst+=5 1240 1241 // if length >= 4 { 1242 CMPL(length.As32(), U8(4)) 1243 JL(LabelRef("four_bytes_remain_" + name)) 1244 1245 // Emit remaining as repeats 1246 // return 5 + emitRepeat(dst[5:], offset, length) 1247 // Inline call to emitRepeat. Will jump to end 1248 if !o.snappy { 1249 o.emitRepeat(name+"_emit_copy", length, offset, retval, dstBase, end) 1250 } 1251 JMP(LabelRef("four_bytes_loop_back_" + name)) 1252 1253 Label("four_bytes_remain_" + name) 1254 // if length == 0 { 1255 // return i 1256 // } 1257 TESTL(length.As32(), length.As32()) 1258 JZ(end) 1259 1260 // Emit a copy, offset encoded as 4 bytes. 1261 // dst[i+0] = uint8(length-1)<<2 | tagCopy4 1262 // dst[i+1] = uint8(offset) 1263 // dst[i+2] = uint8(offset >> 8) 1264 // dst[i+3] = uint8(offset >> 16) 1265 // dst[i+4] = uint8(offset >> 24) 1266 tmp := GP64() 1267 MOVB(U8(tagCopy4), tmp.As8()) 1268 // Use displacement to subtract 1 from upshifted length. 1269 LEAL(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length.As32()) 1270 MOVB(length.As8(), Mem{Base: dstBase}) 1271 MOVL(offset.As32(), Mem{Base: dstBase, Disp: 1}) 1272 // return i + 5 1273 if retval != nil { 1274 ADDQ(U8(5), retval) 1275 } 1276 ADDQ(U8(5), dstBase) 1277 JMP(end) 1278 } 1279 Label("two_byte_offset_" + name) 1280 // Offset no more than 2 bytes. 1281 1282 //if length > 64 { 1283 CMPL(length.As32(), U8(64)) 1284 JLE(LabelRef("two_byte_offset_short_" + name)) 1285 // Emit a length 60 copy, encoded as 3 bytes. 1286 // Emit remaining as repeat value (minimum 4 bytes). 1287 // dst[2] = uint8(offset >> 8) 1288 // dst[1] = uint8(offset) 1289 // dst[0] = 59<<2 | tagCopy2 1290 MOVB(U8(59<<2|tagCopy2), Mem{Base: dstBase}) 1291 MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1}) 1292 // length -= 60 1293 LEAL(Mem{Base: length, Disp: -60}, length.As32()) 1294 1295 // Emit remaining as repeats, at least 4 bytes remain. 1296 // return 3 + emitRepeat(dst[3:], offset, length) 1297 //} 1298 ADDQ(U8(3), dstBase) 1299 if retval != nil { 1300 ADDQ(U8(3), retval) 1301 } 1302 // Inline call to emitRepeat. Will jump to end 1303 if !o.snappy { 1304 o.emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end) 1305 } 1306 JMP(LabelRef("two_byte_offset_" + name)) 1307 1308 Label("two_byte_offset_short_" + name) 1309 //if length >= 12 || offset >= 2048 { 1310 CMPL(length.As32(), U8(12)) 1311 JGE(LabelRef("emit_copy_three_" + name)) 1312 if o.maxLen >= 2048 { 1313 CMPL(offset.As32(), U32(2048)) 1314 JGE(LabelRef("emit_copy_three_" + name)) 1315 } 1316 // Emit the remaining copy, encoded as 2 bytes. 1317 // dst[1] = uint8(offset) 1318 // dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 1319 tmp := GP64() 1320 MOVB(U8(tagCopy1), tmp.As8()) 1321 // Use scale and displacement to shift and subtract values from length. 
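		// The LEAL below folds the shift and subtract into one instruction: it
		// computes tmp + length*4 - 16, i.e. (length-4)<<2 | tagCopy1 in the low
		// byte (tmp holds tagCopy1, and only the low byte of the result is stored).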
1322 LEAL(Mem{Base: tmp, Index: length, Scale: 4, Disp: -(4 << 2)}, length.As32()) 1323 MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte 1324 SHRL(U8(8), offset.As32()) // Remove lower 1325 SHLL(U8(5), offset.As32()) // Shift back up 1326 ORL(offset.As32(), length.As32()) // OR result 1327 MOVB(length.As8(), Mem{Base: dstBase, Disp: 0}) 1328 if retval != nil { 1329 ADDQ(U8(2), retval) // i += 2 1330 } 1331 ADDQ(U8(2), dstBase) // dst += 2 1332 // return 2 1333 JMP(end) 1334 1335 Label("emit_copy_three_" + name) 1336 // // Emit the remaining copy, encoded as 3 bytes. 1337 // dst[2] = uint8(offset >> 8) 1338 // dst[1] = uint8(offset) 1339 // dst[0] = uint8(length-1)<<2 | tagCopy2 1340 tmp = GP64() 1341 MOVB(U8(tagCopy2), tmp.As8()) 1342 LEAL(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length.As32()) 1343 MOVB(length.As8(), Mem{Base: dstBase}) 1344 MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1}) 1345 // return 3 1346 if retval != nil { 1347 ADDQ(U8(3), retval) // i += 3 1348 } 1349 ADDQ(U8(3), dstBase) // dst += 3 1350 JMP(end) 1351} 1352 1353// func memmove(to, from unsafe.Pointer, n uintptr) 1354// src and dst may not overlap. 1355// Non AVX uses 2 GP register, 16 SSE2 registers. 1356// AVX uses 4 GP registers 16 AVX/SSE registers. 1357// All passed registers may be updated. 1358// Length must be 1 -> 64 bytes 1359func (o options) genMemMoveShort(name string, dst, src, length reg.GPVirtual, end LabelRef) { 1360 AX, CX := GP64(), GP64() 1361 name += "_memmove_" 1362 1363 // Only enable if length can be 0. 1364 if false { 1365 TESTQ(length, length) 1366 JEQ(end) 1367 } 1368 assert(func(ok LabelRef) { 1369 CMPQ(length, U8(64)) 1370 JBE(ok) 1371 }) 1372 assert(func(ok LabelRef) { 1373 TESTQ(length, length) 1374 JNZ(ok) 1375 }) 1376 Label(name + "tail") 1377 CMPQ(length, U8(3)) 1378 JB(LabelRef(name + "move_1or2")) 1379 JE(LabelRef(name + "move_3")) 1380 CMPQ(length, U8(8)) 1381 JB(LabelRef(name + "move_4through7")) 1382 CMPQ(length, U8(16)) 1383 JBE(LabelRef(name + "move_8through16")) 1384 CMPQ(length, U8(32)) 1385 JBE(LabelRef(name + "move_17through32")) 1386 if debug { 1387 CMPQ(length, U8(64)) 1388 JBE(LabelRef(name + "move_33through64")) 1389 INT(U8(3)) 1390 } 1391 JMP(LabelRef(name + "move_33through64")) 1392 1393 //genMemMoveLong(name, dst, src, length, end) 1394 1395 Label(name + "move_1or2") 1396 MOVB(Mem{Base: src}, AX.As8()) 1397 MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8()) 1398 MOVB(AX.As8(), Mem{Base: dst}) 1399 MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1}) 1400 JMP(end) 1401 1402 Label(name + "move_3") 1403 MOVW(Mem{Base: src}, AX.As16()) 1404 MOVB(Mem{Base: src, Disp: 2}, CX.As8()) 1405 MOVW(AX.As16(), Mem{Base: dst}) 1406 MOVB(CX.As8(), Mem{Base: dst, Disp: 2}) 1407 JMP(end) 1408 1409 Label(name + "move_4through7") 1410 MOVL(Mem{Base: src}, AX.As32()) 1411 MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32()) 1412 MOVL(AX.As32(), Mem{Base: dst}) 1413 MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1}) 1414 JMP(end) 1415 1416 Label(name + "move_8through16") 1417 MOVQ(Mem{Base: src}, AX) 1418 MOVQ(Mem{Base: src, Disp: -8, Index: length, Scale: 1}, CX) 1419 MOVQ(AX, Mem{Base: dst}) 1420 MOVQ(CX, Mem{Base: dst, Disp: -8, Index: length, Scale: 1}) 1421 JMP(end) 1422 1423 Label(name + "move_17through32") 1424 X0, X1, X2, X3 := XMM(), XMM(), XMM(), XMM() 1425 1426 MOVOU(Mem{Base: src}, X0) 1427 MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X1) 1428 MOVOU(X0, 
Mem{Base: dst}) 1429 MOVOU(X1, Mem{Base: dst, Disp: -16, Index: length, Scale: 1}) 1430 JMP(end) 1431 1432 Label(name + "move_33through64") 1433 MOVOU(Mem{Base: src}, X0) 1434 MOVOU(Mem{Base: src, Disp: 16}, X1) 1435 MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2) 1436 MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3) 1437 MOVOU(X0, Mem{Base: dst}) 1438 MOVOU(X1, Mem{Base: dst, Disp: 16}) 1439 MOVOU(X2, Mem{Base: dst, Disp: -32, Index: length, Scale: 1}) 1440 MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1}) 1441 JMP(end) 1442} 1443 1444// func genMemMoveLong(to, from unsafe.Pointer, n uintptr) 1445// src and dst may not overlap. 1446// length must be >= 64 bytes. 1447// Non AVX uses 2 GP register, 16 SSE2 registers. 1448// AVX uses 4 GP registers 16 AVX/SSE registers. 1449// All passed registers may be updated. 1450func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end LabelRef) { 1451 name += "large_" 1452 1453 assert(func(ok LabelRef) { 1454 CMPQ(length, U8(64)) 1455 JAE(ok) 1456 }) 1457 1458 // These are disabled. 1459 // AVX is ever so slightly faster, but it is disabled for simplicity. 1460 const branchLoops = false 1461 const avx = false && branchLoops 1462 if branchLoops { 1463 CMPQ(length, U8(128)) 1464 JBE(LabelRef(name + "move_65through128")) 1465 CMPQ(length, U32(256)) 1466 JBE(LabelRef(name + "move_129through256")) 1467 if avx { 1468 JMP(LabelRef(name + "avxUnaligned")) 1469 } else { 1470 JMP(LabelRef(name + "forward_sse")) 1471 } 1472 1473 X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() 1474 X8, X9, X10, X11, X12, X13, X14, X15 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() 1475 Label(name + "move_65through128") 1476 MOVOU(Mem{Base: src}, X0) 1477 MOVOU(Mem{Base: src, Disp: 16}, X1) 1478 MOVOU(Mem{Base: src, Disp: 32}, X2) 1479 MOVOU(Mem{Base: src, Disp: 48}, X3) 1480 MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12) 1481 MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13) 1482 MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14) 1483 MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15) 1484 MOVOU(X0, Mem{Base: dst}) 1485 MOVOU(X1, Mem{Base: dst, Disp: 16}) 1486 MOVOU(X2, Mem{Base: dst, Disp: 32}) 1487 MOVOU(X3, Mem{Base: dst, Disp: 48}) 1488 MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64}) 1489 MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48}) 1490 MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32}) 1491 MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16}) 1492 JMP(end) 1493 1494 Label(name + "move_129through256") 1495 MOVOU(Mem{Base: src}, X0) 1496 MOVOU(Mem{Base: src, Disp: 16}, X1) 1497 MOVOU(Mem{Base: src, Disp: 32}, X2) 1498 MOVOU(Mem{Base: src, Disp: 48}, X3) 1499 MOVOU(Mem{Base: src, Disp: 64}, X4) 1500 MOVOU(Mem{Base: src, Disp: 80}, X5) 1501 MOVOU(Mem{Base: src, Disp: 96}, X6) 1502 MOVOU(Mem{Base: src, Disp: 112}, X7) 1503 MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -128}, X8) 1504 MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -112}, X9) 1505 MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -96}, X10) 1506 MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -80}, X11) 1507 MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12) 1508 MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13) 1509 MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14) 1510 MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15) 1511 
MOVOU(X0, Mem{Base: dst}) 1512 MOVOU(X1, Mem{Base: dst, Disp: 16}) 1513 MOVOU(X2, Mem{Base: dst, Disp: 32}) 1514 MOVOU(X3, Mem{Base: dst, Disp: 48}) 1515 MOVOU(X4, Mem{Base: dst, Disp: 64}) 1516 MOVOU(X5, Mem{Base: dst, Disp: 80}) 1517 MOVOU(X6, Mem{Base: dst, Disp: 96}) 1518 MOVOU(X7, Mem{Base: dst, Disp: 112}) 1519 MOVOU(X8, Mem{Base: dst, Index: length, Scale: 1, Disp: -128}) 1520 MOVOU(X9, Mem{Base: dst, Index: length, Scale: 1, Disp: -112}) 1521 MOVOU(X10, Mem{Base: dst, Index: length, Scale: 1, Disp: -96}) 1522 MOVOU(X11, Mem{Base: dst, Index: length, Scale: 1, Disp: -80}) 1523 MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64}) 1524 MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48}) 1525 MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32}) 1526 MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16}) 1527 JMP(end) 1528 if avx { 1529 Label(name + "avxUnaligned") 1530 AX, CX, R8, R10 := GP64(), GP64(), GP64(), GP64() 1531 // Memory layout on the source side 1532 // src CX 1533 // |<---------length before correction--------->| 1534 // | |<--length corrected-->| | 1535 // | | |<--- AX --->| 1536 // |<-R11->| |<-128 bytes->| 1537 // +----------------------------------------+ 1538 // | Head | Body | Tail | 1539 // +-------+------------------+-------------+ 1540 // ^ ^ ^ 1541 // | | | 1542 // Save head into Y4 Save tail into X5..X12 1543 // | 1544 // src+R11, where R11 = ((dst & -32) + 32) - dst 1545 // Algorithm: 1546 // 1. Unaligned save of the tail's 128 bytes 1547 // 2. Unaligned save of the head's 32 bytes 1548 // 3. Destination-aligned copying of body (128 bytes per iteration) 1549 // 4. Put head on the new place 1550 // 5. Put the tail on the new place 1551 // It can be important to satisfy processor's pipeline requirements for 1552 // small sizes as the cost of unaligned memory region copying is 1553 // comparable with the cost of main loop. So code is slightly messed there. 1554 // There is more clean implementation of that algorithm for bigger sizes 1555 // where the cost of unaligned part copying is negligible. 1556 // You can see it after gobble_big_data_fwd label. 1557 Y0, Y1, Y2, Y3, Y4 := YMM(), YMM(), YMM(), YMM(), YMM() 1558 1559 LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX) 1560 MOVQ(dst, R10) 1561 // CX points to the end of buffer so we need go back slightly. We will use negative offsets there. 1562 MOVOU(Mem{Base: CX, Disp: -0x80}, X5) 1563 MOVOU(Mem{Base: CX, Disp: -0x70}, X6) 1564 MOVQ(U32(0x80), AX) 1565 1566 // Align destination address 1567 ANDQ(U32(0xffffffe0), dst) 1568 ADDQ(U8(32), dst) 1569 // Continue tail saving. 1570 MOVOU(Mem{Base: CX, Disp: -0x60}, X7) 1571 MOVOU(Mem{Base: CX, Disp: -0x50}, X8) 1572 // Make R8 delta between aligned and unaligned destination addresses. 1573 MOVQ(dst, R8) 1574 SUBQ(R10, R8) 1575 // Continue tail saving. 1576 MOVOU(Mem{Base: CX, Disp: -0x40}, X9) 1577 MOVOU(Mem{Base: CX, Disp: -0x30}, X10) 1578 // Let's make bytes-to-copy value adjusted as we've prepared unaligned part for copying. 1579 SUBQ(R8, length) 1580 // Continue tail saving. 1581 MOVOU(Mem{Base: CX, Disp: -0x20}, X11) 1582 MOVOU(Mem{Base: CX, Disp: -0x10}, X12) 1583 // The tail will be put on its place after main body copying. 1584 // It's time for the unaligned heading part. 1585 VMOVDQU(Mem{Base: src}, Y4) 1586 // Adjust source address to point past head. 
1587 ADDQ(R8, src) 1588 SUBQ(AX, length) 1589 1590 // Aligned memory copying there 1591 Label(name + "gobble_128_loop") 1592 VMOVDQU(Mem{Base: src}, Y0) 1593 VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1) 1594 VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2) 1595 VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3) 1596 ADDQ(AX, src) 1597 VMOVDQA(Y0, Mem{Base: dst}) 1598 VMOVDQA(Y1, Mem{Base: dst, Disp: 0x20}) 1599 VMOVDQA(Y2, Mem{Base: dst, Disp: 0x40}) 1600 VMOVDQA(Y3, Mem{Base: dst, Disp: 0x60}) 1601 ADDQ(AX, dst) 1602 SUBQ(AX, length) 1603 JA(LabelRef(name + "gobble_128_loop")) 1604 // Now we can store unaligned parts. 1605 ADDQ(AX, length) 1606 ADDQ(dst, length) 1607 VMOVDQU(Y4, Mem{Base: R10}) 1608 VZEROUPPER() 1609 MOVOU(X5, Mem{Base: length, Disp: -0x80}) 1610 MOVOU(X6, Mem{Base: length, Disp: -0x70}) 1611 MOVOU(X7, Mem{Base: length, Disp: -0x60}) 1612 MOVOU(X8, Mem{Base: length, Disp: -0x50}) 1613 MOVOU(X9, Mem{Base: length, Disp: -0x40}) 1614 MOVOU(X10, Mem{Base: length, Disp: -0x30}) 1615 MOVOU(X11, Mem{Base: length, Disp: -0x20}) 1616 MOVOU(X12, Mem{Base: length, Disp: -0x10}) 1617 JMP(end) 1618 1619 return 1620 } 1621 } 1622 1623 // Store start and end for sse_tail 1624 Label(name + "forward_sse") 1625 X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM() 1626 X8, X9, X10, X11 := XMM(), XMM(), XMM(), XMM() 1627 1628 MOVOU(Mem{Base: src}, X0) 1629 MOVOU(Mem{Base: src, Disp: 16}, X1) 1630 MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2) 1631 MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3) 1632 1633 // forward (only) 1634 dstAlign := GP64() 1635 bigLoops := GP64() 1636 MOVQ(length, bigLoops) 1637 SHRQ(U8(7), bigLoops) // bigLoops = length / 128 1638 1639 MOVQ(dst, dstAlign) 1640 ANDL(U32(31), dstAlign.As32()) 1641 srcOff := GP64() 1642 MOVQ(U32(64), srcOff) 1643 SUBQ(dstAlign, srcOff) 1644 1645 // Move 128 bytes/loop 1646 DECQ(bigLoops) 1647 JA(LabelRef(name + "forward_sse_loop_32")) 1648 1649 // Can be moved inside loop for less regs. 1650 srcPos := GP64() 1651 LEAQ(Mem{Disp: -32, Base: src, Scale: 1, Index: srcOff}, srcPos) 1652 dstPos := GP64() 1653 LEAQ(Mem{Disp: -32, Base: dst, Scale: 1, Index: srcOff}, dstPos) 1654 1655 Label(name + "big_loop_back") 1656 1657 MOVOU(Mem{Disp: 0, Base: srcPos}, X4) 1658 MOVOU(Mem{Disp: 16, Base: srcPos}, X5) 1659 MOVOU(Mem{Disp: 32, Base: srcPos}, X6) 1660 MOVOU(Mem{Disp: 48, Base: srcPos}, X7) 1661 MOVOU(Mem{Disp: 64, Base: srcPos}, X8) 1662 MOVOU(Mem{Disp: 80, Base: srcPos}, X9) 1663 MOVOU(Mem{Disp: 96, Base: srcPos}, X10) 1664 MOVOU(Mem{Disp: 112, Base: srcPos}, X11) 1665 1666 MOVOA(X4, Mem{Disp: 0, Base: dstPos}) 1667 MOVOA(X5, Mem{Disp: 16, Base: dstPos}) 1668 MOVOA(X6, Mem{Disp: 32, Base: dstPos}) 1669 MOVOA(X7, Mem{Disp: 48, Base: dstPos}) 1670 MOVOA(X8, Mem{Disp: 64, Base: dstPos}) 1671 MOVOA(X9, Mem{Disp: 80, Base: dstPos}) 1672 MOVOA(X10, Mem{Disp: 96, Base: dstPos}) 1673 MOVOA(X11, Mem{Disp: 112, Base: dstPos}) 1674 ADDQ(U8(128), dstPos) 1675 ADDQ(U8(128), srcPos) 1676 ADDQ(U8(128), srcOff) // This could be outside the loop, but we lose a reg if we do. 
1677 DECQ(bigLoops) 1678 JNA(LabelRef(name + "big_loop_back")) 1679 1680 Label(name + "forward_sse_loop_32") 1681 MOVOU(Mem{Disp: -32, Base: src, Scale: 1, Index: srcOff}, X4) 1682 MOVOU(Mem{Disp: -16, Base: src, Scale: 1, Index: srcOff}, X5) 1683 MOVOA(X4, Mem{Disp: -32, Base: dst, Scale: 1, Index: srcOff}) 1684 MOVOA(X5, Mem{Disp: -16, Base: dst, Scale: 1, Index: srcOff}) 1685 ADDQ(U8(32), srcOff) 1686 CMPQ(length, srcOff) 1687 JAE(LabelRef(name + "forward_sse_loop_32")) 1688 1689 // sse_tail patches up the beginning and end of the transfer. 1690 MOVOU(X0, Mem{Base: dst, Disp: 0}) 1691 MOVOU(X1, Mem{Base: dst, Disp: 16}) 1692 MOVOU(X2, Mem{Base: dst, Disp: -32, Index: length, Scale: 1}) 1693 MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1}) 1694 1695 JMP(end) 1696 return 1697} 1698 1699// genMatchLen generates standalone matchLen. 1700func (o options) genMatchLen() { 1701 TEXT("matchLen", NOSPLIT, "func(a, b []byte) int") 1702 Doc("matchLen returns how many bytes match in a and b", "", 1703 "It assumes that:", 1704 " len(a) <= len(b)", "") 1705 Pragma("noescape") 1706 1707 aBase, bBase, length := GP64(), GP64(), GP64() 1708 1709 Load(Param("a").Base(), aBase) 1710 Load(Param("b").Base(), bBase) 1711 Load(Param("a").Len(), length) 1712 l := o.matchLen("standalone", aBase, bBase, length, LabelRef("gen_match_len_end")) 1713 Label("gen_match_len_end") 1714 Store(l.As64(), ReturnIndex(0)) 1715 RET() 1716} 1717 1718// matchLen returns the number of matching bytes of a and b. 1719// len is the maximum number of bytes to match. 1720// Will jump to end when done and returns the length. 1721// Uses 2 GP registers. 1722func (o options) matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) reg.GPVirtual { 1723 if false { 1724 return o.matchLenAlt(name, a, b, len, end) 1725 } 1726 tmp, matched := GP64(), GP32() 1727 XORL(matched, matched) 1728 1729 CMPL(len.As32(), U8(8)) 1730 JL(LabelRef("matchlen_single_" + name)) 1731 1732 Label("matchlen_loopback_" + name) 1733 MOVQ(Mem{Base: a, Index: matched, Scale: 1}, tmp) 1734 XORQ(Mem{Base: b, Index: matched, Scale: 1}, tmp) 1735 TESTQ(tmp, tmp) 1736 JZ(LabelRef("matchlen_loop_" + name)) 1737 // Not all match. 1738 BSFQ(tmp, tmp) 1739 SARQ(U8(3), tmp) 1740 LEAL(Mem{Base: matched, Index: tmp, Scale: 1}, matched) 1741 JMP(end) 1742 1743 // All 8 byte matched, update and loop. 1744 Label("matchlen_loop_" + name) 1745 LEAL(Mem{Base: len, Disp: -8}, len.As32()) 1746 LEAL(Mem{Base: matched, Disp: 8}, matched) 1747 CMPL(len.As32(), U8(8)) 1748 JGE(LabelRef("matchlen_loopback_" + name)) 1749 1750 // Less than 8 bytes left. 1751 Label("matchlen_single_" + name) 1752 TESTL(len.As32(), len.As32()) 1753 JZ(end) 1754 Label("matchlen_single_loopback_" + name) 1755 MOVB(Mem{Base: a, Index: matched, Scale: 1}, tmp.As8()) 1756 CMPB(Mem{Base: b, Index: matched, Scale: 1}, tmp.As8()) 1757 JNE(end) 1758 LEAL(Mem{Base: matched, Disp: 1}, matched) 1759 DECL(len.As32()) 1760 JNZ(LabelRef("matchlen_single_loopback_" + name)) 1761 JMP(end) 1762 return matched 1763} 1764 1765// matchLen returns the number of matching bytes of a and b. 1766// len is the maximum number of bytes to match. 1767// Will jump to end when done and returns the length. 1768// Uses 3 GP registers. 1769// It is better on longer matches. 
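// The core of both matchLen variants corresponds roughly to this Go sketch
// (illustrative; the generated code keeps everything in registers):
//
//	for len >= 8 {
//		x := load64(a, matched) ^ load64(b, matched)
//		if x != 0 {
//			return matched + bits.TrailingZeros64(x)>>3
//		}
//		matched += 8
//		len -= 8
//	}
//	// then compare any remaining bytes one at a time.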
1770func (o options) matchLenAlt(name string, a, b, len reg.GPVirtual, end LabelRef) reg.GPVirtual { 1771 tmp, tmp2, matched := GP64(), GP64(), GP32() 1772 XORL(matched, matched) 1773 1774 CMPL(len.As32(), U8(16)) 1775 JB(LabelRef("matchlen_short_" + name)) 1776 1777 Label("matchlen_loopback_" + name) 1778 MOVQ(Mem{Base: a}, tmp) 1779 MOVQ(Mem{Base: a, Disp: 8}, tmp2) 1780 XORQ(Mem{Base: b, Disp: 0}, tmp) 1781 XORQ(Mem{Base: b, Disp: 8}, tmp2) 1782 endTest := func(xored reg.GPVirtual, disp int, ok LabelRef) { 1783 TESTQ(xored, xored) 1784 JZ(ok) 1785 // Not all match. 1786 BSFQ(xored, xored) 1787 SARQ(U8(3), xored) 1788 LEAL(Mem{Base: matched, Index: xored, Scale: 1, Disp: disp}, matched) 1789 JMP(end) 1790 } 1791 endTest(tmp, 0, LabelRef("matchlen_loop_tmp2_"+name)) 1792 Label("matchlen_loop_tmp2_" + name) 1793 endTest(tmp2, 8, LabelRef("matchlen_loop_"+name)) 1794 1795 // All 16 byte matched, update and loop. 1796 Label("matchlen_loop_" + name) 1797 SUBL(U8(16), len.As32()) 1798 ADDL(U8(16), matched) 1799 ADDQ(U8(16), a) 1800 ADDQ(U8(16), b) 1801 CMPL(len.As32(), U8(16)) 1802 JAE(LabelRef("matchlen_loopback_" + name)) 1803 1804 // Test 4 bytes at the time... 1805 Label("matchlen_short_" + name) 1806 lenoff := 0 1807 if true { 1808 lenoff = 4 1809 SUBL(U8(4), len.As32()) 1810 JC(LabelRef("matchlen_single_resume_" + name)) 1811 1812 Label("matchlen_four_loopback_" + name) 1813 assert(func(ok LabelRef) { 1814 CMPL(len.As32(), U32(math.MaxInt32)) 1815 JL(ok) 1816 }) 1817 1818 MOVL(Mem{Base: a}, tmp.As32()) 1819 XORL(Mem{Base: b}, tmp.As32()) 1820 { 1821 JZ(LabelRef("matchlen_four_loopback_next" + name)) 1822 BSFL(tmp.As32(), tmp.As32()) 1823 SARQ(U8(3), tmp) 1824 LEAL(Mem{Base: matched, Index: tmp, Scale: 1}, matched) 1825 JMP(end) 1826 } 1827 Label("matchlen_four_loopback_next" + name) 1828 ADDL(U8(4), matched) 1829 ADDQ(U8(4), a) 1830 ADDQ(U8(4), b) 1831 SUBL(U8(4), len.As32()) 1832 JNC(LabelRef("matchlen_four_loopback_" + name)) 1833 } 1834 1835 // Test one at the time 1836 Label("matchlen_single_resume_" + name) 1837 if true { 1838 // Less than 16 bytes left. 1839 if lenoff > 0 { 1840 ADDL(U8(lenoff), len.As32()) 1841 } 1842 TESTL(len.As32(), len.As32()) 1843 JZ(end) 1844 1845 Label("matchlen_single_loopback_" + name) 1846 MOVB(Mem{Base: a}, tmp.As8()) 1847 CMPB(Mem{Base: b}, tmp.As8()) 1848 JNE(end) 1849 INCL(matched) 1850 INCQ(a) 1851 INCQ(b) 1852 DECL(len.As32()) 1853 JNZ(LabelRef("matchlen_single_loopback_" + name)) 1854 } 1855 JMP(end) 1856 return matched 1857} 1858