1package main 2 3//go:generate go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2 4 5import ( 6 "fmt" 7 "math" 8 "runtime" 9 10 . "github.com/mmcloughlin/avo/build" 11 "github.com/mmcloughlin/avo/buildtags" 12 . "github.com/mmcloughlin/avo/operand" 13 "github.com/mmcloughlin/avo/reg" 14) 15 16// insert extra checks here and there. 17const debug = false 18 19const ( 20 limit14B = math.MaxUint32 21 // Use 12 bit table when no more than... 22 limit12B = 16<<10 - 1 23 // Use 10 bit table when no more than... 24 limit10B = 4<<10 - 1 25 // Use 8 bit table when no more than... 26 limit8B = 512 - 1 27) 28 29func main() { 30 Constraint(buildtags.Not("appengine").ToConstraint()) 31 Constraint(buildtags.Not("noasm").ToConstraint()) 32 Constraint(buildtags.Term("gc").ToConstraint()) 33 34 o := options{ 35 snappy: false, 36 } 37 o.genEncodeBlockAsm("encodeBlockAsm", 14, 6, 6, limit14B) 38 o.genEncodeBlockAsm("encodeBlockAsm4MB", 14, 6, 6, 4<<20) 39 o.genEncodeBlockAsm("encodeBlockAsm12B", 12, 5, 5, limit12B) 40 o.genEncodeBlockAsm("encodeBlockAsm10B", 10, 5, 4, limit10B) 41 o.genEncodeBlockAsm("encodeBlockAsm8B", 8, 4, 4, limit8B) 42 43 o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 16, 7, 7, limit14B) 44 o.genEncodeBetterBlockAsm("encodeBetterBlockAsm4MB", 16, 7, 7, 4<<20) 45 o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 6, 6, limit12B) 46 o.genEncodeBetterBlockAsm("encodeBetterBlockAsm10B", 12, 5, 6, limit10B) 47 o.genEncodeBetterBlockAsm("encodeBetterBlockAsm8B", 10, 4, 6, limit8B) 48 49 // Snappy compatible 50 o.snappy = true 51 o.genEncodeBlockAsm("encodeSnappyBlockAsm", 14, 6, 6, limit14B) 52 o.genEncodeBlockAsm("encodeSnappyBlockAsm12B", 12, 5, 5, limit12B) 53 o.genEncodeBlockAsm("encodeSnappyBlockAsm10B", 10, 5, 4, limit10B) 54 o.genEncodeBlockAsm("encodeSnappyBlockAsm8B", 8, 4, 4, limit8B) 55 56 o.snappy = false 57 o.maxLen = math.MaxUint32 58 o.genEmitLiteral() 59 o.genEmitRepeat() 60 o.genEmitCopy() 61 o.snappy = true 62 o.genEmitCopyNoRepeat() 63 o.snappy = false 64 o.genMatchLen() 65 Generate() 66} 67 68func debugval(v Op) { 69 value := reg.R15 70 MOVQ(v, value) 71 INT(Imm(3)) 72} 73 74func debugval32(v Op) { 75 value := reg.R15L 76 MOVL(v, value) 77 INT(Imm(3)) 78} 79 80var assertCounter int 81 82// assert will insert code if debug is enabled. 83// The code should jump to 'ok' is assertion is success. 84func assert(fn func(ok LabelRef)) { 85 if debug { 86 caller := [100]uintptr{0} 87 runtime.Callers(2, caller[:]) 88 frame, _ := runtime.CallersFrames(caller[:]).Next() 89 90 ok := fmt.Sprintf("assert_check_%d_ok_srcline_%d", assertCounter, frame.Line) 91 fn(LabelRef(ok)) 92 // Emit several since delve is imprecise. 93 INT(Imm(3)) 94 INT(Imm(3)) 95 Label(ok) 96 assertCounter++ 97 } 98} 99 100type options struct { 101 snappy bool 102 vmbi2 bool 103 maxLen int 104} 105 106func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, maxLen int) { 107 TEXT(name, 0, "func(dst, src []byte) int") 108 Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.", 109 fmt.Sprintf("Maximum input %d bytes.", maxLen), 110 "It assumes that the varint-encoded length of the decompressed bytes has already been written.", "") 111 Pragma("noescape") 112 113 o.maxLen = maxLen 114 var literalMaxOverhead = maxLitOverheadFor(maxLen) 115 116 var tableSize = 4 * (1 << tableBits) 117 // Memzero needs at least 128 bytes. 
118 if tableSize < 128 { 119 panic("tableSize must be at least 128 bytes") 120 } 121 122 lenSrcBasic, err := Param("src").Len().Resolve() 123 if err != nil { 124 panic(err) 125 } 126 lenSrcQ := lenSrcBasic.Addr 127 128 lenDstBasic, err := Param("dst").Len().Resolve() 129 if err != nil { 130 panic(err) 131 } 132 lenDstQ := lenDstBasic.Addr 133 134 // Bail if we can't compress to at least this. 135 dstLimitPtrQ := AllocLocal(8) 136 137 // sLimitL is when to stop looking for offset/length copies. 138 sLimitL := AllocLocal(4) 139 140 // nextEmitL keeps track of the point we have emitted to. 141 nextEmitL := AllocLocal(4) 142 143 // Repeat stores the last match offset. 144 repeatL := AllocLocal(4) 145 146 // nextSTempL keeps nextS while other functions are being called. 147 nextSTempL := AllocLocal(4) 148 149 // Alloc table last 150 table := AllocLocal(tableSize) 151 152 dst := GP64() 153 { 154 dstBaseBasic, err := Param("dst").Base().Resolve() 155 if err != nil { 156 panic(err) 157 } 158 dstBaseQ := dstBaseBasic.Addr 159 MOVQ(dstBaseQ, dst) 160 } 161 162 srcBaseBasic, err := Param("src").Base().Resolve() 163 if err != nil { 164 panic(err) 165 } 166 srcBaseQ := srcBaseBasic.Addr 167 168 // Zero table 169 { 170 iReg := GP64() 171 MOVQ(U32(tableSize/8/16), iReg) 172 tablePtr := GP64() 173 LEAQ(table, tablePtr) 174 zeroXmm := XMM() 175 PXOR(zeroXmm, zeroXmm) 176 177 Label("zero_loop_" + name) 178 for i := 0; i < 8; i++ { 179 MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16}) 180 } 181 ADDQ(U8(16*8), tablePtr) 182 DECQ(iReg) 183 JNZ(LabelRef("zero_loop_" + name)) 184 } 185 186 { 187 // nextEmit is offset n src where the next emitLiteral should start from. 188 MOVL(U32(0), nextEmitL) 189 190 const inputMargin = 8 191 tmp, tmp2, tmp3 := GP64(), GP64(), GP64() 192 MOVQ(lenSrcQ, tmp) 193 LEAQ(Mem{Base: tmp, Disp: -5}, tmp2) 194 // sLimitL := len(src) - inputMargin 195 LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3) 196 197 assert(func(ok LabelRef) { 198 CMPQ(tmp3, lenSrcQ) 199 JL(ok) 200 }) 201 202 MOVL(tmp3.As32(), sLimitL) 203 204 // dstLimit := (len(src) - 5 ) - len(src)>>5 205 SHRQ(U8(5), tmp) 206 SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp 207 208 assert(func(ok LabelRef) { 209 // if len(src) > len(src) - len(src)>>5 - 5: ok 210 CMPQ(lenSrcQ, tmp2) 211 JGE(ok) 212 }) 213 214 LEAQ(Mem{Base: dst, Index: tmp2, Scale: 1}, tmp2) 215 MOVQ(tmp2, dstLimitPtrQ) 216 } 217 218 // s = 1 219 s := GP32() 220 MOVL(U32(1), s) 221 // repeatL = 1 222 MOVL(s, repeatL) 223 224 src := GP64() 225 Load(Param("src").Base(), src) 226 227 // Load cv 228 Label("search_loop_" + name) 229 candidate := GP32() 230 { 231 assert(func(ok LabelRef) { 232 // Check if somebody changed src 233 tmp := GP64() 234 MOVQ(srcBaseQ, tmp) 235 CMPQ(tmp, src) 236 JEQ(ok) 237 }) 238 239 cv := GP64() 240 nextS := GP32() 241 // nextS := s + (s-nextEmit)>>6 + 4 242 { 243 tmp := GP64() 244 MOVL(s, tmp.As32()) // tmp = s 245 SUBL(nextEmitL, tmp.As32()) // tmp = s - nextEmit 246 SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog 247 LEAL(Mem{Base: s, Disp: 4, Index: tmp, Scale: 1}, nextS) 248 } 249 // if nextS > sLimit {goto emitRemainder} 250 { 251 CMPL(nextS.As32(), sLimitL) 252 JGE(LabelRef("emit_remainder_" + name)) 253 } 254 MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv) 255 assert(func(ok LabelRef) { 256 // Check if s is valid (we should have jumped above if not) 257 tmp := GP64() 258 MOVQ(lenSrcQ, tmp) 259 CMPQ(tmp, s.As64()) 260 JG(ok) 261 }) 262 // move nextS to stack. 
263 MOVL(nextS.As32(), nextSTempL) 264 265 candidate2 := GP32() 266 hasher := hashN(hashBytes, tableBits) 267 { 268 hash0, hash1 := GP64(), GP64() 269 MOVQ(cv, hash0) 270 MOVQ(cv, hash1) 271 SHRQ(U8(8), hash1) 272 hasher.hash(hash0) 273 hasher.hash(hash1) 274 MOVL(table.Idx(hash0, 4), candidate) 275 MOVL(table.Idx(hash1, 4), candidate2) 276 assert(func(ok LabelRef) { 277 CMPQ(hash0, U32(tableSize)) 278 JL(ok) 279 }) 280 assert(func(ok LabelRef) { 281 CMPQ(hash1, U32(tableSize)) 282 JL(ok) 283 }) 284 285 MOVL(s, table.Idx(hash0, 4)) 286 tmp := GP32() 287 LEAL(Mem{Base: s, Disp: 1}, tmp) 288 MOVL(tmp, table.Idx(hash1, 4)) 289 } 290 291 // Can be moved up if registers are available. 292 hash2 := GP64() 293 { 294 // hash2 := hash6(cv>>16, tableBits) 295 // hasher = hash6(tableBits) 296 MOVQ(cv, hash2) 297 SHRQ(U8(16), hash2) 298 hasher.hash(hash2) 299 assert(func(ok LabelRef) { 300 CMPQ(hash2, U32(tableSize)) 301 JL(ok) 302 }) 303 } 304 305 // En/disable repeat matching. 306 if true { 307 // Check repeat at offset checkRep 308 const checkRep = 1 309 { 310 // rep = s - repeat 311 rep := GP32() 312 MOVL(s, rep) 313 SUBL(repeatL, rep) // rep = s - repeat 314 315 // if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { 316 left, right := GP64(), GP64() 317 MOVL(Mem{Base: src, Index: rep, Disp: checkRep, Scale: 1}, right.As32()) 318 MOVQ(cv, left) 319 SHRQ(U8(checkRep*8), left) 320 CMPL(left.As32(), right.As32()) 321 // BAIL, no repeat. 322 JNE(LabelRef("no_repeat_found_" + name)) 323 } 324 // base = s + checkRep 325 base := GP32() 326 LEAL(Mem{Base: s, Disp: checkRep}, base) 327 328 // nextEmit before repeat. 329 nextEmit := GP32() 330 MOVL(nextEmitL, nextEmit) 331 332 // Extend back 333 if true { 334 i := GP32() 335 MOVL(base, i) 336 SUBL(repeatL, i) 337 JZ(LabelRef("repeat_extend_back_end_" + name)) 338 339 Label("repeat_extend_back_loop_" + name) 340 // if base <= nextemit {exit} 341 CMPL(base.As32(), nextEmit) 342 JLE(LabelRef("repeat_extend_back_end_" + name)) 343 // if src[i-1] == src[base-1] 344 tmp, tmp2 := GP64(), GP64() 345 MOVB(Mem{Base: src, Index: i, Scale: 1, Disp: -1}, tmp.As8()) 346 MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8()) 347 CMPB(tmp.As8(), tmp2.As8()) 348 JNE(LabelRef("repeat_extend_back_end_" + name)) 349 LEAL(Mem{Base: base, Disp: -1}, base) 350 DECL(i) 351 JNZ(LabelRef("repeat_extend_back_loop_" + name)) 352 } 353 Label("repeat_extend_back_end_" + name) 354 355 // Base is now at start. Emit until base. 
356 // d += emitLiteral(dst[d:], src[nextEmit:base]) 357 if true { 358 o.emitLiteralsDstP(nextEmitL, base, src, dst, "repeat_emit_"+name) 359 } 360 361 // Extend forward 362 { 363 // s += 4 + checkRep 364 ADDL(U8(4+checkRep), s) 365 366 if true { 367 // candidate := s - repeat + 4 + checkRep 368 MOVL(s, candidate) 369 SUBL(repeatL, candidate) // candidate = s - repeat 370 371 // srcLeft = len(src) - s 372 srcLeft := GP64() 373 MOVQ(lenSrcQ, srcLeft) 374 SUBL(s, srcLeft.As32()) 375 assert(func(ok LabelRef) { 376 // if srcleft < maxint32: ok 377 CMPQ(srcLeft, U32(0x7fffffff)) 378 JL(ok) 379 }) 380 // Forward address 381 forwardStart := GP64() 382 LEAQ(Mem{Base: src, Index: s, Scale: 1}, forwardStart) 383 // End address 384 backStart := GP64() 385 LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, backStart) 386 387 length := o.matchLen("repeat_extend_"+name, forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name)) 388 forwardStart, backStart, srcLeft = nil, nil, nil 389 Label("repeat_extend_forward_end_" + name) 390 // s+= length 391 ADDL(length.As32(), s) 392 } 393 } 394 // Emit 395 if true { 396 // length = s-base 397 length := GP32() 398 MOVL(s, length) 399 SUBL(base.As32(), length) // length = s - base 400 401 offsetVal := GP32() 402 MOVL(repeatL, offsetVal) 403 404 if !o.snappy { 405 // if nextEmit == 0 {do copy instead...} 406 TESTL(nextEmit, nextEmit) 407 JZ(LabelRef("repeat_as_copy_" + name)) 408 409 // Emit as repeat... 410 o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name)) 411 412 // Emit as copy instead... 413 Label("repeat_as_copy_" + name) 414 } 415 o.emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name)) 416 417 Label("repeat_end_emit_" + name) 418 // Store new dst and nextEmit 419 MOVL(s, nextEmitL) 420 } 421 // if s >= sLimit is picked up on next loop. 422 if false { 423 CMPL(s.As32(), sLimitL) 424 JGE(LabelRef("emit_remainder_" + name)) 425 } 426 JMP(LabelRef("search_loop_" + name)) 427 } 428 Label("no_repeat_found_" + name) 429 { 430 // Check candidates are ok. All must be < s and < len(src) 431 assert(func(ok LabelRef) { 432 tmp := GP64() 433 MOVQ(lenSrcQ, tmp) 434 CMPL(tmp.As32(), candidate) 435 JG(ok) 436 }) 437 assert(func(ok LabelRef) { 438 CMPL(s, candidate) 439 JG(ok) 440 }) 441 assert(func(ok LabelRef) { 442 tmp := GP64() 443 MOVQ(lenSrcQ, tmp) 444 CMPL(tmp.As32(), candidate2) 445 JG(ok) 446 }) 447 assert(func(ok LabelRef) { 448 CMPL(s, candidate2) 449 JG(ok) 450 }) 451 452 CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) 453 JEQ(LabelRef("candidate_match_" + name)) 454 455 tmp := GP32() 456 // cv >>= 8 457 SHRQ(U8(8), cv) 458 459 // candidate = int(table[hash2]) - load early. 
460 MOVL(table.Idx(hash2, 4), candidate) 461 assert(func(ok LabelRef) { 462 tmp := GP64() 463 MOVQ(lenSrcQ, tmp) 464 CMPL(tmp.As32(), candidate) 465 JG(ok) 466 }) 467 assert(func(ok LabelRef) { 468 // We may get s and s+1 469 tmp := GP32() 470 LEAL(Mem{Base: s, Disp: 2}, tmp) 471 CMPL(tmp, candidate) 472 JG(ok) 473 }) 474 475 LEAL(Mem{Base: s, Disp: 2}, tmp) 476 477 //if uint32(cv>>8) == load32(src, candidate2) 478 CMPL(Mem{Base: src, Index: candidate2, Scale: 1}, cv.As32()) 479 JEQ(LabelRef("candidate2_match_" + name)) 480 481 // table[hash2] = uint32(s + 2) 482 MOVL(tmp, table.Idx(hash2, 4)) 483 484 // cv >>= 8 (>> 16 total) 485 SHRQ(U8(8), cv) 486 487 // if uint32(cv>>16) == load32(src, candidate) 488 CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) 489 JEQ(LabelRef("candidate3_match_" + name)) 490 491 // No match found, next loop 492 // s = nextS 493 MOVL(nextSTempL, s) 494 JMP(LabelRef("search_loop_" + name)) 495 496 // Matches candidate at s + 2 (3rd check) 497 Label("candidate3_match_" + name) 498 ADDL(U8(2), s) 499 JMP(LabelRef("candidate_match_" + name)) 500 501 // Match at s + 1 (we calculated the hash, lets store it) 502 Label("candidate2_match_" + name) 503 // table[hash2] = uint32(s + 2) 504 MOVL(tmp, table.Idx(hash2, 4)) 505 // s++ 506 INCL(s) 507 MOVL(candidate2, candidate) 508 } 509 } 510 511 Label("candidate_match_" + name) 512 // We have a match at 's' with src offset in "candidate" that matches at least 4 bytes. 513 // Extend backwards 514 if true { 515 ne := GP32() 516 MOVL(nextEmitL, ne) 517 TESTL(candidate, candidate) 518 JZ(LabelRef("match_extend_back_end_" + name)) 519 520 // candidate is tested when decremented, so we loop back here. 521 Label("match_extend_back_loop_" + name) 522 // if s <= nextEmit {exit} 523 CMPL(s, ne) 524 JLE(LabelRef("match_extend_back_end_" + name)) 525 // if src[candidate-1] == src[s-1] 526 tmp, tmp2 := GP64(), GP64() 527 MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8()) 528 MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8()) 529 CMPB(tmp.As8(), tmp2.As8()) 530 JNE(LabelRef("match_extend_back_end_" + name)) 531 LEAL(Mem{Base: s, Disp: -1}, s) 532 DECL(candidate) 533 JZ(LabelRef("match_extend_back_end_" + name)) 534 JMP(LabelRef("match_extend_back_loop_" + name)) 535 } 536 Label("match_extend_back_end_" + name) 537 538 // Bail if we exceed the maximum size. 539 if true { 540 // tmp = s-nextEmit 541 tmp := GP64() 542 MOVL(s, tmp.As32()) 543 SUBL(nextEmitL, tmp.As32()) 544 // tmp = &dst + s-nextEmit 545 LEAQ(Mem{Base: dst, Index: tmp, Scale: 1, Disp: literalMaxOverhead}, tmp) 546 CMPQ(tmp, dstLimitPtrQ) 547 JL(LabelRef("match_dst_size_check_" + name)) 548 ri, err := ReturnIndex(0).Resolve() 549 if err != nil { 550 panic(err) 551 } 552 MOVQ(U32(0), ri.Addr) 553 RET() 554 } 555 Label("match_dst_size_check_" + name) 556 { 557 base := GP32() 558 MOVL(s, base.As32()) 559 o.emitLiteralsDstP(nextEmitL, base, src, dst, "match_emit_"+name) 560 } 561 cv := GP64() 562 Label("match_nolit_loop_" + name) 563 { 564 // Update repeat 565 { 566 // repeat = base - candidate 567 repeatVal := GP64().As32() 568 MOVL(s, repeatVal) 569 SUBL(candidate, repeatVal) 570 MOVL(repeatVal, repeatL) 571 } 572 // s+=4, candidate+=4 573 ADDL(U8(4), s) 574 ADDL(U8(4), candidate) 575 // Extend the 4-byte match as long as possible and emit copy. 576 { 577 assert(func(ok LabelRef) { 578 // s must be > candidate cannot be equal. 
579 CMPL(s, candidate) 580 JG(ok) 581 }) 582 // srcLeft = len(src) - s 583 srcLeft := GP64() 584 MOVQ(lenSrcQ, srcLeft) 585 SUBL(s, srcLeft.As32()) 586 assert(func(ok LabelRef) { 587 // if srcleft < maxint32: ok 588 CMPQ(srcLeft, U32(0x7fffffff)) 589 JL(ok) 590 }) 591 592 a, b := GP64(), GP64() 593 LEAQ(Mem{Base: src, Index: s, Scale: 1}, a) 594 LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, b) 595 length := o.matchLen("match_nolit_"+name, 596 a, b, 597 srcLeft, 598 LabelRef("match_nolit_end_"+name), 599 ) 600 Label("match_nolit_end_" + name) 601 assert(func(ok LabelRef) { 602 CMPL(length.As32(), U32(math.MaxInt32)) 603 JL(ok) 604 }) 605 a, b, srcLeft = nil, nil, nil 606 607 // s += length (length is destroyed, use it now) 608 ADDL(length.As32(), s) 609 610 // Load offset from repeat value. 611 offset := GP64() 612 MOVL(repeatL, offset.As32()) 613 614 // length += 4 615 ADDL(U8(4), length.As32()) 616 MOVL(s, nextEmitL) // nextEmit = s 617 o.emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name)) 618 Label("match_nolit_emitcopy_end_" + name) 619 620 // if s >= sLimit { end } 621 { 622 CMPL(s.As32(), sLimitL) 623 JGE(LabelRef("emit_remainder_" + name)) 624 } 625 // Start load s-2 as early as possible... 626 MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, cv) 627 // Bail if we exceed the maximum size. 628 { 629 CMPQ(dst, dstLimitPtrQ) 630 JL(LabelRef("match_nolit_dst_ok_" + name)) 631 ri, err := ReturnIndex(0).Resolve() 632 if err != nil { 633 panic(err) 634 } 635 MOVQ(U32(0), ri.Addr) 636 RET() 637 Label("match_nolit_dst_ok_" + name) 638 } 639 } 640 // cv must be set to value at s-2 before arriving here 641 { 642 // Check for an immediate match, otherwise start search at s+1 643 // Index s-2 644 hasher := hashN(hashBytes, tableBits) 645 hash0, hash1 := GP64(), GP64() 646 MOVQ(cv, hash0) // src[s-2] 647 SHRQ(U8(16), cv) 648 MOVQ(cv, hash1) // src[s] 649 hasher.hash(hash0) 650 hasher.hash(hash1) 651 sm2 := GP32() // s - 2 652 LEAL(Mem{Base: s, Disp: -2}, sm2) 653 assert(func(ok LabelRef) { 654 CMPQ(hash0, U32(tableSize)) 655 JL(ok) 656 }) 657 assert(func(ok LabelRef) { 658 CMPQ(hash1, U32(tableSize)) 659 JL(ok) 660 }) 661 addr := GP64() 662 LEAQ(table.Idx(hash1, 4), addr) 663 MOVL(Mem{Base: addr}, candidate) 664 MOVL(sm2, table.Idx(hash0, 4)) 665 MOVL(s, Mem{Base: addr}) 666 CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) 667 JEQ(LabelRef("match_nolit_loop_" + name)) 668 INCL(s) 669 } 670 JMP(LabelRef("search_loop_" + name)) 671 } 672 673 Label("emit_remainder_" + name) 674 // Bail if we exceed the maximum size. 675 // if d+len(src)-nextEmitL > dstLimitPtrQ { return 0 676 { 677 // remain = len(src) - nextEmit 678 remain := GP64() 679 MOVQ(lenSrcQ, remain) 680 SUBL(nextEmitL, remain.As32()) 681 682 dstExpect := GP64() 683 // dst := dst + (len(src)-nextEmitL) 684 685 LEAQ(Mem{Base: dst, Index: remain, Scale: 1, Disp: literalMaxOverhead}, dstExpect) 686 CMPQ(dstExpect, dstLimitPtrQ) 687 JL(LabelRef("emit_remainder_ok_" + name)) 688 ri, err := ReturnIndex(0).Resolve() 689 if err != nil { 690 panic(err) 691 } 692 MOVQ(U32(0), ri.Addr) 693 RET() 694 Label("emit_remainder_ok_" + name) 695 } 696 // emitLiteral(dst[d:], src[nextEmitL:]) 697 emitEnd := GP64() 698 MOVQ(lenSrcQ, emitEnd) 699 700 // Emit final literals. 
701 o.emitLiteralsDstP(nextEmitL, emitEnd, src, dst, "emit_remainder_"+name) 702 703 // Assert size is < limit 704 assert(func(ok LabelRef) { 705 // if dstBaseQ < dstLimitPtrQ: ok 706 CMPQ(dst, dstLimitPtrQ) 707 JL(ok) 708 }) 709 710 // length := start - base (ptr arithmetic) 711 length := GP64() 712 base := Load(Param("dst").Base(), GP64()) 713 MOVQ(dst, length) 714 SUBQ(base, length) 715 716 // Assert size is < len(src) 717 assert(func(ok LabelRef) { 718 // if len(src) >= length: ok 719 CMPQ(lenSrcQ, length) 720 JGE(ok) 721 }) 722 // Assert size is < len(dst) 723 assert(func(ok LabelRef) { 724 // if len(dst) >= length: ok 725 CMPQ(lenDstQ, length) 726 JGE(ok) 727 }) 728 Store(length, ReturnIndex(0)) 729 RET() 730} 731 732func maxLitOverheadFor(n int) int { 733 switch { 734 case n == 0: 735 return 0 736 case n < 60: 737 return 1 738 case n < 1<<8: 739 return 2 740 case n < 1<<16: 741 return 3 742 case n < 1<<24: 743 return 4 744 } 745 return 5 746} 747 748func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHashBytes, maxLen int) { 749 TEXT(name, 0, "func(dst, src []byte) int") 750 Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.", 751 fmt.Sprintf("Maximum input %d bytes.", maxLen), 752 "It assumes that the varint-encoded length of the decompressed bytes has already been written.", "") 753 Pragma("noescape") 754 755 if lHashBytes > 7 || lHashBytes <= 4 { 756 panic("lHashBytes must be <= 7 and >4") 757 } 758 var literalMaxOverhead = maxLitOverheadFor(maxLen) 759 760 var sTableBits = lTableBits - 2 761 const sHashBytes = 4 762 o.maxLen = maxLen 763 764 var lTableSize = 4 * (1 << lTableBits) 765 var sTableSize = 4 * (1 << sTableBits) 766 767 // Memzero needs at least 128 bytes. 768 if (lTableSize + sTableSize) < 128 { 769 panic("tableSize must be at least 128 bytes") 770 } 771 772 lenSrcBasic, err := Param("src").Len().Resolve() 773 if err != nil { 774 panic(err) 775 } 776 lenSrcQ := lenSrcBasic.Addr 777 778 lenDstBasic, err := Param("dst").Len().Resolve() 779 if err != nil { 780 panic(err) 781 } 782 lenDstQ := lenDstBasic.Addr 783 784 // Bail if we can't compress to at least this. 785 dstLimitPtrQ := AllocLocal(8) 786 787 // sLimitL is when to stop looking for offset/length copies. 788 sLimitL := AllocLocal(4) 789 790 // nextEmitL keeps track of the point we have emitted to. 791 nextEmitL := AllocLocal(4) 792 793 // Repeat stores the last match offset. 794 repeatL := AllocLocal(4) 795 796 // nextSTempL keeps nextS while other functions are being called. 797 nextSTempL := AllocLocal(4) 798 799 // Alloc table last, lTab must be before sTab. 800 lTab := AllocLocal(lTableSize) 801 sTab := AllocLocal(sTableSize) 802 803 dst := GP64() 804 { 805 dstBaseBasic, err := Param("dst").Base().Resolve() 806 if err != nil { 807 panic(err) 808 } 809 dstBaseQ := dstBaseBasic.Addr 810 MOVQ(dstBaseQ, dst) 811 } 812 813 srcBaseBasic, err := Param("src").Base().Resolve() 814 if err != nil { 815 panic(err) 816 } 817 srcBaseQ := srcBaseBasic.Addr 818 819 // Zero table 820 { 821 iReg := GP64() 822 MOVQ(U32((sTableSize+lTableSize)/8/16), iReg) 823 tablePtr := GP64() 824 LEAQ(lTab, tablePtr) 825 zeroXmm := XMM() 826 PXOR(zeroXmm, zeroXmm) 827 828 Label("zero_loop_" + name) 829 for i := 0; i < 8; i++ { 830 MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16}) 831 } 832 ADDQ(U8(16*8), tablePtr) 833 DECQ(iReg) 834 JNZ(LabelRef("zero_loop_" + name)) 835 } 836 837 { 838 // nextEmit is offset n src where the next emitLiteral should start from. 
839 MOVL(U32(0), nextEmitL) 840 841 const inputMargin = 8 842 tmp, tmp2, tmp3 := GP64(), GP64(), GP64() 843 MOVQ(lenSrcQ, tmp) 844 LEAQ(Mem{Base: tmp, Disp: -6}, tmp2) 845 // sLimitL := len(src) - inputMargin 846 LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3) 847 848 assert(func(ok LabelRef) { 849 CMPQ(tmp3, lenSrcQ) 850 JL(ok) 851 }) 852 853 MOVL(tmp3.As32(), sLimitL) 854 855 // dstLimit := (len(src) - 5 ) - len(src)>>5 856 SHRQ(U8(5), tmp) 857 SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp 858 859 assert(func(ok LabelRef) { 860 // if len(src) > len(src) - len(src)>>5 - 5: ok 861 CMPQ(lenSrcQ, tmp2) 862 JGE(ok) 863 }) 864 865 LEAQ(Mem{Base: dst, Index: tmp2, Scale: 1}, tmp2) 866 MOVQ(tmp2, dstLimitPtrQ) 867 } 868 869 // s = 1 870 s := GP32() 871 MOVL(U32(1), s) 872 // repeatL = 0 873 MOVL(U32(0), repeatL) 874 875 src := GP64() 876 Load(Param("src").Base(), src) 877 878 // Load cv 879 Label("search_loop_" + name) 880 candidate := GP32() 881 { 882 assert(func(ok LabelRef) { 883 // Check if somebody changed src 884 tmp := GP64() 885 MOVQ(srcBaseQ, tmp) 886 CMPQ(tmp, src) 887 JEQ(ok) 888 }) 889 890 cv := GP64() 891 nextS := GP32() 892 // nextS := s + (s-nextEmit)>>skipLog + 1 893 { 894 tmp := GP64() 895 MOVL(s, tmp.As32()) // tmp = s 896 SUBL(nextEmitL, tmp.As32()) // tmp = s - nextEmit 897 SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog 898 LEAL(Mem{Base: s, Disp: 1, Index: tmp, Scale: 1}, nextS) 899 } 900 // if nextS > sLimit {goto emitRemainder} 901 { 902 CMPL(nextS.As32(), sLimitL) 903 JGE(LabelRef("emit_remainder_" + name)) 904 } 905 MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv) 906 assert(func(ok LabelRef) { 907 // Check if s is valid (we should have jumped above if not) 908 tmp := GP64() 909 MOVQ(lenSrcQ, tmp) 910 CMPQ(tmp, s.As64()) 911 JG(ok) 912 }) 913 // move nextS to stack. 914 MOVL(nextS.As32(), nextSTempL) 915 916 candidateS := GP32() 917 lHasher := hashN(lHashBytes, lTableBits) 918 { 919 sHasher := hashN(sHashBytes, sTableBits) 920 hash0, hash1 := GP64(), GP64() 921 MOVQ(cv, hash0) 922 MOVQ(cv, hash1) 923 lHasher.hash(hash0) 924 sHasher.hash(hash1) 925 MOVL(lTab.Idx(hash0, 4), candidate) 926 MOVL(sTab.Idx(hash1, 4), candidateS) 927 assert(func(ok LabelRef) { 928 CMPQ(hash0, U32(lTableSize)) 929 JL(ok) 930 }) 931 assert(func(ok LabelRef) { 932 CMPQ(hash1, U32(sTableSize)) 933 JL(ok) 934 }) 935 936 MOVL(s, lTab.Idx(hash0, 4)) 937 MOVL(s, sTab.Idx(hash1, 4)) 938 } 939 940 // En/disable repeat matching. 941 if false { 942 // Check repeat at offset checkRep 943 const checkRep = 1 944 { 945 // rep = s - repeat 946 rep := GP32() 947 MOVL(s, rep) 948 SUBL(repeatL, rep) // rep = s - repeat 949 950 // if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { 951 left, right := GP64(), GP64() 952 MOVL(Mem{Base: src, Index: rep, Disp: checkRep, Scale: 1}, right.As32()) 953 MOVQ(cv, left) 954 SHRQ(U8(checkRep*8), left) 955 CMPL(left.As32(), right.As32()) 956 // BAIL, no repeat. 957 JNE(LabelRef("no_repeat_found_" + name)) 958 } 959 // base = s + checkRep 960 base := GP32() 961 LEAL(Mem{Base: s, Disp: checkRep}, base) 962 963 // nextEmit before repeat. 
964 nextEmit := GP32() 965 MOVL(nextEmitL, nextEmit) 966 967 // Extend back 968 if true { 969 i := GP32() 970 MOVL(base, i) 971 SUBL(repeatL, i) 972 JZ(LabelRef("repeat_extend_back_end_" + name)) 973 974 Label("repeat_extend_back_loop_" + name) 975 // if base <= nextemit {exit} 976 CMPL(base.As32(), nextEmit) 977 JLE(LabelRef("repeat_extend_back_end_" + name)) 978 // if src[i-1] == src[base-1] 979 tmp, tmp2 := GP64(), GP64() 980 MOVB(Mem{Base: src, Index: i, Scale: 1, Disp: -1}, tmp.As8()) 981 MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8()) 982 CMPB(tmp.As8(), tmp2.As8()) 983 JNE(LabelRef("repeat_extend_back_end_" + name)) 984 LEAL(Mem{Base: base, Disp: -1}, base) 985 DECL(i) 986 JNZ(LabelRef("repeat_extend_back_loop_" + name)) 987 } 988 Label("repeat_extend_back_end_" + name) 989 990 // Base is now at start. Emit until base. 991 // d += emitLiteral(dst[d:], src[nextEmit:base]) 992 if true { 993 o.emitLiteralsDstP(nextEmitL, base, src, dst, "repeat_emit_"+name) 994 } 995 996 // Extend forward 997 { 998 // s += 4 + checkRep 999 ADDL(U8(4+checkRep), s) 1000 1001 if true { 1002 // candidate := s - repeat + 4 + checkRep 1003 MOVL(s, candidate) 1004 SUBL(repeatL, candidate) // candidate = s - repeat 1005 1006 // srcLeft = len(src) - s 1007 srcLeft := GP64() 1008 MOVQ(lenSrcQ, srcLeft) 1009 SUBL(s, srcLeft.As32()) 1010 assert(func(ok LabelRef) { 1011 // if srcleft < maxint32: ok 1012 CMPQ(srcLeft, U32(0x7fffffff)) 1013 JL(ok) 1014 }) 1015 // Forward address 1016 forwardStart := GP64() 1017 LEAQ(Mem{Base: src, Index: s, Scale: 1}, forwardStart) 1018 // End address 1019 backStart := GP64() 1020 LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, backStart) 1021 1022 length := o.matchLen("repeat_extend_"+name, forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name)) 1023 forwardStart, backStart, srcLeft = nil, nil, nil 1024 Label("repeat_extend_forward_end_" + name) 1025 // s+= length 1026 ADDL(length.As32(), s) 1027 } 1028 } 1029 // Emit 1030 if true { 1031 // length = s-base 1032 length := GP32() 1033 MOVL(s, length) 1034 SUBL(base.As32(), length) // length = s - base 1035 1036 offsetVal := GP32() 1037 MOVL(repeatL, offsetVal) 1038 1039 if !o.snappy { 1040 // if nextEmit == 0 {do copy instead...} 1041 TESTL(nextEmit, nextEmit) 1042 JZ(LabelRef("repeat_as_copy_" + name)) 1043 1044 // Emit as repeat... 1045 o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name)) 1046 1047 // Emit as copy instead... 1048 Label("repeat_as_copy_" + name) 1049 } 1050 o.emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name)) 1051 1052 Label("repeat_end_emit_" + name) 1053 // Store new dst and nextEmit 1054 MOVL(s, nextEmitL) 1055 } 1056 // if s >= sLimit is picked up on next loop. 1057 if false { 1058 CMPL(s.As32(), sLimitL) 1059 JGE(LabelRef("emit_remainder_" + name)) 1060 } 1061 JMP(LabelRef("search_loop_" + name)) 1062 } 1063 Label("no_repeat_found_" + name) 1064 { 1065 // Check candidates are ok. 
All must be < s and < len(src) 1066 assert(func(ok LabelRef) { 1067 tmp := GP64() 1068 MOVQ(lenSrcQ, tmp) 1069 CMPL(tmp.As32(), candidate) 1070 JG(ok) 1071 }) 1072 assert(func(ok LabelRef) { 1073 CMPL(s, candidate) 1074 JG(ok) 1075 }) 1076 assert(func(ok LabelRef) { 1077 tmp := GP64() 1078 MOVQ(lenSrcQ, tmp) 1079 CMPL(tmp.As32(), candidateS) 1080 JG(ok) 1081 }) 1082 assert(func(ok LabelRef) { 1083 CMPL(s, candidateS) 1084 JG(ok) 1085 }) 1086 1087 CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) 1088 JEQ(LabelRef("candidate_match_" + name)) 1089 1090 //if uint32(cv) == load32(src, candidateS) 1091 CMPL(Mem{Base: src, Index: candidateS, Scale: 1}, cv.As32()) 1092 JEQ(LabelRef("candidateS_match_" + name)) 1093 1094 // No match found, next loop 1095 // s = nextS 1096 MOVL(nextSTempL, s) 1097 JMP(LabelRef("search_loop_" + name)) 1098 1099 // Short match at s, try a long candidate at s+1 1100 Label("candidateS_match_" + name) 1101 if true { 1102 hash0 := GP64() 1103 SHRQ(U8(8), cv) 1104 MOVQ(cv, hash0) 1105 lHasher.hash(hash0) 1106 MOVL(lTab.Idx(hash0, 4), candidate) 1107 INCL(s) 1108 assert(func(ok LabelRef) { 1109 CMPQ(hash0, U32(lTableSize)) 1110 JL(ok) 1111 }) 1112 MOVL(s, lTab.Idx(hash0, 4)) 1113 CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32()) 1114 JEQ(LabelRef("candidate_match_" + name)) 1115 // No match, decrement s again and use short match at s... 1116 DECL(s) 1117 } 1118 MOVL(candidateS, candidate) 1119 } 1120 } 1121 1122 Label("candidate_match_" + name) 1123 // We have a match at 's' with src offset in "candidate" that matches at least 4 bytes. 1124 // Extend backwards 1125 if true { 1126 ne := GP32() 1127 MOVL(nextEmitL, ne) 1128 TESTL(candidate, candidate) 1129 JZ(LabelRef("match_extend_back_end_" + name)) 1130 1131 // candidate is tested when decremented, so we loop back here. 1132 Label("match_extend_back_loop_" + name) 1133 // if s <= nextEmit {exit} 1134 CMPL(s, ne) 1135 JLE(LabelRef("match_extend_back_end_" + name)) 1136 // if src[candidate-1] == src[s-1] 1137 tmp, tmp2 := GP64(), GP64() 1138 MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8()) 1139 MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8()) 1140 CMPB(tmp.As8(), tmp2.As8()) 1141 JNE(LabelRef("match_extend_back_end_" + name)) 1142 LEAL(Mem{Base: s, Disp: -1}, s) 1143 DECL(candidate) 1144 JZ(LabelRef("match_extend_back_end_" + name)) 1145 JMP(LabelRef("match_extend_back_loop_" + name)) 1146 } 1147 Label("match_extend_back_end_" + name) 1148 1149 // Bail if we exceed the maximum size. 1150 if true { 1151 // tmp = s-nextEmit 1152 tmp := GP64() 1153 MOVL(s, tmp.As32()) 1154 SUBL(nextEmitL, tmp.As32()) 1155 // tmp = &dst + s-nextEmit 1156 LEAQ(Mem{Base: dst, Index: tmp, Scale: 1, Disp: literalMaxOverhead}, tmp) 1157 CMPQ(tmp, dstLimitPtrQ) 1158 JL(LabelRef("match_dst_size_check_" + name)) 1159 ri, err := ReturnIndex(0).Resolve() 1160 if err != nil { 1161 panic(err) 1162 } 1163 MOVQ(U32(0), ri.Addr) 1164 RET() 1165 } 1166 Label("match_dst_size_check_" + name) 1167 1168 base := GP32() 1169 MOVL(s, base.As32()) 1170 1171 // s+=4, candidate+=4 1172 ADDL(U8(4), s) 1173 ADDL(U8(4), candidate) 1174 // Extend the 4-byte match as long as possible and emit copy. 1175 { 1176 assert(func(ok LabelRef) { 1177 // s must be > candidate cannot be equal. 
1178 CMPL(s, candidate) 1179 JG(ok) 1180 }) 1181 // srcLeft = len(src) - s 1182 srcLeft := GP64() 1183 MOVQ(lenSrcQ, srcLeft) 1184 SUBL(s, srcLeft.As32()) 1185 assert(func(ok LabelRef) { 1186 // if srcleft < maxint32: ok 1187 CMPQ(srcLeft, U32(0x7fffffff)) 1188 JL(ok) 1189 }) 1190 1191 a, b := GP64(), GP64() 1192 LEAQ(Mem{Base: src, Index: s, Scale: 1}, a) 1193 LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, b) 1194 length := o.matchLen("match_nolit_"+name, 1195 a, b, 1196 srcLeft, 1197 LabelRef("match_nolit_end_"+name), 1198 ) 1199 Label("match_nolit_end_" + name) 1200 assert(func(ok LabelRef) { 1201 CMPL(length.As32(), U32(math.MaxInt32)) 1202 JL(ok) 1203 }) 1204 a, b, srcLeft = nil, nil, nil 1205 1206 offset := GP64() 1207 offset32 := offset.As32() 1208 { 1209 // offset = base - candidate 1210 MOVL(s, offset32) 1211 SUBL(candidate, offset32) 1212 Comment("Check if repeat") 1213 CMPL(repeatL, offset32) 1214 JEQ(LabelRef("match_is_repeat_" + name)) 1215 1216 // NOT REPEAT 1217 { 1218 // Check if match is better.. 1219 if o.maxLen > 65535 { 1220 CMPL(length.As32(), U8(1)) 1221 JG(LabelRef("match_length_ok_" + name)) 1222 CMPL(offset32, U32(65535)) 1223 JLE(LabelRef("match_length_ok_" + name)) 1224 // Match is equal or worse to the encoding. 1225 MOVL(nextSTempL, s) 1226 INCL(s) 1227 JMP(LabelRef("search_loop_" + name)) 1228 Label("match_length_ok_" + name) 1229 } 1230 // Store updated repeat 1231 MOVL(offset32, repeatL) 1232 // Emit.... 1233 o.emitLiteralsDstP(nextEmitL, base, src, dst, "match_emit_"+name) 1234 // s += length (length is destroyed, use it now) 1235 ADDL(length.As32(), s) 1236 1237 // length += 4 1238 ADDL(U8(4), length.As32()) 1239 MOVL(s, nextEmitL) // nextEmit = s 1240 o.emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name)) 1241 // Jumps at end 1242 } 1243 // REPEAT 1244 { 1245 Label("match_is_repeat_" + name) 1246 // Emit.... 1247 o.emitLiteralsDstP(nextEmitL, base, src, dst, "match_emit_repeat_"+name) 1248 // s += length (length is destroyed, use it now) 1249 ADDL(length.As32(), s) 1250 1251 // length += 4 1252 ADDL(U8(4), length.As32()) 1253 MOVL(s, nextEmitL) // nextEmit = s 1254 o.emitRepeat("match_nolit_repeat_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name)) 1255 } 1256 } 1257 Label("match_nolit_emitcopy_end_" + name) 1258 1259 // if s >= sLimit { end } 1260 { 1261 CMPL(s.As32(), sLimitL) 1262 JGE(LabelRef("emit_remainder_" + name)) 1263 } 1264 1265 // Bail if we exceed the maximum size. 1266 { 1267 CMPQ(dst, dstLimitPtrQ) 1268 JL(LabelRef("match_nolit_dst_ok_" + name)) 1269 ri, err := ReturnIndex(0).Resolve() 1270 if err != nil { 1271 panic(err) 1272 } 1273 MOVQ(U32(0), ri.Addr) 1274 RET() 1275 } 1276 } 1277 Label("match_nolit_dst_ok_" + name) 1278 // cv must be set to value at base+1 before arriving here 1279 if true { 1280 lHasher := hashN(lHashBytes, lTableBits) 1281 sHasher := hashN(sHashBytes, sTableBits) 1282 1283 // Index base+1 long, base+2 short... 
1284 cv := GP64() 1285 INCL(base) 1286 MOVQ(Mem{Base: src, Index: base, Scale: 1, Disp: 0}, cv) 1287 hash0, hash1, hash2, hash3 := GP64(), GP64(), GP64(), GP64() 1288 MOVQ(cv, hash0) // src[base+1] 1289 MOVQ(cv, hash1) 1290 MOVQ(cv, hash2) 1291 SHRQ(U8(8), hash1) // src[base+2] 1292 MOVQ(hash1, hash3) 1293 SHRQ(U8(16), hash2) // src[base+3] 1294 bp1, bp2 := GP32(), GP32() // base+1 1295 LEAL(Mem{Base: base, Disp: 1}, bp1) 1296 LEAL(Mem{Base: base, Disp: 2}, bp2) 1297 1298 // Load s-2 early 1299 MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, cv) 1300 1301 lHasher.hash(hash0) 1302 lHasher.hash(hash3) 1303 sHasher.hash(hash1) 1304 sHasher.hash(hash2) 1305 assert(func(ok LabelRef) { 1306 CMPQ(hash0, U32(lTableSize)) 1307 JL(ok) 1308 }) 1309 assert(func(ok LabelRef) { 1310 CMPQ(hash3, U32(lTableSize)) 1311 JL(ok) 1312 }) 1313 assert(func(ok LabelRef) { 1314 CMPQ(hash1, U32(sTableSize)) 1315 JL(ok) 1316 }) 1317 assert(func(ok LabelRef) { 1318 CMPQ(hash2, U32(sTableSize)) 1319 JL(ok) 1320 }) 1321 MOVL(base, lTab.Idx(hash0, 4)) 1322 MOVL(bp1, lTab.Idx(hash3, 4)) 1323 MOVL(bp1, sTab.Idx(hash1, 4)) 1324 MOVL(bp2, sTab.Idx(hash2, 4)) 1325 1326 // Index s-2 long, s-1 long+short... 1327 MOVQ(cv, hash0) // src[s-2] 1328 MOVQ(cv, hash1) // src[s-1] 1329 SHRQ(U8(8), hash1) 1330 MOVQ(hash1, hash3) 1331 sm1, sm2 := GP32(), GP32() // s -1, s - 2 1332 LEAL(Mem{Base: s, Disp: -2}, sm2) 1333 LEAL(Mem{Base: s, Disp: -1}, sm1) 1334 lHasher.hash(hash0) 1335 sHasher.hash(hash1) 1336 lHasher.hash(hash3) 1337 assert(func(ok LabelRef) { 1338 CMPQ(hash0, U32(lTableSize)) 1339 JL(ok) 1340 }) 1341 assert(func(ok LabelRef) { 1342 CMPQ(hash3, U32(lTableSize)) 1343 JL(ok) 1344 }) 1345 assert(func(ok LabelRef) { 1346 CMPQ(hash1, U32(sTableSize)) 1347 JL(ok) 1348 }) 1349 MOVL(sm2, lTab.Idx(hash0, 4)) 1350 MOVL(sm1, sTab.Idx(hash1, 4)) 1351 MOVL(sm1, lTab.Idx(hash3, 4)) 1352 } 1353 JMP(LabelRef("search_loop_" + name)) 1354 1355 Label("emit_remainder_" + name) 1356 // Bail if we exceed the maximum size. 1357 // if d+len(src)-nextEmitL > dstLimitPtrQ { return 0 1358 { 1359 // remain = len(src) - nextEmit 1360 remain := GP64() 1361 MOVQ(lenSrcQ, remain) 1362 SUBL(nextEmitL, remain.As32()) 1363 1364 dstExpect := GP64() 1365 // dst := dst + (len(src)-nextEmitL) 1366 1367 LEAQ(Mem{Base: dst, Index: remain, Scale: 1, Disp: literalMaxOverhead}, dstExpect) 1368 CMPQ(dstExpect, dstLimitPtrQ) 1369 JL(LabelRef("emit_remainder_ok_" + name)) 1370 ri, err := ReturnIndex(0).Resolve() 1371 if err != nil { 1372 panic(err) 1373 } 1374 MOVQ(U32(0), ri.Addr) 1375 RET() 1376 Label("emit_remainder_ok_" + name) 1377 } 1378 // emitLiteral(dst[d:], src[nextEmitL:]) 1379 emitEnd := GP64() 1380 MOVQ(lenSrcQ, emitEnd) 1381 1382 // Emit final literals. 
1383 o.emitLiteralsDstP(nextEmitL, emitEnd, src, dst, "emit_remainder_"+name) 1384 1385 // Assert size is < limit 1386 assert(func(ok LabelRef) { 1387 // if dstBaseQ < dstLimitPtrQ: ok 1388 CMPQ(dst, dstLimitPtrQ) 1389 JL(ok) 1390 }) 1391 1392 // length := start - base (ptr arithmetic) 1393 length := GP64() 1394 dstBase := Load(Param("dst").Base(), GP64()) 1395 MOVQ(dst, length) 1396 SUBQ(dstBase, length) 1397 1398 // Assert size is < len(src) 1399 assert(func(ok LabelRef) { 1400 // if len(src) >= length: ok 1401 CMPQ(lenSrcQ, length) 1402 JGE(ok) 1403 }) 1404 // Assert size is < len(dst) 1405 assert(func(ok LabelRef) { 1406 // if len(dst) >= length: ok 1407 CMPQ(lenDstQ, length) 1408 JGE(ok) 1409 }) 1410 Store(length, ReturnIndex(0)) 1411 RET() 1412} 1413 1414// emitLiterals emits literals from nextEmit to base, updates nextEmit, dstBase. 1415// Checks if base == nextemit. 1416// src & base are untouched. 1417func (o options) emitLiterals(nextEmitL Mem, base reg.GPVirtual, src reg.GPVirtual, dstBase Mem, name string) { 1418 nextEmit, litLen, dstBaseTmp, litBase := GP32(), GP32(), GP64(), GP64() 1419 MOVL(nextEmitL, nextEmit) 1420 CMPL(nextEmit, base.As32()) 1421 JEQ(LabelRef("emit_literal_skip_" + name)) 1422 MOVL(base.As32(), litLen.As32()) 1423 1424 // Base is now next emit. 1425 MOVL(base.As32(), nextEmitL) 1426 1427 // litBase = src[nextEmitL:] 1428 LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase) 1429 SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit 1430 1431 // Load (and store when we return) 1432 MOVQ(dstBase, dstBaseTmp) 1433 o.emitLiteral(name, litLen, nil, dstBaseTmp, litBase, LabelRef("emit_literal_done_"+name), true) 1434 Label("emit_literal_done_" + name) 1435 1436 // Emitted length must be > litlen. 1437 // We have already checked for len(0) above. 1438 assert(func(ok LabelRef) { 1439 tmp := GP64() 1440 MOVQ(dstBaseTmp, tmp) 1441 SUBQ(dstBase, tmp) // tmp = dstBaseTmp - dstBase 1442 // if tmp > litLen: ok 1443 CMPQ(tmp, litLen.As64()) 1444 JG(ok) 1445 }) 1446 // Store updated dstBase 1447 MOVQ(dstBaseTmp, dstBase) 1448 Label("emit_literal_skip_" + name) 1449} 1450 1451// emitLiterals emits literals from nextEmit to base, updates nextEmit, dstBase. 1452// Checks if base == nextemit. 1453// src & base are untouched. 1454func (o options) emitLiteralsDstP(nextEmitL Mem, base reg.GPVirtual, src, dst reg.GPVirtual, name string) { 1455 nextEmit, litLen, litBase := GP32(), GP32(), GP64() 1456 MOVL(nextEmitL, nextEmit) 1457 CMPL(nextEmit, base.As32()) 1458 JEQ(LabelRef("emit_literal_done_" + name)) 1459 MOVL(base.As32(), litLen.As32()) 1460 1461 // Base is now next emit. 1462 MOVL(base.As32(), nextEmitL) 1463 1464 // litBase = src[nextEmitL:] 1465 LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase) 1466 SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit 1467 1468 // Load (and store when we return) 1469 o.emitLiteral(name, litLen, nil, dst, litBase, LabelRef("emit_literal_done_"+name), true) 1470 Label("emit_literal_done_" + name) 1471} 1472 1473type hashGen struct { 1474 bytes int 1475 tablebits int 1476 mulreg reg.GPVirtual 1477} 1478 1479// hashN uses multiply to get a 'output' hash on the hash of the lowest 'bytes' bytes in value. 
1480func hashN(hashBytes, tablebits int) hashGen { 1481 h := hashGen{ 1482 bytes: hashBytes, 1483 tablebits: tablebits, 1484 mulreg: GP64(), 1485 } 1486 primebytes := uint64(0) 1487 switch hashBytes { 1488 case 3: 1489 primebytes = 506832829 1490 case 4: 1491 primebytes = 2654435761 1492 case 5: 1493 primebytes = 889523592379 1494 case 6: 1495 primebytes = 227718039650203 1496 case 7: 1497 primebytes = 58295818150454627 1498 case 8: 1499 primebytes = 0xcf1bbcdcb7a56463 1500 default: 1501 panic("invalid hash length") 1502 } 1503 MOVQ(Imm(primebytes), h.mulreg) 1504 return h 1505} 1506 1507// hash uses multiply to get hash of the value. 1508func (h hashGen) hash(val reg.GPVirtual) { 1509 // Move value to top of register. 1510 if h.bytes < 8 { 1511 SHLQ(U8(64-8*h.bytes), val) 1512 } 1513 IMULQ(h.mulreg, val) 1514 // Move value to bottom 1515 SHRQ(U8(64-h.tablebits), val) 1516} 1517 1518func (o options) genEmitLiteral() { 1519 TEXT("emitLiteral", NOSPLIT, "func(dst, lit []byte) int") 1520 Doc("emitLiteral writes a literal chunk and returns the number of bytes written.", "", 1521 "It assumes that:", 1522 " dst is long enough to hold the encoded bytes", 1523 " 0 <= len(lit) && len(lit) <= math.MaxUint32", "") 1524 Pragma("noescape") 1525 1526 dstBase, litBase, litLen, retval := GP64(), GP64(), GP64(), GP64() 1527 Load(Param("lit").Len(), litLen) 1528 Load(Param("dst").Base(), dstBase) 1529 Load(Param("lit").Base(), litBase) 1530 TESTQ(litLen, litLen) 1531 JZ(LabelRef("emit_literal_end_standalone_skip")) 1532 o.emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_standalone", false) 1533 1534 Label("emit_literal_end_standalone_skip") 1535 XORQ(retval, retval) 1536 1537 Label("emit_literal_end_standalone") 1538 Store(retval, ReturnIndex(0)) 1539 RET() 1540 1541} 1542 1543// emitLiteral can be used for inlining an emitLiteral call. 1544// litLen must be > 0. 1545// stack must have at least 32 bytes. 1546// retval will contain emitted bytes, but can be nil if this is not interesting. 1547// dstBase and litBase are updated. 1548// Uses 2 GP registers. With AVX 4 registers. 1549// If updateDst is true dstBase will have the updated end pointer and an additional register will be used. 1550func (o options) emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, updateDst bool) { 1551 n := GP32() 1552 n16 := GP32() 1553 1554 // litLen must be > 0 1555 assert(func(ok LabelRef) { 1556 TESTL(litLen.As32(), litLen.As32()) 1557 JNZ(ok) 1558 }) 1559 1560 // We always add litLen bytes 1561 if retval != nil { 1562 MOVL(litLen.As32(), retval.As32()) 1563 } 1564 // n = litlen - 1 1565 LEAL(Mem{Base: litLen.As32(), Disp: -1}, n) 1566 1567 // Find number of bytes to emit for tag. 
1568 CMPL(n.As32(), U8(60)) 1569 JLT(LabelRef("one_byte_" + name)) 1570 CMPL(n.As32(), U32(1<<8)) 1571 JLT(LabelRef("two_bytes_" + name)) 1572 if o.maxLen >= 1<<16 { 1573 CMPL(n.As32(), U32(1<<16)) 1574 JLT(LabelRef("three_bytes_" + name)) 1575 } else { 1576 JMP(LabelRef("three_bytes_" + name)) 1577 } 1578 if o.maxLen >= 1<<16 { 1579 if o.maxLen >= 1<<24 { 1580 CMPL(n.As32(), U32(1<<24)) 1581 JLT(LabelRef("four_bytes_" + name)) 1582 } else { 1583 JMP(LabelRef("four_bytes_" + name)) 1584 } 1585 } 1586 if o.maxLen >= 1<<24 { 1587 Label("five_bytes_" + name) 1588 MOVB(U8(252), Mem{Base: dstBase}) 1589 MOVL(n.As32(), Mem{Base: dstBase, Disp: 1}) 1590 if retval != nil { 1591 ADDQ(U8(5), retval) 1592 } 1593 ADDQ(U8(5), dstBase) 1594 JMP(LabelRef("memmove_long_" + name)) 1595 } 1596 if o.maxLen >= 1<<16 { 1597 Label("four_bytes_" + name) 1598 MOVL(n, n16) 1599 SHRL(U8(16), n16.As32()) 1600 MOVB(U8(248), Mem{Base: dstBase}) 1601 MOVW(n.As16(), Mem{Base: dstBase, Disp: 1}) 1602 MOVB(n16.As8(), Mem{Base: dstBase, Disp: 3}) 1603 if retval != nil { 1604 ADDQ(U8(4), retval) 1605 } 1606 ADDQ(U8(4), dstBase) 1607 JMP(LabelRef("memmove_long_" + name)) 1608 } 1609 Label("three_bytes_" + name) 1610 MOVB(U8(0xf4), Mem{Base: dstBase}) 1611 MOVW(n.As16(), Mem{Base: dstBase, Disp: 1}) 1612 if retval != nil { 1613 ADDQ(U8(3), retval) 1614 } 1615 ADDQ(U8(3), dstBase) 1616 JMP(LabelRef("memmove_long_" + name)) 1617 1618 Label("two_bytes_" + name) 1619 MOVB(U8(0xf0), Mem{Base: dstBase}) 1620 MOVB(n.As8(), Mem{Base: dstBase, Disp: 1}) 1621 if retval != nil { 1622 ADDQ(U8(2), retval) 1623 } 1624 ADDQ(U8(2), dstBase) 1625 CMPL(n.As32(), U8(64)) 1626 JL(LabelRef("memmove_" + name)) 1627 JMP(LabelRef("memmove_long_" + name)) 1628 1629 Label("one_byte_" + name) 1630 SHLB(U8(2), n.As8()) 1631 MOVB(n.As8(), Mem{Base: dstBase}) 1632 if retval != nil { 1633 ADDQ(U8(1), retval) 1634 } 1635 ADDQ(U8(1), dstBase) 1636 // Fallthrough 1637 1638 Label("memmove_" + name) 1639 1640 // copy(dst[i:], lit) 1641 dstEnd := GP64() 1642 copyEnd := end 1643 if updateDst { 1644 copyEnd = LabelRef("memmove_end_copy_" + name) 1645 LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd) 1646 } 1647 length := GP64() 1648 MOVL(litLen.As32(), length.As32()) 1649 1650 // updates litBase. 1651 o.genMemMoveShort("emit_lit_memmove_"+name, dstBase, litBase, length, copyEnd) 1652 1653 if updateDst { 1654 Label("memmove_end_copy_" + name) 1655 MOVQ(dstEnd, dstBase) 1656 } 1657 JMP(end) 1658 1659 // > 64 bytes 1660 Label("memmove_long_" + name) 1661 1662 // copy(dst[i:], lit) 1663 dstEnd = GP64() 1664 copyEnd = end 1665 if updateDst { 1666 copyEnd = LabelRef("memmove_end_copy_long_" + name) 1667 LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd) 1668 } 1669 length = GP64() 1670 MOVL(litLen.As32(), length.As32()) 1671 1672 // updates litBase. 1673 o.genMemMoveLong("emit_lit_memmove_long_"+name, dstBase, litBase, length, copyEnd) 1674 1675 if updateDst { 1676 Label("memmove_end_copy_long_" + name) 1677 MOVQ(dstEnd, dstBase) 1678 } 1679 JMP(end) 1680 // Should be unreachable 1681 if debug { 1682 INT(Imm(3)) 1683 } 1684 return 1685} 1686 1687// genEmitRepeat generates a standlone emitRepeat. 
1688func (o options) genEmitRepeat() { 1689 TEXT("emitRepeat", NOSPLIT, "func(dst []byte, offset, length int) int") 1690 Doc("emitRepeat writes a repeat chunk and returns the number of bytes written.", 1691 "Length must be at least 4 and < 1<<32", "") 1692 Pragma("noescape") 1693 1694 dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64() 1695 1696 // retval = 0 1697 XORQ(retval, retval) 1698 1699 Load(Param("dst").Base(), dstBase) 1700 Load(Param("offset"), offset) 1701 Load(Param("length"), length) 1702 o.emitRepeat("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_repeat_end")) 1703 Label("gen_emit_repeat_end") 1704 Store(retval, ReturnIndex(0)) 1705 RET() 1706} 1707 1708// emitRepeat can be used for inlining an emitRepeat call. 1709// length >= 4 and < 1<<32 1710// length is modified. dstBase is updated. retval is added to input. 1711// retval can be nil. 1712// Will jump to end label when finished. 1713// Uses 1 GP register. 1714func (o options) emitRepeat(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) { 1715 Comment("emitRepeat") 1716 Label("emit_repeat_again_" + name) 1717 tmp := GP32() 1718 MOVL(length.As32(), tmp) // Copy length 1719 // length -= 4 1720 LEAL(Mem{Base: length, Disp: -4}, length.As32()) 1721 1722 // if length <= 4 (use copied value) 1723 CMPL(tmp.As32(), U8(8)) 1724 JLE(LabelRef("repeat_two_" + name)) 1725 1726 // length < 8 && offset < 2048 1727 CMPL(tmp.As32(), U8(12)) 1728 JGE(LabelRef("cant_repeat_two_offset_" + name)) 1729 if o.maxLen >= 2048 { 1730 CMPL(offset.As32(), U32(2048)) 1731 JLT(LabelRef("repeat_two_offset_" + name)) 1732 } 1733 1734 const maxRepeat = ((1 << 24) - 1) + 65536 1735 Label("cant_repeat_two_offset_" + name) 1736 CMPL(length.As32(), U32((1<<8)+4)) 1737 JLT(LabelRef("repeat_three_" + name)) // if length < (1<<8)+4 1738 if o.maxLen >= (1<<16)+(1<<8) { 1739 CMPL(length.As32(), U32((1<<16)+(1<<8))) 1740 JLT(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8) 1741 } else { 1742 // Not needed, we should skip to it when generating. 1743 // JMP(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8) 1744 } 1745 if o.maxLen >= maxRepeat { 1746 CMPL(length.As32(), U32(maxRepeat)) 1747 JLT(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent. 1748 1749 // We have have more than 24 bits 1750 // Emit so we have at least 4 bytes left. 1751 LEAL(Mem{Base: length, Disp: -(maxRepeat - 4)}, length.As32()) // length -= (maxRepeat - 4) 1752 MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0 1753 MOVW(U16(65531), Mem{Base: dstBase, Disp: 2}) // 0xfffb 1754 MOVB(U8(255), Mem{Base: dstBase, Disp: 4}) 1755 ADDQ(U8(5), dstBase) 1756 if retval != nil { 1757 ADDQ(U8(5), retval) 1758 } 1759 JMP(LabelRef("emit_repeat_again_" + name)) 1760 } else { 1761 // Not needed. 1762 // JMP(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent. 1763 } 1764 1765 // Must be able to be within 5 bytes. 
1766 if o.maxLen >= (1<<16)+(1<<8) { 1767 Label("repeat_five_" + name) 1768 LEAL(Mem{Base: length, Disp: -65536}, length.As32()) // length -= 65536 1769 MOVL(length.As32(), offset.As32()) 1770 MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0 1771 MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8) 1772 SARL(U8(16), offset.As32()) // offset = length >> 16 1773 MOVB(offset.As8(), Mem{Base: dstBase, Disp: 4}) // dst[4] = length >> 16 1774 if retval != nil { 1775 ADDQ(U8(5), retval) // i += 5 1776 } 1777 ADDQ(U8(5), dstBase) // dst += 5 1778 JMP(end) 1779 } 1780 Label("repeat_four_" + name) 1781 LEAL(Mem{Base: length, Disp: -256}, length.As32()) // length -= 256 1782 MOVW(U16(6<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 6<<2 | tagCopy1, dst[1] = 0 1783 MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8) 1784 if retval != nil { 1785 ADDQ(U8(4), retval) // i += 4 1786 } 1787 ADDQ(U8(4), dstBase) // dst += 4 1788 JMP(end) 1789 1790 Label("repeat_three_" + name) 1791 LEAL(Mem{Base: length, Disp: -4}, length.As32()) // length -= 4 1792 MOVW(U16(5<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 5<<2 | tagCopy1, dst[1] = 0 1793 MOVB(length.As8(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length) 1794 if retval != nil { 1795 ADDQ(U8(3), retval) // i += 3 1796 } 1797 ADDQ(U8(3), dstBase) // dst += 3 1798 JMP(end) 1799 1800 Label("repeat_two_" + name) 1801 // dst[0] = uint8(length)<<2 | tagCopy1, dst[1] = 0 1802 SHLL(U8(2), length.As32()) 1803 ORL(U8(tagCopy1), length.As32()) 1804 MOVW(length.As16(), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0 1805 if retval != nil { 1806 ADDQ(U8(2), retval) // i += 2 1807 } 1808 ADDQ(U8(2), dstBase) // dst += 2 1809 JMP(end) 1810 1811 Label("repeat_two_offset_" + name) 1812 // Emit the remaining copy, encoded as 2 bytes. 1813 // dst[1] = uint8(offset) 1814 // dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 1815 tmp = GP64() 1816 XORQ(tmp, tmp) 1817 // Use scale and displacement to shift and subtract values from length. 1818 LEAL(Mem{Base: tmp, Index: length, Scale: 4, Disp: tagCopy1}, length.As32()) 1819 MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte 1820 SARL(U8(8), offset.As32()) // Remove lower 1821 SHLL(U8(5), offset.As32()) // Shift back up 1822 ORL(offset.As32(), length.As32()) // OR result 1823 MOVB(length.As8(), Mem{Base: dstBase, Disp: 0}) 1824 if retval != nil { 1825 ADDQ(U8(2), retval) // i += 2 1826 } 1827 ADDQ(U8(2), dstBase) // dst += 2 1828 1829 JMP(end) 1830} 1831 1832// emitCopy writes a copy chunk and returns the number of bytes written. 
1833// 1834// It assumes that: 1835// dst is long enough to hold the encoded bytes 1836// 1 <= offset && offset <= math.MaxUint32 1837// 4 <= length && length <= 1 << 24 1838 1839// genEmitCopy generates a standlone emitCopy 1840func (o options) genEmitCopy() { 1841 TEXT("emitCopy", NOSPLIT, "func(dst []byte, offset, length int) int") 1842 Doc("emitCopy writes a copy chunk and returns the number of bytes written.", "", 1843 "It assumes that:", 1844 " dst is long enough to hold the encoded bytes", 1845 " 1 <= offset && offset <= math.MaxUint32", 1846 " 4 <= length && length <= 1 << 24", "") 1847 Pragma("noescape") 1848 1849 dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64() 1850 1851 // i := 0 1852 XORQ(retval, retval) 1853 Load(Param("dst").Base(), dstBase) 1854 Load(Param("offset"), offset) 1855 Load(Param("length"), length) 1856 o.emitCopy("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_copy_end")) 1857 Label("gen_emit_copy_end") 1858 Store(retval, ReturnIndex(0)) 1859 RET() 1860} 1861 1862// emitCopy writes a copy chunk and returns the number of bytes written. 1863// 1864// It assumes that: 1865// dst is long enough to hold the encoded bytes 1866// 1 <= offset && offset <= math.MaxUint32 1867// 4 <= length && length <= 1 << 24 1868 1869// genEmitCopy generates a standlone emitCopy 1870func (o options) genEmitCopyNoRepeat() { 1871 TEXT("emitCopyNoRepeat", NOSPLIT, "func(dst []byte, offset, length int) int") 1872 Doc("emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.", "", 1873 "It assumes that:", 1874 " dst is long enough to hold the encoded bytes", 1875 " 1 <= offset && offset <= math.MaxUint32", 1876 " 4 <= length && length <= 1 << 24", "") 1877 Pragma("noescape") 1878 1879 dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64() 1880 1881 // i := 0 1882 XORQ(retval, retval) 1883 1884 Load(Param("dst").Base(), dstBase) 1885 Load(Param("offset"), offset) 1886 Load(Param("length"), length) 1887 o.emitCopy("standalone_snappy", length, offset, retval, dstBase, "gen_emit_copy_end_snappy") 1888 Label("gen_emit_copy_end_snappy") 1889 Store(retval, ReturnIndex(0)) 1890 RET() 1891} 1892 1893const ( 1894 tagLiteral = 0x00 1895 tagCopy1 = 0x01 1896 tagCopy2 = 0x02 1897 tagCopy4 = 0x03 1898) 1899 1900// emitCopy can be used for inlining an emitCopy call. 1901// length is modified (and junk). dstBase is updated. retval is added to input. 1902// retval can be nil. 1903// Will jump to end label when finished. 1904// Uses 2 GP registers. 1905func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) { 1906 Comment("emitCopy") 1907 1908 if o.maxLen >= 65536 { 1909 //if offset >= 65536 { 1910 CMPL(offset.As32(), U32(65536)) 1911 JL(LabelRef("two_byte_offset_" + name)) 1912 1913 // offset is >= 65536 1914 // if length <= 64 goto four_bytes_remain_ 1915 Label("four_bytes_loop_back_" + name) 1916 CMPL(length.As32(), U8(64)) 1917 JLE(LabelRef("four_bytes_remain_" + name)) 1918 1919 // Emit a length 64 copy, encoded as 5 bytes. 
1920 // dst[0] = 63<<2 | tagCopy4 1921 MOVB(U8(63<<2|tagCopy4), Mem{Base: dstBase}) 1922 // dst[4] = uint8(offset >> 24) 1923 // dst[3] = uint8(offset >> 16) 1924 // dst[2] = uint8(offset >> 8) 1925 // dst[1] = uint8(offset) 1926 MOVL(offset.As32(), Mem{Base: dstBase, Disp: 1}) 1927 // length -= 64 1928 LEAL(Mem{Base: length, Disp: -64}, length.As32()) 1929 if retval != nil { 1930 ADDQ(U8(5), retval) // i+=5 1931 } 1932 ADDQ(U8(5), dstBase) // dst+=5 1933 1934 // if length >= 4 { 1935 CMPL(length.As32(), U8(4)) 1936 JL(LabelRef("four_bytes_remain_" + name)) 1937 1938 // Emit remaining as repeats 1939 // return 5 + emitRepeat(dst[5:], offset, length) 1940 // Inline call to emitRepeat. Will jump to end 1941 if !o.snappy { 1942 o.emitRepeat(name+"_emit_copy", length, offset, retval, dstBase, end) 1943 } 1944 JMP(LabelRef("four_bytes_loop_back_" + name)) 1945 1946 Label("four_bytes_remain_" + name) 1947 // if length == 0 { 1948 // return i 1949 // } 1950 TESTL(length.As32(), length.As32()) 1951 JZ(end) 1952 1953 // Emit a copy, offset encoded as 4 bytes. 1954 // dst[i+0] = uint8(length-1)<<2 | tagCopy4 1955 // dst[i+1] = uint8(offset) 1956 // dst[i+2] = uint8(offset >> 8) 1957 // dst[i+3] = uint8(offset >> 16) 1958 // dst[i+4] = uint8(offset >> 24) 1959 tmp := GP64() 1960 MOVB(U8(tagCopy4), tmp.As8()) 1961 // Use displacement to subtract 1 from upshifted length. 1962 LEAL(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length.As32()) 1963 MOVB(length.As8(), Mem{Base: dstBase}) 1964 MOVL(offset.As32(), Mem{Base: dstBase, Disp: 1}) 1965 // return i + 5 1966 if retval != nil { 1967 ADDQ(U8(5), retval) 1968 } 1969 ADDQ(U8(5), dstBase) 1970 JMP(end) 1971 } 1972 Label("two_byte_offset_" + name) 1973 // Offset no more than 2 bytes. 1974 1975 //if length > 64 { 1976 CMPL(length.As32(), U8(64)) 1977 JLE(LabelRef("two_byte_offset_short_" + name)) 1978 // Emit a length 60 copy, encoded as 3 bytes. 1979 // Emit remaining as repeat value (minimum 4 bytes). 1980 // dst[2] = uint8(offset >> 8) 1981 // dst[1] = uint8(offset) 1982 // dst[0] = 59<<2 | tagCopy2 1983 MOVB(U8(59<<2|tagCopy2), Mem{Base: dstBase}) 1984 MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1}) 1985 // length -= 60 1986 LEAL(Mem{Base: length, Disp: -60}, length.As32()) 1987 1988 // Emit remaining as repeats, at least 4 bytes remain. 1989 // return 3 + emitRepeat(dst[3:], offset, length) 1990 //} 1991 ADDQ(U8(3), dstBase) 1992 if retval != nil { 1993 ADDQ(U8(3), retval) 1994 } 1995 // Inline call to emitRepeat. Will jump to end 1996 if !o.snappy { 1997 o.emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end) 1998 } 1999 JMP(LabelRef("two_byte_offset_" + name)) 2000 2001 Label("two_byte_offset_short_" + name) 2002 //if length >= 12 || offset >= 2048 { 2003 CMPL(length.As32(), U8(12)) 2004 JGE(LabelRef("emit_copy_three_" + name)) 2005 if o.maxLen >= 2048 { 2006 CMPL(offset.As32(), U32(2048)) 2007 JGE(LabelRef("emit_copy_three_" + name)) 2008 } 2009 // Emit the remaining copy, encoded as 2 bytes. 2010 // dst[1] = uint8(offset) 2011 // dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 2012 tmp := GP64() 2013 MOVB(U8(tagCopy1), tmp.As8()) 2014 // Use scale and displacement to shift and subtract values from length. 
	LEAL(Mem{Base: tmp, Index: length, Scale: 4, Disp: -(4 << 2)}, length.As32())
	MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
	SHRL(U8(8), offset.As32())                      // Remove lower
	SHLL(U8(5), offset.As32())                      // Shift back up
	ORL(offset.As32(), length.As32())               // OR result
	MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
	if retval != nil {
		ADDQ(U8(2), retval) // i += 2
	}
	ADDQ(U8(2), dstBase) // dst += 2
	// return 2
	JMP(end)

	Label("emit_copy_three_" + name)
	// Emit the remaining copy, encoded as 3 bytes.
	// dst[2] = uint8(offset >> 8)
	// dst[1] = uint8(offset)
	// dst[0] = uint8(length-1)<<2 | tagCopy2
	tmp = GP64()
	MOVB(U8(tagCopy2), tmp.As8())
	LEAL(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length.As32())
	MOVB(length.As8(), Mem{Base: dstBase})
	MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
	// return 3
	if retval != nil {
		ADDQ(U8(3), retval) // i += 3
	}
	ADDQ(U8(3), dstBase) // dst += 3
	JMP(end)
}

// func memmove(to, from unsafe.Pointer, n uintptr)
// src and dst may not overlap.
// Non-AVX uses 2 GP registers, 16 SSE2 registers.
// AVX uses 4 GP registers, 16 AVX/SSE registers.
// All passed registers may be updated.
// Length must be 1 -> 64 bytes
func (o options) genMemMoveShort(name string, dst, src, length reg.GPVirtual, end LabelRef) {
	Comment("genMemMoveShort")
	AX, CX := GP64(), GP64()
	name += "_memmove_"

	// Only enable if length can be 0.
	if false {
		TESTQ(length, length)
		JEQ(end)
	}
	assert(func(ok LabelRef) {
		CMPQ(length, U8(64))
		JBE(ok)
	})
	assert(func(ok LabelRef) {
		TESTQ(length, length)
		JNZ(ok)
	})
	Label(name + "tail")
	CMPQ(length, U8(3))
	JB(LabelRef(name + "move_1or2"))
	JE(LabelRef(name + "move_3"))
	CMPQ(length, U8(8))
	JB(LabelRef(name + "move_4through7"))
	CMPQ(length, U8(16))
	JBE(LabelRef(name + "move_8through16"))
	CMPQ(length, U8(32))
	JBE(LabelRef(name + "move_17through32"))
	if debug {
		CMPQ(length, U8(64))
		JBE(LabelRef(name + "move_33through64"))
		INT(U8(3))
	}
	JMP(LabelRef(name + "move_33through64"))

	//genMemMoveLong(name, dst, src, length, end)

	Label(name + "move_1or2")
	MOVB(Mem{Base: src}, AX.As8())
	MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8())
	MOVB(AX.As8(), Mem{Base: dst})
	MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_3")
	MOVW(Mem{Base: src}, AX.As16())
	MOVB(Mem{Base: src, Disp: 2}, CX.As8())
	MOVW(AX.As16(), Mem{Base: dst})
	MOVB(CX.As8(), Mem{Base: dst, Disp: 2})
	JMP(end)

	Label(name + "move_4through7")
	MOVL(Mem{Base: src}, AX.As32())
	MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32())
	MOVL(AX.As32(), Mem{Base: dst})
	MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_8through16")
	MOVQ(Mem{Base: src}, AX)
	MOVQ(Mem{Base: src, Disp: -8, Index: length, Scale: 1}, CX)
	MOVQ(AX, Mem{Base: dst})
	MOVQ(CX, Mem{Base: dst, Disp: -8, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_17through32")
	X0, X1, X2, X3 := XMM(), XMM(), XMM(), XMM()

	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X1)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_33through64")
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2)
	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: -32, Index: length, Scale: 1})
	MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
	JMP(end)
}

// func genMemMoveLong(to, from unsafe.Pointer, n uintptr)
// src and dst may not overlap.
// length must be >= 64 bytes.
// Non-AVX uses 2 GP registers, 16 SSE2 registers.
// AVX uses 4 GP registers, 16 AVX/SSE registers.
// All passed registers may be updated.
func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end LabelRef) {
	Comment("genMemMoveLong")
	name += "large_"

	assert(func(ok LabelRef) {
		CMPQ(length, U8(64))
		JAE(ok)
	})

	// These are disabled.
	// AVX is ever so slightly faster, but it is disabled for simplicity.
	const branchLoops = false
	const avx = false && branchLoops
	if branchLoops {
		CMPQ(length, U8(128))
		JBE(LabelRef(name + "move_65through128"))
		CMPQ(length, U32(256))
		JBE(LabelRef(name + "move_129through256"))
		if avx {
			JMP(LabelRef(name + "avxUnaligned"))
		} else {
			JMP(LabelRef(name + "forward_sse"))
		}

		X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
		X8, X9, X10, X11, X12, X13, X14, X15 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
		Label(name + "move_65through128")
		MOVOU(Mem{Base: src}, X0)
		MOVOU(Mem{Base: src, Disp: 16}, X1)
		MOVOU(Mem{Base: src, Disp: 32}, X2)
		MOVOU(Mem{Base: src, Disp: 48}, X3)
		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
		MOVOU(X0, Mem{Base: dst})
		MOVOU(X1, Mem{Base: dst, Disp: 16})
		MOVOU(X2, Mem{Base: dst, Disp: 32})
		MOVOU(X3, Mem{Base: dst, Disp: 48})
		MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
		MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
		MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
		MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
		JMP(end)

		Label(name + "move_129through256")
		MOVOU(Mem{Base: src}, X0)
		MOVOU(Mem{Base: src, Disp: 16}, X1)
		MOVOU(Mem{Base: src, Disp: 32}, X2)
		MOVOU(Mem{Base: src, Disp: 48}, X3)
		MOVOU(Mem{Base: src, Disp: 64}, X4)
		MOVOU(Mem{Base: src, Disp: 80}, X5)
		MOVOU(Mem{Base: src, Disp: 96}, X6)
		MOVOU(Mem{Base: src, Disp: 112}, X7)
		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -128}, X8)
		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -112}, X9)
		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -96}, X10)
		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -80}, X11)
		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
		MOVOU(X0, Mem{Base: dst})
		MOVOU(X1, Mem{Base: dst, Disp: 16})
		MOVOU(X2, Mem{Base: dst, Disp: 32})
		MOVOU(X3, Mem{Base: dst, Disp: 48})
		MOVOU(X4, Mem{Base: dst, Disp: 64})
		MOVOU(X5, Mem{Base: dst, Disp: 80})
		MOVOU(X6, Mem{Base: dst, Disp: 96})
		MOVOU(X7, Mem{Base: dst, Disp: 112})
		MOVOU(X8, Mem{Base: dst, Index: length, Scale: 1, Disp: -128})
		MOVOU(X9, Mem{Base: dst, Index: length, Scale: 1, Disp: -112})
		MOVOU(X10, Mem{Base: dst, Index: length, Scale: 1, Disp: -96})
		MOVOU(X11, Mem{Base: dst, Index: length, Scale: 1, Disp: -80})
		MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
		MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
		MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
		MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
		JMP(end)
		if avx {
			Label(name + "avxUnaligned")
			AX, CX, R8, R10 := GP64(), GP64(), GP64(), GP64()
			// Memory layout on the source side
			// src                                       CX
			// |<---------length before correction--------->|
			// |       |<--length corrected-->|             |
			// |       |                  |<--- AX  --->|
			// |<-R11->|                  |<-128 bytes->|
			// +----------------------------------------+
			// | Head  | Body             | Tail        |
			// +-------+------------------+-------------+
			// ^       ^                  ^
			// |       |                  |
			// Save head into Y4          Save tail into X5..X12
			//         |
			//         src+R11, where R11 = ((dst & -32) + 32) - dst
			// Algorithm:
			// 1. Unaligned save of the tail's 128 bytes
			// 2. Unaligned save of the head's 32 bytes
			// 3. Destination-aligned copying of body (128 bytes per iteration)
			// 4. Put head on the new place
			// 5. Put the tail on the new place
			// It can be important to satisfy the processor's pipeline requirements for
			// small sizes, as the cost of copying the unaligned memory region is
			// comparable to the cost of the main loop, so the code is slightly messy here.
			// There is a cleaner implementation of this algorithm for bigger sizes,
			// where the cost of copying the unaligned part is negligible.
			// You can see it after the gobble_big_data_fwd label.
			Y0, Y1, Y2, Y3, Y4 := YMM(), YMM(), YMM(), YMM(), YMM()

			LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
			MOVQ(dst, R10)
			// CX points to the end of the buffer, so we need to go back slightly. We will use negative offsets there.
			MOVOU(Mem{Base: CX, Disp: -0x80}, X5)
			MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
			MOVQ(U32(0x80), AX)

			// Align destination address
			ANDQ(U32(0xffffffe0), dst)
			ADDQ(U8(32), dst)
			// Continue tail saving.
			MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
			MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
			// Make R8 the delta between the aligned and unaligned destination addresses.
			MOVQ(dst, R8)
			SUBQ(R10, R8)
			// Continue tail saving.
			MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
			MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
			// Adjust the bytes-to-copy value, since the unaligned part has been prepared for copying.
			SUBQ(R8, length)
			// Continue tail saving.
			MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
			MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
			// The tail will be put in its place after the main body is copied.
			// It's time for the unaligned heading part.
			VMOVDQU(Mem{Base: src}, Y4)
			// Adjust source address to point past head.
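			// (R8 now holds the head length, i.e. the distance from the
			// original dst to the 32-byte-aligned dst, and AX holds the
			// 128-byte step, so after adjusting src and length here the
			// gobble loop below runs entirely on aligned destination stores.)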
			ADDQ(R8, src)
			SUBQ(AX, length)

			// Aligned memory copying here
			Label(name + "gobble_128_loop")
			VMOVDQU(Mem{Base: src}, Y0)
			VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
			VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
			VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)
			ADDQ(AX, src)
			VMOVDQA(Y0, Mem{Base: dst})
			VMOVDQA(Y1, Mem{Base: dst, Disp: 0x20})
			VMOVDQA(Y2, Mem{Base: dst, Disp: 0x40})
			VMOVDQA(Y3, Mem{Base: dst, Disp: 0x60})
			ADDQ(AX, dst)
			SUBQ(AX, length)
			JA(LabelRef(name + "gobble_128_loop"))
			// Now we can store the unaligned parts.
			ADDQ(AX, length)
			ADDQ(dst, length)
			VMOVDQU(Y4, Mem{Base: R10})
			VZEROUPPER()
			MOVOU(X5, Mem{Base: length, Disp: -0x80})
			MOVOU(X6, Mem{Base: length, Disp: -0x70})
			MOVOU(X7, Mem{Base: length, Disp: -0x60})
			MOVOU(X8, Mem{Base: length, Disp: -0x50})
			MOVOU(X9, Mem{Base: length, Disp: -0x40})
			MOVOU(X10, Mem{Base: length, Disp: -0x30})
			MOVOU(X11, Mem{Base: length, Disp: -0x20})
			MOVOU(X12, Mem{Base: length, Disp: -0x10})
			JMP(end)

			return
		}
	}

	// Store start and end for sse_tail
	Label(name + "forward_sse")
	X0, X1, X2, X3, X4, X5 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
	// X6, X7 := XMM(), XMM()
	//X8, X9, X10, X11 := XMM(), XMM(), XMM(), XMM()

	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2)
	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3)

	// forward (only)
	dstAlign := GP64()
	bigLoops := GP64()
	MOVQ(length, bigLoops)
	SHRQ(U8(5), bigLoops) // bigLoops = length / 32

	MOVQ(dst, dstAlign)
	ANDL(U32(31), dstAlign.As32())
	srcOff := GP64()
	MOVQ(U32(64), srcOff)
	SUBQ(dstAlign, srcOff)

	// Move 32 bytes/loop
	DECQ(bigLoops)
	JA(LabelRef(name + "forward_sse_loop_32"))

	// Can be moved inside the loop for fewer regs.
	srcPos := GP64()
	LEAQ(Mem{Disp: -32, Base: src, Scale: 1, Index: srcOff}, srcPos)
	dstPos := GP64()
	LEAQ(Mem{Disp: -32, Base: dst, Scale: 1, Index: srcOff}, dstPos)

	Label(name + "big_loop_back")

	MOVOU(Mem{Disp: 0, Base: srcPos}, X4)
	MOVOU(Mem{Disp: 16, Base: srcPos}, X5)

	MOVOA(X4, Mem{Disp: 0, Base: dstPos})
	MOVOA(X5, Mem{Disp: 16, Base: dstPos})
	ADDQ(U8(32), dstPos)
	ADDQ(U8(32), srcPos)
	ADDQ(U8(32), srcOff) // This could be outside the loop, but we lose a reg if we do.
	DECQ(bigLoops)
	JNA(LabelRef(name + "big_loop_back"))

	Label(name + "forward_sse_loop_32")
	MOVOU(Mem{Disp: -32, Base: src, Scale: 1, Index: srcOff}, X4)
	MOVOU(Mem{Disp: -16, Base: src, Scale: 1, Index: srcOff}, X5)
	MOVOA(X4, Mem{Disp: -32, Base: dst, Scale: 1, Index: srcOff})
	MOVOA(X5, Mem{Disp: -16, Base: dst, Scale: 1, Index: srcOff})
	ADDQ(U8(32), srcOff)
	CMPQ(length, srcOff)
	JAE(LabelRef(name + "forward_sse_loop_32"))

	// sse_tail patches up the beginning and end of the transfer.
	MOVOU(X0, Mem{Base: dst, Disp: 0})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: -32, Index: length, Scale: 1})
	MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})

	JMP(end)
	return
}

// genMatchLen generates a standalone matchLen.
func (o options) genMatchLen() {
	TEXT("matchLen", NOSPLIT, "func(a, b []byte) int")
	Doc("matchLen returns how many bytes match in a and b", "",
		"It assumes that:",
		" len(a) <= len(b)", "")
	Pragma("noescape")

	aBase, bBase, length := GP64(), GP64(), GP64()

	Load(Param("a").Base(), aBase)
	Load(Param("b").Base(), bBase)
	Load(Param("a").Len(), length)
	l := o.matchLen("standalone", aBase, bBase, length, LabelRef("gen_match_len_end"))
	Label("gen_match_len_end")
	Store(l.As64(), ReturnIndex(0))
	RET()
}

// matchLen returns the number of matching bytes of a and b.
// len is the maximum number of bytes to match.
// Will jump to end when done and return the length.
// Uses 2 GP registers.
func (o options) matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) reg.GPVirtual {
	Comment("matchLen")
	if false {
		return o.matchLenAlt(name, a, b, len, end)
	}
	tmp, matched := GP64(), GP32()
	XORL(matched, matched)

	CMPL(len.As32(), U8(8))
	JL(LabelRef("matchlen_single_" + name))

	Label("matchlen_loopback_" + name)
	MOVQ(Mem{Base: a, Index: matched, Scale: 1}, tmp)
	XORQ(Mem{Base: b, Index: matched, Scale: 1}, tmp)
	TESTQ(tmp, tmp)
	JZ(LabelRef("matchlen_loop_" + name))
	// Not all match.
	BSFQ(tmp, tmp)
	SARQ(U8(3), tmp)
	LEAL(Mem{Base: matched, Index: tmp, Scale: 1}, matched)
	JMP(end)

	// All 8 bytes matched, update and loop.
	Label("matchlen_loop_" + name)
	LEAL(Mem{Base: len, Disp: -8}, len.As32())
	LEAL(Mem{Base: matched, Disp: 8}, matched)
	CMPL(len.As32(), U8(8))
	JGE(LabelRef("matchlen_loopback_" + name))

	// Less than 8 bytes left.
	Label("matchlen_single_" + name)
	TESTL(len.As32(), len.As32())
	JZ(end)
	Label("matchlen_single_loopback_" + name)
	MOVB(Mem{Base: a, Index: matched, Scale: 1}, tmp.As8())
	CMPB(Mem{Base: b, Index: matched, Scale: 1}, tmp.As8())
	JNE(end)
	LEAL(Mem{Base: matched, Disp: 1}, matched)
	DECL(len.As32())
	JNZ(LabelRef("matchlen_single_loopback_" + name))
	JMP(end)
	return matched
}

// matchLenAlt returns the number of matching bytes of a and b.
// len is the maximum number of bytes to match.
// Will jump to end when done and return the length.
// Uses 3 GP registers.
// It is better on longer matches.
func (o options) matchLenAlt(name string, a, b, len reg.GPVirtual, end LabelRef) reg.GPVirtual {
	Comment("matchLenAlt")
	tmp, tmp2, matched := GP64(), GP64(), GP32()
	XORL(matched, matched)

	CMPL(len.As32(), U8(16))
	JB(LabelRef("matchlen_short_" + name))

	Label("matchlen_loopback_" + name)
	MOVQ(Mem{Base: a}, tmp)
	MOVQ(Mem{Base: a, Disp: 8}, tmp2)
	XORQ(Mem{Base: b, Disp: 0}, tmp)
	XORQ(Mem{Base: b, Disp: 8}, tmp2)
	endTest := func(xored reg.GPVirtual, disp int, ok LabelRef) {
		TESTQ(xored, xored)
		JZ(ok)
		// Not all match.
		BSFQ(xored, xored)
		SARQ(U8(3), xored)
		LEAL(Mem{Base: matched, Index: xored, Scale: 1, Disp: disp}, matched)
		JMP(end)
	}
	endTest(tmp, 0, LabelRef("matchlen_loop_tmp2_"+name))
	Label("matchlen_loop_tmp2_" + name)
	endTest(tmp2, 8, LabelRef("matchlen_loop_"+name))

	// All 16 bytes matched, update and loop.
	Label("matchlen_loop_" + name)
	SUBL(U8(16), len.As32())
	ADDL(U8(16), matched)
	ADDQ(U8(16), a)
	ADDQ(U8(16), b)
	CMPL(len.As32(), U8(16))
	JAE(LabelRef("matchlen_loopback_" + name))

	// Test 4 bytes at a time...
	Label("matchlen_short_" + name)
	lenoff := 0
	if true {
		lenoff = 4
		SUBL(U8(4), len.As32())
		JC(LabelRef("matchlen_single_resume_" + name))

		Label("matchlen_four_loopback_" + name)
		assert(func(ok LabelRef) {
			CMPL(len.As32(), U32(math.MaxInt32))
			JL(ok)
		})

		MOVL(Mem{Base: a}, tmp.As32())
		XORL(Mem{Base: b}, tmp.As32())
		{
			JZ(LabelRef("matchlen_four_loopback_next" + name))
			BSFL(tmp.As32(), tmp.As32())
			SARQ(U8(3), tmp)
			LEAL(Mem{Base: matched, Index: tmp, Scale: 1}, matched)
			JMP(end)
		}
		Label("matchlen_four_loopback_next" + name)
		ADDL(U8(4), matched)
		ADDQ(U8(4), a)
		ADDQ(U8(4), b)
		SUBL(U8(4), len.As32())
		JNC(LabelRef("matchlen_four_loopback_" + name))
	}

	// Test one byte at a time
	Label("matchlen_single_resume_" + name)
	if true {
		// Less than 16 bytes left.
		if lenoff > 0 {
			ADDL(U8(lenoff), len.As32())
		}
		TESTL(len.As32(), len.As32())
		JZ(end)

		Label("matchlen_single_loopback_" + name)
		MOVB(Mem{Base: a}, tmp.As8())
		CMPB(Mem{Base: b}, tmp.As8())
		JNE(end)
		INCL(matched)
		INCQ(a)
		INCQ(b)
		DECL(len.As32())
		JNZ(LabelRef("matchlen_single_loopback_" + name))
	}
	JMP(end)
	return matched
}
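
// For reference, a hedged pure-Go sketch of the technique the matchLen
// assembly implements (compare 8 bytes per iteration, then locate the first
// mismatching byte with a trailing-zero count). Illustration only; it is not
// used by the generator and assumes encoding/binary and math/bits imports:
//
//	func matchLenRef(a, b []byte) (n int) {
//		// Requires len(a) <= len(b), as documented on matchLen.
//		for len(a) >= 8 {
//			x := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
//			if x != 0 {
//				// First differing byte index on little-endian x86.
//				return n + bits.TrailingZeros64(x)>>3
//			}
//			a, b = a[8:], b[8:]
//			n += 8
//		}
//		for i := range a {
//			if a[i] != b[i] {
//				break
//			}
//			n++
//		}
//		return n
//	}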