// +build ignore

package main

import (
	"fmt"
	"log"

	. "github.com/mmcloughlin/avo/build"
	"github.com/mmcloughlin/avo/buildtags"
	"github.com/mmcloughlin/avo/operand"
	. "github.com/mmcloughlin/avo/operand"
	"github.com/mmcloughlin/avo/reg"
)

// main drives the avo generator: it emits three table-size variants of the
// block encoder (with an AVX copy of each) plus standalone emitLiteral,
// emitRepeat, emitCopy and matchLen routines, then writes the output files.
func main() {
	Constraint(buildtags.Not("appengine").ToConstraint())
	Constraint(buildtags.Not("noasm").ToConstraint())
	Constraint(buildtags.Term("gc").ToConstraint())

	genEncodeBlockAsm("encodeBlockAsm", 16, 6, false)
	genEncodeBlockAsm("encodeBlockAsm14B", 14, 5, false)
	genEncodeBlockAsm("encodeBlockAsm12B", 12, 4, false)
	genEncodeBlockAsm("encodeBlockAsmAvx", 16, 6, true)
	genEncodeBlockAsm("encodeBlockAsm14BAvx", 14, 5, true)
	genEncodeBlockAsm("encodeBlockAsm12BAvx", 12, 4, true)
	genEmitLiteral()
	genEmitRepeat()
	genEmitCopy()
	genMatchLen()
	Generate()
}

// debugval is a debugging aid: it emits code that copies v into R15 and
// triggers an INT 3 breakpoint so v can be inspected in a debugger.
// The emitted code clobbers R15.
func debugval(v operand.Op) {
	value := reg.R15
	MOVQ(v, value)
	INT(Imm(3))
}

// genEncodeBlockAsm emits one variant of the block encoder.
// tableBits sizes the on-stack hash table (4 << tableBits would be the Go
// equivalent; here 1<<tableBits bytes are reserved), skipLog controls how
// aggressively the search skips ahead when no match is found, and avx
// selects the AVX memmove path for literal copies.
func genEncodeBlockAsm(name string, tableBits, skipLog int, avx bool) {
	TEXT(name, 0, "func(dst, src []byte) int")
	Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.",
		"It assumes that the varint-encoded length of the decompressed bytes has already been written.", "")
	Pragma("noescape")

	// "var table [maxTableSize]uint32" takes up 4 * (1 << tableBits) bytes of stack space.
	// Extra bytes are added to keep less used values.
	var (
		tableSize = 1 << uint(tableBits)
		// Keep base stack multiple of 16.
		baseStack = 0
		// try to keep extraStack + baseStack multiple of 16
		// for best chance of table alignment.
		extraStack = 32
		allocStack = baseStack + extraStack + tableSize
	)

	// Memzero needs at least 128 bytes (the zeroing loop below clears
	// 128 bytes per iteration).
	if tableSize < 128 {
		panic("tableSize must be at least 128 bytes")
	}

	lenSrcBasic, err := Param("src").Len().Resolve()
	if err != nil {
		panic(err)
	}
	lenSrcQ := lenSrcBasic.Addr

	stack := AllocLocal(allocStack)
	table := stack.Offset(allocStack - tableSize)

	// Carve scratch slots out of the low end of the stack frame.
	tmpStack := baseStack
	// Bail if we can't compress to at least this.
	dstLimitPtrQ := stack.Offset(tmpStack)
	tmpStack += 8
	// dstStartPtrQ contains the original dst pointer for returning the length
	dstStartPtrQ := stack.Offset(tmpStack)
	tmpStack += 8
	// sLimitL is when to stop looking for offset/length copies.
	sLimitL := stack.Offset(tmpStack)
	tmpStack += 4
	// nextEmitL keeps track of the point we have emitted to.
	nextEmitL := stack.Offset(tmpStack)
	tmpStack += 4
	// Repeat stores the last match offset.
	repeatL := stack.Offset(tmpStack)
	tmpStack += 4
	// nextSTempL keeps nextS while other functions are being called.
	nextSTempL := stack.Offset(tmpStack)
	tmpStack += 4
	// Ensure we have the correct extra stack.
	// Could be automatic, but whatever.
	if tmpStack-baseStack != extraStack {
		log.Fatal("adjust extraStack to ", tmpStack-baseStack)
	}

	dstBaseBasic, err := Param("dst").Base().Resolve()
	if err != nil {
		panic(err)
	}
	dstBase := dstBaseBasic.Addr

	if tmpStack > extraStack+baseStack {
		panic(fmt.Sprintf("tmp stack exceeded: %v", tmpStack))
	}

	// Zero table: 128 bytes (8 XMM stores) per loop iteration.
	{
		iReg := GP64()
		MOVQ(U32(tableSize/8/16), iReg)
		tablePtr := GP64()
		LEAQ(table, tablePtr)
		zeroXmm := XMM()
		PXOR(zeroXmm, zeroXmm)

		Label("zero_loop_" + name)
		for i := 0; i < 8; i++ {
			MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16})
		}
		ADDQ(U8(16*8), tablePtr)
		DECQ(iReg)
		JNZ(LabelRef("zero_loop_" + name))

		// nextEmit is offset in src where the next emitLiteral should start from.
		// iReg is known to be zero here (the loop exits on DECQ reaching 0),
		// so this stores nextEmitL = 0 without needing another register.
		MOVL(iReg.As32(), nextEmitL)
	}

	// Compute the search limit and the "give up" destination limit.
	{
		const inputMargin = 8
		tmp, tmp2, tmp3 := GP64(), GP64(), GP64()
		MOVQ(lenSrcQ, tmp)
		LEAQ(Mem{Base: tmp, Disp: -5}, tmp2)
		// sLimitL := len(src) - inputMargin
		LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3)
		// dstLimit := len(src) - len(src)>>5 - 5
		SHRQ(U8(5), tmp)
		SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp
		MOVL(tmp3.As32(), sLimitL)
		dstAddr := GP64()
		MOVQ(dstBase, dstAddr)
		// Store dst start address
		MOVQ(dstAddr, dstStartPtrQ)
		LEAQ(Mem{Base: dstAddr, Index: tmp2, Scale: 1}, tmp2)
		MOVQ(tmp2, dstLimitPtrQ)
	}

	// s = 1
	s := GP64().As32()
	MOVL(U32(1), s)
	// repeatL = 1
	MOVL(s, repeatL)

	src := GP64()
	Load(Param("src").Base(), src)

	// Load cv
	Label("search_loop_" + name)
	candidate := GP64().As32()
	{
		cv := GP64()
		MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv)
		nextS := GP64()
		// nextS := s + (s-nextEmit)>>skipLog + 4
		{
			tmp := GP64()
			MOVL(s, tmp.As32())          // tmp = s
			SUBL(nextEmitL, tmp.As32())  // tmp = s - nextEmit
			SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog
			LEAQ(Mem{Base: s, Disp: 4, Index: tmp, Scale: 1}, nextS)
		}
		// if nextS > sLimit {goto emitRemainder}
		{
			tmp := GP64()
			MOVL(sLimitL, tmp.As32())
			CMPL(nextS.As32(), tmp.As32())
			JGT(LabelRef("emit_remainder_" + name))
		}
		// move nextS to stack.
		MOVL(nextS.As32(), nextSTempL)

		candidate2 := GP64().As32()
		hasher := hash6(tableBits)
		// Hash the qword at s and at s+1; probe and update both table slots.
		{
			hash0, hash1 := GP64(), GP64()
			MOVQ(cv, hash0)
			MOVQ(cv, hash1)
			SHRQ(U8(8), hash1)
			hasher.hash(hash0)
			hasher.hash(hash1)
			MOVL(table.Idx(hash0, 1), candidate)
			MOVL(table.Idx(hash1, 1), candidate2)
			MOVL(s, table.Idx(hash0, 1))
			tmp := GP64().As32()
			LEAL(Mem{Base: s, Disp: 1}, tmp)
			MOVL(tmp, table.Idx(hash1, 1))
		}
		// Check repeat at offset checkRep
		const checkRep = 1

		if true {
			// rep = s - repeat
			rep := GP64().As32()
			if true {
				// if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
				// NOTE(review): the comment implies a right shift of cv, but
				// SHLQ shifts left — verify against the reference Go
				// implementation whether this should be SHRQ.
				left, right := GP64(), GP64()
				MOVL(s, rep)
				SUBL(repeatL, rep) // rep = s - repeat
				MOVL(Mem{Base: src, Index: rep, Scale: 1, Disp: checkRep}, right.As32())
				MOVQ(cv, left)
				SHLQ(U8(checkRep*8), left)
				CMPL(left.As32(), right.As32())

				// FIXME: Unable to allocate if enabled.
				JNE(LabelRef("no_repeat_found_" + name))
			}
			// base = s + 1
			base := GP64()
			LEAQ(Mem{Base: s, Disp: 1}, base)
			// Extend back
			if true {
				ne := GP64().As32()
				MOVL(nextEmitL, ne)
				TESTL(rep, rep)
				JZ(LabelRef("repeat_extend_back_end_" + name))

				// I is tested when decremented, so we loop back here.
				Label("repeat_extend_back_loop_" + name)
				CMPL(base.As32(), ne)
				JG(LabelRef("repeat_extend_back_end_" + name))
				// if src[i-1] == src[base-1]
				tmp, tmp2 := GP64(), GP64()
				MOVB(Mem{Base: src, Index: rep, Scale: 1, Disp: -1}, tmp.As8())
				MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8())
				CMPB(tmp.As8(), tmp2.As8())
				JNE(LabelRef("repeat_extend_back_end_" + name))
				LEAQ(Mem{Base: base, Disp: -1}, base)
				DECL(rep)
				JZ(LabelRef("repeat_extend_back_end_" + name))
				JMP(LabelRef("repeat_extend_back_loop_" + name))
			}
			Label("repeat_extend_back_end_" + name)
			// Base is now at start.
			// d += emitLiteral(dst[d:], src[nextEmitL:base])
			if true {
				emitLiterals(nextEmitL, base, src, dstBase, "repeat_emit_"+name, avx)
			}

			// Extend forward
			if true {
				// s += 4 + checkRep
				ADDL(U8(4+checkRep), s)

				// candidate := s - repeat + 4 + checkRep
				MOVL(s, candidate)
				SUBL(repeatL, candidate) // candidate = s - repeatL
				{
					// srcLeft = sLimitL - s
					srcLeft := GP64()
					MOVL(sLimitL, srcLeft.As32())
					SUBL(s, srcLeft.As32())

					// Forward address
					forwardStart := Mem{Base: src, Index: s, Scale: 1}
					// End address
					backStart := Mem{Base: src, Index: candidate, Scale: 1}
					length := matchLen("repeat_extend", forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name))
					Label("repeat_extend_forward_end_" + name)
					// s+= length
					ADDL(length.As32(), s)
				}
			}
			// Emit
			if true {
				// length = s-base
				length := GP64()
				MOVL(s, length.As32())
				SUBL(base.As32(), length.As32())

				offsetVal := GP64()
				MOVL(repeatL, offsetVal.As32())
				dst := GP64()
				MOVQ(dstBase, dst)

				// if nextEmit > 0
				tmp := GP64()
				MOVL(nextEmitL, tmp.As32())
				TESTL(tmp.As32(), tmp.As32())

				// FIXME: fails to allocate regs if enabled:
				JZ(LabelRef("repeat_as_copy_" + name))

				emitRepeat("match_repeat_", length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))

				// JUMPS TO HERE:
				Label("repeat_as_copy_" + name)
				emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))

				Label("repeat_end_emit_" + name)
				// Store new dst and nextEmit
				MOVQ(dst, dstBase)
			}
			// if s >= sLimit
			// can be omitted.
			if true {
				tmp := GP64()
				MOVL(sLimitL, tmp.As32())
				CMPL(s, tmp.As32())
				JGT(LabelRef("emit_remainder_" + name))
			}
			JMP(LabelRef("search_loop_" + name))
		}
		Label("no_repeat_found_" + name)
		{
			// Can be moved up if registers are available.
			hash2 := GP64()
			{
				// hash2 := hash6(cv>>16, tableBits)
				hasher = hash6(tableBits)
				MOVQ(cv, hash2)
				SHRQ(U8(16), hash2)
				hasher.hash(hash2)
			}

			CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
			// cv >>= 8
			SHRQ(U8(8), cv)
			JEQ(LabelRef("candidate_match_" + name))

			// candidate = int(table[hash2])
			MOVL(table.Idx(hash2, 1), candidate)

			// if uint32(cv>>8) == load32(src, candidate2)
			CMPL(Mem{Base: src, Index: candidate2, Scale: 1}, cv.As32())
			JEQ(LabelRef("candidate2_match_" + name))

			// table[hash2] = uint32(s + 2)
			tmp := GP64()
			LEAQ(Mem{Base: s, Disp: 2}, tmp)
			MOVL(tmp.As32(), table.Idx(hash2, 1))

			// if uint32(cv>>16) == load32(src, candidate)
			SHRQ(U8(8), cv)
			CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
			JEQ(LabelRef("candidate3_match_" + name))
			// s = nextS
			MOVL(nextSTempL, s)
			JMP(LabelRef("search_loop_" + name))

			// Matches candidate3
			Label("candidate3_match_" + name)
			ADDL(U8(2), s)
			JMP(LabelRef("candidate_match_" + name))

			Label("candidate2_match_" + name)
			// table[hash2] = uint32(s + 2)
			// NOTE(review): the comment says s+2 but the displacement below is
			// -2, storing s-2 (contrast with the identical comment above where
			// Disp is +2) — verify which is intended.
			tmp = GP64()
			LEAQ(Mem{Base: s, Disp: -2}, tmp)
			MOVL(tmp.As32(), table.Idx(hash2, 1))
			// s++
			INCL(s)
			MOVL(candidate2, candidate)
		}
	}

	Label("candidate_match_" + name)
	// We have a match at 's' with src offset in "candidate" that matches at least 4 bytes.
	// Extend backwards
	{
		ne := GP64()
		MOVL(nextEmitL, ne.As32())
		TESTL(candidate, candidate)
		JZ(LabelRef("match_extend_back_end_" + name))

		// candidate is tested when decremented, so we loop back here.
		Label("match_extend_back_loop_" + name)
		CMPL(s, ne.As32())
		JG(LabelRef("match_extend_back_end_" + name))
		// if src[candidate-1] == src[s-1]
		tmp, tmp2 := GP64(), GP64()
		MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8())
		MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8())
		CMPB(tmp.As8(), tmp2.As8())
		JNE(LabelRef("match_extend_back_end_" + name))
		LEAL(Mem{Base: s, Disp: -1}, s)
		DECL(candidate)
		JZ(LabelRef("match_extend_back_end_" + name))
		JMP(LabelRef("match_extend_back_loop_" + name))
	}
	Label("match_extend_back_end_" + name)

	// Bail if we exceed the maximum size.
	if true {
		// tmp = s-nextEmitL
		tmp := GP64()
		MOVL(s, tmp.As32())
		SUBL(nextEmitL, tmp.As32())
		LEAQ(dstBase.Idx(tmp, 1), tmp)
		CMPQ(tmp, dstLimitPtrQ)
		JL(LabelRef("match_dst_size_check_" + name))
		ri, err := ReturnIndex(0).Resolve()
		if err != nil {
			panic(err)
		}
		MOVQ(U32(0), ri.Addr)
		RET()
	}
	Label("match_dst_size_check_" + name)
	{
		base := GP64()
		MOVL(candidate, base.As32())
		emitLiterals(nextEmitL, base, src, dstBase, "match_emit_"+name, avx)
		NOP()
	}

	Label("match_nolit_loop_" + name)
	{
		base := GP64().As32()
		MOVL(s, base)
		// Update repeat
		{
			// repeat = base - candidate
			repeatVal := GP64().As32()
			MOVL(s, repeatVal)
			SUBL(candidate, repeatVal)
			MOVL(repeatVal, repeatL)
		}
		// s+=4, candidate+=4
		ADDL(U8(4), s)
		ADDL(U8(4), candidate)
		// Extend the 4-byte match as long as possible and emit copy.
		{
			// srcLeft = sLimitL - s
			srcLeft := GP64()
			MOVL(sLimitL, srcLeft.As32())
			SUBL(s, srcLeft.As32())
			length := matchLen("match_nolit_"+name,
				Mem{Base: src, Index: s, Scale: 1},
				Mem{Base: src, Index: candidate, Scale: 1},
				srcLeft,
				LabelRef("match_nolit_end_"+name),
			)
			Label("match_nolit_end_" + name)
			offset := GP64()
			MOVL(repeatL, offset.As32())
			ADDQ(U8(4), length)
			dst := GP64()
			MOVQ(dstBase, dst)
			// s += length (length is destroyed, use it now)
			ADDL(length.As32(), s)
			emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name))
			Label("match_nolit_emitcopy_end_" + name)
			MOVQ(dst, dstBase)
			MOVL(s, nextEmitL)
			CMPL(s, sLimitL)
			JGE(LabelRef("emit_remainder_" + name))

			// Bail if we exceed the maximum size.
			{
				CMPQ(dst, dstLimitPtrQ)
				JL(LabelRef("match_nolit_dst_ok_" + name))
				ri, err := ReturnIndex(0).Resolve()
				if err != nil {
					panic(err)
				}
				MOVQ(U32(0), ri.Addr)
				RET()
				Label("match_nolit_dst_ok_" + name)
			}
		}
		{
			// Check for an immediate match, otherwise start search at s+1
			// NOTE(review): c0 is loaded but never used, and the CMPL below
			// indexes src by hash1 rather than by the candidate c1; also
			// `candidate` is not updated before jumping back to
			// match_nolit_loop. This section looks unfinished — verify
			// against the reference implementation.
			x := GP64()
			// Index s-2
			MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, x)
			hasher := hash6(tableBits)
			hash0, hash1 := GP64(), GP64()
			MOVQ(x, hash0) // s-2
			SHRQ(U8(16), x)
			MOVQ(x, hash1) // s
			hasher.hash(hash0)
			hasher.hash(hash1)
			c0, c1 := GP64(), GP64()
			MOVL(table.Idx(hash0, 1), c0.As32())
			MOVL(table.Idx(hash1, 1), c1.As32())
			sm2 := GP64()
			LEAQ(Mem{Base: s, Disp: -2}, sm2)
			MOVL(sm2.As32(), table.Idx(hash0, 1))
			MOVL(s, table.Idx(hash1, 1))
			CMPL(Mem{Base: src, Index: hash1, Scale: 1}, x.As32())
			JEQ(LabelRef("match_nolit_loop_" + name))
			INCL(s)
		}
		JMP(LabelRef("search_loop_" + name))
	}

	Label("emit_remainder_" + name)
	// Bail if we exceed the maximum size.
	// if d+len(src)-nextEmitL > dstLimitPtrQ { return 0
	{
		// remain = lenSrc - nextEmitL
		remain := GP64()
		MOVQ(lenSrcQ, remain)
		SUBL(nextEmitL, remain.As32())
		dst := GP64()
		MOVQ(dstBase, dst)
		// dst := dst + (len(src)-nextEmitL)
		LEAQ(Mem{Base: dst, Index: remain, Scale: 1}, dst)
		CMPQ(dst, dstLimitPtrQ)
		JL(LabelRef("emit_remainder_ok_" + name))
		ri, err := ReturnIndex(0).Resolve()
		if err != nil {
			panic(err)
		}
		MOVQ(U32(0), ri.Addr)
		RET()
		Label("emit_remainder_ok_" + name)
	}
	// emitLiteral(dst[d:], src[nextEmitL:])
	emitEnd := GP64()
	MOVQ(lenSrcQ, emitEnd)

	// Emit final literals.
	emitLiterals(nextEmitL, emitEnd, src, dstBase, "emit_remainder_"+name, avx)

	// length := start - base (ptr arithmetic)
	// NOTE(review): this computes savedStart - currentDst, which is the
	// NEGATED output length (current pointer minus start would be positive)
	// — verify the operand order.
	length := GP64()
	MOVQ(dstStartPtrQ, length)
	SUBQ(dstBase, length)

	Store(length, ReturnIndex(0))
	RET()
}

// emitLiterals emits literals from nextEmit to base, updates nextEmit and
// the dst pointer stored at dstBase. Emits nothing if base == nextEmit.
// src & base registers are untouched.
func emitLiterals(nextEmitL Mem, base reg.GPVirtual, src reg.GPVirtual, dstBase Mem, name string, avx bool) {
	nextEmit, litLen, dstBaseTmp, litBase := GP64().As32(), GP64(), GP64(), GP64()
	MOVL(nextEmitL, nextEmit)
	CMPL(nextEmit, base.As32())
	JEQ(LabelRef("emit_literal_skip_" + name))
	MOVL(base.As32(), litLen.As32())

	// Base is now next emit.
	MOVL(base.As32(), nextEmitL)

	// litBase = src[nextEmitL:]
	LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase)
	SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit

	// Load (and store when we return)
	MOVQ(dstBase, dstBaseTmp)
	emitLiteral(name, litLen, nil, dstBaseTmp, litBase, LabelRef("emit_literal_done_"+name), avx, true)
	Label("emit_literal_done_" + name)
	// Store updated dstBase
	MOVQ(dstBaseTmp, dstBase)
	Label("emit_literal_skip_" + name)
}

// hashGen carries the parameters of a multiplicative hash:
// how many low bytes of the value participate, how many table-index bits
// to produce, and the register preloaded with the multiplier.
type hashGen struct {
	bytes     int
	tablebits int
	mulreg    reg.GPVirtual
}

// hash6 returns a hashGen that hashes the lowest 6 bytes of a value into
// tablebits bits, emitting the load of the multiplier constant.
// The multiplier 227718039650203 is presumably a known-good prime for this
// scheme — TODO confirm its provenance.
func hash6(tablebits int) hashGen {
	h := hashGen{
		bytes:     6,
		tablebits: tablebits,
		mulreg:    GP64(),
	}
	MOVQ(Imm(227718039650203), h.mulreg)
	return h
}

// hash emits code that replaces val with its hash: the low h.bytes bytes are
// shifted to the top of the register, multiplied, and the top h.tablebits
// bits are shifted back down to form the table index.
func (h hashGen) hash(val reg.GPVirtual) {
	// Move value to top of register.
	SHLQ(U8(64-8*h.bytes), val)
	IMULQ(h.mulreg, val)
	// Move value to bottom
	SHRQ(U8(64-h.tablebits), val)
}

// genEmitLiteral emits the two standalone emitLiteral functions (SSE and AVX).
// NOTE(review): both calls pass name "standalone", so the two TEXT functions
// share internal label names; asm labels are function-scoped so this should
// assemble, but distinct names would be clearer — verify avo accepts it.
func genEmitLiteral() {
	TEXT("emitLiteral", NOSPLIT, "func(dst, lit []byte) int")
	Doc("emitLiteral writes a literal chunk and returns the number of bytes written.", "",
		"It assumes that:",
		"  dst is long enough to hold the encoded bytes",
		"  0 <= len(lit) && len(lit) <= math.MaxUint32", "")
	Pragma("noescape")

	dstBase, litBase, litLen, retval := GP64(), GP64(), GP64(), GP64()
	Load(Param("dst").Base(), dstBase)
	Load(Param("lit").Base(), litBase)
	Load(Param("lit").Len(), litLen)
	emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_standalone", false, false)
	Label("emit_literal_end_standalone")
	Store(retval, ReturnIndex(0))
	RET()

	TEXT("emitLiteralAvx", NOSPLIT, "func(dst, lit []byte) int")
	Doc("emitLiteralAvx writes a literal chunk and returns the number of bytes written.", "",
		"It assumes that:",
		"  dst is long enough to hold the encoded bytes",
		"  0 <= len(lit) && len(lit) <= math.MaxUint32", "")
	Pragma("noescape")

	dstBase, litBase, litLen, retval = GP64(), GP64(), GP64(), GP64()
	Load(Param("dst").Base(), dstBase)
	Load(Param("lit").Base(), litBase)
	Load(Param("lit").Len(), litLen)
	emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_avx_standalone", true, false)
	Label("emit_literal_end_avx_standalone")
	Store(retval, ReturnIndex(0))
	RET()
}

// emitLiteral can be used for inlining an emitLiteral call.
// stack must have at least 32 bytes.
// retval will contain emitted bytes, but can be nil if this is not interesting.
// dstBase and litBase are updated.
// Uses 2 GP registers. With AVX 4 registers.
// If updateDst is true dstBase will have the updated end pointer and an additional register will be used.
func emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, avx, updateDst bool) {
	n := GP64()
	n16 := GP64()

	// We always add litLen bytes
	if retval != nil {
		MOVQ(litLen, retval)
	}
	MOVQ(litLen, n)

	// n = litLen - 1 (the tag encodes length-1).
	SUBL(U8(1), n.As32())
	// Return (emit nothing) if litLen was 0: the subtract borrows.
	JC(end)

	// Find number of bytes to emit for tag.
	CMPL(n.As32(), U8(60))
	JLT(LabelRef("one_byte_" + name))
	CMPL(n.As32(), U32(1<<8))
	JLT(LabelRef("two_bytes_" + name))
	CMPL(n.As32(), U32(1<<16))
	JLT(LabelRef("three_bytes_" + name))
	CMPL(n.As32(), U32(1<<24))
	JLT(LabelRef("four_bytes_" + name))

	// Tag 63 (252 = 63<<2): length-1 in the following 4 bytes.
	Label("five_bytes_" + name)
	MOVB(U8(252), Mem{Base: dstBase})
	MOVL(n.As32(), Mem{Base: dstBase, Disp: 1})
	if retval != nil {
		ADDQ(U8(5), retval)
	}
	ADDQ(U8(5), dstBase)
	JMP(LabelRef("memmove_" + name))

	// Tag 62 (248 = 62<<2): length-1 in the following 3 bytes.
	Label("four_bytes_" + name)
	MOVQ(n, n16)
	SHRL(U8(16), n16.As32())
	MOVB(U8(248), Mem{Base: dstBase})
	MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
	MOVB(n16.As8(), Mem{Base: dstBase, Disp: 3})
	if retval != nil {
		ADDQ(U8(4), retval)
	}
	ADDQ(U8(4), dstBase)
	JMP(LabelRef("memmove_" + name))

	// Tag 61 (0xf4 = 61<<2): length-1 in the following 2 bytes.
	Label("three_bytes_" + name)
	MOVB(U8(0xf4), Mem{Base: dstBase})
	MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
	if retval != nil {
		ADDQ(U8(3), retval)
	}
	ADDQ(U8(3), dstBase)
	JMP(LabelRef("memmove_" + name))

	// Tag 60 (0xf0 = 60<<2): length-1 in the following 1 byte.
	Label("two_bytes_" + name)
	MOVB(U8(0xf0), Mem{Base: dstBase})
	MOVB(n.As8(), Mem{Base: dstBase, Disp: 1})
	if retval != nil {
		ADDQ(U8(2), retval)
	}
	ADDQ(U8(2), dstBase)
	JMP(LabelRef("memmove_" + name))

	// Short literal: length-1 encoded directly in the tag byte.
	Label("one_byte_" + name)
	SHLB(U8(2), n.As8())
	MOVB(n.As8(), Mem{Base: dstBase})
	if retval != nil {
		ADDQ(U8(1), retval)
	}
	ADDQ(U8(1), dstBase)
	// Fallthrough

	Label("memmove_" + name)

	// copy(dst[i:], lit)
	if true {
		dstEnd := GP64()
		if updateDst {
			LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd)
		}
		genMemMove2("emit_lit_memmove_"+name, dstBase, litBase, litLen, end, avx)
		if updateDst {
			// NOTE(review): genMemMove2 appears to jump directly to `end`
			// from all of its exits, which would make this MOVQ unreachable
			// and leave dstBase un-updated — verify the control flow.
			MOVQ(dstEnd, dstBase)
		}
	} else {
		genMemMove("emit_lit_memmove_"+name, dstBase, litBase, litLen, end)
	}
	return
}

// genEmitRepeat generates a standalone emitRepeat.
func genEmitRepeat() {
	TEXT("emitRepeat", NOSPLIT, "func(dst []byte, offset, length int) int")
	Doc("emitRepeat writes a repeat chunk and returns the number of bytes written.",
		"Length must be at least 4 and < 1<<32", "")
	Pragma("noescape")

	dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()

	// retval = 0
	XORQ(retval, retval)

	Load(Param("dst").Base(), dstBase)
	Load(Param("offset"), offset)
	Load(Param("length"), length)
	emitRepeat("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_repeat_end"))
	Label("gen_emit_repeat_end")
	Store(retval, ReturnIndex(0))
	RET()
}

// emitRepeat can be used for inlining an emitRepeat call.
// length >= 4 and < 1<<32
// length is modified. dstBase is updated. retval is added to input.
// retval can be nil.
// Will jump to end label when finished.
// Uses 1 GP register.
743func emitRepeat(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) { 744 Label("emit_repeat_again_" + name) 745 tmp := GP64() 746 MOVQ(length, tmp) // Copy length 747 // length -= 4 748 LEAQ(Mem{Base: length, Disp: -4}, length) 749 750 // if length <= 4 (use copied value) 751 CMPL(tmp.As32(), U8(8)) 752 JLE(LabelRef("repeat_two_" + name)) 753 754 // length < 8 && offset < 2048 755 CMPL(tmp.As32(), U8(12)) 756 JGE(LabelRef("cant_repeat_two_offset_" + name)) 757 CMPL(offset.As32(), U32(2048)) 758 JLT(LabelRef("repeat_two_offset_" + name)) 759 760 const maxRepeat = ((1 << 24) - 1) + 65536 761 Label("cant_repeat_two_offset_" + name) 762 CMPL(length.As32(), U32((1<<8)+4)) 763 JLT(LabelRef("repeat_three_" + name)) // if length < (1<<8)+4 764 CMPL(length.As32(), U32((1<<16)+(1<<8))) 765 JLT(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8) 766 CMPL(length.As32(), U32(maxRepeat)) 767 JLT(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent. 768 769 // We have have more than 24 bits 770 // Emit so we have at least 4 bytes left. 771 LEAQ(Mem{Base: length, Disp: -(maxRepeat - 4)}, length) // length -= (maxRepeat - 4) 772 MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0 773 MOVW(U16(65531), Mem{Base: dstBase, Disp: 2}) // 0xfffb 774 MOVB(U8(255), Mem{Base: dstBase, Disp: 4}) 775 ADDQ(U8(5), dstBase) 776 if retval != nil { 777 ADDQ(U8(5), retval) 778 } 779 JMP(LabelRef("emit_repeat_again_" + name)) 780 781 // Must be able to be within 5 bytes. 
782 Label("repeat_five_" + name) 783 LEAQ(Mem{Base: length, Disp: -65536}, length) // length -= 65536 784 MOVQ(length, offset) 785 MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0 786 MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8) 787 SARQ(U8(16), offset) // offset = length >> 16 788 MOVB(offset.As8(), Mem{Base: dstBase, Disp: 4}) // dst[4] = length >> 16 789 if retval != nil { 790 ADDQ(U8(5), retval) // i += 5 791 } 792 ADDQ(U8(5), dstBase) // dst += 5 793 JMP(end) 794 795 Label("repeat_four_" + name) 796 LEAQ(Mem{Base: length, Disp: -256}, length) // length -= 256 797 MOVW(U16(6<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 6<<2 | tagCopy1, dst[1] = 0 798 MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8) 799 if retval != nil { 800 ADDQ(U8(4), retval) // i += 4 801 } 802 ADDQ(U8(4), dstBase) // dst += 4 803 JMP(end) 804 805 Label("repeat_three_" + name) 806 LEAQ(Mem{Base: length, Disp: -4}, length) // length -= 4 807 MOVW(U16(5<<2|tagCopy1), Mem{Base: dstBase}) // dst[0] = 5<<2 | tagCopy1, dst[1] = 0 808 MOVB(length.As8(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length) 809 if retval != nil { 810 ADDQ(U8(3), retval) // i += 3 811 } 812 ADDQ(U8(3), dstBase) // dst += 3 813 JMP(end) 814 815 Label("repeat_two_" + name) 816 // dst[0] = uint8(length)<<2 | tagCopy1, dst[1] = 0 817 SHLL(U8(2), length.As32()) 818 ORL(U8(tagCopy1), length.As32()) 819 MOVW(length.As16(), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0 820 if retval != nil { 821 ADDQ(U8(2), retval) // i += 2 822 } 823 ADDQ(U8(2), dstBase) // dst += 2 824 JMP(end) 825 826 Label("repeat_two_offset_" + name) 827 // Emit the remaining copy, encoded as 2 bytes. 
828 // dst[1] = uint8(offset) 829 // dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1 830 tmp = GP64() 831 XORQ(tmp, tmp) 832 // Use scale and displacement to shift and subtract values from length. 833 LEAQ(Mem{Base: tmp, Index: length, Scale: 4, Disp: tagCopy1}, length) 834 MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte 835 SARL(U8(8), offset.As32()) // Remove lower 836 SHLL(U8(5), offset.As32()) // Shift back up 837 ORL(offset.As32(), length.As32()) // OR result 838 MOVB(length.As8(), Mem{Base: dstBase, Disp: 0}) 839 if retval != nil { 840 ADDQ(U8(2), retval) // i += 2 841 } 842 ADDQ(U8(2), dstBase) // dst += 2 843 844 JMP(end) 845} 846 847// emitCopy writes a copy chunk and returns the number of bytes written. 848// 849// It assumes that: 850// dst is long enough to hold the encoded bytes 851// 1 <= offset && offset <= math.MaxUint32 852// 4 <= length && length <= 1 << 24 853 854// genEmitCopy generates a standlone emitCopy 855func genEmitCopy() { 856 TEXT("emitCopy", NOSPLIT, "func(dst []byte, offset, length int) int") 857 Doc("emitCopy writes a copy chunk and returns the number of bytes written.", "", 858 "It assumes that:", 859 " dst is long enough to hold the encoded bytes", 860 " 1 <= offset && offset <= math.MaxUint32", 861 " 4 <= length && length <= 1 << 24", "") 862 Pragma("noescape") 863 864 dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64() 865 866 // i := 0 867 XORQ(retval, retval) 868 869 Load(Param("dst").Base(), dstBase) 870 Load(Param("offset"), offset) 871 Load(Param("length"), length) 872 emitCopy("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_copy_end")) 873 Label("gen_emit_copy_end") 874 Store(retval, ReturnIndex(0)) 875 RET() 876} 877 878const ( 879 tagLiteral = 0x00 880 tagCopy1 = 0x01 881 tagCopy2 = 0x02 882 tagCopy4 = 0x03 883) 884 885// emitCopy can be used for inlining an emitCopy call. 886// length is modified (and junk). dstBase is updated. 
retval is added to input. 887// retval can be nil. 888// Will jump to end label when finished. 889// Uses 2 GP registers. 890func emitCopy(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) { 891 // if offset >= 65536 { 892 CMPL(offset.As32(), U32(65536)) 893 JL(LabelRef("two_byte_offset_" + name)) 894 895 // offset is >= 65536 896 // if length <= 64 goto four_bytes_remain_ 897 CMPL(length.As32(), U8(64)) 898 JLE(LabelRef("four_bytes_remain_" + name)) 899 900 // Emit a length 64 copy, encoded as 5 bytes. 901 // dst[0] = 63<<2 | tagCopy4 902 MOVB(U8(63<<2|tagCopy4), Mem{Base: dstBase}) 903 // dst[4] = uint8(offset >> 24) 904 // dst[3] = uint8(offset >> 16) 905 // dst[2] = uint8(offset >> 8) 906 // dst[1] = uint8(offset) 907 MOVD(offset, Mem{Base: dstBase, Disp: 1}) 908 // length -= 64 909 LEAQ(Mem{Base: length, Disp: -64}, length) 910 if retval != nil { 911 ADDQ(U8(5), retval) // i+=5 912 } 913 ADDQ(U8(5), dstBase) // dst+=5 914 915 // if length >= 4 { 916 CMPL(length.As32(), U8(4)) 917 JL(LabelRef("four_bytes_remain_" + name)) 918 919 // Emit remaining as repeats 920 // return 5 + emitRepeat(dst[5:], offset, length) 921 // Inline call to emitRepeat. Will jump to end 922 emitRepeat(name+"_emit_copy", length, offset, retval, dstBase, end) 923 924 Label("four_bytes_remain_" + name) 925 // if length == 0 { 926 // return i 927 // } 928 TESTL(length.As32(), length.As32()) 929 JZ(end) 930 931 // Emit a copy, offset encoded as 4 bytes. 932 // dst[i+0] = uint8(length-1)<<2 | tagCopy4 933 // dst[i+1] = uint8(offset) 934 // dst[i+2] = uint8(offset >> 8) 935 // dst[i+3] = uint8(offset >> 16) 936 // dst[i+4] = uint8(offset >> 24) 937 tmp := GP64() 938 MOVB(U8(tagCopy4), tmp.As8()) 939 // Use displacement to subtract 1 from upshifted length. 
940 LEAQ(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length) 941 MOVB(length.As8(), Mem{Base: dstBase}) 942 MOVD(offset, Mem{Base: dstBase, Disp: 1}) 943 // return i + 5 944 if retval != nil { 945 ADDQ(U8(5), retval) 946 } 947 ADDQ(U8(5), dstBase) 948 JMP(end) 949 950 Label("two_byte_offset_" + name) 951 // Offset no more than 2 bytes. 952 953 // if length > 64 { 954 CMPL(length.As32(), U8(64)) 955 JLE(LabelRef("two_byte_offset_short_" + name)) 956 // Emit a length 60 copy, encoded as 3 bytes. 957 // Emit remaining as repeat value (minimum 4 bytes). 958 // dst[2] = uint8(offset >> 8) 959 // dst[1] = uint8(offset) 960 // dst[0] = 59<<2 | tagCopy2 961 MOVB(U8(59<<2|tagCopy2), Mem{Base: dstBase}) 962 MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1}) 963 // length -= 60 964 LEAQ(Mem{Base: length, Disp: -60}, length) 965 966 // Emit remaining as repeats, at least 4 bytes remain. 967 // return 3 + emitRepeat(dst[3:], offset, length) 968 //} 969 ADDQ(U8(3), dstBase) 970 if retval != nil { 971 ADDQ(U8(3), retval) 972 } 973 // Inline call to emitRepeat. Will jump to end 974 emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end) 975 976 Label("two_byte_offset_short_" + name) 977 // if length >= 12 || offset >= 2048 { 978 CMPL(length.As32(), U8(12)) 979 JGE(LabelRef("emit_copy_three_" + name)) 980 CMPL(offset.As32(), U32(2048)) 981 JGE(LabelRef("emit_copy_three_" + name)) 982 983 // Emit the remaining copy, encoded as 2 bytes. 984 // dst[1] = uint8(offset) 985 // dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 986 tmp = GP64() 987 MOVB(U8(tagCopy1), tmp.As8()) 988 // Use scale and displacement to shift and subtract values from length. 
989 LEAQ(Mem{Base: tmp, Index: length, Scale: 4, Disp: -(4 << 2)}, length) 990 MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte 991 SHRL(U8(8), offset.As32()) // Remove lower 992 SHLL(U8(5), offset.As32()) // Shift back up 993 ORL(offset.As32(), length.As32()) // OR result 994 MOVB(length.As8(), Mem{Base: dstBase, Disp: 0}) 995 if retval != nil { 996 ADDQ(U8(2), retval) // i += 2 997 } 998 ADDQ(U8(2), dstBase) // dst += 2 999 // return 2 1000 JMP(end) 1001 1002 Label("emit_copy_three_" + name) 1003 // // Emit the remaining copy, encoded as 3 bytes. 1004 // dst[2] = uint8(offset >> 8) 1005 // dst[1] = uint8(offset) 1006 // dst[0] = uint8(length-1)<<2 | tagCopy2 1007 tmp = GP64() 1008 MOVB(U8(tagCopy2), tmp.As8()) 1009 LEAQ(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length) 1010 MOVB(length.As8(), Mem{Base: dstBase}) 1011 MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1}) 1012 // return 3 1013 if retval != nil { 1014 ADDQ(U8(3), retval) // i += 3 1015 } 1016 ADDQ(U8(3), dstBase) // dst += 3 1017 JMP(end) 1018} 1019 1020// func memmove(to, from unsafe.Pointer, n uintptr) 1021// to and from will be at the end, n will be 0. 1022// to and from may not overlap. 1023// Fairly simplistic for now, can ofc. be extended. 1024// Uses one GP register and 8 SSE registers. 1025func genMemMove(name string, to, from, n reg.GPVirtual, end LabelRef) { 1026 tmp := GP64() 1027 MOVQ(n, tmp) 1028 // tmp = n/128 1029 SHRQ(U8(7), tmp) 1030 1031 TESTQ(tmp, tmp) 1032 JZ(LabelRef("done_128_" + name)) 1033 Label("loop_128_" + name) 1034 var xmmregs [8]reg.VecVirtual 1035 1036 // Prefetch destination for next loop. 1037 // Prefetching source doesn't provide speedup. 1038 // This seems to give a small boost. 
	// preOff = 128: prefetch one full iteration (128 bytes) ahead of dst.
	const preOff = 128
	PREFETCHT0(Mem{Base: to, Disp: preOff})
	PREFETCHT0(Mem{Base: to, Disp: preOff + 64})

	// Copy 128 bytes per iteration via 8 XMM registers.
	for i := 0; i < 8; i++ {
		xmmregs[i] = XMM()
		MOVOU(Mem{Base: from}.Offset(i*16), xmmregs[i])
	}
	for i := 0; i < 8; i++ {
		MOVOU(xmmregs[i], Mem{Base: to}.Offset(i*16))
	}
	LEAQ(Mem{Base: n, Disp: -128}, n)
	ADDQ(U8(8*16), from)
	ADDQ(U8(8*16), to)
	DECQ(tmp)
	JNZ(LabelRef("loop_128_" + name))

	Label("done_128_" + name)
	MOVQ(n, tmp)
	// tmp = n/16
	SHRQ(U8(4), tmp)
	TESTQ(tmp, tmp)
	JZ(LabelRef("done_16_" + name))

	// 16 bytes per iteration for the remainder.
	Label("loop_16_" + name)
	xmm := XMM()
	MOVOU(Mem{Base: from}, xmm)
	MOVOU(xmm, Mem{Base: to})
	LEAQ(Mem{Base: n, Disp: -16}, n)
	ADDQ(U8(16), from)
	ADDQ(U8(16), to)
	DECQ(tmp)
	JNZ(LabelRef("loop_16_" + name))
	Label("done_16_" + name)

	// TODO: Use REP; MOVSB somehow.
	// Final byte-at-a-time loop for the last n%16 bytes.
	TESTQ(n, n)
	JZ(end)
	Label("loop_1_" + name)
	MOVB(Mem{Base: from}, tmp.As8())
	MOVB(tmp.As8(), Mem{Base: to})
	INCQ(from)
	INCQ(to)
	DECQ(n)
	JNZ(LabelRef("loop_1_" + name))
}

// genMemMove2 emits a size-dispatched memory copy:
//
//	func memmove(to, from unsafe.Pointer, n uintptr)
//
// src and dst may not overlap.
// Non AVX uses 2 GP register, 16 SSE2 registers.
// AVX uses 4 GP registers 16 AVX/SSE registers.
// All passed registers may be updated.
func genMemMove2(name string, dst, src, length reg.GPVirtual, end LabelRef, avx bool) {
	AX, CX := GP64(), GP64()
	NOP()
	name += "_memmove_"
	Label(name + "tail")
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	// Size dispatch: branch to the smallest specialized copier that
	// handles `length` bytes. All comparisons are unsigned.
	TESTQ(length, length)
	JEQ(end)
	CMPQ(length, U8(2))
	JBE(LabelRef(name + "move_1or2"))
	CMPQ(length, U8(4))
	JB(LabelRef(name + "move_3"))
	JBE(LabelRef(name + "move_4"))
	CMPQ(length, U8(8))
	JB(LabelRef(name + "move_5through7"))
	JE(LabelRef(name + "move_8"))
	CMPQ(length, U8(16))
	JBE(LabelRef(name + "move_9through16"))
	CMPQ(length, U8(32))
	JBE(LabelRef(name + "move_17through32"))
	CMPQ(length, U8(64))
	JBE(LabelRef(name + "move_33through64"))
	CMPQ(length, U8(128))
	JBE(LabelRef(name + "move_65through128"))
	CMPQ(length, U32(256))
	JBE(LabelRef(name + "move_129through256"))

	// > 256 bytes: AVX path when available, otherwise the SSE bulk loop.
	if avx {
		JMP(LabelRef(name + "avxUnaligned"))
	} else {
		if false {
			// Don't check length for now.
			Label(name + "forward")
			CMPQ(length, U32(2048))
			JLS(LabelRef(name + "move_256through2048"))

			genMemMove(name+"fallback", dst, src, length, end)
		} else {
			JMP(LabelRef(name + "move_256through2048"))
		}
	}
	/*
		// If REP MOVSB isn't fast, don't use it
		// FIXME: internal∕cpu·X86+const_offsetX86HasERMS(SB)
		// CMPB(U8(1), U8(1)) // enhanced REP MOVSB/STOSB
		JMP(LabelRef(name + "fwdBy8"))

		// Check alignment
		MOVL(src.As32(), AX.As32())
		ORL(dst.As32(), AX.As32())
		TESTL(U32(7), AX.As32())
		JEQ(LabelRef(name + "fwdBy8"))

		// Do 1 byte at a time
		// MOVQ(length, CX)
		// FIXME:
		// REP; MOVSB
		JMP(end)

		Label(name + "fwdBy8")
		// Do 8 bytes at a time
		MOVQ(length, CX)
		SHRQ(U8(3), CX)
		ANDQ(U8(7), length)
		// FIXME:
		//REP; MOVSQ
		JMP(LabelRef(name + "tail"))

		Label(name + "back")

		//check overlap
		MOVQ(src, CX)
		ADDQ(length, CX)
		CMPQ(CX, dst)
		JLS(LabelRef(name + "forward"))

		//whole thing backwards has
		//adjusted addresses

		ADDQ(length, dst)
		ADDQ(length, src)
		STD()

		//
		// copy
		//
		MOVQ(length, CX)
		SHRQ(U8(3), CX)
		ANDQ(U8(7), length)

		SUBQ(U8(8), dst)
		SUBQ(U8(8), src)
		// FIXME:
		//REP; MOVSQ

		// FIXME:
		//CLD()

		ADDQ(U8(8), dst)
		ADDQ(U8(8), src)
		SUBQ(length, dst)
		SUBQ(length, src)
		JMP(LabelRef(name + "tail"))
	*/

	// The small cases read first and last bytes with overlapping loads,
	// which covers every length in the range without branching.
	Label(name + "move_1or2")
	MOVB(Mem{Base: src}, AX.As8())
	MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8())
	MOVB(AX.As8(), Mem{Base: dst})
	MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_4")
	MOVL(Mem{Base: src}, AX.As32())
	MOVL(AX.As32(), Mem{Base: dst})
	JMP(end)

	Label(name + "move_3")
	MOVW(Mem{Base: src}, AX.As16())
	MOVB(Mem{Base: src, Disp: 2}, CX.As8())
	MOVW(AX.As16(), Mem{Base: dst})
	MOVB(CX.As8(), Mem{Base: dst, Disp: 2})
	JMP(end)

	Label(name + "move_5through7")
	MOVL(Mem{Base: src}, AX.As32())
	MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32())
	MOVL(AX.As32(), Mem{Base: dst})
	MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_8")
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ(Mem{Base: src}, AX)
	MOVQ(AX, Mem{Base: dst})
	JMP(end)

	// 9..256-byte cases: all loads complete before any store, so these are
	// safe even when the source and destination regions overlap.
	Label(name + "move_9through16")
	MOVQ(Mem{Base: src}, AX)
	MOVQ(Mem{Base: src, Disp: -8, Index: length, Scale: 1}, CX)
	MOVQ(AX, Mem{Base: dst})
	MOVQ(CX, Mem{Base: dst, Disp: -8, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_17through32")
	X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
	X8, X9, X10, X11, X12, X13, X14, X15 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()

	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X1)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_33through64")
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2)
	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: -32, Index: length, Scale: 1})
	MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
	JMP(end)

	Label(name + "move_65through128")
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: 32}, X2)
	MOVOU(Mem{Base: src, Disp: 48}, X3)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: 32})
	MOVOU(X3, Mem{Base: dst, Disp: 48})
	MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
	MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
	MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
	MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
	JMP(end)

	Label(name + "move_129through256")
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: 32}, X2)
	MOVOU(Mem{Base: src, Disp: 48}, X3)
	MOVOU(Mem{Base: src, Disp: 64}, X4)
	MOVOU(Mem{Base: src, Disp: 80}, X5)
	MOVOU(Mem{Base: src, Disp: 96}, X6)
	MOVOU(Mem{Base: src, Disp: 112}, X7)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -128}, X8)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -112}, X9)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -96}, X10)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -80}, X11)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: 32})
	MOVOU(X3, Mem{Base: dst, Disp: 48})
	MOVOU(X4, Mem{Base: dst, Disp: 64})
	MOVOU(X5, Mem{Base: dst, Disp: 80})
	MOVOU(X6, Mem{Base: dst, Disp: 96})
	MOVOU(X7, Mem{Base: dst, Disp: 112})
	MOVOU(X8, Mem{Base: dst, Index: length, Scale: 1, Disp: -128})
	MOVOU(X9, Mem{Base: dst, Index: length, Scale: 1, Disp: -112})
	MOVOU(X10, Mem{Base: dst, Index: length, Scale: 1, Disp: -96})
	MOVOU(X11, Mem{Base: dst, Index: length, Scale: 1, Disp: -80})
	MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
	MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
	MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
	MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
	JMP(end)

	Label(name + "move_256through2048")
	// Bulk forward loop: copies 256 bytes per iteration; loops while at
	// least 256 bytes remain, then falls back to `tail` for the rest.
	LEAQ(Mem{Base: length, Disp: -256}, length)
	MOVOU(Mem{Base: src}, X0)
	MOVOU(Mem{Base: src, Disp: 16}, X1)
	MOVOU(Mem{Base: src, Disp: 32}, X2)
	MOVOU(Mem{Base: src, Disp: 48}, X3)
	MOVOU(Mem{Base: src, Disp: 64}, X4)
	MOVOU(Mem{Base: src, Disp: 80}, X5)
	MOVOU(Mem{Base: src, Disp: 96}, X6)
	MOVOU(Mem{Base: src, Disp: 112}, X7)
	MOVOU(Mem{Base: src, Disp: 128}, X8)
	MOVOU(Mem{Base: src, Disp: 144}, X9)
	MOVOU(Mem{Base: src, Disp: 160}, X10)
	MOVOU(Mem{Base: src, Disp: 176}, X11)
	MOVOU(Mem{Base: src, Disp: 192}, X12)
	MOVOU(Mem{Base: src, Disp: 208}, X13)
	MOVOU(Mem{Base: src, Disp: 224}, X14)
	MOVOU(Mem{Base: src, Disp: 240}, X15)
	MOVOU(X0, Mem{Base: dst})
	MOVOU(X1, Mem{Base: dst, Disp: 16})
	MOVOU(X2, Mem{Base: dst, Disp: 32})
	MOVOU(X3, Mem{Base: dst, Disp: 48})
	MOVOU(X4, Mem{Base: dst, Disp: 64})
	MOVOU(X5, Mem{Base: dst, Disp: 80})
	MOVOU(X6, Mem{Base: dst, Disp: 96})
	MOVOU(X7, Mem{Base: dst, Disp: 112})
	MOVOU(X8, Mem{Base: dst, Disp: 128})
	MOVOU(X9, Mem{Base: dst, Disp: 144})
	MOVOU(X10, Mem{Base: dst, Disp: 160})
	MOVOU(X11, Mem{Base: dst, Disp: 176})
	MOVOU(X12, Mem{Base: dst, Disp: 192})
	MOVOU(X13, Mem{Base: dst, Disp: 208})
	MOVOU(X14, Mem{Base: dst, Disp: 224})
	MOVOU(X15, Mem{Base: dst, Disp: 240})
	CMPQ(length, U32(256))
	LEAQ(Mem{Base: src, Disp: 256}, src)
	LEAQ(Mem{Base: dst, Disp: 256}, dst)
	JGE(LabelRef(name + "move_256through2048"))
	JMP(LabelRef(name + "tail"))

	if avx {
		Label(name + "avxUnaligned")
		R8, R10 := GP64(), GP64()
		// There are two implementations of move algorithm.
		// The first one for non-overlapped memory regions. It uses forward copying.
		// We do not support overlapping input

		// Non-temporal copy would be better for big sizes.
		// Disabled since big copies are unlikely.
		// If enabling, test functionality.
		const enableBigData = false
		if enableBigData {
			// >= 1 MiB: use the non-temporal streaming path below.
			CMPQ(length, U32(0x100000))
			JAE(LabelRef(name + "gobble_big_data_fwd"))
		}

		// Memory layout on the source side
		// src                                 CX
		// |<---------length before correction--------->|
		// |       |<--length corrected-->|             |
		// |       |                  |<--- AX  --->|
		// |<-R11->|                  |<-128 bytes->|
		// +----------------------------------------+
		// | Head  | Body             | Tail        |
		// +-------+------------------+-------------+
		// ^       ^                  ^
		// |       |                  |
		// Save head into Y4          Save tail into X5..X12
		//         |
		//         src+R11, where R11 = ((dst & -32) + 32) - dst
		// Algorithm:
		// 1. Unaligned save of the tail's 128 bytes
		// 2. Unaligned save of the head's 32 bytes
		// 3. Destination-aligned copying of body (128 bytes per iteration)
		// 4. Put head on the new place
		// 5. Put the tail on the new place
		// It can be important to satisfy processor's pipeline requirements for
		// small sizes as the cost of unaligned memory region copying is
		// comparable with the cost of main loop. So code is slightly messed there.
		// There is more clean implementation of that algorithm for bigger sizes
		// where the cost of unaligned part copying is negligible.
		// You can see it after gobble_big_data_fwd label.
		Y0, Y1, Y2, Y3, Y4 := YMM(), YMM(), YMM(), YMM(), YMM()

		LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
		MOVQ(dst, R10)
		// CX points to the end of buffer so we need go back slightly. We will use negative offsets there.
		MOVOU(Mem{Base: CX, Disp: -0x80}, X5)
		MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
		MOVQ(U32(0x80), AX)

		// Align destination address
		ANDQ(U32(0xffffffe0), dst)
		ADDQ(U8(32), dst)
		// Continue tail saving.
		MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
		MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
		// Make R8 delta between aligned and unaligned destination addresses.
		MOVQ(dst, R8)
		SUBQ(R10, R8)
		// Continue tail saving.
		MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
		MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
		// Let's make bytes-to-copy value adjusted as we've prepared unaligned part for copying.
		SUBQ(R8, length)
		// Continue tail saving.
		MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
		MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
		// The tail will be put on its place after main body copying.
		// It's time for the unaligned heading part.
		VMOVDQU(Mem{Base: src}, Y4)
		// Adjust source address to point past head.
		ADDQ(R8, src)
		SUBQ(AX, length)

		// Aligned memory copying there
		Label(name + "gobble_128_loop")
		VMOVDQU(Mem{Base: src}, Y0)
		VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
		VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
		VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)
		ADDQ(AX, src)
		VMOVDQA(Y0, Mem{Base: dst})
		VMOVDQA(Y1, Mem{Base: dst, Disp: 0x20})
		VMOVDQA(Y2, Mem{Base: dst, Disp: 0x40})
		VMOVDQA(Y3, Mem{Base: dst, Disp: 0x60})
		ADDQ(AX, dst)
		SUBQ(AX, length)
		JA(LabelRef(name + "gobble_128_loop"))
		// Now we can store unaligned parts.
		ADDQ(AX, length)
		ADDQ(dst, length)
		VMOVDQU(Y4, Mem{Base: R10})
		VZEROUPPER()
		MOVOU(X5, Mem{Base: length, Disp: -0x80})
		MOVOU(X6, Mem{Base: length, Disp: -0x70})
		MOVOU(X7, Mem{Base: length, Disp: -0x60})
		MOVOU(X8, Mem{Base: length, Disp: -0x50})
		MOVOU(X9, Mem{Base: length, Disp: -0x40})
		MOVOU(X10, Mem{Base: length, Disp: -0x30})
		MOVOU(X11, Mem{Base: length, Disp: -0x20})
		MOVOU(X12, Mem{Base: length, Disp: -0x10})
		JMP(end)

		if enableBigData {
			Label(name + "gobble_big_data_fwd")
			// There is forward copying for big regions.
			// It uses non-temporal mov instructions.
			// Details of this algorithm are commented previously for small sizes.
1465 LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX) 1466 MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -0x80}, X5) 1467 MOVOU(Mem{Base: CX, Disp: -0x70}, X6) 1468 MOVOU(Mem{Base: CX, Disp: -0x60}, X7) 1469 MOVOU(Mem{Base: CX, Disp: -0x50}, X8) 1470 MOVOU(Mem{Base: CX, Disp: -0x40}, X9) 1471 MOVOU(Mem{Base: CX, Disp: -0x30}, X10) 1472 MOVOU(Mem{Base: CX, Disp: -0x20}, X11) 1473 MOVOU(Mem{Base: CX, Disp: -0x10}, X12) 1474 VMOVDQU(Mem{Base: src}, Y4) 1475 MOVQ(dst, R8) 1476 1477 ANDQ(U32(0xffffffe0), dst) 1478 ADDQ(U8(32), dst) 1479 1480 MOVQ(dst, R10) 1481 SUBQ(R8, R10) 1482 SUBQ(R10, length) 1483 ADDQ(R10, src) 1484 LEAQ(Mem{Base: dst, Index: length, Scale: 1}, CX) 1485 SUBQ(U8(0x80), length) 1486 1487 Label(name + "gobble_mem_fwd_loop") 1488 PREFETCHNTA(Mem{Base: src, Disp: 0x1c0}) 1489 PREFETCHNTA(Mem{Base: src, Disp: 0x280}) 1490 // Prefetch values were chosen empirically. 1491 // Approach for prefetch usage as in 7.6.6 of [1] 1492 // [1] 64-ia-32-architectures-optimization-manual.pdf 1493 // https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf 1494 VMOVDQU(Mem{Base: src}, Y0) 1495 VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1) 1496 VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2) 1497 VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3) 1498 1499 ADDQ(U8(0x80), src) 1500 VMOVNTDQ(Y0, Mem{Base: dst}) 1501 VMOVNTDQ(Y1, Mem{Base: dst, Disp: 0x20}) 1502 VMOVNTDQ(Y2, Mem{Base: dst, Disp: 0x20}) 1503 VMOVNTDQ(Y3, Mem{Base: dst, Disp: 0x60}) 1504 ADDQ(U8(0x80), dst) 1505 SUBQ(U8(0x80), length) 1506 JA(LabelRef(name + "gobble_mem_fwd_loop")) 1507 // NT instructions don't follow the normal cache-coherency rules. 1508 // We need SFENCE there to make copied data available timely. 
			// Flush non-temporal stores, then place head and tail.
			SFENCE()
			VMOVDQU(Y4, Mem{Base: R8})
			VZEROUPPER()
			MOVOU(X5, Mem{Base: CX, Disp: -0x80})
			MOVOU(X6, Mem{Base: CX, Disp: -0x70})
			MOVOU(X7, Mem{Base: CX, Disp: -0x60})
			MOVOU(X8, Mem{Base: CX, Disp: -0x50})
			MOVOU(X9, Mem{Base: CX, Disp: -0x40})
			MOVOU(X10, Mem{Base: CX, Disp: -0x30})
			MOVOU(X11, Mem{Base: CX, Disp: -0x20})
			MOVOU(X12, Mem{Base: CX, Disp: -0x10})
			JMP(end)
		}
	}
}

// genMatchLen generates standalone matchLen.
func genMatchLen() {
	TEXT("matchLen", NOSPLIT, "func(a, b []byte) int")
	Doc("matchLen returns how many bytes match in a and b", "",
		"It assumes that:",
		" len(a) <= len(b)", "")
	Pragma("noescape")

	aBase, bBase, length := GP64(), GP64(), GP64()

	Load(Param("a").Base(), aBase)
	Load(Param("b").Base(), bBase)
	Load(Param("a").Len(), length)
	l := matchLen("standalone", Mem{Base: aBase}, Mem{Base: bBase}, length, LabelRef("gen_match_len_end"))
	Label("gen_match_len_end")
	Store(l, ReturnIndex(0))
	RET()
}

// matchLen returns the number of matching bytes of a and b.
// len is the maximum number of bytes to match.
// Will jump to end when done and returns the length.
// Uses 2 GP registers.
func matchLen(name string, a, b Mem, len reg.GPVirtual, end LabelRef) reg.GPVirtual {
	tmp, matched := GP64(), GP64()
	XORQ(matched, matched)

	CMPQ(len, U8(8))
	JL(LabelRef("matchlen_single_" + name))

	// Compare 8 bytes at a time by XOR-ing quadwords; a zero result
	// means all 8 bytes matched.
	Label("matchlen_loopback_" + name)
	MOVQ(Mem{Base: a.Base, Index: matched, Scale: 1}, tmp)
	XORQ(Mem{Base: b.Base, Index: matched, Scale: 1}, tmp)
	TESTQ(tmp, tmp)
	JZ(LabelRef("matchlen_loop_" + name))
	// Not all match.
	// BSF finds the lowest differing bit; >>3 converts it to whole
	// matching bytes, which are added to the running total.
	BSFQ(tmp, tmp)
	SARQ(U8(3), tmp)
	LEAQ(Mem{Base: matched, Index: tmp, Scale: 1}, matched)
	JMP(end)

	// All 8 byte matched, update and loop.
	Label("matchlen_loop_" + name)
	LEAQ(Mem{Base: len, Disp: -8}, len)
	LEAQ(Mem{Base: matched, Disp: 8}, matched)
	CMPQ(len, U8(8))
	JGE(LabelRef("matchlen_loopback_" + name))

	// Less than 8 bytes left.
	Label("matchlen_single_" + name)
	TESTQ(len, len)
	JZ(end)
	Label("matchlen_single_loopback_" + name)
	MOVB(Mem{Base: a.Base, Index: matched, Scale: 1}, tmp.As8())
	CMPB(Mem{Base: b.Base, Index: matched, Scale: 1}, tmp.As8())
	JNE(end)
	LEAQ(Mem{Base: matched, Disp: 1}, matched)
	DECQ(len)
	JNZ(LabelRef("matchlen_single_loopback_" + name))
	JMP(end)
	return matched
}