1package main
2
3//go:generate go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2
4
5import (
6	"fmt"
7	"math"
8	"runtime"
9
10	. "github.com/mmcloughlin/avo/build"
11	"github.com/mmcloughlin/avo/buildtags"
12	. "github.com/mmcloughlin/avo/operand"
13	"github.com/mmcloughlin/avo/reg"
14)
15
16// insert extra checks here and there.
17const debug = false
18
19const (
20	limit14B = math.MaxUint32
21	// Use 12 bit table when no more than...
22	limit12B = 16<<10 - 1
23	// Use 10 bit table when no more than...
24	limit10B = 4<<10 - 1
25	// Use 8 bit table when no more than...
26	limit8B = 512 - 1
27)
28
29func main() {
30	Constraint(buildtags.Not("appengine").ToConstraint())
31	Constraint(buildtags.Not("noasm").ToConstraint())
32	Constraint(buildtags.Term("gc").ToConstraint())
33
34	o := options{
35		snappy: false,
36	}
37	o.genEncodeBlockAsm("encodeBlockAsm", 14, 6, 6, limit14B)
38	o.genEncodeBlockAsm("encodeBlockAsm12B", 12, 5, 5, limit12B)
39	o.genEncodeBlockAsm("encodeBlockAsm10B", 10, 5, 4, limit10B)
40	o.genEncodeBlockAsm("encodeBlockAsm8B", 8, 4, 4, limit8B)
41
42	// Snappy compatible
43	o.snappy = true
44	o.genEncodeBlockAsm("encodeSnappyBlockAsm", 14, 6, 6, limit14B)
45	o.genEncodeBlockAsm("encodeSnappyBlockAsm12B", 12, 5, 5, limit12B)
46	o.genEncodeBlockAsm("encodeSnappyBlockAsm10B", 10, 5, 4, limit10B)
47	o.genEncodeBlockAsm("encodeSnappyBlockAsm8B", 8, 4, 4, limit8B)
48
49	o.snappy = false
50	o.maxLen = math.MaxUint32
51	o.genEmitLiteral()
52	o.genEmitRepeat()
53	o.genEmitCopy()
54	o.snappy = true
55	o.genEmitCopyNoRepeat()
56	o.snappy = false
57	o.genMatchLen()
58	Generate()
59}
60
61func debugval(v Op) {
62	value := reg.R15
63	MOVQ(v, value)
64	INT(Imm(3))
65}
66
67func debugval32(v Op) {
68	value := reg.R15L
69	MOVL(v, value)
70	INT(Imm(3))
71}
72
73var assertCounter int
74
75// assert will insert code if debug is enabled.
76// The code should jump to 'ok' is assertion is success.
77func assert(fn func(ok LabelRef)) {
78	if debug {
79		caller := [100]uintptr{0}
80		runtime.Callers(2, caller[:])
81		frame, _ := runtime.CallersFrames(caller[:]).Next()
82
83		ok := fmt.Sprintf("assert_check_%d_ok_srcline_%d", assertCounter, frame.Line)
84		fn(LabelRef(ok))
85		// Emit several since delve is imprecise.
86		INT(Imm(3))
87		INT(Imm(3))
88		Label(ok)
89		assertCounter++
90	}
91}
92
93type options struct {
94	snappy bool
95	vmbi2  bool
96	maxLen int
97}
98
99func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, maxLen int) {
100	TEXT(name, 0, "func(dst, src []byte) int")
101	Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.",
102		fmt.Sprintf("Maximum input %d bytes.", maxLen),
103		"It assumes that the varint-encoded length of the decompressed bytes has already been written.", "")
104	Pragma("noescape")
105
106	const literalMaxOverhead = 4
107	o.maxLen = maxLen
108
109	var tableSize = 4 * (1 << tableBits)
110	// Memzero needs at least 128 bytes.
111	if tableSize < 128 {
112		panic("tableSize must be at least 128 bytes")
113	}
114
115	lenSrcBasic, err := Param("src").Len().Resolve()
116	if err != nil {
117		panic(err)
118	}
119	lenSrcQ := lenSrcBasic.Addr
120
121	lenDstBasic, err := Param("dst").Len().Resolve()
122	if err != nil {
123		panic(err)
124	}
125	lenDstQ := lenDstBasic.Addr
126
127	// Bail if we can't compress to at least this.
128	dstLimitPtrQ := AllocLocal(8)
129
130	// sLimitL is when to stop looking for offset/length copies.
131	sLimitL := AllocLocal(4)
132
133	// nextEmitL keeps track of the point we have emitted to.
134	nextEmitL := AllocLocal(4)
135
136	// Repeat stores the last match offset.
137	repeatL := AllocLocal(4)
138
139	// nextSTempL keeps nextS while other functions are being called.
140	nextSTempL := AllocLocal(4)
141
142	// Alloc table last
143	table := AllocLocal(tableSize)
144
145	dst := GP64()
146	{
147		dstBaseBasic, err := Param("dst").Base().Resolve()
148		if err != nil {
149			panic(err)
150		}
151		dstBaseQ := dstBaseBasic.Addr
152		MOVQ(dstBaseQ, dst)
153	}
154
155	srcBaseBasic, err := Param("src").Base().Resolve()
156	if err != nil {
157		panic(err)
158	}
159	srcBaseQ := srcBaseBasic.Addr
160
161	// Zero table
162	{
163		iReg := GP64()
164		MOVQ(U32(tableSize/8/16), iReg)
165		tablePtr := GP64()
166		LEAQ(table, tablePtr)
167		zeroXmm := XMM()
168		PXOR(zeroXmm, zeroXmm)
169
170		Label("zero_loop_" + name)
171		for i := 0; i < 8; i++ {
172			MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16})
173		}
174		ADDQ(U8(16*8), tablePtr)
175		DECQ(iReg)
176		JNZ(LabelRef("zero_loop_" + name))
177	}
178
179	{
180		// nextEmit is offset n src where the next emitLiteral should start from.
181		MOVL(U32(0), nextEmitL)
182
183		const inputMargin = 8
184		tmp, tmp2, tmp3 := GP64(), GP64(), GP64()
185		MOVQ(lenSrcQ, tmp)
186		LEAQ(Mem{Base: tmp, Disp: -5}, tmp2)
187		// sLimitL := len(src) - inputMargin
188		LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3)
189
190		assert(func(ok LabelRef) {
191			CMPQ(tmp3, lenSrcQ)
192			JL(ok)
193		})
194
195		MOVL(tmp3.As32(), sLimitL)
196
197		// dstLimit := (len(src) - 5 ) - len(src)>>5
198		SHRQ(U8(5), tmp)
199		SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp
200
201		assert(func(ok LabelRef) {
202			// if len(src) > len(src) - len(src)>>5 - 5: ok
203			CMPQ(lenSrcQ, tmp2)
204			JGE(ok)
205		})
206
207		LEAQ(Mem{Base: dst, Index: tmp2, Scale: 1}, tmp2)
208		MOVQ(tmp2, dstLimitPtrQ)
209	}
210
211	// s = 1
212	s := GP32()
213	MOVL(U32(1), s)
214	// repeatL = 1
215	MOVL(s, repeatL)
216
217	src := GP64()
218	Load(Param("src").Base(), src)
219
220	// Load cv
221	Label("search_loop_" + name)
222	candidate := GP32()
223	{
224		assert(func(ok LabelRef) {
225			// Check if somebody changed src
226			tmp := GP64()
227			MOVQ(srcBaseQ, tmp)
228			CMPQ(tmp, src)
229			JEQ(ok)
230		})
231
232		cv := GP64()
233		MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv)
234		nextS := GP32()
235		// nextS := s + (s-nextEmit)>>6 + 4
236		{
237			tmp := GP64()
238			MOVL(s, tmp.As32())           // tmp = s
239			SUBL(nextEmitL, tmp.As32())   // tmp = s - nextEmit
240			SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog
241			LEAL(Mem{Base: s, Disp: 4, Index: tmp, Scale: 1}, nextS)
242		}
243		// if nextS > sLimit {goto emitRemainder}
244		{
245			CMPL(nextS.As32(), sLimitL)
246			JGE(LabelRef("emit_remainder_" + name))
247		}
248		assert(func(ok LabelRef) {
249			// Check if s is valid (we should have jumped above if not)
250			tmp := GP64()
251			MOVQ(lenSrcQ, tmp)
252			CMPQ(tmp, s.As64())
253			JG(ok)
254		})
255		// move nextS to stack.
256		MOVL(nextS.As32(), nextSTempL)
257
258		candidate2 := GP32()
259		hasher := hashN(hashBytes, tableBits)
260		{
261			hash0, hash1 := GP64(), GP64()
262			MOVQ(cv, hash0)
263			MOVQ(cv, hash1)
264			SHRQ(U8(8), hash1)
265			hasher.hash(hash0)
266			hasher.hash(hash1)
267			MOVL(table.Idx(hash0, 4), candidate)
268			MOVL(table.Idx(hash1, 4), candidate2)
269			assert(func(ok LabelRef) {
270				CMPQ(hash0, U32(tableSize))
271				JL(ok)
272			})
273			assert(func(ok LabelRef) {
274				CMPQ(hash1, U32(tableSize))
275				JL(ok)
276			})
277
278			MOVL(s, table.Idx(hash0, 4))
279			tmp := GP32()
280			LEAL(Mem{Base: s, Disp: 1}, tmp)
281			MOVL(tmp, table.Idx(hash1, 4))
282		}
283
284		// Can be moved up if registers are available.
285		hash2 := GP64()
286		{
287			// hash2 := hash6(cv>>16, tableBits)
288			// hasher = hash6(tableBits)
289			MOVQ(cv, hash2)
290			SHRQ(U8(16), hash2)
291			hasher.hash(hash2)
292			assert(func(ok LabelRef) {
293				CMPQ(hash2, U32(tableSize))
294				JL(ok)
295			})
296		}
297
298		// En/disable repeat matching.
299		if true {
300			// Check repeat at offset checkRep
301			const checkRep = 1
302			{
303				// rep = s - repeat
304				rep := GP32()
305				MOVL(s, rep)
306				SUBL(repeatL, rep) // rep = s - repeat
307
308				// if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
309				left, right := GP64(), GP64()
310				MOVL(Mem{Base: src, Index: rep, Disp: checkRep, Scale: 1}, right.As32())
311				MOVQ(cv, left)
312				SHRQ(U8(checkRep*8), left)
313				CMPL(left.As32(), right.As32())
314				// BAIL, no repeat.
315				JNE(LabelRef("no_repeat_found_" + name))
316			}
317			// base = s + checkRep
318			base := GP32()
319			LEAL(Mem{Base: s, Disp: checkRep}, base)
320
321			// nextEmit before repeat.
322			nextEmit := GP32()
323			MOVL(nextEmitL, nextEmit)
324
325			// Extend back
326			if true {
327				i := GP32()
328				MOVL(base, i)
329				SUBL(repeatL, i)
330				JZ(LabelRef("repeat_extend_back_end_" + name))
331
332				Label("repeat_extend_back_loop_" + name)
333				// if base <= nextemit {exit}
334				CMPL(base.As32(), nextEmit)
335				JLE(LabelRef("repeat_extend_back_end_" + name))
336				// if src[i-1] == src[base-1]
337				tmp, tmp2 := GP64(), GP64()
338				MOVB(Mem{Base: src, Index: i, Scale: 1, Disp: -1}, tmp.As8())
339				MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8())
340				CMPB(tmp.As8(), tmp2.As8())
341				JNE(LabelRef("repeat_extend_back_end_" + name))
342				LEAL(Mem{Base: base, Disp: -1}, base)
343				DECL(i)
344				JNZ(LabelRef("repeat_extend_back_loop_" + name))
345			}
346			Label("repeat_extend_back_end_" + name)
347
348			// Base is now at start. Emit until base.
349			// d += emitLiteral(dst[d:], src[nextEmit:base])
350			if true {
351				o.emitLiteralsDstP(nextEmitL, base, src, dst, "repeat_emit_"+name)
352			}
353
354			// Extend forward
355			{
356				// s += 4 + checkRep
357				ADDL(U8(4+checkRep), s)
358
359				if true {
360					// candidate := s - repeat + 4 + checkRep
361					MOVL(s, candidate)
362					SUBL(repeatL, candidate) // candidate = s - repeat
363
364					// srcLeft = len(src) - s
365					srcLeft := GP64()
366					MOVQ(lenSrcQ, srcLeft)
367					SUBL(s, srcLeft.As32())
368					assert(func(ok LabelRef) {
369						// if srcleft < maxint32: ok
370						CMPQ(srcLeft, U32(0x7fffffff))
371						JL(ok)
372					})
373					// Forward address
374					forwardStart := GP64()
375					LEAQ(Mem{Base: src, Index: s, Scale: 1}, forwardStart)
376					// End address
377					backStart := GP64()
378					LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, backStart)
379
380					length := o.matchLen("repeat_extend_"+name, forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name))
381					forwardStart, backStart, srcLeft = nil, nil, nil
382					Label("repeat_extend_forward_end_" + name)
383					// s+= length
384					ADDL(length.As32(), s)
385				}
386			}
387			// Emit
388			if true {
389				// length = s-base
390				length := GP32()
391				MOVL(s, length)
392				SUBL(base.As32(), length) // length = s - base
393
394				offsetVal := GP32()
395				MOVL(repeatL, offsetVal)
396
397				if !o.snappy {
398					// if nextEmit == 0 {do copy instead...}
399					TESTL(nextEmit, nextEmit)
400					JZ(LabelRef("repeat_as_copy_" + name))
401
402					// Emit as repeat...
403					o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))
404
405					// Emit as copy instead...
406					Label("repeat_as_copy_" + name)
407				}
408				o.emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))
409
410				Label("repeat_end_emit_" + name)
411				// Store new dst and nextEmit
412				MOVL(s, nextEmitL)
413			}
414			// if s >= sLimit is picked up on next loop.
415			if false {
416				CMPL(s.As32(), sLimitL)
417				JGE(LabelRef("emit_remainder_" + name))
418			}
419			JMP(LabelRef("search_loop_" + name))
420		}
421		Label("no_repeat_found_" + name)
422		{
423			// Check candidates are ok. All must be < s and < len(src)
424			assert(func(ok LabelRef) {
425				tmp := GP64()
426				MOVQ(lenSrcQ, tmp)
427				CMPL(tmp.As32(), candidate)
428				JG(ok)
429			})
430			assert(func(ok LabelRef) {
431				CMPL(s, candidate)
432				JG(ok)
433			})
434			assert(func(ok LabelRef) {
435				tmp := GP64()
436				MOVQ(lenSrcQ, tmp)
437				CMPL(tmp.As32(), candidate2)
438				JG(ok)
439			})
440			assert(func(ok LabelRef) {
441				CMPL(s, candidate2)
442				JG(ok)
443			})
444
445			CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
446			JEQ(LabelRef("candidate_match_" + name))
447
448			tmp := GP32()
449			// cv >>= 8
450			SHRQ(U8(8), cv)
451
452			// candidate = int(table[hash2]) - load early.
453			MOVL(table.Idx(hash2, 4), candidate)
454			assert(func(ok LabelRef) {
455				tmp := GP64()
456				MOVQ(lenSrcQ, tmp)
457				CMPL(tmp.As32(), candidate)
458				JG(ok)
459			})
460			assert(func(ok LabelRef) {
461				// We may get s and s+1
462				tmp := GP32()
463				LEAL(Mem{Base: s, Disp: 2}, tmp)
464				CMPL(tmp, candidate)
465				JG(ok)
466			})
467
468			LEAL(Mem{Base: s, Disp: 2}, tmp)
469
470			//if uint32(cv>>8) == load32(src, candidate2)
471			CMPL(Mem{Base: src, Index: candidate2, Scale: 1}, cv.As32())
472			JEQ(LabelRef("candidate2_match_" + name))
473
474			// table[hash2] = uint32(s + 2)
475			MOVL(tmp, table.Idx(hash2, 4))
476
477			// cv >>= 8 (>> 16 total)
478			SHRQ(U8(8), cv)
479
480			// if uint32(cv>>16) == load32(src, candidate)
481			CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
482			JEQ(LabelRef("candidate3_match_" + name))
483
484			// No match found, next loop
485			// s = nextS
486			MOVL(nextSTempL, s)
487			JMP(LabelRef("search_loop_" + name))
488
489			// Matches candidate at s + 2 (3rd check)
490			Label("candidate3_match_" + name)
491			ADDL(U8(2), s)
492			JMP(LabelRef("candidate_match_" + name))
493
494			// Match at s + 1 (we calculated the hash, lets store it)
495			Label("candidate2_match_" + name)
496			// table[hash2] = uint32(s + 2)
497			MOVL(tmp, table.Idx(hash2, 4))
498			// s++
499			INCL(s)
500			MOVL(candidate2, candidate)
501		}
502	}
503
504	Label("candidate_match_" + name)
505	// We have a match at 's' with src offset in "candidate" that matches at least 4 bytes.
506	// Extend backwards
507	if true {
508		ne := GP32()
509		MOVL(nextEmitL, ne)
510		TESTL(candidate, candidate)
511		JZ(LabelRef("match_extend_back_end_" + name))
512
513		// candidate is tested when decremented, so we loop back here.
514		Label("match_extend_back_loop_" + name)
515		// if s <= nextEmit {exit}
516		CMPL(s, ne)
517		JLE(LabelRef("match_extend_back_end_" + name))
518		// if src[candidate-1] == src[s-1]
519		tmp, tmp2 := GP64(), GP64()
520		MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8())
521		MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8())
522		CMPB(tmp.As8(), tmp2.As8())
523		JNE(LabelRef("match_extend_back_end_" + name))
524		LEAL(Mem{Base: s, Disp: -1}, s)
525		DECL(candidate)
526		JZ(LabelRef("match_extend_back_end_" + name))
527		JMP(LabelRef("match_extend_back_loop_" + name))
528	}
529	Label("match_extend_back_end_" + name)
530
531	// Bail if we exceed the maximum size.
532	if true {
533		// tmp = s-nextEmit
534		tmp := GP64()
535		MOVL(s, tmp.As32())
536		SUBL(nextEmitL, tmp.As32())
537		// tmp = &dst + s-nextEmit
538		LEAQ(Mem{Base: dst, Index: tmp, Scale: 1, Disp: literalMaxOverhead}, tmp)
539		CMPQ(tmp, dstLimitPtrQ)
540		JL(LabelRef("match_dst_size_check_" + name))
541		ri, err := ReturnIndex(0).Resolve()
542		if err != nil {
543			panic(err)
544		}
545		MOVQ(U32(0), ri.Addr)
546		RET()
547	}
548	Label("match_dst_size_check_" + name)
549	{
550		base := GP32()
551		MOVL(s, base.As32())
552		o.emitLiteralsDstP(nextEmitL, base, src, dst, "match_emit_"+name)
553	}
554	cv := GP64()
555	Label("match_nolit_loop_" + name)
556	{
557		// Update repeat
558		{
559			// repeat = base - candidate
560			repeatVal := GP64().As32()
561			MOVL(s, repeatVal)
562			SUBL(candidate, repeatVal)
563			MOVL(repeatVal, repeatL)
564		}
565		// s+=4, candidate+=4
566		ADDL(U8(4), s)
567		ADDL(U8(4), candidate)
568		// Extend the 4-byte match as long as possible and emit copy.
569		{
570			assert(func(ok LabelRef) {
571				// s must be > candidate cannot be equal.
572				CMPL(s, candidate)
573				JG(ok)
574			})
575			// srcLeft = len(src) - s
576			srcLeft := GP64()
577			MOVQ(lenSrcQ, srcLeft)
578			SUBL(s, srcLeft.As32())
579			assert(func(ok LabelRef) {
580				// if srcleft < maxint32: ok
581				CMPQ(srcLeft, U32(0x7fffffff))
582				JL(ok)
583			})
584
585			a, b := GP64(), GP64()
586			LEAQ(Mem{Base: src, Index: s, Scale: 1}, a)
587			LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, b)
588			length := o.matchLen("match_nolit_"+name,
589				a, b,
590				srcLeft,
591				LabelRef("match_nolit_end_"+name),
592			)
593			Label("match_nolit_end_" + name)
594			assert(func(ok LabelRef) {
595				CMPL(length.As32(), U32(math.MaxInt32))
596				JL(ok)
597			})
598			a, b, srcLeft = nil, nil, nil
599
600			// s += length (length is destroyed, use it now)
601			ADDL(length.As32(), s)
602
603			// Load offset from repeat value.
604			offset := GP64()
605			MOVL(repeatL, offset.As32())
606
607			// length += 4
608			ADDL(U8(4), length.As32())
609			MOVL(s, nextEmitL) // nextEmit = s
610			o.emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name))
611			Label("match_nolit_emitcopy_end_" + name)
612
613			// if s >= sLimit { end }
614			{
615				CMPL(s.As32(), sLimitL)
616				JGE(LabelRef("emit_remainder_" + name))
617			}
618			// Start load s-2 as early as possible...
619			MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, cv)
620			// Bail if we exceed the maximum size.
621			{
622				CMPQ(dst, dstLimitPtrQ)
623				JL(LabelRef("match_nolit_dst_ok_" + name))
624				ri, err := ReturnIndex(0).Resolve()
625				if err != nil {
626					panic(err)
627				}
628				MOVQ(U32(0), ri.Addr)
629				RET()
630				Label("match_nolit_dst_ok_" + name)
631			}
632		}
633		// cv must be set to value at s-2 before arriving here
634		{
635			// Check for an immediate match, otherwise start search at s+1
636			// Index s-2
637			hasher := hashN(hashBytes, tableBits)
638			hash0, hash1 := GP64(), GP64()
639			MOVQ(cv, hash0) // src[s-2]
640			SHRQ(U8(16), cv)
641			MOVQ(cv, hash1) // src[s]
642			hasher.hash(hash0)
643			hasher.hash(hash1)
644			sm2 := GP32() // s - 2
645			LEAL(Mem{Base: s, Disp: -2}, sm2)
646			assert(func(ok LabelRef) {
647				CMPQ(hash0, U32(tableSize))
648				JL(ok)
649			})
650			assert(func(ok LabelRef) {
651				CMPQ(hash1, U32(tableSize))
652				JL(ok)
653			})
654			addr := GP64()
655			LEAQ(table.Idx(hash1, 4), addr)
656			MOVL(Mem{Base: addr}, candidate)
657			MOVL(sm2, table.Idx(hash0, 4))
658			MOVL(s, Mem{Base: addr})
659			CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
660			JEQ(LabelRef("match_nolit_loop_" + name))
661			INCL(s)
662		}
663		JMP(LabelRef("search_loop_" + name))
664	}
665
666	Label("emit_remainder_" + name)
667	// Bail if we exceed the maximum size.
668	// if d+len(src)-nextEmitL > dstLimitPtrQ {	return 0
669	{
670		// remain = len(src) - nextEmit
671		remain := GP64()
672		MOVQ(lenSrcQ, remain)
673		SUBL(nextEmitL, remain.As32())
674
675		dstExpect := GP64()
676		// dst := dst + (len(src)-nextEmitL)
677
678		LEAQ(Mem{Base: dst, Index: remain, Scale: 1, Disp: literalMaxOverhead}, dstExpect)
679		CMPQ(dstExpect, dstLimitPtrQ)
680		JL(LabelRef("emit_remainder_ok_" + name))
681		ri, err := ReturnIndex(0).Resolve()
682		if err != nil {
683			panic(err)
684		}
685		MOVQ(U32(0), ri.Addr)
686		RET()
687		Label("emit_remainder_ok_" + name)
688	}
689	// emitLiteral(dst[d:], src[nextEmitL:])
690	emitEnd := GP64()
691	MOVQ(lenSrcQ, emitEnd)
692
693	// Emit final literals.
694	o.emitLiteralsDstP(nextEmitL, emitEnd, src, dst, "emit_remainder_"+name)
695
696	// Assert size is < limit
697	assert(func(ok LabelRef) {
698		// if dstBaseQ <  dstLimitPtrQ: ok
699		CMPQ(dst, dstLimitPtrQ)
700		JL(ok)
701	})
702
703	// length := start - base (ptr arithmetic)
704	length := GP64()
705	base := Load(Param("dst").Base(), GP64())
706	MOVQ(dst, length)
707	SUBQ(base, length)
708
709	// Assert size is < len(src)
710	assert(func(ok LabelRef) {
711		// if len(src) >= length: ok
712		CMPQ(lenSrcQ, length)
713		JGE(ok)
714	})
715	// Assert size is < len(dst)
716	assert(func(ok LabelRef) {
717		// if len(dst) >= length: ok
718		CMPQ(lenDstQ, length)
719		JGE(ok)
720	})
721	Store(length, ReturnIndex(0))
722	RET()
723}
724
725// emitLiterals emits literals from nextEmit to base, updates nextEmit, dstBase.
726// Checks if base == nextemit.
727// src & base are untouched.
728func (o options) emitLiterals(nextEmitL Mem, base reg.GPVirtual, src reg.GPVirtual, dstBase Mem, name string) {
729	nextEmit, litLen, dstBaseTmp, litBase := GP32(), GP32(), GP64(), GP64()
730	MOVL(nextEmitL, nextEmit)
731	CMPL(nextEmit, base.As32())
732	JEQ(LabelRef("emit_literal_skip_" + name))
733	MOVL(base.As32(), litLen.As32())
734
735	// Base is now next emit.
736	MOVL(base.As32(), nextEmitL)
737
738	// litBase = src[nextEmitL:]
739	LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase)
740	SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit
741
742	// Load (and store when we return)
743	MOVQ(dstBase, dstBaseTmp)
744	o.emitLiteral(name, litLen, nil, dstBaseTmp, litBase, LabelRef("emit_literal_done_"+name), true)
745	Label("emit_literal_done_" + name)
746
747	// Emitted length must be > litlen.
748	// We have already checked for len(0) above.
749	assert(func(ok LabelRef) {
750		tmp := GP64()
751		MOVQ(dstBaseTmp, tmp)
752		SUBQ(dstBase, tmp) // tmp = dstBaseTmp - dstBase
753		// if tmp > litLen: ok
754		CMPQ(tmp, litLen.As64())
755		JG(ok)
756	})
757	// Store updated dstBase
758	MOVQ(dstBaseTmp, dstBase)
759	Label("emit_literal_skip_" + name)
760}
761
762// emitLiterals emits literals from nextEmit to base, updates nextEmit, dstBase.
763// Checks if base == nextemit.
764// src & base are untouched.
765func (o options) emitLiteralsDstP(nextEmitL Mem, base reg.GPVirtual, src, dst reg.GPVirtual, name string) {
766	nextEmit, litLen, litBase := GP32(), GP32(), GP64()
767	MOVL(nextEmitL, nextEmit)
768	CMPL(nextEmit, base.As32())
769	JEQ(LabelRef("emit_literal_done_" + name))
770	MOVL(base.As32(), litLen.As32())
771
772	// Base is now next emit.
773	MOVL(base.As32(), nextEmitL)
774
775	// litBase = src[nextEmitL:]
776	LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase)
777	SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit
778
779	// Load (and store when we return)
780	o.emitLiteral(name, litLen, nil, dst, litBase, LabelRef("emit_literal_done_"+name), true)
781	Label("emit_literal_done_" + name)
782}
783
784type hashGen struct {
785	bytes     int
786	tablebits int
787	mulreg    reg.GPVirtual
788}
789
790// hashN uses multiply to get a 'output' hash on the hash of the lowest 'bytes' bytes in value.
791func hashN(hashBytes, tablebits int) hashGen {
792	h := hashGen{
793		bytes:     hashBytes,
794		tablebits: tablebits,
795		mulreg:    GP64(),
796	}
797	primebytes := uint64(0)
798	switch hashBytes {
799	case 3:
800		primebytes = 506832829
801	case 4:
802		primebytes = 2654435761
803	case 5:
804		primebytes = 889523592379
805	case 6:
806		primebytes = 227718039650203
807	case 7:
808		primebytes = 58295818150454627
809	case 8:
810		primebytes = 0xcf1bbcdcb7a56463
811	default:
812		panic("invalid hash length")
813	}
814	MOVQ(Imm(primebytes), h.mulreg)
815	return h
816}
817
818// hash uses multiply to get hash of the value.
819func (h hashGen) hash(val reg.GPVirtual) {
820	// Move value to top of register.
821	SHLQ(U8(64-8*h.bytes), val)
822	IMULQ(h.mulreg, val)
823	// Move value to bottom
824	SHRQ(U8(64-h.tablebits), val)
825}
826
827func (o options) genEmitLiteral() {
828	TEXT("emitLiteral", NOSPLIT, "func(dst, lit []byte) int")
829	Doc("emitLiteral writes a literal chunk and returns the number of bytes written.", "",
830		"It assumes that:",
831		"  dst is long enough to hold the encoded bytes",
832		"  0 <= len(lit) && len(lit) <= math.MaxUint32", "")
833	Pragma("noescape")
834
835	dstBase, litBase, litLen, retval := GP64(), GP64(), GP64(), GP64()
836	Load(Param("lit").Len(), litLen)
837	Load(Param("dst").Base(), dstBase)
838	Load(Param("lit").Base(), litBase)
839	TESTQ(litLen, litLen)
840	JZ(LabelRef("emit_literal_end_standalone_skip"))
841	o.emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_standalone", false)
842
843	Label("emit_literal_end_standalone_skip")
844	XORQ(retval, retval)
845
846	Label("emit_literal_end_standalone")
847	Store(retval, ReturnIndex(0))
848	RET()
849
850}
851
852// emitLiteral can be used for inlining an emitLiteral call.
853// litLen must be > 0.
854// stack must have at least 32 bytes.
855// retval will contain emitted bytes, but can be nil if this is not interesting.
856// dstBase and litBase are updated.
857// Uses 2 GP registers. With AVX 4 registers.
858// If updateDst is true dstBase will have the updated end pointer and an additional register will be used.
859func (o options) emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, updateDst bool) {
860	n := GP32()
861	n16 := GP32()
862
863	// litLen must be > 0
864	assert(func(ok LabelRef) {
865		TESTL(litLen.As32(), litLen.As32())
866		JNZ(ok)
867	})
868
869	// We always add litLen bytes
870	if retval != nil {
871		MOVL(litLen.As32(), retval.As32())
872	}
873	// n = litlen - 1
874	LEAL(Mem{Base: litLen.As32(), Disp: -1}, n)
875
876	// Find number of bytes to emit for tag.
877	CMPL(n.As32(), U8(60))
878	JLT(LabelRef("one_byte_" + name))
879	CMPL(n.As32(), U32(1<<8))
880	JLT(LabelRef("two_bytes_" + name))
881	if o.maxLen >= 1<<16 {
882		CMPL(n.As32(), U32(1<<16))
883		JLT(LabelRef("three_bytes_" + name))
884	} else {
885		JMP(LabelRef("three_bytes_" + name))
886	}
887	if o.maxLen >= 1<<16 {
888		if o.maxLen >= 1<<24 {
889			CMPL(n.As32(), U32(1<<24))
890			JLT(LabelRef("four_bytes_" + name))
891		} else {
892			JMP(LabelRef("four_bytes_" + name))
893		}
894	}
895	if o.maxLen >= 1<<24 {
896		Label("five_bytes_" + name)
897		MOVB(U8(252), Mem{Base: dstBase})
898		MOVL(n.As32(), Mem{Base: dstBase, Disp: 1})
899		if retval != nil {
900			ADDQ(U8(5), retval)
901		}
902		ADDQ(U8(5), dstBase)
903		JMP(LabelRef("memmove_long_" + name))
904	}
905	if o.maxLen >= 1<<16 {
906		Label("four_bytes_" + name)
907		MOVL(n, n16)
908		SHRL(U8(16), n16.As32())
909		MOVB(U8(248), Mem{Base: dstBase})
910		MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
911		MOVB(n16.As8(), Mem{Base: dstBase, Disp: 3})
912		if retval != nil {
913			ADDQ(U8(4), retval)
914		}
915		ADDQ(U8(4), dstBase)
916		JMP(LabelRef("memmove_long_" + name))
917	}
918	Label("three_bytes_" + name)
919	MOVB(U8(0xf4), Mem{Base: dstBase})
920	MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
921	if retval != nil {
922		ADDQ(U8(3), retval)
923	}
924	ADDQ(U8(3), dstBase)
925	JMP(LabelRef("memmove_long_" + name))
926
927	Label("two_bytes_" + name)
928	MOVB(U8(0xf0), Mem{Base: dstBase})
929	MOVB(n.As8(), Mem{Base: dstBase, Disp: 1})
930	if retval != nil {
931		ADDQ(U8(2), retval)
932	}
933	ADDQ(U8(2), dstBase)
934	CMPL(n.As32(), U8(64))
935	JL(LabelRef("memmove_" + name))
936	JMP(LabelRef("memmove_long_" + name))
937
938	Label("one_byte_" + name)
939	SHLB(U8(2), n.As8())
940	MOVB(n.As8(), Mem{Base: dstBase})
941	if retval != nil {
942		ADDQ(U8(1), retval)
943	}
944	ADDQ(U8(1), dstBase)
945	// Fallthrough
946
947	Label("memmove_" + name)
948
949	// copy(dst[i:], lit)
950	dstEnd := GP64()
951	copyEnd := end
952	if updateDst {
953		copyEnd = LabelRef("memmove_end_copy_" + name)
954		LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd)
955	}
956	length := GP64()
957	MOVL(litLen.As32(), length.As32())
958
959	// updates litBase.
960	o.genMemMoveShort("emit_lit_memmove_"+name, dstBase, litBase, length, copyEnd)
961
962	if updateDst {
963		Label("memmove_end_copy_" + name)
964		MOVQ(dstEnd, dstBase)
965	}
966	JMP(end)
967
968	// > 32 bytes
969	Label("memmove_long_" + name)
970
971	// copy(dst[i:], lit)
972	dstEnd = GP64()
973	copyEnd = end
974	if updateDst {
975		copyEnd = LabelRef("memmove_end_copy_long_" + name)
976		LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd)
977	}
978	length = GP64()
979	MOVL(litLen.As32(), length.As32())
980
981	// updates litBase.
982	o.genMemMoveLong("emit_lit_memmove_long_"+name, dstBase, litBase, length, copyEnd)
983
984	if updateDst {
985		Label("memmove_end_copy_long_" + name)
986		MOVQ(dstEnd, dstBase)
987	}
988	JMP(end)
989	// Should be unreachable
990	if debug {
991		INT(Imm(3))
992	}
993	return
994}
995
996// genEmitRepeat generates a standlone emitRepeat.
997func (o options) genEmitRepeat() {
998	TEXT("emitRepeat", NOSPLIT, "func(dst []byte, offset, length int) int")
999	Doc("emitRepeat writes a repeat chunk and returns the number of bytes written.",
1000		"Length must be at least 4 and < 1<<32", "")
1001	Pragma("noescape")
1002
1003	dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()
1004
1005	// retval = 0
1006	XORQ(retval, retval)
1007
1008	Load(Param("dst").Base(), dstBase)
1009	Load(Param("offset"), offset)
1010	Load(Param("length"), length)
1011	o.emitRepeat("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_repeat_end"))
1012	Label("gen_emit_repeat_end")
1013	Store(retval, ReturnIndex(0))
1014	RET()
1015}
1016
1017// emitRepeat can be used for inlining an emitRepeat call.
1018// length >= 4 and < 1<<32
1019// length is modified. dstBase is updated. retval is added to input.
1020// retval can be nil.
1021// Will jump to end label when finished.
1022// Uses 1 GP register.
1023func (o options) emitRepeat(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) {
1024	Label("emit_repeat_again_" + name)
1025	tmp := GP32()
1026	MOVL(length.As32(), tmp) // Copy length
1027	// length -= 4
1028	LEAL(Mem{Base: length, Disp: -4}, length.As32())
1029
1030	// if length <= 4 (use copied value)
1031	CMPL(tmp.As32(), U8(8))
1032	JLE(LabelRef("repeat_two_" + name))
1033
1034	// length < 8 && offset < 2048
1035	CMPL(tmp.As32(), U8(12))
1036	JGE(LabelRef("cant_repeat_two_offset_" + name))
1037	if o.maxLen >= 2048 {
1038		CMPL(offset.As32(), U32(2048))
1039		JLT(LabelRef("repeat_two_offset_" + name))
1040	}
1041
1042	const maxRepeat = ((1 << 24) - 1) + 65536
1043	Label("cant_repeat_two_offset_" + name)
1044	CMPL(length.As32(), U32((1<<8)+4))
1045	JLT(LabelRef("repeat_three_" + name)) // if length < (1<<8)+4
1046	if o.maxLen >= (1<<16)+(1<<8) {
1047		CMPL(length.As32(), U32((1<<16)+(1<<8)))
1048		JLT(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8)
1049	} else {
1050		// Not needed, we should skip to it when generating.
1051		// JMP(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8)
1052	}
1053	if o.maxLen >= maxRepeat {
1054		CMPL(length.As32(), U32(maxRepeat))
1055		JLT(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent.
1056
1057		// We have have more than 24 bits
1058		// Emit so we have at least 4 bytes left.
1059		LEAL(Mem{Base: length, Disp: -(maxRepeat - 4)}, length.As32()) // length -= (maxRepeat - 4)
1060		MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase})                   // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
1061		MOVW(U16(65531), Mem{Base: dstBase, Disp: 2})                  // 0xfffb
1062		MOVB(U8(255), Mem{Base: dstBase, Disp: 4})
1063		ADDQ(U8(5), dstBase)
1064		if retval != nil {
1065			ADDQ(U8(5), retval)
1066		}
1067		JMP(LabelRef("emit_repeat_again_" + name))
1068	} else {
1069		// Not needed.
1070		// JMP(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent.
1071	}
1072
1073	// Must be able to be within 5 bytes.
1074	if o.maxLen >= (1<<16)+(1<<8) {
1075		Label("repeat_five_" + name)
1076		LEAL(Mem{Base: length, Disp: -65536}, length.As32()) // length -= 65536
1077		MOVL(length.As32(), offset.As32())
1078		MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase})     // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
1079		MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
1080		SARL(U8(16), offset.As32())                      // offset = length >> 16
1081		MOVB(offset.As8(), Mem{Base: dstBase, Disp: 4})  // dst[4] = length >> 16
1082		if retval != nil {
1083			ADDQ(U8(5), retval) // i += 5
1084		}
1085		ADDQ(U8(5), dstBase) // dst += 5
1086		JMP(end)
1087	}
1088	Label("repeat_four_" + name)
1089	LEAL(Mem{Base: length, Disp: -256}, length.As32()) // length -= 256
1090	MOVW(U16(6<<2|tagCopy1), Mem{Base: dstBase})       // dst[0] = 6<<2 | tagCopy1, dst[1] = 0
1091	MOVW(length.As16(), Mem{Base: dstBase, Disp: 2})   // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
1092	if retval != nil {
1093		ADDQ(U8(4), retval) // i += 4
1094	}
1095	ADDQ(U8(4), dstBase) // dst += 4
1096	JMP(end)
1097
1098	Label("repeat_three_" + name)
1099	LEAL(Mem{Base: length, Disp: -4}, length.As32()) // length -= 4
1100	MOVW(U16(5<<2|tagCopy1), Mem{Base: dstBase})     // dst[0] = 5<<2 | tagCopy1, dst[1] = 0
1101	MOVB(length.As8(), Mem{Base: dstBase, Disp: 2})  // dst[2] = uint8(length)
1102	if retval != nil {
1103		ADDQ(U8(3), retval) // i += 3
1104	}
1105	ADDQ(U8(3), dstBase) // dst += 3
1106	JMP(end)
1107
1108	Label("repeat_two_" + name)
1109	// dst[0] = uint8(length)<<2 | tagCopy1, dst[1] = 0
1110	SHLL(U8(2), length.As32())
1111	ORL(U8(tagCopy1), length.As32())
1112	MOVW(length.As16(), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
1113	if retval != nil {
1114		ADDQ(U8(2), retval) // i += 2
1115	}
1116	ADDQ(U8(2), dstBase) // dst += 2
1117	JMP(end)
1118
1119	Label("repeat_two_offset_" + name)
1120	// Emit the remaining copy, encoded as 2 bytes.
1121	// dst[1] = uint8(offset)
1122	// dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
1123	tmp = GP64()
1124	XORQ(tmp, tmp)
1125	// Use scale and displacement to shift and subtract values from length.
1126	LEAL(Mem{Base: tmp, Index: length, Scale: 4, Disp: tagCopy1}, length.As32())
1127	MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
1128	SARL(U8(8), offset.As32())                      // Remove lower
1129	SHLL(U8(5), offset.As32())                      // Shift back up
1130	ORL(offset.As32(), length.As32())               // OR result
1131	MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
1132	if retval != nil {
1133		ADDQ(U8(2), retval) // i += 2
1134	}
1135	ADDQ(U8(2), dstBase) // dst += 2
1136
1137	JMP(end)
1138}
1139
1140// emitCopy writes a copy chunk and returns the number of bytes written.
1141//
1142// It assumes that:
1143//	dst is long enough to hold the encoded bytes
1144//	1 <= offset && offset <= math.MaxUint32
1145//	4 <= length && length <= 1 << 24
1146
1147// genEmitCopy generates a standlone emitCopy
1148func (o options) genEmitCopy() {
1149	TEXT("emitCopy", NOSPLIT, "func(dst []byte, offset, length int) int")
1150	Doc("emitCopy writes a copy chunk and returns the number of bytes written.", "",
1151		"It assumes that:",
1152		"  dst is long enough to hold the encoded bytes",
1153		"  1 <= offset && offset <= math.MaxUint32",
1154		"  4 <= length && length <= 1 << 24", "")
1155	Pragma("noescape")
1156
1157	dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()
1158
1159	//	i := 0
1160	XORQ(retval, retval)
1161
1162	Load(Param("dst").Base(), dstBase)
1163	Load(Param("offset"), offset)
1164	Load(Param("length"), length)
1165	o.emitCopy("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_copy_end"))
1166	Label("gen_emit_copy_end")
1167	Store(retval, ReturnIndex(0))
1168	RET()
1169}
1170
1171// emitCopy writes a copy chunk and returns the number of bytes written.
1172//
1173// It assumes that:
1174//	dst is long enough to hold the encoded bytes
1175//	1 <= offset && offset <= math.MaxUint32
1176//	4 <= length && length <= 1 << 24
1177
1178// genEmitCopy generates a standlone emitCopy
1179func (o options) genEmitCopyNoRepeat() {
1180	TEXT("emitCopyNoRepeat", NOSPLIT, "func(dst []byte, offset, length int) int")
1181	Doc("emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.", "",
1182		"It assumes that:",
1183		"  dst is long enough to hold the encoded bytes",
1184		"  1 <= offset && offset <= math.MaxUint32",
1185		"  4 <= length && length <= 1 << 24", "")
1186	Pragma("noescape")
1187
1188	dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()
1189
1190	//	i := 0
1191	XORQ(retval, retval)
1192
1193	Load(Param("dst").Base(), dstBase)
1194	Load(Param("offset"), offset)
1195	Load(Param("length"), length)
1196	o.emitCopy("standalone_snappy", length, offset, retval, dstBase, "gen_emit_copy_end_snappy")
1197	Label("gen_emit_copy_end_snappy")
1198	Store(retval, ReturnIndex(0))
1199	RET()
1200}
1201
1202const (
1203	tagLiteral = 0x00
1204	tagCopy1   = 0x01
1205	tagCopy2   = 0x02
1206	tagCopy4   = 0x03
1207)
1208
1209// emitCopy can be used for inlining an emitCopy call.
1210// length is modified (and junk). dstBase is updated. retval is added to input.
1211// retval can be nil.
1212// Will jump to end label when finished.
1213// Uses 2 GP registers.
1214func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) {
1215	if o.maxLen >= 65536 {
1216		//if offset >= 65536 {
1217		CMPL(offset.As32(), U32(65536))
1218		JL(LabelRef("two_byte_offset_" + name))
1219
1220		// offset is >= 65536
1221		//	if length <= 64 goto four_bytes_remain_
1222		Label("four_bytes_loop_back_" + name)
1223		CMPL(length.As32(), U8(64))
1224		JLE(LabelRef("four_bytes_remain_" + name))
1225
1226		// Emit a length 64 copy, encoded as 5 bytes.
1227		//		dst[0] = 63<<2 | tagCopy4
1228		MOVB(U8(63<<2|tagCopy4), Mem{Base: dstBase})
1229		//		dst[4] = uint8(offset >> 24)
1230		//		dst[3] = uint8(offset >> 16)
1231		//		dst[2] = uint8(offset >> 8)
1232		//		dst[1] = uint8(offset)
1233		MOVL(offset.As32(), Mem{Base: dstBase, Disp: 1})
1234		//		length -= 64
1235		LEAL(Mem{Base: length, Disp: -64}, length.As32())
1236		if retval != nil {
1237			ADDQ(U8(5), retval) // i+=5
1238		}
1239		ADDQ(U8(5), dstBase) // dst+=5
1240
1241		//	if length >= 4 {
1242		CMPL(length.As32(), U8(4))
1243		JL(LabelRef("four_bytes_remain_" + name))
1244
1245		// Emit remaining as repeats
1246		//	return 5 + emitRepeat(dst[5:], offset, length)
1247		// Inline call to emitRepeat. Will jump to end
1248		if !o.snappy {
1249			o.emitRepeat(name+"_emit_copy", length, offset, retval, dstBase, end)
1250		}
1251		JMP(LabelRef("four_bytes_loop_back_" + name))
1252
1253		Label("four_bytes_remain_" + name)
1254		//	if length == 0 {
1255		//		return i
1256		//	}
1257		TESTL(length.As32(), length.As32())
1258		JZ(end)
1259
1260		// Emit a copy, offset encoded as 4 bytes.
1261		//	dst[i+0] = uint8(length-1)<<2 | tagCopy4
1262		//	dst[i+1] = uint8(offset)
1263		//	dst[i+2] = uint8(offset >> 8)
1264		//	dst[i+3] = uint8(offset >> 16)
1265		//	dst[i+4] = uint8(offset >> 24)
1266		tmp := GP64()
1267		MOVB(U8(tagCopy4), tmp.As8())
1268		// Use displacement to subtract 1 from upshifted length.
1269		LEAL(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length.As32())
1270		MOVB(length.As8(), Mem{Base: dstBase})
1271		MOVL(offset.As32(), Mem{Base: dstBase, Disp: 1})
1272		//	return i + 5
1273		if retval != nil {
1274			ADDQ(U8(5), retval)
1275		}
1276		ADDQ(U8(5), dstBase)
1277		JMP(end)
1278	}
1279	Label("two_byte_offset_" + name)
1280	// Offset no more than 2 bytes.
1281
1282	//if length > 64 {
1283	CMPL(length.As32(), U8(64))
1284	JLE(LabelRef("two_byte_offset_short_" + name))
1285	// Emit a length 60 copy, encoded as 3 bytes.
1286	// Emit remaining as repeat value (minimum 4 bytes).
1287	//	dst[2] = uint8(offset >> 8)
1288	//	dst[1] = uint8(offset)
1289	//	dst[0] = 59<<2 | tagCopy2
1290	MOVB(U8(59<<2|tagCopy2), Mem{Base: dstBase})
1291	MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
1292	//	length -= 60
1293	LEAL(Mem{Base: length, Disp: -60}, length.As32())
1294
1295	// Emit remaining as repeats, at least 4 bytes remain.
1296	//	return 3 + emitRepeat(dst[3:], offset, length)
1297	//}
1298	ADDQ(U8(3), dstBase)
1299	if retval != nil {
1300		ADDQ(U8(3), retval)
1301	}
1302	// Inline call to emitRepeat. Will jump to end
1303	if !o.snappy {
1304		o.emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end)
1305	}
1306	JMP(LabelRef("two_byte_offset_" + name))
1307
1308	Label("two_byte_offset_short_" + name)
1309	//if length >= 12 || offset >= 2048 {
1310	CMPL(length.As32(), U8(12))
1311	JGE(LabelRef("emit_copy_three_" + name))
1312	if o.maxLen >= 2048 {
1313		CMPL(offset.As32(), U32(2048))
1314		JGE(LabelRef("emit_copy_three_" + name))
1315	}
1316	// Emit the remaining copy, encoded as 2 bytes.
1317	// dst[1] = uint8(offset)
1318	// dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
1319	tmp := GP64()
1320	MOVB(U8(tagCopy1), tmp.As8())
1321	// Use scale and displacement to shift and subtract values from length.
1322	LEAL(Mem{Base: tmp, Index: length, Scale: 4, Disp: -(4 << 2)}, length.As32())
1323	MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
1324	SHRL(U8(8), offset.As32())                      // Remove lower
1325	SHLL(U8(5), offset.As32())                      // Shift back up
1326	ORL(offset.As32(), length.As32())               // OR result
1327	MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
1328	if retval != nil {
1329		ADDQ(U8(2), retval) // i += 2
1330	}
1331	ADDQ(U8(2), dstBase) // dst += 2
1332	// return 2
1333	JMP(end)
1334
1335	Label("emit_copy_three_" + name)
1336	//	// Emit the remaining copy, encoded as 3 bytes.
1337	//	dst[2] = uint8(offset >> 8)
1338	//	dst[1] = uint8(offset)
1339	//	dst[0] = uint8(length-1)<<2 | tagCopy2
1340	tmp = GP64()
1341	MOVB(U8(tagCopy2), tmp.As8())
1342	LEAL(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length.As32())
1343	MOVB(length.As8(), Mem{Base: dstBase})
1344	MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
1345	//	return 3
1346	if retval != nil {
1347		ADDQ(U8(3), retval) // i += 3
1348	}
1349	ADDQ(U8(3), dstBase) // dst += 3
1350	JMP(end)
1351}
1352
1353// func memmove(to, from unsafe.Pointer, n uintptr)
1354// src and dst may not overlap.
1355// Non AVX uses 2 GP register, 16 SSE2 registers.
1356// AVX uses 4 GP registers 16 AVX/SSE registers.
1357// All passed registers may be updated.
1358// Length must be 1 -> 64 bytes
1359func (o options) genMemMoveShort(name string, dst, src, length reg.GPVirtual, end LabelRef) {
1360	AX, CX := GP64(), GP64()
1361	name += "_memmove_"
1362
1363	// Only enable if length can be 0.
1364	if false {
1365		TESTQ(length, length)
1366		JEQ(end)
1367	}
1368	assert(func(ok LabelRef) {
1369		CMPQ(length, U8(64))
1370		JBE(ok)
1371	})
1372	assert(func(ok LabelRef) {
1373		TESTQ(length, length)
1374		JNZ(ok)
1375	})
1376	Label(name + "tail")
1377	CMPQ(length, U8(3))
1378	JB(LabelRef(name + "move_1or2"))
1379	JE(LabelRef(name + "move_3"))
1380	CMPQ(length, U8(8))
1381	JB(LabelRef(name + "move_4through7"))
1382	CMPQ(length, U8(16))
1383	JBE(LabelRef(name + "move_8through16"))
1384	CMPQ(length, U8(32))
1385	JBE(LabelRef(name + "move_17through32"))
1386	if debug {
1387		CMPQ(length, U8(64))
1388		JBE(LabelRef(name + "move_33through64"))
1389		INT(U8(3))
1390	}
1391	JMP(LabelRef(name + "move_33through64"))
1392
1393	//genMemMoveLong(name, dst, src, length, end)
1394
1395	Label(name + "move_1or2")
1396	MOVB(Mem{Base: src}, AX.As8())
1397	MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8())
1398	MOVB(AX.As8(), Mem{Base: dst})
1399	MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1})
1400	JMP(end)
1401
1402	Label(name + "move_3")
1403	MOVW(Mem{Base: src}, AX.As16())
1404	MOVB(Mem{Base: src, Disp: 2}, CX.As8())
1405	MOVW(AX.As16(), Mem{Base: dst})
1406	MOVB(CX.As8(), Mem{Base: dst, Disp: 2})
1407	JMP(end)
1408
1409	Label(name + "move_4through7")
1410	MOVL(Mem{Base: src}, AX.As32())
1411	MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32())
1412	MOVL(AX.As32(), Mem{Base: dst})
1413	MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1})
1414	JMP(end)
1415
1416	Label(name + "move_8through16")
1417	MOVQ(Mem{Base: src}, AX)
1418	MOVQ(Mem{Base: src, Disp: -8, Index: length, Scale: 1}, CX)
1419	MOVQ(AX, Mem{Base: dst})
1420	MOVQ(CX, Mem{Base: dst, Disp: -8, Index: length, Scale: 1})
1421	JMP(end)
1422
1423	Label(name + "move_17through32")
1424	X0, X1, X2, X3 := XMM(), XMM(), XMM(), XMM()
1425
1426	MOVOU(Mem{Base: src}, X0)
1427	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X1)
1428	MOVOU(X0, Mem{Base: dst})
1429	MOVOU(X1, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
1430	JMP(end)
1431
1432	Label(name + "move_33through64")
1433	MOVOU(Mem{Base: src}, X0)
1434	MOVOU(Mem{Base: src, Disp: 16}, X1)
1435	MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2)
1436	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3)
1437	MOVOU(X0, Mem{Base: dst})
1438	MOVOU(X1, Mem{Base: dst, Disp: 16})
1439	MOVOU(X2, Mem{Base: dst, Disp: -32, Index: length, Scale: 1})
1440	MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
1441	JMP(end)
1442}
1443
1444// func genMemMoveLong(to, from unsafe.Pointer, n uintptr)
1445// src and dst may not overlap.
1446// length must be >= 64 bytes.
1447// Non AVX uses 2 GP register, 16 SSE2 registers.
1448// AVX uses 4 GP registers 16 AVX/SSE registers.
1449// All passed registers may be updated.
1450func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end LabelRef) {
1451	name += "large_"
1452
1453	assert(func(ok LabelRef) {
1454		CMPQ(length, U8(64))
1455		JAE(ok)
1456	})
1457
1458	// These are disabled.
1459	// AVX is ever so slightly faster, but it is disabled for simplicity.
1460	const branchLoops = false
1461	const avx = false && branchLoops
1462	if branchLoops {
1463		CMPQ(length, U8(128))
1464		JBE(LabelRef(name + "move_65through128"))
1465		CMPQ(length, U32(256))
1466		JBE(LabelRef(name + "move_129through256"))
1467		if avx {
1468			JMP(LabelRef(name + "avxUnaligned"))
1469		} else {
1470			JMP(LabelRef(name + "forward_sse"))
1471		}
1472
1473		X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
1474		X8, X9, X10, X11, X12, X13, X14, X15 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
1475		Label(name + "move_65through128")
1476		MOVOU(Mem{Base: src}, X0)
1477		MOVOU(Mem{Base: src, Disp: 16}, X1)
1478		MOVOU(Mem{Base: src, Disp: 32}, X2)
1479		MOVOU(Mem{Base: src, Disp: 48}, X3)
1480		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
1481		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
1482		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
1483		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
1484		MOVOU(X0, Mem{Base: dst})
1485		MOVOU(X1, Mem{Base: dst, Disp: 16})
1486		MOVOU(X2, Mem{Base: dst, Disp: 32})
1487		MOVOU(X3, Mem{Base: dst, Disp: 48})
1488		MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
1489		MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
1490		MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
1491		MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
1492		JMP(end)
1493
1494		Label(name + "move_129through256")
1495		MOVOU(Mem{Base: src}, X0)
1496		MOVOU(Mem{Base: src, Disp: 16}, X1)
1497		MOVOU(Mem{Base: src, Disp: 32}, X2)
1498		MOVOU(Mem{Base: src, Disp: 48}, X3)
1499		MOVOU(Mem{Base: src, Disp: 64}, X4)
1500		MOVOU(Mem{Base: src, Disp: 80}, X5)
1501		MOVOU(Mem{Base: src, Disp: 96}, X6)
1502		MOVOU(Mem{Base: src, Disp: 112}, X7)
1503		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -128}, X8)
1504		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -112}, X9)
1505		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -96}, X10)
1506		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -80}, X11)
1507		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
1508		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
1509		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
1510		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
1511		MOVOU(X0, Mem{Base: dst})
1512		MOVOU(X1, Mem{Base: dst, Disp: 16})
1513		MOVOU(X2, Mem{Base: dst, Disp: 32})
1514		MOVOU(X3, Mem{Base: dst, Disp: 48})
1515		MOVOU(X4, Mem{Base: dst, Disp: 64})
1516		MOVOU(X5, Mem{Base: dst, Disp: 80})
1517		MOVOU(X6, Mem{Base: dst, Disp: 96})
1518		MOVOU(X7, Mem{Base: dst, Disp: 112})
1519		MOVOU(X8, Mem{Base: dst, Index: length, Scale: 1, Disp: -128})
1520		MOVOU(X9, Mem{Base: dst, Index: length, Scale: 1, Disp: -112})
1521		MOVOU(X10, Mem{Base: dst, Index: length, Scale: 1, Disp: -96})
1522		MOVOU(X11, Mem{Base: dst, Index: length, Scale: 1, Disp: -80})
1523		MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
1524		MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
1525		MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
1526		MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
1527		JMP(end)
1528		if avx {
1529			Label(name + "avxUnaligned")
1530			AX, CX, R8, R10 := GP64(), GP64(), GP64(), GP64()
1531			// Memory layout on the source side
1532			// src                                       CX
1533			// |<---------length before correction--------->|
1534			// |       |<--length corrected-->|             |
1535			// |       |                  |<--- AX  --->|
1536			// |<-R11->|                  |<-128 bytes->|
1537			// +----------------------------------------+
1538			// | Head  | Body             | Tail        |
1539			// +-------+------------------+-------------+
1540			// ^       ^                  ^
1541			// |       |                  |
1542			// Save head into Y4          Save tail into X5..X12
1543			//         |
1544			//         src+R11, where R11 = ((dst & -32) + 32) - dst
1545			// Algorithm:
1546			// 1. Unaligned save of the tail's 128 bytes
1547			// 2. Unaligned save of the head's 32  bytes
1548			// 3. Destination-aligned copying of body (128 bytes per iteration)
1549			// 4. Put head on the new place
1550			// 5. Put the tail on the new place
1551			// It can be important to satisfy processor's pipeline requirements for
1552			// small sizes as the cost of unaligned memory region copying is
1553			// comparable with the cost of main loop. So code is slightly messed there.
1554			// There is more clean implementation of that algorithm for bigger sizes
1555			// where the cost of unaligned part copying is negligible.
1556			// You can see it after gobble_big_data_fwd label.
1557			Y0, Y1, Y2, Y3, Y4 := YMM(), YMM(), YMM(), YMM(), YMM()
1558
1559			LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
1560			MOVQ(dst, R10)
1561			// CX points to the end of buffer so we need go back slightly. We will use negative offsets there.
1562			MOVOU(Mem{Base: CX, Disp: -0x80}, X5)
1563			MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
1564			MOVQ(U32(0x80), AX)
1565
1566			// Align destination address
1567			ANDQ(U32(0xffffffe0), dst)
1568			ADDQ(U8(32), dst)
1569			// Continue tail saving.
1570			MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
1571			MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
1572			// Make R8 delta between aligned and unaligned destination addresses.
1573			MOVQ(dst, R8)
1574			SUBQ(R10, R8)
1575			// Continue tail saving.
1576			MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
1577			MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
1578			// Let's make bytes-to-copy value adjusted as we've prepared unaligned part for copying.
1579			SUBQ(R8, length)
1580			// Continue tail saving.
1581			MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
1582			MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
1583			// The tail will be put on its place after main body copying.
1584			// It's time for the unaligned heading part.
1585			VMOVDQU(Mem{Base: src}, Y4)
1586			// Adjust source address to point past head.
1587			ADDQ(R8, src)
1588			SUBQ(AX, length)
1589
1590			// Aligned memory copying there
1591			Label(name + "gobble_128_loop")
1592			VMOVDQU(Mem{Base: src}, Y0)
1593			VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
1594			VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
1595			VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)
1596			ADDQ(AX, src)
1597			VMOVDQA(Y0, Mem{Base: dst})
1598			VMOVDQA(Y1, Mem{Base: dst, Disp: 0x20})
1599			VMOVDQA(Y2, Mem{Base: dst, Disp: 0x40})
1600			VMOVDQA(Y3, Mem{Base: dst, Disp: 0x60})
1601			ADDQ(AX, dst)
1602			SUBQ(AX, length)
1603			JA(LabelRef(name + "gobble_128_loop"))
1604			// Now we can store unaligned parts.
1605			ADDQ(AX, length)
1606			ADDQ(dst, length)
1607			VMOVDQU(Y4, Mem{Base: R10})
1608			VZEROUPPER()
1609			MOVOU(X5, Mem{Base: length, Disp: -0x80})
1610			MOVOU(X6, Mem{Base: length, Disp: -0x70})
1611			MOVOU(X7, Mem{Base: length, Disp: -0x60})
1612			MOVOU(X8, Mem{Base: length, Disp: -0x50})
1613			MOVOU(X9, Mem{Base: length, Disp: -0x40})
1614			MOVOU(X10, Mem{Base: length, Disp: -0x30})
1615			MOVOU(X11, Mem{Base: length, Disp: -0x20})
1616			MOVOU(X12, Mem{Base: length, Disp: -0x10})
1617			JMP(end)
1618
1619			return
1620		}
1621	}
1622
1623	// Store start and end for sse_tail
1624	Label(name + "forward_sse")
1625	X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
1626	X8, X9, X10, X11 := XMM(), XMM(), XMM(), XMM()
1627
1628	MOVOU(Mem{Base: src}, X0)
1629	MOVOU(Mem{Base: src, Disp: 16}, X1)
1630	MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2)
1631	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3)
1632
1633	// forward (only)
1634	dstAlign := GP64()
1635	bigLoops := GP64()
1636	MOVQ(length, bigLoops)
1637	SHRQ(U8(7), bigLoops) // bigLoops = length / 128
1638
1639	MOVQ(dst, dstAlign)
1640	ANDL(U32(31), dstAlign.As32())
1641	srcOff := GP64()
1642	MOVQ(U32(64), srcOff)
1643	SUBQ(dstAlign, srcOff)
1644
1645	// Move 128 bytes/loop
1646	DECQ(bigLoops)
1647	JA(LabelRef(name + "forward_sse_loop_32"))
1648
1649	// Can be moved inside loop for less regs.
1650	srcPos := GP64()
1651	LEAQ(Mem{Disp: -32, Base: src, Scale: 1, Index: srcOff}, srcPos)
1652	dstPos := GP64()
1653	LEAQ(Mem{Disp: -32, Base: dst, Scale: 1, Index: srcOff}, dstPos)
1654
1655	Label(name + "big_loop_back")
1656
1657	MOVOU(Mem{Disp: 0, Base: srcPos}, X4)
1658	MOVOU(Mem{Disp: 16, Base: srcPos}, X5)
1659	MOVOU(Mem{Disp: 32, Base: srcPos}, X6)
1660	MOVOU(Mem{Disp: 48, Base: srcPos}, X7)
1661	MOVOU(Mem{Disp: 64, Base: srcPos}, X8)
1662	MOVOU(Mem{Disp: 80, Base: srcPos}, X9)
1663	MOVOU(Mem{Disp: 96, Base: srcPos}, X10)
1664	MOVOU(Mem{Disp: 112, Base: srcPos}, X11)
1665
1666	MOVOA(X4, Mem{Disp: 0, Base: dstPos})
1667	MOVOA(X5, Mem{Disp: 16, Base: dstPos})
1668	MOVOA(X6, Mem{Disp: 32, Base: dstPos})
1669	MOVOA(X7, Mem{Disp: 48, Base: dstPos})
1670	MOVOA(X8, Mem{Disp: 64, Base: dstPos})
1671	MOVOA(X9, Mem{Disp: 80, Base: dstPos})
1672	MOVOA(X10, Mem{Disp: 96, Base: dstPos})
1673	MOVOA(X11, Mem{Disp: 112, Base: dstPos})
1674	ADDQ(U8(128), dstPos)
1675	ADDQ(U8(128), srcPos)
1676	ADDQ(U8(128), srcOff) // This could be outside the loop, but we lose a reg if we do.
1677	DECQ(bigLoops)
1678	JNA(LabelRef(name + "big_loop_back"))
1679
1680	Label(name + "forward_sse_loop_32")
1681	MOVOU(Mem{Disp: -32, Base: src, Scale: 1, Index: srcOff}, X4)
1682	MOVOU(Mem{Disp: -16, Base: src, Scale: 1, Index: srcOff}, X5)
1683	MOVOA(X4, Mem{Disp: -32, Base: dst, Scale: 1, Index: srcOff})
1684	MOVOA(X5, Mem{Disp: -16, Base: dst, Scale: 1, Index: srcOff})
1685	ADDQ(U8(32), srcOff)
1686	CMPQ(length, srcOff)
1687	JAE(LabelRef(name + "forward_sse_loop_32"))
1688
1689	// sse_tail patches up the beginning and end of the transfer.
1690	MOVOU(X0, Mem{Base: dst, Disp: 0})
1691	MOVOU(X1, Mem{Base: dst, Disp: 16})
1692	MOVOU(X2, Mem{Base: dst, Disp: -32, Index: length, Scale: 1})
1693	MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
1694
1695	JMP(end)
1696	return
1697}
1698
1699// genMatchLen generates standalone matchLen.
1700func (o options) genMatchLen() {
1701	TEXT("matchLen", NOSPLIT, "func(a, b []byte) int")
1702	Doc("matchLen returns how many bytes match in a and b", "",
1703		"It assumes that:",
1704		"  len(a) <= len(b)", "")
1705	Pragma("noescape")
1706
1707	aBase, bBase, length := GP64(), GP64(), GP64()
1708
1709	Load(Param("a").Base(), aBase)
1710	Load(Param("b").Base(), bBase)
1711	Load(Param("a").Len(), length)
1712	l := o.matchLen("standalone", aBase, bBase, length, LabelRef("gen_match_len_end"))
1713	Label("gen_match_len_end")
1714	Store(l.As64(), ReturnIndex(0))
1715	RET()
1716}
1717
1718// matchLen returns the number of matching bytes of a and b.
1719// len is the maximum number of bytes to match.
1720// Will jump to end when done and returns the length.
1721// Uses 2 GP registers.
1722func (o options) matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) reg.GPVirtual {
1723	if false {
1724		return o.matchLenAlt(name, a, b, len, end)
1725	}
1726	tmp, matched := GP64(), GP32()
1727	XORL(matched, matched)
1728
1729	CMPL(len.As32(), U8(8))
1730	JL(LabelRef("matchlen_single_" + name))
1731
1732	Label("matchlen_loopback_" + name)
1733	MOVQ(Mem{Base: a, Index: matched, Scale: 1}, tmp)
1734	XORQ(Mem{Base: b, Index: matched, Scale: 1}, tmp)
1735	TESTQ(tmp, tmp)
1736	JZ(LabelRef("matchlen_loop_" + name))
1737	// Not all match.
1738	BSFQ(tmp, tmp)
1739	SARQ(U8(3), tmp)
1740	LEAL(Mem{Base: matched, Index: tmp, Scale: 1}, matched)
1741	JMP(end)
1742
1743	// All 8 byte matched, update and loop.
1744	Label("matchlen_loop_" + name)
1745	LEAL(Mem{Base: len, Disp: -8}, len.As32())
1746	LEAL(Mem{Base: matched, Disp: 8}, matched)
1747	CMPL(len.As32(), U8(8))
1748	JGE(LabelRef("matchlen_loopback_" + name))
1749
1750	// Less than 8 bytes left.
1751	Label("matchlen_single_" + name)
1752	TESTL(len.As32(), len.As32())
1753	JZ(end)
1754	Label("matchlen_single_loopback_" + name)
1755	MOVB(Mem{Base: a, Index: matched, Scale: 1}, tmp.As8())
1756	CMPB(Mem{Base: b, Index: matched, Scale: 1}, tmp.As8())
1757	JNE(end)
1758	LEAL(Mem{Base: matched, Disp: 1}, matched)
1759	DECL(len.As32())
1760	JNZ(LabelRef("matchlen_single_loopback_" + name))
1761	JMP(end)
1762	return matched
1763}
1764
1765// matchLen returns the number of matching bytes of a and b.
1766// len is the maximum number of bytes to match.
1767// Will jump to end when done and returns the length.
1768// Uses 3 GP registers.
1769// It is better on longer matches.
1770func (o options) matchLenAlt(name string, a, b, len reg.GPVirtual, end LabelRef) reg.GPVirtual {
1771	tmp, tmp2, matched := GP64(), GP64(), GP32()
1772	XORL(matched, matched)
1773
1774	CMPL(len.As32(), U8(16))
1775	JB(LabelRef("matchlen_short_" + name))
1776
1777	Label("matchlen_loopback_" + name)
1778	MOVQ(Mem{Base: a}, tmp)
1779	MOVQ(Mem{Base: a, Disp: 8}, tmp2)
1780	XORQ(Mem{Base: b, Disp: 0}, tmp)
1781	XORQ(Mem{Base: b, Disp: 8}, tmp2)
1782	endTest := func(xored reg.GPVirtual, disp int, ok LabelRef) {
1783		TESTQ(xored, xored)
1784		JZ(ok)
1785		// Not all match.
1786		BSFQ(xored, xored)
1787		SARQ(U8(3), xored)
1788		LEAL(Mem{Base: matched, Index: xored, Scale: 1, Disp: disp}, matched)
1789		JMP(end)
1790	}
1791	endTest(tmp, 0, LabelRef("matchlen_loop_tmp2_"+name))
1792	Label("matchlen_loop_tmp2_" + name)
1793	endTest(tmp2, 8, LabelRef("matchlen_loop_"+name))
1794
1795	// All 16 byte matched, update and loop.
1796	Label("matchlen_loop_" + name)
1797	SUBL(U8(16), len.As32())
1798	ADDL(U8(16), matched)
1799	ADDQ(U8(16), a)
1800	ADDQ(U8(16), b)
1801	CMPL(len.As32(), U8(16))
1802	JAE(LabelRef("matchlen_loopback_" + name))
1803
1804	// Test 4 bytes at the time...
1805	Label("matchlen_short_" + name)
1806	lenoff := 0
1807	if true {
1808		lenoff = 4
1809		SUBL(U8(4), len.As32())
1810		JC(LabelRef("matchlen_single_resume_" + name))
1811
1812		Label("matchlen_four_loopback_" + name)
1813		assert(func(ok LabelRef) {
1814			CMPL(len.As32(), U32(math.MaxInt32))
1815			JL(ok)
1816		})
1817
1818		MOVL(Mem{Base: a}, tmp.As32())
1819		XORL(Mem{Base: b}, tmp.As32())
1820		{
1821			JZ(LabelRef("matchlen_four_loopback_next" + name))
1822			BSFL(tmp.As32(), tmp.As32())
1823			SARQ(U8(3), tmp)
1824			LEAL(Mem{Base: matched, Index: tmp, Scale: 1}, matched)
1825			JMP(end)
1826		}
1827		Label("matchlen_four_loopback_next" + name)
1828		ADDL(U8(4), matched)
1829		ADDQ(U8(4), a)
1830		ADDQ(U8(4), b)
1831		SUBL(U8(4), len.As32())
1832		JNC(LabelRef("matchlen_four_loopback_" + name))
1833	}
1834
1835	// Test one at the time
1836	Label("matchlen_single_resume_" + name)
1837	if true {
1838		// Less than 16 bytes left.
1839		if lenoff > 0 {
1840			ADDL(U8(lenoff), len.As32())
1841		}
1842		TESTL(len.As32(), len.As32())
1843		JZ(end)
1844
1845		Label("matchlen_single_loopback_" + name)
1846		MOVB(Mem{Base: a}, tmp.As8())
1847		CMPB(Mem{Base: b}, tmp.As8())
1848		JNE(end)
1849		INCL(matched)
1850		INCQ(a)
1851		INCQ(b)
1852		DECL(len.As32())
1853		JNZ(LabelRef("matchlen_single_loopback_" + name))
1854	}
1855	JMP(end)
1856	return matched
1857}
1858