package main

//go:generate go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2

import (
	"fmt"
	"math"
	"runtime"

	. "github.com/mmcloughlin/avo/build"
	"github.com/mmcloughlin/avo/buildtags"
	. "github.com/mmcloughlin/avo/operand"
	"github.com/mmcloughlin/avo/reg"
)

// insert extra checks here and there.
const debug = false

const (
	limit14B = math.MaxUint32
	// Use 12 bit table when no more than...
	limit12B = 16<<10 - 1
	// Use 10 bit table when no more than...
	limit10B = 4<<10 - 1
	// Use 8 bit table when no more than...
	limit8B = 512 - 1
)
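
// Each table entry is a uint32 position, so a table with n bits uses
// 4*(1<<n) bytes of stack (see tableSize below). For example, the 10 bit
// variant allocates a 4 KiB table and is only selected for inputs of at
// most limit10B (4095) bytes.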

func main() {
	Constraint(buildtags.Not("appengine").ToConstraint())
	Constraint(buildtags.Not("noasm").ToConstraint())
	Constraint(buildtags.Term("gc").ToConstraint())

	o := options{
		snappy: false,
	}
	o.genEncodeBlockAsm("encodeBlockAsm", 14, 6, 6, limit14B)
	o.genEncodeBlockAsm("encodeBlockAsm4MB", 14, 6, 6, 4<<20)
	o.genEncodeBlockAsm("encodeBlockAsm12B", 12, 5, 5, limit12B)
	o.genEncodeBlockAsm("encodeBlockAsm10B", 10, 5, 4, limit10B)
	o.genEncodeBlockAsm("encodeBlockAsm8B", 8, 4, 4, limit8B)

	o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 16, 7, 7, limit14B)
	o.genEncodeBetterBlockAsm("encodeBetterBlockAsm4MB", 16, 7, 7, 4<<20)
	o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 6, 6, limit12B)
	o.genEncodeBetterBlockAsm("encodeBetterBlockAsm10B", 12, 5, 6, limit10B)
	o.genEncodeBetterBlockAsm("encodeBetterBlockAsm8B", 10, 4, 6, limit8B)

	// Snappy compatible
	o.snappy = true
	o.genEncodeBlockAsm("encodeSnappyBlockAsm", 14, 6, 6, limit14B)
	o.genEncodeBlockAsm("encodeSnappyBlockAsm12B", 12, 5, 5, limit12B)
	o.genEncodeBlockAsm("encodeSnappyBlockAsm10B", 10, 5, 4, limit10B)
	o.genEncodeBlockAsm("encodeSnappyBlockAsm8B", 8, 4, 4, limit8B)

	o.snappy = false
	o.maxLen = math.MaxUint32
	o.genEmitLiteral()
	o.genEmitRepeat()
	o.genEmitCopy()
	o.snappy = true
	o.genEmitCopyNoRepeat()
	o.snappy = false
	o.genMatchLen()
	Generate()
}

func debugval(v Op) {
	value := reg.R15
	MOVQ(v, value)
	INT(Imm(3))
}

func debugval32(v Op) {
	value := reg.R15L
	MOVL(v, value)
	INT(Imm(3))
}

var assertCounter int

// assert will insert code if debug is enabled.
// The code should jump to 'ok' if the assertion succeeds.
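// For example, to verify that a (hypothetical) register r is non-zero:
//
//	assert(func(ok LabelRef) {
//		TESTQ(r, r)
//		JNZ(ok) // jump to 'ok' when the assertion holds
//	})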
func assert(fn func(ok LabelRef)) {
	if debug {
		caller := [100]uintptr{0}
		runtime.Callers(2, caller[:])
		frame, _ := runtime.CallersFrames(caller[:]).Next()

		ok := fmt.Sprintf("assert_check_%d_ok_srcline_%d", assertCounter, frame.Line)
		fn(LabelRef(ok))
		// Emit several since delve is imprecise.
		INT(Imm(3))
		INT(Imm(3))
		Label(ok)
		assertCounter++
	}
}

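// options controls the generated variants. As used in this file, snappy
// selects Snappy-compatible output (copies only, no repeat codes), and
// maxLen is the maximum input length the generated function must handle,
// which gates which encoding branches are emitted.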
type options struct {
	snappy bool
	vmbi2  bool
	maxLen int
}

func (o options) genEncodeBlockAsm(name string, tableBits, skipLog, hashBytes, maxLen int) {
	TEXT(name, 0, "func(dst, src []byte) int")
	Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.",
		fmt.Sprintf("Maximum input %d bytes.", maxLen),
		"It assumes that the varint-encoded length of the decompressed bytes has already been written.", "")
	Pragma("noescape")

	o.maxLen = maxLen
	var literalMaxOverhead = maxLitOverheadFor(maxLen)

	var tableSize = 4 * (1 << tableBits)
	// Memzero needs at least 128 bytes.
	if tableSize < 128 {
		panic("tableSize must be at least 128 bytes")
	}

	lenSrcBasic, err := Param("src").Len().Resolve()
	if err != nil {
		panic(err)
	}
	lenSrcQ := lenSrcBasic.Addr

	lenDstBasic, err := Param("dst").Len().Resolve()
	if err != nil {
		panic(err)
	}
	lenDstQ := lenDstBasic.Addr

	// Bail if we can't compress to at least this.
	dstLimitPtrQ := AllocLocal(8)

	// sLimitL is when to stop looking for offset/length copies.
	sLimitL := AllocLocal(4)

	// nextEmitL keeps track of the point we have emitted to.
	nextEmitL := AllocLocal(4)

	// Repeat stores the last match offset.
	repeatL := AllocLocal(4)

	// nextSTempL keeps nextS while other functions are being called.
	nextSTempL := AllocLocal(4)

	// Alloc table last
	table := AllocLocal(tableSize)

	dst := GP64()
	{
		dstBaseBasic, err := Param("dst").Base().Resolve()
		if err != nil {
			panic(err)
		}
		dstBaseQ := dstBaseBasic.Addr
		MOVQ(dstBaseQ, dst)
	}

	srcBaseBasic, err := Param("src").Base().Resolve()
	if err != nil {
		panic(err)
	}
	srcBaseQ := srcBaseBasic.Addr

	// Zero table
	{
		iReg := GP64()
		MOVQ(U32(tableSize/8/16), iReg)
		tablePtr := GP64()
		LEAQ(table, tablePtr)
		zeroXmm := XMM()
		PXOR(zeroXmm, zeroXmm)

		Label("zero_loop_" + name)
		for i := 0; i < 8; i++ {
			MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16})
		}
		ADDQ(U8(16*8), tablePtr)
		DECQ(iReg)
		JNZ(LabelRef("zero_loop_" + name))
	}

	{
		// nextEmit is the offset in src where the next emitLiteral should start from.
		MOVL(U32(0), nextEmitL)

		const inputMargin = 8
		tmp, tmp2, tmp3 := GP64(), GP64(), GP64()
		MOVQ(lenSrcQ, tmp)
		LEAQ(Mem{Base: tmp, Disp: -5}, tmp2)
		// sLimitL := len(src) - inputMargin
		LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3)

		assert(func(ok LabelRef) {
			CMPQ(tmp3, lenSrcQ)
			JL(ok)
		})

		MOVL(tmp3.As32(), sLimitL)

		// dstLimit := (len(src) - 5) - len(src)>>5
		SHRQ(U8(5), tmp)
		SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp

		assert(func(ok LabelRef) {
			// if len(src) > len(src) - len(src)>>5 - 5: ok
			CMPQ(lenSrcQ, tmp2)
			JGE(ok)
		})

		LEAQ(Mem{Base: dst, Index: tmp2, Scale: 1}, tmp2)
		MOVQ(tmp2, dstLimitPtrQ)
	}
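
	// Worked example: for len(src) == 65536 the loop stops scanning at
	// sLimit = 65528, and output must stay below dst + 63483
	// ((65536-5) - 65536>>5), or the function bails out and returns 0.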

	// s = 1
	s := GP32()
	MOVL(U32(1), s)
	// repeatL = 1
	MOVL(s, repeatL)

	src := GP64()
	Load(Param("src").Base(), src)

	// Load cv
	Label("search_loop_" + name)
	candidate := GP32()
	{
		assert(func(ok LabelRef) {
			// Check if somebody changed src
			tmp := GP64()
			MOVQ(srcBaseQ, tmp)
			CMPQ(tmp, src)
			JEQ(ok)
		})

		cv := GP64()
		nextS := GP32()
		// nextS := s + (s-nextEmit)>>skipLog + 4
		{
			tmp := GP64()
			MOVL(s, tmp.As32())           // tmp = s
			SUBL(nextEmitL, tmp.As32())   // tmp = s - nextEmit
			SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog
			LEAL(Mem{Base: s, Disp: 4, Index: tmp, Scale: 1}, nextS)
		}
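		// With skipLog == 6 the step to nextS grows to 5 bytes once 64
		// bytes have passed without a match, to 6 bytes after 128, and so
		// on, so incompressible input is skipped increasingly fast.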
		// if nextS > sLimit {goto emitRemainder}
		{
			CMPL(nextS.As32(), sLimitL)
			JGE(LabelRef("emit_remainder_" + name))
		}
		MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv)
		assert(func(ok LabelRef) {
			// Check if s is valid (we should have jumped above if not)
			tmp := GP64()
			MOVQ(lenSrcQ, tmp)
			CMPQ(tmp, s.As64())
			JG(ok)
		})
		// move nextS to stack.
		MOVL(nextS.As32(), nextSTempL)

		candidate2 := GP32()
		hasher := hashN(hashBytes, tableBits)
		{
			hash0, hash1 := GP64(), GP64()
			MOVQ(cv, hash0)
			MOVQ(cv, hash1)
			SHRQ(U8(8), hash1)
			hasher.hash(hash0)
			hasher.hash(hash1)
			MOVL(table.Idx(hash0, 4), candidate)
			MOVL(table.Idx(hash1, 4), candidate2)
			assert(func(ok LabelRef) {
				CMPQ(hash0, U32(tableSize))
				JL(ok)
			})
			assert(func(ok LabelRef) {
				CMPQ(hash1, U32(tableSize))
				JL(ok)
			})

			MOVL(s, table.Idx(hash0, 4))
			tmp := GP32()
			LEAL(Mem{Base: s, Disp: 1}, tmp)
			MOVL(tmp, table.Idx(hash1, 4))
		}

		// Can be moved up if registers are available.
		hash2 := GP64()
		{
			// hash2 := hash6(cv>>16, tableBits)
			// hasher = hash6(tableBits)
			MOVQ(cv, hash2)
			SHRQ(U8(16), hash2)
			hasher.hash(hash2)
			assert(func(ok LabelRef) {
				CMPQ(hash2, U32(tableSize))
				JL(ok)
			})
		}

		// En/disable repeat matching.
		if true {
			// Check repeat at offset checkRep
			const checkRep = 1
			{
				// rep = s - repeat
				rep := GP32()
				MOVL(s, rep)
				SUBL(repeatL, rep) // rep = s - repeat

				// if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
				left, right := GP64(), GP64()
				MOVL(Mem{Base: src, Index: rep, Disp: checkRep, Scale: 1}, right.As32())
				MOVQ(cv, left)
				SHRQ(U8(checkRep*8), left)
				CMPL(left.As32(), right.As32())
				// BAIL, no repeat.
				JNE(LabelRef("no_repeat_found_" + name))
			}
			// base = s + checkRep
			base := GP32()
			LEAL(Mem{Base: s, Disp: checkRep}, base)

			// nextEmit before repeat.
			nextEmit := GP32()
			MOVL(nextEmitL, nextEmit)

			// Extend back
			if true {
				i := GP32()
				MOVL(base, i)
				SUBL(repeatL, i)
				JZ(LabelRef("repeat_extend_back_end_" + name))

				Label("repeat_extend_back_loop_" + name)
				// if base <= nextEmit {exit}
				CMPL(base.As32(), nextEmit)
				JLE(LabelRef("repeat_extend_back_end_" + name))
				// if src[i-1] == src[base-1]
				tmp, tmp2 := GP64(), GP64()
				MOVB(Mem{Base: src, Index: i, Scale: 1, Disp: -1}, tmp.As8())
				MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8())
				CMPB(tmp.As8(), tmp2.As8())
				JNE(LabelRef("repeat_extend_back_end_" + name))
				LEAL(Mem{Base: base, Disp: -1}, base)
				DECL(i)
				JNZ(LabelRef("repeat_extend_back_loop_" + name))
			}
			Label("repeat_extend_back_end_" + name)

			// Base is now at start. Emit until base.
			// d += emitLiteral(dst[d:], src[nextEmit:base])
			if true {
				o.emitLiteralsDstP(nextEmitL, base, src, dst, "repeat_emit_"+name)
			}

			// Extend forward
			{
				// s += 4 + checkRep
				ADDL(U8(4+checkRep), s)

				if true {
					// candidate := s - repeat + 4 + checkRep
					MOVL(s, candidate)
					SUBL(repeatL, candidate) // candidate = s - repeat

					// srcLeft = len(src) - s
					srcLeft := GP64()
					MOVQ(lenSrcQ, srcLeft)
					SUBL(s, srcLeft.As32())
					assert(func(ok LabelRef) {
						// if srcLeft < MaxInt32: ok
						CMPQ(srcLeft, U32(0x7fffffff))
						JL(ok)
					})
					// Forward address
					forwardStart := GP64()
					LEAQ(Mem{Base: src, Index: s, Scale: 1}, forwardStart)
					// End address
					backStart := GP64()
					LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, backStart)

					length := o.matchLen("repeat_extend_"+name, forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name))
					forwardStart, backStart, srcLeft = nil, nil, nil
					Label("repeat_extend_forward_end_" + name)
					// s += length
					ADDL(length.As32(), s)
				}
			}
			// Emit
			if true {
				// length = s - base
				length := GP32()
				MOVL(s, length)
				SUBL(base.As32(), length) // length = s - base

				offsetVal := GP32()
				MOVL(repeatL, offsetVal)

				if !o.snappy {
					// if nextEmit == 0 {do copy instead...}
					TESTL(nextEmit, nextEmit)
					JZ(LabelRef("repeat_as_copy_" + name))

					// Emit as repeat...
					o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))

					// Emit as copy instead...
					Label("repeat_as_copy_" + name)
				}
				o.emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))

				Label("repeat_end_emit_" + name)
				// Store new dst and nextEmit
				MOVL(s, nextEmitL)
			}
			// if s >= sLimit is picked up on next loop.
			if false {
				CMPL(s.As32(), sLimitL)
				JGE(LabelRef("emit_remainder_" + name))
			}
			JMP(LabelRef("search_loop_" + name))
		}
		Label("no_repeat_found_" + name)
		{
			// Check candidates are ok. All must be < s and < len(src)
			assert(func(ok LabelRef) {
				tmp := GP64()
				MOVQ(lenSrcQ, tmp)
				CMPL(tmp.As32(), candidate)
				JG(ok)
			})
			assert(func(ok LabelRef) {
				CMPL(s, candidate)
				JG(ok)
			})
			assert(func(ok LabelRef) {
				tmp := GP64()
				MOVQ(lenSrcQ, tmp)
				CMPL(tmp.As32(), candidate2)
				JG(ok)
			})
			assert(func(ok LabelRef) {
				CMPL(s, candidate2)
				JG(ok)
			})

			CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
			JEQ(LabelRef("candidate_match_" + name))

			tmp := GP32()
			// cv >>= 8
			SHRQ(U8(8), cv)

			// candidate = int(table[hash2]) - load early.
			MOVL(table.Idx(hash2, 4), candidate)
			assert(func(ok LabelRef) {
				tmp := GP64()
				MOVQ(lenSrcQ, tmp)
				CMPL(tmp.As32(), candidate)
				JG(ok)
			})
			assert(func(ok LabelRef) {
				// We may get s and s+1
				tmp := GP32()
				LEAL(Mem{Base: s, Disp: 2}, tmp)
				CMPL(tmp, candidate)
				JG(ok)
			})

			LEAL(Mem{Base: s, Disp: 2}, tmp)

			// if uint32(cv>>8) == load32(src, candidate2)
			CMPL(Mem{Base: src, Index: candidate2, Scale: 1}, cv.As32())
			JEQ(LabelRef("candidate2_match_" + name))

			// table[hash2] = uint32(s + 2)
			MOVL(tmp, table.Idx(hash2, 4))

			// cv >>= 8 (>> 16 total)
			SHRQ(U8(8), cv)

			// if uint32(cv>>16) == load32(src, candidate)
			CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
			JEQ(LabelRef("candidate3_match_" + name))

			// No match found, next loop
			// s = nextS
			MOVL(nextSTempL, s)
			JMP(LabelRef("search_loop_" + name))

			// Matches candidate at s + 2 (3rd check)
			Label("candidate3_match_" + name)
			ADDL(U8(2), s)
			JMP(LabelRef("candidate_match_" + name))

			// Match at s + 1 (we calculated the hash, let's store it)
			Label("candidate2_match_" + name)
			// table[hash2] = uint32(s + 2)
			MOVL(tmp, table.Idx(hash2, 4))
			// s++
			INCL(s)
			MOVL(candidate2, candidate)
		}
	}

	Label("candidate_match_" + name)
	// We have a match at 's' with src offset in "candidate" that matches at least 4 bytes.
	// Extend backwards
	if true {
		ne := GP32()
		MOVL(nextEmitL, ne)
		TESTL(candidate, candidate)
		JZ(LabelRef("match_extend_back_end_" + name))

		// candidate is tested when decremented, so we loop back here.
		Label("match_extend_back_loop_" + name)
		// if s <= nextEmit {exit}
		CMPL(s, ne)
		JLE(LabelRef("match_extend_back_end_" + name))
		// if src[candidate-1] == src[s-1]
		tmp, tmp2 := GP64(), GP64()
		MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8())
		MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8())
		CMPB(tmp.As8(), tmp2.As8())
		JNE(LabelRef("match_extend_back_end_" + name))
		LEAL(Mem{Base: s, Disp: -1}, s)
		DECL(candidate)
		JZ(LabelRef("match_extend_back_end_" + name))
		JMP(LabelRef("match_extend_back_loop_" + name))
	}
	Label("match_extend_back_end_" + name)

	// Bail if we exceed the maximum size.
	if true {
		// tmp = s - nextEmit
		tmp := GP64()
		MOVL(s, tmp.As32())
		SUBL(nextEmitL, tmp.As32())
		// tmp = &dst + s - nextEmit
		LEAQ(Mem{Base: dst, Index: tmp, Scale: 1, Disp: literalMaxOverhead}, tmp)
		CMPQ(tmp, dstLimitPtrQ)
		JL(LabelRef("match_dst_size_check_" + name))
		ri, err := ReturnIndex(0).Resolve()
		if err != nil {
			panic(err)
		}
		MOVQ(U32(0), ri.Addr)
		RET()
	}
	Label("match_dst_size_check_" + name)
	{
		base := GP32()
		MOVL(s, base.As32())
		o.emitLiteralsDstP(nextEmitL, base, src, dst, "match_emit_"+name)
	}
	cv := GP64()
	Label("match_nolit_loop_" + name)
	{
		// Update repeat
		{
			// repeat = s - candidate
			repeatVal := GP64().As32()
			MOVL(s, repeatVal)
			SUBL(candidate, repeatVal)
			MOVL(repeatVal, repeatL)
		}
		// s += 4, candidate += 4
		ADDL(U8(4), s)
		ADDL(U8(4), candidate)
		// Extend the 4-byte match as long as possible and emit copy.
		{
			assert(func(ok LabelRef) {
				// s must be > candidate; they cannot be equal.
				CMPL(s, candidate)
				JG(ok)
			})
			// srcLeft = len(src) - s
			srcLeft := GP64()
			MOVQ(lenSrcQ, srcLeft)
			SUBL(s, srcLeft.As32())
			assert(func(ok LabelRef) {
				// if srcLeft < MaxInt32: ok
				CMPQ(srcLeft, U32(0x7fffffff))
				JL(ok)
			})

			a, b := GP64(), GP64()
			LEAQ(Mem{Base: src, Index: s, Scale: 1}, a)
			LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, b)
			length := o.matchLen("match_nolit_"+name,
				a, b,
				srcLeft,
				LabelRef("match_nolit_end_"+name),
			)
			Label("match_nolit_end_" + name)
			assert(func(ok LabelRef) {
				CMPL(length.As32(), U32(math.MaxInt32))
				JL(ok)
			})
			a, b, srcLeft = nil, nil, nil

			// s += length (length is destroyed, use it now)
			ADDL(length.As32(), s)

			// Load offset from repeat value.
			offset := GP64()
			MOVL(repeatL, offset.As32())

			// length += 4
			ADDL(U8(4), length.As32())
			MOVL(s, nextEmitL) // nextEmit = s
			o.emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name))
			Label("match_nolit_emitcopy_end_" + name)

			// if s >= sLimit { end }
			{
				CMPL(s.As32(), sLimitL)
				JGE(LabelRef("emit_remainder_" + name))
			}
			// Start load s-2 as early as possible...
			MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, cv)
			// Bail if we exceed the maximum size.
			{
				CMPQ(dst, dstLimitPtrQ)
				JL(LabelRef("match_nolit_dst_ok_" + name))
				ri, err := ReturnIndex(0).Resolve()
				if err != nil {
					panic(err)
				}
				MOVQ(U32(0), ri.Addr)
				RET()
				Label("match_nolit_dst_ok_" + name)
			}
		}
		// cv must be set to value at s-2 before arriving here
		{
			// Check for an immediate match, otherwise start search at s+1
			// Index s-2
			hasher := hashN(hashBytes, tableBits)
			hash0, hash1 := GP64(), GP64()
			MOVQ(cv, hash0) // src[s-2]
			SHRQ(U8(16), cv)
			MOVQ(cv, hash1) // src[s]
			hasher.hash(hash0)
			hasher.hash(hash1)
			sm2 := GP32() // s - 2
			LEAL(Mem{Base: s, Disp: -2}, sm2)
			assert(func(ok LabelRef) {
				CMPQ(hash0, U32(tableSize))
				JL(ok)
			})
			assert(func(ok LabelRef) {
				CMPQ(hash1, U32(tableSize))
				JL(ok)
			})
			addr := GP64()
			LEAQ(table.Idx(hash1, 4), addr)
			MOVL(Mem{Base: addr}, candidate)
			MOVL(sm2, table.Idx(hash0, 4))
			MOVL(s, Mem{Base: addr})
			CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
			JEQ(LabelRef("match_nolit_loop_" + name))
			INCL(s)
		}
		JMP(LabelRef("search_loop_" + name))
	}

	Label("emit_remainder_" + name)
	// Bail if we exceed the maximum size.
	// if d+len(src)-nextEmit > dstLimit { return 0 }
	{
		// remain = len(src) - nextEmit
		remain := GP64()
		MOVQ(lenSrcQ, remain)
		SUBL(nextEmitL, remain.As32())

		dstExpect := GP64()
		// dst := dst + (len(src)-nextEmit)

		LEAQ(Mem{Base: dst, Index: remain, Scale: 1, Disp: literalMaxOverhead}, dstExpect)
		CMPQ(dstExpect, dstLimitPtrQ)
		JL(LabelRef("emit_remainder_ok_" + name))
		ri, err := ReturnIndex(0).Resolve()
		if err != nil {
			panic(err)
		}
		MOVQ(U32(0), ri.Addr)
		RET()
		Label("emit_remainder_ok_" + name)
	}
	// emitLiteral(dst[d:], src[nextEmit:])
	emitEnd := GP64()
	MOVQ(lenSrcQ, emitEnd)

	// Emit final literals.
	o.emitLiteralsDstP(nextEmitL, emitEnd, src, dst, "emit_remainder_"+name)

	// Assert size is < limit
	assert(func(ok LabelRef) {
		// if dstBaseQ < dstLimitPtrQ: ok
		CMPQ(dst, dstLimitPtrQ)
		JL(ok)
	})

	// length := start - base (ptr arithmetic)
	length := GP64()
	base := Load(Param("dst").Base(), GP64())
	MOVQ(dst, length)
	SUBQ(base, length)

	// Assert size is < len(src)
	assert(func(ok LabelRef) {
		// if len(src) >= length: ok
		CMPQ(lenSrcQ, length)
		JGE(ok)
	})
	// Assert size is < len(dst)
	assert(func(ok LabelRef) {
		// if len(dst) >= length: ok
		CMPQ(lenDstQ, length)
		JGE(ok)
	})
	Store(length, ReturnIndex(0))
	RET()
}

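// maxLitOverheadFor returns the maximum number of tag/header bytes
// emitLiteral needs for a literal of up to n bytes: lengths below 60 are
// stored in the tag byte itself, longer runs append 1-4 little-endian
// length bytes. For example, a 4 KiB literal needs a 3-byte header (tag
// plus a 16-bit length), matching the n < 1<<16 case below.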
func maxLitOverheadFor(n int) int {
	switch {
	case n == 0:
		return 0
	case n < 60:
		return 1
	case n < 1<<8:
		return 2
	case n < 1<<16:
		return 3
	case n < 1<<24:
		return 4
	}
	return 5
}

func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHashBytes, maxLen int) {
	TEXT(name, 0, "func(dst, src []byte) int")
	Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.",
		fmt.Sprintf("Maximum input %d bytes.", maxLen),
		"It assumes that the varint-encoded length of the decompressed bytes has already been written.", "")
	Pragma("noescape")

	if lHashBytes > 7 || lHashBytes <= 4 {
		panic("lHashBytes must be <= 7 and > 4")
	}
	var literalMaxOverhead = maxLitOverheadFor(maxLen)

	var sTableBits = lTableBits - 2
	const sHashBytes = 4
	o.maxLen = maxLen

	var lTableSize = 4 * (1 << lTableBits)
	var sTableSize = 4 * (1 << sTableBits)

	// Memzero needs at least 128 bytes.
	if (lTableSize + sTableSize) < 128 {
		panic("tableSize must be at least 128 bytes")
	}

	lenSrcBasic, err := Param("src").Len().Resolve()
	if err != nil {
		panic(err)
	}
	lenSrcQ := lenSrcBasic.Addr

	lenDstBasic, err := Param("dst").Len().Resolve()
	if err != nil {
		panic(err)
	}
	lenDstQ := lenDstBasic.Addr

	// Bail if we can't compress to at least this.
	dstLimitPtrQ := AllocLocal(8)

	// sLimitL is when to stop looking for offset/length copies.
	sLimitL := AllocLocal(4)

	// nextEmitL keeps track of the point we have emitted to.
	nextEmitL := AllocLocal(4)

	// Repeat stores the last match offset.
	repeatL := AllocLocal(4)

	// nextSTempL keeps nextS while other functions are being called.
	nextSTempL := AllocLocal(4)

	// Alloc table last, lTab must be before sTab.
	lTab := AllocLocal(lTableSize)
	sTab := AllocLocal(sTableSize)

	dst := GP64()
	{
		dstBaseBasic, err := Param("dst").Base().Resolve()
		if err != nil {
			panic(err)
		}
		dstBaseQ := dstBaseBasic.Addr
		MOVQ(dstBaseQ, dst)
	}

	srcBaseBasic, err := Param("src").Base().Resolve()
	if err != nil {
		panic(err)
	}
	srcBaseQ := srcBaseBasic.Addr

	// Zero table
	{
		iReg := GP64()
		MOVQ(U32((sTableSize+lTableSize)/8/16), iReg)
		tablePtr := GP64()
		LEAQ(lTab, tablePtr)
		zeroXmm := XMM()
		PXOR(zeroXmm, zeroXmm)

		Label("zero_loop_" + name)
		for i := 0; i < 8; i++ {
			MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16})
		}
		ADDQ(U8(16*8), tablePtr)
		DECQ(iReg)
		JNZ(LabelRef("zero_loop_" + name))
	}

	{
		// nextEmit is the offset in src where the next emitLiteral should start from.
		MOVL(U32(0), nextEmitL)

		const inputMargin = 8
		tmp, tmp2, tmp3 := GP64(), GP64(), GP64()
		MOVQ(lenSrcQ, tmp)
		LEAQ(Mem{Base: tmp, Disp: -6}, tmp2)
		// sLimitL := len(src) - inputMargin
		LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3)

		assert(func(ok LabelRef) {
			CMPQ(tmp3, lenSrcQ)
			JL(ok)
		})

		MOVL(tmp3.As32(), sLimitL)

		// dstLimit := (len(src) - 6) - len(src)>>5
		SHRQ(U8(5), tmp)
		SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp

		assert(func(ok LabelRef) {
			// if len(src) > len(src) - len(src)>>5 - 6: ok
			CMPQ(lenSrcQ, tmp2)
			JGE(ok)
		})

		LEAQ(Mem{Base: dst, Index: tmp2, Scale: 1}, tmp2)
		MOVQ(tmp2, dstLimitPtrQ)
	}

	// s = 1
	s := GP32()
	MOVL(U32(1), s)
	// repeatL = 0
	MOVL(U32(0), repeatL)

	src := GP64()
	Load(Param("src").Base(), src)

	// Load cv
	Label("search_loop_" + name)
	candidate := GP32()
	{
		assert(func(ok LabelRef) {
			// Check if somebody changed src
			tmp := GP64()
			MOVQ(srcBaseQ, tmp)
			CMPQ(tmp, src)
			JEQ(ok)
		})

		cv := GP64()
		nextS := GP32()
		// nextS := s + (s-nextEmit)>>skipLog + 1
		{
			tmp := GP64()
			MOVL(s, tmp.As32())           // tmp = s
			SUBL(nextEmitL, tmp.As32())   // tmp = s - nextEmit
			SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog
			LEAL(Mem{Base: s, Disp: 1, Index: tmp, Scale: 1}, nextS)
		}
		// if nextS > sLimit {goto emitRemainder}
		{
			CMPL(nextS.As32(), sLimitL)
			JGE(LabelRef("emit_remainder_" + name))
		}
		MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv)
		assert(func(ok LabelRef) {
			// Check if s is valid (we should have jumped above if not)
			tmp := GP64()
			MOVQ(lenSrcQ, tmp)
			CMPQ(tmp, s.As64())
			JG(ok)
		})
		// move nextS to stack.
		MOVL(nextS.As32(), nextSTempL)

		candidateS := GP32()
		lHasher := hashN(lHashBytes, lTableBits)
		{
			sHasher := hashN(sHashBytes, sTableBits)
			hash0, hash1 := GP64(), GP64()
			MOVQ(cv, hash0)
			MOVQ(cv, hash1)
			lHasher.hash(hash0)
			sHasher.hash(hash1)
			MOVL(lTab.Idx(hash0, 4), candidate)
			MOVL(sTab.Idx(hash1, 4), candidateS)
			assert(func(ok LabelRef) {
				CMPQ(hash0, U32(lTableSize))
				JL(ok)
			})
			assert(func(ok LabelRef) {
				CMPQ(hash1, U32(sTableSize))
				JL(ok)
			})

			MOVL(s, lTab.Idx(hash0, 4))
			MOVL(s, sTab.Idx(hash1, 4))
		}

		// En/disable repeat matching.
		if false {
			// Check repeat at offset checkRep
			const checkRep = 1
			{
				// rep = s - repeat
				rep := GP32()
				MOVL(s, rep)
				SUBL(repeatL, rep) // rep = s - repeat

				// if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
				left, right := GP64(), GP64()
				MOVL(Mem{Base: src, Index: rep, Disp: checkRep, Scale: 1}, right.As32())
				MOVQ(cv, left)
				SHRQ(U8(checkRep*8), left)
				CMPL(left.As32(), right.As32())
				// BAIL, no repeat.
				JNE(LabelRef("no_repeat_found_" + name))
			}
			// base = s + checkRep
			base := GP32()
			LEAL(Mem{Base: s, Disp: checkRep}, base)

			// nextEmit before repeat.
			nextEmit := GP32()
			MOVL(nextEmitL, nextEmit)

			// Extend back
			if true {
				i := GP32()
				MOVL(base, i)
				SUBL(repeatL, i)
				JZ(LabelRef("repeat_extend_back_end_" + name))

				Label("repeat_extend_back_loop_" + name)
				// if base <= nextEmit {exit}
				CMPL(base.As32(), nextEmit)
				JLE(LabelRef("repeat_extend_back_end_" + name))
				// if src[i-1] == src[base-1]
				tmp, tmp2 := GP64(), GP64()
				MOVB(Mem{Base: src, Index: i, Scale: 1, Disp: -1}, tmp.As8())
				MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8())
				CMPB(tmp.As8(), tmp2.As8())
				JNE(LabelRef("repeat_extend_back_end_" + name))
				LEAL(Mem{Base: base, Disp: -1}, base)
				DECL(i)
				JNZ(LabelRef("repeat_extend_back_loop_" + name))
			}
			Label("repeat_extend_back_end_" + name)

			// Base is now at start. Emit until base.
			// d += emitLiteral(dst[d:], src[nextEmit:base])
			if true {
				o.emitLiteralsDstP(nextEmitL, base, src, dst, "repeat_emit_"+name)
			}

			// Extend forward
			{
				// s += 4 + checkRep
				ADDL(U8(4+checkRep), s)

				if true {
					// candidate := s - repeat + 4 + checkRep
					MOVL(s, candidate)
					SUBL(repeatL, candidate) // candidate = s - repeat

					// srcLeft = len(src) - s
					srcLeft := GP64()
					MOVQ(lenSrcQ, srcLeft)
					SUBL(s, srcLeft.As32())
					assert(func(ok LabelRef) {
						// if srcLeft < MaxInt32: ok
						CMPQ(srcLeft, U32(0x7fffffff))
						JL(ok)
					})
					// Forward address
					forwardStart := GP64()
					LEAQ(Mem{Base: src, Index: s, Scale: 1}, forwardStart)
					// End address
					backStart := GP64()
					LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, backStart)

					length := o.matchLen("repeat_extend_"+name, forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name))
					forwardStart, backStart, srcLeft = nil, nil, nil
					Label("repeat_extend_forward_end_" + name)
					// s += length
					ADDL(length.As32(), s)
				}
			}
			// Emit
			if true {
				// length = s - base
				length := GP32()
				MOVL(s, length)
				SUBL(base.As32(), length) // length = s - base

				offsetVal := GP32()
				MOVL(repeatL, offsetVal)

				if !o.snappy {
					// if nextEmit == 0 {do copy instead...}
					TESTL(nextEmit, nextEmit)
					JZ(LabelRef("repeat_as_copy_" + name))

					// Emit as repeat...
					o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))

					// Emit as copy instead...
					Label("repeat_as_copy_" + name)
				}
				o.emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))

				Label("repeat_end_emit_" + name)
				// Store new dst and nextEmit
				MOVL(s, nextEmitL)
			}
			// if s >= sLimit is picked up on next loop.
			if false {
				CMPL(s.As32(), sLimitL)
				JGE(LabelRef("emit_remainder_" + name))
			}
			JMP(LabelRef("search_loop_" + name))
		}
		Label("no_repeat_found_" + name)
		{
			// Check candidates are ok. All must be < s and < len(src)
			assert(func(ok LabelRef) {
				tmp := GP64()
				MOVQ(lenSrcQ, tmp)
				CMPL(tmp.As32(), candidate)
				JG(ok)
			})
			assert(func(ok LabelRef) {
				CMPL(s, candidate)
				JG(ok)
			})
			assert(func(ok LabelRef) {
				tmp := GP64()
				MOVQ(lenSrcQ, tmp)
				CMPL(tmp.As32(), candidateS)
				JG(ok)
			})
			assert(func(ok LabelRef) {
				CMPL(s, candidateS)
				JG(ok)
			})

			CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
			JEQ(LabelRef("candidate_match_" + name))

			// if uint32(cv) == load32(src, candidateS)
			CMPL(Mem{Base: src, Index: candidateS, Scale: 1}, cv.As32())
			JEQ(LabelRef("candidateS_match_" + name))

			// No match found, next loop
			// s = nextS
			MOVL(nextSTempL, s)
			JMP(LabelRef("search_loop_" + name))

			// Short match at s, try a long candidate at s+1
			Label("candidateS_match_" + name)
			if true {
				hash0 := GP64()
				SHRQ(U8(8), cv)
				MOVQ(cv, hash0)
				lHasher.hash(hash0)
				MOVL(lTab.Idx(hash0, 4), candidate)
				INCL(s)
				assert(func(ok LabelRef) {
					CMPQ(hash0, U32(lTableSize))
					JL(ok)
				})
				MOVL(s, lTab.Idx(hash0, 4))
				CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
				JEQ(LabelRef("candidate_match_" + name))
				// No match, decrement s again and use short match at s...
				DECL(s)
			}
			MOVL(candidateS, candidate)
		}
	}

	Label("candidate_match_" + name)
	// We have a match at 's' with src offset in "candidate" that matches at least 4 bytes.
	// Extend backwards
	if true {
		ne := GP32()
		MOVL(nextEmitL, ne)
		TESTL(candidate, candidate)
		JZ(LabelRef("match_extend_back_end_" + name))

		// candidate is tested when decremented, so we loop back here.
		Label("match_extend_back_loop_" + name)
		// if s <= nextEmit {exit}
		CMPL(s, ne)
		JLE(LabelRef("match_extend_back_end_" + name))
		// if src[candidate-1] == src[s-1]
		tmp, tmp2 := GP64(), GP64()
		MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8())
		MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8())
		CMPB(tmp.As8(), tmp2.As8())
		JNE(LabelRef("match_extend_back_end_" + name))
		LEAL(Mem{Base: s, Disp: -1}, s)
		DECL(candidate)
		JZ(LabelRef("match_extend_back_end_" + name))
		JMP(LabelRef("match_extend_back_loop_" + name))
	}
	Label("match_extend_back_end_" + name)

	// Bail if we exceed the maximum size.
	if true {
		// tmp = s - nextEmit
		tmp := GP64()
		MOVL(s, tmp.As32())
		SUBL(nextEmitL, tmp.As32())
		// tmp = &dst + s - nextEmit
		LEAQ(Mem{Base: dst, Index: tmp, Scale: 1, Disp: literalMaxOverhead}, tmp)
		CMPQ(tmp, dstLimitPtrQ)
		JL(LabelRef("match_dst_size_check_" + name))
		ri, err := ReturnIndex(0).Resolve()
		if err != nil {
			panic(err)
		}
		MOVQ(U32(0), ri.Addr)
		RET()
	}
	Label("match_dst_size_check_" + name)

	base := GP32()
	MOVL(s, base.As32())

	// s += 4, candidate += 4
	ADDL(U8(4), s)
	ADDL(U8(4), candidate)
	// Extend the 4-byte match as long as possible and emit copy.
	{
		assert(func(ok LabelRef) {
			// s must be > candidate; they cannot be equal.
			CMPL(s, candidate)
			JG(ok)
		})
		// srcLeft = len(src) - s
		srcLeft := GP64()
		MOVQ(lenSrcQ, srcLeft)
		SUBL(s, srcLeft.As32())
		assert(func(ok LabelRef) {
			// if srcLeft < MaxInt32: ok
			CMPQ(srcLeft, U32(0x7fffffff))
			JL(ok)
		})

		a, b := GP64(), GP64()
		LEAQ(Mem{Base: src, Index: s, Scale: 1}, a)
		LEAQ(Mem{Base: src, Index: candidate, Scale: 1}, b)
		length := o.matchLen("match_nolit_"+name,
			a, b,
			srcLeft,
			LabelRef("match_nolit_end_"+name),
		)
		Label("match_nolit_end_" + name)
		assert(func(ok LabelRef) {
			CMPL(length.As32(), U32(math.MaxInt32))
			JL(ok)
		})
		a, b, srcLeft = nil, nil, nil

		offset := GP64()
		offset32 := offset.As32()
		{
			// offset = base - candidate
			MOVL(s, offset32)
			SUBL(candidate, offset32)
			Comment("Check if repeat")
			CMPL(repeatL, offset32)
			JEQ(LabelRef("match_is_repeat_" + name))

			// NOT REPEAT
			{
				// Check if match is better..
				if o.maxLen > 65535 {
					CMPL(length.As32(), U8(1))
					JG(LabelRef("match_length_ok_" + name))
					CMPL(offset32, U32(65535))
					JLE(LabelRef("match_length_ok_" + name))
					// Match is equal to or worse than the encoding.
					MOVL(nextSTempL, s)
					INCL(s)
					JMP(LabelRef("search_loop_" + name))
					Label("match_length_ok_" + name)
				}
				// Store updated repeat
				MOVL(offset32, repeatL)
				// Emit....
				o.emitLiteralsDstP(nextEmitL, base, src, dst, "match_emit_"+name)
				// s += length (length is destroyed, use it now)
				ADDL(length.As32(), s)

				// length += 4
				ADDL(U8(4), length.As32())
				MOVL(s, nextEmitL) // nextEmit = s
				o.emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name))
				// Jumps at end
			}
			// REPEAT
			{
				Label("match_is_repeat_" + name)
				// Emit....
				o.emitLiteralsDstP(nextEmitL, base, src, dst, "match_emit_repeat_"+name)
				// s += length (length is destroyed, use it now)
				ADDL(length.As32(), s)

				// length += 4
				ADDL(U8(4), length.As32())
				MOVL(s, nextEmitL) // nextEmit = s
				o.emitRepeat("match_nolit_repeat_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name))
			}
		}
		Label("match_nolit_emitcopy_end_" + name)

		// if s >= sLimit { end }
		{
			CMPL(s.As32(), sLimitL)
			JGE(LabelRef("emit_remainder_" + name))
		}

		// Bail if we exceed the maximum size.
		{
			CMPQ(dst, dstLimitPtrQ)
			JL(LabelRef("match_nolit_dst_ok_" + name))
			ri, err := ReturnIndex(0).Resolve()
			if err != nil {
				panic(err)
			}
			MOVQ(U32(0), ri.Addr)
			RET()
		}
	}
	Label("match_nolit_dst_ok_" + name)
	// Prepare the next loop: index base+1 to base+3 and s-2/s-1 in the tables.
	if true {
		lHasher := hashN(lHashBytes, lTableBits)
		sHasher := hashN(sHashBytes, sTableBits)

		// Index base+1 long, base+2 short...
		cv := GP64()
		INCL(base)
		MOVQ(Mem{Base: src, Index: base, Scale: 1, Disp: 0}, cv)
		hash0, hash1, hash2, hash3 := GP64(), GP64(), GP64(), GP64()
		MOVQ(cv, hash0) // src[base+1]
		MOVQ(cv, hash1)
		MOVQ(cv, hash2)
		SHRQ(U8(8), hash1) // src[base+2]
		MOVQ(hash1, hash3)
		SHRQ(U8(16), hash2)        // src[base+3]
		bp1, bp2 := GP32(), GP32() // base+1, base+2
		LEAL(Mem{Base: base, Disp: 1}, bp1)
		LEAL(Mem{Base: base, Disp: 2}, bp2)

		// Load s-2 early
		MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, cv)

		lHasher.hash(hash0)
		lHasher.hash(hash3)
		sHasher.hash(hash1)
		sHasher.hash(hash2)
		assert(func(ok LabelRef) {
			CMPQ(hash0, U32(lTableSize))
			JL(ok)
		})
		assert(func(ok LabelRef) {
			CMPQ(hash3, U32(lTableSize))
			JL(ok)
		})
		assert(func(ok LabelRef) {
			CMPQ(hash1, U32(sTableSize))
			JL(ok)
		})
		assert(func(ok LabelRef) {
			CMPQ(hash2, U32(sTableSize))
			JL(ok)
		})
		MOVL(base, lTab.Idx(hash0, 4))
		MOVL(bp1, lTab.Idx(hash3, 4))
		MOVL(bp1, sTab.Idx(hash1, 4))
		MOVL(bp2, sTab.Idx(hash2, 4))

		// Index s-2 long, s-1 long+short...
		MOVQ(cv, hash0) // src[s-2]
		MOVQ(cv, hash1) // src[s-1]
		SHRQ(U8(8), hash1)
		MOVQ(hash1, hash3)
		sm1, sm2 := GP32(), GP32() // s-1, s-2
		LEAL(Mem{Base: s, Disp: -2}, sm2)
		LEAL(Mem{Base: s, Disp: -1}, sm1)
		lHasher.hash(hash0)
		sHasher.hash(hash1)
		lHasher.hash(hash3)
		assert(func(ok LabelRef) {
			CMPQ(hash0, U32(lTableSize))
			JL(ok)
		})
		assert(func(ok LabelRef) {
			CMPQ(hash3, U32(lTableSize))
			JL(ok)
		})
		assert(func(ok LabelRef) {
			CMPQ(hash1, U32(sTableSize))
			JL(ok)
		})
		MOVL(sm2, lTab.Idx(hash0, 4))
		MOVL(sm1, sTab.Idx(hash1, 4))
		MOVL(sm1, lTab.Idx(hash3, 4))
	}
	JMP(LabelRef("search_loop_" + name))

	Label("emit_remainder_" + name)
	// Bail if we exceed the maximum size.
	// if d+len(src)-nextEmit > dstLimit { return 0 }
	{
		// remain = len(src) - nextEmit
		remain := GP64()
		MOVQ(lenSrcQ, remain)
		SUBL(nextEmitL, remain.As32())

		dstExpect := GP64()
		// dst := dst + (len(src)-nextEmit)

		LEAQ(Mem{Base: dst, Index: remain, Scale: 1, Disp: literalMaxOverhead}, dstExpect)
		CMPQ(dstExpect, dstLimitPtrQ)
		JL(LabelRef("emit_remainder_ok_" + name))
		ri, err := ReturnIndex(0).Resolve()
		if err != nil {
			panic(err)
		}
		MOVQ(U32(0), ri.Addr)
		RET()
		Label("emit_remainder_ok_" + name)
	}
	// emitLiteral(dst[d:], src[nextEmit:])
	emitEnd := GP64()
	MOVQ(lenSrcQ, emitEnd)

	// Emit final literals.
	o.emitLiteralsDstP(nextEmitL, emitEnd, src, dst, "emit_remainder_"+name)

	// Assert size is < limit
	assert(func(ok LabelRef) {
		// if dstBaseQ < dstLimitPtrQ: ok
		CMPQ(dst, dstLimitPtrQ)
		JL(ok)
	})

	// length := start - base (ptr arithmetic)
	length := GP64()
	dstBase := Load(Param("dst").Base(), GP64())
	MOVQ(dst, length)
	SUBQ(dstBase, length)

	// Assert size is < len(src)
	assert(func(ok LabelRef) {
		// if len(src) >= length: ok
		CMPQ(lenSrcQ, length)
		JGE(ok)
	})
	// Assert size is < len(dst)
	assert(func(ok LabelRef) {
		// if len(dst) >= length: ok
		CMPQ(lenDstQ, length)
		JGE(ok)
	})
	Store(length, ReturnIndex(0))
	RET()
}

// emitLiterals emits literals from nextEmit to base, updates nextEmit and dstBase.
// Checks if base == nextEmit.
// src & base are untouched.
func (o options) emitLiterals(nextEmitL Mem, base reg.GPVirtual, src reg.GPVirtual, dstBase Mem, name string) {
	nextEmit, litLen, dstBaseTmp, litBase := GP32(), GP32(), GP64(), GP64()
	MOVL(nextEmitL, nextEmit)
	CMPL(nextEmit, base.As32())
	JEQ(LabelRef("emit_literal_skip_" + name))
	MOVL(base.As32(), litLen.As32())

	// Base is now next emit.
	MOVL(base.As32(), nextEmitL)

	// litBase = src[nextEmitL:]
	LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase)
	SUBL(nextEmit, litLen.As32()) // litLen = base - nextEmit

	// Load (and store when we return)
	MOVQ(dstBase, dstBaseTmp)
	o.emitLiteral(name, litLen, nil, dstBaseTmp, litBase, LabelRef("emit_literal_done_"+name), true)
	Label("emit_literal_done_" + name)

	// Emitted length must be > litLen.
	// We have already checked for len(0) above.
	assert(func(ok LabelRef) {
		tmp := GP64()
		MOVQ(dstBaseTmp, tmp)
		SUBQ(dstBase, tmp) // tmp = dstBaseTmp - dstBase
		// if tmp > litLen: ok
		CMPQ(tmp, litLen.As64())
		JG(ok)
	})
	// Store updated dstBase
	MOVQ(dstBaseTmp, dstBase)
	Label("emit_literal_skip_" + name)
}

// emitLiteralsDstP emits literals from nextEmit to base, updates nextEmit and dst.
// Checks if base == nextEmit.
// src & base are untouched.
func (o options) emitLiteralsDstP(nextEmitL Mem, base reg.GPVirtual, src, dst reg.GPVirtual, name string) {
	nextEmit, litLen, litBase := GP32(), GP32(), GP64()
	MOVL(nextEmitL, nextEmit)
	CMPL(nextEmit, base.As32())
	JEQ(LabelRef("emit_literal_done_" + name))
	MOVL(base.As32(), litLen.As32())

	// Base is now next emit.
	MOVL(base.As32(), nextEmitL)

	// litBase = src[nextEmitL:]
	LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase)
	SUBL(nextEmit, litLen.As32()) // litLen = base - nextEmit

	// Load (and store when we return)
	o.emitLiteral(name, litLen, nil, dst, litBase, LabelRef("emit_literal_done_"+name), true)
	Label("emit_literal_done_" + name)
}

type hashGen struct {
	bytes     int
	tablebits int
	mulreg    reg.GPVirtual
}

// hashN returns a hash generator that multiplies by a prime to hash the
// lowest 'hashBytes' bytes of a value into 'tablebits' bits.
func hashN(hashBytes, tablebits int) hashGen {
	h := hashGen{
		bytes:     hashBytes,
		tablebits: tablebits,
		mulreg:    GP64(),
	}
	primebytes := uint64(0)
	switch hashBytes {
	case 3:
		primebytes = 506832829
	case 4:
		primebytes = 2654435761
	case 5:
		primebytes = 889523592379
	case 6:
		primebytes = 227718039650203
	case 7:
		primebytes = 58295818150454627
	case 8:
		primebytes = 0xcf1bbcdcb7a56463
	default:
		panic("invalid hash length")
	}
	MOVQ(Imm(primebytes), h.mulreg)
	return h
}

// hash uses multiply to get hash of the value.
func (h hashGen) hash(val reg.GPVirtual) {
	// Move value to top of register.
	if h.bytes < 8 {
		SHLQ(U8(64-8*h.bytes), val)
	}
	IMULQ(h.mulreg, val)
	// Move value to bottom
	SHRQ(U8(64-h.tablebits), val)
}
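
// For reference, the generated shift/multiply/shift sequence computes the
// same value as this Go sketch (shown for hashBytes == 6, matching the
// reference hash6 in the s2 package):
//
//	func hash6(u uint64, tablebits uint8) uint32 {
//		const prime6bytes = 227718039650203
//		return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - tablebits) & 63))
//	}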

func (o options) genEmitLiteral() {
	TEXT("emitLiteral", NOSPLIT, "func(dst, lit []byte) int")
	Doc("emitLiteral writes a literal chunk and returns the number of bytes written.", "",
		"It assumes that:",
		"  dst is long enough to hold the encoded bytes",
		"  0 <= len(lit) && len(lit) <= math.MaxUint32", "")
	Pragma("noescape")

	dstBase, litBase, litLen, retval := GP64(), GP64(), GP64(), GP64()
	Load(Param("lit").Len(), litLen)
	Load(Param("dst").Base(), dstBase)
	Load(Param("lit").Base(), litBase)
	TESTQ(litLen, litLen)
	JZ(LabelRef("emit_literal_end_standalone_skip"))
	o.emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_standalone", false)

	Label("emit_literal_end_standalone_skip")
	XORQ(retval, retval)

	Label("emit_literal_end_standalone")
	Store(retval, ReturnIndex(0))
	RET()
}
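
// The generated standalone function (declared in the -stubs file named in
// the go:generate line above) is then callable from Go, roughly like this
// sketch, where d tracks the write position in dst:
//
//	d += emitLiteral(dst[d:], src[nextEmit:s])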

// emitLiteral can be used for inlining an emitLiteral call.
// litLen must be > 0.
// stack must have at least 32 bytes.
// retval will contain emitted bytes, but can be nil if this is not interesting.
// dstBase and litBase are updated.
// Uses 2 GP registers. With AVX 4 registers.
// If updateDst is true, dstBase will have the updated end pointer and an additional register will be used.
func (o options) emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, updateDst bool) {
	n := GP32()
	n16 := GP32()

	// litLen must be > 0
	assert(func(ok LabelRef) {
		TESTL(litLen.As32(), litLen.As32())
		JNZ(ok)
	})

	// We always add litLen bytes
	if retval != nil {
		MOVL(litLen.As32(), retval.As32())
	}
	// n = litLen - 1
	LEAL(Mem{Base: litLen.As32(), Disp: -1}, n)

	// Find number of bytes to emit for tag.
	CMPL(n.As32(), U8(60))
	JLT(LabelRef("one_byte_" + name))
	CMPL(n.As32(), U32(1<<8))
	JLT(LabelRef("two_bytes_" + name))
	if o.maxLen >= 1<<16 {
		CMPL(n.As32(), U32(1<<16))
		JLT(LabelRef("three_bytes_" + name))
	} else {
		JMP(LabelRef("three_bytes_" + name))
	}
	if o.maxLen >= 1<<16 {
		if o.maxLen >= 1<<24 {
			CMPL(n.As32(), U32(1<<24))
			JLT(LabelRef("four_bytes_" + name))
		} else {
			JMP(LabelRef("four_bytes_" + name))
		}
	}
	if o.maxLen >= 1<<24 {
		Label("five_bytes_" + name)
		MOVB(U8(252), Mem{Base: dstBase})
		MOVL(n.As32(), Mem{Base: dstBase, Disp: 1})
		if retval != nil {
			ADDQ(U8(5), retval)
		}
		ADDQ(U8(5), dstBase)
		JMP(LabelRef("memmove_long_" + name))
	}
	if o.maxLen >= 1<<16 {
		Label("four_bytes_" + name)
		MOVL(n, n16)
		SHRL(U8(16), n16.As32())
		MOVB(U8(248), Mem{Base: dstBase})
		MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
		MOVB(n16.As8(), Mem{Base: dstBase, Disp: 3})
		if retval != nil {
			ADDQ(U8(4), retval)
		}
		ADDQ(U8(4), dstBase)
		JMP(LabelRef("memmove_long_" + name))
	}
	Label("three_bytes_" + name)
	MOVB(U8(0xf4), Mem{Base: dstBase})
	MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
	if retval != nil {
		ADDQ(U8(3), retval)
	}
	ADDQ(U8(3), dstBase)
	JMP(LabelRef("memmove_long_" + name))

	Label("two_bytes_" + name)
	MOVB(U8(0xf0), Mem{Base: dstBase})
	MOVB(n.As8(), Mem{Base: dstBase, Disp: 1})
	if retval != nil {
		ADDQ(U8(2), retval)
	}
	ADDQ(U8(2), dstBase)
	CMPL(n.As32(), U8(64))
	JL(LabelRef("memmove_" + name))
	JMP(LabelRef("memmove_long_" + name))

	Label("one_byte_" + name)
	SHLB(U8(2), n.As8())
	MOVB(n.As8(), Mem{Base: dstBase})
	if retval != nil {
		ADDQ(U8(1), retval)
	}
	ADDQ(U8(1), dstBase)
	// Fallthrough

	Label("memmove_" + name)

	// copy(dst[i:], lit)
	dstEnd := GP64()
	copyEnd := end
	if updateDst {
		copyEnd = LabelRef("memmove_end_copy_" + name)
		LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd)
	}
	length := GP64()
	MOVL(litLen.As32(), length.As32())

	// updates litBase.
	o.genMemMoveShort("emit_lit_memmove_"+name, dstBase, litBase, length, copyEnd)

	if updateDst {
		Label("memmove_end_copy_" + name)
		MOVQ(dstEnd, dstBase)
	}
	JMP(end)

	// > 64 bytes
	Label("memmove_long_" + name)

	// copy(dst[i:], lit)
	dstEnd = GP64()
	copyEnd = end
	if updateDst {
		copyEnd = LabelRef("memmove_end_copy_long_" + name)
		LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd)
	}
	length = GP64()
	MOVL(litLen.As32(), length.As32())

	// updates litBase.
	o.genMemMoveLong("emit_lit_memmove_long_"+name, dstBase, litBase, length, copyEnd)

	if updateDst {
		Label("memmove_end_copy_long_" + name)
		MOVQ(dstEnd, dstBase)
	}
	JMP(end)
	// Should be unreachable
	if debug {
		INT(Imm(3))
	}
}

// genEmitRepeat generates a standalone emitRepeat.
func (o options) genEmitRepeat() {
	TEXT("emitRepeat", NOSPLIT, "func(dst []byte, offset, length int) int")
	Doc("emitRepeat writes a repeat chunk and returns the number of bytes written.",
		"Length must be at least 4 and < 1<<32", "")
	Pragma("noescape")

	dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()

	// retval = 0
	XORQ(retval, retval)

	Load(Param("dst").Base(), dstBase)
	Load(Param("offset"), offset)
	Load(Param("length"), length)
	o.emitRepeat("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_repeat_end"))
	Label("gen_emit_repeat_end")
	Store(retval, ReturnIndex(0))
	RET()
}

// emitRepeat can be used for inlining an emitRepeat call.
// length >= 4 and < 1<<32
// length is modified. dstBase is updated. retval is added to input.
// retval can be nil.
// Will jump to end label when finished.
// Uses 1 GP register.
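//
// The repeat encodings mirror the branches below (derived from the code;
// lengths are counted after the length -= 4 adjustment):
//
//	length <= 4:             2 bytes (length in the tag)
//	length < (1<<8)+4:       3 bytes (1 length byte)
//	length < (1<<16)+(1<<8): 4 bytes (2 length bytes)
//	otherwise:               5 bytes (3 length bytes)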
1714func (o options) emitRepeat(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) {
1715	Comment("emitRepeat")
1716	Label("emit_repeat_again_" + name)
1717	tmp := GP32()
1718	MOVL(length.As32(), tmp) // Copy length
1719	// length -= 4
1720	LEAL(Mem{Base: length, Disp: -4}, length.As32())
1721
1722	// if length <= 4 (use copied value)
1723	CMPL(tmp.As32(), U8(8))
1724	JLE(LabelRef("repeat_two_" + name))
1725
1726	// length < 8 && offset < 2048
1727	CMPL(tmp.As32(), U8(12))
1728	JGE(LabelRef("cant_repeat_two_offset_" + name))
1729	if o.maxLen >= 2048 {
1730		CMPL(offset.As32(), U32(2048))
1731		JLT(LabelRef("repeat_two_offset_" + name))
1732	}
1733
1734	const maxRepeat = ((1 << 24) - 1) + 65536
1735	Label("cant_repeat_two_offset_" + name)
1736	CMPL(length.As32(), U32((1<<8)+4))
1737	JLT(LabelRef("repeat_three_" + name)) // if length < (1<<8)+4
1738	if o.maxLen >= (1<<16)+(1<<8) {
1739		CMPL(length.As32(), U32((1<<16)+(1<<8)))
1740		JLT(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8)
1741	} else {
1742		// Not needed, we should skip to it when generating.
1743		// JMP(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8)
1744	}
1745	if o.maxLen >= maxRepeat {
1746		CMPL(length.As32(), U32(maxRepeat))
1747		JLT(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent.
1748
1749		// We have have more than 24 bits
1750		// Emit so we have at least 4 bytes left.
1751		LEAL(Mem{Base: length, Disp: -(maxRepeat - 4)}, length.As32()) // length -= (maxRepeat - 4)
1752		MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase})                   // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
1753		MOVW(U16(65531), Mem{Base: dstBase, Disp: 2})                  // 0xfffb
1754		MOVB(U8(255), Mem{Base: dstBase, Disp: 4})
1755		ADDQ(U8(5), dstBase)
1756		if retval != nil {
1757			ADDQ(U8(5), retval)
1758		}
1759		JMP(LabelRef("emit_repeat_again_" + name))
1760	} else {
1761		// Not needed.
1762		// JMP(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent.
1763	}
1764
1765	// Must be able to be within 5 bytes.
1766	if o.maxLen >= (1<<16)+(1<<8) {
1767		Label("repeat_five_" + name)
1768		LEAL(Mem{Base: length, Disp: -65536}, length.As32()) // length -= 65536
1769		MOVL(length.As32(), offset.As32())
1770		MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase})     // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
1771		MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
1772		SARL(U8(16), offset.As32())                      // offset = length >> 16
1773		MOVB(offset.As8(), Mem{Base: dstBase, Disp: 4})  // dst[4] = length >> 16
1774		if retval != nil {
1775			ADDQ(U8(5), retval) // i += 5
1776		}
1777		ADDQ(U8(5), dstBase) // dst += 5
1778		JMP(end)
1779	}
1780	Label("repeat_four_" + name)
1781	LEAL(Mem{Base: length, Disp: -256}, length.As32()) // length -= 256
1782	MOVW(U16(6<<2|tagCopy1), Mem{Base: dstBase})       // dst[0] = 6<<2 | tagCopy1, dst[1] = 0
1783	MOVW(length.As16(), Mem{Base: dstBase, Disp: 2})   // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
1784	if retval != nil {
1785		ADDQ(U8(4), retval) // i += 4
1786	}
1787	ADDQ(U8(4), dstBase) // dst += 4
1788	JMP(end)
1789
1790	Label("repeat_three_" + name)
1791	LEAL(Mem{Base: length, Disp: -4}, length.As32()) // length -= 4
1792	MOVW(U16(5<<2|tagCopy1), Mem{Base: dstBase})     // dst[0] = 5<<2 | tagCopy1, dst[1] = 0
1793	MOVB(length.As8(), Mem{Base: dstBase, Disp: 2})  // dst[2] = uint8(length)
1794	if retval != nil {
1795		ADDQ(U8(3), retval) // i += 3
1796	}
1797	ADDQ(U8(3), dstBase) // dst += 3
1798	JMP(end)
1799
1800	Label("repeat_two_" + name)
1801	// dst[0] = uint8(length)<<2 | tagCopy1, dst[1] = 0
1802	SHLL(U8(2), length.As32())
1803	ORL(U8(tagCopy1), length.As32())
1804	MOVW(length.As16(), Mem{Base: dstBase}) // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
1805	if retval != nil {
1806		ADDQ(U8(2), retval) // i += 2
1807	}
1808	ADDQ(U8(2), dstBase) // dst += 2
1809	JMP(end)
1810
1811	Label("repeat_two_offset_" + name)
1812	// Emit the remaining copy, encoded as 2 bytes.
1813	// dst[1] = uint8(offset)
1814	// dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
1815	tmp = GP64()
1816	XORQ(tmp, tmp)
1817	// Use scale and displacement to shift and subtract values from length.
1818	LEAL(Mem{Base: tmp, Index: length, Scale: 4, Disp: tagCopy1}, length.As32())
1819	MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
1820	SARL(U8(8), offset.As32())                      // Remove lower
1821	SHLL(U8(5), offset.As32())                      // Shift back up
1822	ORL(offset.As32(), length.As32())               // OR result
1823	MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
1824	if retval != nil {
1825		ADDQ(U8(2), retval) // i += 2
1826	}
1827	ADDQ(U8(2), dstBase) // dst += 2
1828
1829	JMP(end)
1830}
1831
1832// emitCopy writes a copy chunk and returns the number of bytes written.
1833//
1834// It assumes that:
1835//	dst is long enough to hold the encoded bytes
1836//	1 <= offset && offset <= math.MaxUint32
1837//	4 <= length && length <= 1 << 24
1838
1839// genEmitCopy generates a standlone emitCopy
1840func (o options) genEmitCopy() {
1841	TEXT("emitCopy", NOSPLIT, "func(dst []byte, offset, length int) int")
1842	Doc("emitCopy writes a copy chunk and returns the number of bytes written.", "",
1843		"It assumes that:",
1844		"  dst is long enough to hold the encoded bytes",
1845		"  1 <= offset && offset <= math.MaxUint32",
1846		"  4 <= length && length <= 1 << 24", "")
1847	Pragma("noescape")
1848
1849	dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()
1850
1851	//	i := 0
1852	XORQ(retval, retval)
1853	Load(Param("dst").Base(), dstBase)
1854	Load(Param("offset"), offset)
1855	Load(Param("length"), length)
1856	o.emitCopy("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_copy_end"))
1857	Label("gen_emit_copy_end")
1858	Store(retval, ReturnIndex(0))
1859	RET()
1860}
1861
1862// emitCopy writes a copy chunk and returns the number of bytes written.
1863//
1864// It assumes that:
1865//	dst is long enough to hold the encoded bytes
1866//	1 <= offset && offset <= math.MaxUint32
1867//	4 <= length && length <= 1 << 24
1868
1869// genEmitCopy generates a standlone emitCopy
1870func (o options) genEmitCopyNoRepeat() {
1871	TEXT("emitCopyNoRepeat", NOSPLIT, "func(dst []byte, offset, length int) int")
1872	Doc("emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.", "",
1873		"It assumes that:",
1874		"  dst is long enough to hold the encoded bytes",
1875		"  1 <= offset && offset <= math.MaxUint32",
1876		"  4 <= length && length <= 1 << 24", "")
1877	Pragma("noescape")
1878
1879	dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()
1880
1881	//	i := 0
1882	XORQ(retval, retval)
1883
1884	Load(Param("dst").Base(), dstBase)
1885	Load(Param("offset"), offset)
1886	Load(Param("length"), length)
1887	o.emitCopy("standalone_snappy", length, offset, retval, dstBase, "gen_emit_copy_end_snappy")
1888	Label("gen_emit_copy_end_snappy")
1889	Store(retval, ReturnIndex(0))
1890	RET()
1891}
1892
1893const (
1894	tagLiteral = 0x00
1895	tagCopy1   = 0x01
1896	tagCopy2   = 0x02
1897	tagCopy4   = 0x03
1898)
1899
1900// emitCopy can be used for inlining an emitCopy call.
// length is modified (and left as junk on return). dstBase is updated. retval is incremented by the number of bytes written.
1902// retval can be nil.
1903// Will jump to end label when finished.
1904// Uses 2 GP registers.
1905func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) {
1906	Comment("emitCopy")
1907
1908	if o.maxLen >= 65536 {
1909		//if offset >= 65536 {
1910		CMPL(offset.As32(), U32(65536))
1911		JL(LabelRef("two_byte_offset_" + name))
1912
1913		// offset is >= 65536
1914		//	if length <= 64 goto four_bytes_remain_
1915		Label("four_bytes_loop_back_" + name)
1916		CMPL(length.As32(), U8(64))
1917		JLE(LabelRef("four_bytes_remain_" + name))
1918
1919		// Emit a length 64 copy, encoded as 5 bytes.
1920		//		dst[0] = 63<<2 | tagCopy4
1921		MOVB(U8(63<<2|tagCopy4), Mem{Base: dstBase})
1922		//		dst[4] = uint8(offset >> 24)
1923		//		dst[3] = uint8(offset >> 16)
1924		//		dst[2] = uint8(offset >> 8)
1925		//		dst[1] = uint8(offset)
1926		MOVL(offset.As32(), Mem{Base: dstBase, Disp: 1})
1927		//		length -= 64
1928		LEAL(Mem{Base: length, Disp: -64}, length.As32())
1929		if retval != nil {
1930			ADDQ(U8(5), retval) // i+=5
1931		}
1932		ADDQ(U8(5), dstBase) // dst+=5
1933
1934		//	if length >= 4 {
1935		CMPL(length.As32(), U8(4))
1936		JL(LabelRef("four_bytes_remain_" + name))
1937
1938		// Emit remaining as repeats
1939		//	return 5 + emitRepeat(dst[5:], offset, length)
1940		// Inline call to emitRepeat. Will jump to end
1941		if !o.snappy {
1942			o.emitRepeat(name+"_emit_copy", length, offset, retval, dstBase, end)
1943		}
1944		JMP(LabelRef("four_bytes_loop_back_" + name))
1945
1946		Label("four_bytes_remain_" + name)
1947		//	if length == 0 {
1948		//		return i
1949		//	}
1950		TESTL(length.As32(), length.As32())
1951		JZ(end)
1952
1953		// Emit a copy, offset encoded as 4 bytes.
1954		//	dst[i+0] = uint8(length-1)<<2 | tagCopy4
1955		//	dst[i+1] = uint8(offset)
1956		//	dst[i+2] = uint8(offset >> 8)
1957		//	dst[i+3] = uint8(offset >> 16)
1958		//	dst[i+4] = uint8(offset >> 24)
1959		tmp := GP64()
1960		MOVB(U8(tagCopy4), tmp.As8())
1961		// Use displacement to subtract 1 from upshifted length.
1962		LEAL(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length.As32())
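		// The low byte of length is now uint8(length-1)<<2 | tagCopy4: the
		// base register holds the tag, Scale: 4 shifts, and Disp subtracts
		// 1<<2. The upper bits are junk, but only the low byte is stored.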
1963		MOVB(length.As8(), Mem{Base: dstBase})
1964		MOVL(offset.As32(), Mem{Base: dstBase, Disp: 1})
1965		//	return i + 5
1966		if retval != nil {
1967			ADDQ(U8(5), retval)
1968		}
1969		ADDQ(U8(5), dstBase)
1970		JMP(end)
1971	}
1972	Label("two_byte_offset_" + name)
	// Offset is no more than 2 bytes (offset < 65536).
1974
1975	//if length > 64 {
1976	CMPL(length.As32(), U8(64))
1977	JLE(LabelRef("two_byte_offset_short_" + name))
1978	// Emit a length 60 copy, encoded as 3 bytes.
1979	// Emit remaining as repeat value (minimum 4 bytes).
1980	//	dst[2] = uint8(offset >> 8)
1981	//	dst[1] = uint8(offset)
1982	//	dst[0] = 59<<2 | tagCopy2
1983	MOVB(U8(59<<2|tagCopy2), Mem{Base: dstBase})
1984	MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
1985	//	length -= 60
1986	LEAL(Mem{Base: length, Disp: -60}, length.As32())
1987
1988	// Emit remaining as repeats, at least 4 bytes remain.
1989	//	return 3 + emitRepeat(dst[3:], offset, length)
1990	//}
1991	ADDQ(U8(3), dstBase)
1992	if retval != nil {
1993		ADDQ(U8(3), retval)
1994	}
1995	// Inline call to emitRepeat. Will jump to end
1996	if !o.snappy {
1997		o.emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end)
1998	}
1999	JMP(LabelRef("two_byte_offset_" + name))
2000
2001	Label("two_byte_offset_short_" + name)
2002	//if length >= 12 || offset >= 2048 {
2003	CMPL(length.As32(), U8(12))
2004	JGE(LabelRef("emit_copy_three_" + name))
2005	if o.maxLen >= 2048 {
2006		CMPL(offset.As32(), U32(2048))
2007		JGE(LabelRef("emit_copy_three_" + name))
2008	}
2009	// Emit the remaining copy, encoded as 2 bytes.
2010	// dst[1] = uint8(offset)
2011	// dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
2012	tmp := GP64()
2013	MOVB(U8(tagCopy1), tmp.As8())
2014	// Use scale and displacement to shift and subtract values from length.
2015	LEAL(Mem{Base: tmp, Index: length, Scale: 4, Disp: -(4 << 2)}, length.As32())
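	// The low byte of length is now uint8(length-4)<<2 | tagCopy1; only
	// this byte is stored, so junk in the upper bits is harmless.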
2016	MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
2017	SHRL(U8(8), offset.As32())                      // Remove lower
2018	SHLL(U8(5), offset.As32())                      // Shift back up
2019	ORL(offset.As32(), length.As32())               // OR result
2020	MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
2021	if retval != nil {
2022		ADDQ(U8(2), retval) // i += 2
2023	}
2024	ADDQ(U8(2), dstBase) // dst += 2
2025	// return 2
2026	JMP(end)
2027
2028	Label("emit_copy_three_" + name)
	// Emit the remaining copy, encoded as 3 bytes.
2030	//	dst[2] = uint8(offset >> 8)
2031	//	dst[1] = uint8(offset)
2032	//	dst[0] = uint8(length-1)<<2 | tagCopy2
2033	tmp = GP64()
2034	MOVB(U8(tagCopy2), tmp.As8())
2035	LEAL(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length.As32())
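	// The low byte of length is now uint8(length-1)<<2 | tagCopy2.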
2036	MOVB(length.As8(), Mem{Base: dstBase})
2037	MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
2038	//	return 3
2039	if retval != nil {
2040		ADDQ(U8(3), retval) // i += 3
2041	}
2042	ADDQ(U8(3), dstBase) // dst += 3
2043	JMP(end)
2044}
2045
// genMemMoveShort generates a short memmove, modeled on the runtime's
// func memmove(to, from unsafe.Pointer, n uintptr).
// src and dst may not overlap.
// Uses 2 GP registers and up to 4 SSE2 registers.
// All passed registers may be updated.
// Length must be 1 to 64 bytes, inclusive.
2052func (o options) genMemMoveShort(name string, dst, src, length reg.GPVirtual, end LabelRef) {
2053	Comment("genMemMoveShort")
2054	AX, CX := GP64(), GP64()
2055	name += "_memmove_"
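	// Each size class below copies via a pair of (possibly overlapping)
	// loads and stores: one from the start of the block and one ending at
	// src+length, so no loop is needed. In Go terms, the 8-16 byte case is
	// roughly:
	//
	//	copy(dst[:8], src[:8])
	//	copy(dst[length-8:length], src[length-8:length])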
2056
2057	// Only enable if length can be 0.
2058	if false {
2059		TESTQ(length, length)
2060		JEQ(end)
2061	}
2062	assert(func(ok LabelRef) {
2063		CMPQ(length, U8(64))
2064		JBE(ok)
2065	})
2066	assert(func(ok LabelRef) {
2067		TESTQ(length, length)
2068		JNZ(ok)
2069	})
2070	Label(name + "tail")
2071	CMPQ(length, U8(3))
2072	JB(LabelRef(name + "move_1or2"))
2073	JE(LabelRef(name + "move_3"))
2074	CMPQ(length, U8(8))
2075	JB(LabelRef(name + "move_4through7"))
2076	CMPQ(length, U8(16))
2077	JBE(LabelRef(name + "move_8through16"))
2078	CMPQ(length, U8(32))
2079	JBE(LabelRef(name + "move_17through32"))
2080	if debug {
2081		CMPQ(length, U8(64))
2082		JBE(LabelRef(name + "move_33through64"))
2083		INT(U8(3))
2084	}
2085	JMP(LabelRef(name + "move_33through64"))
2086
2087	//genMemMoveLong(name, dst, src, length, end)
2088
2089	Label(name + "move_1or2")
2090	MOVB(Mem{Base: src}, AX.As8())
2091	MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8())
2092	MOVB(AX.As8(), Mem{Base: dst})
2093	MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1})
2094	JMP(end)
2095
2096	Label(name + "move_3")
2097	MOVW(Mem{Base: src}, AX.As16())
2098	MOVB(Mem{Base: src, Disp: 2}, CX.As8())
2099	MOVW(AX.As16(), Mem{Base: dst})
2100	MOVB(CX.As8(), Mem{Base: dst, Disp: 2})
2101	JMP(end)
2102
2103	Label(name + "move_4through7")
2104	MOVL(Mem{Base: src}, AX.As32())
2105	MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32())
2106	MOVL(AX.As32(), Mem{Base: dst})
2107	MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1})
2108	JMP(end)
2109
2110	Label(name + "move_8through16")
2111	MOVQ(Mem{Base: src}, AX)
2112	MOVQ(Mem{Base: src, Disp: -8, Index: length, Scale: 1}, CX)
2113	MOVQ(AX, Mem{Base: dst})
2114	MOVQ(CX, Mem{Base: dst, Disp: -8, Index: length, Scale: 1})
2115	JMP(end)
2116
2117	Label(name + "move_17through32")
2118	X0, X1, X2, X3 := XMM(), XMM(), XMM(), XMM()
2119
2120	MOVOU(Mem{Base: src}, X0)
2121	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X1)
2122	MOVOU(X0, Mem{Base: dst})
2123	MOVOU(X1, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
2124	JMP(end)
2125
2126	Label(name + "move_33through64")
2127	MOVOU(Mem{Base: src}, X0)
2128	MOVOU(Mem{Base: src, Disp: 16}, X1)
2129	MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2)
2130	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3)
2131	MOVOU(X0, Mem{Base: dst})
2132	MOVOU(X1, Mem{Base: dst, Disp: 16})
2133	MOVOU(X2, Mem{Base: dst, Disp: -32, Index: length, Scale: 1})
2134	MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
2135	JMP(end)
2136}
2137
// genMemMoveLong generates a long memmove, modeled on the runtime's
// func memmove(to, from unsafe.Pointer, n uintptr).
// src and dst may not overlap.
// length must be >= 64 bytes.
// The enabled SSE path uses 5 GP registers and 6 SSE2 registers;
// the disabled AVX variant would use 4 GP registers and 16 AVX/SSE registers.
// All passed registers may be updated.
2144func (o options) genMemMoveLong(name string, dst, src, length reg.GPVirtual, end LabelRef) {
2145	Comment("genMemMoveLong")
2146	name += "large_"
2147
2148	assert(func(ok LabelRef) {
2149		CMPQ(length, U8(64))
2150		JAE(ok)
2151	})
2152
2153	// These are disabled.
2154	// AVX is ever so slightly faster, but it is disabled for simplicity.
2155	const branchLoops = false
2156	const avx = false && branchLoops
2157	if branchLoops {
2158		CMPQ(length, U8(128))
2159		JBE(LabelRef(name + "move_65through128"))
2160		CMPQ(length, U32(256))
2161		JBE(LabelRef(name + "move_129through256"))
2162		if avx {
2163			JMP(LabelRef(name + "avxUnaligned"))
2164		} else {
2165			JMP(LabelRef(name + "forward_sse"))
2166		}
2167
2168		X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
2169		X8, X9, X10, X11, X12, X13, X14, X15 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
2170		Label(name + "move_65through128")
2171		MOVOU(Mem{Base: src}, X0)
2172		MOVOU(Mem{Base: src, Disp: 16}, X1)
2173		MOVOU(Mem{Base: src, Disp: 32}, X2)
2174		MOVOU(Mem{Base: src, Disp: 48}, X3)
2175		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
2176		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
2177		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
2178		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
2179		MOVOU(X0, Mem{Base: dst})
2180		MOVOU(X1, Mem{Base: dst, Disp: 16})
2181		MOVOU(X2, Mem{Base: dst, Disp: 32})
2182		MOVOU(X3, Mem{Base: dst, Disp: 48})
2183		MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
2184		MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
2185		MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
2186		MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
2187		JMP(end)
2188
2189		Label(name + "move_129through256")
2190		MOVOU(Mem{Base: src}, X0)
2191		MOVOU(Mem{Base: src, Disp: 16}, X1)
2192		MOVOU(Mem{Base: src, Disp: 32}, X2)
2193		MOVOU(Mem{Base: src, Disp: 48}, X3)
2194		MOVOU(Mem{Base: src, Disp: 64}, X4)
2195		MOVOU(Mem{Base: src, Disp: 80}, X5)
2196		MOVOU(Mem{Base: src, Disp: 96}, X6)
2197		MOVOU(Mem{Base: src, Disp: 112}, X7)
2198		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -128}, X8)
2199		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -112}, X9)
2200		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -96}, X10)
2201		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -80}, X11)
2202		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
2203		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
2204		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
2205		MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
2206		MOVOU(X0, Mem{Base: dst})
2207		MOVOU(X1, Mem{Base: dst, Disp: 16})
2208		MOVOU(X2, Mem{Base: dst, Disp: 32})
2209		MOVOU(X3, Mem{Base: dst, Disp: 48})
2210		MOVOU(X4, Mem{Base: dst, Disp: 64})
2211		MOVOU(X5, Mem{Base: dst, Disp: 80})
2212		MOVOU(X6, Mem{Base: dst, Disp: 96})
2213		MOVOU(X7, Mem{Base: dst, Disp: 112})
2214		MOVOU(X8, Mem{Base: dst, Index: length, Scale: 1, Disp: -128})
2215		MOVOU(X9, Mem{Base: dst, Index: length, Scale: 1, Disp: -112})
2216		MOVOU(X10, Mem{Base: dst, Index: length, Scale: 1, Disp: -96})
2217		MOVOU(X11, Mem{Base: dst, Index: length, Scale: 1, Disp: -80})
2218		MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
2219		MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
2220		MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
2221		MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
2222		JMP(end)
2223		if avx {
2224			Label(name + "avxUnaligned")
2225			AX, CX, R8, R10 := GP64(), GP64(), GP64(), GP64()
2226			// Memory layout on the source side
2227			// src                                       CX
2228			// |<---------length before correction--------->|
2229			// |       |<--length corrected-->|             |
2230			// |       |                  |<--- AX  --->|
			// |<-R8 ->|                  |<-128 bytes->|
2232			// +----------------------------------------+
2233			// | Head  | Body             | Tail        |
2234			// +-------+------------------+-------------+
2235			// ^       ^                  ^
2236			// |       |                  |
2237			// Save head into Y4          Save tail into X5..X12
2238			//         |
			//         src+R8, where R8 = ((dst & -32) + 32) - dst
2240			// Algorithm:
2241			// 1. Unaligned save of the tail's 128 bytes
2242			// 2. Unaligned save of the head's 32  bytes
2243			// 3. Destination-aligned copying of body (128 bytes per iteration)
2244			// 4. Put head on the new place
2245			// 5. Put the tail on the new place
			// For small sizes the cost of copying the unaligned head and tail
			// is comparable to the cost of the main loop, so the steps are
			// interleaved to keep the processor's pipeline busy; the code is
			// somewhat intermixed as a result. A cleaner version of this
			// algorithm for bigger sizes, where the unaligned copying cost is
			// negligible, can be found after the gobble_big_data_fwd label in
			// the runtime memmove this is adapted from.
2252			Y0, Y1, Y2, Y3, Y4 := YMM(), YMM(), YMM(), YMM(), YMM()
2253
2254			LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
2255			MOVQ(dst, R10)
			// CX points to the end of the buffer, so we go back slightly and use negative offsets from there.
2257			MOVOU(Mem{Base: CX, Disp: -0x80}, X5)
2258			MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
2259			MOVQ(U32(0x80), AX)
2260
			// Round the destination address up to the next 32-byte boundary.
2262			ANDQ(U32(0xffffffe0), dst)
2263			ADDQ(U8(32), dst)
2264			// Continue tail saving.
2265			MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
2266			MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
2267			// Make R8 delta between aligned and unaligned destination addresses.
2268			MOVQ(dst, R8)
2269			SUBQ(R10, R8)
2270			// Continue tail saving.
2271			MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
2272			MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
			// Adjust the bytes-to-copy value, now that the unaligned part has been prepared for copying.
2274			SUBQ(R8, length)
2275			// Continue tail saving.
2276			MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
2277			MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
2278			// The tail will be put on its place after main body copying.
2279			// It's time for the unaligned heading part.
2280			VMOVDQU(Mem{Base: src}, Y4)
2281			// Adjust source address to point past head.
2282			ADDQ(R8, src)
2283			SUBQ(AX, length)
2284
			// Main loop: copy the aligned body, 128 bytes per iteration.
2286			Label(name + "gobble_128_loop")
2287			VMOVDQU(Mem{Base: src}, Y0)
2288			VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
2289			VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
2290			VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)
2291			ADDQ(AX, src)
2292			VMOVDQA(Y0, Mem{Base: dst})
2293			VMOVDQA(Y1, Mem{Base: dst, Disp: 0x20})
2294			VMOVDQA(Y2, Mem{Base: dst, Disp: 0x40})
2295			VMOVDQA(Y3, Mem{Base: dst, Disp: 0x60})
2296			ADDQ(AX, dst)
2297			SUBQ(AX, length)
2298			JA(LabelRef(name + "gobble_128_loop"))
2299			// Now we can store unaligned parts.
2300			ADDQ(AX, length)
2301			ADDQ(dst, length)
2302			VMOVDQU(Y4, Mem{Base: R10})
2303			VZEROUPPER()
2304			MOVOU(X5, Mem{Base: length, Disp: -0x80})
2305			MOVOU(X6, Mem{Base: length, Disp: -0x70})
2306			MOVOU(X7, Mem{Base: length, Disp: -0x60})
2307			MOVOU(X8, Mem{Base: length, Disp: -0x50})
2308			MOVOU(X9, Mem{Base: length, Disp: -0x40})
2309			MOVOU(X10, Mem{Base: length, Disp: -0x30})
2310			MOVOU(X11, Mem{Base: length, Disp: -0x20})
2311			MOVOU(X12, Mem{Base: length, Disp: -0x10})
2312			JMP(end)
2313
2314			return
2315		}
2316	}
2317
2318	// Store start and end for sse_tail
2319	Label(name + "forward_sse")
2320	X0, X1, X2, X3, X4, X5 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
2321	// X6, X7 :=  XMM(), XMM()
2322	//X8, X9, X10, X11 := XMM(), XMM(), XMM(), XMM()
2323
2324	MOVOU(Mem{Base: src}, X0)
2325	MOVOU(Mem{Base: src, Disp: 16}, X1)
2326	MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2)
2327	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3)
2328
2329	// forward (only)
2330	dstAlign := GP64()
2331	bigLoops := GP64()
2332	MOVQ(length, bigLoops)
2333	SHRQ(U8(5), bigLoops) // bigLoops = length / 32
2334
2335	MOVQ(dst, dstAlign)
2336	ANDL(U32(31), dstAlign.As32())
2337	srcOff := GP64()
2338	MOVQ(U32(64), srcOff)
2339	SUBQ(dstAlign, srcOff)
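	// srcOff = 64 - (dst&31), so the main loop's first store at
	// dst+srcOff-32 lands on a 32-byte boundary; the head bytes before it
	// are patched up at the end using X0/X1.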
2340
2341	// Move 32 bytes/loop
2342	DECQ(bigLoops)
2343	JA(LabelRef(name + "forward_sse_loop_32"))
2344
2345	// Can be moved inside loop for less regs.
2346	srcPos := GP64()
2347	LEAQ(Mem{Disp: -32, Base: src, Scale: 1, Index: srcOff}, srcPos)
2348	dstPos := GP64()
2349	LEAQ(Mem{Disp: -32, Base: dst, Scale: 1, Index: srcOff}, dstPos)
2350
2351	Label(name + "big_loop_back")
2352
2353	MOVOU(Mem{Disp: 0, Base: srcPos}, X4)
2354	MOVOU(Mem{Disp: 16, Base: srcPos}, X5)
2355
2356	MOVOA(X4, Mem{Disp: 0, Base: dstPos})
2357	MOVOA(X5, Mem{Disp: 16, Base: dstPos})
2358	ADDQ(U8(32), dstPos)
2359	ADDQ(U8(32), srcPos)
2360	ADDQ(U8(32), srcOff) // This could be outside the loop, but we lose a reg if we do.
2361	DECQ(bigLoops)
2362	JNA(LabelRef(name + "big_loop_back"))
2363
2364	Label(name + "forward_sse_loop_32")
2365	MOVOU(Mem{Disp: -32, Base: src, Scale: 1, Index: srcOff}, X4)
2366	MOVOU(Mem{Disp: -16, Base: src, Scale: 1, Index: srcOff}, X5)
2367	MOVOA(X4, Mem{Disp: -32, Base: dst, Scale: 1, Index: srcOff})
2368	MOVOA(X5, Mem{Disp: -16, Base: dst, Scale: 1, Index: srcOff})
2369	ADDQ(U8(32), srcOff)
2370	CMPQ(length, srcOff)
2371	JAE(LabelRef(name + "forward_sse_loop_32"))
2372
2373	// sse_tail patches up the beginning and end of the transfer.
2374	MOVOU(X0, Mem{Base: dst, Disp: 0})
2375	MOVOU(X1, Mem{Base: dst, Disp: 16})
2376	MOVOU(X2, Mem{Base: dst, Disp: -32, Index: length, Scale: 1})
2377	MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
2378
2379	JMP(end)
2380	return
2381}
2382
2383// genMatchLen generates standalone matchLen.
2384func (o options) genMatchLen() {
2385	TEXT("matchLen", NOSPLIT, "func(a, b []byte) int")
2386	Doc("matchLen returns how many bytes match in a and b", "",
2387		"It assumes that:",
2388		"  len(a) <= len(b)", "")
2389	Pragma("noescape")
2390
2391	aBase, bBase, length := GP64(), GP64(), GP64()
2392
2393	Load(Param("a").Base(), aBase)
2394	Load(Param("b").Base(), bBase)
2395	Load(Param("a").Len(), length)
2396	l := o.matchLen("standalone", aBase, bBase, length, LabelRef("gen_match_len_end"))
2397	Label("gen_match_len_end")
2398	Store(l.As64(), ReturnIndex(0))
2399	RET()
2400}
2401
2402// matchLen returns the number of matching bytes of a and b.
2403// len is the maximum number of bytes to match.
// Jumps to end when done; the returned register holds the matched length.
2405// Uses 2 GP registers.
2406func (o options) matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) reg.GPVirtual {
2407	Comment("matchLen")
2408	if false {
2409		return o.matchLenAlt(name, a, b, len, end)
2410	}
2411	tmp, matched := GP64(), GP32()
2412	XORL(matched, matched)
2413
2414	CMPL(len.As32(), U8(8))
2415	JL(LabelRef("matchlen_single_" + name))
2416
2417	Label("matchlen_loopback_" + name)
2418	MOVQ(Mem{Base: a, Index: matched, Scale: 1}, tmp)
2419	XORQ(Mem{Base: b, Index: matched, Scale: 1}, tmp)
2420	TESTQ(tmp, tmp)
2421	JZ(LabelRef("matchlen_loop_" + name))
2422	// Not all match.
2423	BSFQ(tmp, tmp)
2424	SARQ(U8(3), tmp)
2425	LEAL(Mem{Base: matched, Index: tmp, Scale: 1}, matched)
2426	JMP(end)
2427
	// All 8 bytes matched; update counters and loop.
2429	Label("matchlen_loop_" + name)
2430	LEAL(Mem{Base: len, Disp: -8}, len.As32())
2431	LEAL(Mem{Base: matched, Disp: 8}, matched)
2432	CMPL(len.As32(), U8(8))
2433	JGE(LabelRef("matchlen_loopback_" + name))
2434
2435	// Less than 8 bytes left.
2436	Label("matchlen_single_" + name)
2437	TESTL(len.As32(), len.As32())
2438	JZ(end)
2439	Label("matchlen_single_loopback_" + name)
2440	MOVB(Mem{Base: a, Index: matched, Scale: 1}, tmp.As8())
2441	CMPB(Mem{Base: b, Index: matched, Scale: 1}, tmp.As8())
2442	JNE(end)
2443	LEAL(Mem{Base: matched, Disp: 1}, matched)
2444	DECL(len.As32())
2445	JNZ(LabelRef("matchlen_single_loopback_" + name))
2446	JMP(end)
2447	return matched
2448}
2449
// matchLenAlt returns the number of matching bytes of a and b.
// len is the maximum number of bytes to match.
// Jumps to end when done; the returned register holds the matched length.
2453// Uses 3 GP registers.
2454// It is better on longer matches.
2455func (o options) matchLenAlt(name string, a, b, len reg.GPVirtual, end LabelRef) reg.GPVirtual {
2456	Comment("matchLenAlt")
2457	tmp, tmp2, matched := GP64(), GP64(), GP32()
2458	XORL(matched, matched)
2459
2460	CMPL(len.As32(), U8(16))
2461	JB(LabelRef("matchlen_short_" + name))
2462
2463	Label("matchlen_loopback_" + name)
2464	MOVQ(Mem{Base: a}, tmp)
2465	MOVQ(Mem{Base: a, Disp: 8}, tmp2)
2466	XORQ(Mem{Base: b, Disp: 0}, tmp)
2467	XORQ(Mem{Base: b, Disp: 8}, tmp2)
2468	endTest := func(xored reg.GPVirtual, disp int, ok LabelRef) {
2469		TESTQ(xored, xored)
2470		JZ(ok)
2471		// Not all match.
2472		BSFQ(xored, xored)
2473		SARQ(U8(3), xored)
2474		LEAL(Mem{Base: matched, Index: xored, Scale: 1, Disp: disp}, matched)
2475		JMP(end)
2476	}
2477	endTest(tmp, 0, LabelRef("matchlen_loop_tmp2_"+name))
2478	Label("matchlen_loop_tmp2_" + name)
2479	endTest(tmp2, 8, LabelRef("matchlen_loop_"+name))
2480
	// All 16 bytes matched; update counters and loop.
2482	Label("matchlen_loop_" + name)
2483	SUBL(U8(16), len.As32())
2484	ADDL(U8(16), matched)
2485	ADDQ(U8(16), a)
2486	ADDQ(U8(16), b)
2487	CMPL(len.As32(), U8(16))
2488	JAE(LabelRef("matchlen_loopback_" + name))
2489
	// Test 4 bytes at a time...
2491	Label("matchlen_short_" + name)
2492	lenoff := 0
2493	if true {
2494		lenoff = 4
2495		SUBL(U8(4), len.As32())
2496		JC(LabelRef("matchlen_single_resume_" + name))
2497
2498		Label("matchlen_four_loopback_" + name)
2499		assert(func(ok LabelRef) {
2500			CMPL(len.As32(), U32(math.MaxInt32))
2501			JL(ok)
2502		})
2503
2504		MOVL(Mem{Base: a}, tmp.As32())
2505		XORL(Mem{Base: b}, tmp.As32())
2506		{
2507			JZ(LabelRef("matchlen_four_loopback_next" + name))
2508			BSFL(tmp.As32(), tmp.As32())
2509			SARQ(U8(3), tmp)
2510			LEAL(Mem{Base: matched, Index: tmp, Scale: 1}, matched)
2511			JMP(end)
2512		}
2513		Label("matchlen_four_loopback_next" + name)
2514		ADDL(U8(4), matched)
2515		ADDQ(U8(4), a)
2516		ADDQ(U8(4), b)
2517		SUBL(U8(4), len.As32())
2518		JNC(LabelRef("matchlen_four_loopback_" + name))
2519	}
2520
	// Test one byte at a time.
2522	Label("matchlen_single_resume_" + name)
2523	if true {
		// Less than 4 bytes left.
2525		if lenoff > 0 {
2526			ADDL(U8(lenoff), len.As32())
2527		}
2528		TESTL(len.As32(), len.As32())
2529		JZ(end)
2530
2531		Label("matchlen_single_loopback_" + name)
2532		MOVB(Mem{Base: a}, tmp.As8())
2533		CMPB(Mem{Base: b}, tmp.As8())
2534		JNE(end)
2535		INCL(matched)
2536		INCQ(a)
2537		INCQ(b)
2538		DECL(len.As32())
2539		JNZ(LabelRef("matchlen_single_loopback_" + name))
2540	}
2541	JMP(end)
2542	return matched
2543}
2544