1// +build ignore
2
3package main
4
5import (
6	"fmt"
7	"log"
8
9	. "github.com/mmcloughlin/avo/build"
10	"github.com/mmcloughlin/avo/buildtags"
11	"github.com/mmcloughlin/avo/operand"
12	. "github.com/mmcloughlin/avo/operand"
13	"github.com/mmcloughlin/avo/reg"
14)
15
16func main() {
17	Constraint(buildtags.Not("appengine").ToConstraint())
18	Constraint(buildtags.Not("noasm").ToConstraint())
19	Constraint(buildtags.Term("gc").ToConstraint())
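	// Each Constraint call adds a build tag line to the generated assembly, so the
	// output is only built with the gc toolchain and when neither the appengine nor
	// the noasm tag is set.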
20
21	genEncodeBlockAsm("encodeBlockAsm", 16, 6, false)
22	genEncodeBlockAsm("encodeBlockAsm14B", 14, 5, false)
23	genEncodeBlockAsm("encodeBlockAsm12B", 12, 4, false)
24	genEncodeBlockAsm("encodeBlockAsmAvx", 16, 6, true)
25	genEncodeBlockAsm("encodeBlockAsm14BAvx", 14, 5, true)
26	genEncodeBlockAsm("encodeBlockAsm12BAvx", 12, 4, true)
27	genEmitLiteral()
28	genEmitRepeat()
29	genEmitCopy()
30	genMatchLen()
31	Generate()
32}
33
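// debugval is a debugging helper: it moves v into R15 and emits INT 3 (a
// breakpoint trap) so the value can be inspected in a debugger.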
34func debugval(v operand.Op) {
35	value := reg.R15
36	MOVQ(v, value)
37	INT(Imm(3))
38}
39
40func genEncodeBlockAsm(name string, tableBits, skipLog int, avx bool) {
41	TEXT(name, 0, "func(dst, src []byte) int")
42	Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.",
43		"It assumes that the varint-encoded length of the decompressed bytes has already been written.", "")
44	Pragma("noescape")
45
	// The hash table ("table" below) takes up tableSize = 1 << tableBits bytes of stack space.
	// Extra bytes are allocated in front of it to hold the scratch values defined below.
48	var (
49		tableSize = 1 << uint(tableBits)
50		// Keep base stack multiple of 16.
51		baseStack = 0
52		// try to keep extraStack + baseStack multiple of 16
53		// for best chance of table alignment.
54		extraStack = 32
55		allocStack = baseStack + extraStack + tableSize
56	)
57
58	// Memzero needs at least 128 bytes.
59	if tableSize < 128 {
60		panic("tableSize must be at least 128 bytes")
61	}
62
63	lenSrcBasic, err := Param("src").Len().Resolve()
64	if err != nil {
65		panic(err)
66	}
67	lenSrcQ := lenSrcBasic.Addr
68
69	stack := AllocLocal(allocStack)
70	table := stack.Offset(allocStack - tableSize)
71
72	tmpStack := baseStack
73	// Bail if we can't compress to at least this.
74	dstLimitPtrQ := stack.Offset(tmpStack)
75	tmpStack += 8
76	// dstStartPtrQ contains the original dst pointer for returning the length
77	dstStartPtrQ := stack.Offset(tmpStack)
78	tmpStack += 8
79	// sLimitL is when to stop looking for offset/length copies.
80	sLimitL := stack.Offset(tmpStack)
81	tmpStack += 4
82	// nextEmitL keeps track of the point we have emitted to.
83	nextEmitL := stack.Offset(tmpStack)
84	tmpStack += 4
85	// Repeat stores the last match offset.
86	repeatL := stack.Offset(tmpStack)
87	tmpStack += 4
88	// nextSTempL keeps nextS while other functions are being called.
89	nextSTempL := stack.Offset(tmpStack)
90	tmpStack += 4
91	// Ensure we have the correct extra stack.
92	// Could be automatic, but whatever.
93	if tmpStack-baseStack != extraStack {
94		log.Fatal("adjust extraStack to ", tmpStack-baseStack)
95	}
96
97	dstBaseBasic, err := Param("dst").Base().Resolve()
98	if err != nil {
99		panic(err)
100	}
101	dstBase := dstBaseBasic.Addr
102
103	if tmpStack > extraStack+baseStack {
104		panic(fmt.Sprintf("tmp stack exceeded: %v", tmpStack))
105	}
106
107	// Zero table
108	{
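		// Clear the table 128 bytes (8 x 16-byte MOVOU stores) per iteration;
		// iReg counts tableSize/128 iterations and reaches zero when done.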
109		iReg := GP64()
110		MOVQ(U32(tableSize/8/16), iReg)
111		tablePtr := GP64()
112		LEAQ(table, tablePtr)
113		zeroXmm := XMM()
114		PXOR(zeroXmm, zeroXmm)
115
116		Label("zero_loop_" + name)
117		for i := 0; i < 8; i++ {
118			MOVOU(zeroXmm, Mem{Base: tablePtr, Disp: i * 16})
119		}
120		ADDQ(U8(16*8), tablePtr)
121		DECQ(iReg)
122		JNZ(LabelRef("zero_loop_" + name))
123
		// nextEmit is the offset in src where the next emitLiteral should start from.
		// iReg is zero after the loop above, so this initializes nextEmit to 0.
		MOVL(iReg.As32(), nextEmitL)
126	}
127
128	{
129		const inputMargin = 8
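		// inputMargin bytes are held back so the unconditional 8-byte loads in the
		// search loop below stay within src.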
130		tmp, tmp2, tmp3 := GP64(), GP64(), GP64()
131		MOVQ(lenSrcQ, tmp)
132		LEAQ(Mem{Base: tmp, Disp: -5}, tmp2)
133		// sLimitL := len(src) - inputMargin
134		LEAQ(Mem{Base: tmp, Disp: -inputMargin}, tmp3)
135		// dstLimit := len(src) - len(src)>>5 - 5
136		SHRQ(U8(5), tmp)
137		SUBL(tmp.As32(), tmp2.As32()) // tmp2 = tmp2 - tmp
138		MOVL(tmp3.As32(), sLimitL)
139		dstAddr := GP64()
140		MOVQ(dstBase, dstAddr)
141		// Store dst start address
142		MOVQ(dstAddr, dstStartPtrQ)
143		LEAQ(Mem{Base: dstAddr, Index: tmp2, Scale: 1}, tmp2)
144		MOVQ(tmp2, dstLimitPtrQ)
145	}
146
147	// s = 1
148	s := GP64().As32()
149	MOVL(U32(1), s)
150	// repeatL = 1
151	MOVL(s, repeatL)
152
153	src := GP64()
154	Load(Param("src").Base(), src)
155
156	// Load cv
157	Label("search_loop_" + name)
158	candidate := GP64().As32()
159	{
160		cv := GP64()
161		MOVQ(Mem{Base: src, Index: s, Scale: 1}, cv)
162		nextS := GP64()
		// nextS := s + (s-nextEmit)>>skipLog + 4
164		{
165			tmp := GP64()
166			MOVL(s, tmp.As32())           // tmp = s
167			SUBL(nextEmitL, tmp.As32())   // tmp = s - nextEmit
168			SHRL(U8(skipLog), tmp.As32()) // tmp = (s - nextEmit) >> skipLog
169			LEAQ(Mem{Base: s, Disp: 4, Index: tmp, Scale: 1}, nextS)
170		}
171		// if nextS > sLimit {goto emitRemainder}
172		{
173			tmp := GP64()
174			MOVL(sLimitL, tmp.As32())
175			CMPL(nextS.As32(), tmp.As32())
176			JGT(LabelRef("emit_remainder_" + name))
177		}
178		// move nextS to stack.
179		MOVL(nextS.As32(), nextSTempL)
180
181		candidate2 := GP64().As32()
182		hasher := hash6(tableBits)
183		{
184			hash0, hash1 := GP64(), GP64()
185			MOVQ(cv, hash0)
186			MOVQ(cv, hash1)
187			SHRQ(U8(8), hash1)
188			hasher.hash(hash0)
189			hasher.hash(hash1)
190			MOVL(table.Idx(hash0, 1), candidate)
191			MOVL(table.Idx(hash1, 1), candidate2)
192			MOVL(s, table.Idx(hash0, 1))
193			tmp := GP64().As32()
194			LEAL(Mem{Base: s, Disp: 1}, tmp)
195			MOVL(tmp, table.Idx(hash1, 1))
196		}
197		// Check repeat at offset checkRep
198		const checkRep = 1
199
200		if true {
201			// rep = s - repeat
202			rep := GP64().As32()
203			if true {
204				// if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
205				left, right := GP64(), GP64()
206				MOVL(s, rep)
207				SUBL(repeatL, rep) // rep = s - repeat
208				MOVL(Mem{Base: src, Index: rep, Scale: 1, Disp: checkRep}, right.As32())
209				MOVQ(cv, left)
				SHRQ(U8(checkRep*8), left)
211				CMPL(left.As32(), right.As32())
212
213				// FIXME: Unable to allocate if enabled.
214				JNE(LabelRef("no_repeat_found_" + name))
215			}
216			// base = s + 1
217			base := GP64()
218			LEAQ(Mem{Base: s, Disp: 1}, base)
219			// Extend back
220			if true {
221				ne := GP64().As32()
222				MOVL(nextEmitL, ne)
223				TESTL(rep, rep)
224				JZ(LabelRef("repeat_extend_back_end_" + name))
225
				// rep is tested when decremented, so we loop back here.
227				Label("repeat_extend_back_loop_" + name)
228				CMPL(base.As32(), ne)
229				JG(LabelRef("repeat_extend_back_end_" + name))
230				// if src[i-1] == src[base-1]
231				tmp, tmp2 := GP64(), GP64()
232				MOVB(Mem{Base: src, Index: rep, Scale: 1, Disp: -1}, tmp.As8())
233				MOVB(Mem{Base: src, Index: base, Scale: 1, Disp: -1}, tmp2.As8())
234				CMPB(tmp.As8(), tmp2.As8())
235				JNE(LabelRef("repeat_extend_back_end_" + name))
236				LEAQ(Mem{Base: base, Disp: -1}, base)
237				DECL(rep)
238				JZ(LabelRef("repeat_extend_back_end_" + name))
239				JMP(LabelRef("repeat_extend_back_loop_" + name))
240			}
241			Label("repeat_extend_back_end_" + name)
242			// Base is now at start.
243			// d += emitLiteral(dst[d:], src[nextEmitL:base])
244			if true {
245				emitLiterals(nextEmitL, base, src, dstBase, "repeat_emit_"+name, avx)
246			}
247
248			// Extend forward
249			if true {
250				// s += 4 + checkRep
251				ADDL(U8(4+checkRep), s)
252
253				// candidate := s - repeat + 4 + checkRep
254				MOVL(s, candidate)
255				SUBL(repeatL, candidate) // candidate = s - repeatL
256				{
257					// srcLeft = sLimitL - s
258					srcLeft := GP64()
259					MOVL(sLimitL, srcLeft.As32())
260					SUBL(s, srcLeft.As32())
261
262					// Forward address
263					forwardStart := Mem{Base: src, Index: s, Scale: 1}
					// Address of the previous occurrence (s - repeat)
265					backStart := Mem{Base: src, Index: candidate, Scale: 1}
266					length := matchLen("repeat_extend", forwardStart, backStart, srcLeft, LabelRef("repeat_extend_forward_end_"+name))
267					Label("repeat_extend_forward_end_" + name)
					// s += length
269					ADDL(length.As32(), s)
270				}
271			}
272			// Emit
273			if true {
274				// length = s-base
275				length := GP64()
276				MOVL(s, length.As32())
277				SUBL(base.As32(), length.As32())
278
279				offsetVal := GP64()
280				MOVL(repeatL, offsetVal.As32())
281				dst := GP64()
282				MOVQ(dstBase, dst)
283
284				// if nextEmit > 0
285				tmp := GP64()
286				MOVL(nextEmitL, tmp.As32())
287				TESTL(tmp.As32(), tmp.As32())
288
289				// FIXME: fails to allocate regs if enabled:
290				JZ(LabelRef("repeat_as_copy_" + name))
291
292				emitRepeat("match_repeat_", length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))
293
				// The JZ above (nextEmit == 0) jumps here:
295				Label("repeat_as_copy_" + name)
296				emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))
297
298				Label("repeat_end_emit_" + name)
				// Store the updated dst pointer and nextEmit.
				MOVQ(dst, dstBase)
				MOVL(s, nextEmitL)
301			}
302			// if s >= sLimit
303			// can be omitted.
304			if true {
305				tmp := GP64()
306				MOVL(sLimitL, tmp.As32())
307				CMPL(s, tmp.As32())
308				JGT(LabelRef("emit_remainder_" + name))
309			}
310			JMP(LabelRef("search_loop_" + name))
311		}
312		Label("no_repeat_found_" + name)
313		{
314			// Can be moved up if registers are available.
315			hash2 := GP64()
316			{
317				// hash2 := hash6(cv>>16, tableBits)
318				hasher = hash6(tableBits)
319				MOVQ(cv, hash2)
320				SHRQ(U8(16), hash2)
321				hasher.hash(hash2)
322			}
323
324			CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
325			// cv >>= 8
326			SHRQ(U8(8), cv)
327			JEQ(LabelRef("candidate_match_" + name))
328
329			// candidate = int(table[hash2])
330			MOVL(table.Idx(hash2, 1), candidate)
331
332			// if uint32(cv>>8) == load32(src, candidate2)
333			CMPL(Mem{Base: src, Index: candidate2, Scale: 1}, cv.As32())
334			JEQ(LabelRef("candidate2_match_" + name))
335
336			// table[hash2] = uint32(s + 2)
337			tmp := GP64()
338			LEAQ(Mem{Base: s, Disp: 2}, tmp)
339			MOVL(tmp.As32(), table.Idx(hash2, 1))
340
341			// if uint32(cv>>16) == load32(src, candidate)
342			SHRQ(U8(8), cv)
343			CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
344			JEQ(LabelRef("candidate3_match_" + name))
345			// s = nextS
346			MOVL(nextSTempL, s)
347			JMP(LabelRef("search_loop_" + name))
348
349			// Matches candidate3
350			Label("candidate3_match_" + name)
351			ADDL(U8(2), s)
352			JMP(LabelRef("candidate_match_" + name))
353
354			Label("candidate2_match_" + name)
355			// table[hash2] = uint32(s + 2)
356			tmp = GP64()
			LEAQ(Mem{Base: s, Disp: 2}, tmp)
358			MOVL(tmp.As32(), table.Idx(hash2, 1))
359			// s++
360			INCL(s)
361			MOVL(candidate2, candidate)
362		}
363	}
364
365	Label("candidate_match_" + name)
366	// We have a match at 's' with src offset in "candidate" that matches at least 4 bytes.
367	// Extend backwards
368	{
369		ne := GP64()
370		MOVL(nextEmitL, ne.As32())
371		TESTL(candidate, candidate)
372		JZ(LabelRef("match_extend_back_end_" + name))
373
374		// candidate is tested when decremented, so we loop back here.
375		Label("match_extend_back_loop_" + name)
376		CMPL(s, ne.As32())
377		JG(LabelRef("match_extend_back_end_" + name))
378		// if src[candidate-1] == src[s-1]
379		tmp, tmp2 := GP64(), GP64()
380		MOVB(Mem{Base: src, Index: candidate, Scale: 1, Disp: -1}, tmp.As8())
381		MOVB(Mem{Base: src, Index: s, Scale: 1, Disp: -1}, tmp2.As8())
382		CMPB(tmp.As8(), tmp2.As8())
383		JNE(LabelRef("match_extend_back_end_" + name))
384		LEAL(Mem{Base: s, Disp: -1}, s)
385		DECL(candidate)
386		JZ(LabelRef("match_extend_back_end_" + name))
387		JMP(LabelRef("match_extend_back_loop_" + name))
388	}
389	Label("match_extend_back_end_" + name)
390
391	// Bail if we exceed the maximum size.
392	if true {
		// tmp = s - nextEmitL (pending literal length)
		tmp := GP64()
		MOVL(s, tmp.As32())
		SUBL(nextEmitL, tmp.As32())
		// Compare the projected output position (current dst + pending literals)
		// against the destination limit.
		dstProj := GP64()
		MOVQ(dstBase, dstProj)
		LEAQ(Mem{Base: dstProj, Index: tmp, Scale: 1}, tmp)
		CMPQ(tmp, dstLimitPtrQ)
		JL(LabelRef("match_dst_size_check_" + name))
400		ri, err := ReturnIndex(0).Resolve()
401		if err != nil {
402			panic(err)
403		}
404		MOVQ(U32(0), ri.Addr)
405		RET()
406	}
407	Label("match_dst_size_check_" + name)
408	{
409		base := GP64()
410		MOVL(candidate, base.As32())
411		emitLiterals(nextEmitL, base, src, dstBase, "match_emit_"+name, avx)
412		NOP()
413	}
414
415	Label("match_nolit_loop_" + name)
416	{
417		base := GP64().As32()
418		MOVL(s, base)
419		// Update repeat
420		{
421			// repeat = base - candidate
422			repeatVal := GP64().As32()
423			MOVL(s, repeatVal)
424			SUBL(candidate, repeatVal)
425			MOVL(repeatVal, repeatL)
426		}
427		// s+=4, candidate+=4
428		ADDL(U8(4), s)
429		ADDL(U8(4), candidate)
430		// Extend the 4-byte match as long as possible and emit copy.
431		{
432			// srcLeft = sLimitL - s
433			srcLeft := GP64()
434			MOVL(sLimitL, srcLeft.As32())
435			SUBL(s, srcLeft.As32())
436			length := matchLen("match_nolit_"+name,
437				Mem{Base: src, Index: s, Scale: 1},
438				Mem{Base: src, Index: candidate, Scale: 1},
439				srcLeft,
440				LabelRef("match_nolit_end_"+name),
441			)
442			Label("match_nolit_end_" + name)
443			offset := GP64()
444			MOVL(repeatL, offset.As32())
445			ADDQ(U8(4), length)
446			dst := GP64()
447			MOVQ(dstBase, dst)
			// s += length (length is destroyed, use it now)
449			ADDL(length.As32(), s)
450			emitCopy("match_nolit_"+name, length, offset, nil, dst, LabelRef("match_nolit_emitcopy_end_"+name))
451			Label("match_nolit_emitcopy_end_" + name)
452			MOVQ(dst, dstBase)
453			MOVL(s, nextEmitL)
454			CMPL(s, sLimitL)
455			JGE(LabelRef("emit_remainder_" + name))
456
457			// Bail if we exceed the maximum size.
458			{
459				CMPQ(dst, dstLimitPtrQ)
460				JL(LabelRef("match_nolit_dst_ok_" + name))
461				ri, err := ReturnIndex(0).Resolve()
462				if err != nil {
463					panic(err)
464				}
465				MOVQ(U32(0), ri.Addr)
466				RET()
467				Label("match_nolit_dst_ok_" + name)
468			}
469		}
470		{
471			// Check for an immediate match, otherwise start search at s+1
472			x := GP64()
473			// Index s-2
474			MOVQ(Mem{Base: src, Index: s, Scale: 1, Disp: -2}, x)
475			hasher := hash6(tableBits)
476			hash0, hash1 := GP64(), GP64()
477			MOVQ(x, hash0) // s-2
478			SHRQ(U8(16), x)
479			MOVQ(x, hash1) // s
480			hasher.hash(hash0)
481			hasher.hash(hash1)
482			c0, c1 := GP64(), GP64()
483			MOVL(table.Idx(hash0, 1), c0.As32())
484			MOVL(table.Idx(hash1, 1), c1.As32())
485			sm2 := GP64()
486			LEAQ(Mem{Base: s, Disp: -2}, sm2)
487			MOVL(sm2.As32(), table.Idx(hash0, 1))
488			MOVL(s, table.Idx(hash1, 1))
			// If load32(src, c1) == uint32(x) we have an immediate match at s;
			// use c1 as the candidate and continue the no-literal loop.
			MOVL(c1.As32(), candidate)
			CMPL(Mem{Base: src, Index: c1, Scale: 1}, x.As32())
			JEQ(LabelRef("match_nolit_loop_" + name))
491			INCL(s)
492		}
493		JMP(LabelRef("search_loop_" + name))
494	}
495
496	Label("emit_remainder_" + name)
497	// Bail if we exceed the maximum size.
	// if d + (len(src)-nextEmitL) > dstLimitPtrQ { return 0 }
499	{
500		// remain = lenSrc - nextEmitL
501		remain := GP64()
502		MOVQ(lenSrcQ, remain)
503		SUBL(nextEmitL, remain.As32())
504		dst := GP64()
505		MOVQ(dstBase, dst)
506		// dst := dst + (len(src)-nextEmitL)
507		LEAQ(Mem{Base: dst, Index: remain, Scale: 1}, dst)
508		CMPQ(dst, dstLimitPtrQ)
509		JL(LabelRef("emit_remainder_ok_" + name))
510		ri, err := ReturnIndex(0).Resolve()
511		if err != nil {
512			panic(err)
513		}
514		MOVQ(U32(0), ri.Addr)
515		RET()
516		Label("emit_remainder_ok_" + name)
517	}
518	// emitLiteral(dst[d:], src[nextEmitL:])
519	emitEnd := GP64()
520	MOVQ(lenSrcQ, emitEnd)
521
522	// Emit final literals.
523	emitLiterals(nextEmitL, emitEnd, src, dstBase, "emit_remainder_"+name, avx)
524
	// length := current dst - dst start (ptr arithmetic) = number of bytes written.
	length := GP64()
	MOVQ(dstBase, length)
	SUBQ(dstStartPtrQ, length)
529
530	Store(length, ReturnIndex(0))
531	RET()
532}
533
// emitLiterals emits literals from nextEmit to base and updates nextEmit and dstBase.
// The emit is skipped entirely if base == nextEmit.
// src and base are left untouched.
537func emitLiterals(nextEmitL Mem, base reg.GPVirtual, src reg.GPVirtual, dstBase Mem, name string, avx bool) {
538	nextEmit, litLen, dstBaseTmp, litBase := GP64().As32(), GP64(), GP64(), GP64()
539	MOVL(nextEmitL, nextEmit)
540	CMPL(nextEmit, base.As32())
541	JEQ(LabelRef("emit_literal_skip_" + name))
542	MOVL(base.As32(), litLen.As32())
543
544	// Base is now next emit.
545	MOVL(base.As32(), nextEmitL)
546
547	// litBase = src[nextEmitL:]
548	LEAQ(Mem{Base: src, Index: nextEmit, Scale: 1}, litBase)
549	SUBL(nextEmit, litLen.As32()) // litlen = base - nextEmit
550
551	// Load (and store when we return)
552	MOVQ(dstBase, dstBaseTmp)
553	emitLiteral(name, litLen, nil, dstBaseTmp, litBase, LabelRef("emit_literal_done_"+name), avx, true)
554	Label("emit_literal_done_" + name)
555	// Store updated dstBase
556	MOVQ(dstBaseTmp, dstBase)
557	Label("emit_literal_skip_" + name)
558}
559
560type hashGen struct {
561	bytes     int
562	tablebits int
563	mulreg    reg.GPVirtual
564}
565
// hash6 returns a hashGen that hashes the lowest 6 bytes of a value into
// 'tablebits' bits using a multiplicative hash, loading the multiplier into a register.
567func hash6(tablebits int) hashGen {
568	h := hashGen{
569		bytes:     6,
570		tablebits: tablebits,
571		mulreg:    GP64(),
572	}
573	MOVQ(Imm(227718039650203), h.mulreg)
574	return h
575}
576
577// hash uses multiply to get hash of the value.
578func (h hashGen) hash(val reg.GPVirtual) {
579	// Move value to top of register.
580	SHLQ(U8(64-8*h.bytes), val)
581	IMULQ(h.mulreg, val)
582	// Move value to bottom
583	SHRQ(U8(64-h.tablebits), val)
584}
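// For reference, the sequence emitted by hash6/hash computes the equivalent of the
// following Go function (a sketch; assumes 0 < tableBits < 64):
//
//	func hash6(u uint64, tableBits uint8) uint32 {
//		const mul = 227718039650203 // the multiplier loaded by hash6 above
//		return uint32(((u << 16) * mul) >> (64 - tableBits))
//	}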
585
586func genEmitLiteral() {
587	TEXT("emitLiteral", NOSPLIT, "func(dst, lit []byte) int")
588	Doc("emitLiteral writes a literal chunk and returns the number of bytes written.", "",
589		"It assumes that:",
590		"  dst is long enough to hold the encoded bytes",
591		"  0 <= len(lit) && len(lit) <= math.MaxUint32", "")
592	Pragma("noescape")
593
594	dstBase, litBase, litLen, retval := GP64(), GP64(), GP64(), GP64()
595	Load(Param("dst").Base(), dstBase)
596	Load(Param("lit").Base(), litBase)
597	Load(Param("lit").Len(), litLen)
598	emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_standalone", false, false)
599	Label("emit_literal_end_standalone")
600	Store(retval, ReturnIndex(0))
601	RET()
602
603	TEXT("emitLiteralAvx", NOSPLIT, "func(dst, lit []byte) int")
604	Doc("emitLiteralAvx writes a literal chunk and returns the number of bytes written.", "",
605		"It assumes that:",
606		"  dst is long enough to hold the encoded bytes",
607		"  0 <= len(lit) && len(lit) <= math.MaxUint32", "")
608	Pragma("noescape")
609
610	dstBase, litBase, litLen, retval = GP64(), GP64(), GP64(), GP64()
611	Load(Param("dst").Base(), dstBase)
612	Load(Param("lit").Base(), litBase)
613	Load(Param("lit").Len(), litLen)
614	emitLiteral("standalone", litLen, retval, dstBase, litBase, "emit_literal_end_avx_standalone", true, false)
615	Label("emit_literal_end_avx_standalone")
616	Store(retval, ReturnIndex(0))
617	RET()
618}
619
620// emitLiteral can be used for inlining an emitLiteral call.
621// stack must have at least 32 bytes.
// retval will contain the number of bytes written; it can be nil if the value is not needed.
623// dstBase and litBase are updated.
624// Uses 2 GP registers. With AVX 4 registers.
625// If updateDst is true dstBase will have the updated end pointer and an additional register will be used.
626func emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, avx, updateDst bool) {
627	n := GP64()
628	n16 := GP64()
629
630	// We always add litLen bytes
631	if retval != nil {
632		MOVQ(litLen, retval)
633	}
634	MOVQ(litLen, n)
635
636	SUBL(U8(1), n.As32())
	// Return if litLen was 0 (the SUBL above borrowed and set the carry flag).
638	JC(end)
639
640	// Find number of bytes to emit for tag.
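	// Literal chunks use tagLiteral (00) in the two low bits. For lengths up to 60 the
	// value length-1 goes directly in the upper six bits of the tag byte; tag values
	// 60..63 mean length-1 is stored in the following 1..4 bytes (little endian),
	// which is what the branches below emit.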
641	CMPL(n.As32(), U8(60))
642	JLT(LabelRef("one_byte_" + name))
643	CMPL(n.As32(), U32(1<<8))
644	JLT(LabelRef("two_bytes_" + name))
645	CMPL(n.As32(), U32(1<<16))
646	JLT(LabelRef("three_bytes_" + name))
647	CMPL(n.As32(), U32(1<<24))
648	JLT(LabelRef("four_bytes_" + name))
649
650	Label("five_bytes_" + name)
651	MOVB(U8(252), Mem{Base: dstBase})
652	MOVL(n.As32(), Mem{Base: dstBase, Disp: 1})
653	if retval != nil {
654		ADDQ(U8(5), retval)
655	}
656	ADDQ(U8(5), dstBase)
657	JMP(LabelRef("memmove_" + name))
658
659	Label("four_bytes_" + name)
660	MOVQ(n, n16)
661	SHRL(U8(16), n16.As32())
662	MOVB(U8(248), Mem{Base: dstBase})
663	MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
664	MOVB(n16.As8(), Mem{Base: dstBase, Disp: 3})
665	if retval != nil {
666		ADDQ(U8(4), retval)
667	}
668	ADDQ(U8(4), dstBase)
669	JMP(LabelRef("memmove_" + name))
670
671	Label("three_bytes_" + name)
672	MOVB(U8(0xf4), Mem{Base: dstBase})
673	MOVW(n.As16(), Mem{Base: dstBase, Disp: 1})
674	if retval != nil {
675		ADDQ(U8(3), retval)
676	}
677	ADDQ(U8(3), dstBase)
678	JMP(LabelRef("memmove_" + name))
679
680	Label("two_bytes_" + name)
681	MOVB(U8(0xf0), Mem{Base: dstBase})
682	MOVB(n.As8(), Mem{Base: dstBase, Disp: 1})
683	if retval != nil {
684		ADDQ(U8(2), retval)
685	}
686	ADDQ(U8(2), dstBase)
687	JMP(LabelRef("memmove_" + name))
688
689	Label("one_byte_" + name)
690	SHLB(U8(2), n.As8())
691	MOVB(n.As8(), Mem{Base: dstBase})
692	if retval != nil {
693		ADDQ(U8(1), retval)
694	}
695	ADDQ(U8(1), dstBase)
696	// Fallthrough
697
698	Label("memmove_" + name)
699
700	// copy(dst[i:], lit)
701	if true {
702		dstEnd := GP64()
703		if updateDst {
704			LEAQ(Mem{Base: dstBase, Index: litLen, Scale: 1}, dstEnd)
705		}
706		genMemMove2("emit_lit_memmove_"+name, dstBase, litBase, litLen, end, avx)
707		if updateDst {
708			MOVQ(dstEnd, dstBase)
709		}
710	} else {
711		genMemMove("emit_lit_memmove_"+name, dstBase, litBase, litLen, end)
712	}
713	return
714}
715
// genEmitRepeat generates a standalone emitRepeat.
717func genEmitRepeat() {
718	TEXT("emitRepeat", NOSPLIT, "func(dst []byte, offset, length int) int")
719	Doc("emitRepeat writes a repeat chunk and returns the number of bytes written.",
720		"Length must be at least 4 and < 1<<32", "")
721	Pragma("noescape")
722
723	dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()
724
725	// retval = 0
726	XORQ(retval, retval)
727
728	Load(Param("dst").Base(), dstBase)
729	Load(Param("offset"), offset)
730	Load(Param("length"), length)
731	emitRepeat("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_repeat_end"))
732	Label("gen_emit_repeat_end")
733	Store(retval, ReturnIndex(0))
734	RET()
735}
736
737// emitRepeat can be used for inlining an emitRepeat call.
738// length >= 4 and < 1<<32
// length is modified. dstBase is updated. The number of bytes written is added to retval.
740// retval can be nil.
741// Will jump to end label when finished.
742// Uses 1 GP register.
743func emitRepeat(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) {
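	// The repeat encodings produced by the branches below are:
	//   2 bytes: uint8(length)<<2|tagCopy1 with a zero second byte, or, when
	//            offset < 2048, offset/length packed like a regular 2-byte copy.
	//   3 bytes: 5<<2|tagCopy1, a zero byte, then 1 length byte.
	//   4 bytes: 6<<2|tagCopy1, a zero byte, then 2 length bytes.
	//   5 bytes: 7<<2|tagCopy1, a zero byte, then 3 length bytes.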
744	Label("emit_repeat_again_" + name)
745	tmp := GP64()
746	MOVQ(length, tmp) // Copy length
747	// length -= 4
748	LEAQ(Mem{Base: length, Disp: -4}, length)
749
750	// if length <= 4 (use copied value)
751	CMPL(tmp.As32(), U8(8))
752	JLE(LabelRef("repeat_two_" + name))
753
754	// length < 8 && offset < 2048
755	CMPL(tmp.As32(), U8(12))
756	JGE(LabelRef("cant_repeat_two_offset_" + name))
757	CMPL(offset.As32(), U32(2048))
758	JLT(LabelRef("repeat_two_offset_" + name))
759
760	const maxRepeat = ((1 << 24) - 1) + 65536
761	Label("cant_repeat_two_offset_" + name)
762	CMPL(length.As32(), U32((1<<8)+4))
763	JLT(LabelRef("repeat_three_" + name)) // if length < (1<<8)+4
764	CMPL(length.As32(), U32((1<<16)+(1<<8)))
765	JLT(LabelRef("repeat_four_" + name)) // if length < (1 << 16) + (1 << 8)
766	CMPL(length.As32(), U32(maxRepeat))
767	JLT(LabelRef("repeat_five_" + name)) // If less than 24 bits to represent.
768
	// We have more than 24 bits
770	// Emit so we have at least 4 bytes left.
771	LEAQ(Mem{Base: length, Disp: -(maxRepeat - 4)}, length) // length -= (maxRepeat - 4)
772	MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase})            // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
773	MOVW(U16(65531), Mem{Base: dstBase, Disp: 2})           // 0xfffb
774	MOVB(U8(255), Mem{Base: dstBase, Disp: 4})
775	ADDQ(U8(5), dstBase)
776	if retval != nil {
777		ADDQ(U8(5), retval)
778	}
779	JMP(LabelRef("emit_repeat_again_" + name))
780
	// The remaining length now fits in a 5-byte repeat.
782	Label("repeat_five_" + name)
783	LEAQ(Mem{Base: length, Disp: -65536}, length) // length -= 65536
784	MOVQ(length, offset)
785	MOVW(U16(7<<2|tagCopy1), Mem{Base: dstBase})     // dst[0] = 7<<2 | tagCopy1, dst[1] = 0
786	MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
787	SARQ(U8(16), offset)                             // offset = length >> 16
788	MOVB(offset.As8(), Mem{Base: dstBase, Disp: 4})  // dst[4] = length >> 16
789	if retval != nil {
790		ADDQ(U8(5), retval) // i += 5
791	}
792	ADDQ(U8(5), dstBase) // dst += 5
793	JMP(end)
794
795	Label("repeat_four_" + name)
796	LEAQ(Mem{Base: length, Disp: -256}, length)      // length -= 256
797	MOVW(U16(6<<2|tagCopy1), Mem{Base: dstBase})     // dst[0] = 6<<2 | tagCopy1, dst[1] = 0
798	MOVW(length.As16(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length), dst[3] = uint8(length >> 8)
799	if retval != nil {
800		ADDQ(U8(4), retval) // i += 4
801	}
802	ADDQ(U8(4), dstBase) // dst += 4
803	JMP(end)
804
805	Label("repeat_three_" + name)
806	LEAQ(Mem{Base: length, Disp: -4}, length)       // length -= 4
807	MOVW(U16(5<<2|tagCopy1), Mem{Base: dstBase})    // dst[0] = 5<<2 | tagCopy1, dst[1] = 0
808	MOVB(length.As8(), Mem{Base: dstBase, Disp: 2}) // dst[2] = uint8(length)
809	if retval != nil {
810		ADDQ(U8(3), retval) // i += 3
811	}
812	ADDQ(U8(3), dstBase) // dst += 3
813	JMP(end)
814
815	Label("repeat_two_" + name)
816	// dst[0] = uint8(length)<<2 | tagCopy1, dst[1] = 0
817	SHLL(U8(2), length.As32())
818	ORL(U8(tagCopy1), length.As32())
	MOVW(length.As16(), Mem{Base: dstBase}) // dst[0] = uint8(length)<<2 | tagCopy1, dst[1] = 0
820	if retval != nil {
821		ADDQ(U8(2), retval) // i += 2
822	}
823	ADDQ(U8(2), dstBase) // dst += 2
824	JMP(end)
825
826	Label("repeat_two_offset_" + name)
827	// Emit the remaining copy, encoded as 2 bytes.
828	// dst[1] = uint8(offset)
829	// dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
830	tmp = GP64()
831	XORQ(tmp, tmp)
832	// Use scale and displacement to shift and subtract values from length.
833	LEAQ(Mem{Base: tmp, Index: length, Scale: 4, Disp: tagCopy1}, length)
834	MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
835	SARL(U8(8), offset.As32())                      // Remove lower
836	SHLL(U8(5), offset.As32())                      // Shift back up
837	ORL(offset.As32(), length.As32())               // OR result
838	MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
839	if retval != nil {
840		ADDQ(U8(2), retval) // i += 2
841	}
842	ADDQ(U8(2), dstBase) // dst += 2
843
844	JMP(end)
845}
846
// genEmitCopy generates a standalone emitCopy.
855func genEmitCopy() {
856	TEXT("emitCopy", NOSPLIT, "func(dst []byte, offset, length int) int")
857	Doc("emitCopy writes a copy chunk and returns the number of bytes written.", "",
858		"It assumes that:",
859		"  dst is long enough to hold the encoded bytes",
860		"  1 <= offset && offset <= math.MaxUint32",
861		"  4 <= length && length <= 1 << 24", "")
862	Pragma("noescape")
863
864	dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64()
865
866	//	i := 0
867	XORQ(retval, retval)
868
869	Load(Param("dst").Base(), dstBase)
870	Load(Param("offset"), offset)
871	Load(Param("length"), length)
872	emitCopy("standalone", length, offset, retval, dstBase, LabelRef("gen_emit_copy_end"))
873	Label("gen_emit_copy_end")
874	Store(retval, ReturnIndex(0))
875	RET()
876}
877
878const (
879	tagLiteral = 0x00
880	tagCopy1   = 0x01
881	tagCopy2   = 0x02
882	tagCopy4   = 0x03
883)
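// The two low bits of a chunk's first byte select the chunk type: 00 is a literal,
// 01 a copy with a 1-byte offset, 10 a copy with a 2-byte offset and 11 a copy
// with a 4-byte offset.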
884
885// emitCopy can be used for inlining an emitCopy call.
// length is modified (and left as junk). dstBase is updated. The number of bytes written is added to retval.
887// retval can be nil.
888// Will jump to end label when finished.
889// Uses 2 GP registers.
890func emitCopy(name string, length, offset, retval, dstBase reg.GPVirtual, end LabelRef) {
891	// if offset >= 65536 {
892	CMPL(offset.As32(), U32(65536))
893	JL(LabelRef("two_byte_offset_" + name))
894
895	// offset is >= 65536
896	//	if length <= 64 goto four_bytes_remain_
897	CMPL(length.As32(), U8(64))
898	JLE(LabelRef("four_bytes_remain_" + name))
899
900	// Emit a length 64 copy, encoded as 5 bytes.
901	//		dst[0] = 63<<2 | tagCopy4
902	MOVB(U8(63<<2|tagCopy4), Mem{Base: dstBase})
903	//		dst[4] = uint8(offset >> 24)
904	//		dst[3] = uint8(offset >> 16)
905	//		dst[2] = uint8(offset >> 8)
906	//		dst[1] = uint8(offset)
	MOVL(offset.As32(), Mem{Base: dstBase, Disp: 1})
908	//		length -= 64
909	LEAQ(Mem{Base: length, Disp: -64}, length)
910	if retval != nil {
911		ADDQ(U8(5), retval) // i+=5
912	}
913	ADDQ(U8(5), dstBase) // dst+=5
914
915	//	if length >= 4 {
916	CMPL(length.As32(), U8(4))
917	JL(LabelRef("four_bytes_remain_" + name))
918
919	// Emit remaining as repeats
920	//	return 5 + emitRepeat(dst[5:], offset, length)
921	// Inline call to emitRepeat. Will jump to end
922	emitRepeat(name+"_emit_copy", length, offset, retval, dstBase, end)
923
924	Label("four_bytes_remain_" + name)
925	//	if length == 0 {
926	//		return i
927	//	}
928	TESTL(length.As32(), length.As32())
929	JZ(end)
930
931	// Emit a copy, offset encoded as 4 bytes.
932	//	dst[i+0] = uint8(length-1)<<2 | tagCopy4
933	//	dst[i+1] = uint8(offset)
934	//	dst[i+2] = uint8(offset >> 8)
935	//	dst[i+3] = uint8(offset >> 16)
936	//	dst[i+4] = uint8(offset >> 24)
937	tmp := GP64()
938	MOVB(U8(tagCopy4), tmp.As8())
939	// Use displacement to subtract 1 from upshifted length.
940	LEAQ(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length)
941	MOVB(length.As8(), Mem{Base: dstBase})
	MOVL(offset.As32(), Mem{Base: dstBase, Disp: 1})
943	//	return i + 5
944	if retval != nil {
945		ADDQ(U8(5), retval)
946	}
947	ADDQ(U8(5), dstBase)
948	JMP(end)
949
950	Label("two_byte_offset_" + name)
951	// Offset no more than 2 bytes.
952
953	// if length > 64 {
954	CMPL(length.As32(), U8(64))
955	JLE(LabelRef("two_byte_offset_short_" + name))
956	// Emit a length 60 copy, encoded as 3 bytes.
957	// Emit remaining as repeat value (minimum 4 bytes).
958	//	dst[2] = uint8(offset >> 8)
959	//	dst[1] = uint8(offset)
960	//	dst[0] = 59<<2 | tagCopy2
961	MOVB(U8(59<<2|tagCopy2), Mem{Base: dstBase})
962	MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
963	//	length -= 60
964	LEAQ(Mem{Base: length, Disp: -60}, length)
965
966	// Emit remaining as repeats, at least 4 bytes remain.
967	//	return 3 + emitRepeat(dst[3:], offset, length)
968	//}
969	ADDQ(U8(3), dstBase)
970	if retval != nil {
971		ADDQ(U8(3), retval)
972	}
973	// Inline call to emitRepeat. Will jump to end
974	emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end)
975
976	Label("two_byte_offset_short_" + name)
977	// if length >= 12 || offset >= 2048 {
978	CMPL(length.As32(), U8(12))
979	JGE(LabelRef("emit_copy_three_" + name))
980	CMPL(offset.As32(), U32(2048))
981	JGE(LabelRef("emit_copy_three_" + name))
982
983	// Emit the remaining copy, encoded as 2 bytes.
984	// dst[1] = uint8(offset)
985	// dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
986	tmp = GP64()
987	MOVB(U8(tagCopy1), tmp.As8())
988	// Use scale and displacement to shift and subtract values from length.
989	LEAQ(Mem{Base: tmp, Index: length, Scale: 4, Disp: -(4 << 2)}, length)
990	MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
991	SHRL(U8(8), offset.As32())                      // Remove lower
992	SHLL(U8(5), offset.As32())                      // Shift back up
993	ORL(offset.As32(), length.As32())               // OR result
994	MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
995	if retval != nil {
996		ADDQ(U8(2), retval) // i += 2
997	}
998	ADDQ(U8(2), dstBase) // dst += 2
999	// return 2
1000	JMP(end)
1001
1002	Label("emit_copy_three_" + name)
1003	//	// Emit the remaining copy, encoded as 3 bytes.
1004	//	dst[2] = uint8(offset >> 8)
1005	//	dst[1] = uint8(offset)
1006	//	dst[0] = uint8(length-1)<<2 | tagCopy2
1007	tmp = GP64()
1008	MOVB(U8(tagCopy2), tmp.As8())
1009	LEAQ(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length)
1010	MOVB(length.As8(), Mem{Base: dstBase})
1011	MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
1012	//	return 3
1013	if retval != nil {
1014		ADDQ(U8(3), retval) // i += 3
1015	}
1016	ADDQ(U8(3), dstBase) // dst += 3
1017	JMP(end)
1018}
1019
1020// func memmove(to, from unsafe.Pointer, n uintptr)
1021// to and from will be at the end, n will be 0.
1022// to and from may not overlap.
// Fairly simplistic for now; it can of course be extended.
1024// Uses one GP register and 8 SSE registers.
1025func genMemMove(name string, to, from, n reg.GPVirtual, end LabelRef) {
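	// The copy runs in three passes: 128 bytes per iteration using 8 XMM registers,
	// then 16 bytes per iteration, then a byte-at-a-time loop for the remainder.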
1026	tmp := GP64()
1027	MOVQ(n, tmp)
1028	// tmp = n/128
1029	SHRQ(U8(7), tmp)
1030
1031	TESTQ(tmp, tmp)
1032	JZ(LabelRef("done_128_" + name))
1033	Label("loop_128_" + name)
1034	var xmmregs [8]reg.VecVirtual
1035
1036	// Prefetch destination for next loop.
1037	// Prefetching source doesn't provide speedup.
1038	// This seems to give a small boost.
1039	const preOff = 128
1040	PREFETCHT0(Mem{Base: to, Disp: preOff})
1041	PREFETCHT0(Mem{Base: to, Disp: preOff + 64})
1042
1043	for i := 0; i < 8; i++ {
1044		xmmregs[i] = XMM()
1045		MOVOU(Mem{Base: from}.Offset(i*16), xmmregs[i])
1046	}
1047	for i := 0; i < 8; i++ {
1048		MOVOU(xmmregs[i], Mem{Base: to}.Offset(i*16))
1049	}
1050	LEAQ(Mem{Base: n, Disp: -128}, n)
1051	ADDQ(U8(8*16), from)
1052	ADDQ(U8(8*16), to)
1053	DECQ(tmp)
1054	JNZ(LabelRef("loop_128_" + name))
1055
1056	Label("done_128_" + name)
1057	MOVQ(n, tmp)
1058	// tmp = n/16
1059	SHRQ(U8(4), tmp)
1060	TESTQ(tmp, tmp)
1061	JZ(LabelRef("done_16_" + name))
1062
1063	Label("loop_16_" + name)
1064	xmm := XMM()
1065	MOVOU(Mem{Base: from}, xmm)
1066	MOVOU(xmm, Mem{Base: to})
1067	LEAQ(Mem{Base: n, Disp: -16}, n)
1068	ADDQ(U8(16), from)
1069	ADDQ(U8(16), to)
1070	DECQ(tmp)
1071	JNZ(LabelRef("loop_16_" + name))
1072	Label("done_16_" + name)
1073
1074	// TODO: Use REP; MOVSB somehow.
1075	TESTQ(n, n)
1076	JZ(end)
1077	Label("loop_1_" + name)
1078	MOVB(Mem{Base: from}, tmp.As8())
1079	MOVB(tmp.As8(), Mem{Base: to})
1080	INCQ(from)
1081	INCQ(to)
1082	DECQ(n)
1083	JNZ(LabelRef("loop_1_" + name))
1084}
1085
1086// func memmove(to, from unsafe.Pointer, n uintptr)
1087// src and dst may not overlap.
// Non-AVX uses 2 GP registers and 16 SSE2 registers.
// AVX uses 4 GP registers and 16 AVX/SSE registers.
1090// All passed registers may be updated.
1091func genMemMove2(name string, dst, src, length reg.GPVirtual, end LabelRef, avx bool) {
1092	AX, CX := GP64(), GP64()
1093	NOP()
1094	name += "_memmove_"
1095	Label(name + "tail")
1096	// move_129through256 or smaller work whether or not the source and the
1097	// destination memory regions overlap because they load all data into
1098	// registers before writing it back.  move_256through2048 on the other
1099	// hand can be used only when the memory regions don't overlap or the copy
1100	// direction is forward.
1101	//
1102	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
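	// The size-class dispatch below mirrors the one used by the Go runtime's amd64
	// memmove; each move_* label handles one size range.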
1103	TESTQ(length, length)
1104	JEQ(end)
1105	CMPQ(length, U8(2))
1106	JBE(LabelRef(name + "move_1or2"))
1107	CMPQ(length, U8(4))
1108	JB(LabelRef(name + "move_3"))
1109	JBE(LabelRef(name + "move_4"))
1110	CMPQ(length, U8(8))
1111	JB(LabelRef(name + "move_5through7"))
1112	JE(LabelRef(name + "move_8"))
1113	CMPQ(length, U8(16))
1114	JBE(LabelRef(name + "move_9through16"))
1115	CMPQ(length, U8(32))
1116	JBE(LabelRef(name + "move_17through32"))
1117	CMPQ(length, U8(64))
1118	JBE(LabelRef(name + "move_33through64"))
1119	CMPQ(length, U8(128))
1120	JBE(LabelRef(name + "move_65through128"))
1121	CMPQ(length, U32(256))
1122	JBE(LabelRef(name + "move_129through256"))
1123
1124	if avx {
1125		JMP(LabelRef(name + "avxUnaligned"))
1126	} else {
1127		if false {
1128			// Don't check length for now.
1129			Label(name + "forward")
1130			CMPQ(length, U32(2048))
1131			JLS(LabelRef(name + "move_256through2048"))
1132
1133			genMemMove(name+"fallback", dst, src, length, end)
1134		} else {
1135			JMP(LabelRef(name + "move_256through2048"))
1136		}
1137	}
1138	/*
1139			// If REP MOVSB isn't fast, don't use it
1140			// FIXME: internal∕cpu·X86+const_offsetX86HasERMS(SB)
1141			// CMPB(U8(1), U8(1)) // enhanced REP MOVSB/STOSB
1142			JMP(LabelRef(name + "fwdBy8"))
1143
1144			// Check alignment
1145			MOVL(src.As32(), AX.As32())
1146			ORL(dst.As32(), AX.As32())
1147			TESTL(U32(7), AX.As32())
1148			JEQ(LabelRef(name + "fwdBy8"))
1149
1150			// Do 1 byte at a time
1151			// MOVQ(length, CX)
1152			// FIXME:
1153			// REP;	MOVSB
1154			JMP(end)
1155
1156		Label(name + "fwdBy8")
1157		// Do 8 bytes at a time
1158		MOVQ(length, CX)
1159		SHRQ(U8(3), CX)
1160		ANDQ(U8(7), length)
1161		// FIXME:
1162		//REP;	MOVSQ
1163		JMP(LabelRef(name + "tail"))
1164
1165		Label(name + "back")
1166
1167		//check overlap
1168		MOVQ(src, CX)
1169		ADDQ(length, CX)
1170		CMPQ(CX, dst)
1171		JLS(LabelRef(name + "forward"))
1172
1173		//whole thing backwards has
1174		//adjusted addresses
1175
1176		ADDQ(length, dst)
1177		ADDQ(length, src)
1178		STD()
1179
1180		//
1181		//  copy
1182		 //
1183		MOVQ(length, CX)
1184		SHRQ(U8(3), CX)
1185		ANDQ(U8(7), length)
1186
1187		SUBQ(U8(8), dst)
1188		SUBQ(U8(8), src)
1189		// FIXME:
1190		//REP;	MOVSQ
1191
1192		// FIXME:
1193		//CLD()
1194
1195		ADDQ(U8(8), dst)
1196		ADDQ(U8(8), src)
1197		SUBQ(length, dst)
1198		SUBQ(length, src)
1199		JMP(LabelRef(name + "tail"))
1200	*/
1201
1202	Label(name + "move_1or2")
1203	MOVB(Mem{Base: src}, AX.As8())
1204	MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8())
1205	MOVB(AX.As8(), Mem{Base: dst})
1206	MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1})
1207	JMP(end)
1208
1209	Label(name + "move_4")
1210	MOVL(Mem{Base: src}, AX.As32())
1211	MOVL(AX.As32(), Mem{Base: dst})
1212	JMP(end)
1213
1214	Label(name + "move_3")
1215	MOVW(Mem{Base: src}, AX.As16())
1216	MOVB(Mem{Base: src, Disp: 2}, CX.As8())
1217	MOVW(AX.As16(), Mem{Base: dst})
1218	MOVB(CX.As8(), Mem{Base: dst, Disp: 2})
1219	JMP(end)
1220
1221	Label(name + "move_5through7")
1222	MOVL(Mem{Base: src}, AX.As32())
1223	MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32())
1224	MOVL(AX.As32(), Mem{Base: dst})
1225	MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1})
1226	JMP(end)
1227
1228	Label(name + "move_8")
1229	// We need a separate case for 8 to make sure we write pointers atomically.
1230	MOVQ(Mem{Base: src}, AX)
1231	MOVQ(AX, Mem{Base: dst})
1232	JMP(end)
1233
1234	Label(name + "move_9through16")
1235	MOVQ(Mem{Base: src}, AX)
1236	MOVQ(Mem{Base: src, Disp: -8, Index: length, Scale: 1}, CX)
1237	MOVQ(AX, Mem{Base: dst})
1238	MOVQ(CX, Mem{Base: dst, Disp: -8, Index: length, Scale: 1})
1239	JMP(end)
1240
1241	Label(name + "move_17through32")
1242	X0, X1, X2, X3, X4, X5, X6, X7 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
1243	X8, X9, X10, X11, X12, X13, X14, X15 := XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM(), XMM()
1244
1245	MOVOU(Mem{Base: src}, X0)
1246	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X1)
1247	MOVOU(X0, Mem{Base: dst})
1248	MOVOU(X1, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
1249	JMP(end)
1250
1251	Label(name + "move_33through64")
1252	MOVOU(Mem{Base: src}, X0)
1253	MOVOU(Mem{Base: src, Disp: 16}, X1)
1254	MOVOU(Mem{Base: src, Disp: -32, Index: length, Scale: 1}, X2)
1255	MOVOU(Mem{Base: src, Disp: -16, Index: length, Scale: 1}, X3)
1256	MOVOU(X0, Mem{Base: dst})
1257	MOVOU(X1, Mem{Base: dst, Disp: 16})
1258	MOVOU(X2, Mem{Base: dst, Disp: -32, Index: length, Scale: 1})
1259	MOVOU(X3, Mem{Base: dst, Disp: -16, Index: length, Scale: 1})
1260	JMP(end)
1261
1262	Label(name + "move_65through128")
1263	MOVOU(Mem{Base: src}, X0)
1264	MOVOU(Mem{Base: src, Disp: 16}, X1)
1265	MOVOU(Mem{Base: src, Disp: 32}, X2)
1266	MOVOU(Mem{Base: src, Disp: 48}, X3)
1267	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
1268	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
1269	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
1270	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
1271	MOVOU(X0, Mem{Base: dst})
1272	MOVOU(X1, Mem{Base: dst, Disp: 16})
1273	MOVOU(X2, Mem{Base: dst, Disp: 32})
1274	MOVOU(X3, Mem{Base: dst, Disp: 48})
1275	MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
1276	MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
1277	MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
1278	MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
1279	JMP(end)
1280
1281	Label(name + "move_129through256")
1282	MOVOU(Mem{Base: src}, X0)
1283	MOVOU(Mem{Base: src, Disp: 16}, X1)
1284	MOVOU(Mem{Base: src, Disp: 32}, X2)
1285	MOVOU(Mem{Base: src, Disp: 48}, X3)
1286	MOVOU(Mem{Base: src, Disp: 64}, X4)
1287	MOVOU(Mem{Base: src, Disp: 80}, X5)
1288	MOVOU(Mem{Base: src, Disp: 96}, X6)
1289	MOVOU(Mem{Base: src, Disp: 112}, X7)
1290	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -128}, X8)
1291	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -112}, X9)
1292	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -96}, X10)
1293	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -80}, X11)
1294	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -64}, X12)
1295	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -48}, X13)
1296	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -32}, X14)
1297	MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -16}, X15)
1298	MOVOU(X0, Mem{Base: dst})
1299	MOVOU(X1, Mem{Base: dst, Disp: 16})
1300	MOVOU(X2, Mem{Base: dst, Disp: 32})
1301	MOVOU(X3, Mem{Base: dst, Disp: 48})
1302	MOVOU(X4, Mem{Base: dst, Disp: 64})
1303	MOVOU(X5, Mem{Base: dst, Disp: 80})
1304	MOVOU(X6, Mem{Base: dst, Disp: 96})
1305	MOVOU(X7, Mem{Base: dst, Disp: 112})
1306	MOVOU(X8, Mem{Base: dst, Index: length, Scale: 1, Disp: -128})
1307	MOVOU(X9, Mem{Base: dst, Index: length, Scale: 1, Disp: -112})
1308	MOVOU(X10, Mem{Base: dst, Index: length, Scale: 1, Disp: -96})
1309	MOVOU(X11, Mem{Base: dst, Index: length, Scale: 1, Disp: -80})
1310	MOVOU(X12, Mem{Base: dst, Index: length, Scale: 1, Disp: -64})
1311	MOVOU(X13, Mem{Base: dst, Index: length, Scale: 1, Disp: -48})
1312	MOVOU(X14, Mem{Base: dst, Index: length, Scale: 1, Disp: -32})
1313	MOVOU(X15, Mem{Base: dst, Index: length, Scale: 1, Disp: -16})
1314	JMP(end)
1315
1316	Label(name + "move_256through2048")
1317	LEAQ(Mem{Base: length, Disp: -256}, length)
1318	MOVOU(Mem{Base: src}, X0)
1319	MOVOU(Mem{Base: src, Disp: 16}, X1)
1320	MOVOU(Mem{Base: src, Disp: 32}, X2)
1321	MOVOU(Mem{Base: src, Disp: 48}, X3)
1322	MOVOU(Mem{Base: src, Disp: 64}, X4)
1323	MOVOU(Mem{Base: src, Disp: 80}, X5)
1324	MOVOU(Mem{Base: src, Disp: 96}, X6)
1325	MOVOU(Mem{Base: src, Disp: 112}, X7)
1326	MOVOU(Mem{Base: src, Disp: 128}, X8)
1327	MOVOU(Mem{Base: src, Disp: 144}, X9)
1328	MOVOU(Mem{Base: src, Disp: 160}, X10)
1329	MOVOU(Mem{Base: src, Disp: 176}, X11)
1330	MOVOU(Mem{Base: src, Disp: 192}, X12)
1331	MOVOU(Mem{Base: src, Disp: 208}, X13)
1332	MOVOU(Mem{Base: src, Disp: 224}, X14)
1333	MOVOU(Mem{Base: src, Disp: 240}, X15)
1334	MOVOU(X0, Mem{Base: dst})
1335	MOVOU(X1, Mem{Base: dst, Disp: 16})
1336	MOVOU(X2, Mem{Base: dst, Disp: 32})
1337	MOVOU(X3, Mem{Base: dst, Disp: 48})
1338	MOVOU(X4, Mem{Base: dst, Disp: 64})
1339	MOVOU(X5, Mem{Base: dst, Disp: 80})
1340	MOVOU(X6, Mem{Base: dst, Disp: 96})
1341	MOVOU(X7, Mem{Base: dst, Disp: 112})
1342	MOVOU(X8, Mem{Base: dst, Disp: 128})
1343	MOVOU(X9, Mem{Base: dst, Disp: 144})
1344	MOVOU(X10, Mem{Base: dst, Disp: 160})
1345	MOVOU(X11, Mem{Base: dst, Disp: 176})
1346	MOVOU(X12, Mem{Base: dst, Disp: 192})
1347	MOVOU(X13, Mem{Base: dst, Disp: 208})
1348	MOVOU(X14, Mem{Base: dst, Disp: 224})
1349	MOVOU(X15, Mem{Base: dst, Disp: 240})
1350	CMPQ(length, U32(256))
1351	LEAQ(Mem{Base: src, Disp: 256}, src)
1352	LEAQ(Mem{Base: dst, Disp: 256}, dst)
1353	JGE(LabelRef(name + "move_256through2048"))
1354	JMP(LabelRef(name + "tail"))
1355
1356	if avx {
1357		Label(name + "avxUnaligned")
1358		R8, R10 := GP64(), GP64()
1359		// There are two implementations of move algorithm.
1360		// The first one for non-overlapped memory regions. It uses forward copying.
1361		// We do not support overlapping input
1362
1363		// Non-temporal copy would be better for big sizes.
1364		// Disabled since big copies are unlikely.
1365		// If enabling, test functionality.
1366		const enableBigData = false
1367		if enableBigData {
1368			CMPQ(length, U32(0x100000))
1369			JAE(LabelRef(name + "gobble_big_data_fwd"))
1370		}
1371
1372		// Memory layout on the source side
1373		// src                                       CX
1374		// |<---------length before correction--------->|
1375		// |       |<--length corrected-->|             |
1376		// |       |                  |<--- AX  --->|
		// |<-R8 ->|                  |<-128 bytes->|
1378		// +----------------------------------------+
1379		// | Head  | Body             | Tail        |
1380		// +-------+------------------+-------------+
1381		// ^       ^                  ^
1382		// |       |                  |
1383		// Save head into Y4          Save tail into X5..X12
1384		//         |
		//         src+R8, where R8 = ((dst & -32) + 32) - dst
1386		// Algorithm:
1387		// 1. Unaligned save of the tail's 128 bytes
1388		// 2. Unaligned save of the head's 32  bytes
1389		// 3. Destination-aligned copying of body (128 bytes per iteration)
1390		// 4. Put head on the new place
1391		// 5. Put the tail on the new place
1392		// It can be important to satisfy processor's pipeline requirements for
1393		// small sizes as the cost of unaligned memory region copying is
1394		// comparable with the cost of main loop. So code is slightly messed there.
1395		// There is more clean implementation of that algorithm for bigger sizes
1396		// where the cost of unaligned part copying is negligible.
1397		// You can see it after gobble_big_data_fwd label.
1398		Y0, Y1, Y2, Y3, Y4 := YMM(), YMM(), YMM(), YMM(), YMM()
1399
1400		LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
1401		MOVQ(dst, R10)
		// CX points to the end of the buffer so we need to go back slightly. We will use negative offsets there.
1403		MOVOU(Mem{Base: CX, Disp: -0x80}, X5)
1404		MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
1405		MOVQ(U32(0x80), AX)
1406
1407		// Align destination address
1408		ANDQ(U32(0xffffffe0), dst)
1409		ADDQ(U8(32), dst)
1410		// Continue tail saving.
1411		MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
1412		MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
1413		// Make R8 delta between aligned and unaligned destination addresses.
1414		MOVQ(dst, R8)
1415		SUBQ(R10, R8)
1416		// Continue tail saving.
1417		MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
1418		MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
1419		// Let's make bytes-to-copy value adjusted as we've prepared unaligned part for copying.
1420		SUBQ(R8, length)
1421		// Continue tail saving.
1422		MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
1423		MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
1424		// The tail will be put on its place after main body copying.
1425		// It's time for the unaligned heading part.
1426		VMOVDQU(Mem{Base: src}, Y4)
1427		// Adjust source address to point past head.
1428		ADDQ(R8, src)
1429		SUBQ(AX, length)
1430
1431		// Aligned memory copying there
1432		Label(name + "gobble_128_loop")
1433		VMOVDQU(Mem{Base: src}, Y0)
1434		VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
1435		VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
1436		VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)
1437		ADDQ(AX, src)
1438		VMOVDQA(Y0, Mem{Base: dst})
1439		VMOVDQA(Y1, Mem{Base: dst, Disp: 0x20})
1440		VMOVDQA(Y2, Mem{Base: dst, Disp: 0x40})
1441		VMOVDQA(Y3, Mem{Base: dst, Disp: 0x60})
1442		ADDQ(AX, dst)
1443		SUBQ(AX, length)
1444		JA(LabelRef(name + "gobble_128_loop"))
1445		// Now we can store unaligned parts.
1446		ADDQ(AX, length)
1447		ADDQ(dst, length)
1448		VMOVDQU(Y4, Mem{Base: R10})
1449		VZEROUPPER()
1450		MOVOU(X5, Mem{Base: length, Disp: -0x80})
1451		MOVOU(X6, Mem{Base: length, Disp: -0x70})
1452		MOVOU(X7, Mem{Base: length, Disp: -0x60})
1453		MOVOU(X8, Mem{Base: length, Disp: -0x50})
1454		MOVOU(X9, Mem{Base: length, Disp: -0x40})
1455		MOVOU(X10, Mem{Base: length, Disp: -0x30})
1456		MOVOU(X11, Mem{Base: length, Disp: -0x20})
1457		MOVOU(X12, Mem{Base: length, Disp: -0x10})
1458		JMP(end)
1459
1460		if enableBigData {
1461			Label(name + "gobble_big_data_fwd")
1462			// There is forward copying for big regions.
1463			// It uses non-temporal mov instructions.
1464			// Details of this algorithm are commented previously for small sizes.
1465			LEAQ(Mem{Base: src, Index: length, Scale: 1}, CX)
1466			MOVOU(Mem{Base: src, Index: length, Scale: 1, Disp: -0x80}, X5)
1467			MOVOU(Mem{Base: CX, Disp: -0x70}, X6)
1468			MOVOU(Mem{Base: CX, Disp: -0x60}, X7)
1469			MOVOU(Mem{Base: CX, Disp: -0x50}, X8)
1470			MOVOU(Mem{Base: CX, Disp: -0x40}, X9)
1471			MOVOU(Mem{Base: CX, Disp: -0x30}, X10)
1472			MOVOU(Mem{Base: CX, Disp: -0x20}, X11)
1473			MOVOU(Mem{Base: CX, Disp: -0x10}, X12)
1474			VMOVDQU(Mem{Base: src}, Y4)
1475			MOVQ(dst, R8)
1476
1477			ANDQ(U32(0xffffffe0), dst)
1478			ADDQ(U8(32), dst)
1479
1480			MOVQ(dst, R10)
1481			SUBQ(R8, R10)
1482			SUBQ(R10, length)
1483			ADDQ(R10, src)
1484			LEAQ(Mem{Base: dst, Index: length, Scale: 1}, CX)
1485			SUBQ(U8(0x80), length)
1486
1487			Label(name + "gobble_mem_fwd_loop")
1488			PREFETCHNTA(Mem{Base: src, Disp: 0x1c0})
1489			PREFETCHNTA(Mem{Base: src, Disp: 0x280})
1490			// Prefetch values were chosen empirically.
1491			// Approach for prefetch usage as in 7.6.6 of [1]
1492			// [1] 64-ia-32-architectures-optimization-manual.pdf
1493			// https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
1494			VMOVDQU(Mem{Base: src}, Y0)
1495			VMOVDQU(Mem{Base: src, Disp: 0x20}, Y1)
1496			VMOVDQU(Mem{Base: src, Disp: 0x40}, Y2)
1497			VMOVDQU(Mem{Base: src, Disp: 0x60}, Y3)
1498
1499			ADDQ(U8(0x80), src)
1500			VMOVNTDQ(Y0, Mem{Base: dst})
1501			VMOVNTDQ(Y1, Mem{Base: dst, Disp: 0x20})
			VMOVNTDQ(Y2, Mem{Base: dst, Disp: 0x40})
1503			VMOVNTDQ(Y3, Mem{Base: dst, Disp: 0x60})
1504			ADDQ(U8(0x80), dst)
1505			SUBQ(U8(0x80), length)
1506			JA(LabelRef(name + "gobble_mem_fwd_loop"))
1507			// NT instructions don't follow the normal cache-coherency rules.
1508			// We need SFENCE there to make copied data available timely.
1509			SFENCE()
1510			VMOVDQU(Y4, Mem{Base: R8})
1511			VZEROUPPER()
1512			MOVOU(X5, Mem{Base: CX, Disp: -0x80})
1513			MOVOU(X6, Mem{Base: CX, Disp: -0x70})
1514			MOVOU(X7, Mem{Base: CX, Disp: -0x60})
1515			MOVOU(X8, Mem{Base: CX, Disp: -0x50})
1516			MOVOU(X9, Mem{Base: CX, Disp: -0x40})
1517			MOVOU(X10, Mem{Base: CX, Disp: -0x30})
1518			MOVOU(X11, Mem{Base: CX, Disp: -0x20})
1519			MOVOU(X12, Mem{Base: CX, Disp: -0x10})
1520			JMP(end)
1521		}
1522	}
1523}
1524
1525// genMatchLen generates standalone matchLen.
1526func genMatchLen() {
1527	TEXT("matchLen", NOSPLIT, "func(a, b []byte) int")
1528	Doc("matchLen returns how many bytes match in a and b", "",
1529		"It assumes that:",
1530		"  len(a) <= len(b)", "")
1531	Pragma("noescape")
1532
1533	aBase, bBase, length := GP64(), GP64(), GP64()
1534
1535	Load(Param("a").Base(), aBase)
1536	Load(Param("b").Base(), bBase)
1537	Load(Param("a").Len(), length)
1538	l := matchLen("standalone", Mem{Base: aBase}, Mem{Base: bBase}, length, LabelRef("gen_match_len_end"))
1539	Label("gen_match_len_end")
1540	Store(l, ReturnIndex(0))
1541	RET()
1542}
1543
1544// matchLen returns the number of matching bytes of a and b.
1545// len is the maximum number of bytes to match.
1546// Will jump to end when done and returns the length.
1547// Uses 2 GP registers.
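// For reference, the generated matcher behaves like the following Go code (a sketch
// using encoding/binary and math/bits):
//
//	func matchLen(a, b []byte, max int) (n int) {
//		for max >= 8 {
//			x := binary.LittleEndian.Uint64(a[n:]) ^ binary.LittleEndian.Uint64(b[n:])
//			if x != 0 {
//				return n + bits.TrailingZeros64(x)>>3
//			}
//			n += 8
//			max -= 8
//		}
//		for max > 0 && a[n] == b[n] {
//			n++
//			max--
//		}
//		return n
//	}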
1548func matchLen(name string, a, b Mem, len reg.GPVirtual, end LabelRef) reg.GPVirtual {
1549	tmp, matched := GP64(), GP64()
1550	XORQ(matched, matched)
1551
1552	CMPQ(len, U8(8))
1553	JL(LabelRef("matchlen_single_" + name))
1554
1555	Label("matchlen_loopback_" + name)
1556	MOVQ(Mem{Base: a.Base, Index: matched, Scale: 1}, tmp)
1557	XORQ(Mem{Base: b.Base, Index: matched, Scale: 1}, tmp)
1558	TESTQ(tmp, tmp)
1559	JZ(LabelRef("matchlen_loop_" + name))
1560	// Not all match.
1561	BSFQ(tmp, tmp)
1562	SARQ(U8(3), tmp)
1563	LEAQ(Mem{Base: matched, Index: tmp, Scale: 1}, matched)
1564	JMP(end)
1565
1566	// All 8 byte matched, update and loop.
1567	Label("matchlen_loop_" + name)
1568	LEAQ(Mem{Base: len, Disp: -8}, len)
1569	LEAQ(Mem{Base: matched, Disp: 8}, matched)
1570	CMPQ(len, U8(8))
1571	JGE(LabelRef("matchlen_loopback_" + name))
1572
1573	// Less than 8 bytes left.
1574	Label("matchlen_single_" + name)
1575	TESTQ(len, len)
1576	JZ(end)
1577	Label("matchlen_single_loopback_" + name)
1578	MOVB(Mem{Base: a.Base, Index: matched, Scale: 1}, tmp.As8())
1579	CMPB(Mem{Base: b.Base, Index: matched, Scale: 1}, tmp.As8())
1580	JNE(end)
1581	LEAQ(Mem{Base: matched, Disp: 1}, matched)
1582	DECQ(len)
1583	JNZ(LabelRef("matchlen_single_loopback_" + name))
1584	JMP(end)
1585	return matched
1586}
1587