// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
// https://github.com/golang/snappy/issues/29
//
// As a workaround, the package was built with a known good assembler, and
// those instructions were disassembled by "objdump -d" to yield the
//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
// style comments, in AT&T asm syntax. Note that rsp here is a physical
// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
// fine on Go 1.6.

// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".

// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
//	- AX	len(lit)
//	- BX	n
//	- DX	return value
//	- DI	&dst[i]
//	- R10	&lit[0]
//
// The 24 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
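//
// For reference, a hedged sketch of the pure Go shape that this assembly
// mirrors (encode_other.go is the authoritative version; the tagLiteral
// constant name is an assumption):
//
//	func emitLiteral(dst, lit []byte) int {
//		i, n := 0, uint(len(lit)-1)
//		switch {
//		case n < 60:
//			dst[0] = uint8(n)<<2 | tagLiteral // The oneByte case.
//			i = 1
//		case n < 1<<8:
//			dst[0] = 60<<2 | tagLiteral // 0xf0: the twoBytes case.
//			dst[1] = uint8(n)
//			i = 2
//		default:
//			dst[0] = 61<<2 | tagLiteral // 0xf4: the threeBytes case.
//			dst[1] = uint8(n)
//			dst[2] = uint8(n >> 8)
//			i = 3
//		}
//		return i + copy(dst[i:], lit)
//	}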
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
	MOVQ dst_base+0(FP), DI
	MOVQ lit_base+24(FP), R10
	MOVQ lit_len+32(FP), AX
	MOVQ AX, DX
	MOVL AX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT  oneByte
	CMPL BX, $256
	JLT  twoBytes

threeBytes:
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	ADDQ $3, DX
	JMP  memmove

twoBytes:
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	ADDQ $2, DX
	JMP  memmove

oneByte:
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI
	ADDQ $1, DX

memmove:
	MOVQ DX, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	CALL runtime·memmove(SB)
	RET

// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- AX	length
//	- SI	&dst[0]
//	- DI	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
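//
// A hedged Go sketch of the encoding that this assembly implements
// (encode_other.go is the authoritative version; the tagCopy1 and tagCopy2
// constant names are assumptions):
//
//	func emitCopy(dst []byte, offset, length int) int {
//		i := 0
//		for length >= 68 {
//			dst[i+0] = 63<<2 | tagCopy2 // 0xfe: a length-64 copy.
//			dst[i+1] = uint8(offset)
//			dst[i+2] = uint8(offset >> 8)
//			i += 3
//			length -= 64
//		}
//		if length > 64 {
//			dst[i+0] = 59<<2 | tagCopy2 // 0xee: a length-60 copy.
//			dst[i+1] = uint8(offset)
//			dst[i+2] = uint8(offset >> 8)
//			i += 3
//			length -= 60
//		}
//		if length >= 12 || offset >= 2048 {
//			dst[i+0] = uint8(length-1)<<2 | tagCopy2
//			dst[i+1] = uint8(offset)
//			dst[i+2] = uint8(offset >> 8)
//			return i + 3
//		}
//		dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
//		dst[i+1] = uint8(offset)
//		return i + 2
//	}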
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ DI, SI
	MOVQ offset+24(FP), R11
	MOVQ length+32(FP), AX

loop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT  step1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  loop0

step1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  step2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMPL AX, $12
	JGE  step3
	CMPL R11, $2048
	JGE  step3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
//	- DX	&src[0]
//	- SI	&src[j]
//	- R13	&src[len(src) - 8]
//	- R14	&src[len(src)]
//	- R15	&src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
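//
// The pure Go form is a single byte-at-a-time loop (a sketch; encode_other.go
// is the authoritative version):
//
//	func extendMatch(src []byte, i, j int) int {
//		for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
//		}
//		return j
//	}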
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVQ src_base+0(FP), DX
	MOVQ src_len+8(FP), R14
	MOVQ i+24(FP), R15
	MOVQ j+32(FP), SI
	ADDQ DX, R14
	ADDQ DX, R15
	ADDQ DX, SI
	MOVQ R14, R13
	SUBQ $8, R13

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA   cmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, R15
	ADDQ $8, SI
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
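	//
	// In Go terms, this step is roughly (a hedged sketch; load64 is assumed
	// to be this package's little-endian 8-byte load, and bits is math/bits):
	//
	//	x := load64(src, i) ^ load64(src, j)
	//	return j + bits.TrailingZeros64(x)>>3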
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI

	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE  extendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE  extendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP  cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- AX	.	.
//	- BX	.	.
//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
//	- DX	64	&src[0], tableSize
//	- SI	72	&src[s]
//	- DI	80	&dst[d]
//	- R9	88	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	96	prevHash, currHash, nextHash, offset
//	- R12	104	&src[base], skip
//	- R13	.	&src[nextS], &src[len(src) - 8]
//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
//	- R15	112	candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
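//
// The hash function used throughout (the IMULL $0x1e35a7bd; SHRL CX pairs
// below) corresponds to this pure Go form (a sketch, with the constant taken
// from the IMULL operand; shift lives in CX because amd64 variable shifts
// must use CX):
//
//	func hash(u, shift uint32) uint32 {
//		return (u * 0x1e35a7bd) >> shift
//	}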
TEXT ·encodeBlock(SB), 0, $32888-56
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVQ $24, CX
	MOVQ $256, DX

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	CMPQ DX, $16384
	JGE  varTable
	CMPQ DX, R14
	JGE  varTable
	SUBQ $1, CX
	SHLQ $1, DX
	JMP  calcShift

varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
	// 2048 writes that would zero-initialize all of table's 32768 bytes.
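	//
	// For example, with the minimum tableSize of 256, that is 256*2/16 = 32
	// MOVOU writes; only at the maximum tableSize of 16384 does it reach the
	// full 2048.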
	SHRQ $3, DX
	LEAQ table-32768(SP), BX
	PXOR X0, X0

memclr:
	MOVOU X0, 0(BX)
	ADDQ  $16, BX
	SUBQ  $1, DX
	JNZ   memclr

	// !!! DX = &src[0]
	MOVQ SI, DX

	// sLimit := len(src) - inputMargin
	MOVQ R14, R9
	SUBQ $15, R9

	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVQ CX, 56(SP)
	MOVQ DX, 64(SP)
	MOVQ R9, 88(SP)

	// nextEmit := 0
	MOVQ DX, R10

	// s := 1
	ADDQ $1, SI

	// nextHash := hash(load32(src, s), shift)
	MOVL  0(SI), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

outer:
	// for { etc }

	// skip := 32
	MOVQ $32, R12

	// nextS := s
	MOVQ SI, R13

	// candidate := 0
	MOVQ $0, R15

inner0:
	// for { etc }

	// s := nextS
	MOVQ R13, SI

	// bytesBetweenHashLookups := skip >> 5
	MOVQ R12, R14
	SHRQ $5, R14

	// nextS = s + bytesBetweenHashLookups
	ADDQ R14, R13

	// skip += bytesBetweenHashLookups
	ADDQ R14, R12

	// if nextS > sLimit { goto emitRemainder }
	MOVQ R13, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JA   emitRemainder

	// candidate = int(table[nextHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[nextHash] = uint16(s)
	MOVQ SI, AX
	SUBQ DX, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// nextHash = hash(load32(src, nextS), shift)
	MOVL  0(R13), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVL 0(SI), AX
	MOVL (DX)(R15*1), BX
	CMPL AX, BX
	JNE  inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	MOVQ SI, AX
	SUBQ R10, AX
	CMPQ AX, $16
	JLE  emitLiteralFastPath

	// ----------------------------------------
	// Begin inline of the emitLiteral call.
	//
	// d += emitLiteral(dst[d:], src[nextEmit:s])

	MOVL AX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT  inlineEmitLiteralOneByte
	CMPL BX, $256
	JLT  inlineEmitLiteralTwoBytes

inlineEmitLiteralThreeBytes:
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	JMP  inlineEmitLiteralMemmove

inlineEmitLiteralTwoBytes:
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	JMP  inlineEmitLiteralMemmove

inlineEmitLiteralOneByte:
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI

inlineEmitLiteralMemmove:
	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	ADDQ AX, DI              // Finish the "d +=" part of "d += emitLiteral(etc)".
	MOVQ SI, 72(SP)
	MOVQ DI, 80(SP)
	MOVQ R15, 112(SP)
	CALL runtime·memmove(SB)
	MOVQ 56(SP), CX
	MOVQ 64(SP), DX
	MOVQ 72(SP), SI
	MOVQ 80(SP), DI
	MOVQ 88(SP), R9
	MOVQ 112(SP), R15
	JMP  inner1

inlineEmitLiteralEnd:
	// End inline of the emitLiteral call.
	// ----------------------------------------

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB AX, BX
	SUBB $1, BX
	SHLB $2, BX
	MOVB BX, (DI)
	ADDQ $1, DI

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
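	//
	// In Go terms, the fast path is roughly (a hedged sketch; the 16-byte
	// over-read of src and over-write of dst are safe only because of the
	// inputMargin guarantee):
	//
	//	copy(dst[d:d+16], src[nextEmit:nextEmit+16])
	//	d += len(lit)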
	MOVOU 0(R10), X0
	MOVOU X0, 0(DI)
	ADDQ  AX, DI

inner1:
	// for { etc }

	// base := s
	MOVQ SI, R12

	// !!! offset := base - candidate
	MOVQ R12, R11
	SUBQ R15, R11
	SUBQ DX, R11

	// ----------------------------------------
	// Begin inline of the extendMatch call.
	//
	// s = extendMatch(src, candidate+4, s+4)

	// !!! R14 = &src[len(src)]
	MOVQ src_len+32(FP), R14
	ADDQ DX, R14

	// !!! R13 = &src[len(src) - 8]
	MOVQ R14, R13
	SUBQ $8, R13

	// !!! R15 = &src[candidate + 4]
	ADDQ $4, R15
	ADDQ DX, R15

	// !!! s += 4
	ADDQ $4, SI

inlineExtendMatchCmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA   inlineExtendMatchCmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE  inlineExtendMatchBSF
	ADDQ $8, R15
	ADDQ $8, SI
	JMP  inlineExtendMatchCmp8

inlineExtendMatchBSF:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI
	JMP  inlineExtendMatchEnd

inlineExtendMatchCmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE  inlineExtendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE  inlineExtendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP  inlineExtendMatchCmp1

inlineExtendMatchEnd:
	// End inline of the extendMatch call.
	// ----------------------------------------

	// ----------------------------------------
	// Begin inline of the emitCopy call.
	//
	// d += emitCopy(dst[d:], base-candidate, s-base)

	// !!! length := s - base
	MOVQ SI, AX
	SUBQ R12, AX

inlineEmitCopyLoop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT  inlineEmitCopyStep1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  inlineEmitCopyLoop0

inlineEmitCopyStep1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  inlineEmitCopyStep2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

inlineEmitCopyStep2:
	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
	CMPL AX, $12
	JGE  inlineEmitCopyStep3
	CMPL R11, $2048
	JGE  inlineEmitCopyStep3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI
	JMP  inlineEmitCopyEnd

inlineEmitCopyStep3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

inlineEmitCopyEnd:
	// End inline of the emitCopy call.
	// ----------------------------------------

	// nextEmit = s
	MOVQ SI, R10

	// if s >= sLimit { goto emitRemainder }
	MOVQ SI, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JAE  emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.

	// x := load64(src, s-1)
	MOVQ -1(SI), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// table[prevHash] = uint16(s-1)
	MOVQ SI, AX
	SUBQ DX, AX
	SUBQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// currHash := hash(uint32(x>>8), shift)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// candidate = int(table[currHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[currHash] = uint16(s)
	ADDQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVL (DX)(R15*1), BX
	CMPL R14, BX
	JEQ  inner1

	// nextHash = hash(uint32(x>>16), shift)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// s++
	ADDQ $1, SI

	// break out of the inner1 for loop, i.e. continue the outer loop.
	JMP outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVQ src_len+32(FP), AX
	ADDQ DX, AX
	CMPQ R10, AX
	JEQ  encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args.
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R10, 24(SP)
	SUBQ R10, AX
	MOVQ AX, 32(SP)
	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ DI, 80(SP)
	CALL ·emitLiteral(SB)
	MOVQ 80(SP), DI

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADDQ 48(SP), DI

encodeBlockEnd:
	MOVQ dst_base+0(FP), AX
	SUBQ AX, DI
	MOVQ DI, d+48(FP)
	RET