1// Copyright 2016 The Go Authors. All rights reserved.
2// Copyright (c) 2019 Klaus Post. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6// +build !appengine
7// +build gc
8// +build !noasm
9
10#include "textflag.h"
11
12#define R_TMP0 AX
13#define R_TMP1 BX
14#define R_LEN CX
15#define R_OFF DX
16#define R_SRC SI
17#define R_DST DI
18#define R_DBASE R8
19#define R_DLEN R9
20#define R_DEND R10
21#define R_SBASE R11
22#define R_SLEN R12
23#define R_SEND R13
24#define R_TMP2 R14
25#define R_TMP3 R15
26
27// The asm code generally follows the pure Go code in decode_other.go, except
28// where marked with a "!!!".
29
30// func decode(dst, src []byte) int
31//
32// All local variables fit into registers. The non-zero stack size is only to
33// spill registers and push args when issuing a CALL. The register allocation:
34//	- R_TMP0	scratch
35//	- R_TMP1	scratch
36//	- R_LEN	    length or x (shared)
37//	- R_OFF	    offset
38//	- R_SRC	    &src[s]
39//	- R_DST	    &dst[d]
40//	+ R_DBASE	dst_base
41//	+ R_DLEN	dst_len
42//	+ R_DEND	dst_base + dst_len
43//	+ R_SBASE	src_base
44//	+ R_SLEN	src_len
45//	+ R_SEND	src_base + src_len
46//	- R_TMP2	used by doCopy
47//	- R_TMP3	used by doCopy
48//
49// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
50// function, and after a CALL returns, and are not otherwise modified.
51//
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
53// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
// !!! The frame must be 56 bytes, not 48: the callMemmove path stores four
// 8-byte spill slots at 24(SP), 32(SP), 40(SP) and 48(SP) in addition to the
// three 8-byte memmove arguments at 0(SP), 8(SP) and 16(SP). A $48 frame only
// provides slots 0-40; writing 48(SP) would land outside the declared locals
// (on the saved frame pointer for this non-leaf function).
TEXT ·s2Decode(SB), NOSPLIT, $56-56
	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DST
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SRC
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND

	// Last offset seen, used by repeat codes. Starts at zero so that a repeat
	// code before any copy is rejected (offset <= 0 is corrupt).
	XORQ R_OFF, R_OFF

loop:
	// for s < len(src)
	CMPQ R_SRC, R_SEND
	JEQ  end

	// R_LEN = uint32(src[s])
	//
	// switch src[s] & 0x03
	MOVBLZX (R_SRC), R_LEN
	MOVL    R_LEN, R_TMP1
	ANDL    $3, R_TMP1
	CMPL    R_TMP1, $1
	JAE     tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	SHRL $2, R_LEN
	CMPL R_LEN, $60
	JAE  tagLit60Plus

	// case x < 60:
	// s++
	INCQ R_SRC

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// R_LEN can hold 64 bits, so the increment cannot overflow.
	INCQ R_LEN

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// R_TMP0 = len(dst) - d
	// R_TMP1 = len(src) - s
	MOVQ R_DEND, R_TMP0
	SUBQ R_DST, R_TMP0
	MOVQ R_SEND, R_TMP1
	SUBQ R_SRC, R_TMP1

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
	//   goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMPQ R_LEN, $16
	JGT  callMemmove
	CMPQ R_TMP0, $16
	JLT  callMemmove
	CMPQ R_TMP1, $16
	JLT  callMemmove

	// !!! Implement the copy from src to dst as a 16-byte load and store.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. If the input is a valid Snappy encoding then subsequent iterations
	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
	// non-nil error), so the overrun will be ignored.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(R_SRC), X0
	MOVOU X0, 0(R_DST)

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP  loop

callMemmove:
	// if length > len(dst)-d || length > len(src)-s { etc }
	CMPQ R_LEN, R_TMP0
	JGT  errCorrupt
	CMPQ R_LEN, R_TMP1
	JGT  errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// R_DST, R_SRC and R_LEN as arguments. We also need to spill those three
	// registers, plus R_OFF (the last offset, needed by repeat codes), to the
	// stack to save local variables across the CALL.
	MOVQ R_DST, 0(SP)
	MOVQ R_SRC, 8(SP)
	MOVQ R_LEN, 16(SP)
	MOVQ R_DST, 24(SP)
	MOVQ R_SRC, 32(SP)
	MOVQ R_LEN, 40(SP)
	MOVQ R_OFF, 48(SP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R_DBASE-R_SEND.
	MOVQ 24(SP), R_DST
	MOVQ 32(SP), R_SRC
	MOVQ 40(SP), R_LEN
	MOVQ 48(SP), R_OFF
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP  loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
	ADDQ R_LEN, R_SRC
	SUBQ $58, R_SRC
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// case x == 60:
	CMPL R_LEN, $61
	JEQ  tagLit61
	JA   tagLit62Plus

	// x = uint32(src[s-1])
	MOVBLZX -1(R_SRC), R_LEN
	JMP     doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	MOVWLZX -2(R_SRC), R_LEN
	JMP     doLit

tagLit62Plus:
	CMPL R_LEN, $62
	JA   tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	// We read one byte, safe to read one back, since we are just reading tag.
	// x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	JMP  doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVL -4(R_SRC), R_LEN
	JMP  doLit

// The code above handles literal tags.
// ----------------------------------------
// The code below handles copy tags.

tagCopy4:
	// case tagCopy4:
	// s += 5
	ADDQ $5, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = 1 + int(src[s-5])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
	MOVLQZX -4(R_SRC), R_OFF
	JMP     doCopy

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = 1 + int(src[s-3])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
	MOVWQZX -2(R_SRC), R_OFF
	JMP     doCopy

tagCopy:
	// We have a copy tag. We assume that:
	//	- R_TMP1 == src[s] & 0x03
	//	- R_LEN == src[s]
	CMPQ R_TMP1, $2
	JEQ  tagCopy2
	JA   tagCopy4

	// case tagCopy1:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
	// length = 4 + int(src[s-2])>>2&0x7
	MOVBQZX -1(R_SRC), R_TMP1
	MOVQ    R_LEN, R_TMP0
	SHRQ    $2, R_LEN
	ANDQ    $0xe0, R_TMP0
	ANDQ    $7, R_LEN
	SHLQ    $3, R_TMP0
	ADDQ    $4, R_LEN
	ORQ     R_TMP1, R_TMP0

	// check if repeat code, ZF set by ORQ.
	JZ repeatCode

	// This is a regular copy, transfer our temporary value to R_OFF (offset).
	MOVQ R_TMP0, R_OFF
	JMP  doCopy

// This is a repeat code.
repeatCode:
	// If length < 9, reuse last offset, with the length already calculated.
	CMPQ R_LEN, $9
	JL   doCopyRepeat

	// Read additional bytes for length.
	JE repeatLen1

	// Rare, so the extra branch shouldn't hurt too much.
	CMPQ R_LEN, $10
	JE   repeatLen2
	JMP  repeatLen3

// Read repeat lengths.
repeatLen1:
	// s++
	ADDQ $1, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = src[s-1] + 8
	MOVBQZX -1(R_SRC), R_LEN
	ADDL    $8, R_LEN
	JMP     doCopyRepeat

repeatLen2:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8) + 4
	MOVWQZX -2(R_SRC), R_LEN
	ADDL    $260, R_LEN
	JMP     doCopyRepeat

repeatLen3:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16) + 4
	// Read one byte further back (just part of the tag, shifted out)
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	ADDL $65540, R_LEN
	JMP  doCopyRepeat

doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	//	- R_LEN == length && R_LEN > 0
	//	- R_OFF == offset

	// if d < offset { etc }
	MOVQ R_DST, R_TMP1
	SUBQ R_DBASE, R_TMP1
	CMPQ R_TMP1, R_OFF
	JLT  errCorrupt

	// Repeat values can skip the test above, since any offset > 0 will be in dst.
doCopyRepeat:
	// if offset <= 0 { etc }
	CMPQ R_OFF, $0
	JLE  errCorrupt

	// if length > len(dst)-d { etc }
	MOVQ R_DEND, R_TMP1
	SUBQ R_DST, R_TMP1
	CMPQ R_LEN, R_TMP1
	JGT  errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	//	- R_TMP2 = len(dst)-d
	//	- R_TMP3 = &dst[d-offset]
	MOVQ R_DEND, R_TMP2
	SUBQ R_DST, R_TMP2
	MOVQ R_DST, R_TMP3
	SUBQ R_OFF, R_TMP3

	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
	//
	// First, try using two 8-byte load/stores, similar to the doLit technique
	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
	// and not one 16-byte load/store, and the first store has to be before the
	// second load, due to the overlap if offset is in the range [8, 16).
	//
	// if length > 16 || offset < 8 || len(dst)-d < 16 {
	//   goto slowForwardCopy
	// }
	// copy 16 bytes
	// d += length
	CMPQ R_LEN, $16
	JGT  slowForwardCopy
	CMPQ R_OFF, $8
	JLT  slowForwardCopy
	CMPQ R_TMP2, $16
	JLT  slowForwardCopy
	MOVQ 0(R_TMP3), R_TMP0
	MOVQ R_TMP0, 0(R_DST)
	MOVQ 8(R_TMP3), R_TMP1
	MOVQ R_TMP1, 8(R_DST)
	ADDQ R_LEN, R_DST
	JMP  loop

slowForwardCopy:
	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
	// can still try 8-byte load stores, provided we can overrun up to 10 extra
	// bytes. As above, the overrun will be fixed up by subsequent iterations
	// of the outermost loop.
	//
	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
	// commentary says:
	//
	// ----
	//
	// The main part of this loop is a simple copy of eight bytes at a time
	// until we've copied (at least) the requested amount of bytes.  However,
	// if d and d-offset are less than eight bytes apart (indicating a
	// repeating pattern of length < 8), we first need to expand the pattern in
	// order to get the correct results. For instance, if the buffer looks like
	// this, with the eight-byte <d-offset> and <d> patterns marked as
	// intervals:
	//
	//    abxxxxxxxxxxxx
	//    [------]           d-offset
	//      [------]         d
	//
	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
	// once, after which we can move <d> two bytes without moving <d-offset>:
	//
	//    ababxxxxxxxxxx
	//    [------]           d-offset
	//        [------]       d
	//
	// and repeat the exercise until the two no longer overlap.
	//
	// This allows us to do very well in the special case of one single byte
	// repeated many times, without taking a big hit for more general cases.
	//
	// The worst case of extra writing past the end of the match occurs when
	// offset == 1 and length == 1; the last copy will read from byte positions
	// [0..7] and write to [4..11], whereas it was only supposed to write to
	// position 1. Thus, ten excess bytes.
	//
	// ----
	//
	// That "10 byte overrun" worst case is confirmed by Go's
	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
	// and finishSlowForwardCopy algorithm.
	//
	// if length > len(dst)-d-10 {
	//   goto verySlowForwardCopy
	// }
	SUBQ $10, R_TMP2
	CMPQ R_LEN, R_TMP2
	JGT  verySlowForwardCopy

	// We want to keep the offset, so we use R_TMP2 from here.
	MOVQ R_OFF, R_TMP2

makeOffsetAtLeast8:
	// !!! As above, expand the pattern so that offset >= 8 and we can use
	// 8-byte load/stores.
	//
	// for offset < 8 {
	//   copy 8 bytes from dst[d-offset:] to dst[d:]
	//   length -= offset
	//   d      += offset
	//   offset += offset
	//   // The two previous lines together means that d-offset, and therefore
	//   // R_TMP3, is unchanged.
	// }
	CMPQ R_TMP2, $8
	JGE  fixUpSlowForwardCopy
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_DST)
	SUBQ R_TMP2, R_LEN
	ADDQ R_TMP2, R_DST
	ADDQ R_TMP2, R_TMP2
	JMP  makeOffsetAtLeast8

fixUpSlowForwardCopy:
	// !!! Add length (which might be negative now) to d (implied by R_DST being
	// &dst[d]) so that d ends up at the right place when we jump back to the
	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
	// length is positive, copying the remaining length bytes will write to the
	// right place.
	MOVQ R_DST, R_TMP0
	ADDQ R_LEN, R_DST

finishSlowForwardCopy:
	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
	// length means that we overrun, but as above, that will be fixed up by
	// subsequent iterations of the outermost loop.
	CMPQ R_LEN, $0
	JLE  loop
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_TMP0)
	ADDQ $8, R_TMP3
	ADDQ $8, R_TMP0
	SUBQ $8, R_LEN
	JMP  finishSlowForwardCopy

verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//   dst[d] = dst[d - offset]
	//   d++
	//   length--
	//   if length == 0 {
	//     break
	//   }
	// }
	MOVB (R_TMP3), R_TMP1
	MOVB R_TMP1, (R_DST)
	INCQ R_TMP3
	INCQ R_DST
	DECQ R_LEN
	JNZ  verySlowForwardCopy
	JMP  loop

// The code above handles copy tags.
// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMPQ R_DST, R_DEND
	JNE  errCorrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET
569