// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".

// func decode(dst, src []byte) int
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
//	- AX	scratch
//	- BX	scratch
//	- CX	length or x
//	- DX	offset
//	- SI	&src[s]
//	- DI	&dst[d]
//	+ R8	dst_base
//	+ R9	dst_len
//	+ R10	dst_base + dst_len
//	+ R11	src_base
//	+ R12	src_len
//	+ R13	src_base + src_len
//	- R14	used by doCopy
//	- R15	used by doCopy
//
// The registers R8-R13 (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
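//
// As a reading aid, here is a rough Go sketch of the loop structure that the
// asm below mirrors (simplified from decode_other.go, not the exact source):
//
//	for s < len(src) {
//		switch src[s] & 0x03 {
//		case tagLiteral:
//			// decode x, set length = x + 1, then copy length
//			// bytes from src[s:] to dst[d:]
//		case tagCopy1, tagCopy2, tagCopy4:
//			// decode length and offset, then forward-copy length
//			// bytes from dst[d-offset:] to dst[d:]
//		}
//		// any bounds-check failure returns decodeErrCodeCorrupt
//	}
//	if d != len(dst) {
//		return decodeErrCodeCorrupt
//	}
//	return 0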
TEXT ·decode(SB), NOSPLIT, $48-56
	// Initialize SI, DI and R8-R13.
	MOVQ dst_base+0(FP), R8
	MOVQ dst_len+8(FP), R9
	MOVQ R8, DI
	MOVQ R8, R10
	ADDQ R9, R10
	MOVQ src_base+24(FP), R11
	MOVQ src_len+32(FP), R12
	MOVQ R11, SI
	MOVQ R11, R13
	ADDQ R12, R13

loop:
	// for s < len(src)
	CMPQ SI, R13
	JEQ  end

	// CX = uint32(src[s])
	//
	// switch src[s] & 0x03
	MOVBLZX (SI), CX
	MOVL    CX, BX
	ANDL    $3, BX
	CMPL    BX, $1
	JAE     tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	SHRL $2, CX
	CMPL CX, $60
	JAE  tagLit60Plus

	// case x < 60:
	// s++
	INCQ SI

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that CX == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// CX can hold 64 bits, so the increment cannot overflow.
	INCQ CX

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// AX = len(dst) - d
	// BX = len(src) - s
	MOVQ R10, AX
	SUBQ DI, AX
	MOVQ R13, BX
	SUBQ SI, BX

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
	//   goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMPQ CX, $16
	JGT  callMemmove
	CMPQ AX, $16
	JLT  callMemmove
	CMPQ BX, $16
	JLT  callMemmove

	// !!! Implement the copy from src to dst as a 16-byte load and store.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. If the input is a valid Snappy encoding then subsequent iterations
	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
	// non-nil error), so the overrun will be ignored.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
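	//
	// In Go terms, the two instructions below act roughly like a hypothetical
	// copy(dst[d:d+16], src[s:s+16]); d and s are then advanced by only
	// length, not by 16.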
	MOVOU 0(SI), X0
	MOVOU X0, 0(DI)

	// d += length
	// s += length
	ADDQ CX, DI
	ADDQ CX, SI
	JMP  loop

callMemmove:
	// if length > len(dst)-d || length > len(src)-s { etc }
	CMPQ CX, AX
	JGT  errCorrupt
	CMPQ CX, BX
	JGT  errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
	// three registers to the stack, to save local variables across the CALL.
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R8-R13.
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX
	MOVQ dst_base+0(FP), R8
	MOVQ dst_len+8(FP), R9
	MOVQ R8, R10
	ADDQ R9, R10
	MOVQ src_base+24(FP), R11
	MOVQ src_len+32(FP), R12
	MOVQ R11, R13
	ADDQ R12, R13

	// d += length
	// s += length
	ADDQ CX, DI
	ADDQ CX, SI
	JMP  loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
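	//
	// To see where the 58 comes from: x is in [60, 63] and is followed by
	// x-59 extra length bytes, and s still points at the tag byte, so the
	// total advance is 1 + (x-59) = x - 58. For example, x == 61 (a 2-byte
	// length) gives s += 3.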
	ADDQ CX, SI
	SUBQ $58, SI
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA   errCorrupt

	// case x == 60:
	CMPL CX, $61
	JEQ  tagLit61
	JA   tagLit62Plus

	// x = uint32(src[s-1])
	MOVBLZX -1(SI), CX
	JMP     doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	MOVWLZX -2(SI), CX
	JMP     doLit

tagLit62Plus:
	CMPL CX, $62
	JA   tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	MOVWLZX -3(SI), CX
	MOVBLZX -1(SI), BX
	SHLL    $16, BX
	ORL     BX, CX
	JMP     doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVL -4(SI), CX
	JMP  doLit

// The code above handles literal tags.
// ----------------------------------------
// The code below handles copy tags.

tagCopy4:
	// case tagCopy4:
	// s += 5
	ADDQ $5, SI

	// if uint(s) > uint(len(src)) { etc }
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA   errCorrupt

	// length = 1 + int(src[s-5])>>2
	SHRQ $2, CX
	INCQ CX

	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
	MOVLQZX -4(SI), DX
	JMP     doCopy

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, SI

	// if uint(s) > uint(len(src)) { etc }
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA   errCorrupt

	// length = 1 + int(src[s-3])>>2
	SHRQ $2, CX
	INCQ CX

	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
	MOVWQZX -2(SI), DX
	JMP     doCopy

tagCopy:
	// We have a copy tag. We assume that:
	//	- BX == src[s] & 0x03
	//	- CX == src[s]
	CMPQ BX, $2
	JEQ  tagCopy2
	JA   tagCopy4

	// case tagCopy1:
	// s += 2
	ADDQ $2, SI

	// if uint(s) > uint(len(src)) { etc }
	MOVQ SI, BX
	SUBQ R11, BX
	CMPQ BX, R12
	JA   errCorrupt

	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
	MOVQ    CX, DX
	ANDQ    $0xe0, DX
	SHLQ    $3, DX
	MOVBQZX -1(SI), BX
	ORQ     BX, DX

	// length = 4 + int(src[s-2])>>2&0x7
	SHRQ $2, CX
	ANDQ $7, CX
	ADDQ $4, CX
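
	// Illustrative example: a tag byte src[s-2] == 0xed == 0b111_011_01
	// yields length = 4 + 0b011 = 7 and offset = 0b111<<8 | uint32(src[s-1]).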

doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	//	- CX == length && CX > 0
	//	- DX == offset

	// if offset <= 0 { etc }
	CMPQ DX, $0
	JLE  errCorrupt

	// if d < offset { etc }
	MOVQ DI, BX
	SUBQ R8, BX
	CMPQ BX, DX
	JLT  errCorrupt

	// if length > len(dst)-d { etc }
	MOVQ R10, BX
	SUBQ DI, BX
	CMPQ CX, BX
	JGT  errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	//	- R14 = len(dst)-d
	//	- R15 = &dst[d-offset]
	MOVQ R10, R14
	SUBQ DI, R14
	MOVQ DI, R15
	SUBQ DX, R15

	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
	//
	// First, try using two 8-byte load/stores, similar to the doLit technique
	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
	// and not one 16-byte load/store, and the first store has to be before the
	// second load, due to the overlap if offset is in the range [8, 16).
	//
	// if length > 16 || offset < 8 || len(dst)-d < 16 {
	//   goto slowForwardCopy
	// }
	// copy 16 bytes
	// d += length
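	//
	// For example (offset == 10, purely illustrative): the second load reads
	// dst[d-2:d+6], which overlaps the dst[d:d+8] range written by the first
	// store, so swapping the store and the second load would read stale bytes.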
	CMPQ CX, $16
	JGT  slowForwardCopy
	CMPQ DX, $8
	JLT  slowForwardCopy
	CMPQ R14, $16
	JLT  slowForwardCopy
	MOVQ 0(R15), AX
	MOVQ AX, 0(DI)
	MOVQ 8(R15), BX
	MOVQ BX, 8(DI)
	ADDQ CX, DI
	JMP  loop

slowForwardCopy:
	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
	// can still try 8-byte load/stores, provided we can overrun up to 10 extra
	// bytes. As above, the overrun will be fixed up by subsequent iterations
	// of the outermost loop.
	//
	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
	// commentary says:
	//
	// ----
	//
	// The main part of this loop is a simple copy of eight bytes at a time
	// until we've copied (at least) the requested amount of bytes.  However,
	// if d and d-offset are less than eight bytes apart (indicating a
	// repeating pattern of length < 8), we first need to expand the pattern in
	// order to get the correct results. For instance, if the buffer looks like
	// this, with the eight-byte <d-offset> and <d> patterns marked as
	// intervals:
	//
	//    abxxxxxxxxxxxx
	//    [------]           d-offset
	//      [------]         d
	//
	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
	// once, after which we can move <d> two bytes without moving <d-offset>:
	//
	//    ababxxxxxxxxxx
	//    [------]           d-offset
	//        [------]       d
	//
	// and repeat the exercise until the two no longer overlap.
	//
	// This allows us to do very well in the special case of one single byte
	// repeated many times, without taking a big hit for more general cases.
	//
	// The worst case of extra writing past the end of the match occurs when
	// offset == 1 and length == 1; the last copy will read from byte positions
	// [0..7] and write to [4..11], whereas it was only supposed to write to
	// position 1. Thus, ten excess bytes.
	//
	// ----
	//
	// That "10 byte overrun" worst case is confirmed by Go's
	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
	// and finishSlowForwardCopy algorithm.
	//
	// if length > len(dst)-d-10 {
	//   goto verySlowForwardCopy
	// }
	SUBQ $10, R14
	CMPQ CX, R14
	JGT  verySlowForwardCopy

makeOffsetAtLeast8:
	// !!! As above, expand the pattern so that offset >= 8 and we can use
	// 8-byte load/stores.
	//
	// for offset < 8 {
	//   copy 8 bytes from dst[d-offset:] to dst[d:]
	//   length -= offset
	//   d      += offset
	//   offset += offset
	//   // The two previous lines together mean that d-offset, and therefore
	//   // R15, is unchanged.
	// }
	CMPQ DX, $8
	JGE  fixUpSlowForwardCopy
	MOVQ (R15), BX
	MOVQ BX, (DI)
	SUBQ DX, CX
	ADDQ DX, DI
	ADDQ DX, DX
	JMP  makeOffsetAtLeast8

fixUpSlowForwardCopy:
	// !!! Add length (which might be negative now) to d (implied by DI being
	// &dst[d]) so that d ends up at the right place when we jump back to the
	// top of the loop. Before we do that, though, we save DI to AX so that, if
	// length is positive, copying the remaining length bytes will write to the
	// right place.
	MOVQ DI, AX
	ADDQ CX, DI

finishSlowForwardCopy:
	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
	// length means that we overrun, but as above, that will be fixed up by
	// subsequent iterations of the outermost loop.
	CMPQ CX, $0
	JLE  loop
	MOVQ (R15), BX
	MOVQ BX, (AX)
	ADDQ $8, R15
	ADDQ $8, AX
	SUBQ $8, CX
	JMP  finishSlowForwardCopy

verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//   dst[d] = dst[d - offset]
	//   d++
	//   length--
	//   if length == 0 {
	//     break
	//   }
	// }
	MOVB (R15), BX
	MOVB BX, (DI)
	INCQ R15
	INCQ DI
	DECQ CX
	JNZ  verySlowForwardCopy
	JMP  loop

// The code above handles copy tags.
// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMPQ DI, R10
	JNE  errCorrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET