1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build !appengine
6// +build gc
7// +build !noasm
8
9#include "textflag.h"
10
11// The asm code generally follows the pure Go code in decode_other.go, except
12// where marked with a "!!!".
13
14// func decode(dst, src []byte) int
15//
16// All local variables fit into registers. The non-zero stack size is only to
17// spill registers and push args when issuing a CALL. The register allocation:
18//	- AX	scratch
19//	- BX	scratch
20//	- CX	length or x
21//	- DX	offset
22//	- SI	&src[s]
23//	- DI	&dst[d]
24//	+ R8	dst_base
25//	+ R9	dst_len
26//	+ R10	dst_base + dst_len
27//	+ R11	src_base
28//	+ R12	src_len
29//	+ R13	src_base + src_len
30//	- R14	used by doCopy
31//	- R15	used by doCopy
32//
33// The registers R8-R13 (marked with a "+") are set at the start of the
34// function, and after a CALL returns, and are not otherwise modified.
35//
36// The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
37// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
38TEXT ·decode(SB), NOSPLIT, $48-56
39	// Initialize SI, DI and R8-R13.
40	MOVQ dst_base+0(FP), R8
41	MOVQ dst_len+8(FP), R9
42	MOVQ R8, DI
43	MOVQ R8, R10
44	ADDQ R9, R10
45	MOVQ src_base+24(FP), R11
46	MOVQ src_len+32(FP), R12
47	MOVQ R11, SI
48	MOVQ R11, R13
49	ADDQ R12, R13
50
51loop:
52	// for s < len(src)
53	CMPQ SI, R13
54	JEQ  end
55
56	// CX = uint32(src[s])
57	//
58	// switch src[s] & 0x03
59	MOVBLZX (SI), CX
60	MOVL    CX, BX
61	ANDL    $3, BX
62	CMPL    BX, $1
63	JAE     tagCopy
64
65	// ----------------------------------------
66	// The code below handles literal tags.
67
68	// case tagLiteral:
69	// x := uint32(src[s] >> 2)
70	// switch
71	SHRL $2, CX
72	CMPL CX, $60
73	JAE  tagLit60Plus
74
75	// case x < 60:
76	// s++
77	INCQ SI
78
79doLit:
80	// This is the end of the inner "switch", when we have a literal tag.
81	//
82	// We assume that CX == x and x fits in a uint32, where x is the variable
83	// used in the pure Go decode_other.go code.
84
85	// length = int(x) + 1
86	//
87	// Unlike the pure Go code, we don't need to check if length <= 0 because
88	// CX can hold 64 bits, so the increment cannot overflow.
89	INCQ CX
90
91	// Prepare to check if copying length bytes will run past the end of dst or
92	// src.
93	//
94	// AX = len(dst) - d
95	// BX = len(src) - s
96	MOVQ R10, AX
97	SUBQ DI, AX
98	MOVQ R13, BX
99	SUBQ SI, BX
100
101	// !!! Try a faster technique for short (16 or fewer bytes) copies.
102	//
103	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
104	//   goto callMemmove // Fall back on calling runtime·memmove.
105	// }
106	//
107	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
108	// against 21 instead of 16, because it cannot assume that all of its input
109	// is contiguous in memory and so it needs to leave enough source bytes to
110	// read the next tag without refilling buffers, but Go's Decode assumes
111	// contiguousness (the src argument is a []byte).
112	CMPQ CX, $16
113	JGT  callMemmove
114	CMPQ AX, $16
115	JLT  callMemmove
116	CMPQ BX, $16
117	JLT  callMemmove
118
119	// !!! Implement the copy from src to dst as a 16-byte load and store.
120	// (Decode's documentation says that dst and src must not overlap.)
121	//
122	// This always copies 16 bytes, instead of only length bytes, but that's
123	// OK. If the input is a valid Snappy encoding then subsequent iterations
124	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
125	// non-nil error), so the overrun will be ignored.
126	//
127	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
128	// 16-byte loads and stores. This technique probably wouldn't be as
129	// effective on architectures that are fussier about alignment.
130	MOVOU 0(SI), X0
131	MOVOU X0, 0(DI)
132
133	// d += length
134	// s += length
135	ADDQ CX, DI
136	ADDQ CX, SI
137	JMP  loop
138
139callMemmove:
140	// if length > len(dst)-d || length > len(src)-s { etc }
141	CMPQ CX, AX
142	JGT  errCorrupt
143	CMPQ CX, BX
144	JGT  errCorrupt
145
146	// copy(dst[d:], src[s:s+length])
147	//
148	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
149	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
150	// three registers to the stack, to save local variables across the CALL.
151	MOVQ DI, 0(SP)
152	MOVQ SI, 8(SP)
153	MOVQ CX, 16(SP)
154	MOVQ DI, 24(SP)
155	MOVQ SI, 32(SP)
156	MOVQ CX, 40(SP)
157	CALL runtime·memmove(SB)
158
159	// Restore local variables: unspill registers from the stack and
160	// re-calculate R8-R13.
161	MOVQ 24(SP), DI
162	MOVQ 32(SP), SI
163	MOVQ 40(SP), CX
164	MOVQ dst_base+0(FP), R8
165	MOVQ dst_len+8(FP), R9
166	MOVQ R8, R10
167	ADDQ R9, R10
168	MOVQ src_base+24(FP), R11
169	MOVQ src_len+32(FP), R12
170	MOVQ R11, R13
171	ADDQ R12, R13
172
173	// d += length
174	// s += length
175	ADDQ CX, DI
176	ADDQ CX, SI
177	JMP  loop
178
179tagLit60Plus:
180	// !!! This fragment does the
181	//
182	// s += x - 58; if uint(s) > uint(len(src)) { etc }
183	//
184	// checks. In the asm version, we code it once instead of once per switch case.
185	ADDQ CX, SI
186	SUBQ $58, SI
187	CMPQ SI, R13
188	JA   errCorrupt
189
190	// case x == 60:
191	CMPL CX, $61
192	JEQ  tagLit61
193	JA   tagLit62Plus
194
195	// x = uint32(src[s-1])
196	MOVBLZX -1(SI), CX
197	JMP     doLit
198
199tagLit61:
200	// case x == 61:
201	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
202	MOVWLZX -2(SI), CX
203	JMP     doLit
204
205tagLit62Plus:
206	CMPL CX, $62
207	JA   tagLit63
208
209	// case x == 62:
210	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
211	MOVWLZX -3(SI), CX
212	MOVBLZX -1(SI), BX
213	SHLL    $16, BX
214	ORL     BX, CX
215	JMP     doLit
216
217tagLit63:
218	// case x == 63:
219	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
220	MOVL -4(SI), CX
221	JMP  doLit
222
223// The code above handles literal tags.
224// ----------------------------------------
225// The code below handles copy tags.
226
227tagCopy4:
228	// case tagCopy4:
229	// s += 5
230	ADDQ $5, SI
231
232	// if uint(s) > uint(len(src)) { etc }
233	CMPQ SI, R13
234	JA   errCorrupt
235
236	// length = 1 + int(src[s-5])>>2
237	SHRQ $2, CX
238	INCQ CX
239
240	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
241	MOVLQZX -4(SI), DX
242	JMP     doCopy
243
244tagCopy2:
245	// case tagCopy2:
246	// s += 3
247	ADDQ $3, SI
248
249	// if uint(s) > uint(len(src)) { etc }
250	CMPQ SI, R13
251	JA   errCorrupt
252
253	// length = 1 + int(src[s-3])>>2
254	SHRQ $2, CX
255	INCQ CX
256
257	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
258	MOVWQZX -2(SI), DX
259	JMP     doCopy
260
261tagCopy:
262	// We have a copy tag. We assume that:
263	//	- BX == src[s] & 0x03
264	//	- CX == src[s]
265	CMPQ BX, $2
266	JEQ  tagCopy2
267	JA   tagCopy4
268
269	// case tagCopy1:
270	// s += 2
271	ADDQ $2, SI
272
273	// if uint(s) > uint(len(src)) { etc }
274	CMPQ SI, R13
275	JA   errCorrupt
276
277	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
278	MOVQ    CX, DX
279	ANDQ    $0xe0, DX
280	SHLQ    $3, DX
281	MOVBQZX -1(SI), BX
282	ORQ     BX, DX
283
284	// length = 4 + int(src[s-2])>>2&0x7
285	SHRQ $2, CX
286	ANDQ $7, CX
287	ADDQ $4, CX
288
289doCopy:
290	// This is the end of the outer "switch", when we have a copy tag.
291	//
292	// We assume that:
293	//	- CX == length && CX > 0
294	//	- DX == offset
295
296	// if offset <= 0 { etc }
297	CMPQ DX, $0
298	JLE  errCorrupt
299
300	// if d < offset { etc }
301	MOVQ DI, BX
302	SUBQ R8, BX
303	CMPQ BX, DX
304	JLT  errCorrupt
305
306	// if length > len(dst)-d { etc }
307	MOVQ R10, BX
308	SUBQ DI, BX
309	CMPQ CX, BX
310	JGT  errCorrupt
311
312	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
313	//
314	// Set:
315	//	- R14 = len(dst)-d
316	//	- R15 = &dst[d-offset]
317	MOVQ R10, R14
318	SUBQ DI, R14
319	MOVQ DI, R15
320	SUBQ DX, R15
321
322	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
323	//
324	// First, try using two 8-byte load/stores, similar to the doLit technique
325	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
326	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
327	// and not one 16-byte load/store, and the first store has to be before the
328	// second load, due to the overlap if offset is in the range [8, 16).
329	//
330	// if length > 16 || offset < 8 || len(dst)-d < 16 {
331	//   goto slowForwardCopy
332	// }
333	// copy 16 bytes
334	// d += length
335	CMPQ CX, $16
336	JGT  slowForwardCopy
337	CMPQ DX, $8
338	JLT  slowForwardCopy
339	CMPQ R14, $16
340	JLT  slowForwardCopy
341	MOVQ 0(R15), AX
342	MOVQ AX, 0(DI)
343	MOVQ 8(R15), BX
344	MOVQ BX, 8(DI)
345	ADDQ CX, DI
346	JMP  loop
347
348slowForwardCopy:
349	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
350	// can still try 8-byte load stores, provided we can overrun up to 10 extra
351	// bytes. As above, the overrun will be fixed up by subsequent iterations
352	// of the outermost loop.
353	//
354	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
355	// commentary says:
356	//
357	// ----
358	//
359	// The main part of this loop is a simple copy of eight bytes at a time
360	// until we've copied (at least) the requested amount of bytes.  However,
361	// if d and d-offset are less than eight bytes apart (indicating a
362	// repeating pattern of length < 8), we first need to expand the pattern in
363	// order to get the correct results. For instance, if the buffer looks like
364	// this, with the eight-byte <d-offset> and <d> patterns marked as
365	// intervals:
366	//
367	//    abxxxxxxxxxxxx
368	//    [------]           d-offset
369	//      [------]         d
370	//
371	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
372	// once, after which we can move <d> two bytes without moving <d-offset>:
373	//
374	//    ababxxxxxxxxxx
375	//    [------]           d-offset
376	//        [------]       d
377	//
378	// and repeat the exercise until the two no longer overlap.
379	//
380	// This allows us to do very well in the special case of one single byte
381	// repeated many times, without taking a big hit for more general cases.
382	//
383	// The worst case of extra writing past the end of the match occurs when
384	// offset == 1 and length == 1; the last copy will read from byte positions
385	// [0..7] and write to [4..11], whereas it was only supposed to write to
386	// position 1. Thus, ten excess bytes.
387	//
388	// ----
389	//
390	// That "10 byte overrun" worst case is confirmed by Go's
391	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
392	// and finishSlowForwardCopy algorithm.
393	//
394	// if length > len(dst)-d-10 {
395	//   goto verySlowForwardCopy
396	// }
397	SUBQ $10, R14
398	CMPQ CX, R14
399	JGT  verySlowForwardCopy
400
401makeOffsetAtLeast8:
402	// !!! As above, expand the pattern so that offset >= 8 and we can use
403	// 8-byte load/stores.
404	//
405	// for offset < 8 {
406	//   copy 8 bytes from dst[d-offset:] to dst[d:]
407	//   length -= offset
408	//   d      += offset
409	//   offset += offset
410	//   // The two previous lines together means that d-offset, and therefore
411	//   // R15, is unchanged.
412	// }
413	CMPQ DX, $8
414	JGE  fixUpSlowForwardCopy
415	MOVQ (R15), BX
416	MOVQ BX, (DI)
417	SUBQ DX, CX
418	ADDQ DX, DI
419	ADDQ DX, DX
420	JMP  makeOffsetAtLeast8
421
422fixUpSlowForwardCopy:
423	// !!! Add length (which might be negative now) to d (implied by DI being
424	// &dst[d]) so that d ends up at the right place when we jump back to the
425	// top of the loop. Before we do that, though, we save DI to AX so that, if
426	// length is positive, copying the remaining length bytes will write to the
427	// right place.
428	MOVQ DI, AX
429	ADDQ CX, DI
430
431finishSlowForwardCopy:
432	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
433	// length means that we overrun, but as above, that will be fixed up by
434	// subsequent iterations of the outermost loop.
435	CMPQ CX, $0
436	JLE  loop
437	MOVQ (R15), BX
438	MOVQ BX, (AX)
439	ADDQ $8, R15
440	ADDQ $8, AX
441	SUBQ $8, CX
442	JMP  finishSlowForwardCopy
443
444verySlowForwardCopy:
445	// verySlowForwardCopy is a simple implementation of forward copy. In C
446	// parlance, this is a do/while loop instead of a while loop, since we know
447	// that length > 0. In Go syntax:
448	//
449	// for {
450	//   dst[d] = dst[d - offset]
451	//   d++
452	//   length--
453	//   if length == 0 {
454	//     break
455	//   }
456	// }
457	MOVB (R15), BX
458	MOVB BX, (DI)
459	INCQ R15
460	INCQ DI
461	DECQ CX
462	JNZ  verySlowForwardCopy
463	JMP  loop
464
465// The code above handles copy tags.
466// ----------------------------------------
467
468end:
469	// This is the end of the "for s < len(src)".
470	//
471	// if d != len(dst) { etc }
472	CMPQ DI, R10
473	JNE  errCorrupt
474
475	// return 0
476	MOVQ $0, ret+48(FP)
477	RET
478
479errCorrupt:
480	// return decodeErrCodeCorrupt
481	MOVQ $1, ret+48(FP)
482	RET
483