// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".

// ----------------------------------------------------------------------------

// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
//	- R3	len(lit)
//	- R4	n
//	- R6	return value
//	- R8	&dst[i]
//	- R10	&lit[0]
//
// The 32 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
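//
// As a rough Go sketch of the tag bytes the branches below emit (the literal
// bytes themselves are then copied by runtime·memmove), assuming the usual
// snappy tag values where a literal tag has 0x00 in the low two bits:
//
//	n := len(lit) - 1
//	switch {
//	case n < 60:
//		dst[0] = uint8(n) << 2 // oneByte
//		i = 1
//	case n < 1<<8:
//		dst[0], dst[1] = 60<<2, uint8(n) // twoBytes: 0xf0
//		i = 2
//	default:
//		dst[0], dst[1], dst[2] = 61<<2, uint8(n), uint8(n>>8) // threeBytes: 0xf4
//		i = 3
//	}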
TEXT ·emitLiteral(SB), NOSPLIT, $32-56
	MOVD dst_base+0(FP), R8
	MOVD lit_base+24(FP), R10
	MOVD lit_len+32(FP), R3
	MOVD R3, R6
	MOVW R3, R4
	SUBW $1, R4, R4

	CMPW $60, R4
	BLT  oneByte
	CMPW $256, R4
	BLT  twoBytes

threeBytes:
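	// The MOVW below stores 4 bytes even though only the 2-byte length is
	// needed; the two extra bytes are harmless, as the literal bytes copied by
	// runtime·memmove immediately overwrite them (len(lit) >= 257 here).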
	MOVD $0xf4, R2
	MOVB R2, 0(R8)
	MOVW R4, 1(R8)
	ADD  $3, R8, R8
	ADD  $3, R6, R6
	B    memmove

twoBytes:
	MOVD $0xf0, R2
	MOVB R2, 0(R8)
	MOVB R4, 1(R8)
	ADD  $2, R8, R8
	ADD  $2, R6, R6
	B    memmove

oneByte:
	LSLW $2, R4, R4
	MOVB R4, 0(R8)
	ADD  $1, R8, R8
	ADD  $1, R6, R6

memmove:
	MOVD R6, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// R8, R10 and R3 as arguments.
	MOVD R8, 8(RSP)
	MOVD R10, 16(RSP)
	MOVD R3, 24(RSP)
	CALL runtime·memmove(SB)
	RET

// ----------------------------------------------------------------------------

// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
//	- R3	length
//	- R7	&dst[0]
//	- R8	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
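//
// As a rough Go sketch of the encoding emitted below (cf. encode_other.go),
// assuming the usual snappy tag values tagCopy1 = 0x01 and tagCopy2 = 0x02:
//
//	for length >= 68 {
//		dst[i+0] = 63<<2 | tagCopy2 // 0xfe: a length 64 copy, 3 bytes
//		dst[i+1], dst[i+2] = uint8(offset), uint8(offset>>8)
//		i, length = i+3, length-64
//	}
//	if length > 64 {
//		dst[i+0] = 59<<2 | tagCopy2 // 0xee: a length 60 copy, 3 bytes
//		dst[i+1], dst[i+2] = uint8(offset), uint8(offset>>8)
//		i, length = i+3, length-60
//	}
//	if length >= 12 || offset >= 2048 {
//		dst[i+0] = uint8(length-1)<<2 | tagCopy2
//		dst[i+1], dst[i+2] = uint8(offset), uint8(offset>>8)
//		return i + 3
//	}
//	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
//	dst[i+1] = uint8(offset)
//	return i + 2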
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVD dst_base+0(FP), R8
	MOVD R8, R7
	MOVD offset+24(FP), R11
	MOVD length+32(FP), R3

loop0:
	// for length >= 68 { etc }
	CMPW $68, R3
	BLT  step1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVD $0xfe, R2
	MOVB R2, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8
	SUB  $64, R3, R3
	B    loop0

step1:
	// if length > 64 { etc }
	CMP $64, R3
	BLE step2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVD $0xee, R2
	MOVB R2, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8
	SUB  $60, R3, R3

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMP  $12, R3
	BGE  step3
	CMPW $2048, R11
	BGE  step3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(R8)
	LSRW $3, R11, R11
	AND  $0xe0, R11, R11
	SUB  $4, R3, R3
	LSLW $2, R3, R3
	AND  $0xff, R3, R3
	ORRW R3, R11, R11
	ORRW $1, R11, R11
	MOVB R11, 0(R8)
	ADD  $2, R8, R8

	// Return the number of bytes written.
	SUB  R7, R8, R8
	MOVD R8, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUB  $1, R3, R3
	AND  $0xff, R3, R3
	LSLW $2, R3, R3
	ORRW $2, R3, R3
	MOVB R3, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8

	// Return the number of bytes written.
	SUB  R7, R8, R8
	MOVD R8, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
//	- R6	&src[0]
//	- R7	&src[j]
//	- R13	&src[len(src) - 8]
//	- R14	&src[len(src)]
//	- R15	&src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
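//
// A rough Go sketch of what this implements (cf. encode_other.go):
//
//	func extendMatch(src []byte, i, j int) int {
//		for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
//		}
//		return j
//	}
//
// The asm version compares 8 bytes at a time while at least 8 bytes remain,
// and falls back to byte-at-a-time comparisons in src's tail.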
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVD src_base+0(FP), R6
	MOVD src_len+8(FP), R14
	MOVD i+24(FP), R15
	MOVD j+32(FP), R7
	ADD  R6, R14, R14
	ADD  R6, R15, R15
	ADD  R6, R7, R7
	MOVD R14, R13
	SUB  $8, R13, R13

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMP  R13, R7
	BHI  cmp1
	MOVD (R15), R3
	MOVD (R7), R4
	CMP  R4, R3
	BNE  bsf
	ADD  $8, R15, R15
	ADD  $8, R7, R7
	B    cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs.
	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
	// combination of which finds the least significant bit which is set.
	// The arm64 architecture is little-endian, and the shift by 3 converts
	// a bit index to a byte index.
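	// For example, if the two loaded 8-byte words first differ in byte 2 (bits
	// 16..23 of the XOR result), RBIT+CLZ yields a bit index in 16..23, and
	// shifting right by 3 gives the byte index 2, which is added to &src[j].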
	EOR  R3, R4, R4
	RBIT R4, R4
	CLZ  R4, R4
	ADD  R4>>3, R7, R7

	// Convert from &src[ret] to ret.
	SUB  R6, R7, R7
	MOVD R7, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMP  R7, R14
	BLS  extendMatchEnd
	MOVB (R15), R3
	MOVB (R7), R4
	CMP  R4, R3
	BNE  extendMatchEnd
	ADD  $1, R15, R15
	ADD  $1, R7, R7
	B    cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUB  R6, R7, R7
	MOVD R7, ret+40(FP)
	RET

// ----------------------------------------------------------------------------

// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- R3	.	.
//	- R4	.	.
//	- R5	64	shift
//	- R6	72	&src[0], tableSize
//	- R7	80	&src[s]
//	- R8	88	&dst[d]
//	- R9	96	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	104	prevHash, currHash, nextHash, offset
//	- R12	112	&src[base], skip
//	- R13	.	&src[nextS], &src[len(src) - 8]
//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
//	- R15	120	candidate
//	- R16	.	hash constant, 0x1e35a7bd
//	- R17	.	&table
//	- .  	128	table
//
// The second column (64, 72, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 64 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 64 + 64 = 32896.
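//
// A rough Go sketch of the hash function used throughout the loop below
// (cf. encode_other.go), with R16 holding the multiplicative constant and R5
// holding shift:
//
//	func hash(u, shift uint32) uint32 {
//		return (u * 0x1e35a7bd) >> shift
//	}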
TEXT ·encodeBlock(SB), 0, $32896-56
	MOVD dst_base+0(FP), R8
	MOVD src_base+24(FP), R7
	MOVD src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVD  $24, R5
	MOVD  $256, R6
	MOVW  $0xa7bd, R16
	MOVKW $(0x1e35<<16), R16

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	MOVD $16384, R2
	CMP  R2, R6
	BGE  varTable
	CMP  R14, R6
	BGE  varTable
	SUB  $1, R5, R5
	LSL  $1, R6, R6
	B    calcShift

varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each
	// iteration writes 64 bytes, so we can do only tableSize/32 writes
	// instead of the 2048 writes that would zero-initialize all of table's
	// 32768 bytes. This clear could overrun the first tableSize elements, but
	// it won't overrun the allocated stack size.
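	//
	// In the memclr loop below, the post-indexed STP.P stores 16 zero bytes at
	// R4 and then advances R4 by 64; the next three STPs fill in the rest of
	// that 64-byte block at negative offsets from the updated R4. R6 is
	// &table[tableSize], the end of the region to clear, and the loop repeats
	// while R4 is still below it.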
	ADD  $128, RSP, R17
	MOVD R17, R4

	// !!! R6 = &table[tableSize]
	ADD R6<<1, R17, R6

memclr:
	STP.P (ZR, ZR), 64(R4)
	STP   (ZR, ZR), -48(R4)
	STP   (ZR, ZR), -32(R4)
	STP   (ZR, ZR), -16(R4)
	CMP   R4, R6
	BHI   memclr

	// !!! R6 = &src[0]
	MOVD R7, R6

	// sLimit := len(src) - inputMargin
	MOVD R14, R9
	SUB  $15, R9, R9

	// !!! Pre-emptively spill R5, R6 and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVD R5, 64(RSP)
	MOVD R6, 72(RSP)
	MOVD R9, 96(RSP)

	// nextEmit := 0
	MOVD R6, R10

	// s := 1
	ADD $1, R7, R7

	// nextHash := hash(load32(src, s), shift)
	MOVW 0(R7), R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

outer:
	// for { etc }

	// skip := 32
	MOVD $32, R12

	// nextS := s
	MOVD R7, R13

	// candidate := 0
	MOVD $0, R15

inner0:
	// for { etc }

	// s := nextS
	MOVD R13, R7

	// bytesBetweenHashLookups := skip >> 5
	MOVD R12, R14
	LSR  $5, R14, R14

	// nextS = s + bytesBetweenHashLookups
	ADD R14, R13, R13

	// skip += bytesBetweenHashLookups
	ADD R14, R12, R12

	// if nextS > sLimit { goto emitRemainder }
	MOVD R13, R3
	SUB  R6, R3, R3
	CMP  R9, R3
	BHI  emitRemainder

	// candidate = int(table[nextHash])
	MOVHU 0(R17)(R11<<1), R15

	// table[nextHash] = uint16(s)
	MOVD R7, R3
	SUB  R6, R3, R3

	MOVH R3, 0(R17)(R11<<1)

	// nextHash = hash(load32(src, nextS), shift)
	MOVW 0(R13), R11
	MULW R16, R11
	LSRW R5, R11, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVW 0(R7), R3
	MOVW (R6)(R15), R4
	CMPW R4, R3
	BNE  inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	MOVD R7, R3
	SUB  R10, R3, R3
	CMP  $16, R3
	BLE  emitLiteralFastPath

	// ----------------------------------------
	// Begin inline of the emitLiteral call.
	//
	// d += emitLiteral(dst[d:], src[nextEmit:s])

	MOVW R3, R4
	SUBW $1, R4, R4

	MOVW $60, R2
	CMPW R2, R4
	BLT  inlineEmitLiteralOneByte
	MOVW $256, R2
	CMPW R2, R4
	BLT  inlineEmitLiteralTwoBytes

inlineEmitLiteralThreeBytes:
	MOVD $0xf4, R1
	MOVB R1, 0(R8)
	MOVW R4, 1(R8)
	ADD  $3, R8, R8
	B    inlineEmitLiteralMemmove

inlineEmitLiteralTwoBytes:
	MOVD $0xf0, R1
	MOVB R1, 0(R8)
	MOVB R4, 1(R8)
	ADD  $2, R8, R8
	B    inlineEmitLiteralMemmove

inlineEmitLiteralOneByte:
	LSLW $2, R4, R4
	MOVB R4, 0(R8)
	ADD  $1, R8, R8

inlineEmitLiteralMemmove:
	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// R8, R10 and R3 as arguments.
	MOVD R8, 8(RSP)
	MOVD R10, 16(RSP)
	MOVD R3, 24(RSP)

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADD   R3, R8, R8
	MOVD  R7, 80(RSP)
	MOVD  R8, 88(RSP)
	MOVD  R15, 120(RSP)
	CALL  runtime·memmove(SB)
	MOVD  64(RSP), R5
	MOVD  72(RSP), R6
	MOVD  80(RSP), R7
	MOVD  88(RSP), R8
	MOVD  96(RSP), R9
	MOVD  120(RSP), R15
	ADD   $128, RSP, R17
	MOVW  $0xa7bd, R16
	MOVKW $(0x1e35<<16), R16
	B     inner1

inlineEmitLiteralEnd:
	// End inline of the emitLiteral call.
	// ----------------------------------------

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB R3, R4
	SUBW $1, R4, R4
	AND  $0xff, R4, R4
	LSLW $2, R4, R4
	MOVB R4, (R8)
	ADD  $1, R8, R8

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	LDP 0(R10), (R0, R1)
	STP (R0, R1), 0(R8)
	ADD R3, R8, R8

inner1:
	// for { etc }

	// base := s
	MOVD R7, R12

	// !!! offset := base - candidate
	MOVD R12, R11
	SUB  R15, R11, R11
	SUB  R6, R11, R11

	// ----------------------------------------
	// Begin inline of the extendMatch call.
	//
	// s = extendMatch(src, candidate+4, s+4)

	// !!! R14 = &src[len(src)]
	MOVD src_len+32(FP), R14
	ADD  R6, R14, R14

	// !!! R13 = &src[len(src) - 8]
	MOVD R14, R13
	SUB  $8, R13, R13

	// !!! R15 = &src[candidate + 4]
	ADD $4, R15, R15
	ADD R6, R15, R15

	// !!! s += 4
	ADD $4, R7, R7

inlineExtendMatchCmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMP  R13, R7
	BHI  inlineExtendMatchCmp1
	MOVD (R15), R3
	MOVD (R7), R4
	CMP  R4, R3
	BNE  inlineExtendMatchBSF
	ADD  $8, R15, R15
	ADD  $8, R7, R7
	B    inlineExtendMatchCmp8

inlineExtendMatchBSF:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs.
	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
	// combination of which finds the least significant bit which is set.
	// The arm64 architecture is little-endian, and the shift by 3 converts
	// a bit index to a byte index.
	EOR  R3, R4, R4
	RBIT R4, R4
	CLZ  R4, R4
	ADD  R4>>3, R7, R7
	B    inlineExtendMatchEnd

inlineExtendMatchCmp1:
	// In src's tail, compare 1 byte at a time.
	CMP  R7, R14
	BLS  inlineExtendMatchEnd
	MOVB (R15), R3
	MOVB (R7), R4
	CMP  R4, R3
	BNE  inlineExtendMatchEnd
	ADD  $1, R15, R15
	ADD  $1, R7, R7
	B    inlineExtendMatchCmp1

inlineExtendMatchEnd:
	// End inline of the extendMatch call.
	// ----------------------------------------

	// ----------------------------------------
	// Begin inline of the emitCopy call.
	//
	// d += emitCopy(dst[d:], base-candidate, s-base)

	// !!! length := s - base
	MOVD R7, R3
	SUB  R12, R3, R3

inlineEmitCopyLoop0:
	// for length >= 68 { etc }
	MOVW $68, R2
	CMPW R2, R3
	BLT  inlineEmitCopyStep1

	// Emit a length 64 copy, encoded as 3 bytes.
	MOVD $0xfe, R1
	MOVB R1, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8
	SUBW $64, R3, R3
	B    inlineEmitCopyLoop0

inlineEmitCopyStep1:
	// if length > 64 { etc }
	MOVW $64, R2
	CMPW R2, R3
	BLE  inlineEmitCopyStep2

	// Emit a length 60 copy, encoded as 3 bytes.
	MOVD $0xee, R1
	MOVB R1, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8
	SUBW $60, R3, R3

inlineEmitCopyStep2:
	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
	MOVW $12, R2
	CMPW R2, R3
	BGE  inlineEmitCopyStep3
	MOVW $2048, R2
	CMPW R2, R11
	BGE  inlineEmitCopyStep3

	// Emit the remaining copy, encoded as 2 bytes.
	MOVB R11, 1(R8)
	LSRW $8, R11, R11
	LSLW $5, R11, R11
	SUBW $4, R3, R3
	AND  $0xff, R3, R3
	LSLW $2, R3, R3
	ORRW R3, R11, R11
	ORRW $1, R11, R11
	MOVB R11, 0(R8)
	ADD  $2, R8, R8
	B    inlineEmitCopyEnd

inlineEmitCopyStep3:
	// Emit the remaining copy, encoded as 3 bytes.
	SUBW $1, R3, R3
	LSLW $2, R3, R3
	ORRW $2, R3, R3
	MOVB R3, 0(R8)
	MOVW R11, 1(R8)
	ADD  $3, R8, R8

inlineEmitCopyEnd:
	// End inline of the emitCopy call.
	// ----------------------------------------

	// nextEmit = s
	MOVD R7, R10

	// if s >= sLimit { goto emitRemainder }
	MOVD R7, R3
	SUB  R6, R3, R3
	CMP  R3, R9
	BLS  emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.
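	//
	// The single 8-byte load below (x := load64(src, s-1)) supplies all three
	// 32-bit values hashed afterwards: on little-endian arm64, uint32(x>>0),
	// uint32(x>>8) and uint32(x>>16) are load32(src, s-1), load32(src, s) and
	// load32(src, s+1) respectively.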

	// x := load64(src, s-1)
	MOVD -1(R7), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVW R14, R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// table[prevHash] = uint16(s-1)
	MOVD R7, R3
	SUB  R6, R3, R3
	SUB  $1, R3, R3

	MOVHU R3, 0(R17)(R11<<1)

	// currHash := hash(uint32(x>>8), shift)
	LSR  $8, R14, R14
	MOVW R14, R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// candidate = int(table[currHash])
	MOVHU 0(R17)(R11<<1), R15

	// table[currHash] = uint16(s)
	ADD   $1, R3, R3
	MOVHU R3, 0(R17)(R11<<1)

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVW (R6)(R15), R4
	CMPW R4, R14
	BEQ  inner1

	// nextHash = hash(uint32(x>>16), shift)
	LSR  $8, R14, R14
	MOVW R14, R11
	MULW R16, R11, R11
	LSRW R5, R11, R11

	// s++
	ADD $1, R7, R7

	// break out of the inner1 for loop, i.e. continue the outer loop.
	B outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVD src_len+32(FP), R3
	ADD  R6, R3, R3
	CMP  R3, R10
	BEQ  encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args.
	MOVD R8, 8(RSP)
	MOVD $0, 16(RSP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVD $0, 24(RSP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVD R10, 32(RSP)
	SUB  R10, R3, R3
	MOVD R3, 40(RSP)
	MOVD R3, 48(RSP)  // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVD R8, 88(RSP)
	CALL ·emitLiteral(SB)
	MOVD 88(RSP), R8

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	MOVD 56(RSP), R1
	ADD  R1, R8, R8

encodeBlockEnd:
	MOVD dst_base+0(FP), R3
	SUB  R3, R8, R8
	MOVD R8, d+48(FP)
	RET