1// +build !appengine
2// +build gc
3// +build !purego
4
5#include "textflag.h"
6
// Register allocation:
// AX	h
// CX	pointer to advance through b
// DX	n
// BX	loop end
// R8	v1, k1
// R9	v2
// R10	v3
// R11	v4
// R12	tmp
// R13	prime1v
// R14	prime2v
// DI	prime4v
//
// NOTE: R15 is deliberately left unused. When assembling with -dynlink the
// assembler uses R15 as a temporary for every global variable access (for
// example the ·prime3v(SB) and ·prime5v(SB) operands in Sum64), so a value
// kept in R15 across such an access would be clobbered.

// round reads from and advances the buffer pointer in CX.
// It assumes that R13 has prime1v and R14 has prime2v.
// It clobbers R12.
#define round(r) \
	MOVQ  (CX), R12 \
	ADDQ  $8, CX    \
	IMULQ R14, R12  \
	ADDQ  R12, r    \
	ROLQ  $31, r    \
	IMULQ R13, r

// mergeRound applies a merge round on the two registers acc and val.
// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
// Both acc and val are modified.
#define mergeRound(acc, val) \
	IMULQ R14, val \
	ROLQ  $31, val \
	IMULQ R13, val \
	XORQ  val, acc \
	IMULQ R13, acc \
	ADDQ  DI, acc

// func Sum64(b []byte) uint64
//
// Sum64 computes a 64-bit digest of b and stores it in ret+24(FP).
// The input is consumed in up to four phases: 32-byte blocks, 8-byte words,
// at most one 4-byte chunk, and finally single bytes, followed by a final
// mixing step.
TEXT ·Sum64(SB), NOSPLIT, $0-32
	// Load fixed primes.
	// prime4v goes in DI rather than R15; see the NOTE in the register
	// allocation table above.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14
	MOVQ ·prime4v(SB), DI

	// Load slice: CX = &b[0], DX = len(b), BX = &b[0] + len(b).
	MOVQ b_base+0(FP), CX
	MOVQ b_len+8(FP), DX
	LEAQ (CX)(DX*1), BX

	// The first loop limit will be len(b)-32.
	SUBQ $32, BX

	// Check whether we have at least one block.
	CMPQ DX, $32
	JLT  noBlocks

	// Set up initial state (v1, v2, v3, v4):
	//   v1 = prime1v + prime2v
	//   v2 = prime2v
	//   v3 = 0
	//   v4 = -prime1v
	MOVQ R13, R8
	ADDQ R14, R8
	MOVQ R14, R9
	XORQ R10, R10
	XORQ R11, R11
	SUBQ R13, R11

	// Loop until CX > BX. Each iteration consumes 32 bytes (4 rounds of 8).
blockLoop:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ CX, BX
	JLE  blockLoop

	// h = rol(v1, 1) + rol(v2, 7) + rol(v3, 12) + rol(v4, 18)
	MOVQ R8, AX
	ROLQ $1, AX
	MOVQ R9, R12
	ROLQ $7, R12
	ADDQ R12, AX
	MOVQ R10, R12
	ROLQ $12, R12
	ADDQ R12, AX
	MOVQ R11, R12
	ROLQ $18, R12
	ADDQ R12, AX

	// Fold each of v1..v4 into h.
	mergeRound(AX, R8)
	mergeRound(AX, R9)
	mergeRound(AX, R10)
	mergeRound(AX, R11)

	JMP afterBlocks

noBlocks:
	// Fewer than 32 bytes of input: h starts at prime5v.
	MOVQ ·prime5v(SB), AX

afterBlocks:
	// h += len(b)
	ADDQ DX, AX

	// Right now BX has len(b)-32, and we want to loop until CX > len(b)-8.
	ADDQ $24, BX

	CMPQ CX, BX
	JG   fourByte

wordLoop:
	// Calculate k1.
	MOVQ  (CX), R8
	ADDQ  $8, CX
	IMULQ R14, R8
	ROLQ  $31, R8
	IMULQ R13, R8

	// h = rol(h ^ k1, 27)*prime1v + prime4v
	XORQ  R8, AX
	ROLQ  $27, AX
	IMULQ R13, AX
	ADDQ  DI, AX

	CMPQ CX, BX
	JLE  wordLoop

fourByte:
	// Move the limit to len(b)-4; process one 4-byte chunk if it fits.
	ADDQ $4, BX
	CMPQ CX, BX
	JG   singles

	// The 32-bit load leaves the upper half of R8 zero.
	MOVL  (CX), R8
	ADDQ  $4, CX
	IMULQ R13, R8
	XORQ  R8, AX

	// h = rol(h, 23)*prime2v + prime3v
	ROLQ  $23, AX
	IMULQ R14, AX
	ADDQ  ·prime3v(SB), AX

singles:
	// Move the limit to len(b); process any remaining bytes one at a time.
	ADDQ $4, BX
	CMPQ CX, BX
	JGE  finalize

singlesLoop:
	MOVBQZX (CX), R12
	ADDQ    $1, CX
	IMULQ   ·prime5v(SB), R12
	XORQ    R12, AX

	// h = rol(h, 11)*prime1v
	ROLQ  $11, AX
	IMULQ R13, AX

	CMPQ CX, BX
	JL   singlesLoop

finalize:
	// Final mixing:
	//   h ^= h >> 33; h *= prime2v
	//   h ^= h >> 29; h *= prime3v
	//   h ^= h >> 32
	MOVQ  AX, R12
	SHRQ  $33, R12
	XORQ  R12, AX
	IMULQ R14, AX
	MOVQ  AX, R12
	SHRQ  $29, R12
	XORQ  R12, AX
	IMULQ ·prime3v(SB), AX
	MOVQ  AX, R12
	SHRQ  $32, R12
	XORQ  R12, AX

	MOVQ AX, ret+24(FP)
	RET
171
// writeBlocks register usage:
// AX	x (pointer to the state; v1..v4 live at offsets 0, 8, 16, 24)
// CX	pointer to advance through b
// DX	len(b)
// BX	loop end, later the length of the returned slice
// R8	v1
// R9	v2
// R10	v3
// R11	v4
// R12	tmp (clobbered by round)
// R13	prime1v
// R14	prime2v

// func writeBlocks(x *xxh, b []byte) []byte
//
// writeBlocks runs the round step over every full 32-byte block of b,
// updating v1..v4 in *x, and returns the unconsumed tail of b (with
// cap == len).
TEXT ·writeBlocks(SB), NOSPLIT, $0-56
	// Fetch the arguments: AX = x, CX = &b[0], DX = len(b).
	MOVQ x+0(FP), AX
	MOVQ b_base+8(FP), CX
	MOVQ b_len+16(FP), DX

	// Default the returned base pointer to b's own base. It is only
	// replaced below when the returned slice is non-empty.
	MOVQ CX, ret_base+32(FP)

	// BX = &b[0] + len(b) - 32: the last address at which a full block
	// can still start.
	LEAQ -32(CX)(DX*1), BX

	// round expects prime1v in R13 and prime2v in R14.
	MOVQ ·prime1v(SB), R13
	MOVQ ·prime2v(SB), R14

	// Pull the running state v1..v4 out of x.
	MOVQ 0(AX), R8
	MOVQ 8(AX), R9
	MOVQ 16(AX), R10
	MOVQ 24(AX), R11

	// There is no up-front length check: the loop body always runs at
	// least once, so callers must pass at least one full block.
consumeBlock:
	round(R8)
	round(R9)
	round(R10)
	round(R11)

	CMPQ CX, BX
	JLE  consumeBlock

	// Write the updated v1..v4 back into x.
	MOVQ R8, 0(AX)
	MOVQ R9, 8(AX)
	MOVQ R10, 16(AX)
	MOVQ R11, 24(AX)

	// Build the result slice.
	// A slice whose base pointer lies one past the end of the original
	// backing array must never be produced; in Go 1.7+ that can crash the
	// runtime (see, e.g., https://github.com/golang/go/issues/16772).
	// So the remaining length is computed first, and when it is zero the
	// base stored earlier (b's original base) is left in place. The
	// compiler does the same thing for
	//   b = b[len(b):]

	// Remaining length = (BX + 32) - CX, i.e. end of b minus current pointer.
	ADDQ $32, BX
	SUBQ CX, BX
	JZ   storeLen

	// Non-empty tail: the result starts at the current pointer.
	MOVQ CX, ret_base+32(FP)

storeLen:
	MOVQ BX, ret_len+40(FP)
	MOVQ BX, ret_cap+48(FP) // cap is set equal to len

	RET
234