// +build amd64,!appengine,!gccgo

// This is a translation of the gcc output of FloodyBerry's pure-C public
// domain siphash implementation at https://github.com/floodyberry/siphash

// This assembly code has been modified from the 64-bit output to produce the
// experimental 128-bit output.

// Register roles while the message is absorbed (setup, loopBody, switch):
//   R9  = v0
//   AX  = v1
//   BX  = v2
//   CX  = v3
//   SI  = base pointer of b
//   DI  = byte offset into b (k1 is parked here during setup)
//   R8  = current 8-byte message word
//   R10 = len(b) rounded down to a multiple of 8
//   R11 = final block: len(b)<<56 ORed with the 0..7 trailing bytes
//   DX  = len(b), later the trailing byte count, then a byte scratch
//
// Register roles from afterSwitch to the end:
// SI = v0
// AX = v1
// CX = v2
// DX = v3
// BX = scratch used to fold the state into each result word

// func Hash128(k0, k1 uint64, b []byte) (r0 uint64, r1 uint64)
//
// Frame is $0-56: no locals, 40 bytes of arguments (k0 at +0, k1 at +8,
// b's pointer at +16 and length at +24; the capacity at +32 is never read)
// followed by two 8-byte results at +40 and +48.  Flag 4 is NOSPLIT.
// The routine makes no calls and touches no memory other than b and its
// own result slots.
TEXT	·Hash128(SB),4,$0-56
	// State setup: each key half is xored into two of the four constants.
	// v0 = k0 ^ 0x736F..., v2 = k0 ^ 0x6C79...,
	// v1 = k1 ^ 0x646F... ^ 0xEE, v3 = k1 ^ 0x7465...
	MOVQ	k0+0(FP),CX
	MOVQ	$0x736F6D6570736575,R9
	MOVQ	k1+8(FP),DI
	MOVQ	$0x6C7967656E657261,BX
	MOVQ	$0x646F72616E646F6D,AX
	MOVQ	b_len+24(FP),DX
	XORQ	$0xEE,AX
	MOVQ	DX,R11
	MOVQ	DX,R10
	XORQ	CX,R9
	XORQ	CX,BX
	MOVQ	$0x7465646279746573,CX
	XORQ	DI,AX
	XORQ	DI,CX
	// R11 = len(b) << 56: the length lands in the top byte of the final block.
	SHLQ	$0x38,R11
	// DI is done holding k1; from here on it is the byte offset into b.
	XORQ	DI,DI
	MOVQ	b_base+16(FP),SI
	// R10 = number of bytes covered by whole 8-byte words.  ANDQ sets ZF, so
	// inputs shorter than 8 bytes skip the loop entirely.
	ANDQ	$0xFFFFFFFFFFFFFFF8,R10
	JE	afterLoop
	// Exchanging AX with itself changes nothing; presumably alignment
	// padding carried over from the gcc output.
	XCHGQ	AX,AX
loopBody:
	// Per word m = b[DI:DI+8] (loaded as one 64-bit value):
	// v3 ^= m, two mixing rounds, v0 ^= m.  The instructions of the first
	// round are interleaved with the load bookkeeping as gcc scheduled them.
	MOVQ	0(SI)(DI*1),R8
	ADDQ	AX,R9
	RORQ	$0x33,AX
	XORQ	R9,AX
	RORQ	$0x20,R9
	ADDQ	$0x8,DI
	XORQ	R8,CX
	ADDQ	CX,BX
	RORQ	$0x30,CX
	XORQ	BX,CX
	ADDQ	AX,BX
	RORQ	$0x2F,AX
	ADDQ	CX,R9
	RORQ	$0x2B,CX
	XORQ	BX,AX
	XORQ	R9,CX
	RORQ	$0x20,BX
	// Second mixing round for this word.
	ADDQ	AX,R9
	ADDQ	CX,BX
	RORQ	$0x33,AX
	RORQ	$0x30,CX
	XORQ	R9,AX
	XORQ	BX,CX
	RORQ	$0x20,R9
	ADDQ	AX,BX
	ADDQ	CX,R9
	RORQ	$0x2F,AX
	RORQ	$0x2B,CX
	XORQ	BX,AX
	RORQ	$0x20,BX
	XORQ	R9,CX
	XORQ	R8,R9
	// Loop while R10 > DI (unsigned), i.e. whole words remain.
	CMPQ	R10,DI
	JA	loopBody
afterLoop:
	// DX = len(b) - R10 = number of trailing bytes, always 0..7.
	SUBQ	R10,DX

	// Range check kept from the generated code; since DX <= 7 here this
	// branch is never taken.
	CMPQ	DX,$0x7
	JA	afterSwitch

	// no support for jump tables

	CMPQ	DX,$0x7
	JE	sw7

	CMPQ	DX,$0x6
	JE	sw6

	CMPQ	DX,$0x5
	JE	sw5

	CMPQ	DX,$0x4
	JE	sw4

	CMPQ	DX,$0x3
	JE	sw3

	CMPQ	DX,$0x2
	JE	sw2

	CMPQ	DX,$0x1
	JE	sw1

	JMP	afterSwitch

	// Fall-through chain: entering at swN ORs trailing bytes N-1 .. 0 into
	// R11, byte i at bit position 8*i.  DI still points just past the last
	// whole word.  DX is reused as the byte scratch once the dispatch is done.
sw7:	MOVBQZX	6(SI)(DI*1),DX
	SHLQ	$0x30,DX
	ORQ	DX,R11
sw6:	MOVBQZX	0x5(SI)(DI*1),DX
	SHLQ	$0x28,DX
	ORQ	DX,R11
sw5:	MOVBQZX	0x4(SI)(DI*1),DX
	SHLQ	$0x20,DX
	ORQ	DX,R11
sw4:	MOVBQZX	0x3(SI)(DI*1),DX
	SHLQ	$0x18,DX
	ORQ	DX,R11
sw3:	MOVBQZX	0x2(SI)(DI*1),DX
	SHLQ	$0x10,DX
	ORQ	DX,R11
sw2:	MOVBQZX	0x1(SI)(DI*1),DX
	SHLQ	$0x8,DX
	ORQ	DX,R11
sw1:	MOVBQZX	0(SI)(DI*1),DX
	ORQ	DX,R11
afterSwitch:
	// Absorb the final block in R11: v3 ^= R11, two mixing rounds,
	// v0 ^= R11.  The state also moves to its second register assignment
	// here: the LEAQs compute the first additions of the round directly
	// into the new homes (SI = v0 + v1, CX = v2 + v1), and v3 is copied
	// from CX to DX.
	LEAQ	(AX)(R9*1),SI
	XORQ	R11,CX
	RORQ	$0x33,AX
	ADDQ	CX,BX
	MOVQ	CX,DX
	XORQ	SI,AX
	RORQ	$0x30,DX
	RORQ	$0x20,SI
	LEAQ	0(BX)(AX*1),CX
	XORQ	BX,DX
	RORQ	$0x2F,AX
	ADDQ	DX,SI
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX
	// Second mixing round for the final block.
	ADDQ	AX,SI
	RORQ	$0x33,AX
	ADDQ	DX,CX
	XORQ	SI,AX
	RORQ	$0x30,DX
	RORQ	$0x20,SI
	XORQ	CX,DX
	ADDQ	AX,CX
	RORQ	$0x2F,AX
	ADDQ	DX,SI
	XORQ	CX,AX
	RORQ	$0x2B,DX
	RORQ	$0x20,CX
	XORQ	SI,DX
	// v0 ^= final block; v2 ^= 0xEE (only the low byte of CX changes).
	XORQ	R11,SI
	XORB	$0xEE,CL
	// First finalization: four mixing rounds.  Round 1 of 4.
	ADDQ	AX,SI
	RORQ	$0x33,AX
	ADDQ	DX,CX
	RORQ	$0x30,DX
	XORQ	SI,AX
	XORQ	CX,DX
	RORQ	$0x20,SI
	ADDQ	AX,CX
	ADDQ	DX,SI
	RORQ	$0x2F,AX
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX
	// Round 2 of 4.
	ADDQ	AX,SI
	ADDQ	DX,CX
	RORQ	$0x33,AX
	RORQ	$0x30,DX
	XORQ	SI,AX
	RORQ	$0x20,SI
	XORQ	CX,DX
	ADDQ	AX,CX
	RORQ	$0x2F,AX
	ADDQ	DX,SI
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX
	// Round 3 of 4.
	ADDQ	AX,SI
	ADDQ	DX,CX
	RORQ	$0x33,AX
	RORQ	$0x30,DX
	XORQ	CX,DX
	XORQ	SI,AX
	RORQ	$0x20,SI
	ADDQ	DX,SI
	ADDQ	AX,CX
	RORQ	$0x2F,AX
	XORQ	CX,AX
	RORQ	$0x2B,DX
	RORQ	$0x20,CX
	XORQ	SI,DX

	// gcc optimized the tail end of this function differently.  However,
	// we need to preserve our registers to carry out the second stage of
	// the finalization.  This is a duplicate of an earlier finalization
	// round.  (Round 4 of 4.)

	ADDQ	AX,SI
	RORQ	$0x33,AX
	ADDQ	DX,CX
	RORQ	$0x30,DX
	XORQ	SI,AX
	XORQ	CX,DX
	RORQ	$0x20,SI
	ADDQ	AX,CX
	ADDQ	DX,SI
	RORQ	$0x2F,AX
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX

	// Stuff the result into BX instead of AX as gcc had done, so that the
	// four state registers stay intact for the second stage.
	// First result word = v0 ^ v1 ^ v2 ^ v3, stored in the result slot at +40.

	MOVQ	SI,BX
	XORQ	AX,BX
	XORQ	DX,BX
	XORQ	CX,BX
	MOVQ	BX,ret+40(FP)

	// Start the second finalization round: v1 ^= 0xDD (low byte of AX only),
	// then four more mixing rounds.  Round 1 of 4.

	XORB	$0xDD,AL
	ADDQ	AX,SI
	RORQ	$0x33,AX
	ADDQ	DX,CX
	RORQ	$0x30,DX
	XORQ	SI,AX
	XORQ	CX,DX
	RORQ	$0x20,SI
	ADDQ	AX,CX
	ADDQ	DX,SI
	RORQ	$0x2F,AX
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX
	// Round 2 of 4.
	ADDQ	AX,SI
	ADDQ	DX,CX
	RORQ	$0x33,AX
	RORQ	$0x30,DX
	XORQ	SI,AX
	RORQ	$0x20,SI
	XORQ	CX,DX
	ADDQ	AX,CX
	RORQ	$0x2F,AX
	ADDQ	DX,SI
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX
	// Round 3 of 4.
	ADDQ	AX,SI
	ADDQ	DX,CX
	RORQ	$0x33,AX
	RORQ	$0x30,DX
	XORQ	CX,DX
	XORQ	SI,AX
	RORQ	$0x20,SI
	ADDQ	DX,SI
	ADDQ	AX,CX
	RORQ	$0x2F,AX
	XORQ	CX,AX
	RORQ	$0x2B,DX
	RORQ	$0x20,CX
	XORQ	SI,DX

	// Round 4 of 4.
	ADDQ	AX,SI
	RORQ	$0x33,AX
	ADDQ	DX,CX
	RORQ	$0x30,DX
	XORQ	SI,AX
	XORQ	CX,DX
	RORQ	$0x20,SI
	ADDQ	AX,CX
	ADDQ	DX,SI
	RORQ	$0x2F,AX
	RORQ	$0x2B,DX
	XORQ	CX,AX
	XORQ	SI,DX
	RORQ	$0x20,CX

	// Second result word = v0 ^ v1 ^ v2 ^ v3, stored in the result slot at +48.
	MOVQ	SI,BX
	XORQ	AX,BX
	XORQ	DX,BX
	XORQ	CX,BX
	MOVQ	BX,ret1+48(FP)

	RET