1// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
2
3// +build !appengine
4// +build !noasm
5// +build gc
6
7#include "textflag.h"
8
9// func encodeBlockAsm(dst []byte, src []byte) int
10// Requires: SSE2
//
// Encodes src into dst and returns the number of bytes written
// (final write pointer minus dst_base). Returns 0 from three places when the
// write pointer would reach the output limit kept in 0(SP).
//
// Register roles (valid until emit_remainder, which reuses CX/DX as scratch):
//   AX = output write pointer, starts at dst_base
//   CX = current position in src (byte index)
//   DX = src_base
//
// Frame layout ($65560 = 24 bytes of scalars + 65536-byte table):
//   0(SP)   output limit pointer (8 bytes)
//   8(SP)   src_len-8, bound checked before probing / continuing
//   12(SP)  index in src of the first byte that has not been emitted yet
//   16(SP)  offset of the most recent match (initialised to 1)
//   20(SP)  next search position to use when nothing matches at CX
//   24(SP)  hash table of 32-bit src positions, indexed by a 14-bit hash
11TEXT ·encodeBlockAsm(SB), $65560-56
12	MOVQ dst_base+0(FP), AX
13	MOVQ $0x00000200, CX
14	LEAQ 24(SP), DX
15	PXOR X0, X0
16
	// Zero the hash table: 0x200 iterations x 0x80 bytes = 65536 bytes.
17zero_loop_encodeBlockAsm:
18	MOVOU X0, (DX)
19	MOVOU X0, 16(DX)
20	MOVOU X0, 32(DX)
21	MOVOU X0, 48(DX)
22	MOVOU X0, 64(DX)
23	MOVOU X0, 80(DX)
24	MOVOU X0, 96(DX)
25	MOVOU X0, 112(DX)
26	ADDQ  $0x80, DX
27	DECQ  CX
28	JNZ   zero_loop_encodeBlockAsm
	// 12(SP) = 0: no input has been emitted yet.
29	MOVL  $0x00000000, 12(SP)
30	MOVQ  src_len+32(FP), CX
	// 8(SP)  = src_len - 8
	// 0(SP)  = dst_base + (src_len - 9 - (src_len >> 5)), the output limit.
31	LEAQ  -9(CX), DX
32	LEAQ  -8(CX), SI
33	MOVL  SI, 8(SP)
34	SHRQ  $0x05, CX
35	SUBL  CX, DX
36	LEAQ  (AX)(DX*1), DX
37	MOVQ  DX, (SP)
	// Start searching at position 1; the initial match offset is also 1.
38	MOVL  $0x00000001, CX
39	MOVL  CX, 16(SP)
40	MOVQ  src_base+24(FP), DX
41
	// Main search loop. SI = CX + 4 + ((CX - 12(SP)) >> 6) is the next
	// position to try; the stride grows with the distance since the last emit.
	// If that position reaches 8(SP) the rest of the input is emitted as-is.
42search_loop_encodeBlockAsm:
43	MOVL  CX, SI
44	SUBL  12(SP), SI
45	SHRL  $0x06, SI
46	LEAL  4(CX)(SI*1), SI
47	CMPL  SI, 8(SP)
48	JGE   emit_remainder_encodeBlockAsm
	// DI = 8 bytes at src[CX]; 20(SP) remembers the next position.
49	MOVQ  (DX)(CX*1), DI
50	MOVL  SI, 20(SP)
	// Hash: (v << 16) * 0xcf1bbcdcbf9b >> 50. The left shift drops the top two
	// bytes, so the low six bytes of v are mixed into a 14-bit table index.
	// R10 = hash of bytes at CX, R11 = hash of bytes at CX+1.
51	MOVQ  $0x0000cf1bbcdcbf9b, R9
52	MOVQ  DI, R10
53	MOVQ  DI, R11
54	SHRQ  $0x08, R11
55	SHLQ  $0x10, R10
56	IMULQ R9, R10
57	SHRQ  $0x32, R10
58	SHLQ  $0x10, R11
59	IMULQ R9, R11
60	SHRQ  $0x32, R11
	// SI = previous position stored for hash(CX), R8 = same for hash(CX+1);
	// then record CX and CX+1 in those slots.
61	MOVL  24(SP)(R10*4), SI
62	MOVL  24(SP)(R11*4), R8
63	MOVL  CX, 24(SP)(R10*4)
64	LEAL  1(CX), R10
65	MOVL  R10, 24(SP)(R11*4)
	// R10 = hash of bytes at CX+2 (looked up later in no_repeat_found).
66	MOVQ  DI, R10
67	SHRQ  $0x10, R10
68	SHLQ  $0x10, R10
69	IMULQ R9, R10
70	SHRQ  $0x32, R10
	// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes at
	// CX+1-16(SP), i.e. at the last match offset.
71	MOVL  CX, R9
72	SUBL  16(SP), R9
73	MOVL  1(DX)(R9*1), R11
74	MOVQ  DI, R9
75	SHRQ  $0x08, R9
76	CMPL  R9, R11
77	JNE   no_repeat_found_encodeBlockAsm
	// Repeat found at CX+1. DI = match start, R8 = 12(SP) (kept live until the
	// TESTL at repeat_extend_forward_end), SI = DI - offset = earlier copy.
	// SI == 0 means there is nothing before the earlier copy to extend into.
78	LEAL  1(CX), DI
79	MOVL  12(SP), R8
80	MOVL  DI, SI
81	SUBL  16(SP), SI
82	JZ    repeat_extend_back_end_encodeBlockAsm
83
	// Extend the repeat backwards while DI > 12(SP), the preceding bytes are
	// equal, and SI has not reached 0.
84repeat_extend_back_loop_encodeBlockAsm:
85	CMPL DI, R8
86	JLE  repeat_extend_back_end_encodeBlockAsm
87	MOVB -1(DX)(SI*1), BL
88	MOVB -1(DX)(DI*1), R9
89	CMPB BL, R9
90	JNE  repeat_extend_back_end_encodeBlockAsm
91	LEAL -1(DI), DI
92	DECL SI
93	JNZ  repeat_extend_back_loop_encodeBlockAsm
94
	// Emit src[12(SP):DI] as a literal run (skipped when empty).
	// R9 = length, R10 = source pointer, SI = length-1, 12(SP) := DI.
	// Header written, chosen by length-1:
	//   < 60         1 byte : (length-1) << 2
	//   < 0x100      0xf0 + 1 length byte
	//   < 0x10000    0xf4 + 2 length bytes
	//   < 0x1000000  0xf8 + 3 length bytes
	//   otherwise    0xfc + 4 length bytes
95repeat_extend_back_end_encodeBlockAsm:
96	MOVL 12(SP), SI
97	CMPL SI, DI
98	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm
99	MOVL DI, R9
100	MOVL DI, 12(SP)
101	LEAQ (DX)(SI*1), R10
102	SUBL SI, R9
103	LEAL -1(R9), SI
104	CMPL SI, $0x3c
105	JLT  one_byte_repeat_emit_encodeBlockAsm
106	CMPL SI, $0x00000100
107	JLT  two_bytes_repeat_emit_encodeBlockAsm
108	CMPL SI, $0x00010000
109	JLT  three_bytes_repeat_emit_encodeBlockAsm
110	CMPL SI, $0x01000000
111	JLT  four_bytes_repeat_emit_encodeBlockAsm
112	MOVB $0xfc, (AX)
113	MOVL SI, 1(AX)
114	ADDQ $0x05, AX
115	JMP  memmove_long_repeat_emit_encodeBlockAsm
116
117four_bytes_repeat_emit_encodeBlockAsm:
118	MOVL SI, R11
119	SHRL $0x10, R11
120	MOVB $0xf8, (AX)
121	MOVW SI, 1(AX)
122	MOVB R11, 3(AX)
123	ADDQ $0x04, AX
124	JMP  memmove_long_repeat_emit_encodeBlockAsm
125
126three_bytes_repeat_emit_encodeBlockAsm:
127	MOVB $0xf4, (AX)
128	MOVW SI, 1(AX)
129	ADDQ $0x03, AX
130	JMP  memmove_long_repeat_emit_encodeBlockAsm
131
	// With a one-byte length, runs with length-1 < 64 still use the short copy.
132two_bytes_repeat_emit_encodeBlockAsm:
133	MOVB $0xf0, (AX)
134	MOVB SI, 1(AX)
135	ADDQ $0x02, AX
136	CMPL SI, $0x40
137	JL   memmove_repeat_emit_encodeBlockAsm
138	JMP  memmove_long_repeat_emit_encodeBlockAsm
139
140one_byte_repeat_emit_encodeBlockAsm:
141	SHLB $0x02, SI
142	MOVB SI, (AX)
143	ADDQ $0x01, AX
144
	// Short copy of R9 bytes from (R10) to (AX); SI = AX + R9 is the new
	// write pointer. Each size class uses two overlapping head/tail moves.
145memmove_repeat_emit_encodeBlockAsm:
146	LEAQ (AX)(R9*1), SI
147
148	// genMemMoveShort
149	CMPQ R9, $0x08
150	JLE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
151	CMPQ R9, $0x10
152	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
153	CMPQ R9, $0x20
154	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
155	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64
156
	// NOTE(review): always loads and stores a full 8 bytes even when R9 < 8;
	// only the write pointer is advanced by R9. Presumably the caller
	// guarantees slack in both buffers - confirm.
157emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
158	MOVQ (R10), R11
159	MOVQ R11, (AX)
160	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm
161
162emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
163	MOVQ (R10), R11
164	MOVQ -8(R10)(R9*1), R10
165	MOVQ R11, (AX)
166	MOVQ R10, -8(AX)(R9*1)
167	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm
168
169emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
170	MOVOU (R10), X0
171	MOVOU -16(R10)(R9*1), X1
172	MOVOU X0, (AX)
173	MOVOU X1, -16(AX)(R9*1)
174	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm
175
176emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
177	MOVOU (R10), X0
178	MOVOU 16(R10), X1
179	MOVOU -32(R10)(R9*1), X2
180	MOVOU -16(R10)(R9*1), X3
181	MOVOU X0, (AX)
182	MOVOU X1, 16(AX)
183	MOVOU X2, -32(AX)(R9*1)
184	MOVOU X3, -16(AX)(R9*1)
185
186memmove_end_copy_repeat_emit_encodeBlockAsm:
187	MOVQ SI, AX
188	JMP  emit_literal_done_repeat_emit_encodeBlockAsm
189
	// Long copy of R9 bytes from (R10) to (AX). The first and last 32 bytes
	// are loaded into X0-X3 up front and stored last with unaligned moves.
	// R13 = 64 - (AX & 31), so AX+R13-32 is 32-byte aligned and the loop body
	// can use aligned stores (MOVOA).
190memmove_long_repeat_emit_encodeBlockAsm:
191	LEAQ (AX)(R9*1), SI
192
193	// genMemMoveLong
194	MOVOU (R10), X0
195	MOVOU 16(R10), X1
196	MOVOU -32(R10)(R9*1), X2
197	MOVOU -16(R10)(R9*1), X3
198	MOVQ  R9, R12
199	SHRQ  $0x05, R12
200	MOVQ  AX, R11
201	ANDL  $0x0000001f, R11
202	MOVQ  $0x00000040, R13
203	SUBQ  R11, R13
	// NOTE(review): DECQ does not write CF, so JA/JNA below combine ZF from
	// DECQ with CF left by the SUBQ above - verify against the generator.
204	DECQ  R12
205	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
206	LEAQ  -32(R10)(R13*1), R11
207	LEAQ  -32(AX)(R13*1), R14
208
209emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
210	MOVOU (R11), X4
211	MOVOU 16(R11), X5
212	MOVOA X4, (R14)
213	MOVOA X5, 16(R14)
214	ADDQ  $0x20, R14
215	ADDQ  $0x20, R11
216	ADDQ  $0x20, R13
217	DECQ  R12
218	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back
219
	// Copy 32 bytes per iteration while R13 <= R9, then store head and tail.
220emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
221	MOVOU -32(R10)(R13*1), X4
222	MOVOU -16(R10)(R13*1), X5
223	MOVOA X4, -32(AX)(R13*1)
224	MOVOA X5, -16(AX)(R13*1)
225	ADDQ  $0x20, R13
226	CMPQ  R9, R13
227	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
228	MOVOU X0, (AX)
229	MOVOU X1, 16(AX)
230	MOVOU X2, -32(AX)(R9*1)
231	MOVOU X3, -16(AX)(R9*1)
232	MOVQ  SI, AX
233
	// Literals are out. Skip the byte at CX plus the 4 bytes already known to
	// match (CX += 5), then count further matching bytes between src+CX (R10)
	// and src+CX-offset (SI), at most R9 = src_len - CX bytes.
234emit_literal_done_repeat_emit_encodeBlockAsm:
235	ADDL $0x05, CX
236	MOVL CX, SI
237	SUBL 16(SP), SI
238	MOVQ src_len+32(FP), R9
239	SUBL CX, R9
240	LEAQ (DX)(CX*1), R10
241	LEAQ (DX)(SI*1), SI
242
243	// matchLen
	// R12 = number of equal bytes. Compares 8 bytes at a time; on a mismatch
	// the index of the lowest set bit of the XOR, divided by 8, is the count
	// of equal leading bytes in that word.
244	XORL R12, R12
245	CMPL R9, $0x08
246	JL   matchlen_single_repeat_extend_encodeBlockAsm
247
248matchlen_loopback_repeat_extend_encodeBlockAsm:
249	MOVQ  (R10)(R12*1), R11
250	XORQ  (SI)(R12*1), R11
251	TESTQ R11, R11
252	JZ    matchlen_loop_repeat_extend_encodeBlockAsm
253	BSFQ  R11, R11
254	SARQ  $0x03, R11
255	LEAL  (R12)(R11*1), R12
256	JMP   repeat_extend_forward_end_encodeBlockAsm
257
258matchlen_loop_repeat_extend_encodeBlockAsm:
259	LEAL -8(R9), R9
260	LEAL 8(R12), R12
261	CMPL R9, $0x08
262	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm
263
	// Fewer than 8 bytes left: compare one byte at a time.
264matchlen_single_repeat_extend_encodeBlockAsm:
265	TESTL R9, R9
266	JZ    repeat_extend_forward_end_encodeBlockAsm
267
268matchlen_single_loopback_repeat_extend_encodeBlockAsm:
269	MOVB (R10)(R12*1), R11
270	CMPB (SI)(R12*1), R11
271	JNE  repeat_extend_forward_end_encodeBlockAsm
272	LEAL 1(R12), R12
273	DECL R9
274	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm
275
	// CX = end of match, SI = CX - DI = total match length, DI = offset.
	// R8 is the 12(SP) value read before the literal emit; when it is 0 the
	// match is written with emitCopy instead of emitRepeat.
276repeat_extend_forward_end_encodeBlockAsm:
277	ADDL  R12, CX
278	MOVL  CX, SI
279	SUBL  DI, SI
280	MOVL  16(SP), DI
281	TESTL R8, R8
282	JZ    repeat_as_copy_encodeBlockAsm
283
284	// emitRepeat
	// R8 = length, SI = length-4, DI = offset. Forms, in order tried:
	//   length <= 8                    2 bytes: word ((length-4) << 2) | 1
	//   length < 12 and offset < 2048  2 bytes: tag with offset bits 8-10 in
	//                                  its top 3 bits, then offset low byte
	//   SI < 0x104                     3 bytes: word 0x0015, byte SI-4
	//   SI < 0x10100                   4 bytes: word 0x0019, word SI-256
	//   SI < 0x100ffff                 5 bytes: word 0x001d, 3 bytes SI-65536
	//   otherwise                      5 bytes 1d 00 fb ff ff, subtract
	//                                  16842747 from SI and start over
285emit_repeat_again_match_repeat_encodeBlockAsm:
286	MOVL SI, R8
287	LEAL -4(SI), SI
288	CMPL R8, $0x08
289	JLE  repeat_two_match_repeat_encodeBlockAsm
290	CMPL R8, $0x0c
291	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm
292	CMPL DI, $0x00000800
293	JLT  repeat_two_offset_match_repeat_encodeBlockAsm
294
295cant_repeat_two_offset_match_repeat_encodeBlockAsm:
296	CMPL SI, $0x00000104
297	JLT  repeat_three_match_repeat_encodeBlockAsm
298	CMPL SI, $0x00010100
299	JLT  repeat_four_match_repeat_encodeBlockAsm
300	CMPL SI, $0x0100ffff
301	JLT  repeat_five_match_repeat_encodeBlockAsm
302	LEAL -16842747(SI), SI
303	MOVW $0x001d, (AX)
304	MOVW $0xfffb, 2(AX)
305	MOVB $0xff, 4(AX)
306	ADDQ $0x05, AX
307	JMP  emit_repeat_again_match_repeat_encodeBlockAsm
308
	// DI (offset) is reused as scratch here; it is not read again afterwards.
309repeat_five_match_repeat_encodeBlockAsm:
310	LEAL -65536(SI), SI
311	MOVL SI, DI
312	MOVW $0x001d, (AX)
313	MOVW SI, 2(AX)
314	SARL $0x10, DI
315	MOVB DI, 4(AX)
316	ADDQ $0x05, AX
317	JMP  repeat_end_emit_encodeBlockAsm
318
319repeat_four_match_repeat_encodeBlockAsm:
320	LEAL -256(SI), SI
321	MOVW $0x0019, (AX)
322	MOVW SI, 2(AX)
323	ADDQ $0x04, AX
324	JMP  repeat_end_emit_encodeBlockAsm
325
326repeat_three_match_repeat_encodeBlockAsm:
327	LEAL -4(SI), SI
328	MOVW $0x0015, (AX)
329	MOVB SI, 2(AX)
330	ADDQ $0x03, AX
331	JMP  repeat_end_emit_encodeBlockAsm
332
333repeat_two_match_repeat_encodeBlockAsm:
334	SHLL $0x02, SI
335	ORL  $0x01, SI
336	MOVW SI, (AX)
337	ADDQ $0x02, AX
338	JMP  repeat_end_emit_encodeBlockAsm
339
	// Tag byte = 1 + 4*(length-4) | ((offset >> 8) << 5); second byte = low
	// 8 bits of the offset.
340repeat_two_offset_match_repeat_encodeBlockAsm:
341	XORQ R8, R8
342	LEAL 1(R8)(SI*4), SI
343	MOVB DI, 1(AX)
344	SARL $0x08, DI
345	SHLL $0x05, DI
346	ORL  DI, SI
347	MOVB SI, (AX)
348	ADDQ $0x02, AX
349	JMP  repeat_end_emit_encodeBlockAsm
350
	// SI = length, DI = offset. Offsets below 0x10000 use the 2-byte-offset
	// forms; larger ones use a 4-byte offset.
351repeat_as_copy_encodeBlockAsm:
352	// emitCopy
353	CMPL DI, $0x00010000
354	JL   two_byte_offset_repeat_as_copy_encodeBlockAsm
355
	// 4-byte offset, length > 64: emit tag 0xff + 32-bit offset, take 64 off
	// the length, then finish with emitRepeat (or a final copy if fewer than
	// 4 bytes remain).
356four_bytes_loop_back_repeat_as_copy_encodeBlockAsm:
357	CMPL SI, $0x40
358	JLE  four_bytes_remain_repeat_as_copy_encodeBlockAsm
359	MOVB $0xff, (AX)
360	MOVL DI, 1(AX)
361	LEAL -64(SI), SI
362	ADDQ $0x05, AX
363	CMPL SI, $0x04
364	JL   four_bytes_remain_repeat_as_copy_encodeBlockAsm
365
366	// emitRepeat
	// Same selection as the emitRepeat sequence earlier in this function:
	// R8 = length, SI = length-4, DI = offset.
367emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
368	MOVL SI, R8
369	LEAL -4(SI), SI
370	CMPL R8, $0x08
371	JLE  repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
372	CMPL R8, $0x0c
373	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
374	CMPL DI, $0x00000800
375	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
376
377cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
378	CMPL SI, $0x00000104
379	JLT  repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
380	CMPL SI, $0x00010100
381	JLT  repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
382	CMPL SI, $0x0100ffff
383	JLT  repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
384	LEAL -16842747(SI), SI
385	MOVW $0x001d, (AX)
386	MOVW $0xfffb, 2(AX)
387	MOVB $0xff, 4(AX)
388	ADDQ $0x05, AX
389	JMP  emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy
390
391repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
392	LEAL -65536(SI), SI
393	MOVL SI, DI
394	MOVW $0x001d, (AX)
395	MOVW SI, 2(AX)
396	SARL $0x10, DI
397	MOVB DI, 4(AX)
398	ADDQ $0x05, AX
399	JMP  repeat_end_emit_encodeBlockAsm
400
401repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
402	LEAL -256(SI), SI
403	MOVW $0x0019, (AX)
404	MOVW SI, 2(AX)
405	ADDQ $0x04, AX
406	JMP  repeat_end_emit_encodeBlockAsm
407
408repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
409	LEAL -4(SI), SI
410	MOVW $0x0015, (AX)
411	MOVB SI, 2(AX)
412	ADDQ $0x03, AX
413	JMP  repeat_end_emit_encodeBlockAsm
414
415repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
416	SHLL $0x02, SI
417	ORL  $0x01, SI
418	MOVW SI, (AX)
419	ADDQ $0x02, AX
420	JMP  repeat_end_emit_encodeBlockAsm
421
422repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
423	XORQ R8, R8
424	LEAL 1(R8)(SI*4), SI
425	MOVB DI, 1(AX)
426	SARL $0x08, DI
427	SHLL $0x05, DI
428	ORL  DI, SI
429	MOVB SI, (AX)
430	ADDQ $0x02, AX
431	JMP  repeat_end_emit_encodeBlockAsm
	// NOTE(review): unreachable - follows an unconditional JMP with no label.
432	JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm
433
	// Final 4-byte-offset copy of SI bytes (nothing if SI == 0).
	// Tag byte = 3 + 4*(length-1); only the low byte of SI is stored, so the
	// stale upper bits of BX do not matter.
434four_bytes_remain_repeat_as_copy_encodeBlockAsm:
435	TESTL SI, SI
436	JZ    repeat_end_emit_encodeBlockAsm
437	MOVB  $0x03, BL
438	LEAL  -4(BX)(SI*4), SI
439	MOVB  SI, (AX)
440	MOVL  DI, 1(AX)
441	ADDQ  $0x05, AX
442	JMP   repeat_end_emit_encodeBlockAsm
443
	// 2-byte offset, length > 64: emit tag 0xee + 16-bit offset, take 60 off
	// the length, then encode the rest with emitRepeat.
444two_byte_offset_repeat_as_copy_encodeBlockAsm:
445	CMPL SI, $0x40
446	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm
447	MOVB $0xee, (AX)
448	MOVW DI, 1(AX)
449	LEAL -60(SI), SI
450	ADDQ $0x03, AX
451
452	// emitRepeat
453emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
454	MOVL SI, R8
455	LEAL -4(SI), SI
456	CMPL R8, $0x08
457	JLE  repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
458	CMPL R8, $0x0c
459	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
460	CMPL DI, $0x00000800
461	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
462
463cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
464	CMPL SI, $0x00000104
465	JLT  repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
466	CMPL SI, $0x00010100
467	JLT  repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
468	CMPL SI, $0x0100ffff
469	JLT  repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
470	LEAL -16842747(SI), SI
471	MOVW $0x001d, (AX)
472	MOVW $0xfffb, 2(AX)
473	MOVB $0xff, 4(AX)
474	ADDQ $0x05, AX
475	JMP  emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short
476
477repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
478	LEAL -65536(SI), SI
479	MOVL SI, DI
480	MOVW $0x001d, (AX)
481	MOVW SI, 2(AX)
482	SARL $0x10, DI
483	MOVB DI, 4(AX)
484	ADDQ $0x05, AX
485	JMP  repeat_end_emit_encodeBlockAsm
486
487repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
488	LEAL -256(SI), SI
489	MOVW $0x0019, (AX)
490	MOVW SI, 2(AX)
491	ADDQ $0x04, AX
492	JMP  repeat_end_emit_encodeBlockAsm
493
494repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
495	LEAL -4(SI), SI
496	MOVW $0x0015, (AX)
497	MOVB SI, 2(AX)
498	ADDQ $0x03, AX
499	JMP  repeat_end_emit_encodeBlockAsm
500
501repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
502	SHLL $0x02, SI
503	ORL  $0x01, SI
504	MOVW SI, (AX)
505	ADDQ $0x02, AX
506	JMP  repeat_end_emit_encodeBlockAsm
507
508repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
509	XORQ R8, R8
510	LEAL 1(R8)(SI*4), SI
511	MOVB DI, 1(AX)
512	SARL $0x08, DI
513	SHLL $0x05, DI
514	ORL  DI, SI
515	MOVB SI, (AX)
516	ADDQ $0x02, AX
517	JMP  repeat_end_emit_encodeBlockAsm
	// NOTE(review): unreachable - follows an unconditional JMP with no label.
518	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm
519
	// length <= 64 with a 2-byte offset. If length < 12 and offset < 2048 use
	// the 2-byte form: tag = 1 + 4*(length-4) | ((offset >> 8) << 5), then the
	// low offset byte. Otherwise fall to the 3-byte form.
520two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
521	CMPL SI, $0x0c
522	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm
523	CMPL DI, $0x00000800
524	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm
525	MOVB $0x01, BL
526	LEAL -16(BX)(SI*4), SI
527	MOVB DI, 1(AX)
528	SHRL $0x08, DI
529	SHLL $0x05, DI
530	ORL  DI, SI
531	MOVB SI, (AX)
532	ADDQ $0x02, AX
533	JMP  repeat_end_emit_encodeBlockAsm
534
	// 3-byte form: tag = 2 + 4*(length-1), then the 16-bit offset.
535emit_copy_three_repeat_as_copy_encodeBlockAsm:
536	MOVB $0x02, BL
537	LEAL -4(BX)(SI*4), SI
538	MOVB SI, (AX)
539	MOVW DI, 1(AX)
540	ADDQ $0x03, AX
541
	// Everything up to CX has been emitted; resume searching at CX.
542repeat_end_emit_encodeBlockAsm:
543	MOVL CX, 12(SP)
544	JMP  search_loop_encodeBlockAsm
545
	// No repeat. Try the table candidates in turn, comparing 4 bytes each:
	//   SI (slot for CX) against the bytes at CX,
	//   R8 (slot for CX+1) against the bytes at CX+1,
	//   slot R10 (for CX+2) against the bytes at CX+2.
	// The CX+2 slot is read into SI first and then overwritten with CX+2.
	// If none match, jump ahead to the position saved in 20(SP).
546no_repeat_found_encodeBlockAsm:
547	CMPL (DX)(SI*1), DI
548	JEQ  candidate_match_encodeBlockAsm
549	SHRQ $0x08, DI
550	MOVL 24(SP)(R10*4), SI
551	LEAL 2(CX), R9
552	CMPL (DX)(R8*1), DI
553	JEQ  candidate2_match_encodeBlockAsm
554	MOVL R9, 24(SP)(R10*4)
555	SHRQ $0x08, DI
556	CMPL (DX)(SI*1), DI
557	JEQ  candidate3_match_encodeBlockAsm
558	MOVL 20(SP), CX
559	JMP  search_loop_encodeBlockAsm
560
	// Match is at CX+2; SI already holds its candidate.
561candidate3_match_encodeBlockAsm:
562	ADDL $0x02, CX
563	JMP  candidate_match_encodeBlockAsm
564
	// Match is at CX+1: still record CX+2 in its slot, then use R8.
565candidate2_match_encodeBlockAsm:
566	MOVL R9, 24(SP)(R10*4)
567	INCL CX
568	MOVL R8, SI
569
	// CX = match position, SI = candidate (earlier) position.
	// Extend backwards while CX > 12(SP), the preceding bytes are equal and
	// SI has not reached 0.
570candidate_match_encodeBlockAsm:
571	MOVL  12(SP), DI
572	TESTL SI, SI
573	JZ    match_extend_back_end_encodeBlockAsm
574
575match_extend_back_loop_encodeBlockAsm:
576	CMPL CX, DI
577	JLE  match_extend_back_end_encodeBlockAsm
578	MOVB -1(DX)(SI*1), BL
579	MOVB -1(DX)(CX*1), R8
580	CMPB BL, R8
581	JNE  match_extend_back_end_encodeBlockAsm
582	LEAL -1(CX), CX
583	DECL SI
584	JZ   match_extend_back_end_encodeBlockAsm
585	JMP  match_extend_back_loop_encodeBlockAsm
586
	// Output-space check: AX + 5 + (CX - 12(SP)) must stay below the limit in
	// 0(SP); otherwise return 0.
587match_extend_back_end_encodeBlockAsm:
588	MOVL CX, DI
589	SUBL 12(SP), DI
590	LEAQ 5(AX)(DI*1), DI
591	CMPQ DI, (SP)
592	JL   match_dst_size_check_encodeBlockAsm
593	MOVQ $0x00000000, ret+48(FP)
594	RET
595
	// Emit src[12(SP):CX] as a literal run (skipped when empty).
	// R9 = length, DI = source pointer, R8 = length-1, 12(SP) := CX.
	// Header selection is the same size ladder as the other literal emits:
	// 1 byte, then 0xf0/0xf4/0xf8/0xfc followed by 1-4 length bytes.
596match_dst_size_check_encodeBlockAsm:
597	MOVL CX, DI
598	MOVL 12(SP), R8
599	CMPL R8, DI
600	JEQ  emit_literal_done_match_emit_encodeBlockAsm
601	MOVL DI, R9
602	MOVL DI, 12(SP)
603	LEAQ (DX)(R8*1), DI
604	SUBL R8, R9
605	LEAL -1(R9), R8
606	CMPL R8, $0x3c
607	JLT  one_byte_match_emit_encodeBlockAsm
608	CMPL R8, $0x00000100
609	JLT  two_bytes_match_emit_encodeBlockAsm
610	CMPL R8, $0x00010000
611	JLT  three_bytes_match_emit_encodeBlockAsm
612	CMPL R8, $0x01000000
613	JLT  four_bytes_match_emit_encodeBlockAsm
614	MOVB $0xfc, (AX)
615	MOVL R8, 1(AX)
616	ADDQ $0x05, AX
617	JMP  memmove_long_match_emit_encodeBlockAsm
618
619four_bytes_match_emit_encodeBlockAsm:
620	MOVL R8, R10
621	SHRL $0x10, R10
622	MOVB $0xf8, (AX)
623	MOVW R8, 1(AX)
624	MOVB R10, 3(AX)
625	ADDQ $0x04, AX
626	JMP  memmove_long_match_emit_encodeBlockAsm
627
628three_bytes_match_emit_encodeBlockAsm:
629	MOVB $0xf4, (AX)
630	MOVW R8, 1(AX)
631	ADDQ $0x03, AX
632	JMP  memmove_long_match_emit_encodeBlockAsm
633
634two_bytes_match_emit_encodeBlockAsm:
635	MOVB $0xf0, (AX)
636	MOVB R8, 1(AX)
637	ADDQ $0x02, AX
638	CMPL R8, $0x40
639	JL   memmove_match_emit_encodeBlockAsm
640	JMP  memmove_long_match_emit_encodeBlockAsm
641
642one_byte_match_emit_encodeBlockAsm:
643	SHLB $0x02, R8
644	MOVB R8, (AX)
645	ADDQ $0x01, AX
646
	// Short copy of R9 bytes from (DI) to (AX); R8 = AX + R9.
647memmove_match_emit_encodeBlockAsm:
648	LEAQ (AX)(R9*1), R8
649
650	// genMemMoveShort
651	CMPQ R9, $0x08
652	JLE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
653	CMPQ R9, $0x10
654	JBE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
655	CMPQ R9, $0x20
656	JBE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
657	JMP  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64
658
	// NOTE(review): moves a full 8 bytes even when R9 < 8 - confirm slack.
659emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
660	MOVQ (DI), R10
661	MOVQ R10, (AX)
662	JMP  memmove_end_copy_match_emit_encodeBlockAsm
663
664emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
665	MOVQ (DI), R10
666	MOVQ -8(DI)(R9*1), DI
667	MOVQ R10, (AX)
668	MOVQ DI, -8(AX)(R9*1)
669	JMP  memmove_end_copy_match_emit_encodeBlockAsm
670
671emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
672	MOVOU (DI), X0
673	MOVOU -16(DI)(R9*1), X1
674	MOVOU X0, (AX)
675	MOVOU X1, -16(AX)(R9*1)
676	JMP   memmove_end_copy_match_emit_encodeBlockAsm
677
678emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
679	MOVOU (DI), X0
680	MOVOU 16(DI), X1
681	MOVOU -32(DI)(R9*1), X2
682	MOVOU -16(DI)(R9*1), X3
683	MOVOU X0, (AX)
684	MOVOU X1, 16(AX)
685	MOVOU X2, -32(AX)(R9*1)
686	MOVOU X3, -16(AX)(R9*1)
687
688memmove_end_copy_match_emit_encodeBlockAsm:
689	MOVQ R8, AX
690	JMP  emit_literal_done_match_emit_encodeBlockAsm
691
	// Long copy of R9 bytes from (DI) to (AX): head/tail saved in X0-X3,
	// aligned 32-byte stores in between, head/tail stored last.
692memmove_long_match_emit_encodeBlockAsm:
693	LEAQ (AX)(R9*1), R8
694
695	// genMemMoveLong
696	MOVOU (DI), X0
697	MOVOU 16(DI), X1
698	MOVOU -32(DI)(R9*1), X2
699	MOVOU -16(DI)(R9*1), X3
700	MOVQ  R9, R11
701	SHRQ  $0x05, R11
702	MOVQ  AX, R10
703	ANDL  $0x0000001f, R10
704	MOVQ  $0x00000040, R12
705	SUBQ  R10, R12
706	DECQ  R11
707	JA    emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
708	LEAQ  -32(DI)(R12*1), R10
709	LEAQ  -32(AX)(R12*1), R13
710
711emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
712	MOVOU (R10), X4
713	MOVOU 16(R10), X5
714	MOVOA X4, (R13)
715	MOVOA X5, 16(R13)
716	ADDQ  $0x20, R13
717	ADDQ  $0x20, R10
718	ADDQ  $0x20, R12
719	DECQ  R11
720	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back
721
722emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
723	MOVOU -32(DI)(R12*1), X4
724	MOVOU -16(DI)(R12*1), X5
725	MOVOA X4, -32(AX)(R12*1)
726	MOVOA X5, -16(AX)(R12*1)
727	ADDQ  $0x20, R12
728	CMPQ  R9, R12
729	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
730	MOVOU X0, (AX)
731	MOVOU X1, 16(AX)
732	MOVOU X2, -32(AX)(R9*1)
733	MOVOU X3, -16(AX)(R9*1)
734	MOVQ  R8, AX
735
	// Match with no pending literals. 16(SP) := CX - SI (the new offset).
	// The first 4 bytes are already known equal, so both positions advance
	// by 4 before counting further equal bytes (at most src_len - CX).
736emit_literal_done_match_emit_encodeBlockAsm:
737match_nolit_loop_encodeBlockAsm:
738	MOVL CX, DI
739	SUBL SI, DI
740	MOVL DI, 16(SP)
741	ADDL $0x04, CX
742	ADDL $0x04, SI
743	MOVQ src_len+32(FP), DI
744	SUBL CX, DI
745	LEAQ (DX)(CX*1), R8
746	LEAQ (DX)(SI*1), SI
747
748	// matchLen
	// R10 = number of additional equal bytes (8 at a time, then bytewise).
749	XORL R10, R10
750	CMPL DI, $0x08
751	JL   matchlen_single_match_nolit_encodeBlockAsm
752
753matchlen_loopback_match_nolit_encodeBlockAsm:
754	MOVQ  (R8)(R10*1), R9
755	XORQ  (SI)(R10*1), R9
756	TESTQ R9, R9
757	JZ    matchlen_loop_match_nolit_encodeBlockAsm
758	BSFQ  R9, R9
759	SARQ  $0x03, R9
760	LEAL  (R10)(R9*1), R10
761	JMP   match_nolit_end_encodeBlockAsm
762
763matchlen_loop_match_nolit_encodeBlockAsm:
764	LEAL -8(DI), DI
765	LEAL 8(R10), R10
766	CMPL DI, $0x08
767	JGE  matchlen_loopback_match_nolit_encodeBlockAsm
768
769matchlen_single_match_nolit_encodeBlockAsm:
770	TESTL DI, DI
771	JZ    match_nolit_end_encodeBlockAsm
772
773matchlen_single_loopback_match_nolit_encodeBlockAsm:
774	MOVB (R8)(R10*1), R9
775	CMPB (SI)(R10*1), R9
776	JNE  match_nolit_end_encodeBlockAsm
777	LEAL 1(R10), R10
778	DECL DI
779	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm
780
	// CX = end of match, SI = offset, R10 = total length (extra + 4),
	// 12(SP) := CX.
781match_nolit_end_encodeBlockAsm:
782	ADDL R10, CX
783	MOVL 16(SP), SI
784	ADDL $0x04, R10
785	MOVL CX, 12(SP)
786
787	// emitCopy
	// R10 = length, SI = offset. Same encoding choices as the emitCopy
	// sequence at repeat_as_copy, with different registers.
788	CMPL SI, $0x00010000
789	JL   two_byte_offset_match_nolit_encodeBlockAsm
790
791four_bytes_loop_back_match_nolit_encodeBlockAsm:
792	CMPL R10, $0x40
793	JLE  four_bytes_remain_match_nolit_encodeBlockAsm
794	MOVB $0xff, (AX)
795	MOVL SI, 1(AX)
796	LEAL -64(R10), R10
797	ADDQ $0x05, AX
798	CMPL R10, $0x04
799	JL   four_bytes_remain_match_nolit_encodeBlockAsm
800
801	// emitRepeat
	// DI = length, R10 = length-4, SI = offset.
802emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
803	MOVL R10, DI
804	LEAL -4(R10), R10
805	CMPL DI, $0x08
806	JLE  repeat_two_match_nolit_encodeBlockAsm_emit_copy
807	CMPL DI, $0x0c
808	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
809	CMPL SI, $0x00000800
810	JLT  repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
811
812cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
813	CMPL R10, $0x00000104
814	JLT  repeat_three_match_nolit_encodeBlockAsm_emit_copy
815	CMPL R10, $0x00010100
816	JLT  repeat_four_match_nolit_encodeBlockAsm_emit_copy
817	CMPL R10, $0x0100ffff
818	JLT  repeat_five_match_nolit_encodeBlockAsm_emit_copy
819	LEAL -16842747(R10), R10
820	MOVW $0x001d, (AX)
821	MOVW $0xfffb, 2(AX)
822	MOVB $0xff, 4(AX)
823	ADDQ $0x05, AX
824	JMP  emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy
825
826repeat_five_match_nolit_encodeBlockAsm_emit_copy:
827	LEAL -65536(R10), R10
828	MOVL R10, SI
829	MOVW $0x001d, (AX)
830	MOVW R10, 2(AX)
831	SARL $0x10, SI
832	MOVB SI, 4(AX)
833	ADDQ $0x05, AX
834	JMP  match_nolit_emitcopy_end_encodeBlockAsm
835
836repeat_four_match_nolit_encodeBlockAsm_emit_copy:
837	LEAL -256(R10), R10
838	MOVW $0x0019, (AX)
839	MOVW R10, 2(AX)
840	ADDQ $0x04, AX
841	JMP  match_nolit_emitcopy_end_encodeBlockAsm
842
843repeat_three_match_nolit_encodeBlockAsm_emit_copy:
844	LEAL -4(R10), R10
845	MOVW $0x0015, (AX)
846	MOVB R10, 2(AX)
847	ADDQ $0x03, AX
848	JMP  match_nolit_emitcopy_end_encodeBlockAsm
849
850repeat_two_match_nolit_encodeBlockAsm_emit_copy:
851	SHLL $0x02, R10
852	ORL  $0x01, R10
853	MOVW R10, (AX)
854	ADDQ $0x02, AX
855	JMP  match_nolit_emitcopy_end_encodeBlockAsm
856
857repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
858	XORQ DI, DI
859	LEAL 1(DI)(R10*4), R10
860	MOVB SI, 1(AX)
861	SARL $0x08, SI
862	SHLL $0x05, SI
863	ORL  SI, R10
864	MOVB R10, (AX)
865	ADDQ $0x02, AX
866	JMP  match_nolit_emitcopy_end_encodeBlockAsm
	// NOTE(review): unreachable - follows an unconditional JMP with no label.
867	JMP four_bytes_loop_back_match_nolit_encodeBlockAsm
868
	// Final 4-byte-offset copy: tag = 3 + 4*(length-1), then 32-bit offset.
869four_bytes_remain_match_nolit_encodeBlockAsm:
870	TESTL R10, R10
871	JZ    match_nolit_emitcopy_end_encodeBlockAsm
872	MOVB  $0x03, BL
873	LEAL  -4(BX)(R10*4), R10
874	MOVB  R10, (AX)
875	MOVL  SI, 1(AX)
876	ADDQ  $0x05, AX
877	JMP   match_nolit_emitcopy_end_encodeBlockAsm
878
	// 2-byte offset, length > 64: tag 0xee + offset, length -= 60, then
	// emitRepeat for the rest.
879two_byte_offset_match_nolit_encodeBlockAsm:
880	CMPL R10, $0x40
881	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm
882	MOVB $0xee, (AX)
883	MOVW SI, 1(AX)
884	LEAL -60(R10), R10
885	ADDQ $0x03, AX
886
887	// emitRepeat
888emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
889	MOVL R10, DI
890	LEAL -4(R10), R10
891	CMPL DI, $0x08
892	JLE  repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
893	CMPL DI, $0x0c
894	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
895	CMPL SI, $0x00000800
896	JLT  repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
897
898cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
899	CMPL R10, $0x00000104
900	JLT  repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
901	CMPL R10, $0x00010100
902	JLT  repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
903	CMPL R10, $0x0100ffff
904	JLT  repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
905	LEAL -16842747(R10), R10
906	MOVW $0x001d, (AX)
907	MOVW $0xfffb, 2(AX)
908	MOVB $0xff, 4(AX)
909	ADDQ $0x05, AX
910	JMP  emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short
911
912repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
913	LEAL -65536(R10), R10
914	MOVL R10, SI
915	MOVW $0x001d, (AX)
916	MOVW R10, 2(AX)
917	SARL $0x10, SI
918	MOVB SI, 4(AX)
919	ADDQ $0x05, AX
920	JMP  match_nolit_emitcopy_end_encodeBlockAsm
921
922repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
923	LEAL -256(R10), R10
924	MOVW $0x0019, (AX)
925	MOVW R10, 2(AX)
926	ADDQ $0x04, AX
927	JMP  match_nolit_emitcopy_end_encodeBlockAsm
928
929repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
930	LEAL -4(R10), R10
931	MOVW $0x0015, (AX)
932	MOVB R10, 2(AX)
933	ADDQ $0x03, AX
934	JMP  match_nolit_emitcopy_end_encodeBlockAsm
935
936repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
937	SHLL $0x02, R10
938	ORL  $0x01, R10
939	MOVW R10, (AX)
940	ADDQ $0x02, AX
941	JMP  match_nolit_emitcopy_end_encodeBlockAsm
942
943repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
944	XORQ DI, DI
945	LEAL 1(DI)(R10*4), R10
946	MOVB SI, 1(AX)
947	SARL $0x08, SI
948	SHLL $0x05, SI
949	ORL  SI, R10
950	MOVB R10, (AX)
951	ADDQ $0x02, AX
952	JMP  match_nolit_emitcopy_end_encodeBlockAsm
	// NOTE(review): unreachable - follows an unconditional JMP with no label.
953	JMP two_byte_offset_match_nolit_encodeBlockAsm
954
	// length <= 64, 2-byte offset: 2-byte form if length < 12 and
	// offset < 2048, otherwise the 3-byte form.
955two_byte_offset_short_match_nolit_encodeBlockAsm:
956	CMPL R10, $0x0c
957	JGE  emit_copy_three_match_nolit_encodeBlockAsm
958	CMPL SI, $0x00000800
959	JGE  emit_copy_three_match_nolit_encodeBlockAsm
960	MOVB $0x01, BL
961	LEAL -16(BX)(R10*4), R10
962	MOVB SI, 1(AX)
963	SHRL $0x08, SI
964	SHLL $0x05, SI
965	ORL  SI, R10
966	MOVB R10, (AX)
967	ADDQ $0x02, AX
968	JMP  match_nolit_emitcopy_end_encodeBlockAsm
969
970emit_copy_three_match_nolit_encodeBlockAsm:
971	MOVB $0x02, BL
972	LEAL -4(BX)(R10*4), R10
973	MOVB R10, (AX)
974	MOVW SI, 1(AX)
975	ADDQ $0x03, AX
976
	// After a copy: stop if CX reached 8(SP); return 0 if AX reached the
	// output limit. Otherwise load 8 bytes at CX-2 to re-index the table.
977match_nolit_emitcopy_end_encodeBlockAsm:
978	CMPL CX, 8(SP)
979	JGE  emit_remainder_encodeBlockAsm
980	MOVQ -2(DX)(CX*1), DI
981	CMPQ AX, (SP)
982	JL   match_nolit_dst_ok_encodeBlockAsm
983	MOVQ $0x00000000, ret+48(FP)
984	RET
985
	// R8 = hash of bytes at CX-2, SI = hash of bytes at CX (DI >>= 16 first).
	// Store CX-2 in its slot; read the old candidate for CX into SI and store
	// CX there. If the candidate's 4 bytes equal those at CX, start another
	// match immediately; otherwise continue searching at CX+1.
986match_nolit_dst_ok_encodeBlockAsm:
987	MOVQ  $0x0000cf1bbcdcbf9b, R9
988	MOVQ  DI, R8
989	SHRQ  $0x10, DI
990	MOVQ  DI, SI
991	SHLQ  $0x10, R8
992	IMULQ R9, R8
993	SHRQ  $0x32, R8
994	SHLQ  $0x10, SI
995	IMULQ R9, SI
996	SHRQ  $0x32, SI
997	LEAL  -2(CX), R9
998	LEAQ  24(SP)(SI*4), R10
999	MOVL  (R10), SI
1000	MOVL  R9, 24(SP)(R8*4)
1001	MOVL  CX, (R10)
1002	CMPL  (DX)(SI*1), DI
1003	JEQ   match_nolit_loop_encodeBlockAsm
1004	INCL  CX
1005	JMP   search_loop_encodeBlockAsm
1006
	// Tail: everything from 12(SP) to src_len is written as one literal run.
	// First check AX + 5 + (src_len - 12(SP)) against the output limit;
	// return 0 if it does not fit.
1007emit_remainder_encodeBlockAsm:
1008	MOVQ src_len+32(FP), CX
1009	SUBL 12(SP), CX
1010	LEAQ 5(AX)(CX*1), CX
1011	CMPQ CX, (SP)
1012	JL   emit_remainder_ok_encodeBlockAsm
1013	MOVQ $0x00000000, ret+48(FP)
1014	RET
1015
	// From here CX and DX are repurposed: CX = source pointer
	// (src_base + 12(SP)), SI = length, DX = length-1.
1016emit_remainder_ok_encodeBlockAsm:
1017	MOVQ src_len+32(FP), CX
1018	MOVL 12(SP), BX
1019	CMPL BX, CX
1020	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm
1021	MOVL CX, SI
1022	MOVL CX, 12(SP)
1023	LEAQ (DX)(BX*1), CX
1024	SUBL BX, SI
1025	LEAL -1(SI), DX
1026	CMPL DX, $0x3c
1027	JLT  one_byte_emit_remainder_encodeBlockAsm
1028	CMPL DX, $0x00000100
1029	JLT  two_bytes_emit_remainder_encodeBlockAsm
1030	CMPL DX, $0x00010000
1031	JLT  three_bytes_emit_remainder_encodeBlockAsm
1032	CMPL DX, $0x01000000
1033	JLT  four_bytes_emit_remainder_encodeBlockAsm
1034	MOVB $0xfc, (AX)
1035	MOVL DX, 1(AX)
1036	ADDQ $0x05, AX
1037	JMP  memmove_long_emit_remainder_encodeBlockAsm
1038
1039four_bytes_emit_remainder_encodeBlockAsm:
1040	MOVL DX, BX
1041	SHRL $0x10, BX
1042	MOVB $0xf8, (AX)
1043	MOVW DX, 1(AX)
1044	MOVB BL, 3(AX)
1045	ADDQ $0x04, AX
1046	JMP  memmove_long_emit_remainder_encodeBlockAsm
1047
1048three_bytes_emit_remainder_encodeBlockAsm:
1049	MOVB $0xf4, (AX)
1050	MOVW DX, 1(AX)
1051	ADDQ $0x03, AX
1052	JMP  memmove_long_emit_remainder_encodeBlockAsm
1053
1054two_bytes_emit_remainder_encodeBlockAsm:
1055	MOVB $0xf0, (AX)
1056	MOVB DL, 1(AX)
1057	ADDQ $0x02, AX
1058	CMPL DX, $0x40
1059	JL   memmove_emit_remainder_encodeBlockAsm
1060	JMP  memmove_long_emit_remainder_encodeBlockAsm
1061
1062one_byte_emit_remainder_encodeBlockAsm:
1063	SHLB $0x02, DL
1064	MOVB DL, (AX)
1065	ADDQ $0x01, AX
1066
	// Short copy of BX (= SI) bytes from (CX) to (AX); DX = AX + length.
1067memmove_emit_remainder_encodeBlockAsm:
1068	LEAQ (AX)(SI*1), DX
1069	MOVL SI, BX
1070
1071	// genMemMoveShort
1072	CMPQ BX, $0x08
1073	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8
1074	CMPQ BX, $0x10
1075	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
1076	CMPQ BX, $0x20
1077	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
1078	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64
1079
	// NOTE(review): loads 8 bytes from (CX) even when fewer remain before
	// src_base+src_len, and stores 8 bytes to (AX) - confirm the caller
	// guarantees the slack this needs.
1080emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8:
1081	MOVQ (CX), SI
1082	MOVQ SI, (AX)
1083	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm
1084
1085emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
1086	MOVQ (CX), SI
1087	MOVQ -8(CX)(BX*1), CX
1088	MOVQ SI, (AX)
1089	MOVQ CX, -8(AX)(BX*1)
1090	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm
1091
1092emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
1093	MOVOU (CX), X0
1094	MOVOU -16(CX)(BX*1), X1
1095	MOVOU X0, (AX)
1096	MOVOU X1, -16(AX)(BX*1)
1097	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm
1098
1099emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
1100	MOVOU (CX), X0
1101	MOVOU 16(CX), X1
1102	MOVOU -32(CX)(BX*1), X2
1103	MOVOU -16(CX)(BX*1), X3
1104	MOVOU X0, (AX)
1105	MOVOU X1, 16(AX)
1106	MOVOU X2, -32(AX)(BX*1)
1107	MOVOU X3, -16(AX)(BX*1)
1108
1109memmove_end_copy_emit_remainder_encodeBlockAsm:
1110	MOVQ DX, AX
1111	JMP  emit_literal_done_emit_remainder_encodeBlockAsm
1112
	// Long copy of BX bytes from (CX) to (AX): head/tail saved in X0-X3,
	// aligned 32-byte stores in between, head/tail stored last.
1113memmove_long_emit_remainder_encodeBlockAsm:
1114	LEAQ (AX)(SI*1), DX
1115	MOVL SI, BX
1116
1117	// genMemMoveLong
1118	MOVOU (CX), X0
1119	MOVOU 16(CX), X1
1120	MOVOU -32(CX)(BX*1), X2
1121	MOVOU -16(CX)(BX*1), X3
1122	MOVQ  BX, DI
1123	SHRQ  $0x05, DI
1124	MOVQ  AX, SI
1125	ANDL  $0x0000001f, SI
1126	MOVQ  $0x00000040, R8
1127	SUBQ  SI, R8
1128	DECQ  DI
1129	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
1130	LEAQ  -32(CX)(R8*1), SI
1131	LEAQ  -32(AX)(R8*1), R9
1132
1133emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
1134	MOVOU (SI), X4
1135	MOVOU 16(SI), X5
1136	MOVOA X4, (R9)
1137	MOVOA X5, 16(R9)
1138	ADDQ  $0x20, R9
1139	ADDQ  $0x20, SI
1140	ADDQ  $0x20, R8
1141	DECQ  DI
1142	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back
1143
1144emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
1145	MOVOU -32(CX)(R8*1), X4
1146	MOVOU -16(CX)(R8*1), X5
1147	MOVOA X4, -32(AX)(R8*1)
1148	MOVOA X5, -16(AX)(R8*1)
1149	ADDQ  $0x20, R8
1150	CMPQ  BX, R8
1151	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
1152	MOVOU X0, (AX)
1153	MOVOU X1, 16(AX)
1154	MOVOU X2, -32(AX)(BX*1)
1155	MOVOU X3, -16(AX)(BX*1)
1156	MOVQ  DX, AX
1157
	// Return the number of bytes written: AX - dst_base.
1158emit_literal_done_emit_remainder_encodeBlockAsm:
1159	MOVQ dst_base+0(FP), CX
1160	SUBQ CX, AX
1161	MOVQ AX, ret+48(FP)
1162	RET
1163
// func encodeBlockAsm4MB(dst []byte, src []byte) int
// Requires: SSE2
//
// encodeBlockAsm4MB compresses src into dst and stores the number of bytes
// written in ret. It stores 0 instead whenever the output pointer would reach
// the limit dst_base + (len(src) - 9 - len(src)>>5).
// dst_len is never read, so the caller is responsible for passing a dst that
// is large enough for every store made below that limit.
// NOTE(review): the name suggests callers cap len(src) at 4MB; literal runs
// are encoded here with at most a 3-byte length field - confirm against the
// Go caller.
//
// This file is generated (see the file header). The bounded short copy in the
// emit_remainder path was changed by hand; mirror it in the generator or it
// will be lost on regeneration.
//
// Stack frame:
//   0(SP)   uint64         dst limit pointer
//   8(SP)   uint32         len(src) - 8, upper bound for the search position
//   12(SP)  uint32         nextEmit: offset of the first src byte not yet emitted
//   16(SP)  uint32         offset (distance) of the most recent match, starts at 1
//   20(SP)  uint32         position at which the search resumes after a miss
//   24(SP)  [16384]uint32  hash table of src offsets
//
// Registers that keep their role until emit_remainder:
//   AX = dst write pointer, CX = current src offset (s), DX = src base.
TEXT ·encodeBlockAsm4MB(SB), $65560-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear the hash table: 0x200 iterations of 128 bytes = 65536 bytes.
zero_loop_encodeBlockAsm4MB:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBlockAsm4MB

	// nextEmit = 0, search limit = len(src)-8,
	// dst limit = dst_base + (len(src) - 9 - len(src)>>5), s = 1, repeat = 1.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBlockAsm4MB:
	// Resume position after a miss: s + 4 + (s-nextEmit)>>6. Stop searching
	// once it reaches the search limit.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBlockAsm4MB
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)

	// R10 = hash of the 6 bytes at s, R11 = hash of the 6 bytes at s+1.
	// (x<<16 * constant) >> 50 yields a 14-bit table index.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R11
	IMULQ R9, R11
	SHRQ  $0x32, R11

	// SI = table[hash(s)], R8 = table[hash(s+1)]; then record s and s+1.
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)

	// R10 = hash of the 6 bytes at s+2, used by the third candidate below.
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10

	// Repeat check: compare the 4 bytes at s+1 with those at s+1-repeat.
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeBlockAsm4MB

	// Repeat found. DI = s+1 (match start), R8 = nextEmit,
	// SI = match start - repeat (position of the earlier copy).
	LEAL  1(CX), DI
	MOVL  12(SP), R8
	MOVL  DI, SI
	SUBL  16(SP), SI
	JZ    repeat_extend_back_end_encodeBlockAsm4MB

	// Extend the match backwards while the preceding bytes are equal, the
	// match start stays above nextEmit and the earlier copy stays above 0.
repeat_extend_back_loop_encodeBlockAsm4MB:
	CMPL DI, R8
	JLE  repeat_extend_back_end_encodeBlockAsm4MB
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeBlockAsm4MB
	LEAL -1(DI), DI
	DECL SI
	JNZ  repeat_extend_back_loop_encodeBlockAsm4MB

repeat_extend_back_end_encodeBlockAsm4MB:
	// Emit src[nextEmit:DI] as literals (skipped when empty).
	// R9 = literal length, R10 = literal source, SI = length-1 for the tag.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeBlockAsm4MB
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeBlockAsm4MB
	CMPL SI, $0x00010000
	JLT  three_bytes_repeat_emit_encodeBlockAsm4MB

	// Tag 0xf8 followed by a 3-byte little-endian length-1.
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB

	// Tag 0xf4 followed by a 2-byte length-1.
three_bytes_repeat_emit_encodeBlockAsm4MB:
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB

	// Tag 0xf0 followed by a 1-byte length-1.
two_bytes_repeat_emit_encodeBlockAsm4MB:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeBlockAsm4MB
	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB

	// Single tag byte: (length-1)<<2.
one_byte_repeat_emit_encodeBlockAsm4MB:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

	// Copy of at most 64 bytes; SI = dst pointer after the copy.
memmove_repeat_emit_encodeBlockAsm4MB:
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Lengths up to 8 are served by one 8-byte move. The 8-byte load stays
	// inside src here because this path is only reached below the search limit.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm4MB

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm4MB

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm4MB

emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm4MB

	// Copy of more than 64 bytes; SI = dst pointer after the copy.
memmove_long_repeat_emit_encodeBlockAsm4MB:
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// The first and last 32 bytes are loaded up front and stored last; the
	// loop in between stores to 32-byte aligned dst addresses (MOVOA).
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	DECQ  R12
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R13*1), R11
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R9, R13
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_repeat_emit_encodeBlockAsm4MB:
	// s += 5: one byte to reach s+1 plus the 4 bytes already compared.
	// R9 = bytes left in src, R10 = src+s, SI = src+s-repeat.
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R9
	SUBL CX, R9
	LEAQ (DX)(CX*1), R10
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R12 = number of equal bytes, compared 8 at a time, then one at a time.
	XORL R12, R12
	CMPL R9, $0x08
	JL   matchlen_single_repeat_extend_encodeBlockAsm4MB

matchlen_loopback_repeat_extend_encodeBlockAsm4MB:
	MOVQ  (R10)(R12*1), R11
	XORQ  (SI)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_repeat_extend_encodeBlockAsm4MB

	// Index of the lowest differing bit / 8 = equal bytes in this word.
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   repeat_extend_forward_end_encodeBlockAsm4MB

matchlen_loop_repeat_extend_encodeBlockAsm4MB:
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	CMPL R9, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm4MB

matchlen_single_repeat_extend_encodeBlockAsm4MB:
	TESTL R9, R9
	JZ    repeat_extend_forward_end_encodeBlockAsm4MB

matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE  repeat_extend_forward_end_encodeBlockAsm4MB
	LEAL 1(R12), R12
	DECL R9
	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB

repeat_extend_forward_end_encodeBlockAsm4MB:
	// SI = total match length (s - match start), DI = repeat offset.
	// R8 still holds nextEmit as it was before the literals were emitted;
	// when it is 0 the match is written as a regular copy, not a repeat.
	ADDL  R12, CX
	MOVL  CX, SI
	SUBL  DI, SI
	MOVL  16(SP), DI
	TESTL R8, R8
	JZ    repeat_as_copy_encodeBlockAsm4MB

	// emitRepeat
	// R8 = length, SI = length-4. Picks the shortest of the encodings below.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_match_repeat_encodeBlockAsm4MB
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_match_repeat_encodeBlockAsm4MB

cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
	CMPL SI, $0x00000104
	JLT  repeat_three_match_repeat_encodeBlockAsm4MB
	CMPL SI, $0x00010100
	JLT  repeat_four_match_repeat_encodeBlockAsm4MB

	// 5 bytes: 0x1d, 0x00, then 24 bits of (length-4-65536).
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (AX)
	MOVW SI, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

	// 4 bytes: 0x19, 0x00, then 16 bits of (length-4-256).
repeat_four_match_repeat_encodeBlockAsm4MB:
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

	// 3 bytes: 0x15, 0x00, then one byte of (length-8).
repeat_three_match_repeat_encodeBlockAsm4MB:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

	// 2 bytes: (length-4)<<2 | 1, then 0x00.
repeat_two_match_repeat_encodeBlockAsm4MB:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

	// 2 bytes carrying the offset: (length-4)<<2 | 1 | (offset>>8)<<5,
	// then the low offset byte.
repeat_two_offset_match_repeat_encodeBlockAsm4MB:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_as_copy_encodeBlockAsm4MB:
	// emitCopy
	// SI = length, DI = offset. Offsets below 65536 use the 2-byte offset forms.
	CMPL DI, $0x00010000
	JL   two_byte_offset_repeat_as_copy_encodeBlockAsm4MB

four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB:
	CMPL SI, $0x40
	JLE  four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB

	// Tag 0xff + 4-byte offset copies 64 bytes; the rest follows as a repeat
	// when at least 4 bytes remain.
	MOVB $0xff, (AX)
	MOVL DI, 1(AX)
	LEAL -64(SI), SI
	ADDQ $0x05, AX
	CMPL SI, $0x04
	JL   four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB

	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	CMPL SI, $0x00000104
	JLT  repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	CMPL SI, $0x00010100
	JLT  repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (AX)
	MOVW SI, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

	// Unreachable: follows an unconditional JMP and carries no label.
	JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB

	// Tag 3 | (length-1)<<2 followed by the 4-byte offset; nothing is
	// written when no bytes remain.
four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
	TESTL SI, SI
	JZ    repeat_end_emit_encodeBlockAsm4MB
	MOVB  $0x03, BL
	LEAL  -4(BX)(SI*4), SI
	MOVB  SI, (AX)
	MOVL  DI, 1(AX)
	ADDQ  $0x05, AX
	JMP   repeat_end_emit_encodeBlockAsm4MB

two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB

	// Tag 0xee + 2-byte offset copies 60 bytes; the rest follows as a repeat.
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX

	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	CMPL SI, $0x00000104
	JLT  repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	CMPL SI, $0x00010100
	JLT  repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
	LEAL -65536(SI), SI
	MOVL SI, DI
	MOVW $0x001d, (AX)
	MOVW SI, 2(AX)
	SARL $0x10, DI
	MOVB DI, 4(AX)
	ADDQ $0x05, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

	// Unreachable: follows an unconditional JMP and carries no label.
	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB

two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
	// Length below 12 and offset below 2048 fit the 2-byte form:
	// 1 | (length-4)<<2 | (offset>>8)<<5, then the low offset byte.
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
	CMPL DI, $0x00000800
	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm4MB

	// 3-byte form: 2 | (length-1)<<2, then the 2-byte offset.
emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeBlockAsm4MB:
	// Everything up to s has been emitted.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeBlockAsm4MB

no_repeat_found_encodeBlockAsm4MB:
	// Try three candidates in turn against the low 4 bytes of DI:
	// table[hash(s)] at s, table[hash(s+1)] at s+1, table[hash(s+2)] at s+2.
	// table[hash(s+2)] is overwritten with s+2 along the way.
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeBlockAsm4MB
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeBlockAsm4MB
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeBlockAsm4MB

	// No match: continue at the resume position computed at loop entry.
	MOVL 20(SP), CX
	JMP  search_loop_encodeBlockAsm4MB

candidate3_match_encodeBlockAsm4MB:
	ADDL $0x02, CX
	JMP  candidate_match_encodeBlockAsm4MB

candidate2_match_encodeBlockAsm4MB:
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

candidate_match_encodeBlockAsm4MB:
	// CX = s, SI = candidate position. Extend the match backwards while the
	// preceding bytes are equal, s stays above nextEmit and SI stays above 0.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBlockAsm4MB

match_extend_back_loop_encodeBlockAsm4MB:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBlockAsm4MB
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBlockAsm4MB
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBlockAsm4MB
	JMP  match_extend_back_loop_encodeBlockAsm4MB

match_extend_back_end_encodeBlockAsm4MB:
	// Return 0 unless dst pointer + pending literal length + 4 is below the
	// dst limit.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 4(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBlockAsm4MB:
	// Emit src[nextEmit:s] as literals (skipped when empty).
	// R9 = literal length, DI = literal source, R8 = length-1 for the tag.
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeBlockAsm4MB
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeBlockAsm4MB
	CMPL R8, $0x00010000
	JLT  three_bytes_match_emit_encodeBlockAsm4MB

	// Tag 0xf8 followed by a 3-byte little-endian length-1.
	MOVL R8, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW R8, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeBlockAsm4MB

	// Tag 0xf4 followed by a 2-byte length-1.
three_bytes_match_emit_encodeBlockAsm4MB:
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBlockAsm4MB

	// Tag 0xf0 followed by a 1-byte length-1.
two_bytes_match_emit_encodeBlockAsm4MB:
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeBlockAsm4MB
	JMP  memmove_long_match_emit_encodeBlockAsm4MB

	// Single tag byte: (length-1)<<2.
one_byte_match_emit_encodeBlockAsm4MB:
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

	// Copy of at most 64 bytes; R8 = dst pointer after the copy.
memmove_match_emit_encodeBlockAsm4MB:
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	// Lengths up to 8 are served by one 8-byte move. The 8-byte load stays
	// inside src here because this path is only reached below the search limit.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm4MB

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm4MB

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm4MB

emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBlockAsm4MB:
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeBlockAsm4MB

	// Copy of more than 64 bytes; R8 = dst pointer after the copy.
memmove_long_match_emit_encodeBlockAsm4MB:
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	// The first and last 32 bytes are loaded up front and stored last; the
	// loop in between stores to 32-byte aligned dst addresses (MOVOA).
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

emit_literal_done_match_emit_encodeBlockAsm4MB:
match_nolit_loop_encodeBlockAsm4MB:
	// CX = s, SI = candidate position. Record offset = s - candidate, skip
	// the 4 bytes already known to match, then measure the rest.
	// DI = bytes left in src, R8 = src+s, SI = src+candidate.
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R10 = number of equal bytes, compared 8 at a time, then one at a time.
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeBlockAsm4MB

matchlen_loopback_match_nolit_encodeBlockAsm4MB:
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeBlockAsm4MB

	// Index of the lowest differing bit / 8 = equal bytes in this word.
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeBlockAsm4MB

matchlen_loop_match_nolit_encodeBlockAsm4MB:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBlockAsm4MB

matchlen_single_match_nolit_encodeBlockAsm4MB:
	TESTL DI, DI
	JZ    match_nolit_end_encodeBlockAsm4MB

matchlen_single_loopback_match_nolit_encodeBlockAsm4MB:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeBlockAsm4MB
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm4MB

match_nolit_end_encodeBlockAsm4MB:
	// s advances past the match; R10 = full match length (extension + 4),
	// SI = offset; nextEmit = s.
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
	// R10 = length, SI = offset. Offsets below 65536 use the 2-byte offset forms.
	CMPL SI, $0x00010000
	JL   two_byte_offset_match_nolit_encodeBlockAsm4MB

four_bytes_loop_back_match_nolit_encodeBlockAsm4MB:
	CMPL R10, $0x40
	JLE  four_bytes_remain_match_nolit_encodeBlockAsm4MB

	// Tag 0xff + 4-byte offset copies 64 bytes; the rest follows as a repeat
	// when at least 4 bytes remain.
	MOVB $0xff, (AX)
	MOVL SI, 1(AX)
	LEAL -64(R10), R10
	ADDQ $0x05, AX
	CMPL R10, $0x04
	JL   four_bytes_remain_match_nolit_encodeBlockAsm4MB

	// emitRepeat
	// DI = length, R10 = length-4.
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JLE  repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL DI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL SI, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy

cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
	CMPL R10, $0x00000104
	JLT  repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
	CMPL R10, $0x00010100
	JLT  repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

	// Unreachable: follows an unconditional JMP and carries no label.
	JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB

	// Tag 3 | (length-1)<<2 followed by the 4-byte offset; nothing is
	// written when no bytes remain.
four_bytes_remain_match_nolit_encodeBlockAsm4MB:
	TESTL R10, R10
	JZ    match_nolit_emitcopy_end_encodeBlockAsm4MB
	MOVB  $0x03, BL
	LEAL  -4(BX)(R10*4), R10
	MOVB  R10, (AX)
	MOVL  SI, 1(AX)
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_encodeBlockAsm4MB

two_byte_offset_match_nolit_encodeBlockAsm4MB:
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm4MB

	// Tag 0xee + 2-byte offset copies 60 bytes; the rest follows as a repeat.
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
	// DI = length, R10 = length-4.
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JLE  repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL DI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL SI, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	CMPL R10, $0x00000104
	JLT  repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
	CMPL R10, $0x00010100
	JLT  repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
	LEAL -65536(R10), R10
	MOVL R10, SI
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

	// Unreachable: follows an unconditional JMP and carries no label.
	JMP two_byte_offset_match_nolit_encodeBlockAsm4MB

two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
	// Length below 12 and offset below 2048 fit the 2-byte form:
	// 1 | (length-4)<<2 | (offset>>8)<<5, then the low offset byte.
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBlockAsm4MB
	CMPL SI, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeBlockAsm4MB
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB

	// 3-byte form: 2 | (length-1)<<2, then the 2-byte offset.
emit_copy_three_match_nolit_encodeBlockAsm4MB:
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeBlockAsm4MB:
	// Stop searching at the search limit; return 0 once the dst pointer has
	// reached the dst limit. DI = the 8 bytes at s-2.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBlockAsm4MB
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm4MB:
	// Record s-2 under hash(s-2), swap s into table[hash(s)] and test the
	// previous entry: if its 4 bytes equal those at s, a new match starts
	// immediately without any literals in between.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x10, R8
	IMULQ R9, R8
	SHRQ  $0x32, R8
	SHLQ  $0x10, SI
	IMULQ R9, SI
	SHRQ  $0x32, SI
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeBlockAsm4MB
	INCL  CX
	JMP   search_loop_encodeBlockAsm4MB

emit_remainder_encodeBlockAsm4MB:
	// Return 0 unless dst pointer + remaining src bytes + 4 is below the
	// dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 4(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm4MB:
	// Emit src[nextEmit:] as literals (skipped when empty). CX and DX change
	// role from here on: CX = literal source pointer, SI = literal length,
	// DX = length-1 for the tag.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm4MB
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBlockAsm4MB
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBlockAsm4MB
	CMPL DX, $0x00010000
	JLT  three_bytes_emit_remainder_encodeBlockAsm4MB

	// Tag 0xf8 followed by a 3-byte little-endian length-1.
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB

	// Tag 0xf4 followed by a 2-byte length-1.
three_bytes_emit_remainder_encodeBlockAsm4MB:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB

	// Tag 0xf0 followed by a 1-byte length-1.
two_bytes_emit_remainder_encodeBlockAsm4MB:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBlockAsm4MB
	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB

	// Single tag byte: (length-1)<<2.
one_byte_emit_remainder_encodeBlockAsm4MB:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

	// Copy of 1..64 bytes; DX = dst pointer after the copy, BX = length.
memmove_emit_remainder_encodeBlockAsm4MB:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// The literal run ends exactly at the end of src, so unlike the copies
	// inside the search loop this one must not load more than BX bytes:
	// every size class below reads and writes only bytes inside the run.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64

	// First and last byte (the same byte when the length is 1).
emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

	// Two possibly overlapping 4-byte moves: first 4 and last 4 bytes.
emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm4MB:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm4MB

	// Copy of more than 64 bytes; DX = dst pointer after the copy, BX = length.
memmove_long_emit_remainder_encodeBlockAsm4MB:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// The first and last 32 bytes are loaded up front and stored last; the
	// loop in between stores to 32-byte aligned dst addresses (MOVOA).
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm4MB:
	// ret = dst write pointer - dst_base = number of bytes written.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET
2239
// func encodeBlockAsm12B(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst using a 4096-entry (12-bit) hash table kept on the
// stack. Returns the number of bytes written to dst, or 0 when the output
// would reach the destination limit computed in the prologue.
//
// Register roles in the main loop:
//   AX = current dst write pointer
//   CX = s, the current index into src
//   DX = src base pointer
//
// Stack layout:
//   0(SP)  (8 bytes) dst limit pointer: dst_base + len(src) - 9 - len(src)>>5
//   8(SP)  (4 bytes) sLimit = len(src) - 8
//   12(SP) (4 bytes) nextEmit: src index of the first not-yet-emitted literal
//   16(SP) (4 bytes) repeat: offset of the most recent match (starts at 1)
//   20(SP) (4 bytes) nextS: position to resume searching from after a miss
//   24(SP) (16384 bytes) hash table of uint32 src positions
//
// NOTE: this file is generated (see the header at the top of the file). The
// exact-size copy used for the final literals below differs from the other
// generated variants and should be mirrored in the generator.
TEXT ·encodeBlockAsm12B(SB), $16408-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000080, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear the hash table: 0x80 iterations of 128 bytes each.
zero_loop_encodeBlockAsm12B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBlockAsm12B

	// nextEmit = 0; sLimit = len(src)-8; dst limit = dst + len(src)-9-len(src)>>5;
	// s = 1; repeat = 1.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBlockAsm12B:
	// nextS = s + 4 + (s-nextEmit)>>5; stop searching once nextS >= sLimit.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBlockAsm12B
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)

	// Hash the values at s and s+1: keep the low 5 bytes (shift left 24),
	// multiply, and take the top 12 bits as the table index.
	MOVQ  $0x000000cf1bbcdcbb, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x18, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	SHLQ  $0x18, R11
	IMULQ R9, R11
	SHRQ  $0x34, R11

	// SI = candidate for s, R8 = candidate for s+1; store s and s+1 in the table.
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)

	// R10 = table index for the value at s+2.
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x18, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10

	// Repeat check: compare 4 bytes at s+1 with 4 bytes at s+1-repeat.
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeBlockAsm12B

	// Repeat found. DI = base (s+1), R8 = nextEmit, SI = base - repeat.
	LEAL  1(CX), DI
	MOVL  12(SP), R8
	MOVL  DI, SI
	SUBL  16(SP), SI
	JZ    repeat_extend_back_end_encodeBlockAsm12B

	// Extend the repeat backwards while bytes match and base > nextEmit.
repeat_extend_back_loop_encodeBlockAsm12B:
	CMPL DI, R8
	JLE  repeat_extend_back_end_encodeBlockAsm12B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeBlockAsm12B
	LEAL -1(DI), DI
	DECL SI
	JNZ  repeat_extend_back_loop_encodeBlockAsm12B

repeat_extend_back_end_encodeBlockAsm12B:
	// Emit literals src[nextEmit:base]. R9 = length, R10 = source pointer,
	// SI = length-1 used to pick the 1-, 2- or 3-byte literal header.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm12B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeBlockAsm12B
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm12B

two_bytes_repeat_emit_encodeBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeBlockAsm12B
	JMP  memmove_long_repeat_emit_encodeBlockAsm12B

one_byte_repeat_emit_encodeBlockAsm12B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeBlockAsm12B:
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// The 8-byte case copies a full 8 bytes even for shorter literals; the
	// search-loop bound (s+4+skip < len(src)-8) keeps that read inside src.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_repeat_emit_encodeBlockAsm12B:
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm12B

memmove_long_repeat_emit_encodeBlockAsm12B:
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// Head and tail 32 bytes are loaded up front and stored last (unaligned);
	// the middle is copied in 32-byte steps with aligned stores to dst.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	DECQ  R12
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R13*1), R11
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R9, R13
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_repeat_emit_encodeBlockAsm12B:
	// Skip the 4 matched bytes (s += 5 puts s at base+4 before any back
	// extension) and extend the repeat forwards.
	// R9 = bytes left in src, R10 = &src[s], SI = &src[s-repeat].
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R9
	SUBL CX, R9
	LEAQ (DX)(CX*1), R10
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R12 = number of matching bytes; compares 8 bytes at a time, then
	// byte by byte for the last (<8) bytes.
	XORL R12, R12
	CMPL R9, $0x08
	JL   matchlen_single_repeat_extend_encodeBlockAsm12B

matchlen_loopback_repeat_extend_encodeBlockAsm12B:
	MOVQ  (R10)(R12*1), R11
	XORQ  (SI)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_repeat_extend_encodeBlockAsm12B
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   repeat_extend_forward_end_encodeBlockAsm12B

matchlen_loop_repeat_extend_encodeBlockAsm12B:
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	CMPL R9, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm12B

matchlen_single_repeat_extend_encodeBlockAsm12B:
	TESTL R9, R9
	JZ    repeat_extend_forward_end_encodeBlockAsm12B

matchlen_single_loopback_repeat_extend_encodeBlockAsm12B:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE  repeat_extend_forward_end_encodeBlockAsm12B
	LEAL 1(R12), R12
	DECL R9
	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm12B

repeat_extend_forward_end_encodeBlockAsm12B:
	// SI = match length (s - base), DI = repeat offset. R8 still holds the
	// nextEmit value read before the literals were emitted; when it is zero
	// the match is emitted as a regular copy instead of a repeat.
	ADDL  R12, CX
	MOVL  CX, SI
	SUBL  DI, SI
	MOVL  16(SP), DI
	TESTL R8, R8
	JZ    repeat_as_copy_encodeBlockAsm12B

	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_match_repeat_encodeBlockAsm12B
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_match_repeat_encodeBlockAsm12B

cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
	CMPL SI, $0x00000104
	JLT  repeat_three_match_repeat_encodeBlockAsm12B
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_three_match_repeat_encodeBlockAsm12B:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_match_repeat_encodeBlockAsm12B:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_offset_match_repeat_encodeBlockAsm12B:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_as_copy_encodeBlockAsm12B:
	// emitCopy
	// SI = length, DI = offset.
two_byte_offset_repeat_as_copy_encodeBlockAsm12B:
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B

	// Length > 64: emit a 3-byte copy (tag 0xee, 16-bit offset), subtract 60
	// from the length and encode the rest as a repeat.
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX

	// emitRepeat
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	CMPL SI, $0x00000104
	JLT  repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

	// Not reachable (follows an unconditional JMP); kept as generated.
	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B

two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B:
	// Length <= 64: 2-byte copy when length < 12 and offset < 2048,
	// otherwise a 3-byte copy with a 16-bit offset.
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm12B
	CMPL DI, $0x00000800
	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm12B
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

emit_copy_three_repeat_as_copy_encodeBlockAsm12B:
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeBlockAsm12B:
	// nextEmit = s, then continue searching.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeBlockAsm12B

no_repeat_found_encodeBlockAsm12B:
	// Test the candidates for s, s+1 and s+2 in turn by comparing 4 bytes.
	// On a miss at all three, resume at nextS.
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeBlockAsm12B
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeBlockAsm12B
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeBlockAsm12B
	MOVL 20(SP), CX
	JMP  search_loop_encodeBlockAsm12B

candidate3_match_encodeBlockAsm12B:
	ADDL $0x02, CX
	JMP  candidate_match_encodeBlockAsm12B

candidate2_match_encodeBlockAsm12B:
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

candidate_match_encodeBlockAsm12B:
	// CX = s, SI = candidate. Extend the match backwards while bytes match,
	// s > nextEmit and candidate > 0.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBlockAsm12B

match_extend_back_loop_encodeBlockAsm12B:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBlockAsm12B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBlockAsm12B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBlockAsm12B
	JMP  match_extend_back_loop_encodeBlockAsm12B

match_extend_back_end_encodeBlockAsm12B:
	// Return 0 if dst + 3 + pending literal length reaches the dst limit.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBlockAsm12B:
	// Emit literals src[nextEmit:s]. R9 = length, DI = source pointer,
	// R8 = length-1 used to pick the literal header size.
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm12B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeBlockAsm12B
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBlockAsm12B

two_bytes_match_emit_encodeBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeBlockAsm12B
	JMP  memmove_long_match_emit_encodeBlockAsm12B

one_byte_match_emit_encodeBlockAsm12B:
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBlockAsm12B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	// As in the repeat path, the 8-byte case may over-copy; the search-loop
	// bound keeps the read inside src.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBlockAsm12B:
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeBlockAsm12B

memmove_long_match_emit_encodeBlockAsm12B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

emit_literal_done_match_emit_encodeBlockAsm12B:
match_nolit_loop_encodeBlockAsm12B:
	// repeat = s - candidate; skip the 4 bytes already known to match and
	// measure how far the match extends.
	// DI = bytes left in src, R8 = &src[s], SI = &src[candidate].
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeBlockAsm12B

matchlen_loopback_match_nolit_encodeBlockAsm12B:
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeBlockAsm12B
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeBlockAsm12B

matchlen_loop_match_nolit_encodeBlockAsm12B:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBlockAsm12B

matchlen_single_match_nolit_encodeBlockAsm12B:
	TESTL DI, DI
	JZ    match_nolit_end_encodeBlockAsm12B

matchlen_single_loopback_match_nolit_encodeBlockAsm12B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeBlockAsm12B
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm12B

match_nolit_end_encodeBlockAsm12B:
	// s += matched; R10 = total match length (matched + 4); SI = offset;
	// nextEmit = s.
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeBlockAsm12B:
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm12B
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JLE  repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
	CMPL DI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
	CMPL SI, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
	CMPL R10, $0x00000104
	JLT  repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

	// Not reachable (follows an unconditional JMP); kept as generated.
	JMP two_byte_offset_match_nolit_encodeBlockAsm12B

two_byte_offset_short_match_nolit_encodeBlockAsm12B:
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBlockAsm12B
	CMPL SI, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeBlockAsm12B
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

emit_copy_three_match_nolit_encodeBlockAsm12B:
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeBlockAsm12B:
	// Finish if s >= sLimit; return 0 if dst has reached its limit.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBlockAsm12B
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm12B:
	// Store s-2 under the hash of the value at s-2, look up and replace the
	// entry for the value at s, and loop straight into another match if the
	// candidate's 4 bytes equal those at s.
	MOVQ  $0x000000cf1bbcdcbb, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x18, R8
	IMULQ R9, R8
	SHRQ  $0x34, R8
	SHLQ  $0x18, SI
	IMULQ R9, SI
	SHRQ  $0x34, SI
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeBlockAsm12B
	INCL  CX
	JMP   search_loop_encodeBlockAsm12B

emit_remainder_encodeBlockAsm12B:
	// Return 0 if dst + 3 + remaining literal length reaches the dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm12B:
	// Emit src[nextEmit:] as literals. SI = length, CX = source pointer,
	// DX = length-1 used to pick the literal header size (the src base in DX
	// is no longer needed).
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm12B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBlockAsm12B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm12B

two_bytes_emit_remainder_encodeBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBlockAsm12B
	JMP  memmove_long_emit_remainder_encodeBlockAsm12B

one_byte_emit_remainder_encodeBlockAsm12B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBlockAsm12B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// The literals copied here run to the very end of src, so there is no
	// slack after them: every case below touches exactly BX bytes (1..64)
	// using overlapping head/tail moves and never reads past src+len(src).
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2:
	// First and last byte (the same byte when BX == 1).
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), DI
	MOVB SI, (AX)
	MOVB DI, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), DI
	MOVW SI, (AX)
	MOVB DI, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7:
	// First and last 4 bytes, overlapping when BX < 8.
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), DI
	MOVL SI, (AX)
	MOVL DI, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm12B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm12B

memmove_long_emit_remainder_encodeBlockAsm12B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm12B:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET
3086
3087// func encodeBlockAsm10B(dst []byte, src []byte) int
3088// Requires: SSE2
3089TEXT ·encodeBlockAsm10B(SB), $4120-56
3090	MOVQ dst_base+0(FP), AX
3091	MOVQ $0x00000020, CX
3092	LEAQ 24(SP), DX
3093	PXOR X0, X0
3094
3095zero_loop_encodeBlockAsm10B:
3096	MOVOU X0, (DX)
3097	MOVOU X0, 16(DX)
3098	MOVOU X0, 32(DX)
3099	MOVOU X0, 48(DX)
3100	MOVOU X0, 64(DX)
3101	MOVOU X0, 80(DX)
3102	MOVOU X0, 96(DX)
3103	MOVOU X0, 112(DX)
3104	ADDQ  $0x80, DX
3105	DECQ  CX
3106	JNZ   zero_loop_encodeBlockAsm10B
3107	MOVL  $0x00000000, 12(SP)
3108	MOVQ  src_len+32(FP), CX
3109	LEAQ  -9(CX), DX
3110	LEAQ  -8(CX), SI
3111	MOVL  SI, 8(SP)
3112	SHRQ  $0x05, CX
3113	SUBL  CX, DX
3114	LEAQ  (AX)(DX*1), DX
3115	MOVQ  DX, (SP)
3116	MOVL  $0x00000001, CX
3117	MOVL  CX, 16(SP)
3118	MOVQ  src_base+24(FP), DX
3119
3120search_loop_encodeBlockAsm10B:
3121	MOVL  CX, SI
3122	SUBL  12(SP), SI
3123	SHRL  $0x05, SI
3124	LEAL  4(CX)(SI*1), SI
3125	CMPL  SI, 8(SP)
3126	JGE   emit_remainder_encodeBlockAsm10B
3127	MOVQ  (DX)(CX*1), DI
3128	MOVL  SI, 20(SP)
3129	MOVQ  $0x9e3779b1, R9
3130	MOVQ  DI, R10
3131	MOVQ  DI, R11
3132	SHRQ  $0x08, R11
3133	SHLQ  $0x20, R10
3134	IMULQ R9, R10
3135	SHRQ  $0x36, R10
3136	SHLQ  $0x20, R11
3137	IMULQ R9, R11
3138	SHRQ  $0x36, R11
3139	MOVL  24(SP)(R10*4), SI
3140	MOVL  24(SP)(R11*4), R8
3141	MOVL  CX, 24(SP)(R10*4)
3142	LEAL  1(CX), R10
3143	MOVL  R10, 24(SP)(R11*4)
3144	MOVQ  DI, R10
3145	SHRQ  $0x10, R10
3146	SHLQ  $0x20, R10
3147	IMULQ R9, R10
3148	SHRQ  $0x36, R10
3149	MOVL  CX, R9
3150	SUBL  16(SP), R9
3151	MOVL  1(DX)(R9*1), R11
3152	MOVQ  DI, R9
3153	SHRQ  $0x08, R9
3154	CMPL  R9, R11
3155	JNE   no_repeat_found_encodeBlockAsm10B
3156	LEAL  1(CX), DI
3157	MOVL  12(SP), R8
3158	MOVL  DI, SI
3159	SUBL  16(SP), SI
3160	JZ    repeat_extend_back_end_encodeBlockAsm10B
3161
3162repeat_extend_back_loop_encodeBlockAsm10B:
3163	CMPL DI, R8
3164	JLE  repeat_extend_back_end_encodeBlockAsm10B
3165	MOVB -1(DX)(SI*1), BL
3166	MOVB -1(DX)(DI*1), R9
3167	CMPB BL, R9
3168	JNE  repeat_extend_back_end_encodeBlockAsm10B
3169	LEAL -1(DI), DI
3170	DECL SI
3171	JNZ  repeat_extend_back_loop_encodeBlockAsm10B
3172
3173repeat_extend_back_end_encodeBlockAsm10B:
3174	MOVL 12(SP), SI
3175	CMPL SI, DI
3176	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm10B
3177	MOVL DI, R9
3178	MOVL DI, 12(SP)
3179	LEAQ (DX)(SI*1), R10
3180	SUBL SI, R9
3181	LEAL -1(R9), SI
3182	CMPL SI, $0x3c
3183	JLT  one_byte_repeat_emit_encodeBlockAsm10B
3184	CMPL SI, $0x00000100
3185	JLT  two_bytes_repeat_emit_encodeBlockAsm10B
3186	MOVB $0xf4, (AX)
3187	MOVW SI, 1(AX)
3188	ADDQ $0x03, AX
3189	JMP  memmove_long_repeat_emit_encodeBlockAsm10B
3190
3191two_bytes_repeat_emit_encodeBlockAsm10B:
3192	MOVB $0xf0, (AX)
3193	MOVB SI, 1(AX)
3194	ADDQ $0x02, AX
3195	CMPL SI, $0x40
3196	JL   memmove_repeat_emit_encodeBlockAsm10B
3197	JMP  memmove_long_repeat_emit_encodeBlockAsm10B
3198
3199one_byte_repeat_emit_encodeBlockAsm10B:
3200	SHLB $0x02, SI
3201	MOVB SI, (AX)
3202	ADDQ $0x01, AX
3203
3204memmove_repeat_emit_encodeBlockAsm10B:
3205	LEAQ (AX)(R9*1), SI
3206
3207	// genMemMoveShort
3208	CMPQ R9, $0x08
3209	JLE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8
3210	CMPQ R9, $0x10
3211	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
3212	CMPQ R9, $0x20
3213	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
3214	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64
3215
3216emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8:
3217	MOVQ (R10), R11
3218	MOVQ R11, (AX)
3219	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm10B
3220
3221emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
3222	MOVQ (R10), R11
3223	MOVQ -8(R10)(R9*1), R10
3224	MOVQ R11, (AX)
3225	MOVQ R10, -8(AX)(R9*1)
3226	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm10B
3227
3228emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
3229	MOVOU (R10), X0
3230	MOVOU -16(R10)(R9*1), X1
3231	MOVOU X0, (AX)
3232	MOVOU X1, -16(AX)(R9*1)
3233	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm10B
3234
3235emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
3236	MOVOU (R10), X0
3237	MOVOU 16(R10), X1
3238	MOVOU -32(R10)(R9*1), X2
3239	MOVOU -16(R10)(R9*1), X3
3240	MOVOU X0, (AX)
3241	MOVOU X1, 16(AX)
3242	MOVOU X2, -32(AX)(R9*1)
3243	MOVOU X3, -16(AX)(R9*1)
3244
3245memmove_end_copy_repeat_emit_encodeBlockAsm10B:
3246	MOVQ SI, AX
3247	JMP  emit_literal_done_repeat_emit_encodeBlockAsm10B
3248
3249memmove_long_repeat_emit_encodeBlockAsm10B:
3250	LEAQ (AX)(R9*1), SI
3251
3252	// genMemMoveLong
3253	MOVOU (R10), X0
3254	MOVOU 16(R10), X1
3255	MOVOU -32(R10)(R9*1), X2
3256	MOVOU -16(R10)(R9*1), X3
3257	MOVQ  R9, R12
3258	SHRQ  $0x05, R12
3259	MOVQ  AX, R11
3260	ANDL  $0x0000001f, R11
3261	MOVQ  $0x00000040, R13
3262	SUBQ  R11, R13
3263	DECQ  R12
3264	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
3265	LEAQ  -32(R10)(R13*1), R11
3266	LEAQ  -32(AX)(R13*1), R14
3267
3268emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
3269	MOVOU (R11), X4
3270	MOVOU 16(R11), X5
3271	MOVOA X4, (R14)
3272	MOVOA X5, 16(R14)
3273	ADDQ  $0x20, R14
3274	ADDQ  $0x20, R11
3275	ADDQ  $0x20, R13
3276	DECQ  R12
3277	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back
3278
3279emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
3280	MOVOU -32(R10)(R13*1), X4
3281	MOVOU -16(R10)(R13*1), X5
3282	MOVOA X4, -32(AX)(R13*1)
3283	MOVOA X5, -16(AX)(R13*1)
3284	ADDQ  $0x20, R13
3285	CMPQ  R9, R13
3286	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
3287	MOVOU X0, (AX)
3288	MOVOU X1, 16(AX)
3289	MOVOU X2, -32(AX)(R9*1)
3290	MOVOU X3, -16(AX)(R9*1)
3291	MOVQ  SI, AX
3292
3293emit_literal_done_repeat_emit_encodeBlockAsm10B:
3294	ADDL $0x05, CX
3295	MOVL CX, SI
3296	SUBL 16(SP), SI
3297	MOVQ src_len+32(FP), R9
3298	SUBL CX, R9
3299	LEAQ (DX)(CX*1), R10
3300	LEAQ (DX)(SI*1), SI
3301
3302	// matchLen
3303	XORL R12, R12
3304	CMPL R9, $0x08
3305	JL   matchlen_single_repeat_extend_encodeBlockAsm10B
3306
3307matchlen_loopback_repeat_extend_encodeBlockAsm10B:
3308	MOVQ  (R10)(R12*1), R11
3309	XORQ  (SI)(R12*1), R11
3310	TESTQ R11, R11
3311	JZ    matchlen_loop_repeat_extend_encodeBlockAsm10B
3312	BSFQ  R11, R11
3313	SARQ  $0x03, R11
3314	LEAL  (R12)(R11*1), R12
3315	JMP   repeat_extend_forward_end_encodeBlockAsm10B
3316
3317matchlen_loop_repeat_extend_encodeBlockAsm10B:
3318	LEAL -8(R9), R9
3319	LEAL 8(R12), R12
3320	CMPL R9, $0x08
3321	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm10B
3322
3323matchlen_single_repeat_extend_encodeBlockAsm10B:
3324	TESTL R9, R9
3325	JZ    repeat_extend_forward_end_encodeBlockAsm10B
3326
3327matchlen_single_loopback_repeat_extend_encodeBlockAsm10B:
3328	MOVB (R10)(R12*1), R11
3329	CMPB (SI)(R12*1), R11
3330	JNE  repeat_extend_forward_end_encodeBlockAsm10B
3331	LEAL 1(R12), R12
3332	DECL R9
3333	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm10B
3334
3335repeat_extend_forward_end_encodeBlockAsm10B:
3336	ADDL  R12, CX
3337	MOVL  CX, SI
3338	SUBL  DI, SI
3339	MOVL  16(SP), DI
3340	TESTL R8, R8
3341	JZ    repeat_as_copy_encodeBlockAsm10B
3342
3343	// emitRepeat
3344	MOVL SI, R8
3345	LEAL -4(SI), SI
3346	CMPL R8, $0x08
3347	JLE  repeat_two_match_repeat_encodeBlockAsm10B
3348	CMPL R8, $0x0c
3349	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
3350	CMPL DI, $0x00000800
3351	JLT  repeat_two_offset_match_repeat_encodeBlockAsm10B
3352
3353cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
3354	CMPL SI, $0x00000104
3355	JLT  repeat_three_match_repeat_encodeBlockAsm10B
3356	LEAL -256(SI), SI
3357	MOVW $0x0019, (AX)
3358	MOVW SI, 2(AX)
3359	ADDQ $0x04, AX
3360	JMP  repeat_end_emit_encodeBlockAsm10B
3361
3362repeat_three_match_repeat_encodeBlockAsm10B:
3363	LEAL -4(SI), SI
3364	MOVW $0x0015, (AX)
3365	MOVB SI, 2(AX)
3366	ADDQ $0x03, AX
3367	JMP  repeat_end_emit_encodeBlockAsm10B
3368
3369repeat_two_match_repeat_encodeBlockAsm10B:
3370	SHLL $0x02, SI
3371	ORL  $0x01, SI
3372	MOVW SI, (AX)
3373	ADDQ $0x02, AX
3374	JMP  repeat_end_emit_encodeBlockAsm10B
3375
3376repeat_two_offset_match_repeat_encodeBlockAsm10B:
3377	XORQ R8, R8
3378	LEAL 1(R8)(SI*4), SI
3379	MOVB DI, 1(AX)
3380	SARL $0x08, DI
3381	SHLL $0x05, DI
3382	ORL  DI, SI
3383	MOVB SI, (AX)
3384	ADDQ $0x02, AX
3385	JMP  repeat_end_emit_encodeBlockAsm10B
3386
3387repeat_as_copy_encodeBlockAsm10B:
3388	// emitCopy
3389two_byte_offset_repeat_as_copy_encodeBlockAsm10B:
3390	CMPL SI, $0x40
3391	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
3392	MOVB $0xee, (AX)
3393	MOVW DI, 1(AX)
3394	LEAL -60(SI), SI
3395	ADDQ $0x03, AX
3396
3397	// emitRepeat
3398	MOVL SI, R8
3399	LEAL -4(SI), SI
3400	CMPL R8, $0x08
3401	JLE  repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
3402	CMPL R8, $0x0c
3403	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
3404	CMPL DI, $0x00000800
3405	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
3406
3407cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
3408	CMPL SI, $0x00000104
3409	JLT  repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
3410	LEAL -256(SI), SI
3411	MOVW $0x0019, (AX)
3412	MOVW SI, 2(AX)
3413	ADDQ $0x04, AX
3414	JMP  repeat_end_emit_encodeBlockAsm10B
3415
3416repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
3417	LEAL -4(SI), SI
3418	MOVW $0x0015, (AX)
3419	MOVB SI, 2(AX)
3420	ADDQ $0x03, AX
3421	JMP  repeat_end_emit_encodeBlockAsm10B
3422
3423repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
3424	SHLL $0x02, SI
3425	ORL  $0x01, SI
3426	MOVW SI, (AX)
3427	ADDQ $0x02, AX
3428	JMP  repeat_end_emit_encodeBlockAsm10B
3429
3430repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
3431	XORQ R8, R8
3432	LEAL 1(R8)(SI*4), SI
3433	MOVB DI, 1(AX)
3434	SARL $0x08, DI
3435	SHLL $0x05, DI
3436	ORL  DI, SI
3437	MOVB SI, (AX)
3438	ADDQ $0x02, AX
3439	JMP  repeat_end_emit_encodeBlockAsm10B
3440	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B
3441
3442two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
3443	CMPL SI, $0x0c
3444	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm10B
3445	CMPL DI, $0x00000800
3446	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm10B
3447	MOVB $0x01, BL
3448	LEAL -16(BX)(SI*4), SI
3449	MOVB DI, 1(AX)
3450	SHRL $0x08, DI
3451	SHLL $0x05, DI
3452	ORL  DI, SI
3453	MOVB SI, (AX)
3454	ADDQ $0x02, AX
3455	JMP  repeat_end_emit_encodeBlockAsm10B
3456
3457emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
3458	MOVB $0x02, BL
3459	LEAL -4(BX)(SI*4), SI
3460	MOVB SI, (AX)
3461	MOVW DI, 1(AX)
3462	ADDQ $0x03, AX
3463
3464repeat_end_emit_encodeBlockAsm10B:
3465	MOVL CX, 12(SP)
3466	JMP  search_loop_encodeBlockAsm10B
3467
3468no_repeat_found_encodeBlockAsm10B:
3469	CMPL (DX)(SI*1), DI
3470	JEQ  candidate_match_encodeBlockAsm10B
3471	SHRQ $0x08, DI
3472	MOVL 24(SP)(R10*4), SI
3473	LEAL 2(CX), R9
3474	CMPL (DX)(R8*1), DI
3475	JEQ  candidate2_match_encodeBlockAsm10B
3476	MOVL R9, 24(SP)(R10*4)
3477	SHRQ $0x08, DI
3478	CMPL (DX)(SI*1), DI
3479	JEQ  candidate3_match_encodeBlockAsm10B
3480	MOVL 20(SP), CX
3481	JMP  search_loop_encodeBlockAsm10B
3482
3483candidate3_match_encodeBlockAsm10B:
3484	ADDL $0x02, CX
3485	JMP  candidate_match_encodeBlockAsm10B
3486
3487candidate2_match_encodeBlockAsm10B:
3488	MOVL R9, 24(SP)(R10*4)
3489	INCL CX
3490	MOVL R8, SI
3491
3492candidate_match_encodeBlockAsm10B:
3493	MOVL  12(SP), DI
3494	TESTL SI, SI
3495	JZ    match_extend_back_end_encodeBlockAsm10B
3496
3497match_extend_back_loop_encodeBlockAsm10B:
3498	CMPL CX, DI
3499	JLE  match_extend_back_end_encodeBlockAsm10B
3500	MOVB -1(DX)(SI*1), BL
3501	MOVB -1(DX)(CX*1), R8
3502	CMPB BL, R8
3503	JNE  match_extend_back_end_encodeBlockAsm10B
3504	LEAL -1(CX), CX
3505	DECL SI
3506	JZ   match_extend_back_end_encodeBlockAsm10B
3507	JMP  match_extend_back_loop_encodeBlockAsm10B
3508
3509match_extend_back_end_encodeBlockAsm10B:
3510	MOVL CX, DI
3511	SUBL 12(SP), DI
3512	LEAQ 3(AX)(DI*1), DI
3513	CMPQ DI, (SP)
3514	JL   match_dst_size_check_encodeBlockAsm10B
3515	MOVQ $0x00000000, ret+48(FP)
3516	RET
3517
3518match_dst_size_check_encodeBlockAsm10B:
3519	MOVL CX, DI
3520	MOVL 12(SP), R8
3521	CMPL R8, DI
3522	JEQ  emit_literal_done_match_emit_encodeBlockAsm10B
3523	MOVL DI, R9
3524	MOVL DI, 12(SP)
3525	LEAQ (DX)(R8*1), DI
3526	SUBL R8, R9
3527	LEAL -1(R9), R8
3528	CMPL R8, $0x3c
3529	JLT  one_byte_match_emit_encodeBlockAsm10B
3530	CMPL R8, $0x00000100
3531	JLT  two_bytes_match_emit_encodeBlockAsm10B
3532	MOVB $0xf4, (AX)
3533	MOVW R8, 1(AX)
3534	ADDQ $0x03, AX
3535	JMP  memmove_long_match_emit_encodeBlockAsm10B
3536
3537two_bytes_match_emit_encodeBlockAsm10B:
3538	MOVB $0xf0, (AX)
3539	MOVB R8, 1(AX)
3540	ADDQ $0x02, AX
3541	CMPL R8, $0x40
3542	JL   memmove_match_emit_encodeBlockAsm10B
3543	JMP  memmove_long_match_emit_encodeBlockAsm10B
3544
3545one_byte_match_emit_encodeBlockAsm10B:
3546	SHLB $0x02, R8
3547	MOVB R8, (AX)
3548	ADDQ $0x01, AX
3549
3550memmove_match_emit_encodeBlockAsm10B:
3551	LEAQ (AX)(R9*1), R8
3552
3553	// genMemMoveShort
3554	CMPQ R9, $0x08
3555	JLE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8
3556	CMPQ R9, $0x10
3557	JBE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
3558	CMPQ R9, $0x20
3559	JBE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
3560	JMP  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64
3561
3562emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8:
3563	MOVQ (DI), R10
3564	MOVQ R10, (AX)
3565	JMP  memmove_end_copy_match_emit_encodeBlockAsm10B
3566
3567emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
3568	MOVQ (DI), R10
3569	MOVQ -8(DI)(R9*1), DI
3570	MOVQ R10, (AX)
3571	MOVQ DI, -8(AX)(R9*1)
3572	JMP  memmove_end_copy_match_emit_encodeBlockAsm10B
3573
3574emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
3575	MOVOU (DI), X0
3576	MOVOU -16(DI)(R9*1), X1
3577	MOVOU X0, (AX)
3578	MOVOU X1, -16(AX)(R9*1)
3579	JMP   memmove_end_copy_match_emit_encodeBlockAsm10B
3580
3581emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
3582	MOVOU (DI), X0
3583	MOVOU 16(DI), X1
3584	MOVOU -32(DI)(R9*1), X2
3585	MOVOU -16(DI)(R9*1), X3
3586	MOVOU X0, (AX)
3587	MOVOU X1, 16(AX)
3588	MOVOU X2, -32(AX)(R9*1)
3589	MOVOU X3, -16(AX)(R9*1)
3590
3591memmove_end_copy_match_emit_encodeBlockAsm10B:
3592	MOVQ R8, AX
3593	JMP  emit_literal_done_match_emit_encodeBlockAsm10B
3594
3595memmove_long_match_emit_encodeBlockAsm10B:
3596	LEAQ (AX)(R9*1), R8
3597
3598	// genMemMoveLong
3599	MOVOU (DI), X0
3600	MOVOU 16(DI), X1
3601	MOVOU -32(DI)(R9*1), X2
3602	MOVOU -16(DI)(R9*1), X3
3603	MOVQ  R9, R11
3604	SHRQ  $0x05, R11
3605	MOVQ  AX, R10
3606	ANDL  $0x0000001f, R10
3607	MOVQ  $0x00000040, R12
3608	SUBQ  R10, R12
3609	DECQ  R11
3610	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
3611	LEAQ  -32(DI)(R12*1), R10
3612	LEAQ  -32(AX)(R12*1), R13
3613
3614emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
3615	MOVOU (R10), X4
3616	MOVOU 16(R10), X5
3617	MOVOA X4, (R13)
3618	MOVOA X5, 16(R13)
3619	ADDQ  $0x20, R13
3620	ADDQ  $0x20, R10
3621	ADDQ  $0x20, R12
3622	DECQ  R11
3623	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back
3624
3625emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
3626	MOVOU -32(DI)(R12*1), X4
3627	MOVOU -16(DI)(R12*1), X5
3628	MOVOA X4, -32(AX)(R12*1)
3629	MOVOA X5, -16(AX)(R12*1)
3630	ADDQ  $0x20, R12
3631	CMPQ  R9, R12
3632	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
3633	MOVOU X0, (AX)
3634	MOVOU X1, 16(AX)
3635	MOVOU X2, -32(AX)(R9*1)
3636	MOVOU X3, -16(AX)(R9*1)
3637	MOVQ  R8, AX
3638
3639emit_literal_done_match_emit_encodeBlockAsm10B:
3640match_nolit_loop_encodeBlockAsm10B:
3641	MOVL CX, DI
3642	SUBL SI, DI
3643	MOVL DI, 16(SP)
3644	ADDL $0x04, CX
3645	ADDL $0x04, SI
3646	MOVQ src_len+32(FP), DI
3647	SUBL CX, DI
3648	LEAQ (DX)(CX*1), R8
3649	LEAQ (DX)(SI*1), SI
3650
3651	// matchLen
3652	XORL R10, R10
3653	CMPL DI, $0x08
3654	JL   matchlen_single_match_nolit_encodeBlockAsm10B
3655
3656matchlen_loopback_match_nolit_encodeBlockAsm10B:
3657	MOVQ  (R8)(R10*1), R9
3658	XORQ  (SI)(R10*1), R9
3659	TESTQ R9, R9
3660	JZ    matchlen_loop_match_nolit_encodeBlockAsm10B
3661	BSFQ  R9, R9
3662	SARQ  $0x03, R9
3663	LEAL  (R10)(R9*1), R10
3664	JMP   match_nolit_end_encodeBlockAsm10B
3665
3666matchlen_loop_match_nolit_encodeBlockAsm10B:
3667	LEAL -8(DI), DI
3668	LEAL 8(R10), R10
3669	CMPL DI, $0x08
3670	JGE  matchlen_loopback_match_nolit_encodeBlockAsm10B
3671
3672matchlen_single_match_nolit_encodeBlockAsm10B:
3673	TESTL DI, DI
3674	JZ    match_nolit_end_encodeBlockAsm10B
3675
3676matchlen_single_loopback_match_nolit_encodeBlockAsm10B:
3677	MOVB (R8)(R10*1), R9
3678	CMPB (SI)(R10*1), R9
3679	JNE  match_nolit_end_encodeBlockAsm10B
3680	LEAL 1(R10), R10
3681	DECL DI
3682	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm10B
3683
3684match_nolit_end_encodeBlockAsm10B:
3685	ADDL R10, CX
3686	MOVL 16(SP), SI
3687	ADDL $0x04, R10
3688	MOVL CX, 12(SP)
3689
3690	// emitCopy
3691two_byte_offset_match_nolit_encodeBlockAsm10B:
3692	CMPL R10, $0x40
3693	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm10B
3694	MOVB $0xee, (AX)
3695	MOVW SI, 1(AX)
3696	LEAL -60(R10), R10
3697	ADDQ $0x03, AX
3698
3699	// emitRepeat
3700	MOVL R10, DI
3701	LEAL -4(R10), R10
3702	CMPL DI, $0x08
3703	JLE  repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
3704	CMPL DI, $0x0c
3705	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
3706	CMPL SI, $0x00000800
3707	JLT  repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
3708
3709cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
3710	CMPL R10, $0x00000104
3711	JLT  repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
3712	LEAL -256(R10), R10
3713	MOVW $0x0019, (AX)
3714	MOVW R10, 2(AX)
3715	ADDQ $0x04, AX
3716	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
3717
3718repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
3719	LEAL -4(R10), R10
3720	MOVW $0x0015, (AX)
3721	MOVB R10, 2(AX)
3722	ADDQ $0x03, AX
3723	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
3724
3725repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
3726	SHLL $0x02, R10
3727	ORL  $0x01, R10
3728	MOVW R10, (AX)
3729	ADDQ $0x02, AX
3730	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
3731
3732repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
3733	XORQ DI, DI
3734	LEAL 1(DI)(R10*4), R10
3735	MOVB SI, 1(AX)
3736	SARL $0x08, SI
3737	SHLL $0x05, SI
3738	ORL  SI, R10
3739	MOVB R10, (AX)
3740	ADDQ $0x02, AX
3741	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
3742	JMP two_byte_offset_match_nolit_encodeBlockAsm10B
3743
3744two_byte_offset_short_match_nolit_encodeBlockAsm10B:
3745	CMPL R10, $0x0c
3746	JGE  emit_copy_three_match_nolit_encodeBlockAsm10B
3747	CMPL SI, $0x00000800
3748	JGE  emit_copy_three_match_nolit_encodeBlockAsm10B
3749	MOVB $0x01, BL
3750	LEAL -16(BX)(R10*4), R10
3751	MOVB SI, 1(AX)
3752	SHRL $0x08, SI
3753	SHLL $0x05, SI
3754	ORL  SI, R10
3755	MOVB R10, (AX)
3756	ADDQ $0x02, AX
3757	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
3758
3759emit_copy_three_match_nolit_encodeBlockAsm10B:
3760	MOVB $0x02, BL
3761	LEAL -4(BX)(R10*4), R10
3762	MOVB R10, (AX)
3763	MOVW SI, 1(AX)
3764	ADDQ $0x03, AX
3765
3766match_nolit_emitcopy_end_encodeBlockAsm10B:
3767	CMPL CX, 8(SP)
3768	JGE  emit_remainder_encodeBlockAsm10B
3769	MOVQ -2(DX)(CX*1), DI
3770	CMPQ AX, (SP)
3771	JL   match_nolit_dst_ok_encodeBlockAsm10B
3772	MOVQ $0x00000000, ret+48(FP)
3773	RET
3774
3775match_nolit_dst_ok_encodeBlockAsm10B:
3776	MOVQ  $0x9e3779b1, R9
3777	MOVQ  DI, R8
3778	SHRQ  $0x10, DI
3779	MOVQ  DI, SI
3780	SHLQ  $0x20, R8
3781	IMULQ R9, R8
3782	SHRQ  $0x36, R8
3783	SHLQ  $0x20, SI
3784	IMULQ R9, SI
3785	SHRQ  $0x36, SI
3786	LEAL  -2(CX), R9
3787	LEAQ  24(SP)(SI*4), R10
3788	MOVL  (R10), SI
3789	MOVL  R9, 24(SP)(R8*4)
3790	MOVL  CX, (R10)
3791	CMPL  (DX)(SI*1), DI
3792	JEQ   match_nolit_loop_encodeBlockAsm10B
3793	INCL  CX
3794	JMP   search_loop_encodeBlockAsm10B
3795
3796emit_remainder_encodeBlockAsm10B:
3797	MOVQ src_len+32(FP), CX
3798	SUBL 12(SP), CX
3799	LEAQ 3(AX)(CX*1), CX
3800	CMPQ CX, (SP)
3801	JL   emit_remainder_ok_encodeBlockAsm10B
3802	MOVQ $0x00000000, ret+48(FP)
3803	RET
3804
3805emit_remainder_ok_encodeBlockAsm10B:
3806	MOVQ src_len+32(FP), CX
3807	MOVL 12(SP), BX
3808	CMPL BX, CX
3809	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm10B
3810	MOVL CX, SI
3811	MOVL CX, 12(SP)
3812	LEAQ (DX)(BX*1), CX
3813	SUBL BX, SI
3814	LEAL -1(SI), DX
3815	CMPL DX, $0x3c
3816	JLT  one_byte_emit_remainder_encodeBlockAsm10B
3817	CMPL DX, $0x00000100
3818	JLT  two_bytes_emit_remainder_encodeBlockAsm10B
3819	MOVB $0xf4, (AX)
3820	MOVW DX, 1(AX)
3821	ADDQ $0x03, AX
3822	JMP  memmove_long_emit_remainder_encodeBlockAsm10B
3823
3824two_bytes_emit_remainder_encodeBlockAsm10B:
3825	MOVB $0xf0, (AX)
3826	MOVB DL, 1(AX)
3827	ADDQ $0x02, AX
3828	CMPL DX, $0x40
3829	JL   memmove_emit_remainder_encodeBlockAsm10B
3830	JMP  memmove_long_emit_remainder_encodeBlockAsm10B
3831
3832one_byte_emit_remainder_encodeBlockAsm10B:
3833	SHLB $0x02, DL
3834	MOVB DL, (AX)
3835	ADDQ $0x01, AX
3836
3837memmove_emit_remainder_encodeBlockAsm10B:
3838	LEAQ (AX)(SI*1), DX
3839	MOVL SI, BX
3840
3841	// genMemMoveShort
3842	CMPQ BX, $0x08
3843	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8
3844	CMPQ BX, $0x10
3845	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16
3846	CMPQ BX, $0x20
3847	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32
3848	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64
3849
3850emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8:
3851	MOVQ (CX), SI
3852	MOVQ SI, (AX)
3853	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B
3854
3855emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16:
3856	MOVQ (CX), SI
3857	MOVQ -8(CX)(BX*1), CX
3858	MOVQ SI, (AX)
3859	MOVQ CX, -8(AX)(BX*1)
3860	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B
3861
3862emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32:
3863	MOVOU (CX), X0
3864	MOVOU -16(CX)(BX*1), X1
3865	MOVOU X0, (AX)
3866	MOVOU X1, -16(AX)(BX*1)
3867	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm10B
3868
3869emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64:
3870	MOVOU (CX), X0
3871	MOVOU 16(CX), X1
3872	MOVOU -32(CX)(BX*1), X2
3873	MOVOU -16(CX)(BX*1), X3
3874	MOVOU X0, (AX)
3875	MOVOU X1, 16(AX)
3876	MOVOU X2, -32(AX)(BX*1)
3877	MOVOU X3, -16(AX)(BX*1)
3878
3879memmove_end_copy_emit_remainder_encodeBlockAsm10B:
3880	MOVQ DX, AX
3881	JMP  emit_literal_done_emit_remainder_encodeBlockAsm10B
3882
3883memmove_long_emit_remainder_encodeBlockAsm10B:
3884	LEAQ (AX)(SI*1), DX
3885	MOVL SI, BX
3886
3887	// genMemMoveLong
3888	MOVOU (CX), X0
3889	MOVOU 16(CX), X1
3890	MOVOU -32(CX)(BX*1), X2
3891	MOVOU -16(CX)(BX*1), X3
3892	MOVQ  BX, DI
3893	SHRQ  $0x05, DI
3894	MOVQ  AX, SI
3895	ANDL  $0x0000001f, SI
3896	MOVQ  $0x00000040, R8
3897	SUBQ  SI, R8
3898	DECQ  DI
3899	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
3900	LEAQ  -32(CX)(R8*1), SI
3901	LEAQ  -32(AX)(R8*1), R9
3902
3903emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back:
3904	MOVOU (SI), X4
3905	MOVOU 16(SI), X5
3906	MOVOA X4, (R9)
3907	MOVOA X5, 16(R9)
3908	ADDQ  $0x20, R9
3909	ADDQ  $0x20, SI
3910	ADDQ  $0x20, R8
3911	DECQ  DI
3912	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back
3913
3914emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32:
3915	MOVOU -32(CX)(R8*1), X4
3916	MOVOU -16(CX)(R8*1), X5
3917	MOVOA X4, -32(AX)(R8*1)
3918	MOVOA X5, -16(AX)(R8*1)
3919	ADDQ  $0x20, R8
3920	CMPQ  BX, R8
3921	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
3922	MOVOU X0, (AX)
3923	MOVOU X1, 16(AX)
3924	MOVOU X2, -32(AX)(BX*1)
3925	MOVOU X3, -16(AX)(BX*1)
3926	MOVQ  DX, AX
3927
3928emit_literal_done_emit_remainder_encodeBlockAsm10B:
3929	MOVQ dst_base+0(FP), CX
3930	SUBQ CX, AX
3931	MOVQ AX, ret+48(FP)
3932	RET
3933
3934// func encodeBlockAsm8B(dst []byte, src []byte) int
3935// Requires: SSE2
3936TEXT ·encodeBlockAsm8B(SB), $1048-56
3937	MOVQ dst_base+0(FP), AX
3938	MOVQ $0x00000008, CX
3939	LEAQ 24(SP), DX
3940	PXOR X0, X0
3941
3942zero_loop_encodeBlockAsm8B:
3943	MOVOU X0, (DX)
3944	MOVOU X0, 16(DX)
3945	MOVOU X0, 32(DX)
3946	MOVOU X0, 48(DX)
3947	MOVOU X0, 64(DX)
3948	MOVOU X0, 80(DX)
3949	MOVOU X0, 96(DX)
3950	MOVOU X0, 112(DX)
3951	ADDQ  $0x80, DX
3952	DECQ  CX
3953	JNZ   zero_loop_encodeBlockAsm8B
3954	MOVL  $0x00000000, 12(SP)
3955	MOVQ  src_len+32(FP), CX
3956	LEAQ  -9(CX), DX
3957	LEAQ  -8(CX), SI
3958	MOVL  SI, 8(SP)
3959	SHRQ  $0x05, CX
3960	SUBL  CX, DX
3961	LEAQ  (AX)(DX*1), DX
3962	MOVQ  DX, (SP)
3963	MOVL  $0x00000001, CX
3964	MOVL  CX, 16(SP)
3965	MOVQ  src_base+24(FP), DX
3966
3967search_loop_encodeBlockAsm8B:
3968	MOVL  CX, SI
3969	SUBL  12(SP), SI
3970	SHRL  $0x04, SI
3971	LEAL  4(CX)(SI*1), SI
3972	CMPL  SI, 8(SP)
3973	JGE   emit_remainder_encodeBlockAsm8B
3974	MOVQ  (DX)(CX*1), DI
3975	MOVL  SI, 20(SP)
3976	MOVQ  $0x9e3779b1, R9
3977	MOVQ  DI, R10
3978	MOVQ  DI, R11
3979	SHRQ  $0x08, R11
3980	SHLQ  $0x20, R10
3981	IMULQ R9, R10
3982	SHRQ  $0x38, R10
3983	SHLQ  $0x20, R11
3984	IMULQ R9, R11
3985	SHRQ  $0x38, R11
3986	MOVL  24(SP)(R10*4), SI
3987	MOVL  24(SP)(R11*4), R8
3988	MOVL  CX, 24(SP)(R10*4)
3989	LEAL  1(CX), R10
3990	MOVL  R10, 24(SP)(R11*4)
3991	MOVQ  DI, R10
3992	SHRQ  $0x10, R10
3993	SHLQ  $0x20, R10
3994	IMULQ R9, R10
3995	SHRQ  $0x38, R10
3996	MOVL  CX, R9
3997	SUBL  16(SP), R9
3998	MOVL  1(DX)(R9*1), R11
3999	MOVQ  DI, R9
4000	SHRQ  $0x08, R9
4001	CMPL  R9, R11
4002	JNE   no_repeat_found_encodeBlockAsm8B
4003	LEAL  1(CX), DI
4004	MOVL  12(SP), R8
4005	MOVL  DI, SI
4006	SUBL  16(SP), SI
4007	JZ    repeat_extend_back_end_encodeBlockAsm8B
4008
4009repeat_extend_back_loop_encodeBlockAsm8B:
4010	CMPL DI, R8
4011	JLE  repeat_extend_back_end_encodeBlockAsm8B
4012	MOVB -1(DX)(SI*1), BL
4013	MOVB -1(DX)(DI*1), R9
4014	CMPB BL, R9
4015	JNE  repeat_extend_back_end_encodeBlockAsm8B
4016	LEAL -1(DI), DI
4017	DECL SI
4018	JNZ  repeat_extend_back_loop_encodeBlockAsm8B
4019
4020repeat_extend_back_end_encodeBlockAsm8B:
4021	MOVL 12(SP), SI
4022	CMPL SI, DI
4023	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm8B
4024	MOVL DI, R9
4025	MOVL DI, 12(SP)
4026	LEAQ (DX)(SI*1), R10
4027	SUBL SI, R9
4028	LEAL -1(R9), SI
4029	CMPL SI, $0x3c
4030	JLT  one_byte_repeat_emit_encodeBlockAsm8B
4031	CMPL SI, $0x00000100
4032	JLT  two_bytes_repeat_emit_encodeBlockAsm8B
4033	MOVB $0xf4, (AX)
4034	MOVW SI, 1(AX)
4035	ADDQ $0x03, AX
4036	JMP  memmove_long_repeat_emit_encodeBlockAsm8B
4037
4038two_bytes_repeat_emit_encodeBlockAsm8B:
4039	MOVB $0xf0, (AX)
4040	MOVB SI, 1(AX)
4041	ADDQ $0x02, AX
4042	CMPL SI, $0x40
4043	JL   memmove_repeat_emit_encodeBlockAsm8B
4044	JMP  memmove_long_repeat_emit_encodeBlockAsm8B
4045
4046one_byte_repeat_emit_encodeBlockAsm8B:
4047	SHLB $0x02, SI
4048	MOVB SI, (AX)
4049	ADDQ $0x01, AX
4050
4051memmove_repeat_emit_encodeBlockAsm8B:
4052	LEAQ (AX)(R9*1), SI
4053
4054	// genMemMoveShort
4055	CMPQ R9, $0x08
4056	JLE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
4057	CMPQ R9, $0x10
4058	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
4059	CMPQ R9, $0x20
4060	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
4061	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64
4062
4063emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
4064	MOVQ (R10), R11
4065	MOVQ R11, (AX)
4066	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm8B
4067
4068emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
4069	MOVQ (R10), R11
4070	MOVQ -8(R10)(R9*1), R10
4071	MOVQ R11, (AX)
4072	MOVQ R10, -8(AX)(R9*1)
4073	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm8B
4074
4075emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
4076	MOVOU (R10), X0
4077	MOVOU -16(R10)(R9*1), X1
4078	MOVOU X0, (AX)
4079	MOVOU X1, -16(AX)(R9*1)
4080	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm8B
4081
4082emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
4083	MOVOU (R10), X0
4084	MOVOU 16(R10), X1
4085	MOVOU -32(R10)(R9*1), X2
4086	MOVOU -16(R10)(R9*1), X3
4087	MOVOU X0, (AX)
4088	MOVOU X1, 16(AX)
4089	MOVOU X2, -32(AX)(R9*1)
4090	MOVOU X3, -16(AX)(R9*1)
4091
4092memmove_end_copy_repeat_emit_encodeBlockAsm8B:
4093	MOVQ SI, AX
4094	JMP  emit_literal_done_repeat_emit_encodeBlockAsm8B
4095
4096memmove_long_repeat_emit_encodeBlockAsm8B:
4097	LEAQ (AX)(R9*1), SI
4098
4099	// genMemMoveLong
4100	MOVOU (R10), X0
4101	MOVOU 16(R10), X1
4102	MOVOU -32(R10)(R9*1), X2
4103	MOVOU -16(R10)(R9*1), X3
4104	MOVQ  R9, R12
4105	SHRQ  $0x05, R12
4106	MOVQ  AX, R11
4107	ANDL  $0x0000001f, R11
4108	MOVQ  $0x00000040, R13
4109	SUBQ  R11, R13
4110	DECQ  R12
4111	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
4112	LEAQ  -32(R10)(R13*1), R11
4113	LEAQ  -32(AX)(R13*1), R14
4114
4115emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
4116	MOVOU (R11), X4
4117	MOVOU 16(R11), X5
4118	MOVOA X4, (R14)
4119	MOVOA X5, 16(R14)
4120	ADDQ  $0x20, R14
4121	ADDQ  $0x20, R11
4122	ADDQ  $0x20, R13
4123	DECQ  R12
4124	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back
4125
4126emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
4127	MOVOU -32(R10)(R13*1), X4
4128	MOVOU -16(R10)(R13*1), X5
4129	MOVOA X4, -32(AX)(R13*1)
4130	MOVOA X5, -16(AX)(R13*1)
4131	ADDQ  $0x20, R13
4132	CMPQ  R9, R13
4133	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
4134	MOVOU X0, (AX)
4135	MOVOU X1, 16(AX)
4136	MOVOU X2, -32(AX)(R9*1)
4137	MOVOU X3, -16(AX)(R9*1)
4138	MOVQ  SI, AX
4139
4140emit_literal_done_repeat_emit_encodeBlockAsm8B:
4141	ADDL $0x05, CX
4142	MOVL CX, SI
4143	SUBL 16(SP), SI
4144	MOVQ src_len+32(FP), R9
4145	SUBL CX, R9
4146	LEAQ (DX)(CX*1), R10
4147	LEAQ (DX)(SI*1), SI
4148
4149	// matchLen
4150	XORL R12, R12
4151	CMPL R9, $0x08
4152	JL   matchlen_single_repeat_extend_encodeBlockAsm8B
4153
4154matchlen_loopback_repeat_extend_encodeBlockAsm8B:
4155	MOVQ  (R10)(R12*1), R11
4156	XORQ  (SI)(R12*1), R11
4157	TESTQ R11, R11
4158	JZ    matchlen_loop_repeat_extend_encodeBlockAsm8B
4159	BSFQ  R11, R11
4160	SARQ  $0x03, R11
4161	LEAL  (R12)(R11*1), R12
4162	JMP   repeat_extend_forward_end_encodeBlockAsm8B
4163
4164matchlen_loop_repeat_extend_encodeBlockAsm8B:
4165	LEAL -8(R9), R9
4166	LEAL 8(R12), R12
4167	CMPL R9, $0x08
4168	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm8B
4169
4170matchlen_single_repeat_extend_encodeBlockAsm8B:
4171	TESTL R9, R9
4172	JZ    repeat_extend_forward_end_encodeBlockAsm8B
4173
4174matchlen_single_loopback_repeat_extend_encodeBlockAsm8B:
4175	MOVB (R10)(R12*1), R11
4176	CMPB (SI)(R12*1), R11
4177	JNE  repeat_extend_forward_end_encodeBlockAsm8B
4178	LEAL 1(R12), R12
4179	DECL R9
4180	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm8B
4181
4182repeat_extend_forward_end_encodeBlockAsm8B:
4183	ADDL  R12, CX
4184	MOVL  CX, SI
4185	SUBL  DI, SI
4186	MOVL  16(SP), DI
4187	TESTL R8, R8
4188	JZ    repeat_as_copy_encodeBlockAsm8B
4189
4190	// emitRepeat
4191	MOVL SI, DI
4192	LEAL -4(SI), SI
4193	CMPL DI, $0x08
4194	JLE  repeat_two_match_repeat_encodeBlockAsm8B
4195	CMPL DI, $0x0c
4196	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm8B
4197
4198cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
4199	CMPL SI, $0x00000104
4200	JLT  repeat_three_match_repeat_encodeBlockAsm8B
4201	LEAL -256(SI), SI
4202	MOVW $0x0019, (AX)
4203	MOVW SI, 2(AX)
4204	ADDQ $0x04, AX
4205	JMP  repeat_end_emit_encodeBlockAsm8B
4206
4207repeat_three_match_repeat_encodeBlockAsm8B:
4208	LEAL -4(SI), SI
4209	MOVW $0x0015, (AX)
4210	MOVB SI, 2(AX)
4211	ADDQ $0x03, AX
4212	JMP  repeat_end_emit_encodeBlockAsm8B
4213
4214repeat_two_match_repeat_encodeBlockAsm8B:
4215	SHLL $0x02, SI
4216	ORL  $0x01, SI
4217	MOVW SI, (AX)
4218	ADDQ $0x02, AX
4219	JMP  repeat_end_emit_encodeBlockAsm8B
4220	XORQ R8, R8
4221	LEAL 1(R8)(SI*4), SI
4222	MOVB DI, 1(AX)
4223	SARL $0x08, DI
4224	SHLL $0x05, DI
4225	ORL  DI, SI
4226	MOVB SI, (AX)
4227	ADDQ $0x02, AX
4228	JMP  repeat_end_emit_encodeBlockAsm8B
4229
4230repeat_as_copy_encodeBlockAsm8B:
4231	// emitCopy
4232two_byte_offset_repeat_as_copy_encodeBlockAsm8B:
4233	CMPL SI, $0x40
4234	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
4235	MOVB $0xee, (AX)
4236	MOVW DI, 1(AX)
4237	LEAL -60(SI), SI
4238	ADDQ $0x03, AX
4239
4240	// emitRepeat
4241	MOVL SI, DI
4242	LEAL -4(SI), SI
4243	CMPL DI, $0x08
4244	JLE  repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
4245	CMPL DI, $0x0c
4246	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
4247
4248cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
4249	CMPL SI, $0x00000104
4250	JLT  repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
4251	LEAL -256(SI), SI
4252	MOVW $0x0019, (AX)
4253	MOVW SI, 2(AX)
4254	ADDQ $0x04, AX
4255	JMP  repeat_end_emit_encodeBlockAsm8B
4256
4257repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
4258	LEAL -4(SI), SI
4259	MOVW $0x0015, (AX)
4260	MOVB SI, 2(AX)
4261	ADDQ $0x03, AX
4262	JMP  repeat_end_emit_encodeBlockAsm8B
4263
4264repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
4265	SHLL $0x02, SI
4266	ORL  $0x01, SI
4267	MOVW SI, (AX)
4268	ADDQ $0x02, AX
4269	JMP  repeat_end_emit_encodeBlockAsm8B
4270	XORQ R8, R8
4271	LEAL 1(R8)(SI*4), SI
4272	MOVB DI, 1(AX)
4273	SARL $0x08, DI
4274	SHLL $0x05, DI
4275	ORL  DI, SI
4276	MOVB SI, (AX)
4277	ADDQ $0x02, AX
4278	JMP  repeat_end_emit_encodeBlockAsm8B
4279	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B
4280
4281two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
4282	CMPL SI, $0x0c
4283	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm8B
4284	MOVB $0x01, BL
4285	LEAL -16(BX)(SI*4), SI
4286	MOVB DI, 1(AX)
4287	SHRL $0x08, DI
4288	SHLL $0x05, DI
4289	ORL  DI, SI
4290	MOVB SI, (AX)
4291	ADDQ $0x02, AX
4292	JMP  repeat_end_emit_encodeBlockAsm8B
4293
4294emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
4295	MOVB $0x02, BL
4296	LEAL -4(BX)(SI*4), SI
4297	MOVB SI, (AX)
4298	MOVW DI, 1(AX)
4299	ADDQ $0x03, AX
4300
4301repeat_end_emit_encodeBlockAsm8B:
4302	MOVL CX, 12(SP)
4303	JMP  search_loop_encodeBlockAsm8B
4304
4305no_repeat_found_encodeBlockAsm8B:
4306	CMPL (DX)(SI*1), DI
4307	JEQ  candidate_match_encodeBlockAsm8B
4308	SHRQ $0x08, DI
4309	MOVL 24(SP)(R10*4), SI
4310	LEAL 2(CX), R9
4311	CMPL (DX)(R8*1), DI
4312	JEQ  candidate2_match_encodeBlockAsm8B
4313	MOVL R9, 24(SP)(R10*4)
4314	SHRQ $0x08, DI
4315	CMPL (DX)(SI*1), DI
4316	JEQ  candidate3_match_encodeBlockAsm8B
4317	MOVL 20(SP), CX
4318	JMP  search_loop_encodeBlockAsm8B
4319
4320candidate3_match_encodeBlockAsm8B:
4321	ADDL $0x02, CX
4322	JMP  candidate_match_encodeBlockAsm8B
4323
4324candidate2_match_encodeBlockAsm8B:
4325	MOVL R9, 24(SP)(R10*4)
4326	INCL CX
4327	MOVL R8, SI
4328
4329candidate_match_encodeBlockAsm8B:
4330	MOVL  12(SP), DI
4331	TESTL SI, SI
4332	JZ    match_extend_back_end_encodeBlockAsm8B
4333
4334match_extend_back_loop_encodeBlockAsm8B:
4335	CMPL CX, DI
4336	JLE  match_extend_back_end_encodeBlockAsm8B
4337	MOVB -1(DX)(SI*1), BL
4338	MOVB -1(DX)(CX*1), R8
4339	CMPB BL, R8
4340	JNE  match_extend_back_end_encodeBlockAsm8B
4341	LEAL -1(CX), CX
4342	DECL SI
4343	JZ   match_extend_back_end_encodeBlockAsm8B
4344	JMP  match_extend_back_loop_encodeBlockAsm8B
4345
4346match_extend_back_end_encodeBlockAsm8B:
4347	MOVL CX, DI
4348	SUBL 12(SP), DI
4349	LEAQ 3(AX)(DI*1), DI
4350	CMPQ DI, (SP)
4351	JL   match_dst_size_check_encodeBlockAsm8B
4352	MOVQ $0x00000000, ret+48(FP)
4353	RET
4354
4355match_dst_size_check_encodeBlockAsm8B:
4356	MOVL CX, DI
4357	MOVL 12(SP), R8
4358	CMPL R8, DI
4359	JEQ  emit_literal_done_match_emit_encodeBlockAsm8B
4360	MOVL DI, R9
4361	MOVL DI, 12(SP)
4362	LEAQ (DX)(R8*1), DI
4363	SUBL R8, R9
4364	LEAL -1(R9), R8
4365	CMPL R8, $0x3c
4366	JLT  one_byte_match_emit_encodeBlockAsm8B
4367	CMPL R8, $0x00000100
4368	JLT  two_bytes_match_emit_encodeBlockAsm8B
4369	MOVB $0xf4, (AX)
4370	MOVW R8, 1(AX)
4371	ADDQ $0x03, AX
4372	JMP  memmove_long_match_emit_encodeBlockAsm8B
4373
4374two_bytes_match_emit_encodeBlockAsm8B:
4375	MOVB $0xf0, (AX)
4376	MOVB R8, 1(AX)
4377	ADDQ $0x02, AX
4378	CMPL R8, $0x40
4379	JL   memmove_match_emit_encodeBlockAsm8B
4380	JMP  memmove_long_match_emit_encodeBlockAsm8B
4381
4382one_byte_match_emit_encodeBlockAsm8B:
4383	SHLB $0x02, R8
4384	MOVB R8, (AX)
4385	ADDQ $0x01, AX
4386
4387memmove_match_emit_encodeBlockAsm8B:
4388	LEAQ (AX)(R9*1), R8
4389
4390	// genMemMoveShort
4391	CMPQ R9, $0x08
4392	JLE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
4393	CMPQ R9, $0x10
4394	JBE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
4395	CMPQ R9, $0x20
4396	JBE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
4397	JMP  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64
4398
4399emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
4400	MOVQ (DI), R10
4401	MOVQ R10, (AX)
4402	JMP  memmove_end_copy_match_emit_encodeBlockAsm8B
4403
4404emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
4405	MOVQ (DI), R10
4406	MOVQ -8(DI)(R9*1), DI
4407	MOVQ R10, (AX)
4408	MOVQ DI, -8(AX)(R9*1)
4409	JMP  memmove_end_copy_match_emit_encodeBlockAsm8B
4410
4411emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
4412	MOVOU (DI), X0
4413	MOVOU -16(DI)(R9*1), X1
4414	MOVOU X0, (AX)
4415	MOVOU X1, -16(AX)(R9*1)
4416	JMP   memmove_end_copy_match_emit_encodeBlockAsm8B
4417
4418emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
4419	MOVOU (DI), X0
4420	MOVOU 16(DI), X1
4421	MOVOU -32(DI)(R9*1), X2
4422	MOVOU -16(DI)(R9*1), X3
4423	MOVOU X0, (AX)
4424	MOVOU X1, 16(AX)
4425	MOVOU X2, -32(AX)(R9*1)
4426	MOVOU X3, -16(AX)(R9*1)
4427
4428memmove_end_copy_match_emit_encodeBlockAsm8B:
4429	MOVQ R8, AX
4430	JMP  emit_literal_done_match_emit_encodeBlockAsm8B
4431
4432memmove_long_match_emit_encodeBlockAsm8B:
4433	LEAQ (AX)(R9*1), R8
4434
4435	// genMemMoveLong
4436	MOVOU (DI), X0
4437	MOVOU 16(DI), X1
4438	MOVOU -32(DI)(R9*1), X2
4439	MOVOU -16(DI)(R9*1), X3
4440	MOVQ  R9, R11
4441	SHRQ  $0x05, R11
4442	MOVQ  AX, R10
4443	ANDL  $0x0000001f, R10
4444	MOVQ  $0x00000040, R12
4445	SUBQ  R10, R12
4446	DECQ  R11
4447	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
4448	LEAQ  -32(DI)(R12*1), R10
4449	LEAQ  -32(AX)(R12*1), R13
4450
4451emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
4452	MOVOU (R10), X4
4453	MOVOU 16(R10), X5
4454	MOVOA X4, (R13)
4455	MOVOA X5, 16(R13)
4456	ADDQ  $0x20, R13
4457	ADDQ  $0x20, R10
4458	ADDQ  $0x20, R12
4459	DECQ  R11
4460	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back
4461
4462emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
4463	MOVOU -32(DI)(R12*1), X4
4464	MOVOU -16(DI)(R12*1), X5
4465	MOVOA X4, -32(AX)(R12*1)
4466	MOVOA X5, -16(AX)(R12*1)
4467	ADDQ  $0x20, R12
4468	CMPQ  R9, R12
4469	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
4470	MOVOU X0, (AX)
4471	MOVOU X1, 16(AX)
4472	MOVOU X2, -32(AX)(R9*1)
4473	MOVOU X3, -16(AX)(R9*1)
4474	MOVQ  R8, AX
4475
emit_literal_done_match_emit_encodeBlockAsm8B:
match_nolit_loop_encodeBlockAsm8B:
	// Measure a match. CX is the current source index and SI the candidate
	// index; both are used as offsets from the base pointer in DX.
	// 16(SP) = CX - SI, the match offset, reloaded after the length is known.
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	// Skip four bytes on both sides; matchLen counts what follows them.
	ADDL $0x04, CX
	ADDL $0x04, SI
	// DI = number of source bytes left after the current position.
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	// R8 and SI become byte pointers to the current position and the candidate.
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R10 accumulates the number of equal bytes. Compare 8 bytes per step
	// while at least 8 remain, then finish byte by byte.
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeBlockAsm8B

matchlen_loopback_match_nolit_encodeBlockAsm8B:
	// XOR of the two 8-byte words is zero only if all eight bytes agree.
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeBlockAsm8B
	// Lowest set bit index / 8 = number of equal low-order bytes, i.e. the
	// bytes that come first in memory on this little-endian target.
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeBlockAsm8B

matchlen_loop_match_nolit_encodeBlockAsm8B:
	// All eight bytes matched: consume them and try another word.
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBlockAsm8B

matchlen_single_match_nolit_encodeBlockAsm8B:
	// Fewer than 8 bytes left; nothing to compare if that is zero.
	TESTL DI, DI
	JZ    match_nolit_end_encodeBlockAsm8B

matchlen_single_loopback_match_nolit_encodeBlockAsm8B:
	// Byte-wise tail: stop at the first difference or when DI reaches zero.
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeBlockAsm8B
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm8B

match_nolit_end_encodeBlockAsm8B:
	// Advance the position past the measured bytes, reload the offset into SI,
	// add back the four skipped bytes so R10 is the full copy length, and
	// record the new position in 12(SP) (the start of not-yet-emitted input).
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)
4526
4527	// emitCopy
two_byte_offset_match_nolit_encodeBlockAsm8B:
	// Encode a copy. R10 = copy length, SI = offset, AX = output cursor.
	// Lengths up to 64 are handled by the short forms further down.
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm8B
	// Longer copy: write tag 0xee followed by the 16-bit offset (3 bytes).
	// 0xee equals 2 + (60-1)*4, the value emit_copy_three below would build
	// for a length of 60, so 60 is taken off the length and the rest is
	// encoded as a repeat.
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
	// SI = remaining length (the offset in SI is no longer needed here),
	// R10 = remaining length - 4.
	MOVL R10, SI
	LEAL -4(R10), R10
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
	// NOTE: this branch targets the label that immediately follows, so both
	// outcomes continue at the same instruction.
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
	CMPL R10, $0x00000104
	JLT  repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
	// 4-byte form: 16-bit word 0x0019, then (length - 4 - 256) as 16 bits.
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
	// 3-byte form: 16-bit word 0x0015, then (length - 8) as a single byte.
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
	// 2-byte form: ((length - 4) << 2) | 1 stored as a 16-bit word.
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
	// NOTE: the instructions from here up to the next label follow an
	// unconditional JMP and carry no label, so they cannot be reached.
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
	JMP two_byte_offset_match_nolit_encodeBlockAsm8B

two_byte_offset_short_match_nolit_encodeBlockAsm8B:
	// Length <= 64. Lengths of 12 and above use the 3-byte form.
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBlockAsm8B
	// 2-byte form: tag = 1 + (length-4)*4, OR'ed with (offset >> 8) << 5;
	// the low offset byte is written at 1(AX). Only BL is initialised, which
	// is sufficient because only the low byte of R10 is stored.
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B

emit_copy_three_match_nolit_encodeBlockAsm8B:
	// 3-byte form: tag = 2 + (length-1)*4, followed by the 16-bit offset.
	// Execution continues at the label that follows this block.
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
4596
match_nolit_emitcopy_end_encodeBlockAsm8B:
	// After a copy has been emitted: stop searching once the position reaches
	// the limit kept in 8(SP) (set up earlier in the function).
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBlockAsm8B
	// DI = 8 source bytes starting two bytes before the current position.
	MOVQ -2(DX)(CX*1), DI
	// Give up and return 0 if the output cursor has reached the limit in (SP).
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm8B:
	// Hash two positions and check the table for an immediate follow-up match.
	// hash(x) = ((x << 32) * 0x9e3779b1) >> 56: only the low 4 bytes of x
	// contribute and the result is an 8-bit index into the table of 4-byte
	// entries at 24(SP).
	MOVQ  $0x9e3779b1, R9
	// R8 = bytes at position CX-2; DI and SI = bytes at position CX.
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x20, R8
	IMULQ R9, R8
	SHRQ  $0x38, R8
	SHLQ  $0x20, SI
	IMULQ R9, SI
	SHRQ  $0x38, SI
	// Read the candidate stored for the current position, then record
	// CX-2 and CX under their respective hashes.
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	// If the 4 bytes at the candidate equal the 4 bytes at CX, encode another
	// match right away; otherwise step one byte forward and resume searching.
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeBlockAsm8B
	INCL  CX
	JMP   search_loop_encodeBlockAsm8B
4626
emit_remainder_encodeBlockAsm8B:
	// Emit everything from the position in 12(SP) to the end of src as one
	// literal run. First check for room: AX + remaining + 3 (the largest tag
	// written below is 3 bytes) must stay below the limit in (SP), otherwise
	// return 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm8B:
	// Nothing to do when 12(SP) already equals src_len.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm8B
	// CX = pointer to the first literal byte, SI = literal count,
	// DX = count - 1 (this overwrites the source base pointer held in DX).
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Choose the tag size from count-1: below 60 it fits in the tag byte,
	// below 256 it takes one extra byte, otherwise two.
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBlockAsm8B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBlockAsm8B
	// 3-byte tag: 0xf4 (61<<2) followed by count-1 as 16 bits.
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm8B

two_bytes_emit_remainder_encodeBlockAsm8B:
	// 2-byte tag: 0xf0 (60<<2) followed by count-1 as one byte. Counts whose
	// count-1 is below 64 use the short copy, the rest the long one.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBlockAsm8B
	JMP  memmove_long_emit_remainder_encodeBlockAsm8B

one_byte_emit_remainder_encodeBlockAsm8B:
	// 1-byte tag: (count-1) << 2. Falls through into the short copy.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX
4667
memmove_emit_remainder_encodeBlockAsm8B:
	// Copy the trailing literal run: SI bytes (1..64) from (CX) to (AX).
	// DX = end of the destination; it becomes the new AX when the copy is done.
	// BX = length as a 64-bit value (MOVL zero-extends).
	// Clobbers BX, CX, SI, X0-X3.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// These literals are the last bytes of src, so every load must stay within
	// the BX bytes that remain: a fixed 8-byte move for short runs would read
	// past the end of the source slice (and store more than was accounted
	// for). Runs shorter than 8 bytes therefore get exact-width moves.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2
	JEQ  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2:
	// First and last byte; for a length of 1 they are the same byte.
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3:
	// One 16-bit move plus the third byte.
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7:
	// Two 4-byte moves anchored at the start and at the end; they overlap
	// when the length is below 8.
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
	// Two 8-byte moves anchored at the start and at the end (identical for a
	// length of exactly 8).
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
	// Two 16-byte moves anchored at the start and at the end.
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
	// 32 bytes from the start and 32 bytes from the end.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm8B:
	// Advance the output cursor past the copied literals.
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm8B
4713
memmove_long_emit_remainder_encodeBlockAsm8B:
	// Copy SI bytes from (CX) to (AX). DX = end of the destination, BX = length.
	// Both jumps to this label are taken only when length-1 is at least 64.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Load the first and the last 32 source bytes up front; they are stored
	// last, with unaligned stores, around the aligned loop in the middle.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	// DI = number of whole 32-byte blocks in the length.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	// R8 = 64 - (AX & 31), so AX+R8-32 is a multiple of 32 and the MOVOA
	// stores below hit aligned addresses.
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// DECQ leaves CF as set by the SUBQ above (no borrow), so JA is taken
	// whenever the block count minus one is non-zero.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
	// Pointer-based variant of the 32-byte copy; only reached by falling
	// through the JA above. JNA repeats it while the decrement yields zero.
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R8 (unaligned load, aligned store),
	// advance R8 by 32 and continue while BX >= R8.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
	// Store the saved head and tail, then move AX to the end of the copy.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm8B:
	// Return the number of bytes written: output cursor minus dst base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET
4764
// func encodeBetterBlockAsm(dst []byte, src []byte) int
// Requires: SSE2
//
// Stack frame layout as used by the code below:
//   0(SP)       output limit (pointer into dst)
//   8(SP)       source position limit, src_len - 8
//   12(SP)      source position up to which input has been emitted
//   16(SP)      offset of the most recent match, compared against new matches
//   20(SP)      next source position to try if the current one does not match
//   24(SP)      table of 4-byte entries indexed by a 16-bit hash (256 KiB)
//   262168(SP)  table of 4-byte entries indexed by a 14-bit hash (64 KiB)
// Several paths return 0 when the output would reach the limit in 0(SP).
TEXT ·encodeBetterBlockAsm(SB), $327704-56
	// AX = output cursor. CX = number of 128-byte chunks to clear:
	// 0xa00 * 128 = 327680 bytes, i.e. both tables starting at 24(SP).
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000a00, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm:
	// Clear 128 bytes per iteration with eight 16-byte stores.
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBetterBlockAsm
	// Nothing has been emitted yet.
	MOVL  $0x00000000, 12(SP)
	// 8(SP) = src_len - 8; (SP) = dst + src_len - 6 - src_len/32.
	MOVQ  src_len+32(FP), CX
	LEAQ  -6(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// Start searching at source index 1 with no previous match offset;
	// from here on DX holds the source base pointer.
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX
4797
search_loop_encodeBetterBlockAsm:
	// Compute the next position to try: CX + 1 + (CX - 12(SP))/128, with the
	// step capped at 100. The further the search is from the last emitted
	// position, the larger the step.
	MOVL CX, SI
	SUBL 12(SP), SI
	SHRL $0x07, SI
	CMPL SI, $0x63
	JLE  check_maxskip_ok_encodeBetterBlockAsm
	LEAL 100(CX), SI
	JMP  check_maxskip_cont_encodeBetterBlockAsm

check_maxskip_ok_encodeBetterBlockAsm:
	LEAL 1(CX)(SI*1), SI

check_maxskip_cont_encodeBetterBlockAsm:
	// Stop searching once the next position reaches the limit in 8(SP).
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBetterBlockAsm
	// DI = 8 source bytes at the current position; remember the next position.
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	// Two hashes of the current bytes:
	//   R10 = ((DI << 8)  * 0x00cf1bbcdcbfa563) >> 48  (low 7 bytes, 16 bits)
	//   R11 = ((DI << 32) * 0x9e3779b1)         >> 50  (low 4 bytes, 14 bits)
	MOVQ  $0x00cf1bbcdcbfa563, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x32, R11
	// Fetch both candidates (SI from the table at 24(SP), R8 from the table at
	// 262168(SP)), then record the current position in both tables.
	MOVL  24(SP)(R10*4), SI
	MOVL  262168(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 262168(SP)(R11*4)
	// A candidate is accepted when its first 4 bytes equal the current 4
	// bytes; the 7-byte-hash candidate is checked first.
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeBetterBlockAsm
	// No match: continue at the position computed above.
	MOVL  20(SP), CX
	JMP   search_loop_encodeBetterBlockAsm
4835
candidateS_match_encodeBetterBlockAsm:
	// The 4-byte-hash candidate (R8) matched. Before using it, probe the
	// 16-bit-hash table for position CX+1: DI is shifted to the bytes at CX+1
	// and R9 still holds that table's multiplier.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	// If that candidate matches 4 bytes at CX+1, use it (CX stays advanced).
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm
	// Otherwise step back to the original position and use the R8 candidate.
	DECL  CX
	MOVL  R8, SI

candidate_match_encodeBetterBlockAsm:
	// Extend the match backwards. DI = position in 12(SP), the lower bound for
	// CX. Skip the loop if the candidate index is already 0.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBetterBlockAsm

match_extend_back_loop_encodeBetterBlockAsm:
	// While CX is above the lower bound and the bytes just before both
	// positions are equal, move both positions one byte back. Also stop when
	// the candidate index reaches 0.
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBetterBlockAsm
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBetterBlockAsm
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBetterBlockAsm
	JMP  match_extend_back_loop_encodeBetterBlockAsm

match_extend_back_end_encodeBetterBlockAsm:
	// Room check: AX + (CX - 12(SP)) + 5 must stay below the output limit in
	// (SP); otherwise return 0.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 5(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET
4875
match_dst_size_check_encodeBetterBlockAsm:
	// DI keeps the match start (end of the pending literals). Skip four bytes
	// on both sides; matchLen counts what follows them.
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	// R8 = source bytes left after CX; R9 / R10 = pointers to the current
	// position and the candidate.
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	// R12 accumulates the number of equal bytes. Compare 8 bytes per step
	// while at least 8 remain, then finish byte by byte.
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeBetterBlockAsm

matchlen_loopback_match_nolit_encodeBetterBlockAsm:
	// XOR of the two 8-byte words is zero only if all eight bytes agree.
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm
	// Lowest set bit index / 8 = number of equal bytes before the first
	// difference (little-endian byte order).
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeBetterBlockAsm

matchlen_loop_match_nolit_encodeBetterBlockAsm:
	// All eight bytes matched: consume them and try another word.
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm

matchlen_single_match_nolit_encodeBetterBlockAsm:
	// Fewer than 8 bytes left; nothing to compare if that is zero.
	TESTL R8, R8
	JZ    match_nolit_end_encodeBetterBlockAsm

matchlen_single_loopback_match_nolit_encodeBetterBlockAsm:
	// Byte-wise tail: stop at the first difference or when R8 reaches zero.
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeBetterBlockAsm
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm

match_nolit_end_encodeBetterBlockAsm:
	// R8 = match offset (both indices were advanced by 4, so the difference
	// is unchanged).
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	// Same offset as the previous match: take the repeat path.
	CMPL 16(SP), R8
	JEQ  match_is_repeat_encodeBetterBlockAsm
	// Otherwise accept the match if more than one byte matched beyond the
	// first four, or if the offset is at most 0xffff.
	CMPL R12, $0x01
	JG   match_length_ok_encodeBetterBlockAsm
	CMPL R8, $0x0000ffff
	JLE  match_length_ok_encodeBetterBlockAsm
	// Rejected: resume searching one byte past the saved next position.
	MOVL 20(SP), CX
	INCL CX
	JMP  search_loop_encodeBetterBlockAsm
4932
4933match_length_ok_encodeBetterBlockAsm:
4934	MOVL R8, 16(SP)
4935	MOVL 12(SP), SI
4936	CMPL SI, DI
4937	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm
4938	MOVL DI, R9
4939	MOVL DI, 12(SP)
4940	LEAQ (DX)(SI*1), R10
4941	SUBL SI, R9
4942	LEAL -1(R9), SI
4943	CMPL SI, $0x3c
4944	JLT  one_byte_match_emit_encodeBetterBlockAsm
4945	CMPL SI, $0x00000100
4946	JLT  two_bytes_match_emit_encodeBetterBlockAsm
4947	CMPL SI, $0x00010000
4948	JLT  three_bytes_match_emit_encodeBetterBlockAsm
4949	CMPL SI, $0x01000000
4950	JLT  four_bytes_match_emit_encodeBetterBlockAsm
4951	MOVB $0xfc, (AX)
4952	MOVL SI, 1(AX)
4953	ADDQ $0x05, AX
4954	JMP  memmove_long_match_emit_encodeBetterBlockAsm
4955
4956four_bytes_match_emit_encodeBetterBlockAsm:
4957	MOVL SI, R11
4958	SHRL $0x10, R11
4959	MOVB $0xf8, (AX)
4960	MOVW SI, 1(AX)
4961	MOVB R11, 3(AX)
4962	ADDQ $0x04, AX
4963	JMP  memmove_long_match_emit_encodeBetterBlockAsm
4964
4965three_bytes_match_emit_encodeBetterBlockAsm:
4966	MOVB $0xf4, (AX)
4967	MOVW SI, 1(AX)
4968	ADDQ $0x03, AX
4969	JMP  memmove_long_match_emit_encodeBetterBlockAsm
4970
4971two_bytes_match_emit_encodeBetterBlockAsm:
4972	MOVB $0xf0, (AX)
4973	MOVB SI, 1(AX)
4974	ADDQ $0x02, AX
4975	CMPL SI, $0x40
4976	JL   memmove_match_emit_encodeBetterBlockAsm
4977	JMP  memmove_long_match_emit_encodeBetterBlockAsm
4978
4979one_byte_match_emit_encodeBetterBlockAsm:
4980	SHLB $0x02, SI
4981	MOVB SI, (AX)
4982	ADDQ $0x01, AX
4983
4984memmove_match_emit_encodeBetterBlockAsm:
4985	LEAQ (AX)(R9*1), SI
4986
4987	// genMemMoveShort
4988	CMPQ R9, $0x04
4989	JLE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
4990	CMPQ R9, $0x08
4991	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
4992	CMPQ R9, $0x10
4993	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
4994	CMPQ R9, $0x20
4995	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
4996	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64
4997
4998emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
4999	MOVL (R10), R11
5000	MOVL R11, (AX)
5001	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm
5002
5003emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
5004	MOVL (R10), R11
5005	MOVL -4(R10)(R9*1), R10
5006	MOVL R11, (AX)
5007	MOVL R10, -4(AX)(R9*1)
5008	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm
5009
5010emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
5011	MOVQ (R10), R11
5012	MOVQ -8(R10)(R9*1), R10
5013	MOVQ R11, (AX)
5014	MOVQ R10, -8(AX)(R9*1)
5015	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm
5016
5017emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
5018	MOVOU (R10), X0
5019	MOVOU -16(R10)(R9*1), X1
5020	MOVOU X0, (AX)
5021	MOVOU X1, -16(AX)(R9*1)
5022	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm
5023
5024emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
5025	MOVOU (R10), X0
5026	MOVOU 16(R10), X1
5027	MOVOU -32(R10)(R9*1), X2
5028	MOVOU -16(R10)(R9*1), X3
5029	MOVOU X0, (AX)
5030	MOVOU X1, 16(AX)
5031	MOVOU X2, -32(AX)(R9*1)
5032	MOVOU X3, -16(AX)(R9*1)
5033
5034memmove_end_copy_match_emit_encodeBetterBlockAsm:
5035	MOVQ SI, AX
5036	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm
5037
5038memmove_long_match_emit_encodeBetterBlockAsm:
5039	LEAQ (AX)(R9*1), SI
5040
5041	// genMemMoveLong
5042	MOVOU (R10), X0
5043	MOVOU 16(R10), X1
5044	MOVOU -32(R10)(R9*1), X2
5045	MOVOU -16(R10)(R9*1), X3
5046	MOVQ  R9, R13
5047	SHRQ  $0x05, R13
5048	MOVQ  AX, R11
5049	ANDL  $0x0000001f, R11
5050	MOVQ  $0x00000040, R14
5051	SUBQ  R11, R14
5052	DECQ  R13
5053	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
5054	LEAQ  -32(R10)(R14*1), R11
5055	LEAQ  -32(AX)(R14*1), R15
5056
5057emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
5058	MOVOU (R11), X4
5059	MOVOU 16(R11), X5
5060	MOVOA X4, (R15)
5061	MOVOA X5, 16(R15)
5062	ADDQ  $0x20, R15
5063	ADDQ  $0x20, R11
5064	ADDQ  $0x20, R14
5065	DECQ  R13
5066	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back
5067
5068emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
5069	MOVOU -32(R10)(R14*1), X4
5070	MOVOU -16(R10)(R14*1), X5
5071	MOVOA X4, -32(AX)(R14*1)
5072	MOVOA X5, -16(AX)(R14*1)
5073	ADDQ  $0x20, R14
5074	CMPQ  R9, R14
5075	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
5076	MOVOU X0, (AX)
5077	MOVOU X1, 16(AX)
5078	MOVOU X2, -32(AX)(R9*1)
5079	MOVOU X3, -16(AX)(R9*1)
5080	MOVQ  SI, AX
5081
5082emit_literal_done_match_emit_encodeBetterBlockAsm:
5083	ADDL R12, CX
5084	ADDL $0x04, R12
5085	MOVL CX, 12(SP)
5086
5087	// emitCopy
5088	CMPL R8, $0x00010000
5089	JL   two_byte_offset_match_nolit_encodeBetterBlockAsm
5090
5091four_bytes_loop_back_match_nolit_encodeBetterBlockAsm:
5092	CMPL R12, $0x40
5093	JLE  four_bytes_remain_match_nolit_encodeBetterBlockAsm
5094	MOVB $0xff, (AX)
5095	MOVL R8, 1(AX)
5096	LEAL -64(R12), R12
5097	ADDQ $0x05, AX
5098	CMPL R12, $0x04
5099	JL   four_bytes_remain_match_nolit_encodeBetterBlockAsm
5100
5101	// emitRepeat
5102emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
5103	MOVL R12, SI
5104	LEAL -4(R12), R12
5105	CMPL SI, $0x08
5106	JLE  repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
5107	CMPL SI, $0x0c
5108	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
5109	CMPL R8, $0x00000800
5110	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
5111
5112cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
5113	CMPL R12, $0x00000104
5114	JLT  repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
5115	CMPL R12, $0x00010100
5116	JLT  repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
5117	CMPL R12, $0x0100ffff
5118	JLT  repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
5119	LEAL -16842747(R12), R12
5120	MOVW $0x001d, (AX)
5121	MOVW $0xfffb, 2(AX)
5122	MOVB $0xff, 4(AX)
5123	ADDQ $0x05, AX
5124	JMP  emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy
5125
5126repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
5127	LEAL -65536(R12), R12
5128	MOVL R12, R8
5129	MOVW $0x001d, (AX)
5130	MOVW R12, 2(AX)
5131	SARL $0x10, R8
5132	MOVB R8, 4(AX)
5133	ADDQ $0x05, AX
5134	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5135
5136repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
5137	LEAL -256(R12), R12
5138	MOVW $0x0019, (AX)
5139	MOVW R12, 2(AX)
5140	ADDQ $0x04, AX
5141	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5142
5143repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
5144	LEAL -4(R12), R12
5145	MOVW $0x0015, (AX)
5146	MOVB R12, 2(AX)
5147	ADDQ $0x03, AX
5148	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5149
5150repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
5151	SHLL $0x02, R12
5152	ORL  $0x01, R12
5153	MOVW R12, (AX)
5154	ADDQ $0x02, AX
5155	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5156
5157repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
5158	XORQ SI, SI
5159	LEAL 1(SI)(R12*4), R12
5160	MOVB R8, 1(AX)
5161	SARL $0x08, R8
5162	SHLL $0x05, R8
5163	ORL  R8, R12
5164	MOVB R12, (AX)
5165	ADDQ $0x02, AX
5166	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5167	JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm
5168
5169four_bytes_remain_match_nolit_encodeBetterBlockAsm:
5170	TESTL R12, R12
5171	JZ    match_nolit_emitcopy_end_encodeBetterBlockAsm
5172	MOVB  $0x03, BL
5173	LEAL  -4(BX)(R12*4), R12
5174	MOVB  R12, (AX)
5175	MOVL  R8, 1(AX)
5176	ADDQ  $0x05, AX
5177	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm
5178
5179two_byte_offset_match_nolit_encodeBetterBlockAsm:
5180	CMPL R12, $0x40
5181	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm
5182	MOVB $0xee, (AX)
5183	MOVW R8, 1(AX)
5184	LEAL -60(R12), R12
5185	ADDQ $0x03, AX
5186
5187	// emitRepeat
5188emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
5189	MOVL R12, SI
5190	LEAL -4(R12), R12
5191	CMPL SI, $0x08
5192	JLE  repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
5193	CMPL SI, $0x0c
5194	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
5195	CMPL R8, $0x00000800
5196	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
5197
5198cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
5199	CMPL R12, $0x00000104
5200	JLT  repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
5201	CMPL R12, $0x00010100
5202	JLT  repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
5203	CMPL R12, $0x0100ffff
5204	JLT  repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
5205	LEAL -16842747(R12), R12
5206	MOVW $0x001d, (AX)
5207	MOVW $0xfffb, 2(AX)
5208	MOVB $0xff, 4(AX)
5209	ADDQ $0x05, AX
5210	JMP  emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short
5211
5212repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
5213	LEAL -65536(R12), R12
5214	MOVL R12, R8
5215	MOVW $0x001d, (AX)
5216	MOVW R12, 2(AX)
5217	SARL $0x10, R8
5218	MOVB R8, 4(AX)
5219	ADDQ $0x05, AX
5220	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5221
5222repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
5223	LEAL -256(R12), R12
5224	MOVW $0x0019, (AX)
5225	MOVW R12, 2(AX)
5226	ADDQ $0x04, AX
5227	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5228
5229repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
5230	LEAL -4(R12), R12
5231	MOVW $0x0015, (AX)
5232	MOVB R12, 2(AX)
5233	ADDQ $0x03, AX
5234	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5235
5236repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
5237	SHLL $0x02, R12
5238	ORL  $0x01, R12
5239	MOVW R12, (AX)
5240	ADDQ $0x02, AX
5241	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5242
5243repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
5244	XORQ SI, SI
5245	LEAL 1(SI)(R12*4), R12
5246	MOVB R8, 1(AX)
5247	SARL $0x08, R8
5248	SHLL $0x05, R8
5249	ORL  R8, R12
5250	MOVB R12, (AX)
5251	ADDQ $0x02, AX
5252	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5253	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm
5254
5255two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
5256	CMPL R12, $0x0c
5257	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm
5258	CMPL R8, $0x00000800
5259	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm
5260	MOVB $0x01, BL
5261	LEAL -16(BX)(R12*4), R12
5262	MOVB R8, 1(AX)
5263	SHRL $0x08, R8
5264	SHLL $0x05, R8
5265	ORL  R8, R12
5266	MOVB R12, (AX)
5267	ADDQ $0x02, AX
5268	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5269
5270emit_copy_three_match_nolit_encodeBetterBlockAsm:
5271	MOVB $0x02, BL
5272	LEAL -4(BX)(R12*4), R12
5273	MOVB R12, (AX)
5274	MOVW R8, 1(AX)
5275	ADDQ $0x03, AX
5276	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5277
5278match_is_repeat_encodeBetterBlockAsm:
5279	MOVL 12(SP), SI
5280	CMPL SI, DI
5281	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
5282	MOVL DI, R9
5283	MOVL DI, 12(SP)
5284	LEAQ (DX)(SI*1), R10
5285	SUBL SI, R9
5286	LEAL -1(R9), SI
5287	CMPL SI, $0x3c
5288	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm
5289	CMPL SI, $0x00000100
5290	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm
5291	CMPL SI, $0x00010000
5292	JLT  three_bytes_match_emit_repeat_encodeBetterBlockAsm
5293	CMPL SI, $0x01000000
5294	JLT  four_bytes_match_emit_repeat_encodeBetterBlockAsm
5295	MOVB $0xfc, (AX)
5296	MOVL SI, 1(AX)
5297	ADDQ $0x05, AX
5298	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm
5299
5300four_bytes_match_emit_repeat_encodeBetterBlockAsm:
5301	MOVL SI, R11
5302	SHRL $0x10, R11
5303	MOVB $0xf8, (AX)
5304	MOVW SI, 1(AX)
5305	MOVB R11, 3(AX)
5306	ADDQ $0x04, AX
5307	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm
5308
5309three_bytes_match_emit_repeat_encodeBetterBlockAsm:
5310	MOVB $0xf4, (AX)
5311	MOVW SI, 1(AX)
5312	ADDQ $0x03, AX
5313	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm
5314
5315two_bytes_match_emit_repeat_encodeBetterBlockAsm:
5316	MOVB $0xf0, (AX)
5317	MOVB SI, 1(AX)
5318	ADDQ $0x02, AX
5319	CMPL SI, $0x40
5320	JL   memmove_match_emit_repeat_encodeBetterBlockAsm
5321	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm
5322
5323one_byte_match_emit_repeat_encodeBetterBlockAsm:
5324	SHLB $0x02, SI
5325	MOVB SI, (AX)
5326	ADDQ $0x01, AX
5327
5328memmove_match_emit_repeat_encodeBetterBlockAsm:
5329	LEAQ (AX)(R9*1), SI
5330
5331	// genMemMoveShort
5332	CMPQ R9, $0x04
5333	JLE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
5334	CMPQ R9, $0x08
5335	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
5336	CMPQ R9, $0x10
5337	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
5338	CMPQ R9, $0x20
5339	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
5340	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
5341
5342emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
5343	MOVL (R10), R11
5344	MOVL R11, (AX)
5345	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
5346
5347emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
5348	MOVL (R10), R11
5349	MOVL -4(R10)(R9*1), R10
5350	MOVL R11, (AX)
5351	MOVL R10, -4(AX)(R9*1)
5352	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
5353
5354emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
5355	MOVQ (R10), R11
5356	MOVQ -8(R10)(R9*1), R10
5357	MOVQ R11, (AX)
5358	MOVQ R10, -8(AX)(R9*1)
5359	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
5360
5361emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
5362	MOVOU (R10), X0
5363	MOVOU -16(R10)(R9*1), X1
5364	MOVOU X0, (AX)
5365	MOVOU X1, -16(AX)(R9*1)
5366	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
5367
5368emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
5369	MOVOU (R10), X0
5370	MOVOU 16(R10), X1
5371	MOVOU -32(R10)(R9*1), X2
5372	MOVOU -16(R10)(R9*1), X3
5373	MOVOU X0, (AX)
5374	MOVOU X1, 16(AX)
5375	MOVOU X2, -32(AX)(R9*1)
5376	MOVOU X3, -16(AX)(R9*1)
5377
5378memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
5379	MOVQ SI, AX
5380	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
5381
5382memmove_long_match_emit_repeat_encodeBetterBlockAsm:
5383	LEAQ (AX)(R9*1), SI
5384
5385	// genMemMoveLong
5386	MOVOU (R10), X0
5387	MOVOU 16(R10), X1
5388	MOVOU -32(R10)(R9*1), X2
5389	MOVOU -16(R10)(R9*1), X3
5390	MOVQ  R9, R13
5391	SHRQ  $0x05, R13
5392	MOVQ  AX, R11
5393	ANDL  $0x0000001f, R11
5394	MOVQ  $0x00000040, R14
5395	SUBQ  R11, R14
5396	DECQ  R13
5397	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
5398	LEAQ  -32(R10)(R14*1), R11
5399	LEAQ  -32(AX)(R14*1), R15
5400
5401emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
5402	MOVOU (R11), X4
5403	MOVOU 16(R11), X5
5404	MOVOA X4, (R15)
5405	MOVOA X5, 16(R15)
5406	ADDQ  $0x20, R15
5407	ADDQ  $0x20, R11
5408	ADDQ  $0x20, R14
5409	DECQ  R13
5410	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back
5411
5412emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
5413	MOVOU -32(R10)(R14*1), X4
5414	MOVOU -16(R10)(R14*1), X5
5415	MOVOA X4, -32(AX)(R14*1)
5416	MOVOA X5, -16(AX)(R14*1)
5417	ADDQ  $0x20, R14
5418	CMPQ  R9, R14
5419	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
5420	MOVOU X0, (AX)
5421	MOVOU X1, 16(AX)
5422	MOVOU X2, -32(AX)(R9*1)
5423	MOVOU X3, -16(AX)(R9*1)
5424	MOVQ  SI, AX
5425
5426emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
5427	ADDL R12, CX
5428	ADDL $0x04, R12
5429	MOVL CX, 12(SP)
5430
5431	// emitRepeat
5432emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
5433	MOVL R12, SI
5434	LEAL -4(R12), R12
5435	CMPL SI, $0x08
5436	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm
5437	CMPL SI, $0x0c
5438	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
5439	CMPL R8, $0x00000800
5440	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
5441
5442cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
5443	CMPL R12, $0x00000104
5444	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm
5445	CMPL R12, $0x00010100
5446	JLT  repeat_four_match_nolit_repeat_encodeBetterBlockAsm
5447	CMPL R12, $0x0100ffff
5448	JLT  repeat_five_match_nolit_repeat_encodeBetterBlockAsm
5449	LEAL -16842747(R12), R12
5450	MOVW $0x001d, (AX)
5451	MOVW $0xfffb, 2(AX)
5452	MOVB $0xff, 4(AX)
5453	ADDQ $0x05, AX
5454	JMP  emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm
5455
5456repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
5457	LEAL -65536(R12), R12
5458	MOVL R12, R8
5459	MOVW $0x001d, (AX)
5460	MOVW R12, 2(AX)
5461	SARL $0x10, R8
5462	MOVB R8, 4(AX)
5463	ADDQ $0x05, AX
5464	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5465
5466repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
5467	LEAL -256(R12), R12
5468	MOVW $0x0019, (AX)
5469	MOVW R12, 2(AX)
5470	ADDQ $0x04, AX
5471	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5472
5473repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
5474	LEAL -4(R12), R12
5475	MOVW $0x0015, (AX)
5476	MOVB R12, 2(AX)
5477	ADDQ $0x03, AX
5478	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5479
5480repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
5481	SHLL $0x02, R12
5482	ORL  $0x01, R12
5483	MOVW R12, (AX)
5484	ADDQ $0x02, AX
5485	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5486
5487repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
5488	XORQ SI, SI
5489	LEAL 1(SI)(R12*4), R12
5490	MOVB R8, 1(AX)
5491	SARL $0x08, R8
5492	SHLL $0x05, R8
5493	ORL  R8, R12
5494	MOVB R12, (AX)
5495	ADDQ $0x02, AX
5496
match_nolit_emitcopy_end_encodeBetterBlockAsm:
	// After a copy/repeat has been emitted: stop searching once the position
	// reaches the limit in 8(SP), and return 0 if the output cursor has
	// reached the limit in (SP).
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBetterBlockAsm
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm:
	// Re-seed both hash tables with positions near the start and the end of
	// the match. SI / R8 are the multipliers for the 16-bit hash (table at
	// 24(SP)) and the 14-bit hash (table at 262168(SP)).
	MOVQ  $0x00cf1bbcdcbfa563, SI
	MOVQ  $0x9e3779b1, R8
	// Let i = DI + 1, with DI as left by the preceding match code.
	// R10 = bytes at i, R11 and R13 = bytes at i+1, R12 = bytes at i+2,
	// R14 = i+1, R15 = i+2.
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	// R9 is reloaded with the 8 bytes at CX-2 for the second group below.
	MOVQ  -2(DX)(CX*1), R9
	// 16-bit hashes of the bytes at i (R10) and at i+1 (R13).
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x08, R13
	IMULQ SI, R13
	SHRQ  $0x30, R13
	// 14-bit hashes of the bytes at i+1 (R11) and at i+2 (R12).
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x32, R12
	// Record i and i+1 in the 16-bit table, i+1 and i+2 in the 14-bit table.
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 262168(SP)(R11*4)
	MOVL  R15, 262168(SP)(R12*4)
	// Second group: R10 = bytes at CX-2, R11 and R13 = bytes at CX-1;
	// R9 = CX-2 and DI = CX-1 are the positions to store.
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x08, R13
	IMULQ SI, R13
	SHRQ  $0x30, R13
	// Record CX-2 and CX-1 in the 16-bit table and CX-1 in the 14-bit table,
	// then go back to searching at CX.
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 262168(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeBetterBlockAsm
5554
5555emit_remainder_encodeBetterBlockAsm:
5556	MOVQ src_len+32(FP), CX
5557	SUBL 12(SP), CX
5558	LEAQ 5(AX)(CX*1), CX
5559	CMPQ CX, (SP)
5560	JL   emit_remainder_ok_encodeBetterBlockAsm
5561	MOVQ $0x00000000, ret+48(FP)
5562	RET
5563
5564emit_remainder_ok_encodeBetterBlockAsm:
5565	MOVQ src_len+32(FP), CX
5566	MOVL 12(SP), BX
5567	CMPL BX, CX
5568	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm
5569	MOVL CX, SI
5570	MOVL CX, 12(SP)
5571	LEAQ (DX)(BX*1), CX
5572	SUBL BX, SI
5573	LEAL -1(SI), DX
5574	CMPL DX, $0x3c
5575	JLT  one_byte_emit_remainder_encodeBetterBlockAsm
5576	CMPL DX, $0x00000100
5577	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm
5578	CMPL DX, $0x00010000
5579	JLT  three_bytes_emit_remainder_encodeBetterBlockAsm
5580	CMPL DX, $0x01000000
5581	JLT  four_bytes_emit_remainder_encodeBetterBlockAsm
5582	MOVB $0xfc, (AX)
5583	MOVL DX, 1(AX)
5584	ADDQ $0x05, AX
5585	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm
5586
5587four_bytes_emit_remainder_encodeBetterBlockAsm:
5588	MOVL DX, BX
5589	SHRL $0x10, BX
5590	MOVB $0xf8, (AX)
5591	MOVW DX, 1(AX)
5592	MOVB BL, 3(AX)
5593	ADDQ $0x04, AX
5594	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm
5595
5596three_bytes_emit_remainder_encodeBetterBlockAsm:
5597	MOVB $0xf4, (AX)
5598	MOVW DX, 1(AX)
5599	ADDQ $0x03, AX
5600	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm
5601
5602two_bytes_emit_remainder_encodeBetterBlockAsm:
5603	MOVB $0xf0, (AX)
5604	MOVB DL, 1(AX)
5605	ADDQ $0x02, AX
5606	CMPL DX, $0x40
5607	JL   memmove_emit_remainder_encodeBetterBlockAsm
5608	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm
5609
5610one_byte_emit_remainder_encodeBetterBlockAsm:
5611	SHLB $0x02, DL
5612	MOVB DL, (AX)
5613	ADDQ $0x01, AX
5614
5615memmove_emit_remainder_encodeBetterBlockAsm:
5616	LEAQ (AX)(SI*1), DX
5617	MOVL SI, BX
5618
5619	// genMemMoveShort
5620	CMPQ BX, $0x04
5621	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4
5622	CMPQ BX, $0x08
5623	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
5624	CMPQ BX, $0x10
5625	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
5626	CMPQ BX, $0x20
5627	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
5628	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64
5629
5630emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4:
5631	MOVL (CX), SI
5632	MOVL SI, (AX)
5633	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm
5634
5635emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
5636	MOVL (CX), SI
5637	MOVL -4(CX)(BX*1), CX
5638	MOVL SI, (AX)
5639	MOVL CX, -4(AX)(BX*1)
5640	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm
5641
5642emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
5643	MOVQ (CX), SI
5644	MOVQ -8(CX)(BX*1), CX
5645	MOVQ SI, (AX)
5646	MOVQ CX, -8(AX)(BX*1)
5647	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm
5648
5649emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
5650	MOVOU (CX), X0
5651	MOVOU -16(CX)(BX*1), X1
5652	MOVOU X0, (AX)
5653	MOVOU X1, -16(AX)(BX*1)
5654	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm
5655
5656emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
5657	MOVOU (CX), X0
5658	MOVOU 16(CX), X1
5659	MOVOU -32(CX)(BX*1), X2
5660	MOVOU -16(CX)(BX*1), X3
5661	MOVOU X0, (AX)
5662	MOVOU X1, 16(AX)
5663	MOVOU X2, -32(AX)(BX*1)
5664	MOVOU X3, -16(AX)(BX*1)
5665
5666memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
5667	MOVQ DX, AX
5668	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm
5669
5670memmove_long_emit_remainder_encodeBetterBlockAsm:
5671	LEAQ (AX)(SI*1), DX
5672	MOVL SI, BX
5673
5674	// genMemMoveLong
5675	MOVOU (CX), X0
5676	MOVOU 16(CX), X1
5677	MOVOU -32(CX)(BX*1), X2
5678	MOVOU -16(CX)(BX*1), X3
5679	MOVQ  BX, DI
5680	SHRQ  $0x05, DI
5681	MOVQ  AX, SI
5682	ANDL  $0x0000001f, SI
5683	MOVQ  $0x00000040, R8
5684	SUBQ  SI, R8
5685	DECQ  DI
5686	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
5687	LEAQ  -32(CX)(R8*1), SI
5688	LEAQ  -32(AX)(R8*1), R9
5689
5690emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
5691	MOVOU (SI), X4
5692	MOVOU 16(SI), X5
5693	MOVOA X4, (R9)
5694	MOVOA X5, 16(R9)
5695	ADDQ  $0x20, R9
5696	ADDQ  $0x20, SI
5697	ADDQ  $0x20, R8
5698	DECQ  DI
5699	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back
5700
5701emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
5702	MOVOU -32(CX)(R8*1), X4
5703	MOVOU -16(CX)(R8*1), X5
5704	MOVOA X4, -32(AX)(R8*1)
5705	MOVOA X5, -16(AX)(R8*1)
5706	ADDQ  $0x20, R8
5707	CMPQ  BX, R8
5708	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
5709	MOVOU X0, (AX)
5710	MOVOU X1, 16(AX)
5711	MOVOU X2, -32(AX)(BX*1)
5712	MOVOU X3, -16(AX)(BX*1)
5713	MOVQ  DX, AX
5714
5715emit_literal_done_emit_remainder_encodeBetterBlockAsm:
5716	MOVQ dst_base+0(FP), CX
5717	SUBQ CX, AX
5718	MOVQ AX, ret+48(FP)
5719	RET
5720
// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst and returns the number of bytes written to dst, or 0
// as soon as the output would reach the destination limit computed in the
// prologue.
//
// NOTE(review): the 8-byte loads and the head/tail vector copies touch a few
// bytes beyond the exact lengths involved; presumably the Go caller guarantees
// a minimum src length and enough dst capacity - confirm against the wrapper.
//
// Frame layout (327704 bytes):
//   0(SP)       8 bytes       dst limit: dst_base + (src_len - 6 - src_len>>5)
//   8(SP)       4 bytes       search limit: src_len - 8
//   12(SP)      4 bytes       src offset of the first literal byte not yet emitted
//   16(SP)      4 bytes       offset of the last emitted non-repeat match (starts at 0)
//   20(SP)      4 bytes       src offset to continue from when no candidate matches
//   24(SP)      262144 bytes  long hash table: 65536 x 32-bit src offsets
//   262168(SP)  65536 bytes   short hash table: 16384 x 32-bit src offsets
//
// Register roles in the main loop:
//   AX   dst write cursor
//   CX   current src offset (s)
//   DX   src base pointer (reused as scratch once emit_remainder_ok starts)
//   SI   candidate src offset while matching
//   DI   8 bytes loaded at s; later the src offset where the match starts
//   R8   match offset (s - candidate) once matchLen has finished
//   R12  matched bytes beyond the first four; later the full copy length
TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000a00, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

// Clear both hash tables: 0xa00 iterations of 128 bytes = 327680 bytes from 24(SP).
zero_loop_encodeBetterBlockAsm4MB:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBetterBlockAsm4MB

	// Initialise the scalar stack slots described in the header comment.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -6(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)

	// Searching starts at src offset 1 with no previous match offset.
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

// Main search loop. The step to the next position is 1 + (s - literal start)>>7,
// capped at 100.
search_loop_encodeBetterBlockAsm4MB:
	MOVL CX, SI
	SUBL 12(SP), SI
	SHRL $0x07, SI
	CMPL SI, $0x63
	JLE  check_maxskip_ok_encodeBetterBlockAsm4MB
	LEAL 100(CX), SI
	JMP  check_maxskip_cont_encodeBetterBlockAsm4MB

check_maxskip_ok_encodeBetterBlockAsm4MB:
	LEAL 1(CX)(SI*1), SI

check_maxskip_cont_encodeBetterBlockAsm4MB:
	// Stop searching once the next position reaches the search limit.
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBetterBlockAsm4MB
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)

	// R10 = 16-bit hash of the low 7 bytes at s (long table index),
	// R11 = 14-bit hash of the low 4 bytes at s (short table index).
	MOVQ  $0x00cf1bbcdcbfa563, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x32, R11

	// Fetch both candidates (SI = long, R8 = short), then record s in both tables.
	MOVL  24(SP)(R10*4), SI
	MOVL  262168(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 262168(SP)(R11*4)

	// A candidate is accepted when its first 4 bytes equal the 4 bytes at s.
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm4MB
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeBetterBlockAsm4MB
	MOVL  20(SP), CX
	JMP   search_loop_encodeBetterBlockAsm4MB

// The short candidate matched. Before using it, probe the long table for s+1:
// if that candidate matches at s+1 it is used instead, otherwise fall back to
// the short candidate at s.
candidateS_match_encodeBetterBlockAsm4MB:
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm4MB
	DECL  CX
	MOVL  R8, SI

// SI = candidate offset, CX = s. DI = start of the pending literals, which
// bounds the backwards extension; a candidate at offset 0 cannot be extended.
candidate_match_encodeBetterBlockAsm4MB:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBetterBlockAsm4MB

// Step both positions back while the preceding bytes are equal.
match_extend_back_loop_encodeBetterBlockAsm4MB:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBetterBlockAsm4MB
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBetterBlockAsm4MB
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBetterBlockAsm4MB
	JMP  match_extend_back_loop_encodeBetterBlockAsm4MB

// Return 0 unless dst cursor + 4 + pending literal bytes stays below the dst limit.
match_extend_back_end_encodeBetterBlockAsm4MB:
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 4(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBetterBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

// DI = match start. Skip the 4 bytes already known to be equal, then count
// further equal bytes: R8 = bytes left in src, R9/R10 = pointers at s and the
// candidate, R12 = running count.
match_dst_size_check_encodeBetterBlockAsm4MB:
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeBetterBlockAsm4MB

// Compare 8 bytes at a time; on a difference the lowest set bit of the XOR,
// divided by 8, is the number of equal leading bytes.
matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB:
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm4MB
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeBetterBlockAsm4MB

matchlen_loop_match_nolit_encodeBetterBlockAsm4MB:
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB

// Fewer than 8 bytes remain: finish one byte at a time.
matchlen_single_match_nolit_encodeBetterBlockAsm4MB:
	TESTL R8, R8
	JZ    match_nolit_end_encodeBetterBlockAsm4MB

matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeBetterBlockAsm4MB
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB

// R8 = match offset (s - candidate).
match_nolit_end_encodeBetterBlockAsm4MB:
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	CMPL 16(SP), R8
	JEQ  match_is_repeat_encodeBetterBlockAsm4MB

	// Drop matches with at most one extra byte whose offset exceeds 0xffff and
	// resume searching one byte past the saved next position.
	CMPL R12, $0x01
	JG   match_length_ok_encodeBetterBlockAsm4MB
	CMPL R8, $0x0000ffff
	JLE  match_length_ok_encodeBetterBlockAsm4MB
	MOVL 20(SP), CX
	INCL CX
	JMP  search_loop_encodeBetterBlockAsm4MB

// Non-repeat match: remember its offset, then emit the literals between the
// pending-literal start (SI) and the match start (DI).
// R9 = literal length, SI = length - 1, R10 = literal source pointer.
match_length_ok_encodeBetterBlockAsm4MB:
	MOVL R8, 16(SP)
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI

	// Choose the literal header size from length - 1.
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeBetterBlockAsm4MB
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeBetterBlockAsm4MB
	CMPL SI, $0x00010000
	JLT  three_bytes_match_emit_encodeBetterBlockAsm4MB

	// 4-byte header: 0xf8 followed by the low 24 bits of length - 1.
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB

// 3-byte header: 0xf4 followed by the low 16 bits of length - 1.
three_bytes_match_emit_encodeBetterBlockAsm4MB:
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB

// 2-byte header: 0xf0 followed by length - 1. Lengths of at most 64 bytes use
// the short copy.
two_bytes_match_emit_encodeBetterBlockAsm4MB:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeBetterBlockAsm4MB
	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB

// 1-byte header: (length - 1) << 2.
one_byte_match_emit_encodeBetterBlockAsm4MB:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

// Short copy of R9 bytes from R10 to AX; SI = dst cursor after the copy. Each
// size class moves a possibly overlapping head and tail chunk.
memmove_match_emit_encodeBetterBlockAsm4MB:
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x04
	JLE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64

// Lengths up to 4 always move a full 4 bytes.
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (AX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm4MB

// Long copy of R9 bytes from R10 to AX; SI = dst cursor after the copy.
// The first and last 32 bytes are held in X0-X3 and stored last; the middle is
// copied in 32-byte steps with aligned stores, starting at the running offset
// R14 = 64 - (AX & 31).
memmove_long_match_emit_encodeBetterBlockAsm4MB:
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14

	// DECQ leaves CF as set by the SUBQ above, so JA is taken when the
	// decremented 32-byte chunk count is non-zero.
	// NOTE(review): every jump into this routine happens with R9 >= 65, so the
	// branch looks always taken and the big_loop_back block unreachable here.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back

// Copy 32 bytes per iteration for as long as the running offset R14 does not
// exceed the length, then store the saved head and tail.
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

// Literals are written. Advance s past the match, make R12 the full match
// length (extra bytes + 4) and move the pending-literal start to s.
emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
	CMPL R8, $0x00010000
	JL   two_byte_offset_match_nolit_encodeBetterBlockAsm4MB

// Offsets of 0x10000 and above are written with a 32-bit offset field.
// Nothing branches back to this label: every emitRepeat path below leaves
// through match_nolit_emitcopy_end.
four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB:
	CMPL R12, $0x40
	JLE  four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB

	// More than 64 bytes: write tag 0xff plus the 32-bit offset, take 64 bytes
	// off the length and encode the rest as a repeat (or as a final copy when
	// fewer than 4 bytes are left).
	MOVB $0xff, (AX)
	MOVL R8, 1(AX)
	LEAL -64(R12), R12
	ADDQ $0x05, AX
	CMPL R12, $0x04
	JL   four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB

	// emitRepeat
	// SI = remaining length, R12 = SI - 4. The encoding is picked by size.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	CMPL R12, $0x00010100
	JLT  repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy

	// 5 bytes: 16-bit tag 0x001d, then the low 24 bits of R12 - 65536.
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (AX)
	MOVW R12, 2(AX)
	SARL $0x10, R8
	MOVB R8, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

// 4 bytes: 16-bit tag 0x0019, then the low 16 bits of R12 - 256.
repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

// 3 bytes: 16-bit tag 0x0015, then the low byte of R12 - 4.
repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

// 2 bytes: the 16-bit value (R12 << 2) | 1.
repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

// 2 bytes: tag = 1 + R12*4 with offset bits 8 and up shifted into bit 5 and
// above, followed by the low offset byte.
repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

// Final copy with a 32-bit offset: tag byte ((length - 1) << 2) | 3. Only the
// low byte of the LEAL result is stored, so the upper bits of BX do not matter.
four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
	TESTL R12, R12
	JZ    match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
	MOVB  $0x03, BL
	LEAL  -4(BX)(R12*4), R12
	MOVB  R12, (AX)
	MOVL  R8, 1(AX)
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

// Offsets below 0x10000 are written with a 16-bit (or shorter) offset field.
two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB

	// More than 64 bytes: write tag 0xee plus the 16-bit offset, take 60 bytes
	// off the length and encode the rest as a repeat.
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX

	// emitRepeat
	// Same size selection as in the 4-byte-offset path above.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	CMPL R12, $0x00010100
	JLT  repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (AX)
	MOVW R12, 2(AX)
	SARL $0x10, R8
	MOVB R8, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

// At most 64 bytes with an offset below 0x10000. Lengths under 12 with an
// offset under 0x800 use the 2-byte form, everything else the 3-byte form.
two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
	CMPL R8, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm4MB

	// 2-byte form: tag byte ((length - 4) << 2) | 1 with offset bits 8 and up
	// placed from bit 5, followed by the low offset byte.
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

// 3-byte form: tag byte ((length - 1) << 2) | 2, then the 16-bit offset.
emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

// The match offset equals the one stored at 16(SP). Emit the pending literals
// exactly as in match_length_ok, then encode the match as a repeat.
match_is_repeat_encodeBetterBlockAsm4MB:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
	CMPL SI, $0x00010000
	JLT  three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
	MOVL SI, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB

three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB

two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_repeat_encodeBetterBlockAsm4MB
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB

one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

// Short copy of R9 literal bytes from R10 to AX; SI = dst cursor afterwards.
memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x04
	JLE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (AX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB

// Long copy of R9 literal bytes from R10 to AX: head and tail are kept in
// X0-X3, the middle is copied in 32-byte steps with aligned stores.
memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

// Advance s past the match, make R12 the full match length and move the
// pending-literal start to s, then encode the whole match as a repeat.
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitRepeat
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
	CMPL R12, $0x00010100
	JLT  repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
	LEAL -65536(R12), R12
	MOVL R12, R8
	MOVW $0x001d, (AX)
	MOVW R12, 2(AX)
	SARL $0x10, R8
	MOVB R8, 4(AX)
	ADDQ $0x05, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX

// The match has been written. Finish with the remainder once s reaches the
// search limit; return 0 if the dst cursor has reached the dst limit.
match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBetterBlockAsm4MB
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBetterBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

// Add table entries for positions around the match before searching again.
// With m = match start (DI on entry) and s = CX:
//   long table:  m+1, m+2, s-2, s-1
//   short table: m+2, m+3, s-1
match_nolit_dst_ok_encodeBetterBlockAsm4MB:
	MOVQ  $0x00cf1bbcdcbfa563, SI
	MOVQ  $0x9e3779b1, R8
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x08, R13
	IMULQ SI, R13
	SHRQ  $0x30, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x32, R12
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 262168(SP)(R11*4)
	MOVL  R15, 262168(SP)(R12*4)

	// R9 holds the 8 bytes at s-2 from here on.
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x08, R13
	IMULQ SI, R13
	SHRQ  $0x30, R13
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 262168(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeBetterBlockAsm4MB

// Searching is finished. Return 0 unless dst cursor + 4 + remaining literal
// bytes stays below the dst limit.
emit_remainder_encodeBetterBlockAsm4MB:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 4(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBetterBlockAsm4MB
	MOVQ $0x00000000, ret+48(FP)
	RET

// Emit everything from the pending-literal start to the end of src as one
// literal run. CX = literal source pointer, SI = length, DX = length - 1
// (the src base in DX is no longer needed from here on).
emit_remainder_ok_encodeBetterBlockAsm4MB:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBetterBlockAsm4MB
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm4MB
	CMPL DX, $0x00010000
	JLT  three_bytes_emit_remainder_encodeBetterBlockAsm4MB

	// 4-byte header: 0xf8 followed by the low 24 bits of length - 1.
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB

// 3-byte header: 0xf4 followed by the low 16 bits of length - 1.
three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB

// 2-byte header: 0xf0 followed by length - 1.
two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBetterBlockAsm4MB
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB

// 1-byte header: (length - 1) << 2.
one_byte_emit_remainder_encodeBetterBlockAsm4MB:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

// Short copy of BX bytes from CX to AX; DX = dst cursor after the copy.
memmove_emit_remainder_encodeBetterBlockAsm4MB:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x04
	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4:
	MOVL (CX), SI
	MOVL SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB

// Long copy of BX bytes from CX to AX: head and tail are kept in X0-X3, the
// middle is copied in 32-byte steps with aligned stores.
memmove_long_emit_remainder_encodeBetterBlockAsm4MB:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

// Return the number of bytes written: dst cursor - dst_base.
emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET
6619
// func encodeBetterBlockAsm12B(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst and returns the number of bytes written to dst.
// Returns 0 as soon as the output would reach the dst limit computed at entry.
//
// Stack frame (81944 bytes):
//   0(SP)      8 bytes      dst limit pointer: dst_base + len(src) - 6 - len(src)>>5
//   8(SP)      4 bytes      search limit: len(src) - 8
//   12(SP)     4 bytes      src offset of the first byte not yet emitted
//   16(SP)     4 bytes      offset (distance) of the previous match, initially 0
//   20(SP)     4 bytes      next src position to probe if the current one misses
//   24(SP)     65536 bytes  table of 16384 32-bit src positions, 14-bit hash of 6 bytes
//   65560(SP)  16384 bytes  table of 4096 32-bit src positions, 12-bit hash of 4 bytes
//
// Registers kept through the search/emit loop:
//   AX = dst write pointer, DX = src base pointer, CX = current src position.
//
// NOTE(review): literal lengths and match offsets are written with at most
// 16 bits, so the caller presumably bounds len(src) - confirm in the Go wrapper.
TEXT ·encodeBetterBlockAsm12B(SB), $81944-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000280, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear both tables: 0x280 (640) iterations of 128 bytes = 81920 bytes
	// starting at 24(SP).
zero_loop_encodeBetterBlockAsm12B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBetterBlockAsm12B
	// 12(SP) = 0: nothing emitted yet.
	MOVL  $0x00000000, 12(SP)
	// 8(SP) = len(src) - 8; (SP) = dst_base + len(src) - 6 - len(src)>>5.
	MOVQ  src_len+32(FP), CX
	LEAQ  -6(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// Start searching at src position 1 with no previous match offset.
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBetterBlockAsm12B:
	// SI = CX + 1 + (CX - 12(SP))>>6: the step to the next probe grows by one
	// for every 64 bytes scanned since the last emit.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  1(CX)(SI*1), SI
	// Give up searching once the next probe position reaches len(src) - 8.
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBetterBlockAsm12B
	// DI = 8 bytes at src[CX]; remember the next probe position in 20(SP).
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	// R10 = ((DI << 16) * 0xcf1bbcdcbf9b) >> 50: 14-bit hash of the low 6 bytes.
	// R11 = ((DI << 32) * 0x9e3779b1) >> 52: 12-bit hash of the low 4 bytes.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x34, R11
	// SI / R8 = previous positions stored for these hashes; both slots are
	// then overwritten with the current position.
	MOVL  24(SP)(R10*4), SI
	MOVL  65560(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 65560(SP)(R11*4)
	// Accept a candidate when its first 4 bytes equal the low 4 bytes of DI.
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm12B
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeBetterBlockAsm12B
	// No match: continue at the saved next probe position.
	MOVL  20(SP), CX
	JMP   search_loop_encodeBetterBlockAsm12B

candidateS_match_encodeBetterBlockAsm12B:
	// Only the 4-byte-hash candidate (R8) matched. Before using it, look up
	// the 6-byte hash for position CX+1 (DI shifted right by one byte), store
	// CX+1 in that slot, and prefer the fetched candidate if its 4 bytes match.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm12B
	// Otherwise restore CX and use the 4-byte-hash candidate.
	DECL  CX
	MOVL  R8, SI

candidate_match_encodeBetterBlockAsm12B:
	// CX = match position, SI = candidate position. DI = 12(SP) is the lower
	// bound for CX while extending backwards; skip extension if SI is 0.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBetterBlockAsm12B

match_extend_back_loop_encodeBetterBlockAsm12B:
	// While CX > DI and src[SI-1] == src[CX-1]: step both back by one byte,
	// stopping when SI reaches 0.
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBetterBlockAsm12B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBetterBlockAsm12B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBetterBlockAsm12B
	JMP  match_extend_back_loop_encodeBetterBlockAsm12B

match_extend_back_end_encodeBetterBlockAsm12B:
	// Return 0 unless AX + 3 + (CX - 12(SP)) is below the dst limit, i.e. the
	// pending literals plus 3 extra bytes must fit.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBetterBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm12B:
	// DI = match start (end of pending literals). Advance CX and SI past the
	// 4 bytes the candidate check compared, then set up forward extension:
	// R8 = len(src) - CX, R9 = &src[CX], R10 = &src[SI].
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	// R12 = number of equal bytes at R9 / R10, limited to R8 bytes.
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeBetterBlockAsm12B

matchlen_loopback_match_nolit_encodeBetterBlockAsm12B:
	// Compare 8 bytes at a time. On a difference, the index of the lowest
	// set bit of the XOR divided by 8 is the count of equal leading bytes.
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm12B
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeBetterBlockAsm12B

matchlen_loop_match_nolit_encodeBetterBlockAsm12B:
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm12B

matchlen_single_match_nolit_encodeBetterBlockAsm12B:
	// Fewer than 8 bytes left: compare one byte at a time.
	TESTL R8, R8
	JZ    match_nolit_end_encodeBetterBlockAsm12B

matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeBetterBlockAsm12B
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B

match_nolit_end_encodeBetterBlockAsm12B:
	// R8 = CX - SI = match offset.
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	// Same offset as the previous match: take the repeat path. Otherwise
	// record the new offset in 16(SP).
	CMPL 16(SP), R8
	JEQ  match_is_repeat_encodeBetterBlockAsm12B
	MOVL R8, 16(SP)
	// Emit pending literals src[12(SP):DI]; nothing to do when empty.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm12B
	// R9 = literal length, R10 = &src[literal start], SI = length - 1.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Header size by (length - 1): < 60 one byte, < 256 two bytes, else three
	// bytes (0xf4 followed by the 16-bit value).
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeBetterBlockAsm12B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeBetterBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm12B

two_bytes_match_emit_encodeBetterBlockAsm12B:
	// Two-byte header: 0xf0 followed by (length - 1). Use the short copy
	// when (length - 1) < 64, the long copy otherwise.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeBetterBlockAsm12B
	JMP  memmove_long_match_emit_encodeBetterBlockAsm12B

one_byte_match_emit_encodeBetterBlockAsm12B:
	// One-byte header: (length - 1) << 2.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBetterBlockAsm12B:
	// SI = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Copy R9 (<= 64) bytes from R10 to AX. Each size class loads the head
	// and the tail (which may overlap) before storing. The <= 4 case always
	// moves exactly 4 bytes, so it can read and write past R9 bytes.
	CMPQ R9, $0x04
	JLE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (AX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
	// Advance the dst pointer past the copied literals.
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm12B

memmove_long_match_emit_encodeBetterBlockAsm12B:
	// SI = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// Copy R9 bytes from R10 to AX. The first and last 32 bytes are loaded
	// into X0-X3 up front and stored last with unaligned stores. The middle
	// is copied in 32-byte steps with aligned stores (MOVOA), starting at
	// dst offset R14 - 32 where R14 = 64 - (AX & 31), so AX + R14 - 32 is
	// 32-byte aligned.
	// NOTE(review): DECQ does not modify CF, so JA / JNA below see CF from
	// the preceding SUBQ / ADDQ and ZF from DECQ - confirm intent in the
	// generator before changing anything here.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
	// Copy 32 bytes ending at offset R14, then advance; repeat while R9 >= R14.
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	// Store the saved head and tail, then advance the dst pointer.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_encodeBetterBlockAsm12B:
	// CX = end of the match; R12 = total match length (extension + the 4
	// bytes skipped earlier); everything before CX now counts as emitted.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
	// R12 = length, R8 = offset. For length > 64, write the 3 bytes
	// 0xee, offset (16 bits), subtract 60 from the length, and encode the
	// rest as a repeat.
two_byte_offset_match_nolit_encodeBetterBlockAsm12B:
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX

	// emitRepeat
	// SI = length, R12 = length - 4.
	//   length <= 8                     -> 2-byte form
	//   length < 12 and offset < 2048   -> 2-byte form carrying the offset
	//   length - 4 < 260                -> 3-byte form
	//   otherwise                       -> 4-byte form
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
	// 4 bytes: 0x19, 0x00, then (length - 4 - 256) as 16 bits.
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	// 3 bytes: 0x15, 0x00, then (length - 8) as one byte.
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	// 2 bytes: ((length - 4) << 2) | 1 written as a 16-bit word.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	// 2 bytes: byte 0 = 1 | (length - 4) << 2 | (offset >> 8) << 5,
	// byte 1 = low 8 bits of the offset.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
	// NOTE(review): the JMP below directly follows an unconditional JMP and
	// carries no label, so it cannot be reached.
	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B

two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
	// length <= 64. Use the 2-byte form when length < 12 and offset < 2048,
	// the 3-byte form otherwise.
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm12B
	CMPL R8, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm12B
	// byte 0 = 1 + (length - 4) * 4, OR-ed with (offset >> 8) << 5;
	// byte 1 = low 8 bits of the offset. Only BL is initialised, but only
	// the low byte of R12 is stored, so the upper bits of BX do not reach dst.
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
	// 3 bytes: byte 0 = 2 + (length - 1) * 4 (low byte only), then the
	// offset as 16 bits.
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

match_is_repeat_encodeBetterBlockAsm12B:
	// The match offset equals 16(SP). Emit pending literals src[12(SP):DI]
	// exactly as on the non-repeat path, then encode the match as a repeat.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
	// R9 = literal length, R10 = &src[literal start], SI = length - 1.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Header size by (length - 1): < 60 one byte, < 256 two bytes, else three.
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm12B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm12B

two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
	// Two-byte header: 0xf0 followed by (length - 1).
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_repeat_encodeBetterBlockAsm12B
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm12B

one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
	// One-byte header: (length - 1) << 2.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_repeat_encodeBetterBlockAsm12B:
	// SI = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Copy R9 (<= 64) bytes from R10 to AX using overlapping head/tail moves;
	// the <= 4 case always moves exactly 4 bytes.
	CMPQ R9, $0x04
	JLE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4:
	MOVL (R10), R11
	MOVL R11, (AX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
	// Advance the dst pointer past the copied literals.
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B

memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
	// SI = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// Copy R9 bytes from R10 to AX: head and tail (32 bytes each) are saved
	// in X0-X3 and stored last; the middle is copied in 32-byte steps with
	// aligned stores starting at dst offset R14 - 32, R14 = 64 - (AX & 31).
	// NOTE(review): JA / JNA after DECQ rely on CF left by the preceding
	// SUBQ / ADDQ, since DECQ does not modify CF.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
	// Copy 32 bytes ending at offset R14, then advance; repeat while R9 >= R14.
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	// Store the saved head and tail, then advance the dst pointer.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
	// CX = end of the match; R12 = total match length; mark src up to CX
	// as emitted.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitRepeat
	// SI = length, R12 = length - 4; R8 still holds the (repeated) offset.
	// Form selection is the same as in the emitCopy tail above the
	// match_is_repeat label: 2, 2 (with offset), 3 or 4 bytes.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
	// 4 bytes: 0x19, 0x00, then (length - 4 - 256) as 16 bits.
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
	// 3 bytes: 0x15, 0x00, then (length - 8) as one byte.
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
	// 2 bytes: ((length - 4) << 2) | 1 written as a 16-bit word.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
	// 2 bytes: byte 0 = 1 | (length - 4) << 2 | (offset >> 8) << 5,
	// byte 1 = low 8 bits of the offset.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX

match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
	// Past the search limit: flush the tail. Otherwise return 0 if the dst
	// pointer has reached the limit.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBetterBlockAsm12B
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBetterBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm12B:
	// Insert positions around the match into the tables before searching
	// again. DI held the match start and is incremented first.
	//   6-byte-hash table: DI, DI+1, CX-2, CX-1
	//   4-byte-hash table: DI+1, DI+2, CX-1
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8
	INCL  DI
	// R10 = bytes at DI; R11 = R13 = bytes at DI+1; R12 = bytes at DI+2.
	MOVQ  (DX)(DI*1), R9
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	// R9 = 8 bytes at src[CX-2], used for the second group of inserts.
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x32, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x34, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x34, R12
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 65560(SP)(R11*4)
	MOVL  R15, 65560(SP)(R12*4)
	// R10 = bytes at CX-2; R11 = R13 = bytes at CX-1; R9 = CX-2, DI = CX-1.
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x32, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x34, R11
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x32, R13
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 65560(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeBetterBlockAsm12B

emit_remainder_encodeBetterBlockAsm12B:
	// Return 0 unless AX + 3 + (len(src) - 12(SP)) is below the dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBetterBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm12B:
	// Emit src[12(SP):len(src)] as one literal run; skip if it is empty.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
	// SI = literal length, CX = &src[literal start], DX = length - 1.
	// DX no longer holds the src base pointer from here on.
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Header size by (length - 1): < 60 one byte, < 256 two bytes, else three
	// bytes (0xf4 followed by the 16-bit value).
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBetterBlockAsm12B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm12B

two_bytes_emit_remainder_encodeBetterBlockAsm12B:
	// Two-byte header: 0xf0 followed by (length - 1).
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBetterBlockAsm12B
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm12B

one_byte_emit_remainder_encodeBetterBlockAsm12B:
	// One-byte header: (length - 1) << 2.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBetterBlockAsm12B:
	// DX = dst pointer after the literal bytes; BX = byte count.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Copy BX (<= 64) bytes from CX to AX using overlapping head/tail moves;
	// the <= 4 case always moves exactly 4 bytes.
	CMPQ BX, $0x04
	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4:
	MOVL (CX), SI
	MOVL SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B:
	// Advance the dst pointer past the copied literals.
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm12B

memmove_long_emit_remainder_encodeBetterBlockAsm12B:
	// DX = dst pointer after the literal bytes; BX = byte count.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Copy BX bytes from CX to AX: head and tail (32 bytes each) are saved
	// in X0-X3 and stored last; the middle is copied in 32-byte steps with
	// aligned stores starting at dst offset R8 - 32, R8 = 64 - (AX & 31).
	// NOTE(review): JA / JNA after DECQ rely on CF left by the preceding
	// SUBQ / ADDQ, since DECQ does not modify CF.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
	// Copy 32 bytes ending at offset R8, then advance; repeat while BX >= R8.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	// Store the saved head and tail, then advance the dst pointer.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET
7364
7365// func encodeBetterBlockAsm10B(dst []byte, src []byte) int
7366// Requires: SSE2
7367TEXT ·encodeBetterBlockAsm10B(SB), $20504-56
7368	MOVQ dst_base+0(FP), AX
7369	MOVQ $0x000000a0, CX
7370	LEAQ 24(SP), DX
7371	PXOR X0, X0
7372
7373zero_loop_encodeBetterBlockAsm10B:
7374	MOVOU X0, (DX)
7375	MOVOU X0, 16(DX)
7376	MOVOU X0, 32(DX)
7377	MOVOU X0, 48(DX)
7378	MOVOU X0, 64(DX)
7379	MOVOU X0, 80(DX)
7380	MOVOU X0, 96(DX)
7381	MOVOU X0, 112(DX)
7382	ADDQ  $0x80, DX
7383	DECQ  CX
7384	JNZ   zero_loop_encodeBetterBlockAsm10B
7385	MOVL  $0x00000000, 12(SP)
7386	MOVQ  src_len+32(FP), CX
7387	LEAQ  -6(CX), DX
7388	LEAQ  -8(CX), SI
7389	MOVL  SI, 8(SP)
7390	SHRQ  $0x05, CX
7391	SUBL  CX, DX
7392	LEAQ  (AX)(DX*1), DX
7393	MOVQ  DX, (SP)
7394	MOVL  $0x00000001, CX
7395	MOVL  $0x00000000, 16(SP)
7396	MOVQ  src_base+24(FP), DX
7397
7398search_loop_encodeBetterBlockAsm10B:
7399	MOVL  CX, SI
7400	SUBL  12(SP), SI
7401	SHRL  $0x05, SI
7402	LEAL  1(CX)(SI*1), SI
7403	CMPL  SI, 8(SP)
7404	JGE   emit_remainder_encodeBetterBlockAsm10B
7405	MOVQ  (DX)(CX*1), DI
7406	MOVL  SI, 20(SP)
7407	MOVQ  $0x0000cf1bbcdcbf9b, R9
7408	MOVQ  $0x9e3779b1, SI
7409	MOVQ  DI, R10
7410	MOVQ  DI, R11
7411	SHLQ  $0x10, R10
7412	IMULQ R9, R10
7413	SHRQ  $0x34, R10
7414	SHLQ  $0x20, R11
7415	IMULQ SI, R11
7416	SHRQ  $0x36, R11
7417	MOVL  24(SP)(R10*4), SI
7418	MOVL  16408(SP)(R11*4), R8
7419	MOVL  CX, 24(SP)(R10*4)
7420	MOVL  CX, 16408(SP)(R11*4)
7421	CMPL  (DX)(SI*1), DI
7422	JEQ   candidate_match_encodeBetterBlockAsm10B
7423	CMPL  (DX)(R8*1), DI
7424	JEQ   candidateS_match_encodeBetterBlockAsm10B
7425	MOVL  20(SP), CX
7426	JMP   search_loop_encodeBetterBlockAsm10B
7427
7428candidateS_match_encodeBetterBlockAsm10B:
7429	SHRQ  $0x08, DI
7430	MOVQ  DI, R10
7431	SHLQ  $0x10, R10
7432	IMULQ R9, R10
7433	SHRQ  $0x34, R10
7434	MOVL  24(SP)(R10*4), SI
7435	INCL  CX
7436	MOVL  CX, 24(SP)(R10*4)
7437	CMPL  (DX)(SI*1), DI
7438	JEQ   candidate_match_encodeBetterBlockAsm10B
7439	DECL  CX
7440	MOVL  R8, SI
7441
7442candidate_match_encodeBetterBlockAsm10B:
7443	MOVL  12(SP), DI
7444	TESTL SI, SI
7445	JZ    match_extend_back_end_encodeBetterBlockAsm10B
7446
7447match_extend_back_loop_encodeBetterBlockAsm10B:
7448	CMPL CX, DI
7449	JLE  match_extend_back_end_encodeBetterBlockAsm10B
7450	MOVB -1(DX)(SI*1), BL
7451	MOVB -1(DX)(CX*1), R8
7452	CMPB BL, R8
7453	JNE  match_extend_back_end_encodeBetterBlockAsm10B
7454	LEAL -1(CX), CX
7455	DECL SI
7456	JZ   match_extend_back_end_encodeBetterBlockAsm10B
7457	JMP  match_extend_back_loop_encodeBetterBlockAsm10B
7458
7459match_extend_back_end_encodeBetterBlockAsm10B:
7460	MOVL CX, DI
7461	SUBL 12(SP), DI
7462	LEAQ 3(AX)(DI*1), DI
7463	CMPQ DI, (SP)
7464	JL   match_dst_size_check_encodeBetterBlockAsm10B
7465	MOVQ $0x00000000, ret+48(FP)
7466	RET
7467
7468match_dst_size_check_encodeBetterBlockAsm10B:
7469	MOVL CX, DI
7470	ADDL $0x04, CX
7471	ADDL $0x04, SI
7472	MOVQ src_len+32(FP), R8
7473	SUBL CX, R8
7474	LEAQ (DX)(CX*1), R9
7475	LEAQ (DX)(SI*1), R10
7476
7477	// matchLen
7478	XORL R12, R12
7479	CMPL R8, $0x08
7480	JL   matchlen_single_match_nolit_encodeBetterBlockAsm10B
7481
7482matchlen_loopback_match_nolit_encodeBetterBlockAsm10B:
7483	MOVQ  (R9)(R12*1), R11
7484	XORQ  (R10)(R12*1), R11
7485	TESTQ R11, R11
7486	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm10B
7487	BSFQ  R11, R11
7488	SARQ  $0x03, R11
7489	LEAL  (R12)(R11*1), R12
7490	JMP   match_nolit_end_encodeBetterBlockAsm10B
7491
7492matchlen_loop_match_nolit_encodeBetterBlockAsm10B:
7493	LEAL -8(R8), R8
7494	LEAL 8(R12), R12
7495	CMPL R8, $0x08
7496	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm10B
7497
7498matchlen_single_match_nolit_encodeBetterBlockAsm10B:
7499	TESTL R8, R8
7500	JZ    match_nolit_end_encodeBetterBlockAsm10B
7501
7502matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B:
7503	MOVB (R9)(R12*1), R11
7504	CMPB (R10)(R12*1), R11
7505	JNE  match_nolit_end_encodeBetterBlockAsm10B
7506	LEAL 1(R12), R12
7507	DECL R8
7508	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B
7509
7510match_nolit_end_encodeBetterBlockAsm10B:
7511	MOVL CX, R8
7512	SUBL SI, R8
7513
7514	// Check if repeat
7515	CMPL 16(SP), R8
7516	JEQ  match_is_repeat_encodeBetterBlockAsm10B
7517	MOVL R8, 16(SP)
7518	MOVL 12(SP), SI
7519	CMPL SI, DI
7520	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm10B
7521	MOVL DI, R9
7522	MOVL DI, 12(SP)
7523	LEAQ (DX)(SI*1), R10
7524	SUBL SI, R9
7525	LEAL -1(R9), SI
7526	CMPL SI, $0x3c
7527	JLT  one_byte_match_emit_encodeBetterBlockAsm10B
7528	CMPL SI, $0x00000100
7529	JLT  two_bytes_match_emit_encodeBetterBlockAsm10B
7530	MOVB $0xf4, (AX)
7531	MOVW SI, 1(AX)
7532	ADDQ $0x03, AX
7533	JMP  memmove_long_match_emit_encodeBetterBlockAsm10B
7534
7535two_bytes_match_emit_encodeBetterBlockAsm10B:
7536	MOVB $0xf0, (AX)
7537	MOVB SI, 1(AX)
7538	ADDQ $0x02, AX
7539	CMPL SI, $0x40
7540	JL   memmove_match_emit_encodeBetterBlockAsm10B
7541	JMP  memmove_long_match_emit_encodeBetterBlockAsm10B
7542
7543one_byte_match_emit_encodeBetterBlockAsm10B:
7544	SHLB $0x02, SI
7545	MOVB SI, (AX)
7546	ADDQ $0x01, AX
7547
7548memmove_match_emit_encodeBetterBlockAsm10B:
7549	LEAQ (AX)(R9*1), SI
7550
7551	// genMemMoveShort
7552	CMPQ R9, $0x04
7553	JLE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
7554	CMPQ R9, $0x08
7555	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
7556	CMPQ R9, $0x10
7557	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
7558	CMPQ R9, $0x20
7559	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
7560	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64
7561
7562emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4:
7563	MOVL (R10), R11
7564	MOVL R11, (AX)
7565	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B
7566
7567emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:
7568	MOVL (R10), R11
7569	MOVL -4(R10)(R9*1), R10
7570	MOVL R11, (AX)
7571	MOVL R10, -4(AX)(R9*1)
7572	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B
7573
7574emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:
7575	MOVQ (R10), R11
7576	MOVQ -8(R10)(R9*1), R10
7577	MOVQ R11, (AX)
7578	MOVQ R10, -8(AX)(R9*1)
7579	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B
7580
7581emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:
7582	MOVOU (R10), X0
7583	MOVOU -16(R10)(R9*1), X1
7584	MOVOU X0, (AX)
7585	MOVOU X1, -16(AX)(R9*1)
7586	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm10B
7587
7588emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:
7589	MOVOU (R10), X0
7590	MOVOU 16(R10), X1
7591	MOVOU -32(R10)(R9*1), X2
7592	MOVOU -16(R10)(R9*1), X3
7593	MOVOU X0, (AX)
7594	MOVOU X1, 16(AX)
7595	MOVOU X2, -32(AX)(R9*1)
7596	MOVOU X3, -16(AX)(R9*1)
7597
7598memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
7599	MOVQ SI, AX
7600	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm10B
7601
7602memmove_long_match_emit_encodeBetterBlockAsm10B:
7603	LEAQ (AX)(R9*1), SI
7604
7605	// genMemMoveLong
7606	MOVOU (R10), X0
7607	MOVOU 16(R10), X1
7608	MOVOU -32(R10)(R9*1), X2
7609	MOVOU -16(R10)(R9*1), X3
7610	MOVQ  R9, R13
7611	SHRQ  $0x05, R13
7612	MOVQ  AX, R11
7613	ANDL  $0x0000001f, R11
7614	MOVQ  $0x00000040, R14
7615	SUBQ  R11, R14
7616	DECQ  R13
7617	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
7618	LEAQ  -32(R10)(R14*1), R11
7619	LEAQ  -32(AX)(R14*1), R15
7620
7621emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
7622	MOVOU (R11), X4
7623	MOVOU 16(R11), X5
7624	MOVOA X4, (R15)
7625	MOVOA X5, 16(R15)
7626	ADDQ  $0x20, R15
7627	ADDQ  $0x20, R11
7628	ADDQ  $0x20, R14
7629	DECQ  R13
7630	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back
7631
7632emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
7633	MOVOU -32(R10)(R14*1), X4
7634	MOVOU -16(R10)(R14*1), X5
7635	MOVOA X4, -32(AX)(R14*1)
7636	MOVOA X5, -16(AX)(R14*1)
7637	ADDQ  $0x20, R14
7638	CMPQ  R9, R14
7639	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
7640	MOVOU X0, (AX)
7641	MOVOU X1, 16(AX)
7642	MOVOU X2, -32(AX)(R9*1)
7643	MOVOU X3, -16(AX)(R9*1)
7644	MOVQ  SI, AX
7645
7646emit_literal_done_match_emit_encodeBetterBlockAsm10B:
7647	ADDL R12, CX
7648	ADDL $0x04, R12
7649	MOVL CX, 12(SP)
7650
7651	// emitCopy
7652two_byte_offset_match_nolit_encodeBetterBlockAsm10B:
7653	CMPL R12, $0x40
7654	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
7655	MOVB $0xee, (AX)
7656	MOVW R8, 1(AX)
7657	LEAL -60(R12), R12
7658	ADDQ $0x03, AX
7659
7660	// emitRepeat
7661	MOVL R12, SI
7662	LEAL -4(R12), R12
7663	CMPL SI, $0x08
7664	JLE  repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
7665	CMPL SI, $0x0c
7666	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
7667	CMPL R8, $0x00000800
7668	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
7669
7670cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
7671	CMPL R12, $0x00000104
7672	JLT  repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
7673	LEAL -256(R12), R12
7674	MOVW $0x0019, (AX)
7675	MOVW R12, 2(AX)
7676	ADDQ $0x04, AX
7677	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
7678
7679repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
7680	LEAL -4(R12), R12
7681	MOVW $0x0015, (AX)
7682	MOVB R12, 2(AX)
7683	ADDQ $0x03, AX
7684	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
7685
7686repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
7687	SHLL $0x02, R12
7688	ORL  $0x01, R12
7689	MOVW R12, (AX)
7690	ADDQ $0x02, AX
7691	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
7692
7693repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
7694	XORQ SI, SI
7695	LEAL 1(SI)(R12*4), R12
7696	MOVB R8, 1(AX)
7697	SARL $0x08, R8
7698	SHLL $0x05, R8
7699	ORL  R8, R12
7700	MOVB R12, (AX)
7701	ADDQ $0x02, AX
7702	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
7703	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B
7704
7705two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
7706	CMPL R12, $0x0c
7707	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm10B
7708	CMPL R8, $0x00000800
7709	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm10B
7710	MOVB $0x01, BL
7711	LEAL -16(BX)(R12*4), R12
7712	MOVB R8, 1(AX)
7713	SHRL $0x08, R8
7714	SHLL $0x05, R8
7715	ORL  R8, R12
7716	MOVB R12, (AX)
7717	ADDQ $0x02, AX
7718	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
7719
7720emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
7721	MOVB $0x02, BL
7722	LEAL -4(BX)(R12*4), R12
7723	MOVB R12, (AX)
7724	MOVW R8, 1(AX)
7725	ADDQ $0x03, AX
7726	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
7727
7728match_is_repeat_encodeBetterBlockAsm10B:
7729	MOVL 12(SP), SI
7730	CMPL SI, DI
7731	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
7732	MOVL DI, R9
7733	MOVL DI, 12(SP)
7734	LEAQ (DX)(SI*1), R10
7735	SUBL SI, R9
7736	LEAL -1(R9), SI
7737	CMPL SI, $0x3c
7738	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm10B
7739	CMPL SI, $0x00000100
7740	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
7741	MOVB $0xf4, (AX)
7742	MOVW SI, 1(AX)
7743	ADDQ $0x03, AX
7744	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
7745
7746two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
7747	MOVB $0xf0, (AX)
7748	MOVB SI, 1(AX)
7749	ADDQ $0x02, AX
7750	CMPL SI, $0x40
7751	JL   memmove_match_emit_repeat_encodeBetterBlockAsm10B
7752	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
7753
7754one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
7755	SHLB $0x02, SI
7756	MOVB SI, (AX)
7757	ADDQ $0x01, AX
7758
7759memmove_match_emit_repeat_encodeBetterBlockAsm10B:
7760	LEAQ (AX)(R9*1), SI
7761
7762	// genMemMoveShort
7763	CMPQ R9, $0x04
7764	JLE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
7765	CMPQ R9, $0x08
7766	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
7767	CMPQ R9, $0x10
7768	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
7769	CMPQ R9, $0x20
7770	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
7771	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64
7772
7773emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4:
7774	MOVL (R10), R11
7775	MOVL R11, (AX)
7776	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
7777
7778emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:
7779	MOVL (R10), R11
7780	MOVL -4(R10)(R9*1), R10
7781	MOVL R11, (AX)
7782	MOVL R10, -4(AX)(R9*1)
7783	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
7784
7785emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:
7786	MOVQ (R10), R11
7787	MOVQ -8(R10)(R9*1), R10
7788	MOVQ R11, (AX)
7789	MOVQ R10, -8(AX)(R9*1)
7790	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
7791
7792emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:
7793	MOVOU (R10), X0
7794	MOVOU -16(R10)(R9*1), X1
7795	MOVOU X0, (AX)
7796	MOVOU X1, -16(AX)(R9*1)
7797	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
7798
7799emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:
7800	MOVOU (R10), X0
7801	MOVOU 16(R10), X1
7802	MOVOU -32(R10)(R9*1), X2
7803	MOVOU -16(R10)(R9*1), X3
7804	MOVOU X0, (AX)
7805	MOVOU X1, 16(AX)
7806	MOVOU X2, -32(AX)(R9*1)
7807	MOVOU X3, -16(AX)(R9*1)
7808
7809memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
7810	MOVQ SI, AX
7811	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
7812
7813memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
7814	LEAQ (AX)(R9*1), SI
7815
7816	// genMemMoveLong
7817	MOVOU (R10), X0
7818	MOVOU 16(R10), X1
7819	MOVOU -32(R10)(R9*1), X2
7820	MOVOU -16(R10)(R9*1), X3
7821	MOVQ  R9, R13
7822	SHRQ  $0x05, R13
7823	MOVQ  AX, R11
7824	ANDL  $0x0000001f, R11
7825	MOVQ  $0x00000040, R14
7826	SUBQ  R11, R14
7827	DECQ  R13
7828	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
7829	LEAQ  -32(R10)(R14*1), R11
7830	LEAQ  -32(AX)(R14*1), R15
7831
7832emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
7833	MOVOU (R11), X4
7834	MOVOU 16(R11), X5
7835	MOVOA X4, (R15)
7836	MOVOA X5, 16(R15)
7837	ADDQ  $0x20, R15
7838	ADDQ  $0x20, R11
7839	ADDQ  $0x20, R14
7840	DECQ  R13
7841	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back
7842
7843emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
7844	MOVOU -32(R10)(R14*1), X4
7845	MOVOU -16(R10)(R14*1), X5
7846	MOVOA X4, -32(AX)(R14*1)
7847	MOVOA X5, -16(AX)(R14*1)
7848	ADDQ  $0x20, R14
7849	CMPQ  R9, R14
7850	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
7851	MOVOU X0, (AX)
7852	MOVOU X1, 16(AX)
7853	MOVOU X2, -32(AX)(R9*1)
7854	MOVOU X3, -16(AX)(R9*1)
7855	MOVQ  SI, AX
7856
7857emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
7858	ADDL R12, CX
7859	ADDL $0x04, R12
7860	MOVL CX, 12(SP)
7861
7862	// emitRepeat
7863	MOVL R12, SI
7864	LEAL -4(R12), R12
7865	CMPL SI, $0x08
7866	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
7867	CMPL SI, $0x0c
7868	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
7869	CMPL R8, $0x00000800
7870	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
7871
7872cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
7873	CMPL R12, $0x00000104
7874	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
7875	LEAL -256(R12), R12
7876	MOVW $0x0019, (AX)
7877	MOVW R12, 2(AX)
7878	ADDQ $0x04, AX
7879	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
7880
7881repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
7882	LEAL -4(R12), R12
7883	MOVW $0x0015, (AX)
7884	MOVB R12, 2(AX)
7885	ADDQ $0x03, AX
7886	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
7887
7888repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
7889	SHLL $0x02, R12
7890	ORL  $0x01, R12
7891	MOVW R12, (AX)
7892	ADDQ $0x02, AX
7893	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B
7894
7895repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
7896	XORQ SI, SI
7897	LEAL 1(SI)(R12*4), R12
7898	MOVB R8, 1(AX)
7899	SARL $0x08, R8
7900	SHLL $0x05, R8
7901	ORL  R8, R12
7902	MOVB R12, (AX)
7903	ADDQ $0x02, AX
7904
7905match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
7906	CMPL CX, 8(SP)
7907	JGE  emit_remainder_encodeBetterBlockAsm10B
7908	CMPQ AX, (SP)
7909	JL   match_nolit_dst_ok_encodeBetterBlockAsm10B
7910	MOVQ $0x00000000, ret+48(FP)
7911	RET
7912
7913match_nolit_dst_ok_encodeBetterBlockAsm10B:
7914	MOVQ  $0x0000cf1bbcdcbf9b, SI
7915	MOVQ  $0x9e3779b1, R8
7916	INCL  DI
7917	MOVQ  (DX)(DI*1), R9
7918	MOVQ  R9, R10
7919	MOVQ  R9, R11
7920	MOVQ  R9, R12
7921	SHRQ  $0x08, R11
7922	MOVQ  R11, R13
7923	SHRQ  $0x10, R12
7924	LEAL  1(DI), R14
7925	LEAL  2(DI), R15
7926	MOVQ  -2(DX)(CX*1), R9
7927	SHLQ  $0x10, R10
7928	IMULQ SI, R10
7929	SHRQ  $0x34, R10
7930	SHLQ  $0x10, R13
7931	IMULQ SI, R13
7932	SHRQ  $0x34, R13
7933	SHLQ  $0x20, R11
7934	IMULQ R8, R11
7935	SHRQ  $0x36, R11
7936	SHLQ  $0x20, R12
7937	IMULQ R8, R12
7938	SHRQ  $0x36, R12
7939	MOVL  DI, 24(SP)(R10*4)
7940	MOVL  R14, 24(SP)(R13*4)
7941	MOVL  R14, 16408(SP)(R11*4)
7942	MOVL  R15, 16408(SP)(R12*4)
7943	MOVQ  R9, R10
7944	MOVQ  R9, R11
7945	SHRQ  $0x08, R11
7946	MOVQ  R11, R13
7947	LEAL  -2(CX), R9
7948	LEAL  -1(CX), DI
7949	SHLQ  $0x10, R10
7950	IMULQ SI, R10
7951	SHRQ  $0x34, R10
7952	SHLQ  $0x20, R11
7953	IMULQ R8, R11
7954	SHRQ  $0x36, R11
7955	SHLQ  $0x10, R13
7956	IMULQ SI, R13
7957	SHRQ  $0x34, R13
7958	MOVL  R9, 24(SP)(R10*4)
7959	MOVL  DI, 16408(SP)(R11*4)
7960	MOVL  DI, 24(SP)(R13*4)
7961	JMP   search_loop_encodeBetterBlockAsm10B
7962
7963emit_remainder_encodeBetterBlockAsm10B:
7964	MOVQ src_len+32(FP), CX
7965	SUBL 12(SP), CX
7966	LEAQ 3(AX)(CX*1), CX
7967	CMPQ CX, (SP)
7968	JL   emit_remainder_ok_encodeBetterBlockAsm10B
7969	MOVQ $0x00000000, ret+48(FP)
7970	RET
7971
7972emit_remainder_ok_encodeBetterBlockAsm10B:
7973	MOVQ src_len+32(FP), CX
7974	MOVL 12(SP), BX
7975	CMPL BX, CX
7976	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
7977	MOVL CX, SI
7978	MOVL CX, 12(SP)
7979	LEAQ (DX)(BX*1), CX
7980	SUBL BX, SI
7981	LEAL -1(SI), DX
7982	CMPL DX, $0x3c
7983	JLT  one_byte_emit_remainder_encodeBetterBlockAsm10B
7984	CMPL DX, $0x00000100
7985	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm10B
7986	MOVB $0xf4, (AX)
7987	MOVW DX, 1(AX)
7988	ADDQ $0x03, AX
7989	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm10B
7990
7991two_bytes_emit_remainder_encodeBetterBlockAsm10B:
7992	MOVB $0xf0, (AX)
7993	MOVB DL, 1(AX)
7994	ADDQ $0x02, AX
7995	CMPL DX, $0x40
7996	JL   memmove_emit_remainder_encodeBetterBlockAsm10B
7997	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm10B
7998
7999one_byte_emit_remainder_encodeBetterBlockAsm10B:
8000	SHLB $0x02, DL
8001	MOVB DL, (AX)
8002	ADDQ $0x01, AX
8003
8004memmove_emit_remainder_encodeBetterBlockAsm10B:
8005	LEAQ (AX)(SI*1), DX
8006	MOVL SI, BX
8007
8008	// genMemMoveShort
8009	CMPQ BX, $0x04
8010	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4
8011	CMPQ BX, $0x08
8012	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
8013	CMPQ BX, $0x10
8014	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16
8015	CMPQ BX, $0x20
8016	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
8017	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64
8018
8019emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4:
8020	MOVL (CX), SI
8021	MOVL SI, (AX)
8022	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
8023
8024emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7:
8025	MOVL (CX), SI
8026	MOVL -4(CX)(BX*1), CX
8027	MOVL SI, (AX)
8028	MOVL CX, -4(AX)(BX*1)
8029	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
8030
8031emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16:
8032	MOVQ (CX), SI
8033	MOVQ -8(CX)(BX*1), CX
8034	MOVQ SI, (AX)
8035	MOVQ CX, -8(AX)(BX*1)
8036	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
8037
8038emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32:
8039	MOVOU (CX), X0
8040	MOVOU -16(CX)(BX*1), X1
8041	MOVOU X0, (AX)
8042	MOVOU X1, -16(AX)(BX*1)
8043	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
8044
8045emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64:
8046	MOVOU (CX), X0
8047	MOVOU 16(CX), X1
8048	MOVOU -32(CX)(BX*1), X2
8049	MOVOU -16(CX)(BX*1), X3
8050	MOVOU X0, (AX)
8051	MOVOU X1, 16(AX)
8052	MOVOU X2, -32(AX)(BX*1)
8053	MOVOU X3, -16(AX)(BX*1)
8054
8055memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B:
8056	MOVQ DX, AX
8057	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
8058
8059memmove_long_emit_remainder_encodeBetterBlockAsm10B:
8060	LEAQ (AX)(SI*1), DX
8061	MOVL SI, BX
8062
8063	// genMemMoveLong
8064	MOVOU (CX), X0
8065	MOVOU 16(CX), X1
8066	MOVOU -32(CX)(BX*1), X2
8067	MOVOU -16(CX)(BX*1), X3
8068	MOVQ  BX, DI
8069	SHRQ  $0x05, DI
8070	MOVQ  AX, SI
8071	ANDL  $0x0000001f, SI
8072	MOVQ  $0x00000040, R8
8073	SUBQ  SI, R8
8074	DECQ  DI
8075	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
8076	LEAQ  -32(CX)(R8*1), SI
8077	LEAQ  -32(AX)(R8*1), R9
8078
8079emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
8080	MOVOU (SI), X4
8081	MOVOU 16(SI), X5
8082	MOVOA X4, (R9)
8083	MOVOA X5, 16(R9)
8084	ADDQ  $0x20, R9
8085	ADDQ  $0x20, SI
8086	ADDQ  $0x20, R8
8087	DECQ  DI
8088	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back
8089
8090emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
8091	MOVOU -32(CX)(R8*1), X4
8092	MOVOU -16(CX)(R8*1), X5
8093	MOVOA X4, -32(AX)(R8*1)
8094	MOVOA X5, -16(AX)(R8*1)
8095	ADDQ  $0x20, R8
8096	CMPQ  BX, R8
8097	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
8098	MOVOU X0, (AX)
8099	MOVOU X1, 16(AX)
8100	MOVOU X2, -32(AX)(BX*1)
8101	MOVOU X3, -16(AX)(BX*1)
8102	MOVQ  DX, AX
8103
8104emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
8105	MOVQ dst_base+0(FP), CX
8106	SUBQ CX, AX
8107	MOVQ AX, ret+48(FP)
8108	RET
8109
8110// func encodeBetterBlockAsm8B(dst []byte, src []byte) int
8111// Requires: SSE2
8112TEXT ·encodeBetterBlockAsm8B(SB), $5144-56
8113	MOVQ dst_base+0(FP), AX
8114	MOVQ $0x00000028, CX
8115	LEAQ 24(SP), DX
8116	PXOR X0, X0
8117
8118zero_loop_encodeBetterBlockAsm8B:
8119	MOVOU X0, (DX)
8120	MOVOU X0, 16(DX)
8121	MOVOU X0, 32(DX)
8122	MOVOU X0, 48(DX)
8123	MOVOU X0, 64(DX)
8124	MOVOU X0, 80(DX)
8125	MOVOU X0, 96(DX)
8126	MOVOU X0, 112(DX)
8127	ADDQ  $0x80, DX
8128	DECQ  CX
8129	JNZ   zero_loop_encodeBetterBlockAsm8B
8130	MOVL  $0x00000000, 12(SP)
8131	MOVQ  src_len+32(FP), CX
8132	LEAQ  -6(CX), DX
8133	LEAQ  -8(CX), SI
8134	MOVL  SI, 8(SP)
8135	SHRQ  $0x05, CX
8136	SUBL  CX, DX
8137	LEAQ  (AX)(DX*1), DX
8138	MOVQ  DX, (SP)
8139	MOVL  $0x00000001, CX
8140	MOVL  $0x00000000, 16(SP)
8141	MOVQ  src_base+24(FP), DX
8142
8143search_loop_encodeBetterBlockAsm8B:
8144	MOVL  CX, SI
8145	SUBL  12(SP), SI
8146	SHRL  $0x04, SI
8147	LEAL  1(CX)(SI*1), SI
8148	CMPL  SI, 8(SP)
8149	JGE   emit_remainder_encodeBetterBlockAsm8B
8150	MOVQ  (DX)(CX*1), DI
8151	MOVL  SI, 20(SP)
8152	MOVQ  $0x0000cf1bbcdcbf9b, R9
8153	MOVQ  $0x9e3779b1, SI
8154	MOVQ  DI, R10
8155	MOVQ  DI, R11
8156	SHLQ  $0x10, R10
8157	IMULQ R9, R10
8158	SHRQ  $0x36, R10
8159	SHLQ  $0x20, R11
8160	IMULQ SI, R11
8161	SHRQ  $0x38, R11
8162	MOVL  24(SP)(R10*4), SI
8163	MOVL  4120(SP)(R11*4), R8
8164	MOVL  CX, 24(SP)(R10*4)
8165	MOVL  CX, 4120(SP)(R11*4)
8166	CMPL  (DX)(SI*1), DI
8167	JEQ   candidate_match_encodeBetterBlockAsm8B
8168	CMPL  (DX)(R8*1), DI
8169	JEQ   candidateS_match_encodeBetterBlockAsm8B
8170	MOVL  20(SP), CX
8171	JMP   search_loop_encodeBetterBlockAsm8B
8172
8173candidateS_match_encodeBetterBlockAsm8B:
8174	SHRQ  $0x08, DI
8175	MOVQ  DI, R10
8176	SHLQ  $0x10, R10
8177	IMULQ R9, R10
8178	SHRQ  $0x36, R10
8179	MOVL  24(SP)(R10*4), SI
8180	INCL  CX
8181	MOVL  CX, 24(SP)(R10*4)
8182	CMPL  (DX)(SI*1), DI
8183	JEQ   candidate_match_encodeBetterBlockAsm8B
8184	DECL  CX
8185	MOVL  R8, SI
8186
8187candidate_match_encodeBetterBlockAsm8B:
8188	MOVL  12(SP), DI
8189	TESTL SI, SI
8190	JZ    match_extend_back_end_encodeBetterBlockAsm8B
8191
8192match_extend_back_loop_encodeBetterBlockAsm8B:
8193	CMPL CX, DI
8194	JLE  match_extend_back_end_encodeBetterBlockAsm8B
8195	MOVB -1(DX)(SI*1), BL
8196	MOVB -1(DX)(CX*1), R8
8197	CMPB BL, R8
8198	JNE  match_extend_back_end_encodeBetterBlockAsm8B
8199	LEAL -1(CX), CX
8200	DECL SI
8201	JZ   match_extend_back_end_encodeBetterBlockAsm8B
8202	JMP  match_extend_back_loop_encodeBetterBlockAsm8B
8203
8204match_extend_back_end_encodeBetterBlockAsm8B:
8205	MOVL CX, DI
8206	SUBL 12(SP), DI
8207	LEAQ 3(AX)(DI*1), DI
8208	CMPQ DI, (SP)
8209	JL   match_dst_size_check_encodeBetterBlockAsm8B
8210	MOVQ $0x00000000, ret+48(FP)
8211	RET
8212
8213match_dst_size_check_encodeBetterBlockAsm8B:
8214	MOVL CX, DI
8215	ADDL $0x04, CX
8216	ADDL $0x04, SI
8217	MOVQ src_len+32(FP), R8
8218	SUBL CX, R8
8219	LEAQ (DX)(CX*1), R9
8220	LEAQ (DX)(SI*1), R10
8221
8222	// matchLen
8223	XORL R12, R12
8224	CMPL R8, $0x08
8225	JL   matchlen_single_match_nolit_encodeBetterBlockAsm8B
8226
8227matchlen_loopback_match_nolit_encodeBetterBlockAsm8B:
8228	MOVQ  (R9)(R12*1), R11
8229	XORQ  (R10)(R12*1), R11
8230	TESTQ R11, R11
8231	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm8B
8232	BSFQ  R11, R11
8233	SARQ  $0x03, R11
8234	LEAL  (R12)(R11*1), R12
8235	JMP   match_nolit_end_encodeBetterBlockAsm8B
8236
8237matchlen_loop_match_nolit_encodeBetterBlockAsm8B:
8238	LEAL -8(R8), R8
8239	LEAL 8(R12), R12
8240	CMPL R8, $0x08
8241	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm8B
8242
8243matchlen_single_match_nolit_encodeBetterBlockAsm8B:
8244	TESTL R8, R8
8245	JZ    match_nolit_end_encodeBetterBlockAsm8B
8246
8247matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B:
8248	MOVB (R9)(R12*1), R11
8249	CMPB (R10)(R12*1), R11
8250	JNE  match_nolit_end_encodeBetterBlockAsm8B
8251	LEAL 1(R12), R12
8252	DECL R8
8253	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B
8254
8255match_nolit_end_encodeBetterBlockAsm8B:
8256	MOVL CX, R8
8257	SUBL SI, R8
8258
8259	// Check if repeat
8260	CMPL 16(SP), R8
8261	JEQ  match_is_repeat_encodeBetterBlockAsm8B
8262	MOVL R8, 16(SP)
8263	MOVL 12(SP), SI
8264	CMPL SI, DI
8265	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm8B
8266	MOVL DI, R9
8267	MOVL DI, 12(SP)
8268	LEAQ (DX)(SI*1), R10
8269	SUBL SI, R9
8270	LEAL -1(R9), SI
8271	CMPL SI, $0x3c
8272	JLT  one_byte_match_emit_encodeBetterBlockAsm8B
8273	CMPL SI, $0x00000100
8274	JLT  two_bytes_match_emit_encodeBetterBlockAsm8B
8275	MOVB $0xf4, (AX)
8276	MOVW SI, 1(AX)
8277	ADDQ $0x03, AX
8278	JMP  memmove_long_match_emit_encodeBetterBlockAsm8B
8279
8280two_bytes_match_emit_encodeBetterBlockAsm8B:
8281	MOVB $0xf0, (AX)
8282	MOVB SI, 1(AX)
8283	ADDQ $0x02, AX
8284	CMPL SI, $0x40
8285	JL   memmove_match_emit_encodeBetterBlockAsm8B
8286	JMP  memmove_long_match_emit_encodeBetterBlockAsm8B
8287
8288one_byte_match_emit_encodeBetterBlockAsm8B:
8289	SHLB $0x02, SI
8290	MOVB SI, (AX)
8291	ADDQ $0x01, AX
8292
8293memmove_match_emit_encodeBetterBlockAsm8B:
8294	LEAQ (AX)(R9*1), SI
8295
8296	// genMemMoveShort
8297	CMPQ R9, $0x04
8298	JLE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
8299	CMPQ R9, $0x08
8300	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
8301	CMPQ R9, $0x10
8302	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
8303	CMPQ R9, $0x20
8304	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
8305	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64
8306
8307emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4:
8308	MOVL (R10), R11
8309	MOVL R11, (AX)
8310	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B
8311
8312emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7:
8313	MOVL (R10), R11
8314	MOVL -4(R10)(R9*1), R10
8315	MOVL R11, (AX)
8316	MOVL R10, -4(AX)(R9*1)
8317	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B
8318
8319emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16:
8320	MOVQ (R10), R11
8321	MOVQ -8(R10)(R9*1), R10
8322	MOVQ R11, (AX)
8323	MOVQ R10, -8(AX)(R9*1)
8324	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B
8325
8326emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32:
8327	MOVOU (R10), X0
8328	MOVOU -16(R10)(R9*1), X1
8329	MOVOU X0, (AX)
8330	MOVOU X1, -16(AX)(R9*1)
8331	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm8B
8332
8333emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64:
8334	MOVOU (R10), X0
8335	MOVOU 16(R10), X1
8336	MOVOU -32(R10)(R9*1), X2
8337	MOVOU -16(R10)(R9*1), X3
8338	MOVOU X0, (AX)
8339	MOVOU X1, 16(AX)
8340	MOVOU X2, -32(AX)(R9*1)
8341	MOVOU X3, -16(AX)(R9*1)
8342
8343memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
8344	MOVQ SI, AX
8345	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm8B
8346
8347memmove_long_match_emit_encodeBetterBlockAsm8B:
8348	LEAQ (AX)(R9*1), SI
8349
8350	// genMemMoveLong
8351	MOVOU (R10), X0
8352	MOVOU 16(R10), X1
8353	MOVOU -32(R10)(R9*1), X2
8354	MOVOU -16(R10)(R9*1), X3
8355	MOVQ  R9, R13
8356	SHRQ  $0x05, R13
8357	MOVQ  AX, R11
8358	ANDL  $0x0000001f, R11
8359	MOVQ  $0x00000040, R14
8360	SUBQ  R11, R14
8361	DECQ  R13
8362	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
8363	LEAQ  -32(R10)(R14*1), R11
8364	LEAQ  -32(AX)(R14*1), R15
8365
8366emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
8367	MOVOU (R11), X4
8368	MOVOU 16(R11), X5
8369	MOVOA X4, (R15)
8370	MOVOA X5, 16(R15)
8371	ADDQ  $0x20, R15
8372	ADDQ  $0x20, R11
8373	ADDQ  $0x20, R14
8374	DECQ  R13
8375	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back
8376
8377emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
8378	MOVOU -32(R10)(R14*1), X4
8379	MOVOU -16(R10)(R14*1), X5
8380	MOVOA X4, -32(AX)(R14*1)
8381	MOVOA X5, -16(AX)(R14*1)
8382	ADDQ  $0x20, R14
8383	CMPQ  R9, R14
8384	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
8385	MOVOU X0, (AX)
8386	MOVOU X1, 16(AX)
8387	MOVOU X2, -32(AX)(R9*1)
8388	MOVOU X3, -16(AX)(R9*1)
8389	MOVQ  SI, AX
8390
8391emit_literal_done_match_emit_encodeBetterBlockAsm8B:
8392	ADDL R12, CX
8393	ADDL $0x04, R12
8394	MOVL CX, 12(SP)
8395
8396	// emitCopy
8397two_byte_offset_match_nolit_encodeBetterBlockAsm8B:
8398	CMPL R12, $0x40
8399	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
8400	MOVB $0xee, (AX)
8401	MOVW R8, 1(AX)
8402	LEAL -60(R12), R12
8403	ADDQ $0x03, AX
8404
8405	// emitRepeat
8406	MOVL R12, SI
8407	LEAL -4(R12), R12
8408	CMPL SI, $0x08
8409	JLE  repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
8410	CMPL SI, $0x0c
8411	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
8412
8413cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
8414	CMPL R12, $0x00000104
8415	JLT  repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
8416	LEAL -256(R12), R12
8417	MOVW $0x0019, (AX)
8418	MOVW R12, 2(AX)
8419	ADDQ $0x04, AX
8420	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
8421
8422repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
8423	LEAL -4(R12), R12
8424	MOVW $0x0015, (AX)
8425	MOVB R12, 2(AX)
8426	ADDQ $0x03, AX
8427	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
8428
8429repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
8430	SHLL $0x02, R12
8431	ORL  $0x01, R12
8432	MOVW R12, (AX)
8433	ADDQ $0x02, AX
8434	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
8435	XORQ SI, SI
8436	LEAL 1(SI)(R12*4), R12
8437	MOVB R8, 1(AX)
8438	SARL $0x08, R8
8439	SHLL $0x05, R8
8440	ORL  R8, R12
8441	MOVB R12, (AX)
8442	ADDQ $0x02, AX
8443	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
8444	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B
8445
8446two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B:
8447	CMPL R12, $0x0c
8448	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm8B
8449	MOVB $0x01, BL
8450	LEAL -16(BX)(R12*4), R12
8451	MOVB R8, 1(AX)
8452	SHRL $0x08, R8
8453	SHLL $0x05, R8
8454	ORL  R8, R12
8455	MOVB R12, (AX)
8456	ADDQ $0x02, AX
8457	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
8458
8459emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
8460	MOVB $0x02, BL
8461	LEAL -4(BX)(R12*4), R12
8462	MOVB R12, (AX)
8463	MOVW R8, 1(AX)
8464	ADDQ $0x03, AX
8465	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
8466
8467match_is_repeat_encodeBetterBlockAsm8B:
8468	MOVL 12(SP), SI
8469	CMPL SI, DI
8470	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
8471	MOVL DI, R8
8472	MOVL DI, 12(SP)
8473	LEAQ (DX)(SI*1), R9
8474	SUBL SI, R8
8475	LEAL -1(R8), SI
8476	CMPL SI, $0x3c
8477	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm8B
8478	CMPL SI, $0x00000100
8479	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
8480	MOVB $0xf4, (AX)
8481	MOVW SI, 1(AX)
8482	ADDQ $0x03, AX
8483	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
8484
8485two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
8486	MOVB $0xf0, (AX)
8487	MOVB SI, 1(AX)
8488	ADDQ $0x02, AX
8489	CMPL SI, $0x40
8490	JL   memmove_match_emit_repeat_encodeBetterBlockAsm8B
8491	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
8492
8493one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
8494	SHLB $0x02, SI
8495	MOVB SI, (AX)
8496	ADDQ $0x01, AX
8497
8498memmove_match_emit_repeat_encodeBetterBlockAsm8B:
8499	LEAQ (AX)(R8*1), SI
8500
8501	// genMemMoveShort
8502	CMPQ R8, $0x04
8503	JLE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
8504	CMPQ R8, $0x08
8505	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
8506	CMPQ R8, $0x10
8507	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
8508	CMPQ R8, $0x20
8509	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
8510	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64
8511
8512emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4:
8513	MOVL (R9), R10
8514	MOVL R10, (AX)
8515	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
8516
8517emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7:
8518	MOVL (R9), R10
8519	MOVL -4(R9)(R8*1), R9
8520	MOVL R10, (AX)
8521	MOVL R9, -4(AX)(R8*1)
8522	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
8523
8524emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16:
8525	MOVQ (R9), R10
8526	MOVQ -8(R9)(R8*1), R9
8527	MOVQ R10, (AX)
8528	MOVQ R9, -8(AX)(R8*1)
8529	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
8530
8531emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32:
8532	MOVOU (R9), X0
8533	MOVOU -16(R9)(R8*1), X1
8534	MOVOU X0, (AX)
8535	MOVOU X1, -16(AX)(R8*1)
8536	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
8537
8538emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64:
8539	MOVOU (R9), X0
8540	MOVOU 16(R9), X1
8541	MOVOU -32(R9)(R8*1), X2
8542	MOVOU -16(R9)(R8*1), X3
8543	MOVOU X0, (AX)
8544	MOVOU X1, 16(AX)
8545	MOVOU X2, -32(AX)(R8*1)
8546	MOVOU X3, -16(AX)(R8*1)
8547
8548memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
8549	MOVQ SI, AX
8550	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
8551
8552memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
8553	LEAQ (AX)(R8*1), SI
8554
8555	// genMemMoveLong
8556	MOVOU (R9), X0
8557	MOVOU 16(R9), X1
8558	MOVOU -32(R9)(R8*1), X2
8559	MOVOU -16(R9)(R8*1), X3
8560	MOVQ  R8, R11
8561	SHRQ  $0x05, R11
8562	MOVQ  AX, R10
8563	ANDL  $0x0000001f, R10
8564	MOVQ  $0x00000040, R13
8565	SUBQ  R10, R13
8566	DECQ  R11
8567	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
8568	LEAQ  -32(R9)(R13*1), R10
8569	LEAQ  -32(AX)(R13*1), R14
8570
8571emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
8572	MOVOU (R10), X4
8573	MOVOU 16(R10), X5
8574	MOVOA X4, (R14)
8575	MOVOA X5, 16(R14)
8576	ADDQ  $0x20, R14
8577	ADDQ  $0x20, R10
8578	ADDQ  $0x20, R13
8579	DECQ  R11
8580	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back
8581
8582emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
8583	MOVOU -32(R9)(R13*1), X4
8584	MOVOU -16(R9)(R13*1), X5
8585	MOVOA X4, -32(AX)(R13*1)
8586	MOVOA X5, -16(AX)(R13*1)
8587	ADDQ  $0x20, R13
8588	CMPQ  R8, R13
8589	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
8590	MOVOU X0, (AX)
8591	MOVOU X1, 16(AX)
8592	MOVOU X2, -32(AX)(R8*1)
8593	MOVOU X3, -16(AX)(R8*1)
8594	MOVQ  SI, AX
8595
8596emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
8597	ADDL R12, CX
8598	ADDL $0x04, R12
8599	MOVL CX, 12(SP)
8600
8601	// emitRepeat
8602	MOVL R12, SI
8603	LEAL -4(R12), R12
8604	CMPL SI, $0x08
8605	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B
8606	CMPL SI, $0x0c
8607	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B
8608
8609cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
8610	CMPL R12, $0x00000104
8611	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
8612	LEAL -256(R12), R12
8613	MOVW $0x0019, (AX)
8614	MOVW R12, 2(AX)
8615	ADDQ $0x04, AX
8616	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
8617
8618repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
8619	LEAL -4(R12), R12
8620	MOVW $0x0015, (AX)
8621	MOVB R12, 2(AX)
8622	ADDQ $0x03, AX
8623	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
8624
8625repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
8626	SHLL $0x02, R12
8627	ORL  $0x01, R12
8628	MOVW R12, (AX)
8629	ADDQ $0x02, AX
8630	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
8631	XORQ SI, SI
8632	LEAL 1(SI)(R12*4), R12
8633	MOVB R8, 1(AX)
8634	SARL $0x08, R8
8635	SHLL $0x05, R8
8636	ORL  R8, R12
8637	MOVB R12, (AX)
8638	ADDQ $0x02, AX
8639
8640match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
8641	CMPL CX, 8(SP)
8642	JGE  emit_remainder_encodeBetterBlockAsm8B
8643	CMPQ AX, (SP)
8644	JL   match_nolit_dst_ok_encodeBetterBlockAsm8B
8645	MOVQ $0x00000000, ret+48(FP)
8646	RET
8647
8648match_nolit_dst_ok_encodeBetterBlockAsm8B:
8649	MOVQ  $0x0000cf1bbcdcbf9b, SI
8650	MOVQ  $0x9e3779b1, R8
8651	INCL  DI
8652	MOVQ  (DX)(DI*1), R9
8653	MOVQ  R9, R10
8654	MOVQ  R9, R11
8655	MOVQ  R9, R12
8656	SHRQ  $0x08, R11
8657	MOVQ  R11, R13
8658	SHRQ  $0x10, R12
8659	LEAL  1(DI), R14
8660	LEAL  2(DI), R15
8661	MOVQ  -2(DX)(CX*1), R9
8662	SHLQ  $0x10, R10
8663	IMULQ SI, R10
8664	SHRQ  $0x36, R10
8665	SHLQ  $0x10, R13
8666	IMULQ SI, R13
8667	SHRQ  $0x36, R13
8668	SHLQ  $0x20, R11
8669	IMULQ R8, R11
8670	SHRQ  $0x38, R11
8671	SHLQ  $0x20, R12
8672	IMULQ R8, R12
8673	SHRQ  $0x38, R12
8674	MOVL  DI, 24(SP)(R10*4)
8675	MOVL  R14, 24(SP)(R13*4)
8676	MOVL  R14, 4120(SP)(R11*4)
8677	MOVL  R15, 4120(SP)(R12*4)
8678	MOVQ  R9, R10
8679	MOVQ  R9, R11
8680	SHRQ  $0x08, R11
8681	MOVQ  R11, R13
8682	LEAL  -2(CX), R9
8683	LEAL  -1(CX), DI
8684	SHLQ  $0x10, R10
8685	IMULQ SI, R10
8686	SHRQ  $0x36, R10
8687	SHLQ  $0x20, R11
8688	IMULQ R8, R11
8689	SHRQ  $0x38, R11
8690	SHLQ  $0x10, R13
8691	IMULQ SI, R13
8692	SHRQ  $0x36, R13
8693	MOVL  R9, 24(SP)(R10*4)
8694	MOVL  DI, 4120(SP)(R11*4)
8695	MOVL  DI, 24(SP)(R13*4)
8696	JMP   search_loop_encodeBetterBlockAsm8B
8697
8698emit_remainder_encodeBetterBlockAsm8B:
8699	MOVQ src_len+32(FP), CX
8700	SUBL 12(SP), CX
8701	LEAQ 3(AX)(CX*1), CX
8702	CMPQ CX, (SP)
8703	JL   emit_remainder_ok_encodeBetterBlockAsm8B
8704	MOVQ $0x00000000, ret+48(FP)
8705	RET
8706
8707emit_remainder_ok_encodeBetterBlockAsm8B:
8708	MOVQ src_len+32(FP), CX
8709	MOVL 12(SP), BX
8710	CMPL BX, CX
8711	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
8712	MOVL CX, SI
8713	MOVL CX, 12(SP)
8714	LEAQ (DX)(BX*1), CX
8715	SUBL BX, SI
8716	LEAL -1(SI), DX
8717	CMPL DX, $0x3c
8718	JLT  one_byte_emit_remainder_encodeBetterBlockAsm8B
8719	CMPL DX, $0x00000100
8720	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm8B
8721	MOVB $0xf4, (AX)
8722	MOVW DX, 1(AX)
8723	ADDQ $0x03, AX
8724	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm8B
8725
8726two_bytes_emit_remainder_encodeBetterBlockAsm8B:
8727	MOVB $0xf0, (AX)
8728	MOVB DL, 1(AX)
8729	ADDQ $0x02, AX
8730	CMPL DX, $0x40
8731	JL   memmove_emit_remainder_encodeBetterBlockAsm8B
8732	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm8B
8733
8734one_byte_emit_remainder_encodeBetterBlockAsm8B:
8735	SHLB $0x02, DL
8736	MOVB DL, (AX)
8737	ADDQ $0x01, AX
8738
8739memmove_emit_remainder_encodeBetterBlockAsm8B:
8740	LEAQ (AX)(SI*1), DX
8741	MOVL SI, BX
8742
8743	// genMemMoveShort
8744	CMPQ BX, $0x04
8745	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4
8746	CMPQ BX, $0x08
8747	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
8748	CMPQ BX, $0x10
8749	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16
8750	CMPQ BX, $0x20
8751	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
8752	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64
8753
8754emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4:
8755	MOVL (CX), SI
8756	MOVL SI, (AX)
8757	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
8758
8759emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7:
8760	MOVL (CX), SI
8761	MOVL -4(CX)(BX*1), CX
8762	MOVL SI, (AX)
8763	MOVL CX, -4(AX)(BX*1)
8764	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
8765
8766emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16:
8767	MOVQ (CX), SI
8768	MOVQ -8(CX)(BX*1), CX
8769	MOVQ SI, (AX)
8770	MOVQ CX, -8(AX)(BX*1)
8771	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
8772
8773emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32:
8774	MOVOU (CX), X0
8775	MOVOU -16(CX)(BX*1), X1
8776	MOVOU X0, (AX)
8777	MOVOU X1, -16(AX)(BX*1)
8778	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
8779
8780emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64:
8781	MOVOU (CX), X0
8782	MOVOU 16(CX), X1
8783	MOVOU -32(CX)(BX*1), X2
8784	MOVOU -16(CX)(BX*1), X3
8785	MOVOU X0, (AX)
8786	MOVOU X1, 16(AX)
8787	MOVOU X2, -32(AX)(BX*1)
8788	MOVOU X3, -16(AX)(BX*1)
8789
8790memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B:
8791	MOVQ DX, AX
8792	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
8793
8794memmove_long_emit_remainder_encodeBetterBlockAsm8B:
8795	LEAQ (AX)(SI*1), DX
8796	MOVL SI, BX
8797
8798	// genMemMoveLong
8799	MOVOU (CX), X0
8800	MOVOU 16(CX), X1
8801	MOVOU -32(CX)(BX*1), X2
8802	MOVOU -16(CX)(BX*1), X3
8803	MOVQ  BX, DI
8804	SHRQ  $0x05, DI
8805	MOVQ  AX, SI
8806	ANDL  $0x0000001f, SI
8807	MOVQ  $0x00000040, R8
8808	SUBQ  SI, R8
8809	DECQ  DI
8810	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
8811	LEAQ  -32(CX)(R8*1), SI
8812	LEAQ  -32(AX)(R8*1), R9
8813
8814emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back:
8815	MOVOU (SI), X4
8816	MOVOU 16(SI), X5
8817	MOVOA X4, (R9)
8818	MOVOA X5, 16(R9)
8819	ADDQ  $0x20, R9
8820	ADDQ  $0x20, SI
8821	ADDQ  $0x20, R8
8822	DECQ  DI
8823	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back
8824
8825emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
8826	MOVOU -32(CX)(R8*1), X4
8827	MOVOU -16(CX)(R8*1), X5
8828	MOVOA X4, -32(AX)(R8*1)
8829	MOVOA X5, -16(AX)(R8*1)
8830	ADDQ  $0x20, R8
8831	CMPQ  BX, R8
8832	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
8833	MOVOU X0, (AX)
8834	MOVOU X1, 16(AX)
8835	MOVOU X2, -32(AX)(BX*1)
8836	MOVOU X3, -16(AX)(BX*1)
8837	MOVQ  DX, AX
8838
8839emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
8840	MOVQ dst_base+0(FP), CX
8841	SUBQ CX, AX
8842	MOVQ AX, ret+48(FP)
8843	RET
8844
// func encodeSnappyBlockAsm(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst as a sequence of literal and copy operations and
// returns the number of bytes written to dst. Returns 0 when the output
// would reach the limit computed on entry
// (dst_base + src_len - 9 - src_len>>5).
//
// Register roles that hold for the whole function:
//   AX = dst write pointer
//   CX = current src offset (s)
//   DX = src base pointer (reused as scratch only in the remainder path)
//
// Frame layout:
//   0(SP)   dst limit pointer
//   8(SP)   src_len - 8; searching/matching stops once s reaches this offset
//   12(SP)  offset of the first src byte not yet emitted (nextEmit)
//   16(SP)  distance of the most recent match (repeat offset), initially 1
//   20(SP)  src offset to continue from when no candidate matches
//   24(SP)  hash table: 16384 4-byte src offsets (64 KiB), zeroed on entry
TEXT ·encodeSnappyBlockAsm(SB), $65560-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear the hash table: 0x200 iterations of 128 bytes each.
zero_loop_encodeSnappyBlockAsm:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBlockAsm

	// Initialise nextEmit, the search limit, the dst limit and the repeat offset.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBlockAsm:
	// Next position = s + 4 + (s - nextEmit)>>6; stop searching at the limit.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBlockAsm
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)

	// Hash the 6 bytes at s and at s+1 (multiply by R9, keep the top 14 bits),
	// fetch the previous table entries as candidates and store s / s+1.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R11
	IMULQ R9, R11
	SHRQ  $0x32, R11
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)

	// R10 = hash of the 6 bytes at s+2, used by the third candidate check.
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10

	// Repeat check: compare 4 bytes at s+1 with 4 bytes at s+1-repeat.
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeSnappyBlockAsm
	LEAL  1(CX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm

	// Extend the repeat match backwards while bytes agree, stopping at
	// nextEmit or when the candidate offset reaches 0.
repeat_extend_back_loop_encodeSnappyBlockAsm:
	CMPL DI, SI
	JLE  repeat_extend_back_end_encodeSnappyBlockAsm
	MOVB -1(DX)(R8*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm

	// Emit src[nextEmit:DI] as a literal. R8 = length, SI = length-1 selects
	// the width of the literal header.
repeat_extend_back_end_encodeSnappyBlockAsm:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm
	CMPL SI, $0x00010000
	JLT  three_bytes_repeat_emit_encodeSnappyBlockAsm
	CMPL SI, $0x01000000
	JLT  four_bytes_repeat_emit_encodeSnappyBlockAsm
	MOVB $0xfc, (AX)
	MOVL SI, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

four_bytes_repeat_emit_encodeSnappyBlockAsm:
	MOVL SI, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

three_bytes_repeat_emit_encodeSnappyBlockAsm:
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

two_bytes_repeat_emit_encodeSnappyBlockAsm:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeSnappyBlockAsm
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

one_byte_repeat_emit_encodeSnappyBlockAsm:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

	// Short copy (up to 64 bytes). Lengths <= 8 always move 8 bytes; the
	// literal ends at or before s+1, which the search-loop check keeps more
	// than 8 bytes away from the end of src.
memmove_repeat_emit_encodeSnappyBlockAsm:
	LEAQ (AX)(R8*1), SI

	// genMemMoveShort
	CMPQ R8, $0x08
	JLE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8:
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm

	// Long copy: the first and last 32 bytes are loaded up front and stored
	// last; the middle is copied in 32-byte steps with aligned stores.
memmove_long_repeat_emit_encodeSnappyBlockAsm:
	LEAQ (AX)(R8*1), SI

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  SI, AX

	// Skip the 1+4 bytes already compared and extend the repeat match forward.
emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), SI

	// matchLen
	XORL R11, R11
	CMPL R8, $0x08
	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm

matchlen_loopback_repeat_extend_encodeSnappyBlockAsm:
	MOVQ  (R9)(R11*1), R10
	XORQ  (SI)(R11*1), R10
	TESTQ R10, R10
	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm
	BSFQ  R10, R10
	SARQ  $0x03, R10
	LEAL  (R11)(R10*1), R11
	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm

matchlen_loop_repeat_extend_encodeSnappyBlockAsm:
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	CMPL R8, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm

matchlen_single_repeat_extend_encodeSnappyBlockAsm:
	TESTL R8, R8
	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm

matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm
	LEAL 1(R11), R11
	DECL R8
	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm

	// SI = total match length (s - match start), DI = offset to encode.
repeat_extend_forward_end_encodeSnappyBlockAsm:
	ADDL R11, CX
	MOVL CX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
	CMPL DI, $0x00010000
	JL   two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm

four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
	CMPL SI, $0x40
	JLE  four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
	MOVB $0xff, (AX)
	MOVL DI, 1(AX)
	LEAL -64(SI), SI
	ADDQ $0x05, AX
	CMPL SI, $0x04
	JL   four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
	JMP  four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm

four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
	TESTL SI, SI
	JZ    repeat_end_emit_encodeSnappyBlockAsm
	MOVB  $0x03, BL
	LEAL  -4(BX)(SI*4), SI
	MOVB  SI, (AX)
	MOVL  DI, 1(AX)
	ADDQ  $0x05, AX
	JMP   repeat_end_emit_encodeSnappyBlockAsm

two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
	CMPL DI, $0x00000800
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeSnappyBlockAsm

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeSnappyBlockAsm:
	MOVL CX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm

	// No repeat: test the candidates for s, s+1 and s+2 in turn by comparing
	// 4 bytes; if none match, continue from the saved next position.
no_repeat_found_encodeSnappyBlockAsm:
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeSnappyBlockAsm
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeSnappyBlockAsm
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeSnappyBlockAsm
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBlockAsm

candidate3_match_encodeSnappyBlockAsm:
	ADDL $0x02, CX
	JMP  candidate_match_encodeSnappyBlockAsm

candidate2_match_encodeSnappyBlockAsm:
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

	// CX = match position, SI = candidate offset. Extend backwards while
	// bytes agree, stopping at nextEmit or when the candidate reaches 0.
candidate_match_encodeSnappyBlockAsm:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBlockAsm

match_extend_back_loop_encodeSnappyBlockAsm:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBlockAsm
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBlockAsm
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBlockAsm
	JMP  match_extend_back_loop_encodeSnappyBlockAsm

	// Give up (return 0) unless the pending literal plus 5 bytes stays
	// below the dst limit.
match_extend_back_end_encodeSnappyBlockAsm:
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 5(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

	// Emit src[nextEmit:s] as a literal. R9 = length, R8 = length-1.
match_dst_size_check_encodeSnappyBlockAsm:
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBlockAsm
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBlockAsm
	CMPL R8, $0x00010000
	JLT  three_bytes_match_emit_encodeSnappyBlockAsm
	CMPL R8, $0x01000000
	JLT  four_bytes_match_emit_encodeSnappyBlockAsm
	MOVB $0xfc, (AX)
	MOVL R8, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

four_bytes_match_emit_encodeSnappyBlockAsm:
	MOVL R8, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW R8, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

three_bytes_match_emit_encodeSnappyBlockAsm:
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

two_bytes_match_emit_encodeSnappyBlockAsm:
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeSnappyBlockAsm
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

one_byte_match_emit_encodeSnappyBlockAsm:
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

	// Short copy (up to 64 bytes). Lengths <= 8 always move 8 bytes; the
	// literal ends at or before s+2, which the search-loop check keeps more
	// than 8 bytes away from the end of src.
memmove_match_emit_encodeSnappyBlockAsm:
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm:
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm

memmove_long_match_emit_encodeSnappyBlockAsm:
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

	// Record the match distance as the new repeat offset, skip the 4 bytes
	// already known to match and extend the match forward.
emit_literal_done_match_emit_encodeSnappyBlockAsm:
match_nolit_loop_encodeSnappyBlockAsm:
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm

matchlen_loopback_match_nolit_encodeSnappyBlockAsm:
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeSnappyBlockAsm

matchlen_loop_match_nolit_encodeSnappyBlockAsm:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm

matchlen_single_match_nolit_encodeSnappyBlockAsm:
	TESTL DI, DI
	JZ    match_nolit_end_encodeSnappyBlockAsm

matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeSnappyBlockAsm
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm

	// R10 = total match length, SI = offset; nextEmit moves to the match end.
match_nolit_end_encodeSnappyBlockAsm:
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
	CMPL SI, $0x00010000
	JL   two_byte_offset_match_nolit_encodeSnappyBlockAsm

four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
	CMPL R10, $0x40
	JLE  four_bytes_remain_match_nolit_encodeSnappyBlockAsm
	MOVB $0xff, (AX)
	MOVL SI, 1(AX)
	LEAL -64(R10), R10
	ADDQ $0x05, AX
	CMPL R10, $0x04
	JL   four_bytes_remain_match_nolit_encodeSnappyBlockAsm
	JMP  four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm

four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
	TESTL R10, R10
	JZ    match_nolit_emitcopy_end_encodeSnappyBlockAsm
	MOVB  $0x03, BL
	LEAL  -4(BX)(R10*4), R10
	MOVB  R10, (AX)
	MOVL  SI, 1(AX)
	ADDQ  $0x05, AX
	JMP   match_nolit_emitcopy_end_encodeSnappyBlockAsm

two_byte_offset_match_nolit_encodeSnappyBlockAsm:
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm
	CMPL SI, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm

emit_copy_three_match_nolit_encodeSnappyBlockAsm:
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

	// Past the search limit: flush the tail. Otherwise check the dst limit
	// and look for another match starting immediately at s.
match_nolit_emitcopy_end_encodeSnappyBlockAsm:
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeSnappyBlockAsm
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

	// Store s-2 in the table, fetch the candidate for s and store s; if its
	// 4 bytes match, loop straight back into match extension.
match_nolit_dst_ok_encodeSnappyBlockAsm:
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x10, R8
	IMULQ R9, R8
	SHRQ  $0x32, R8
	SHLQ  $0x10, SI
	IMULQ R9, SI
	SHRQ  $0x32, SI
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm
	INCL  CX
	JMP   search_loop_encodeSnappyBlockAsm

	// Flush src[nextEmit:src_len] as a final literal, after checking that
	// the literal plus 5 bytes stays below the dst limit.
emit_remainder_encodeSnappyBlockAsm:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 5(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

	// CX = pointer to the first remaining byte, SI = length, DX = length-1.
emit_remainder_ok_encodeSnappyBlockAsm:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm
	CMPL DX, $0x00010000
	JLT  three_bytes_emit_remainder_encodeSnappyBlockAsm
	CMPL DX, $0x01000000
	JLT  four_bytes_emit_remainder_encodeSnappyBlockAsm
	MOVB $0xfc, (AX)
	MOVL DX, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

four_bytes_emit_remainder_encodeSnappyBlockAsm:
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

three_bytes_emit_remainder_encodeSnappyBlockAsm:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

two_bytes_emit_remainder_encodeSnappyBlockAsm:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeSnappyBlockAsm
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm

one_byte_emit_remainder_encodeSnappyBlockAsm:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

	// The remainder ends exactly at src_base+src_len, so unlike the in-loop
	// literal copies this one must not touch more than BX bytes: lengths
	// below 8 use exact-size moves instead of a fixed 8-byte load/store.
	// BX >= 1 here because the empty case branched away above.
memmove_emit_remainder_encodeSnappyBlockAsm:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64

	// First and last byte (the same byte when BX == 1).
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

	// Two overlapping 4-byte moves cover any length from 4 to 7.
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm

memmove_long_emit_remainder_encodeSnappyBlockAsm:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

	// Return the number of bytes written: AX - dst_base.
emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET
9661
9662// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
9663// Requires: SSE2
9664TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56
9665	MOVQ dst_base+0(FP), AX
9666	MOVQ $0x00000200, CX
9667	LEAQ 24(SP), DX
9668	PXOR X0, X0
9669
9670zero_loop_encodeSnappyBlockAsm64K:
9671	MOVOU X0, (DX)
9672	MOVOU X0, 16(DX)
9673	MOVOU X0, 32(DX)
9674	MOVOU X0, 48(DX)
9675	MOVOU X0, 64(DX)
9676	MOVOU X0, 80(DX)
9677	MOVOU X0, 96(DX)
9678	MOVOU X0, 112(DX)
9679	ADDQ  $0x80, DX
9680	DECQ  CX
9681	JNZ   zero_loop_encodeSnappyBlockAsm64K
9682	MOVL  $0x00000000, 12(SP)
9683	MOVQ  src_len+32(FP), CX
9684	LEAQ  -9(CX), DX
9685	LEAQ  -8(CX), SI
9686	MOVL  SI, 8(SP)
9687	SHRQ  $0x05, CX
9688	SUBL  CX, DX
9689	LEAQ  (AX)(DX*1), DX
9690	MOVQ  DX, (SP)
9691	MOVL  $0x00000001, CX
9692	MOVL  CX, 16(SP)
9693	MOVQ  src_base+24(FP), DX
9694
9695search_loop_encodeSnappyBlockAsm64K:
9696	MOVL  CX, SI
9697	SUBL  12(SP), SI
9698	SHRL  $0x06, SI
9699	LEAL  4(CX)(SI*1), SI
9700	CMPL  SI, 8(SP)
9701	JGE   emit_remainder_encodeSnappyBlockAsm64K
9702	MOVQ  (DX)(CX*1), DI
9703	MOVL  SI, 20(SP)
9704	MOVQ  $0x0000cf1bbcdcbf9b, R9
9705	MOVQ  DI, R10
9706	MOVQ  DI, R11
9707	SHRQ  $0x08, R11
9708	SHLQ  $0x10, R10
9709	IMULQ R9, R10
9710	SHRQ  $0x32, R10
9711	SHLQ  $0x10, R11
9712	IMULQ R9, R11
9713	SHRQ  $0x32, R11
9714	MOVL  24(SP)(R10*4), SI
9715	MOVL  24(SP)(R11*4), R8
9716	MOVL  CX, 24(SP)(R10*4)
9717	LEAL  1(CX), R10
9718	MOVL  R10, 24(SP)(R11*4)
9719	MOVQ  DI, R10
9720	SHRQ  $0x10, R10
9721	SHLQ  $0x10, R10
9722	IMULQ R9, R10
9723	SHRQ  $0x32, R10
9724	MOVL  CX, R9
9725	SUBL  16(SP), R9
9726	MOVL  1(DX)(R9*1), R11
9727	MOVQ  DI, R9
9728	SHRQ  $0x08, R9
9729	CMPL  R9, R11
9730	JNE   no_repeat_found_encodeSnappyBlockAsm64K
9731	LEAL  1(CX), DI
9732	MOVL  12(SP), SI
9733	MOVL  DI, R8
9734	SUBL  16(SP), R8
9735	JZ    repeat_extend_back_end_encodeSnappyBlockAsm64K
9736
9737repeat_extend_back_loop_encodeSnappyBlockAsm64K:
9738	CMPL DI, SI
9739	JLE  repeat_extend_back_end_encodeSnappyBlockAsm64K
9740	MOVB -1(DX)(R8*1), BL
9741	MOVB -1(DX)(DI*1), R9
9742	CMPB BL, R9
9743	JNE  repeat_extend_back_end_encodeSnappyBlockAsm64K
9744	LEAL -1(DI), DI
9745	DECL R8
9746	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm64K
9747
9748repeat_extend_back_end_encodeSnappyBlockAsm64K:
9749	MOVL 12(SP), SI
9750	CMPL SI, DI
9751	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
9752	MOVL DI, R8
9753	MOVL DI, 12(SP)
9754	LEAQ (DX)(SI*1), R9
9755	SUBL SI, R8
9756	LEAL -1(R8), SI
9757	CMPL SI, $0x3c
9758	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm64K
9759	CMPL SI, $0x00000100
9760	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm64K
9761	MOVB $0xf4, (AX)
9762	MOVW SI, 1(AX)
9763	ADDQ $0x03, AX
9764	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm64K
9765
9766two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
9767	MOVB $0xf0, (AX)
9768	MOVB SI, 1(AX)
9769	ADDQ $0x02, AX
9770	CMPL SI, $0x40
9771	JL   memmove_repeat_emit_encodeSnappyBlockAsm64K
9772	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm64K
9773
9774one_byte_repeat_emit_encodeSnappyBlockAsm64K:
9775	SHLB $0x02, SI
9776	MOVB SI, (AX)
9777	ADDQ $0x01, AX
9778
9779memmove_repeat_emit_encodeSnappyBlockAsm64K:
9780	LEAQ (AX)(R8*1), SI
9781
9782	// genMemMoveShort
9783	CMPQ R8, $0x08
9784	JLE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
9785	CMPQ R8, $0x10
9786	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
9787	CMPQ R8, $0x20
9788	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
9789	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
9790
9791emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
9792	MOVQ (R9), R10
9793	MOVQ R10, (AX)
9794	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
9795
9796emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
9797	MOVQ (R9), R10
9798	MOVQ -8(R9)(R8*1), R9
9799	MOVQ R10, (AX)
9800	MOVQ R9, -8(AX)(R8*1)
9801	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
9802
9803emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
9804	MOVOU (R9), X0
9805	MOVOU -16(R9)(R8*1), X1
9806	MOVOU X0, (AX)
9807	MOVOU X1, -16(AX)(R8*1)
9808	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
9809
9810emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
9811	MOVOU (R9), X0
9812	MOVOU 16(R9), X1
9813	MOVOU -32(R9)(R8*1), X2
9814	MOVOU -16(R9)(R8*1), X3
9815	MOVOU X0, (AX)
9816	MOVOU X1, 16(AX)
9817	MOVOU X2, -32(AX)(R8*1)
9818	MOVOU X3, -16(AX)(R8*1)
9819
9820memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
9821	MOVQ SI, AX
9822	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
9823
9824memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
9825	LEAQ (AX)(R8*1), SI
9826
9827	// genMemMoveLong
9828	MOVOU (R9), X0
9829	MOVOU 16(R9), X1
9830	MOVOU -32(R9)(R8*1), X2
9831	MOVOU -16(R9)(R8*1), X3
9832	MOVQ  R8, R11
9833	SHRQ  $0x05, R11
9834	MOVQ  AX, R10
9835	ANDL  $0x0000001f, R10
9836	MOVQ  $0x00000040, R12
9837	SUBQ  R10, R12
9838	DECQ  R11
9839	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
9840	LEAQ  -32(R9)(R12*1), R10
9841	LEAQ  -32(AX)(R12*1), R13
9842
9843emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
9844	MOVOU (R10), X4
9845	MOVOU 16(R10), X5
9846	MOVOA X4, (R13)
9847	MOVOA X5, 16(R13)
9848	ADDQ  $0x20, R13
9849	ADDQ  $0x20, R10
9850	ADDQ  $0x20, R12
9851	DECQ  R11
9852	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
9853
9854emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
9855	MOVOU -32(R9)(R12*1), X4
9856	MOVOU -16(R9)(R12*1), X5
9857	MOVOA X4, -32(AX)(R12*1)
9858	MOVOA X5, -16(AX)(R12*1)
9859	ADDQ  $0x20, R12
9860	CMPQ  R8, R12
9861	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
9862	MOVOU X0, (AX)
9863	MOVOU X1, 16(AX)
9864	MOVOU X2, -32(AX)(R8*1)
9865	MOVOU X3, -16(AX)(R8*1)
9866	MOVQ  SI, AX
9867
9868emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
9869	ADDL $0x05, CX
9870	MOVL CX, SI
9871	SUBL 16(SP), SI
9872	MOVQ src_len+32(FP), R8
9873	SUBL CX, R8
9874	LEAQ (DX)(CX*1), R9
9875	LEAQ (DX)(SI*1), SI
9876
9877	// matchLen
9878	XORL R11, R11
9879	CMPL R8, $0x08
9880	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm64K
9881
9882matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K:
9883	MOVQ  (R9)(R11*1), R10
9884	XORQ  (SI)(R11*1), R10
9885	TESTQ R10, R10
9886	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K
9887	BSFQ  R10, R10
9888	SARQ  $0x03, R10
9889	LEAL  (R11)(R10*1), R11
9890	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm64K
9891
9892matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K:
9893	LEAL -8(R8), R8
9894	LEAL 8(R11), R11
9895	CMPL R8, $0x08
9896	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K
9897
9898matchlen_single_repeat_extend_encodeSnappyBlockAsm64K:
9899	TESTL R8, R8
9900	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm64K
9901
9902matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K:
9903	MOVB (R9)(R11*1), R10
9904	CMPB (SI)(R11*1), R10
9905	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm64K
9906	LEAL 1(R11), R11
9907	DECL R8
9908	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm64K
9909
9910repeat_extend_forward_end_encodeSnappyBlockAsm64K:
9911	ADDL R11, CX
9912	MOVL CX, SI
9913	SUBL DI, SI
9914	MOVL 16(SP), DI
9915
9916	// emitCopy
9917two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
9918	CMPL SI, $0x40
9919	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
9920	MOVB $0xee, (AX)
9921	MOVW DI, 1(AX)
9922	LEAL -60(SI), SI
9923	ADDQ $0x03, AX
9924	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K
9925
9926two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
9927	CMPL SI, $0x0c
9928	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
9929	CMPL DI, $0x00000800
9930	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
9931	MOVB $0x01, BL
9932	LEAL -16(BX)(SI*4), SI
9933	MOVB DI, 1(AX)
9934	SHRL $0x08, DI
9935	SHLL $0x05, DI
9936	ORL  DI, SI
9937	MOVB SI, (AX)
9938	ADDQ $0x02, AX
9939	JMP  repeat_end_emit_encodeSnappyBlockAsm64K
9940
9941emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
9942	MOVB $0x02, BL
9943	LEAL -4(BX)(SI*4), SI
9944	MOVB SI, (AX)
9945	MOVW DI, 1(AX)
9946	ADDQ $0x03, AX
9947
9948repeat_end_emit_encodeSnappyBlockAsm64K:
9949	MOVL CX, 12(SP)
9950	JMP  search_loop_encodeSnappyBlockAsm64K
9951
9952no_repeat_found_encodeSnappyBlockAsm64K:
9953	CMPL (DX)(SI*1), DI
9954	JEQ  candidate_match_encodeSnappyBlockAsm64K
9955	SHRQ $0x08, DI
9956	MOVL 24(SP)(R10*4), SI
9957	LEAL 2(CX), R9
9958	CMPL (DX)(R8*1), DI
9959	JEQ  candidate2_match_encodeSnappyBlockAsm64K
9960	MOVL R9, 24(SP)(R10*4)
9961	SHRQ $0x08, DI
9962	CMPL (DX)(SI*1), DI
9963	JEQ  candidate3_match_encodeSnappyBlockAsm64K
9964	MOVL 20(SP), CX
9965	JMP  search_loop_encodeSnappyBlockAsm64K
9966
9967candidate3_match_encodeSnappyBlockAsm64K:
9968	ADDL $0x02, CX
9969	JMP  candidate_match_encodeSnappyBlockAsm64K
9970
9971candidate2_match_encodeSnappyBlockAsm64K:
9972	MOVL R9, 24(SP)(R10*4)
9973	INCL CX
9974	MOVL R8, SI
9975
9976candidate_match_encodeSnappyBlockAsm64K:
9977	MOVL  12(SP), DI
9978	TESTL SI, SI
9979	JZ    match_extend_back_end_encodeSnappyBlockAsm64K
9980
9981match_extend_back_loop_encodeSnappyBlockAsm64K:
9982	CMPL CX, DI
9983	JLE  match_extend_back_end_encodeSnappyBlockAsm64K
9984	MOVB -1(DX)(SI*1), BL
9985	MOVB -1(DX)(CX*1), R8
9986	CMPB BL, R8
9987	JNE  match_extend_back_end_encodeSnappyBlockAsm64K
9988	LEAL -1(CX), CX
9989	DECL SI
9990	JZ   match_extend_back_end_encodeSnappyBlockAsm64K
9991	JMP  match_extend_back_loop_encodeSnappyBlockAsm64K
9992
9993match_extend_back_end_encodeSnappyBlockAsm64K:
9994	MOVL CX, DI
9995	SUBL 12(SP), DI
9996	LEAQ 3(AX)(DI*1), DI
9997	CMPQ DI, (SP)
9998	JL   match_dst_size_check_encodeSnappyBlockAsm64K
9999	MOVQ $0x00000000, ret+48(FP)
10000	RET
10001
10002match_dst_size_check_encodeSnappyBlockAsm64K:
10003	MOVL CX, DI
10004	MOVL 12(SP), R8
10005	CMPL R8, DI
10006	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm64K
10007	MOVL DI, R9
10008	MOVL DI, 12(SP)
10009	LEAQ (DX)(R8*1), DI
10010	SUBL R8, R9
10011	LEAL -1(R9), R8
10012	CMPL R8, $0x3c
10013	JLT  one_byte_match_emit_encodeSnappyBlockAsm64K
10014	CMPL R8, $0x00000100
10015	JLT  two_bytes_match_emit_encodeSnappyBlockAsm64K
10016	MOVB $0xf4, (AX)
10017	MOVW R8, 1(AX)
10018	ADDQ $0x03, AX
10019	JMP  memmove_long_match_emit_encodeSnappyBlockAsm64K
10020
10021two_bytes_match_emit_encodeSnappyBlockAsm64K:
10022	MOVB $0xf0, (AX)
10023	MOVB R8, 1(AX)
10024	ADDQ $0x02, AX
10025	CMPL R8, $0x40
10026	JL   memmove_match_emit_encodeSnappyBlockAsm64K
10027	JMP  memmove_long_match_emit_encodeSnappyBlockAsm64K
10028
10029one_byte_match_emit_encodeSnappyBlockAsm64K:
10030	SHLB $0x02, R8
10031	MOVB R8, (AX)
10032	ADDQ $0x01, AX
10033
10034memmove_match_emit_encodeSnappyBlockAsm64K:
10035	LEAQ (AX)(R9*1), R8
10036
10037	// genMemMoveShort
10038	CMPQ R9, $0x08
10039	JLE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
10040	CMPQ R9, $0x10
10041	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
10042	CMPQ R9, $0x20
10043	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
10044	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
10045
10046emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
10047	MOVQ (DI), R10
10048	MOVQ R10, (AX)
10049	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
10050
10051emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
10052	MOVQ (DI), R10
10053	MOVQ -8(DI)(R9*1), DI
10054	MOVQ R10, (AX)
10055	MOVQ DI, -8(AX)(R9*1)
10056	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
10057
10058emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
10059	MOVOU (DI), X0
10060	MOVOU -16(DI)(R9*1), X1
10061	MOVOU X0, (AX)
10062	MOVOU X1, -16(AX)(R9*1)
10063	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
10064
10065emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
10066	MOVOU (DI), X0
10067	MOVOU 16(DI), X1
10068	MOVOU -32(DI)(R9*1), X2
10069	MOVOU -16(DI)(R9*1), X3
10070	MOVOU X0, (AX)
10071	MOVOU X1, 16(AX)
10072	MOVOU X2, -32(AX)(R9*1)
10073	MOVOU X3, -16(AX)(R9*1)
10074
10075memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
10076	MOVQ R8, AX
10077	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm64K
10078
10079memmove_long_match_emit_encodeSnappyBlockAsm64K:
10080	LEAQ (AX)(R9*1), R8
10081
10082	// genMemMoveLong
10083	MOVOU (DI), X0
10084	MOVOU 16(DI), X1
10085	MOVOU -32(DI)(R9*1), X2
10086	MOVOU -16(DI)(R9*1), X3
10087	MOVQ  R9, R11
10088	SHRQ  $0x05, R11
10089	MOVQ  AX, R10
10090	ANDL  $0x0000001f, R10
10091	MOVQ  $0x00000040, R12
10092	SUBQ  R10, R12
10093	DECQ  R11
10094	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
10095	LEAQ  -32(DI)(R12*1), R10
10096	LEAQ  -32(AX)(R12*1), R13
10097
10098emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
10099	MOVOU (R10), X4
10100	MOVOU 16(R10), X5
10101	MOVOA X4, (R13)
10102	MOVOA X5, 16(R13)
10103	ADDQ  $0x20, R13
10104	ADDQ  $0x20, R10
10105	ADDQ  $0x20, R12
10106	DECQ  R11
10107	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
10108
10109emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
10110	MOVOU -32(DI)(R12*1), X4
10111	MOVOU -16(DI)(R12*1), X5
10112	MOVOA X4, -32(AX)(R12*1)
10113	MOVOA X5, -16(AX)(R12*1)
10114	ADDQ  $0x20, R12
10115	CMPQ  R9, R12
10116	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
10117	MOVOU X0, (AX)
10118	MOVOU X1, 16(AX)
10119	MOVOU X2, -32(AX)(R9*1)
10120	MOVOU X3, -16(AX)(R9*1)
10121	MOVQ  R8, AX
10122
10123emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
10124match_nolit_loop_encodeSnappyBlockAsm64K:
10125	MOVL CX, DI
10126	SUBL SI, DI
10127	MOVL DI, 16(SP)
10128	ADDL $0x04, CX
10129	ADDL $0x04, SI
10130	MOVQ src_len+32(FP), DI
10131	SUBL CX, DI
10132	LEAQ (DX)(CX*1), R8
10133	LEAQ (DX)(SI*1), SI
10134
10135	// matchLen
10136	XORL R10, R10
10137	CMPL DI, $0x08
10138	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm64K
10139
10140matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K:
10141	MOVQ  (R8)(R10*1), R9
10142	XORQ  (SI)(R10*1), R9
10143	TESTQ R9, R9
10144	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm64K
10145	BSFQ  R9, R9
10146	SARQ  $0x03, R9
10147	LEAL  (R10)(R9*1), R10
10148	JMP   match_nolit_end_encodeSnappyBlockAsm64K
10149
10150matchlen_loop_match_nolit_encodeSnappyBlockAsm64K:
10151	LEAL -8(DI), DI
10152	LEAL 8(R10), R10
10153	CMPL DI, $0x08
10154	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K
10155
10156matchlen_single_match_nolit_encodeSnappyBlockAsm64K:
10157	TESTL DI, DI
10158	JZ    match_nolit_end_encodeSnappyBlockAsm64K
10159
10160matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K:
10161	MOVB (R8)(R10*1), R9
10162	CMPB (SI)(R10*1), R9
10163	JNE  match_nolit_end_encodeSnappyBlockAsm64K
10164	LEAL 1(R10), R10
10165	DECL DI
10166	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm64K
10167
10168match_nolit_end_encodeSnappyBlockAsm64K:
10169	ADDL R10, CX
10170	MOVL 16(SP), SI
10171	ADDL $0x04, R10
10172	MOVL CX, 12(SP)
10173
10174	// emitCopy
10175two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
10176	CMPL R10, $0x40
10177	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
10178	MOVB $0xee, (AX)
10179	MOVW SI, 1(AX)
10180	LEAL -60(R10), R10
10181	ADDQ $0x03, AX
10182	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm64K
10183
10184two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
10185	CMPL R10, $0x0c
10186	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
10187	CMPL SI, $0x00000800
10188	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
10189	MOVB $0x01, BL
10190	LEAL -16(BX)(R10*4), R10
10191	MOVB SI, 1(AX)
10192	SHRL $0x08, SI
10193	SHLL $0x05, SI
10194	ORL  SI, R10
10195	MOVB R10, (AX)
10196	ADDQ $0x02, AX
10197	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm64K
10198
10199emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
10200	MOVB $0x02, BL
10201	LEAL -4(BX)(R10*4), R10
10202	MOVB R10, (AX)
10203	MOVW SI, 1(AX)
10204	ADDQ $0x03, AX
10205
10206match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
10207	CMPL CX, 8(SP)
10208	JGE  emit_remainder_encodeSnappyBlockAsm64K
10209	MOVQ -2(DX)(CX*1), DI
10210	CMPQ AX, (SP)
10211	JL   match_nolit_dst_ok_encodeSnappyBlockAsm64K
10212	MOVQ $0x00000000, ret+48(FP)
10213	RET
10214
10215match_nolit_dst_ok_encodeSnappyBlockAsm64K:
10216	MOVQ  $0x0000cf1bbcdcbf9b, R9
10217	MOVQ  DI, R8
10218	SHRQ  $0x10, DI
10219	MOVQ  DI, SI
10220	SHLQ  $0x10, R8
10221	IMULQ R9, R8
10222	SHRQ  $0x32, R8
10223	SHLQ  $0x10, SI
10224	IMULQ R9, SI
10225	SHRQ  $0x32, SI
10226	LEAL  -2(CX), R9
10227	LEAQ  24(SP)(SI*4), R10
10228	MOVL  (R10), SI
10229	MOVL  R9, 24(SP)(R8*4)
10230	MOVL  CX, (R10)
10231	CMPL  (DX)(SI*1), DI
10232	JEQ   match_nolit_loop_encodeSnappyBlockAsm64K
10233	INCL  CX
10234	JMP   search_loop_encodeSnappyBlockAsm64K
10235
10236emit_remainder_encodeSnappyBlockAsm64K:
10237	MOVQ src_len+32(FP), CX
10238	SUBL 12(SP), CX
10239	LEAQ 3(AX)(CX*1), CX
10240	CMPQ CX, (SP)
10241	JL   emit_remainder_ok_encodeSnappyBlockAsm64K
10242	MOVQ $0x00000000, ret+48(FP)
10243	RET
10244
10245emit_remainder_ok_encodeSnappyBlockAsm64K:
10246	MOVQ src_len+32(FP), CX
10247	MOVL 12(SP), BX
10248	CMPL BX, CX
10249	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
10250	MOVL CX, SI
10251	MOVL CX, 12(SP)
10252	LEAQ (DX)(BX*1), CX
10253	SUBL BX, SI
10254	LEAL -1(SI), DX
10255	CMPL DX, $0x3c
10256	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm64K
10257	CMPL DX, $0x00000100
10258	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm64K
10259	MOVB $0xf4, (AX)
10260	MOVW DX, 1(AX)
10261	ADDQ $0x03, AX
10262	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm64K
10263
10264two_bytes_emit_remainder_encodeSnappyBlockAsm64K:
10265	MOVB $0xf0, (AX)
10266	MOVB DL, 1(AX)
10267	ADDQ $0x02, AX
10268	CMPL DX, $0x40
10269	JL   memmove_emit_remainder_encodeSnappyBlockAsm64K
10270	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm64K
10271
10272one_byte_emit_remainder_encodeSnappyBlockAsm64K:
10273	SHLB $0x02, DL
10274	MOVB DL, (AX)
10275	ADDQ $0x01, AX
10276
10277memmove_emit_remainder_encodeSnappyBlockAsm64K:
10278	LEAQ (AX)(SI*1), DX
10279	MOVL SI, BX
10280
10281	// genMemMoveShort
10282	CMPQ BX, $0x08
10283	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8
10284	CMPQ BX, $0x10
10285	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
10286	CMPQ BX, $0x20
10287	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
10288	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64
10289
10290emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8:
10291	MOVQ (CX), SI
10292	MOVQ SI, (AX)
10293	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
10294
10295emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
10296	MOVQ (CX), SI
10297	MOVQ -8(CX)(BX*1), CX
10298	MOVQ SI, (AX)
10299	MOVQ CX, -8(AX)(BX*1)
10300	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
10301
10302emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32:
10303	MOVOU (CX), X0
10304	MOVOU -16(CX)(BX*1), X1
10305	MOVOU X0, (AX)
10306	MOVOU X1, -16(AX)(BX*1)
10307	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
10308
10309emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64:
10310	MOVOU (CX), X0
10311	MOVOU 16(CX), X1
10312	MOVOU -32(CX)(BX*1), X2
10313	MOVOU -16(CX)(BX*1), X3
10314	MOVOU X0, (AX)
10315	MOVOU X1, 16(AX)
10316	MOVOU X2, -32(AX)(BX*1)
10317	MOVOU X3, -16(AX)(BX*1)
10318
10319memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K:
10320	MOVQ DX, AX
10321	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
10322
10323memmove_long_emit_remainder_encodeSnappyBlockAsm64K:
10324	LEAQ (AX)(SI*1), DX
10325	MOVL SI, BX
10326
10327	// genMemMoveLong
10328	MOVOU (CX), X0
10329	MOVOU 16(CX), X1
10330	MOVOU -32(CX)(BX*1), X2
10331	MOVOU -16(CX)(BX*1), X3
10332	MOVQ  BX, DI
10333	SHRQ  $0x05, DI
10334	MOVQ  AX, SI
10335	ANDL  $0x0000001f, SI
10336	MOVQ  $0x00000040, R8
10337	SUBQ  SI, R8
10338	DECQ  DI
10339	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
10340	LEAQ  -32(CX)(R8*1), SI
10341	LEAQ  -32(AX)(R8*1), R9
10342
10343emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back:
10344	MOVOU (SI), X4
10345	MOVOU 16(SI), X5
10346	MOVOA X4, (R9)
10347	MOVOA X5, 16(R9)
10348	ADDQ  $0x20, R9
10349	ADDQ  $0x20, SI
10350	ADDQ  $0x20, R8
10351	DECQ  DI
10352	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back
10353
10354emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
10355	MOVOU -32(CX)(R8*1), X4
10356	MOVOU -16(CX)(R8*1), X5
10357	MOVOA X4, -32(AX)(R8*1)
10358	MOVOA X5, -16(AX)(R8*1)
10359	ADDQ  $0x20, R8
10360	CMPQ  BX, R8
10361	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
10362	MOVOU X0, (AX)
10363	MOVOU X1, 16(AX)
10364	MOVOU X2, -32(AX)(BX*1)
10365	MOVOU X3, -16(AX)(BX*1)
10366	MOVQ  DX, AX
10367
10368emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
10369	MOVQ dst_base+0(FP), CX
10370	SUBQ CX, AX
10371	MOVQ AX, ret+48(FP)
10372	RET
10373
10374// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
10375// Requires: SSE2
10376TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56
10377	MOVQ dst_base+0(FP), AX
10378	MOVQ $0x00000080, CX
10379	LEAQ 24(SP), DX
10380	PXOR X0, X0
10381
10382zero_loop_encodeSnappyBlockAsm12B:
10383	MOVOU X0, (DX)
10384	MOVOU X0, 16(DX)
10385	MOVOU X0, 32(DX)
10386	MOVOU X0, 48(DX)
10387	MOVOU X0, 64(DX)
10388	MOVOU X0, 80(DX)
10389	MOVOU X0, 96(DX)
10390	MOVOU X0, 112(DX)
10391	ADDQ  $0x80, DX
10392	DECQ  CX
10393	JNZ   zero_loop_encodeSnappyBlockAsm12B
10394	MOVL  $0x00000000, 12(SP)
10395	MOVQ  src_len+32(FP), CX
10396	LEAQ  -9(CX), DX
10397	LEAQ  -8(CX), SI
10398	MOVL  SI, 8(SP)
10399	SHRQ  $0x05, CX
10400	SUBL  CX, DX
10401	LEAQ  (AX)(DX*1), DX
10402	MOVQ  DX, (SP)
10403	MOVL  $0x00000001, CX
10404	MOVL  CX, 16(SP)
10405	MOVQ  src_base+24(FP), DX
10406
10407search_loop_encodeSnappyBlockAsm12B:
10408	MOVL  CX, SI
10409	SUBL  12(SP), SI
10410	SHRL  $0x05, SI
10411	LEAL  4(CX)(SI*1), SI
10412	CMPL  SI, 8(SP)
10413	JGE   emit_remainder_encodeSnappyBlockAsm12B
10414	MOVQ  (DX)(CX*1), DI
10415	MOVL  SI, 20(SP)
10416	MOVQ  $0x000000cf1bbcdcbb, R9
10417	MOVQ  DI, R10
10418	MOVQ  DI, R11
10419	SHRQ  $0x08, R11
10420	SHLQ  $0x18, R10
10421	IMULQ R9, R10
10422	SHRQ  $0x34, R10
10423	SHLQ  $0x18, R11
10424	IMULQ R9, R11
10425	SHRQ  $0x34, R11
10426	MOVL  24(SP)(R10*4), SI
10427	MOVL  24(SP)(R11*4), R8
10428	MOVL  CX, 24(SP)(R10*4)
10429	LEAL  1(CX), R10
10430	MOVL  R10, 24(SP)(R11*4)
10431	MOVQ  DI, R10
10432	SHRQ  $0x10, R10
10433	SHLQ  $0x18, R10
10434	IMULQ R9, R10
10435	SHRQ  $0x34, R10
10436	MOVL  CX, R9
10437	SUBL  16(SP), R9
10438	MOVL  1(DX)(R9*1), R11
10439	MOVQ  DI, R9
10440	SHRQ  $0x08, R9
10441	CMPL  R9, R11
10442	JNE   no_repeat_found_encodeSnappyBlockAsm12B
10443	LEAL  1(CX), DI
10444	MOVL  12(SP), SI
10445	MOVL  DI, R8
10446	SUBL  16(SP), R8
10447	JZ    repeat_extend_back_end_encodeSnappyBlockAsm12B
10448
10449repeat_extend_back_loop_encodeSnappyBlockAsm12B:
10450	CMPL DI, SI
10451	JLE  repeat_extend_back_end_encodeSnappyBlockAsm12B
10452	MOVB -1(DX)(R8*1), BL
10453	MOVB -1(DX)(DI*1), R9
10454	CMPB BL, R9
10455	JNE  repeat_extend_back_end_encodeSnappyBlockAsm12B
10456	LEAL -1(DI), DI
10457	DECL R8
10458	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm12B
10459
10460repeat_extend_back_end_encodeSnappyBlockAsm12B:
10461	MOVL 12(SP), SI
10462	CMPL SI, DI
10463	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
10464	MOVL DI, R8
10465	MOVL DI, 12(SP)
10466	LEAQ (DX)(SI*1), R9
10467	SUBL SI, R8
10468	LEAL -1(R8), SI
10469	CMPL SI, $0x3c
10470	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm12B
10471	CMPL SI, $0x00000100
10472	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm12B
10473	MOVB $0xf4, (AX)
10474	MOVW SI, 1(AX)
10475	ADDQ $0x03, AX
10476	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm12B
10477
10478two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
10479	MOVB $0xf0, (AX)
10480	MOVB SI, 1(AX)
10481	ADDQ $0x02, AX
10482	CMPL SI, $0x40
10483	JL   memmove_repeat_emit_encodeSnappyBlockAsm12B
10484	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm12B
10485
10486one_byte_repeat_emit_encodeSnappyBlockAsm12B:
10487	SHLB $0x02, SI
10488	MOVB SI, (AX)
10489	ADDQ $0x01, AX
10490
10491memmove_repeat_emit_encodeSnappyBlockAsm12B:
10492	LEAQ (AX)(R8*1), SI
10493
10494	// genMemMoveShort
10495	CMPQ R8, $0x08
10496	JLE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
10497	CMPQ R8, $0x10
10498	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
10499	CMPQ R8, $0x20
10500	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
10501	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
10502
10503emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8:
10504	MOVQ (R9), R10
10505	MOVQ R10, (AX)
10506	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
10507
10508emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
10509	MOVQ (R9), R10
10510	MOVQ -8(R9)(R8*1), R9
10511	MOVQ R10, (AX)
10512	MOVQ R9, -8(AX)(R8*1)
10513	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
10514
10515emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
10516	MOVOU (R9), X0
10517	MOVOU -16(R9)(R8*1), X1
10518	MOVOU X0, (AX)
10519	MOVOU X1, -16(AX)(R8*1)
10520	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
10521
10522emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
10523	MOVOU (R9), X0
10524	MOVOU 16(R9), X1
10525	MOVOU -32(R9)(R8*1), X2
10526	MOVOU -16(R9)(R8*1), X3
10527	MOVOU X0, (AX)
10528	MOVOU X1, 16(AX)
10529	MOVOU X2, -32(AX)(R8*1)
10530	MOVOU X3, -16(AX)(R8*1)
10531
10532memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
10533	MOVQ SI, AX
10534	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
10535
10536memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
10537	LEAQ (AX)(R8*1), SI
10538
10539	// genMemMoveLong
10540	MOVOU (R9), X0
10541	MOVOU 16(R9), X1
10542	MOVOU -32(R9)(R8*1), X2
10543	MOVOU -16(R9)(R8*1), X3
10544	MOVQ  R8, R11
10545	SHRQ  $0x05, R11
10546	MOVQ  AX, R10
10547	ANDL  $0x0000001f, R10
10548	MOVQ  $0x00000040, R12
10549	SUBQ  R10, R12
10550	DECQ  R11
10551	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
10552	LEAQ  -32(R9)(R12*1), R10
10553	LEAQ  -32(AX)(R12*1), R13
10554
10555emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
10556	MOVOU (R10), X4
10557	MOVOU 16(R10), X5
10558	MOVOA X4, (R13)
10559	MOVOA X5, 16(R13)
10560	ADDQ  $0x20, R13
10561	ADDQ  $0x20, R10
10562	ADDQ  $0x20, R12
10563	DECQ  R11
10564	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
10565
10566emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
10567	MOVOU -32(R9)(R12*1), X4
10568	MOVOU -16(R9)(R12*1), X5
10569	MOVOA X4, -32(AX)(R12*1)
10570	MOVOA X5, -16(AX)(R12*1)
10571	ADDQ  $0x20, R12
10572	CMPQ  R8, R12
10573	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
10574	MOVOU X0, (AX)
10575	MOVOU X1, 16(AX)
10576	MOVOU X2, -32(AX)(R8*1)
10577	MOVOU X3, -16(AX)(R8*1)
10578	MOVQ  SI, AX
10579
10580emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
10581	ADDL $0x05, CX
10582	MOVL CX, SI
10583	SUBL 16(SP), SI
10584	MOVQ src_len+32(FP), R8
10585	SUBL CX, R8
10586	LEAQ (DX)(CX*1), R9
10587	LEAQ (DX)(SI*1), SI
10588
10589	// matchLen
10590	XORL R11, R11
10591	CMPL R8, $0x08
10592	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm12B
10593
10594matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B:
10595	MOVQ  (R9)(R11*1), R10
10596	XORQ  (SI)(R11*1), R10
10597	TESTQ R10, R10
10598	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B
10599	BSFQ  R10, R10
10600	SARQ  $0x03, R10
10601	LEAL  (R11)(R10*1), R11
10602	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm12B
10603
10604matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B:
10605	LEAL -8(R8), R8
10606	LEAL 8(R11), R11
10607	CMPL R8, $0x08
10608	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B
10609
10610matchlen_single_repeat_extend_encodeSnappyBlockAsm12B:
10611	TESTL R8, R8
10612	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm12B
10613
10614matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B:
10615	MOVB (R9)(R11*1), R10
10616	CMPB (SI)(R11*1), R10
10617	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm12B
10618	LEAL 1(R11), R11
10619	DECL R8
10620	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B
10621
10622repeat_extend_forward_end_encodeSnappyBlockAsm12B:
10623	ADDL R11, CX
10624	MOVL CX, SI
10625	SUBL DI, SI
10626	MOVL 16(SP), DI
10627
10628	// emitCopy
10629two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
10630	CMPL SI, $0x40
10631	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
10632	MOVB $0xee, (AX)
10633	MOVW DI, 1(AX)
10634	LEAL -60(SI), SI
10635	ADDQ $0x03, AX
10636	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B
10637
10638two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
10639	CMPL SI, $0x0c
10640	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
10641	CMPL DI, $0x00000800
10642	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
10643	MOVB $0x01, BL
10644	LEAL -16(BX)(SI*4), SI
10645	MOVB DI, 1(AX)
10646	SHRL $0x08, DI
10647	SHLL $0x05, DI
10648	ORL  DI, SI
10649	MOVB SI, (AX)
10650	ADDQ $0x02, AX
10651	JMP  repeat_end_emit_encodeSnappyBlockAsm12B
10652
10653emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
10654	MOVB $0x02, BL
10655	LEAL -4(BX)(SI*4), SI
10656	MOVB SI, (AX)
10657	MOVW DI, 1(AX)
10658	ADDQ $0x03, AX
10659
10660repeat_end_emit_encodeSnappyBlockAsm12B:
10661	MOVL CX, 12(SP)
10662	JMP  search_loop_encodeSnappyBlockAsm12B
10663
10664no_repeat_found_encodeSnappyBlockAsm12B:
10665	CMPL (DX)(SI*1), DI
10666	JEQ  candidate_match_encodeSnappyBlockAsm12B
10667	SHRQ $0x08, DI
10668	MOVL 24(SP)(R10*4), SI
10669	LEAL 2(CX), R9
10670	CMPL (DX)(R8*1), DI
10671	JEQ  candidate2_match_encodeSnappyBlockAsm12B
10672	MOVL R9, 24(SP)(R10*4)
10673	SHRQ $0x08, DI
10674	CMPL (DX)(SI*1), DI
10675	JEQ  candidate3_match_encodeSnappyBlockAsm12B
10676	MOVL 20(SP), CX
10677	JMP  search_loop_encodeSnappyBlockAsm12B
10678
10679candidate3_match_encodeSnappyBlockAsm12B:
10680	ADDL $0x02, CX
10681	JMP  candidate_match_encodeSnappyBlockAsm12B
10682
10683candidate2_match_encodeSnappyBlockAsm12B:
10684	MOVL R9, 24(SP)(R10*4)
10685	INCL CX
10686	MOVL R8, SI
10687
10688candidate_match_encodeSnappyBlockAsm12B:
10689	MOVL  12(SP), DI
10690	TESTL SI, SI
10691	JZ    match_extend_back_end_encodeSnappyBlockAsm12B
10692
10693match_extend_back_loop_encodeSnappyBlockAsm12B:
10694	CMPL CX, DI
10695	JLE  match_extend_back_end_encodeSnappyBlockAsm12B
10696	MOVB -1(DX)(SI*1), BL
10697	MOVB -1(DX)(CX*1), R8
10698	CMPB BL, R8
10699	JNE  match_extend_back_end_encodeSnappyBlockAsm12B
10700	LEAL -1(CX), CX
10701	DECL SI
10702	JZ   match_extend_back_end_encodeSnappyBlockAsm12B
10703	JMP  match_extend_back_loop_encodeSnappyBlockAsm12B
10704
10705match_extend_back_end_encodeSnappyBlockAsm12B:
10706	MOVL CX, DI
10707	SUBL 12(SP), DI
10708	LEAQ 3(AX)(DI*1), DI
10709	CMPQ DI, (SP)
10710	JL   match_dst_size_check_encodeSnappyBlockAsm12B
10711	MOVQ $0x00000000, ret+48(FP)
10712	RET
10713
10714match_dst_size_check_encodeSnappyBlockAsm12B:
10715	MOVL CX, DI
10716	MOVL 12(SP), R8
10717	CMPL R8, DI
10718	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm12B
10719	MOVL DI, R9
10720	MOVL DI, 12(SP)
10721	LEAQ (DX)(R8*1), DI
10722	SUBL R8, R9
10723	LEAL -1(R9), R8
10724	CMPL R8, $0x3c
10725	JLT  one_byte_match_emit_encodeSnappyBlockAsm12B
10726	CMPL R8, $0x00000100
10727	JLT  two_bytes_match_emit_encodeSnappyBlockAsm12B
10728	MOVB $0xf4, (AX)
10729	MOVW R8, 1(AX)
10730	ADDQ $0x03, AX
10731	JMP  memmove_long_match_emit_encodeSnappyBlockAsm12B
10732
10733two_bytes_match_emit_encodeSnappyBlockAsm12B:
10734	MOVB $0xf0, (AX)
10735	MOVB R8, 1(AX)
10736	ADDQ $0x02, AX
10737	CMPL R8, $0x40
10738	JL   memmove_match_emit_encodeSnappyBlockAsm12B
10739	JMP  memmove_long_match_emit_encodeSnappyBlockAsm12B
10740
10741one_byte_match_emit_encodeSnappyBlockAsm12B:
10742	SHLB $0x02, R8
10743	MOVB R8, (AX)
10744	ADDQ $0x01, AX
10745
10746memmove_match_emit_encodeSnappyBlockAsm12B:
10747	LEAQ (AX)(R9*1), R8
10748
10749	// genMemMoveShort
10750	CMPQ R9, $0x08
10751	JLE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
10752	CMPQ R9, $0x10
10753	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
10754	CMPQ R9, $0x20
10755	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
10756	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
10757
10758emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8:
10759	MOVQ (DI), R10
10760	MOVQ R10, (AX)
10761	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
10762
10763emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
10764	MOVQ (DI), R10
10765	MOVQ -8(DI)(R9*1), DI
10766	MOVQ R10, (AX)
10767	MOVQ DI, -8(AX)(R9*1)
10768	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
10769
10770emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
10771	MOVOU (DI), X0
10772	MOVOU -16(DI)(R9*1), X1
10773	MOVOU X0, (AX)
10774	MOVOU X1, -16(AX)(R9*1)
10775	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
10776
10777emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
10778	MOVOU (DI), X0
10779	MOVOU 16(DI), X1
10780	MOVOU -32(DI)(R9*1), X2
10781	MOVOU -16(DI)(R9*1), X3
10782	MOVOU X0, (AX)
10783	MOVOU X1, 16(AX)
10784	MOVOU X2, -32(AX)(R9*1)
10785	MOVOU X3, -16(AX)(R9*1)
10786
10787memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
10788	MOVQ R8, AX
10789	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm12B
10790
10791memmove_long_match_emit_encodeSnappyBlockAsm12B:
10792	LEAQ (AX)(R9*1), R8
10793
10794	// genMemMoveLong
10795	MOVOU (DI), X0
10796	MOVOU 16(DI), X1
10797	MOVOU -32(DI)(R9*1), X2
10798	MOVOU -16(DI)(R9*1), X3
10799	MOVQ  R9, R11
10800	SHRQ  $0x05, R11
10801	MOVQ  AX, R10
10802	ANDL  $0x0000001f, R10
10803	MOVQ  $0x00000040, R12
10804	SUBQ  R10, R12
10805	DECQ  R11
10806	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
10807	LEAQ  -32(DI)(R12*1), R10
10808	LEAQ  -32(AX)(R12*1), R13
10809
10810emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
10811	MOVOU (R10), X4
10812	MOVOU 16(R10), X5
10813	MOVOA X4, (R13)
10814	MOVOA X5, 16(R13)
10815	ADDQ  $0x20, R13
10816	ADDQ  $0x20, R10
10817	ADDQ  $0x20, R12
10818	DECQ  R11
10819	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
10820
10821emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
10822	MOVOU -32(DI)(R12*1), X4
10823	MOVOU -16(DI)(R12*1), X5
10824	MOVOA X4, -32(AX)(R12*1)
10825	MOVOA X5, -16(AX)(R12*1)
10826	ADDQ  $0x20, R12
10827	CMPQ  R9, R12
10828	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
10829	MOVOU X0, (AX)
10830	MOVOU X1, 16(AX)
10831	MOVOU X2, -32(AX)(R9*1)
10832	MOVOU X3, -16(AX)(R9*1)
10833	MOVQ  R8, AX
10834
10835emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
10836match_nolit_loop_encodeSnappyBlockAsm12B:
10837	MOVL CX, DI
10838	SUBL SI, DI
10839	MOVL DI, 16(SP)
10840	ADDL $0x04, CX
10841	ADDL $0x04, SI
10842	MOVQ src_len+32(FP), DI
10843	SUBL CX, DI
10844	LEAQ (DX)(CX*1), R8
10845	LEAQ (DX)(SI*1), SI
10846
10847	// matchLen
10848	XORL R10, R10
10849	CMPL DI, $0x08
10850	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm12B
10851
10852matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B:
10853	MOVQ  (R8)(R10*1), R9
10854	XORQ  (SI)(R10*1), R9
10855	TESTQ R9, R9
10856	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm12B
10857	BSFQ  R9, R9
10858	SARQ  $0x03, R9
10859	LEAL  (R10)(R9*1), R10
10860	JMP   match_nolit_end_encodeSnappyBlockAsm12B
10861
10862matchlen_loop_match_nolit_encodeSnappyBlockAsm12B:
10863	LEAL -8(DI), DI
10864	LEAL 8(R10), R10
10865	CMPL DI, $0x08
10866	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B
10867
10868matchlen_single_match_nolit_encodeSnappyBlockAsm12B:
10869	TESTL DI, DI
10870	JZ    match_nolit_end_encodeSnappyBlockAsm12B
10871
10872matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B:
10873	MOVB (R8)(R10*1), R9
10874	CMPB (SI)(R10*1), R9
10875	JNE  match_nolit_end_encodeSnappyBlockAsm12B
10876	LEAL 1(R10), R10
10877	DECL DI
10878	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B
10879
10880match_nolit_end_encodeSnappyBlockAsm12B:
10881	ADDL R10, CX
10882	MOVL 16(SP), SI
10883	ADDL $0x04, R10
10884	MOVL CX, 12(SP)
10885
10886	// emitCopy
10887two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
10888	CMPL R10, $0x40
10889	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
10890	MOVB $0xee, (AX)
10891	MOVW SI, 1(AX)
10892	LEAL -60(R10), R10
10893	ADDQ $0x03, AX
10894	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm12B
10895
10896two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
10897	CMPL R10, $0x0c
10898	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
10899	CMPL SI, $0x00000800
10900	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
10901	MOVB $0x01, BL
10902	LEAL -16(BX)(R10*4), R10
10903	MOVB SI, 1(AX)
10904	SHRL $0x08, SI
10905	SHLL $0x05, SI
10906	ORL  SI, R10
10907	MOVB R10, (AX)
10908	ADDQ $0x02, AX
10909	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm12B
10910
10911emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
10912	MOVB $0x02, BL
10913	LEAL -4(BX)(R10*4), R10
10914	MOVB R10, (AX)
10915	MOVW SI, 1(AX)
10916	ADDQ $0x03, AX
10917
10918match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
10919	CMPL CX, 8(SP)
10920	JGE  emit_remainder_encodeSnappyBlockAsm12B
10921	MOVQ -2(DX)(CX*1), DI
10922	CMPQ AX, (SP)
10923	JL   match_nolit_dst_ok_encodeSnappyBlockAsm12B
10924	MOVQ $0x00000000, ret+48(FP)
10925	RET
10926
10927match_nolit_dst_ok_encodeSnappyBlockAsm12B:
10928	MOVQ  $0x000000cf1bbcdcbb, R9
10929	MOVQ  DI, R8
10930	SHRQ  $0x10, DI
10931	MOVQ  DI, SI
10932	SHLQ  $0x18, R8
10933	IMULQ R9, R8
10934	SHRQ  $0x34, R8
10935	SHLQ  $0x18, SI
10936	IMULQ R9, SI
10937	SHRQ  $0x34, SI
10938	LEAL  -2(CX), R9
10939	LEAQ  24(SP)(SI*4), R10
10940	MOVL  (R10), SI
10941	MOVL  R9, 24(SP)(R8*4)
10942	MOVL  CX, (R10)
10943	CMPL  (DX)(SI*1), DI
10944	JEQ   match_nolit_loop_encodeSnappyBlockAsm12B
10945	INCL  CX
10946	JMP   search_loop_encodeSnappyBlockAsm12B
10947
10948emit_remainder_encodeSnappyBlockAsm12B:
10949	MOVQ src_len+32(FP), CX
10950	SUBL 12(SP), CX
10951	LEAQ 3(AX)(CX*1), CX
10952	CMPQ CX, (SP)
10953	JL   emit_remainder_ok_encodeSnappyBlockAsm12B
10954	MOVQ $0x00000000, ret+48(FP)
10955	RET
10956
10957emit_remainder_ok_encodeSnappyBlockAsm12B:
10958	MOVQ src_len+32(FP), CX
10959	MOVL 12(SP), BX
10960	CMPL BX, CX
10961	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
10962	MOVL CX, SI
10963	MOVL CX, 12(SP)
10964	LEAQ (DX)(BX*1), CX
10965	SUBL BX, SI
10966	LEAL -1(SI), DX
10967	CMPL DX, $0x3c
10968	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm12B
10969	CMPL DX, $0x00000100
10970	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm12B
10971	MOVB $0xf4, (AX)
10972	MOVW DX, 1(AX)
10973	ADDQ $0x03, AX
10974	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm12B
10975
10976two_bytes_emit_remainder_encodeSnappyBlockAsm12B:
10977	MOVB $0xf0, (AX)
10978	MOVB DL, 1(AX)
10979	ADDQ $0x02, AX
10980	CMPL DX, $0x40
10981	JL   memmove_emit_remainder_encodeSnappyBlockAsm12B
10982	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm12B
10983
10984one_byte_emit_remainder_encodeSnappyBlockAsm12B:
10985	SHLB $0x02, DL
10986	MOVB DL, (AX)
10987	ADDQ $0x01, AX
10988
10989memmove_emit_remainder_encodeSnappyBlockAsm12B:
10990	LEAQ (AX)(SI*1), DX
10991	MOVL SI, BX
10992
10993	// genMemMoveShort
10994	CMPQ BX, $0x08
10995	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8
10996	CMPQ BX, $0x10
10997	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
10998	CMPQ BX, $0x20
10999	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
11000	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64
11001
11002emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8:
11003	MOVQ (CX), SI
11004	MOVQ SI, (AX)
11005	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
11006
11007emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16:
11008	MOVQ (CX), SI
11009	MOVQ -8(CX)(BX*1), CX
11010	MOVQ SI, (AX)
11011	MOVQ CX, -8(AX)(BX*1)
11012	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
11013
11014emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32:
11015	MOVOU (CX), X0
11016	MOVOU -16(CX)(BX*1), X1
11017	MOVOU X0, (AX)
11018	MOVOU X1, -16(AX)(BX*1)
11019	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
11020
11021emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64:
11022	MOVOU (CX), X0
11023	MOVOU 16(CX), X1
11024	MOVOU -32(CX)(BX*1), X2
11025	MOVOU -16(CX)(BX*1), X3
11026	MOVOU X0, (AX)
11027	MOVOU X1, 16(AX)
11028	MOVOU X2, -32(AX)(BX*1)
11029	MOVOU X3, -16(AX)(BX*1)
11030
11031memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B:
11032	MOVQ DX, AX
11033	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
11034
11035memmove_long_emit_remainder_encodeSnappyBlockAsm12B:
11036	LEAQ (AX)(SI*1), DX
11037	MOVL SI, BX
11038
11039	// genMemMoveLong
11040	MOVOU (CX), X0
11041	MOVOU 16(CX), X1
11042	MOVOU -32(CX)(BX*1), X2
11043	MOVOU -16(CX)(BX*1), X3
11044	MOVQ  BX, DI
11045	SHRQ  $0x05, DI
11046	MOVQ  AX, SI
11047	ANDL  $0x0000001f, SI
11048	MOVQ  $0x00000040, R8
11049	SUBQ  SI, R8
11050	DECQ  DI
11051	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
11052	LEAQ  -32(CX)(R8*1), SI
11053	LEAQ  -32(AX)(R8*1), R9
11054
11055emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back:
11056	MOVOU (SI), X4
11057	MOVOU 16(SI), X5
11058	MOVOA X4, (R9)
11059	MOVOA X5, 16(R9)
11060	ADDQ  $0x20, R9
11061	ADDQ  $0x20, SI
11062	ADDQ  $0x20, R8
11063	DECQ  DI
11064	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back
11065
11066emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
11067	MOVOU -32(CX)(R8*1), X4
11068	MOVOU -16(CX)(R8*1), X5
11069	MOVOA X4, -32(AX)(R8*1)
11070	MOVOA X5, -16(AX)(R8*1)
11071	ADDQ  $0x20, R8
11072	CMPQ  BX, R8
11073	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
11074	MOVOU X0, (AX)
11075	MOVOU X1, 16(AX)
11076	MOVOU X2, -32(AX)(BX*1)
11077	MOVOU X3, -16(AX)(BX*1)
11078	MOVQ  DX, AX
11079
11080emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B:
11081	MOVQ dst_base+0(FP), CX
11082	SUBQ CX, AX
11083	MOVQ AX, ret+48(FP)
11084	RET
11085
// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
// Requires: SSE2
//
// encodeSnappyBlockAsm10B encodes src into dst as a sequence of literal and
// copy elements and returns the number of bytes written to dst. It returns 0
// when the output would reach the destination limit computed below.
//
// Stack frame ($4120 bytes of locals):
//   0(SP)   8 bytes     dst limit pointer: dst_base + len(src) - 9 - len(src)>>5
//   8(SP)   4 bytes     search limit: len(src) - 8
//   12(SP)  4 bytes     nextEmit: src offset of the first byte not yet emitted
//   16(SP)  4 bytes     repeat: distance of the most recent match (starts at 1)
//   20(SP)  4 bytes     nextS: src offset to continue at when no match is found
//   24(SP)  4096 bytes  hash table: 1024 uint32 src offsets (10-bit hash index)
//
// Long-lived registers:
//   AX = dst write pointer
//   CX = s, the current src offset
//   DX = src base pointer (reused as scratch once the remainder is emitted)
TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56
	// AX = dst write pointer; CX = number of 128-byte chunks to clear
	// (0x20 * 128 = 4096 bytes); DX = start of the hash table.
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000020, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

// Zero the hash table, 128 bytes per iteration.
zero_loop_encodeSnappyBlockAsm10B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBlockAsm10B
	// nextEmit = 0.
	MOVL  $0x00000000, 12(SP)
	// Search limit = len(src) - 8, stored at 8(SP).
	// Dst limit = dst_base + len(src) - 9 - len(src)>>5, stored at (SP).
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// s = 1 and repeat = 1; DX = src base from here on.
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

// Main search loop: look for a 4-byte match at s, s+1 or s+2.
search_loop_encodeSnappyBlockAsm10B:
	// nextS = s + 4 + (s - nextEmit)>>5, so the step grows with the number
	// of bytes scanned since the last emit. Stop searching once nextS
	// reaches the search limit.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBlockAsm10B
	// DI = 8 bytes of src at s; remember nextS at 20(SP).
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	// Hash the 4 bytes at s (R10) and at s+1 (R11): shift the 4 bytes to the
	// top half, multiply by 0x9e3779b1 and keep the top 10 bits.
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	SHLQ  $0x20, R11
	IMULQ R9, R11
	SHRQ  $0x36, R11
	// SI = candidate for s, R8 = candidate for s+1; then record s and s+1
	// in their table slots.
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)
	// R10 = hash of the 4 bytes at s+2 (table slot is read/written later).
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x36, R10
	// Repeat check: compare the 4 bytes at s+1 with the 4 bytes at
	// s+1-repeat.
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeSnappyBlockAsm10B
	// Repeat found at s+1. DI = base (match start), SI = nextEmit,
	// R8 = base - repeat (match source); skip back-extension if that is 0.
	LEAL  1(CX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm10B

// Extend the repeat match backwards while the preceding bytes are equal,
// base stays above nextEmit, and the match source has not reached offset 0.
repeat_extend_back_loop_encodeSnappyBlockAsm10B:
	CMPL DI, SI
	JLE  repeat_extend_back_end_encodeSnappyBlockAsm10B
	MOVB -1(DX)(R8*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm10B
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm10B

// Emit the pending literal src[nextEmit:base], if any.
repeat_extend_back_end_encodeSnappyBlockAsm10B:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
	// nextEmit = base; R9 = literal source pointer; R8 = literal length;
	// SI = length - 1, which selects the header size.
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm10B
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm10B
	// 3-byte header: tag 0xf4 followed by the 16-bit (length - 1).
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm10B

// 2-byte header: tag 0xf0 followed by the 8-bit (length - 1). Lengths with
// (length - 1) < 64 use the short copy, larger ones the long copy.
two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeSnappyBlockAsm10B
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm10B

// 1-byte header: (length - 1) << 2.
one_byte_repeat_emit_encodeSnappyBlockAsm10B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

// Short literal copy: R9 = source, AX = destination, R8 = length.
// SI = AX + length is the dst pointer after the copy.
memmove_repeat_emit_encodeSnappyBlockAsm10B:
	LEAQ (AX)(R8*1), SI

	// genMemMoveShort
	// Dispatch on length: <= 8, <= 16, <= 32, otherwise the 33..64 path.
	CMPQ R8, $0x08
	JLE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64

// Length <= 8: a single 8-byte load and store.
// NOTE(review): a full 8 bytes are read and written even when the literal is
// shorter; presumably src and dst always have enough slack here - confirm.
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8:
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B

// Length 9..16: first 8 bytes plus the (possibly overlapping) last 8 bytes.
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B

// Length 17..32: first 16 bytes plus the (possibly overlapping) last 16.
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B

// Length 33..64: first 32 bytes plus the (possibly overlapping) last 32.
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

// Advance the dst pointer past the copied literal.
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B

// Long literal copy: R9 = source, AX = destination, R8 = length.
// SI = AX + length is the dst pointer after the copy.
memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
	LEAQ (AX)(R8*1), SI

	// genMemMoveLong
	// Preload the first and last 32 bytes; they are stored unaligned after
	// the aligned loop to cover the head and tail.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	// R11 = length / 32; R12 = 64 - (AX & 31), so AX + R12 - 32 is
	// 32-byte aligned and the loop below can use aligned stores.
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// DECQ does not write CF, so JA sees CF from the SUBQ above and ZF from
	// the DECQ.
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

// Copy 32 bytes from (R10) to the aligned address (R13) and advance the
// pointers and R12; repeated while the JNA condition holds after DECQ.
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back

// Copy 32 bytes per iteration at offset R12 - 32 while length >= R12, then
// store the preloaded head and tail and advance AX past the literal.
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  SI, AX

// Literal emitted. Skip the 4 bytes already known to match at s+1 (s += 5)
// and extend the repeat match forwards.
emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
	// R8 = bytes remaining in src; R9 = &src[s]; SI = &src[s - repeat].
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R11 = number of matching bytes found so far.
	XORL R11, R11
	CMPL R8, $0x08
	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm10B

// Compare 8 bytes at a time; on a difference the index of the lowest set
// bit of the XOR, divided by 8, is the number of equal leading bytes.
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B:
	MOVQ  (R9)(R11*1), R10
	XORQ  (SI)(R11*1), R10
	TESTQ R10, R10
	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B
	BSFQ  R10, R10
	SARQ  $0x03, R10
	LEAL  (R11)(R10*1), R11
	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm10B

// All 8 bytes matched: consume them and continue while >= 8 bytes remain.
matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B:
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	CMPL R8, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B

// Fewer than 8 bytes remain: finish byte by byte.
matchlen_single_repeat_extend_encodeSnappyBlockAsm10B:
	TESTL R8, R8
	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm10B

matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm10B
	LEAL 1(R11), R11
	DECL R8
	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B

// s += matched bytes; SI = s - base = total match length; DI = repeat.
repeat_extend_forward_end_encodeSnappyBlockAsm10B:
	ADDL R11, CX
	MOVL CX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
// While length > 64: emit tag 0xee plus the 16-bit offset (a 3-byte copy of
// 60 bytes) and subtract 60 from the length.
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B

// Length <= 64. Use the 2-byte form only when length < 12 and offset < 2048.
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
	CMPL DI, $0x00000800
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
	// 2-byte copy: tag = (offset>>8)<<5 | (length-4)<<2 | 1, followed by the
	// low offset byte. Only the low byte of SI is stored, so only BL matters.
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeSnappyBlockAsm10B

// 3-byte copy: tag = (length-1)<<2 | 2, followed by the 16-bit offset.
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

// nextEmit = s; resume searching.
repeat_end_emit_encodeSnappyBlockAsm10B:
	MOVL CX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm10B

// No repeat. Test the table candidates for s, s+1 and s+2 in turn; DI holds
// the src bytes and is shifted by one byte for each later position.
no_repeat_found_encodeSnappyBlockAsm10B:
	// Candidate for s.
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeSnappyBlockAsm10B
	// Candidate for s+1 (R8); SI = table entry for s+2, R9 = s+2.
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeSnappyBlockAsm10B
	// Record s+2 in the table, then test its previous entry.
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeSnappyBlockAsm10B
	// No match: continue at nextS.
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBlockAsm10B

// Match at s+2: s += 2, candidate already in SI.
candidate3_match_encodeSnappyBlockAsm10B:
	ADDL $0x02, CX
	JMP  candidate_match_encodeSnappyBlockAsm10B

// Match at s+1: record s+2 in the table, s += 1, candidate = R8.
candidate2_match_encodeSnappyBlockAsm10B:
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

// Match found: CX = s, SI = candidate offset. DI = nextEmit bounds the
// backwards extension; skip it when the candidate is at offset 0.
candidate_match_encodeSnappyBlockAsm10B:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBlockAsm10B

// Extend the match backwards while the preceding bytes are equal, s stays
// above nextEmit and the candidate has not reached offset 0.
match_extend_back_loop_encodeSnappyBlockAsm10B:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBlockAsm10B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBlockAsm10B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBlockAsm10B
	JMP  match_extend_back_loop_encodeSnappyBlockAsm10B

// Check that the pending literal (s - nextEmit bytes plus up to 3 header
// bytes) stays below the dst limit; otherwise return 0.
match_extend_back_end_encodeSnappyBlockAsm10B:
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

// Emit the pending literal src[nextEmit:s], if any.
match_dst_size_check_encodeSnappyBlockAsm10B:
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm10B
	// nextEmit = s; DI = literal source pointer; R9 = literal length;
	// R8 = length - 1, which selects the header size.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBlockAsm10B
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBlockAsm10B
	// 3-byte header: tag 0xf4 followed by the 16-bit (length - 1).
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm10B

// 2-byte header: tag 0xf0 followed by the 8-bit (length - 1).
two_bytes_match_emit_encodeSnappyBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeSnappyBlockAsm10B
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm10B

// 1-byte header: (length - 1) << 2.
one_byte_match_emit_encodeSnappyBlockAsm10B:
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

// Short literal copy: DI = source, AX = destination, R9 = length.
// R8 = AX + length is the dst pointer after the copy.
memmove_match_emit_encodeSnappyBlockAsm10B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	// Dispatch on length: <= 8, <= 16, <= 32, otherwise the 33..64 path.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64

// Length <= 8: a single 8-byte load and store.
// NOTE(review): a full 8 bytes are read and written even when the literal is
// shorter; presumably src and dst always have enough slack here - confirm.
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8:
	MOVQ (DI), R10
	MOVQ R10, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm10B

// Length 9..16: first 8 bytes plus the (possibly overlapping) last 8 bytes.
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm10B

// Length 17..32: first 16 bytes plus the (possibly overlapping) last 16.
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm10B

// Length 33..64: first 32 bytes plus the (possibly overlapping) last 32.
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

// Advance the dst pointer past the copied literal.
memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm10B

// Long literal copy: DI = source, AX = destination, R9 = length.
// R8 = AX + length is the dst pointer after the copy.
memmove_long_match_emit_encodeSnappyBlockAsm10B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	// Preload the first and last 32 bytes; they are stored unaligned after
	// the aligned loop to cover the head and tail.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	// R11 = length / 32; R12 = 64 - (AX & 31), so AX + R12 - 32 is
	// 32-byte aligned and the loop below can use aligned stores.
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// DECQ does not write CF, so JA sees CF from the SUBQ above and ZF from
	// the DECQ.
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

// Copy 32 bytes from (R10) to the aligned address (R13) and advance the
// pointers and R12; repeated while the JNA condition holds after DECQ.
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back

// Copy 32 bytes per iteration at offset R12 - 32 while length >= R12, then
// store the preloaded head and tail and advance AX past the literal.
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

// Encode a match with no pending literal: CX = s, SI = candidate offset.
// Also re-entered directly when the position after a copy matches again.
emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
match_nolit_loop_encodeSnappyBlockAsm10B:
	// repeat = s - candidate. The first 4 bytes are known to match, so skip
	// them on both sides. DI = bytes remaining in src; R8 = &src[s];
	// SI = &src[candidate].
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R10 = number of matching bytes found so far.
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm10B

// Compare 8 bytes at a time; on a difference the index of the lowest set
// bit of the XOR, divided by 8, is the number of equal leading bytes.
matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B:
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm10B
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeSnappyBlockAsm10B

// All 8 bytes matched: consume them and continue while >= 8 bytes remain.
matchlen_loop_match_nolit_encodeSnappyBlockAsm10B:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B

// Fewer than 8 bytes remain: finish byte by byte.
matchlen_single_match_nolit_encodeSnappyBlockAsm10B:
	TESTL DI, DI
	JZ    match_nolit_end_encodeSnappyBlockAsm10B

matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeSnappyBlockAsm10B
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B

// s += matched bytes; SI = repeat (copy offset); R10 = total match length
// including the 4 skipped bytes; nextEmit = s.
match_nolit_end_encodeSnappyBlockAsm10B:
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
// While length > 64: emit tag 0xee plus the 16-bit offset (a 3-byte copy of
// 60 bytes) and subtract 60 from the length.
two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm10B

// Length <= 64. Use the 2-byte form only when length < 12 and offset < 2048.
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
	CMPL SI, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
	// 2-byte copy: tag = (offset>>8)<<5 | (length-4)<<2 | 1, followed by the
	// low offset byte. Only the low byte of R10 is stored, so only BL matters.
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm10B

// 3-byte copy: tag = (length-1)<<2 | 2, followed by the 16-bit offset.
emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

// Copy emitted. Finish if s reached the search limit; otherwise load the
// 8 bytes at s-2 and return 0 if dst has reached its limit.
match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeSnappyBlockAsm10B
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

// Hash the 4 bytes at s-2 (R8) and at s (SI), record both positions in the
// table and test the previous entry for s for an immediate match.
match_nolit_dst_ok_encodeSnappyBlockAsm10B:
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x20, R8
	IMULQ R9, R8
	SHRQ  $0x36, R8
	SHLQ  $0x20, SI
	IMULQ R9, SI
	SHRQ  $0x36, SI
	// R9 = s-2; R10 = address of the table slot for s; SI = its old entry.
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	// DI now holds the bytes at s; if the candidate matches, encode another
	// copy right away, otherwise advance s by one and search again.
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm10B
	INCL  CX
	JMP   search_loop_encodeSnappyBlockAsm10B

// Emit everything from nextEmit to the end of src as one final literal.
// Return 0 if the literal plus up to 3 header bytes reaches the dst limit.
emit_remainder_encodeSnappyBlockAsm10B:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm10B:
	// Nothing to emit when nextEmit == len(src).
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
	// nextEmit = len(src); CX = literal source pointer; SI = literal length;
	// DX = length - 1. DX no longer holds the src base after this point.
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm10B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm10B
	// 3-byte header: tag 0xf4 followed by the 16-bit (length - 1).
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm10B

// 2-byte header: tag 0xf0 followed by the 8-bit (length - 1).
two_bytes_emit_remainder_encodeSnappyBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeSnappyBlockAsm10B
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm10B

// 1-byte header: (length - 1) << 2.
one_byte_emit_remainder_encodeSnappyBlockAsm10B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

// Short literal copy: CX = source, AX = destination, BX = length.
// DX = AX + length is the dst pointer after the copy.
memmove_emit_remainder_encodeSnappyBlockAsm10B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Dispatch on length: <= 8, <= 16, <= 32, otherwise the 33..64 path.
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64

// Length <= 8: a single 8-byte load and store.
// NOTE(review): this literal ends at the end of src, so for lengths below 8
// the 8-byte load reads past len(src) and the store writes past the literal
// in dst; presumably the caller guarantees the slack - confirm.
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8:
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

// Length 9..16: first 8 bytes plus the (possibly overlapping) last 8 bytes.
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

// Length 17..32: first 16 bytes plus the (possibly overlapping) last 16.
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

// Length 33..64: first 32 bytes plus the (possibly overlapping) last 32.
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

// Advance the dst pointer past the copied literal.
memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B

// Long literal copy: CX = source, AX = destination, BX = length.
// DX = AX + length is the dst pointer after the copy.
memmove_long_emit_remainder_encodeSnappyBlockAsm10B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Preload the first and last 32 bytes; they are stored unaligned after
	// the aligned loop to cover the head and tail.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	// DI = length / 32; R8 = 64 - (AX & 31), so AX + R8 - 32 is
	// 32-byte aligned and the loop below can use aligned stores.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// DECQ does not write CF, so JA sees CF from the SUBQ above and ZF from
	// the DECQ.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

// Copy 32 bytes from (SI) to the aligned address (R9) and advance the
// pointers and R8; repeated while the JNA condition holds after DECQ.
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back

// Copy 32 bytes per iteration at offset R8 - 32 while length >= R8, then
// store the preloaded head and tail and advance AX past the literal.
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

// Return the number of bytes written: dst write pointer - dst_base.
emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET
11797
11798// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
11799// Requires: SSE2
//
// Encodes src into dst as a sequence of literal and copy elements and returns
// the number of bytes written to dst. Returns 0 when the output would reach
// the limit kept at 0(SP) (dst_base + src_len - 9 - src_len/32).
//
// Frame layout (1048 bytes):
//   0(SP)   8 bytes     dst limit pointer (see above)
//   8(SP)   4 bytes     src_len - 8: bound for the search position
//   12(SP)  4 bytes     src index of the first byte not yet emitted ("nextEmit")
//   16(SP)  4 bytes     offset of the most recent match ("repeat"), initially 1
//   20(SP)  4 bytes     src index at which to resume searching after a miss
//   24(SP)  1024 bytes  hash table: 256 uint32 src indexes, keyed by an 8-bit hash
//
// Register roles in the main loop:
//   AX = dst write pointer, CX = current src index, DX = src base pointer.
// BX, SI, DI, R8-R13 and X0-X5 are scratch.
//
// NOTE(review): the copy emitter uses the 2-byte "short" form without testing
// the offset against 2048, and all length fields are at most 2 bytes, so this
// variant presumably relies on the caller passing only small inputs — confirm
// against the Go call site.
11800TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56
11801	MOVQ dst_base+0(FP), AX
	// CX = 8 iterations of 128 bytes each: clears the 1024-byte table at 24(SP).
11802	MOVQ $0x00000008, CX
11803	LEAQ 24(SP), DX
11804	PXOR X0, X0
11805
11806zero_loop_encodeSnappyBlockAsm8B:
11807	MOVOU X0, (DX)
11808	MOVOU X0, 16(DX)
11809	MOVOU X0, 32(DX)
11810	MOVOU X0, 48(DX)
11811	MOVOU X0, 64(DX)
11812	MOVOU X0, 80(DX)
11813	MOVOU X0, 96(DX)
11814	MOVOU X0, 112(DX)
11815	ADDQ  $0x80, DX
11816	DECQ  CX
11817	JNZ   zero_loop_encodeSnappyBlockAsm8B
	// nextEmit = 0.
11818	MOVL  $0x00000000, 12(SP)
	// 8(SP) = src_len - 8; 0(SP) = dst_base + (src_len - 9 - src_len/32).
11819	MOVQ  src_len+32(FP), CX
11820	LEAQ  -9(CX), DX
11821	LEAQ  -8(CX), SI
11822	MOVL  SI, 8(SP)
11823	SHRQ  $0x05, CX
11824	SUBL  CX, DX
11825	LEAQ  (AX)(DX*1), DX
11826	MOVQ  DX, (SP)
	// Start searching at src index 1 with repeat offset 1; DX becomes the src base.
11827	MOVL  $0x00000001, CX
11828	MOVL  CX, 16(SP)
11829	MOVQ  src_base+24(FP), DX
11830
	// Main search loop. One iteration probes src positions CX, CX+1 and CX+2.
11831search_loop_encodeSnappyBlockAsm8B:
	// SI = CX + 4 + (CX - nextEmit)/16: the step grows with the distance since
	// the last emit. If that next position is >= 8(SP), stop searching.
11832	MOVL  CX, SI
11833	SUBL  12(SP), SI
11834	SHRL  $0x04, SI
11835	LEAL  4(CX)(SI*1), SI
11836	CMPL  SI, 8(SP)
11837	JGE   emit_remainder_encodeSnappyBlockAsm8B
	// DI = 8 bytes at src[CX]; remember the next search position in 20(SP).
11838	MOVQ  (DX)(CX*1), DI
11839	MOVL  SI, 20(SP)
	// Hash = ((v << 32) * 0x9e3779b1) >> 56: only the low 4 bytes of v
	// contribute and the result is an 8-bit table index.
	// R10 = hash of bytes at CX, R11 = hash of bytes at CX+1.
11840	MOVQ  $0x9e3779b1, R9
11841	MOVQ  DI, R10
11842	MOVQ  DI, R11
11843	SHRQ  $0x08, R11
11844	SHLQ  $0x20, R10
11845	IMULQ R9, R10
11846	SHRQ  $0x38, R10
11847	SHLQ  $0x20, R11
11848	IMULQ R9, R11
11849	SHRQ  $0x38, R11
	// SI / R8 = previous table entries (candidates) for CX / CX+1; the table
	// is then updated with the current positions.
11850	MOVL  24(SP)(R10*4), SI
11851	MOVL  24(SP)(R11*4), R8
11852	MOVL  CX, 24(SP)(R10*4)
11853	LEAL  1(CX), R10
11854	MOVL  R10, 24(SP)(R11*4)
	// R10 = hash of bytes at CX+2 (used in no_repeat_found).
11855	MOVQ  DI, R10
11856	SHRQ  $0x10, R10
11857	SHLQ  $0x20, R10
11858	IMULQ R9, R10
11859	SHRQ  $0x38, R10
	// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes at
	// CX+1-repeat.
11860	MOVL  CX, R9
11861	SUBL  16(SP), R9
11862	MOVL  1(DX)(R9*1), R11
11863	MOVQ  DI, R9
11864	SHRQ  $0x08, R9
11865	CMPL  R9, R11
11866	JNE   no_repeat_found_encodeSnappyBlockAsm8B
	// Repeat found at CX+1. DI = match start, SI = nextEmit,
	// R8 = index of the matching earlier data (DI - repeat).
11867	LEAL  1(CX), DI
11868	MOVL  12(SP), SI
11869	MOVL  DI, R8
11870	SUBL  16(SP), R8
11871	JZ    repeat_extend_back_end_encodeSnappyBlockAsm8B
11872
	// Extend the repeat match backwards while the preceding bytes agree,
	// without going below nextEmit (SI) or below index 0 on the earlier side.
11873repeat_extend_back_loop_encodeSnappyBlockAsm8B:
11874	CMPL DI, SI
11875	JLE  repeat_extend_back_end_encodeSnappyBlockAsm8B
11876	MOVB -1(DX)(R8*1), BL
11877	MOVB -1(DX)(DI*1), R9
11878	CMPB BL, R9
11879	JNE  repeat_extend_back_end_encodeSnappyBlockAsm8B
11880	LEAL -1(DI), DI
11881	DECL R8
11882	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm8B
11883
	// Emit src[nextEmit:DI] as a literal (skipped when empty).
	// R9 = literal source pointer, R8 = literal length, SI = length - 1.
11884repeat_extend_back_end_encodeSnappyBlockAsm8B:
11885	MOVL 12(SP), SI
11886	CMPL SI, DI
11887	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
11888	MOVL DI, R8
11889	MOVL DI, 12(SP)
11890	LEAQ (DX)(SI*1), R9
11891	SUBL SI, R8
11892	LEAL -1(R8), SI
	// Literal header: length-1 < 60 -> 1 byte; < 256 -> tag 0xf0 + 1 byte;
	// otherwise tag 0xf4 + 2 bytes.
11893	CMPL SI, $0x3c
11894	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm8B
11895	CMPL SI, $0x00000100
11896	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm8B
11897	MOVB $0xf4, (AX)
11898	MOVW SI, 1(AX)
11899	ADDQ $0x03, AX
11900	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm8B
11901
11902two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
11903	MOVB $0xf0, (AX)
11904	MOVB SI, 1(AX)
11905	ADDQ $0x02, AX
	// length-1 < 64 (length <= 64) still fits the short copy routine.
11906	CMPL SI, $0x40
11907	JL   memmove_repeat_emit_encodeSnappyBlockAsm8B
11908	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm8B
11909
11910one_byte_repeat_emit_encodeSnappyBlockAsm8B:
	// Tag byte = (length-1) << 2.
11911	SHLB $0x02, SI
11912	MOVB SI, (AX)
11913	ADDQ $0x01, AX
11914
11915memmove_repeat_emit_encodeSnappyBlockAsm8B:
	// SI = dst pointer just past the literal; AX is set to it once copied.
11916	LEAQ (AX)(R8*1), SI
11917
	// Copy R8 (<= 64) bytes from R9 to AX using overlapping fixed-size moves.
11918	// genMemMoveShort
11919	CMPQ R8, $0x08
11920	JLE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
11921	CMPQ R8, $0x10
11922	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
11923	CMPQ R8, $0x20
11924	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
11925	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
11926
	// Length <= 8: a full 8 bytes are always loaded and stored; only the
	// exact length is added to the write pointer afterwards.
11927emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
11928	MOVQ (R9), R10
11929	MOVQ R10, (AX)
11930	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
11931
	// 9..16 bytes: first 8 and last 8 bytes (may overlap).
11932emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
11933	MOVQ (R9), R10
11934	MOVQ -8(R9)(R8*1), R9
11935	MOVQ R10, (AX)
11936	MOVQ R9, -8(AX)(R8*1)
11937	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
11938
	// 17..32 bytes: first 16 and last 16 bytes (may overlap).
11939emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
11940	MOVOU (R9), X0
11941	MOVOU -16(R9)(R8*1), X1
11942	MOVOU X0, (AX)
11943	MOVOU X1, -16(AX)(R8*1)
11944	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
11945
	// 33..64 bytes: first 32 and last 32 bytes (may overlap).
11946emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
11947	MOVOU (R9), X0
11948	MOVOU 16(R9), X1
11949	MOVOU -32(R9)(R8*1), X2
11950	MOVOU -16(R9)(R8*1), X3
11951	MOVOU X0, (AX)
11952	MOVOU X1, 16(AX)
11953	MOVOU X2, -32(AX)(R8*1)
11954	MOVOU X3, -16(AX)(R8*1)
11955
11956memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
11957	MOVQ SI, AX
11958	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
11959
11960memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
11961	LEAQ (AX)(R8*1), SI
11962
	// Long copy of R8 bytes from R9 to AX. X0-X3 keep the first and last 32
	// source bytes and are stored last, covering the unaligned head and tail.
	// R12 = 64 - (AX mod 32), so -32(AX)(R12*1) is a multiple of 32 and the
	// loop can use aligned stores (MOVOA). R11 = length / 32.
11963	// genMemMoveLong
11964	MOVOU (R9), X0
11965	MOVOU 16(R9), X1
11966	MOVOU -32(R9)(R8*1), X2
11967	MOVOU -16(R9)(R8*1), X3
11968	MOVQ  R8, R11
11969	SHRQ  $0x05, R11
11970	MOVQ  AX, R10
11971	ANDL  $0x0000001f, R10
11972	MOVQ  $0x00000040, R12
11973	SUBQ  R10, R12
	// NOTE(review): JA/JNA test CF as well as ZF, and DECQ leaves CF
	// unchanged, so these branches also depend on the carry left by the
	// preceding SUBQ/ADDQ. Kept exactly as generated.
11974	DECQ  R11
11975	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
11976	LEAQ  -32(R9)(R12*1), R10
11977	LEAQ  -32(AX)(R12*1), R13
11978
11979emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
11980	MOVOU (R10), X4
11981	MOVOU 16(R10), X5
11982	MOVOA X4, (R13)
11983	MOVOA X5, 16(R13)
11984	ADDQ  $0x20, R13
11985	ADDQ  $0x20, R10
11986	ADDQ  $0x20, R12
11987	DECQ  R11
11988	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
11989
	// Copy 32 bytes per iteration while length (R8) >= R12.
11990emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
11991	MOVOU -32(R9)(R12*1), X4
11992	MOVOU -16(R9)(R12*1), X5
11993	MOVOA X4, -32(AX)(R12*1)
11994	MOVOA X5, -16(AX)(R12*1)
11995	ADDQ  $0x20, R12
11996	CMPQ  R8, R12
11997	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
11998	MOVOU X0, (AX)
11999	MOVOU X1, 16(AX)
12000	MOVOU X2, -32(AX)(R8*1)
12001	MOVOU X3, -16(AX)(R8*1)
12002	MOVQ  SI, AX
12003
	// Literal emitted. Skip the 1 + 4 bytes already known to match (the repeat
	// was verified at CX+1) and extend the match forwards.
	// R8 = bytes left in src, R9 = current pointer, SI = pointer repeat bytes back.
12004emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
12005	ADDL $0x05, CX
12006	MOVL CX, SI
12007	SUBL 16(SP), SI
12008	MOVQ src_len+32(FP), R8
12009	SUBL CX, R8
12010	LEAQ (DX)(CX*1), R9
12011	LEAQ (DX)(SI*1), SI
12012
	// R11 = number of equal bytes at R9 and SI, at most R8.
12013	// matchLen
12014	XORL R11, R11
12015	CMPL R8, $0x08
12016	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm8B
12017
	// Compare 8 bytes at a time; on a difference, BSFQ/8 gives the number of
	// equal low-order (leading) bytes.
12018matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B:
12019	MOVQ  (R9)(R11*1), R10
12020	XORQ  (SI)(R11*1), R10
12021	TESTQ R10, R10
12022	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B
12023	BSFQ  R10, R10
12024	SARQ  $0x03, R10
12025	LEAL  (R11)(R10*1), R11
12026	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm8B
12027
12028matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B:
12029	LEAL -8(R8), R8
12030	LEAL 8(R11), R11
12031	CMPL R8, $0x08
12032	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B
12033
	// Fewer than 8 bytes left: compare byte by byte.
12034matchlen_single_repeat_extend_encodeSnappyBlockAsm8B:
12035	TESTL R8, R8
12036	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm8B
12037
12038matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B:
12039	MOVB (R9)(R11*1), R10
12040	CMPB (SI)(R11*1), R10
12041	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm8B
12042	LEAL 1(R11), R11
12043	DECL R8
12044	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B
12045
	// CX = end of match; SI = match length (end - start in DI); DI = repeat offset.
12046repeat_extend_forward_end_encodeSnappyBlockAsm8B:
12047	ADDL R11, CX
12048	MOVL CX, SI
12049	SUBL DI, SI
12050	MOVL 16(SP), DI
12051
	// Emit a copy of SI bytes at offset DI. While more than 64 bytes remain,
	// write tag 0xee (= (60-1)<<2 | 2) with a 2-byte offset and subtract 60.
12052	// emitCopy
12053two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
12054	CMPL SI, $0x40
12055	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
12056	MOVB $0xee, (AX)
12057	MOVW DI, 1(AX)
12058	LEAL -60(SI), SI
12059	ADDQ $0x03, AX
12060	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B
12061
12062two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
12063	CMPL SI, $0x0c
12064	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
	// Length < 12: 2-byte form. Tag = 1 | (length-4)<<2 | (offset>>8)<<5,
	// followed by the low offset byte. Only BL is set before BX is used in the
	// LEAL; that is sufficient because only the low byte of SI is stored.
12065	MOVB $0x01, BL
12066	LEAL -16(BX)(SI*4), SI
12067	MOVB DI, 1(AX)
12068	SHRL $0x08, DI
12069	SHLL $0x05, DI
12070	ORL  DI, SI
12071	MOVB SI, (AX)
12072	ADDQ $0x02, AX
12073	JMP  repeat_end_emit_encodeSnappyBlockAsm8B
12074
	// 3-byte form: tag = 2 | (length-1)<<2, followed by a 2-byte offset.
12075emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
12076	MOVB $0x02, BL
12077	LEAL -4(BX)(SI*4), SI
12078	MOVB SI, (AX)
12079	MOVW DI, 1(AX)
12080	ADDQ $0x03, AX
12081
	// nextEmit = CX; continue searching.
12082repeat_end_emit_encodeSnappyBlockAsm8B:
12083	MOVL CX, 12(SP)
12084	JMP  search_loop_encodeSnappyBlockAsm8B
12085
	// No repeat. Test the table candidates: SI for CX, R8 for CX+1, then the
	// entry for CX+2. DI is shifted right one byte before each later compare
	// so its low 4 bytes are the bytes at the position being tested.
12086no_repeat_found_encodeSnappyBlockAsm8B:
12087	CMPL (DX)(SI*1), DI
12088	JEQ  candidate_match_encodeSnappyBlockAsm8B
12089	SHRQ $0x08, DI
12090	MOVL 24(SP)(R10*4), SI
12091	LEAL 2(CX), R9
12092	CMPL (DX)(R8*1), DI
12093	JEQ  candidate2_match_encodeSnappyBlockAsm8B
12094	MOVL R9, 24(SP)(R10*4)
12095	SHRQ $0x08, DI
12096	CMPL (DX)(SI*1), DI
12097	JEQ  candidate3_match_encodeSnappyBlockAsm8B
	// All three missed: resume at the position saved in 20(SP).
12098	MOVL 20(SP), CX
12099	JMP  search_loop_encodeSnappyBlockAsm8B
12100
	// Match at CX+2; SI already holds the candidate.
12101candidate3_match_encodeSnappyBlockAsm8B:
12102	ADDL $0x02, CX
12103	JMP  candidate_match_encodeSnappyBlockAsm8B
12104
	// Match at CX+1: still record CX+2 in the table, then use R8 as candidate.
12105candidate2_match_encodeSnappyBlockAsm8B:
12106	MOVL R9, 24(SP)(R10*4)
12107	INCL CX
12108	MOVL R8, SI
12109
	// CX = match position, SI = candidate index. Extend both backwards while
	// the preceding bytes agree, stopping at nextEmit (DI) or candidate index 0.
12110candidate_match_encodeSnappyBlockAsm8B:
12111	MOVL  12(SP), DI
12112	TESTL SI, SI
12113	JZ    match_extend_back_end_encodeSnappyBlockAsm8B
12114
12115match_extend_back_loop_encodeSnappyBlockAsm8B:
12116	CMPL CX, DI
12117	JLE  match_extend_back_end_encodeSnappyBlockAsm8B
12118	MOVB -1(DX)(SI*1), BL
12119	MOVB -1(DX)(CX*1), R8
12120	CMPB BL, R8
12121	JNE  match_extend_back_end_encodeSnappyBlockAsm8B
12122	LEAL -1(CX), CX
12123	DECL SI
12124	JZ   match_extend_back_end_encodeSnappyBlockAsm8B
12125	JMP  match_extend_back_loop_encodeSnappyBlockAsm8B
12126
	// Output bound check: AX + pending literal length + 3 must stay below
	// the limit at 0(SP); otherwise return 0.
12127match_extend_back_end_encodeSnappyBlockAsm8B:
12128	MOVL CX, DI
12129	SUBL 12(SP), DI
12130	LEAQ 3(AX)(DI*1), DI
12131	CMPQ DI, (SP)
12132	JL   match_dst_size_check_encodeSnappyBlockAsm8B
12133	MOVQ $0x00000000, ret+48(FP)
12134	RET
12135
	// Emit src[nextEmit:CX] as a literal (skipped when empty).
	// DI = literal source pointer, R9 = literal length, R8 = length - 1.
	// Header encoding and copy routines mirror the repeat path above.
12136match_dst_size_check_encodeSnappyBlockAsm8B:
12137	MOVL CX, DI
12138	MOVL 12(SP), R8
12139	CMPL R8, DI
12140	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm8B
12141	MOVL DI, R9
12142	MOVL DI, 12(SP)
12143	LEAQ (DX)(R8*1), DI
12144	SUBL R8, R9
12145	LEAL -1(R9), R8
12146	CMPL R8, $0x3c
12147	JLT  one_byte_match_emit_encodeSnappyBlockAsm8B
12148	CMPL R8, $0x00000100
12149	JLT  two_bytes_match_emit_encodeSnappyBlockAsm8B
12150	MOVB $0xf4, (AX)
12151	MOVW R8, 1(AX)
12152	ADDQ $0x03, AX
12153	JMP  memmove_long_match_emit_encodeSnappyBlockAsm8B
12154
12155two_bytes_match_emit_encodeSnappyBlockAsm8B:
12156	MOVB $0xf0, (AX)
12157	MOVB R8, 1(AX)
12158	ADDQ $0x02, AX
12159	CMPL R8, $0x40
12160	JL   memmove_match_emit_encodeSnappyBlockAsm8B
12161	JMP  memmove_long_match_emit_encodeSnappyBlockAsm8B
12162
12163one_byte_match_emit_encodeSnappyBlockAsm8B:
12164	SHLB $0x02, R8
12165	MOVB R8, (AX)
12166	ADDQ $0x01, AX
12167
12168memmove_match_emit_encodeSnappyBlockAsm8B:
	// R8 = dst pointer just past the literal.
12169	LEAQ (AX)(R9*1), R8
12170
	// Copy R9 (<= 64) bytes from DI to AX using overlapping fixed-size moves.
12171	// genMemMoveShort
12172	CMPQ R9, $0x08
12173	JLE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
12174	CMPQ R9, $0x10
12175	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
12176	CMPQ R9, $0x20
12177	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
12178	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
12179
12180emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
12181	MOVQ (DI), R10
12182	MOVQ R10, (AX)
12183	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
12184
12185emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
12186	MOVQ (DI), R10
12187	MOVQ -8(DI)(R9*1), DI
12188	MOVQ R10, (AX)
12189	MOVQ DI, -8(AX)(R9*1)
12190	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
12191
12192emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
12193	MOVOU (DI), X0
12194	MOVOU -16(DI)(R9*1), X1
12195	MOVOU X0, (AX)
12196	MOVOU X1, -16(AX)(R9*1)
12197	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
12198
12199emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
12200	MOVOU (DI), X0
12201	MOVOU 16(DI), X1
12202	MOVOU -32(DI)(R9*1), X2
12203	MOVOU -16(DI)(R9*1), X3
12204	MOVOU X0, (AX)
12205	MOVOU X1, 16(AX)
12206	MOVOU X2, -32(AX)(R9*1)
12207	MOVOU X3, -16(AX)(R9*1)
12208
12209memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
12210	MOVQ R8, AX
12211	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm8B
12212
12213memmove_long_match_emit_encodeSnappyBlockAsm8B:
12214	LEAQ (AX)(R9*1), R8
12215
	// Long copy of R9 bytes from DI to AX: head/tail saved in X0-X3 and
	// stored last; R12 = 64 - (AX mod 32) makes the loop stores 32-byte
	// aligned; R11 = length / 32.
12216	// genMemMoveLong
12217	MOVOU (DI), X0
12218	MOVOU 16(DI), X1
12219	MOVOU -32(DI)(R9*1), X2
12220	MOVOU -16(DI)(R9*1), X3
12221	MOVQ  R9, R11
12222	SHRQ  $0x05, R11
12223	MOVQ  AX, R10
12224	ANDL  $0x0000001f, R10
12225	MOVQ  $0x00000040, R12
12226	SUBQ  R10, R12
12227	DECQ  R11
12228	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
12229	LEAQ  -32(DI)(R12*1), R10
12230	LEAQ  -32(AX)(R12*1), R13
12231
12232emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
12233	MOVOU (R10), X4
12234	MOVOU 16(R10), X5
12235	MOVOA X4, (R13)
12236	MOVOA X5, 16(R13)
12237	ADDQ  $0x20, R13
12238	ADDQ  $0x20, R10
12239	ADDQ  $0x20, R12
12240	DECQ  R11
12241	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
12242
12243emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
12244	MOVOU -32(DI)(R12*1), X4
12245	MOVOU -16(DI)(R12*1), X5
12246	MOVOA X4, -32(AX)(R12*1)
12247	MOVOA X5, -16(AX)(R12*1)
12248	ADDQ  $0x20, R12
12249	CMPQ  R9, R12
12250	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
12251	MOVOU X0, (AX)
12252	MOVOU X1, 16(AX)
12253	MOVOU X2, -32(AX)(R9*1)
12254	MOVOU X3, -16(AX)(R9*1)
12255	MOVQ  R8, AX
12256
	// Match with no pending literal. Also re-entered from match_nolit_dst_ok
	// when the position right after a copy matches again.
	// Stores repeat = CX - SI, skips the 4 verified bytes and extends forwards.
12257emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
12258match_nolit_loop_encodeSnappyBlockAsm8B:
12259	MOVL CX, DI
12260	SUBL SI, DI
12261	MOVL DI, 16(SP)
12262	ADDL $0x04, CX
12263	ADDL $0x04, SI
12264	MOVQ src_len+32(FP), DI
12265	SUBL CX, DI
12266	LEAQ (DX)(CX*1), R8
12267	LEAQ (DX)(SI*1), SI
12268
	// R10 = number of equal bytes at R8 and SI, at most DI.
12269	// matchLen
12270	XORL R10, R10
12271	CMPL DI, $0x08
12272	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm8B
12273
12274matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B:
12275	MOVQ  (R8)(R10*1), R9
12276	XORQ  (SI)(R10*1), R9
12277	TESTQ R9, R9
12278	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm8B
12279	BSFQ  R9, R9
12280	SARQ  $0x03, R9
12281	LEAL  (R10)(R9*1), R10
12282	JMP   match_nolit_end_encodeSnappyBlockAsm8B
12283
12284matchlen_loop_match_nolit_encodeSnappyBlockAsm8B:
12285	LEAL -8(DI), DI
12286	LEAL 8(R10), R10
12287	CMPL DI, $0x08
12288	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B
12289
12290matchlen_single_match_nolit_encodeSnappyBlockAsm8B:
12291	TESTL DI, DI
12292	JZ    match_nolit_end_encodeSnappyBlockAsm8B
12293
12294matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B:
12295	MOVB (R8)(R10*1), R9
12296	CMPB (SI)(R10*1), R9
12297	JNE  match_nolit_end_encodeSnappyBlockAsm8B
12298	LEAL 1(R10), R10
12299	DECL DI
12300	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B
12301
	// CX = end of match; SI = offset; R10 = total match length (extension + 4);
	// nextEmit = CX.
12302match_nolit_end_encodeSnappyBlockAsm8B:
12303	ADDL R10, CX
12304	MOVL 16(SP), SI
12305	ADDL $0x04, R10
12306	MOVL CX, 12(SP)
12307
	// Emit a copy of R10 bytes at offset SI; same encoding as the repeat path.
12308	// emitCopy
12309two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
12310	CMPL R10, $0x40
12311	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
12312	MOVB $0xee, (AX)
12313	MOVW SI, 1(AX)
12314	LEAL -60(R10), R10
12315	ADDQ $0x03, AX
12316	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm8B
12317
12318two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
12319	CMPL R10, $0x0c
12320	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
12321	MOVB $0x01, BL
12322	LEAL -16(BX)(R10*4), R10
12323	MOVB SI, 1(AX)
12324	SHRL $0x08, SI
12325	SHLL $0x05, SI
12326	ORL  SI, R10
12327	MOVB R10, (AX)
12328	ADDQ $0x02, AX
12329	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm8B
12330
12331emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
12332	MOVB $0x02, BL
12333	LEAL -4(BX)(R10*4), R10
12334	MOVB R10, (AX)
12335	MOVW SI, 1(AX)
12336	ADDQ $0x03, AX
12337
	// After the copy: finish if CX >= 8(SP); return 0 if AX reached the dst
	// limit. DI = 8 bytes at src[CX-2] for the table updates below.
12338match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
12339	CMPL CX, 8(SP)
12340	JGE  emit_remainder_encodeSnappyBlockAsm8B
12341	MOVQ -2(DX)(CX*1), DI
12342	CMPQ AX, (SP)
12343	JL   match_nolit_dst_ok_encodeSnappyBlockAsm8B
12344	MOVQ $0x00000000, ret+48(FP)
12345	RET
12346
	// Record positions CX-2 and CX in the table (R8 = hash at CX-2,
	// SI = hash at CX). SI then becomes the previous entry for CX's hash; if
	// its 4 bytes equal those at CX, start another match immediately.
12347match_nolit_dst_ok_encodeSnappyBlockAsm8B:
12348	MOVQ  $0x9e3779b1, R9
12349	MOVQ  DI, R8
12350	SHRQ  $0x10, DI
12351	MOVQ  DI, SI
12352	SHLQ  $0x20, R8
12353	IMULQ R9, R8
12354	SHRQ  $0x38, R8
12355	SHLQ  $0x20, SI
12356	IMULQ R9, SI
12357	SHRQ  $0x38, SI
12358	LEAL  -2(CX), R9
12359	LEAQ  24(SP)(SI*4), R10
12360	MOVL  (R10), SI
12361	MOVL  R9, 24(SP)(R8*4)
12362	MOVL  CX, (R10)
12363	CMPL  (DX)(SI*1), DI
12364	JEQ   match_nolit_loop_encodeSnappyBlockAsm8B
12365	INCL  CX
12366	JMP   search_loop_encodeSnappyBlockAsm8B
12367
	// Search finished. Bound check for the final literal: AX + (src_len -
	// nextEmit) + 3 must stay below the limit at 0(SP); otherwise return 0.
12368emit_remainder_encodeSnappyBlockAsm8B:
12369	MOVQ src_len+32(FP), CX
12370	SUBL 12(SP), CX
12371	LEAQ 3(AX)(CX*1), CX
12372	CMPQ CX, (SP)
12373	JL   emit_remainder_ok_encodeSnappyBlockAsm8B
12374	MOVQ $0x00000000, ret+48(FP)
12375	RET
12376
	// Emit src[nextEmit:src_len] as the final literal (skipped when empty).
	// CX = literal source pointer, SI = literal length, DX = length - 1.
	// DX (the src base) is overwritten here and not reloaded.
12377emit_remainder_ok_encodeSnappyBlockAsm8B:
12378	MOVQ src_len+32(FP), CX
12379	MOVL 12(SP), BX
12380	CMPL BX, CX
12381	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
12382	MOVL CX, SI
12383	MOVL CX, 12(SP)
12384	LEAQ (DX)(BX*1), CX
12385	SUBL BX, SI
12386	LEAL -1(SI), DX
12387	CMPL DX, $0x3c
12388	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm8B
12389	CMPL DX, $0x00000100
12390	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm8B
12391	MOVB $0xf4, (AX)
12392	MOVW DX, 1(AX)
12393	ADDQ $0x03, AX
12394	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm8B
12395
12396two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
12397	MOVB $0xf0, (AX)
12398	MOVB DL, 1(AX)
12399	ADDQ $0x02, AX
12400	CMPL DX, $0x40
12401	JL   memmove_emit_remainder_encodeSnappyBlockAsm8B
12402	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm8B
12403
12404one_byte_emit_remainder_encodeSnappyBlockAsm8B:
12405	SHLB $0x02, DL
12406	MOVB DL, (AX)
12407	ADDQ $0x01, AX
12408
12409memmove_emit_remainder_encodeSnappyBlockAsm8B:
	// DX = dst pointer just past the literal; BX = length.
12410	LEAQ (AX)(SI*1), DX
12411	MOVL SI, BX
12412
	// Copy BX (<= 64) bytes from CX to AX using overlapping fixed-size moves.
12413	// genMemMoveShort
12414	CMPQ BX, $0x08
12415	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8
12416	CMPQ BX, $0x10
12417	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
12418	CMPQ BX, $0x20
12419	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
12420	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64
12421
12422emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8:
12423	MOVQ (CX), SI
12424	MOVQ SI, (AX)
12425	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
12426
12427emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
12428	MOVQ (CX), SI
12429	MOVQ -8(CX)(BX*1), CX
12430	MOVQ SI, (AX)
12431	MOVQ CX, -8(AX)(BX*1)
12432	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
12433
12434emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
12435	MOVOU (CX), X0
12436	MOVOU -16(CX)(BX*1), X1
12437	MOVOU X0, (AX)
12438	MOVOU X1, -16(AX)(BX*1)
12439	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
12440
12441emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
12442	MOVOU (CX), X0
12443	MOVOU 16(CX), X1
12444	MOVOU -32(CX)(BX*1), X2
12445	MOVOU -16(CX)(BX*1), X3
12446	MOVOU X0, (AX)
12447	MOVOU X1, 16(AX)
12448	MOVOU X2, -32(AX)(BX*1)
12449	MOVOU X3, -16(AX)(BX*1)
12450
12451memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
12452	MOVQ DX, AX
12453	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
12454
12455memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
12456	LEAQ (AX)(SI*1), DX
12457	MOVL SI, BX
12458
	// Long copy of BX bytes from CX to AX: head/tail saved in X0-X3 and
	// stored last; R8 = 64 - (AX mod 32) makes the loop stores 32-byte
	// aligned; DI = length / 32.
12459	// genMemMoveLong
12460	MOVOU (CX), X0
12461	MOVOU 16(CX), X1
12462	MOVOU -32(CX)(BX*1), X2
12463	MOVOU -16(CX)(BX*1), X3
12464	MOVQ  BX, DI
12465	SHRQ  $0x05, DI
12466	MOVQ  AX, SI
12467	ANDL  $0x0000001f, SI
12468	MOVQ  $0x00000040, R8
12469	SUBQ  SI, R8
12470	DECQ  DI
12471	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
12472	LEAQ  -32(CX)(R8*1), SI
12473	LEAQ  -32(AX)(R8*1), R9
12474
12475emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
12476	MOVOU (SI), X4
12477	MOVOU 16(SI), X5
12478	MOVOA X4, (R9)
12479	MOVOA X5, 16(R9)
12480	ADDQ  $0x20, R9
12481	ADDQ  $0x20, SI
12482	ADDQ  $0x20, R8
12483	DECQ  DI
12484	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back
12485
12486emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
12487	MOVOU -32(CX)(R8*1), X4
12488	MOVOU -16(CX)(R8*1), X5
12489	MOVOA X4, -32(AX)(R8*1)
12490	MOVOA X5, -16(AX)(R8*1)
12491	ADDQ  $0x20, R8
12492	CMPQ  BX, R8
12493	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
12494	MOVOU X0, (AX)
12495	MOVOU X1, 16(AX)
12496	MOVOU X2, -32(AX)(BX*1)
12497	MOVOU X3, -16(AX)(BX*1)
12498	MOVQ  DX, AX
12499
	// Return the number of bytes written: AX - dst_base.
12500emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
12501	MOVQ dst_base+0(FP), CX
12502	SUBQ CX, AX
12503	MOVQ AX, ret+48(FP)
12504	RET
12505
12506// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
12507// Requires: SSE2
12508TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56
12509	MOVQ dst_base+0(FP), AX
12510	MOVQ $0x00000a00, CX
12511	LEAQ 24(SP), DX
12512	PXOR X0, X0
12513
12514zero_loop_encodeSnappyBetterBlockAsm:
12515	MOVOU X0, (DX)
12516	MOVOU X0, 16(DX)
12517	MOVOU X0, 32(DX)
12518	MOVOU X0, 48(DX)
12519	MOVOU X0, 64(DX)
12520	MOVOU X0, 80(DX)
12521	MOVOU X0, 96(DX)
12522	MOVOU X0, 112(DX)
12523	ADDQ  $0x80, DX
12524	DECQ  CX
12525	JNZ   zero_loop_encodeSnappyBetterBlockAsm
12526	MOVL  $0x00000000, 12(SP)
12527	MOVQ  src_len+32(FP), CX
12528	LEAQ  -9(CX), DX
12529	LEAQ  -8(CX), SI
12530	MOVL  SI, 8(SP)
12531	SHRQ  $0x05, CX
12532	SUBL  CX, DX
12533	LEAQ  (AX)(DX*1), DX
12534	MOVQ  DX, (SP)
12535	MOVL  $0x00000001, CX
12536	MOVL  $0x00000000, 16(SP)
12537	MOVQ  src_base+24(FP), DX
12538
12539search_loop_encodeSnappyBetterBlockAsm:
12540	MOVL CX, SI
12541	SUBL 12(SP), SI
12542	SHRL $0x07, SI
12543	CMPL SI, $0x63
12544	JLE  check_maxskip_ok_encodeSnappyBetterBlockAsm
12545	LEAL 100(CX), SI
12546	JMP  check_maxskip_cont_encodeSnappyBetterBlockAsm
12547
12548check_maxskip_ok_encodeSnappyBetterBlockAsm:
12549	LEAL 1(CX)(SI*1), SI
12550
12551check_maxskip_cont_encodeSnappyBetterBlockAsm:
12552	CMPL  SI, 8(SP)
12553	JGE   emit_remainder_encodeSnappyBetterBlockAsm
12554	MOVQ  (DX)(CX*1), DI
12555	MOVL  SI, 20(SP)
12556	MOVQ  $0x00cf1bbcdcbfa563, R9
12557	MOVQ  $0x9e3779b1, SI
12558	MOVQ  DI, R10
12559	MOVQ  DI, R11
12560	SHLQ  $0x08, R10
12561	IMULQ R9, R10
12562	SHRQ  $0x30, R10
12563	SHLQ  $0x20, R11
12564	IMULQ SI, R11
12565	SHRQ  $0x32, R11
12566	MOVL  24(SP)(R10*4), SI
12567	MOVL  262168(SP)(R11*4), R8
12568	MOVL  CX, 24(SP)(R10*4)
12569	MOVL  CX, 262168(SP)(R11*4)
12570	CMPL  (DX)(SI*1), DI
12571	JEQ   candidate_match_encodeSnappyBetterBlockAsm
12572	CMPL  (DX)(R8*1), DI
12573	JEQ   candidateS_match_encodeSnappyBetterBlockAsm
12574	MOVL  20(SP), CX
12575	JMP   search_loop_encodeSnappyBetterBlockAsm
12576
12577candidateS_match_encodeSnappyBetterBlockAsm:
12578	SHRQ  $0x08, DI
12579	MOVQ  DI, R10
12580	SHLQ  $0x08, R10
12581	IMULQ R9, R10
12582	SHRQ  $0x30, R10
12583	MOVL  24(SP)(R10*4), SI
12584	INCL  CX
12585	MOVL  CX, 24(SP)(R10*4)
12586	CMPL  (DX)(SI*1), DI
12587	JEQ   candidate_match_encodeSnappyBetterBlockAsm
12588	DECL  CX
12589	MOVL  R8, SI
12590
12591candidate_match_encodeSnappyBetterBlockAsm:
12592	MOVL  12(SP), DI
12593	TESTL SI, SI
12594	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm
12595
12596match_extend_back_loop_encodeSnappyBetterBlockAsm:
12597	CMPL CX, DI
12598	JLE  match_extend_back_end_encodeSnappyBetterBlockAsm
12599	MOVB -1(DX)(SI*1), BL
12600	MOVB -1(DX)(CX*1), R8
12601	CMPB BL, R8
12602	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm
12603	LEAL -1(CX), CX
12604	DECL SI
12605	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm
12606	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm
12607
12608match_extend_back_end_encodeSnappyBetterBlockAsm:
12609	MOVL CX, DI
12610	SUBL 12(SP), DI
12611	LEAQ 5(AX)(DI*1), DI
12612	CMPQ DI, (SP)
12613	JL   match_dst_size_check_encodeSnappyBetterBlockAsm
12614	MOVQ $0x00000000, ret+48(FP)
12615	RET
12616
12617match_dst_size_check_encodeSnappyBetterBlockAsm:
12618	MOVL CX, DI
12619	ADDL $0x04, CX
12620	ADDL $0x04, SI
12621	MOVQ src_len+32(FP), R8
12622	SUBL CX, R8
12623	LEAQ (DX)(CX*1), R9
12624	LEAQ (DX)(SI*1), R10
12625
12626	// matchLen
12627	XORL R12, R12
12628	CMPL R8, $0x08
12629	JL   matchlen_single_match_nolit_encodeSnappyBetterBlockAsm
12630
12631matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm:
12632	MOVQ  (R9)(R12*1), R11
12633	XORQ  (R10)(R12*1), R11
12634	TESTQ R11, R11
12635	JZ    matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm
12636	BSFQ  R11, R11
12637	SARQ  $0x03, R11
12638	LEAL  (R12)(R11*1), R12
12639	JMP   match_nolit_end_encodeSnappyBetterBlockAsm
12640
12641matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm:
12642	LEAL -8(R8), R8
12643	LEAL 8(R12), R12
12644	CMPL R8, $0x08
12645	JGE  matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm
12646
12647matchlen_single_match_nolit_encodeSnappyBetterBlockAsm:
12648	TESTL R8, R8
12649	JZ    match_nolit_end_encodeSnappyBetterBlockAsm
12650
12651matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm:
12652	MOVB (R9)(R12*1), R11
12653	CMPB (R10)(R12*1), R11
12654	JNE  match_nolit_end_encodeSnappyBetterBlockAsm
12655	LEAL 1(R12), R12
12656	DECL R8
12657	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm
12658
12659match_nolit_end_encodeSnappyBetterBlockAsm:
12660	MOVL CX, R8
12661	SUBL SI, R8
12662
12663	// Check if repeat
12664	CMPL R12, $0x01
12665	JG   match_length_ok_encodeSnappyBetterBlockAsm
12666	CMPL R8, $0x0000ffff
12667	JLE  match_length_ok_encodeSnappyBetterBlockAsm
12668	MOVL 20(SP), CX
12669	INCL CX
12670	JMP  search_loop_encodeSnappyBetterBlockAsm
12671
12672match_length_ok_encodeSnappyBetterBlockAsm:
12673	MOVL R8, 16(SP)
12674	MOVL 12(SP), SI
12675	CMPL SI, DI
12676	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
12677	MOVL DI, R9
12678	MOVL DI, 12(SP)
12679	LEAQ (DX)(SI*1), R10
12680	SUBL SI, R9
12681	LEAL -1(R9), SI
12682	CMPL SI, $0x3c
12683	JLT  one_byte_match_emit_encodeSnappyBetterBlockAsm
12684	CMPL SI, $0x00000100
12685	JLT  two_bytes_match_emit_encodeSnappyBetterBlockAsm
12686	CMPL SI, $0x00010000
12687	JLT  three_bytes_match_emit_encodeSnappyBetterBlockAsm
12688	CMPL SI, $0x01000000
12689	JLT  four_bytes_match_emit_encodeSnappyBetterBlockAsm
12690	MOVB $0xfc, (AX)
12691	MOVL SI, 1(AX)
12692	ADDQ $0x05, AX
12693	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm
12694
12695four_bytes_match_emit_encodeSnappyBetterBlockAsm:
12696	MOVL SI, R11
12697	SHRL $0x10, R11
12698	MOVB $0xf8, (AX)
12699	MOVW SI, 1(AX)
12700	MOVB R11, 3(AX)
12701	ADDQ $0x04, AX
12702	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm
12703
12704three_bytes_match_emit_encodeSnappyBetterBlockAsm:
12705	MOVB $0xf4, (AX)
12706	MOVW SI, 1(AX)
12707	ADDQ $0x03, AX
12708	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm
12709
12710two_bytes_match_emit_encodeSnappyBetterBlockAsm:
12711	MOVB $0xf0, (AX)
12712	MOVB SI, 1(AX)
12713	ADDQ $0x02, AX
12714	CMPL SI, $0x40
12715	JL   memmove_match_emit_encodeSnappyBetterBlockAsm
12716	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm
12717
12718one_byte_match_emit_encodeSnappyBetterBlockAsm:
12719	SHLB $0x02, SI
12720	MOVB SI, (AX)
12721	ADDQ $0x01, AX
12722
12723memmove_match_emit_encodeSnappyBetterBlockAsm:
12724	LEAQ (AX)(R9*1), SI
12725
12726	// genMemMoveShort
12727	CMPQ R9, $0x08
12728	JLE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
12729	CMPQ R9, $0x10
12730	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
12731	CMPQ R9, $0x20
12732	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
12733	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64
12734
12735emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8:
12736	MOVQ (R10), R11
12737	MOVQ R11, (AX)
12738	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
12739
12740emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16:
12741	MOVQ (R10), R11
12742	MOVQ -8(R10)(R9*1), R10
12743	MOVQ R11, (AX)
12744	MOVQ R10, -8(AX)(R9*1)
12745	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
12746
12747emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32:
12748	MOVOU (R10), X0
12749	MOVOU -16(R10)(R9*1), X1
12750	MOVOU X0, (AX)
12751	MOVOU X1, -16(AX)(R9*1)
12752	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
12753
12754emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64:
12755	MOVOU (R10), X0
12756	MOVOU 16(R10), X1
12757	MOVOU -32(R10)(R9*1), X2
12758	MOVOU -16(R10)(R9*1), X3
12759	MOVOU X0, (AX)
12760	MOVOU X1, 16(AX)
12761	MOVOU X2, -32(AX)(R9*1)
12762	MOVOU X3, -16(AX)(R9*1)
12763
12764memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
12765	MOVQ SI, AX
12766	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
12767
12768memmove_long_match_emit_encodeSnappyBetterBlockAsm:
12769	LEAQ (AX)(R9*1), SI
12770
12771	// genMemMoveLong
12772	MOVOU (R10), X0
12773	MOVOU 16(R10), X1
12774	MOVOU -32(R10)(R9*1), X2
12775	MOVOU -16(R10)(R9*1), X3
12776	MOVQ  R9, R13
12777	SHRQ  $0x05, R13
12778	MOVQ  AX, R11
12779	ANDL  $0x0000001f, R11
12780	MOVQ  $0x00000040, R14
12781	SUBQ  R11, R14
12782	DECQ  R13
12783	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
12784	LEAQ  -32(R10)(R14*1), R11
12785	LEAQ  -32(AX)(R14*1), R15
12786
12787emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back:
12788	MOVOU (R11), X4
12789	MOVOU 16(R11), X5
12790	MOVOA X4, (R15)
12791	MOVOA X5, 16(R15)
12792	ADDQ  $0x20, R15
12793	ADDQ  $0x20, R11
12794	ADDQ  $0x20, R14
12795	DECQ  R13
12796	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back
12797
12798emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
12799	MOVOU -32(R10)(R14*1), X4
12800	MOVOU -16(R10)(R14*1), X5
12801	MOVOA X4, -32(AX)(R14*1)
12802	MOVOA X5, -16(AX)(R14*1)
12803	ADDQ  $0x20, R14
12804	CMPQ  R9, R14
12805	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
12806	MOVOU X0, (AX)
12807	MOVOU X1, 16(AX)
12808	MOVOU X2, -32(AX)(R9*1)
12809	MOVOU X3, -16(AX)(R9*1)
12810	MOVQ  SI, AX
12811
12812emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
12813	ADDL R12, CX
12814	ADDL $0x04, R12
12815	MOVL CX, 12(SP)
12816
12817	// emitCopy
12818	CMPL R8, $0x00010000
12819	JL   two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
12820
12821four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm:
12822	CMPL R12, $0x40
12823	JLE  four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
12824	MOVB $0xff, (AX)
12825	MOVL R8, 1(AX)
12826	LEAL -64(R12), R12
12827	ADDQ $0x05, AX
12828	CMPL R12, $0x04
12829	JL   four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
12830	JMP  four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm
12831
12832four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
12833	TESTL R12, R12
12834	JZ    match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
12835	MOVB  $0x03, BL
12836	LEAL  -4(BX)(R12*4), R12
12837	MOVB  R12, (AX)
12838	MOVL  R8, 1(AX)
12839	ADDQ  $0x05, AX
12840	JMP   match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
12841
12842two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm:
12843	CMPL R12, $0x40
12844	JLE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
12845	MOVB $0xee, (AX)
12846	MOVW R8, 1(AX)
12847	LEAL -60(R12), R12
12848	ADDQ $0x03, AX
12849	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
12850
12851two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm:
12852	CMPL R12, $0x0c
12853	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
12854	CMPL R8, $0x00000800
12855	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
12856	MOVB $0x01, BL
12857	LEAL -16(BX)(R12*4), R12
12858	MOVB R8, 1(AX)
12859	SHRL $0x08, R8
12860	SHLL $0x05, R8
12861	ORL  R8, R12
12862	MOVB R12, (AX)
12863	ADDQ $0x02, AX
12864	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
12865
12866emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm:
12867	MOVB $0x02, BL
12868	LEAL -4(BX)(R12*4), R12
12869	MOVB R12, (AX)
12870	MOVW R8, 1(AX)
12871	ADDQ $0x03, AX
12872
12873match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
12874	CMPL CX, 8(SP)
12875	JGE  emit_remainder_encodeSnappyBetterBlockAsm
12876	CMPQ AX, (SP)
12877	JL   match_nolit_dst_ok_encodeSnappyBetterBlockAsm
12878	MOVQ $0x00000000, ret+48(FP)
12879	RET
12880
12881match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
12882	MOVQ  $0x00cf1bbcdcbfa563, SI
12883	MOVQ  $0x9e3779b1, R8
12884	INCL  DI
12885	MOVQ  (DX)(DI*1), R9
12886	MOVQ  R9, R10
12887	MOVQ  R9, R11
12888	MOVQ  R9, R12
12889	SHRQ  $0x08, R11
12890	MOVQ  R11, R13
12891	SHRQ  $0x10, R12
12892	LEAL  1(DI), R14
12893	LEAL  2(DI), R15
12894	MOVQ  -2(DX)(CX*1), R9
12895	SHLQ  $0x08, R10
12896	IMULQ SI, R10
12897	SHRQ  $0x30, R10
12898	SHLQ  $0x08, R13
12899	IMULQ SI, R13
12900	SHRQ  $0x30, R13
12901	SHLQ  $0x20, R11
12902	IMULQ R8, R11
12903	SHRQ  $0x32, R11
12904	SHLQ  $0x20, R12
12905	IMULQ R8, R12
12906	SHRQ  $0x32, R12
12907	MOVL  DI, 24(SP)(R10*4)
12908	MOVL  R14, 24(SP)(R13*4)
12909	MOVL  R14, 262168(SP)(R11*4)
12910	MOVL  R15, 262168(SP)(R12*4)
12911	MOVQ  R9, R10
12912	MOVQ  R9, R11
12913	SHRQ  $0x08, R11
12914	MOVQ  R11, R13
12915	LEAL  -2(CX), R9
12916	LEAL  -1(CX), DI
12917	SHLQ  $0x08, R10
12918	IMULQ SI, R10
12919	SHRQ  $0x30, R10
12920	SHLQ  $0x20, R11
12921	IMULQ R8, R11
12922	SHRQ  $0x32, R11
12923	SHLQ  $0x08, R13
12924	IMULQ SI, R13
12925	SHRQ  $0x30, R13
12926	MOVL  R9, 24(SP)(R10*4)
12927	MOVL  DI, 262168(SP)(R11*4)
12928	MOVL  DI, 24(SP)(R13*4)
12929	JMP   search_loop_encodeSnappyBetterBlockAsm
12930
12931emit_remainder_encodeSnappyBetterBlockAsm:
12932	MOVQ src_len+32(FP), CX
12933	SUBL 12(SP), CX
12934	LEAQ 5(AX)(CX*1), CX
12935	CMPQ CX, (SP)
12936	JL   emit_remainder_ok_encodeSnappyBetterBlockAsm
12937	MOVQ $0x00000000, ret+48(FP)
12938	RET
12939
12940emit_remainder_ok_encodeSnappyBetterBlockAsm:
12941	MOVQ src_len+32(FP), CX
12942	MOVL 12(SP), BX
12943	CMPL BX, CX
12944	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
12945	MOVL CX, SI
12946	MOVL CX, 12(SP)
12947	LEAQ (DX)(BX*1), CX
12948	SUBL BX, SI
12949	LEAL -1(SI), DX
12950	CMPL DX, $0x3c
12951	JLT  one_byte_emit_remainder_encodeSnappyBetterBlockAsm
12952	CMPL DX, $0x00000100
12953	JLT  two_bytes_emit_remainder_encodeSnappyBetterBlockAsm
12954	CMPL DX, $0x00010000
12955	JLT  three_bytes_emit_remainder_encodeSnappyBetterBlockAsm
12956	CMPL DX, $0x01000000
12957	JLT  four_bytes_emit_remainder_encodeSnappyBetterBlockAsm
12958	MOVB $0xfc, (AX)
12959	MOVL DX, 1(AX)
12960	ADDQ $0x05, AX
12961	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
12962
12963four_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
12964	MOVL DX, BX
12965	SHRL $0x10, BX
12966	MOVB $0xf8, (AX)
12967	MOVW DX, 1(AX)
12968	MOVB BL, 3(AX)
12969	ADDQ $0x04, AX
12970	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
12971
12972three_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
12973	MOVB $0xf4, (AX)
12974	MOVW DX, 1(AX)
12975	ADDQ $0x03, AX
12976	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
12977
12978two_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
12979	MOVB $0xf0, (AX)
12980	MOVB DL, 1(AX)
12981	ADDQ $0x02, AX
12982	CMPL DX, $0x40
12983	JL   memmove_emit_remainder_encodeSnappyBetterBlockAsm
12984	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
12985
12986one_byte_emit_remainder_encodeSnappyBetterBlockAsm:
12987	SHLB $0x02, DL
12988	MOVB DL, (AX)
12989	ADDQ $0x01, AX
12990
12991memmove_emit_remainder_encodeSnappyBetterBlockAsm:
12992	LEAQ (AX)(SI*1), DX
12993	MOVL SI, BX
12994
12995	// genMemMoveShort
12996	CMPQ BX, $0x08
12997	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8
12998	CMPQ BX, $0x10
12999	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
13000	CMPQ BX, $0x20
13001	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
13002	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64
13003
13004emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8:
13005	MOVQ (CX), SI
13006	MOVQ SI, (AX)
13007	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
13008
13009emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16:
13010	MOVQ (CX), SI
13011	MOVQ -8(CX)(BX*1), CX
13012	MOVQ SI, (AX)
13013	MOVQ CX, -8(AX)(BX*1)
13014	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
13015
13016emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32:
13017	MOVOU (CX), X0
13018	MOVOU -16(CX)(BX*1), X1
13019	MOVOU X0, (AX)
13020	MOVOU X1, -16(AX)(BX*1)
13021	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
13022
13023emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64:
13024	MOVOU (CX), X0
13025	MOVOU 16(CX), X1
13026	MOVOU -32(CX)(BX*1), X2
13027	MOVOU -16(CX)(BX*1), X3
13028	MOVOU X0, (AX)
13029	MOVOU X1, 16(AX)
13030	MOVOU X2, -32(AX)(BX*1)
13031	MOVOU X3, -16(AX)(BX*1)
13032
13033memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm:
13034	MOVQ DX, AX
13035	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
13036
13037memmove_long_emit_remainder_encodeSnappyBetterBlockAsm:
13038	LEAQ (AX)(SI*1), DX
13039	MOVL SI, BX
13040
13041	// genMemMoveLong
13042	MOVOU (CX), X0
13043	MOVOU 16(CX), X1
13044	MOVOU -32(CX)(BX*1), X2
13045	MOVOU -16(CX)(BX*1), X3
13046	MOVQ  BX, DI
13047	SHRQ  $0x05, DI
13048	MOVQ  AX, SI
13049	ANDL  $0x0000001f, SI
13050	MOVQ  $0x00000040, R8
13051	SUBQ  SI, R8
13052	DECQ  DI
13053	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
13054	LEAQ  -32(CX)(R8*1), SI
13055	LEAQ  -32(AX)(R8*1), R9
13056
13057emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back:
13058	MOVOU (SI), X4
13059	MOVOU 16(SI), X5
13060	MOVOA X4, (R9)
13061	MOVOA X5, 16(R9)
13062	ADDQ  $0x20, R9
13063	ADDQ  $0x20, SI
13064	ADDQ  $0x20, R8
13065	DECQ  DI
13066	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back
13067
13068emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
13069	MOVOU -32(CX)(R8*1), X4
13070	MOVOU -16(CX)(R8*1), X5
13071	MOVOA X4, -32(AX)(R8*1)
13072	MOVOA X5, -16(AX)(R8*1)
13073	ADDQ  $0x20, R8
13074	CMPQ  BX, R8
13075	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
13076	MOVOU X0, (AX)
13077	MOVOU X1, 16(AX)
13078	MOVOU X2, -32(AX)(BX*1)
13079	MOVOU X3, -16(AX)(BX*1)
13080	MOVQ  DX, AX
13081
13082emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm:
13083	MOVQ dst_base+0(FP), CX
13084	SUBQ CX, AX
13085	MOVQ AX, ret+48(FP)
13086	RET
13087
// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst and returns the number of bytes written, or 0 when
// the output pointer would reach the limit kept in 0(SP). Matches are found
// through two tables of earlier src positions: a "long" table whose hash
// covers 7 input bytes and a "short" table whose hash covers 4 input bytes.
//
// NOTE(review): nothing in this routine checks len(dst) or a minimum
// len(src); the Go caller presumably guarantees both - confirm. Offsets and
// literal lengths are only ever written as 8/16-bit values, so src is
// presumably limited to 64 KiB - confirm against the caller.
//
// Stack frame (327704 bytes):
//   0(SP)       8 bytes       dst limit: dst_base + len(src) - 9 - len(src)>>5
//   8(SP)       4 bytes       search limit: len(src) - 8
//   12(SP)      4 bytes       start of the literal run not yet emitted
//   16(SP)      4 bytes       offset of the last match (written, never read here)
//   20(SP)      4 bytes       position to resume searching at after a miss
//   24(SP)      262144 bytes  long table, 65536 x uint32 (16-bit hash)
//   262168(SP)  65536 bytes   short table, 16384 x uint32 (14-bit hash)
//
// Registers in the main loop: AX = dst write pointer, DX = src base,
// CX = current src position, SI = candidate (earlier) position.
TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56
	MOVQ dst_base+0(FP), AX
	// 0xa00 iterations x 128 bytes = 327680 bytes: clears both tables.
	MOVQ $0x00000a00, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm64K:
	// Store 8 x 16 zero bytes per iteration.
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm64K
	// Initialise the scalar state: literal start = 0, search limit,
	// dst limit, starting position CX = 1 and last offset = 0.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm64K:
	// SI = CX + 1 + (CX - literal start)>>7, so the step grows as the
	// pending literal run gets longer. Give up searching once SI reaches
	// the search limit.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x07, SI
	LEAL  1(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBetterBlockAsm64K
	// DI = 8 bytes at src[CX]; remember SI as the resume position.
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	// R10 = long hash: (DI << 8) * R9, top 16 bits.
	// R11 = short hash: (DI << 32) * SI, top 14 bits.
	MOVQ  $0x00cf1bbcdcbfa563, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x32, R11
	// Fetch both candidates (SI = long, R8 = short) and record CX in
	// both table slots.
	MOVL  24(SP)(R10*4), SI
	MOVL  262168(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 262168(SP)(R11*4)
	// Accept a candidate when its first 4 bytes equal the low 4 bytes of
	// DI; the long candidate is tried first.
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm64K
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeSnappyBetterBlockAsm64K
	// No match: continue at the saved resume position.
	MOVL  20(SP), CX
	JMP   search_loop_encodeSnappyBetterBlockAsm64K

candidateS_match_encodeSnappyBetterBlockAsm64K:
	// The short candidate matched. Before using it, look up the long table
	// for position CX+1 (DI >> 8 is the data starting at src[CX+1]) and
	// record CX+1 there. If that long candidate matches 4 bytes, use it
	// with CX advanced by one; otherwise restore CX and use the short
	// candidate saved in R8.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x08, R10
	IMULQ R9, R10
	SHRQ  $0x30, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm64K
	DECL  CX
	MOVL  R8, SI

candidate_match_encodeSnappyBetterBlockAsm64K:
	// DI = start of the pending literal run. A candidate at position 0
	// cannot be extended backwards.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm64K

match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
	// Move CX and SI back one byte at a time while the preceding bytes are
	// equal, CX stays above the literal start and SI stays above 0.
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBetterBlockAsm64K
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm64K
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm64K
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm64K

match_extend_back_end_encodeSnappyBetterBlockAsm64K:
	// Return 0 unless AX + pending literal length + 3 is below the dst
	// limit.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm64K:
	// DI = match start. CX and SI advance by 4 before the length scan;
	// R8 = bytes left in src after CX, R9/R10 = pointers to compare.
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	// R12 = number of equal bytes at R9/R10, counted 8 at a time while at
	// least 8 bytes remain, then byte by byte.
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K

matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
	// XOR of the two 8-byte words is zero when all 8 bytes match. Otherwise
	// the index of the lowest set bit, divided by 8, is the number of
	// leading equal bytes.
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeSnappyBetterBlockAsm64K

matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K:
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K

matchlen_single_match_nolit_encodeSnappyBetterBlockAsm64K:
	TESTL R8, R8
	JZ    match_nolit_end_encodeSnappyBetterBlockAsm64K

matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm64K
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm64K

match_nolit_end_encodeSnappyBetterBlockAsm64K:
	// R8 = match offset (CX - SI).
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	// In this variant the offset is only recorded in 16(SP); no comparison
	// against the previous offset is made.
	MOVL R8, 16(SP)
	// Emit the pending literals src[12(SP):DI], if any.
	// R10 = source pointer, R9 = literal length, SI = length - 1.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Header size depends on length-1: below 60 -> 1 byte, below 256 ->
	// 2 bytes, otherwise 3 bytes (0xf4 followed by a 16-bit length-1).
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBetterBlockAsm64K
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm64K

two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
	// 2-byte header: 0xf0 followed by length-1. Runs with length-1 below 64
	// use the short copy, longer ones the long copy.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeSnappyBetterBlockAsm64K
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm64K

one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
	// 1-byte header: (length-1) << 2.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm64K:
	// SI = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Copies R9 (at most 64) bytes from R10 to AX using overlapping
	// head/tail moves chosen by size class.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
	// Always moves a full 8 bytes, so for lengths below 8 it reads and
	// writes past the end of the literal.
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
	// First 8 and last 8 bytes (overlapping when length < 16).
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
	// First 16 and last 16 bytes (overlapping when length < 32).
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
	// First 32 and last 32 bytes (overlapping when length < 64).
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K

memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
	// SI = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// The first and last 32 source bytes are kept in X0-X3 and stored
	// unaligned at the very end. The middle is copied in 32-byte chunks
	// starting at offset R14-32, where R14 = 64 - (AX & 31), which makes
	// the destination of the MOVOA stores 32-byte aligned.
	// NOTE(review): DECQ leaves CF untouched, so the JA/JNA below also
	// depend on the carry left by the preceding SUBQ/ADDQ - confirm against
	// the generator before reordering anything here.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while R9 (length) >= R14 (next offset).
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	// Finish with the saved head and tail blocks.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
	// CX = position just past the match; R12 = full match length
	// (including the 4 bytes stepped over before matchLen). The next
	// literal run starts at CX.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
	// R8 = offset, R12 = remaining length. Only 16-bit offsets are written
	// in this variant.
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
	// While more than 64 bytes remain, write 0xee plus the 16-bit offset
	// (3 bytes) and subtract 60 from the length.
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
	// The 2-byte form is used only when length < 12 and offset < 2048:
	//   byte 0 = 1 + (length-4)<<2 | (offset>>8)<<5, byte 1 = offset & 0xff.
	// Only the low byte of R12 is stored, so the upper bits of BX left
	// untouched by MOVB do not matter.
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
	CMPL R8, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
	// 3-byte form: byte 0 = 2 + (length-1)<<2, then the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
	// Past the search limit: flush what is left. Otherwise return 0 if the
	// dst pointer has reached its limit.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeSnappyBetterBlockAsm64K
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
	// Re-index positions around the match before resuming the search.
	// With DI = match start + 1 (after INCL) and CX = end of match:
	//   long table:  DI, DI+1, CX-2, CX-1
	//   short table: DI+1, DI+2, CX-1
	// The data for DI+1 and DI+2 is obtained by shifting the 8-byte load at
	// src[DI] right by 8 and 16 bits; likewise CX-1 from the load at
	// src[CX-2].
	MOVQ  $0x00cf1bbcdcbfa563, SI
	MOVQ  $0x9e3779b1, R8
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x08, R13
	IMULQ SI, R13
	SHRQ  $0x30, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x32, R12
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 262168(SP)(R11*4)
	MOVL  R15, 262168(SP)(R12*4)
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x08, R10
	IMULQ SI, R10
	SHRQ  $0x30, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x32, R11
	SHLQ  $0x08, R13
	IMULQ SI, R13
	SHRQ  $0x30, R13
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 262168(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeSnappyBetterBlockAsm64K

emit_remainder_encodeSnappyBetterBlockAsm64K:
	// Return 0 unless AX + remaining literal length + 3 is below the dst
	// limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeSnappyBetterBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
	// Emit src[12(SP):len(src)] as one literal run, using the same header
	// and copy scheme as the in-loop literal emission. From here on
	// CX = source pointer, SI = literal length and DX = length - 1, so the
	// src base held in DX is no longer available.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
	// 3-byte header: 0xf4 followed by a 16-bit length-1.
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// 2-byte header: 0xf0 followed by length-1.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K

one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// 1-byte header: (length-1) << 2.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Copies BX (at most 64) bytes from CX to AX using overlapping
	// head/tail moves chosen by size class.
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8:
	// Always moves a full 8 bytes, so for lengths below 8 it reads and
	// writes past the end of the literal.
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
	// First 8 and last 8 bytes (overlapping when length < 16).
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
	// First 16 and last 16 bytes (overlapping when length < 32).
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
	// First 32 and last 32 bytes (overlapping when length < 64).
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Same scheme as the in-loop long copy: head and tail (32 bytes each)
	// are held in X0-X3 and stored last; the middle is copied in 32-byte
	// chunks with destination-aligned MOVOA stores, starting at offset
	// R8-32 where R8 = 64 - (AX & 31).
	// NOTE(review): DECQ leaves CF untouched, so the JA/JNA below also
	// depend on the carry left by the preceding SUBQ/ADDQ - confirm against
	// the generator before reordering anything here.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while BX (length) >= R8 (next offset).
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
	// Finish with the saved head and tail blocks.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET
13590
13591// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
13592// Requires: SSE2
13593TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56
13594	MOVQ dst_base+0(FP), AX
13595	MOVQ $0x00000280, CX
13596	LEAQ 24(SP), DX
13597	PXOR X0, X0
13598
13599zero_loop_encodeSnappyBetterBlockAsm12B:
13600	MOVOU X0, (DX)
13601	MOVOU X0, 16(DX)
13602	MOVOU X0, 32(DX)
13603	MOVOU X0, 48(DX)
13604	MOVOU X0, 64(DX)
13605	MOVOU X0, 80(DX)
13606	MOVOU X0, 96(DX)
13607	MOVOU X0, 112(DX)
13608	ADDQ  $0x80, DX
13609	DECQ  CX
13610	JNZ   zero_loop_encodeSnappyBetterBlockAsm12B
13611	MOVL  $0x00000000, 12(SP)
13612	MOVQ  src_len+32(FP), CX
13613	LEAQ  -9(CX), DX
13614	LEAQ  -8(CX), SI
13615	MOVL  SI, 8(SP)
13616	SHRQ  $0x05, CX
13617	SUBL  CX, DX
13618	LEAQ  (AX)(DX*1), DX
13619	MOVQ  DX, (SP)
13620	MOVL  $0x00000001, CX
13621	MOVL  $0x00000000, 16(SP)
13622	MOVQ  src_base+24(FP), DX
13623
13624search_loop_encodeSnappyBetterBlockAsm12B:
13625	MOVL  CX, SI
13626	SUBL  12(SP), SI
13627	SHRL  $0x06, SI
13628	LEAL  1(CX)(SI*1), SI
13629	CMPL  SI, 8(SP)
13630	JGE   emit_remainder_encodeSnappyBetterBlockAsm12B
13631	MOVQ  (DX)(CX*1), DI
13632	MOVL  SI, 20(SP)
13633	MOVQ  $0x0000cf1bbcdcbf9b, R9
13634	MOVQ  $0x9e3779b1, SI
13635	MOVQ  DI, R10
13636	MOVQ  DI, R11
13637	SHLQ  $0x10, R10
13638	IMULQ R9, R10
13639	SHRQ  $0x32, R10
13640	SHLQ  $0x20, R11
13641	IMULQ SI, R11
13642	SHRQ  $0x34, R11
13643	MOVL  24(SP)(R10*4), SI
13644	MOVL  65560(SP)(R11*4), R8
13645	MOVL  CX, 24(SP)(R10*4)
13646	MOVL  CX, 65560(SP)(R11*4)
13647	CMPL  (DX)(SI*1), DI
13648	JEQ   candidate_match_encodeSnappyBetterBlockAsm12B
13649	CMPL  (DX)(R8*1), DI
13650	JEQ   candidateS_match_encodeSnappyBetterBlockAsm12B
13651	MOVL  20(SP), CX
13652	JMP   search_loop_encodeSnappyBetterBlockAsm12B
13653
13654candidateS_match_encodeSnappyBetterBlockAsm12B:
13655	SHRQ  $0x08, DI
13656	MOVQ  DI, R10
13657	SHLQ  $0x10, R10
13658	IMULQ R9, R10
13659	SHRQ  $0x32, R10
13660	MOVL  24(SP)(R10*4), SI
13661	INCL  CX
13662	MOVL  CX, 24(SP)(R10*4)
13663	CMPL  (DX)(SI*1), DI
13664	JEQ   candidate_match_encodeSnappyBetterBlockAsm12B
13665	DECL  CX
13666	MOVL  R8, SI
13667
13668candidate_match_encodeSnappyBetterBlockAsm12B:
13669	MOVL  12(SP), DI
13670	TESTL SI, SI
13671	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm12B
13672
13673match_extend_back_loop_encodeSnappyBetterBlockAsm12B:
13674	CMPL CX, DI
13675	JLE  match_extend_back_end_encodeSnappyBetterBlockAsm12B
13676	MOVB -1(DX)(SI*1), BL
13677	MOVB -1(DX)(CX*1), R8
13678	CMPB BL, R8
13679	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm12B
13680	LEAL -1(CX), CX
13681	DECL SI
13682	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm12B
13683	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm12B
13684
13685match_extend_back_end_encodeSnappyBetterBlockAsm12B:
13686	MOVL CX, DI
13687	SUBL 12(SP), DI
13688	LEAQ 3(AX)(DI*1), DI
13689	CMPQ DI, (SP)
13690	JL   match_dst_size_check_encodeSnappyBetterBlockAsm12B
13691	MOVQ $0x00000000, ret+48(FP)
13692	RET
13693
13694match_dst_size_check_encodeSnappyBetterBlockAsm12B:
13695	MOVL CX, DI
13696	ADDL $0x04, CX
13697	ADDL $0x04, SI
13698	MOVQ src_len+32(FP), R8
13699	SUBL CX, R8
13700	LEAQ (DX)(CX*1), R9
13701	LEAQ (DX)(SI*1), R10
13702
13703	// matchLen
13704	XORL R12, R12
13705	CMPL R8, $0x08
13706	JL   matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B
13707
13708matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
13709	MOVQ  (R9)(R12*1), R11
13710	XORQ  (R10)(R12*1), R11
13711	TESTQ R11, R11
13712	JZ    matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B
13713	BSFQ  R11, R11
13714	SARQ  $0x03, R11
13715	LEAL  (R12)(R11*1), R12
13716	JMP   match_nolit_end_encodeSnappyBetterBlockAsm12B
13717
13718matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B:
13719	LEAL -8(R8), R8
13720	LEAL 8(R12), R12
13721	CMPL R8, $0x08
13722	JGE  matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B
13723
13724matchlen_single_match_nolit_encodeSnappyBetterBlockAsm12B:
13725	TESTL R8, R8
13726	JZ    match_nolit_end_encodeSnappyBetterBlockAsm12B
13727
13728matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
13729	MOVB (R9)(R12*1), R11
13730	CMPB (R10)(R12*1), R11
13731	JNE  match_nolit_end_encodeSnappyBetterBlockAsm12B
13732	LEAL 1(R12), R12
13733	DECL R8
13734	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm12B
13735
13736match_nolit_end_encodeSnappyBetterBlockAsm12B:
13737	MOVL CX, R8
13738	SUBL SI, R8
13739
13740	// Check if repeat
13741	MOVL R8, 16(SP)
13742	MOVL 12(SP), SI
13743	CMPL SI, DI
13744	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
13745	MOVL DI, R9
13746	MOVL DI, 12(SP)
13747	LEAQ (DX)(SI*1), R10
13748	SUBL SI, R9
13749	LEAL -1(R9), SI
13750	CMPL SI, $0x3c
13751	JLT  one_byte_match_emit_encodeSnappyBetterBlockAsm12B
13752	CMPL SI, $0x00000100
13753	JLT  two_bytes_match_emit_encodeSnappyBetterBlockAsm12B
13754	MOVB $0xf4, (AX)
13755	MOVW SI, 1(AX)
13756	ADDQ $0x03, AX
13757	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
13758
13759two_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
13760	MOVB $0xf0, (AX)
13761	MOVB SI, 1(AX)
13762	ADDQ $0x02, AX
13763	CMPL SI, $0x40
13764	JL   memmove_match_emit_encodeSnappyBetterBlockAsm12B
13765	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
13766
13767one_byte_match_emit_encodeSnappyBetterBlockAsm12B:
13768	SHLB $0x02, SI
13769	MOVB SI, (AX)
13770	ADDQ $0x01, AX
13771
13772memmove_match_emit_encodeSnappyBetterBlockAsm12B:
13773	LEAQ (AX)(R9*1), SI
13774
13775	// genMemMoveShort
13776	CMPQ R9, $0x08
13777	JLE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8
13778	CMPQ R9, $0x10
13779	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
13780	CMPQ R9, $0x20
13781	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
13782	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
13783
13784emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8:
13785	MOVQ (R10), R11
13786	MOVQ R11, (AX)
13787	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
13788
13789emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
13790	MOVQ (R10), R11
13791	MOVQ -8(R10)(R9*1), R10
13792	MOVQ R11, (AX)
13793	MOVQ R10, -8(AX)(R9*1)
13794	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
13795
13796emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
13797	MOVOU (R10), X0
13798	MOVOU -16(R10)(R9*1), X1
13799	MOVOU X0, (AX)
13800	MOVOU X1, -16(AX)(R9*1)
13801	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
13802
13803emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
13804	MOVOU (R10), X0
13805	MOVOU 16(R10), X1
13806	MOVOU -32(R10)(R9*1), X2
13807	MOVOU -16(R10)(R9*1), X3
13808	MOVOU X0, (AX)
13809	MOVOU X1, 16(AX)
13810	MOVOU X2, -32(AX)(R9*1)
13811	MOVOU X3, -16(AX)(R9*1)
13812
13813memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B:
13814	MOVQ SI, AX
13815	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
13816
13817memmove_long_match_emit_encodeSnappyBetterBlockAsm12B:
13818	LEAQ (AX)(R9*1), SI
13819
13820	// genMemMoveLong
13821	MOVOU (R10), X0
13822	MOVOU 16(R10), X1
13823	MOVOU -32(R10)(R9*1), X2
13824	MOVOU -16(R10)(R9*1), X3
13825	MOVQ  R9, R13
13826	SHRQ  $0x05, R13
13827	MOVQ  AX, R11
13828	ANDL  $0x0000001f, R11
13829	MOVQ  $0x00000040, R14
13830	SUBQ  R11, R14
13831	DECQ  R13
13832	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
13833	LEAQ  -32(R10)(R14*1), R11
13834	LEAQ  -32(AX)(R14*1), R15
13835
13836emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
13837	MOVOU (R11), X4
13838	MOVOU 16(R11), X5
13839	MOVOA X4, (R15)
13840	MOVOA X5, 16(R15)
13841	ADDQ  $0x20, R15
13842	ADDQ  $0x20, R11
13843	ADDQ  $0x20, R14
13844	DECQ  R13
13845	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
13846
13847emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
13848	MOVOU -32(R10)(R14*1), X4
13849	MOVOU -16(R10)(R14*1), X5
13850	MOVOA X4, -32(AX)(R14*1)
13851	MOVOA X5, -16(AX)(R14*1)
13852	ADDQ  $0x20, R14
13853	CMPQ  R9, R14
13854	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
13855	MOVOU X0, (AX)
13856	MOVOU X1, 16(AX)
13857	MOVOU X2, -32(AX)(R9*1)
13858	MOVOU X3, -16(AX)(R9*1)
13859	MOVQ  SI, AX
13860
13861emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B:
13862	ADDL R12, CX
13863	ADDL $0x04, R12
13864	MOVL CX, 12(SP)
13865
13866	// emitCopy
13867two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B:
13868	CMPL R12, $0x40
13869	JLE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B
13870	MOVB $0xee, (AX)
13871	MOVW R8, 1(AX)
13872	LEAL -60(R12), R12
13873	ADDQ $0x03, AX
13874	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B
13875
13876two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B:
13877	CMPL R12, $0x0c
13878	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
13879	CMPL R8, $0x00000800
13880	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
13881	MOVB $0x01, BL
13882	LEAL -16(BX)(R12*4), R12
13883	MOVB R8, 1(AX)
13884	SHRL $0x08, R8
13885	SHLL $0x05, R8
13886	ORL  R8, R12
13887	MOVB R12, (AX)
13888	ADDQ $0x02, AX
13889	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B
13890
13891emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B:
13892	MOVB $0x02, BL
13893	LEAL -4(BX)(R12*4), R12
13894	MOVB R12, (AX)
13895	MOVW R8, 1(AX)
13896	ADDQ $0x03, AX
13897
13898match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
13899	CMPL CX, 8(SP)
13900	JGE  emit_remainder_encodeSnappyBetterBlockAsm12B
13901	CMPQ AX, (SP)
13902	JL   match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B
13903	MOVQ $0x00000000, ret+48(FP)
13904	RET
13905
13906match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
13907	MOVQ  $0x0000cf1bbcdcbf9b, SI
13908	MOVQ  $0x9e3779b1, R8
13909	INCL  DI
13910	MOVQ  (DX)(DI*1), R9
13911	MOVQ  R9, R10
13912	MOVQ  R9, R11
13913	MOVQ  R9, R12
13914	SHRQ  $0x08, R11
13915	MOVQ  R11, R13
13916	SHRQ  $0x10, R12
13917	LEAL  1(DI), R14
13918	LEAL  2(DI), R15
13919	MOVQ  -2(DX)(CX*1), R9
13920	SHLQ  $0x10, R10
13921	IMULQ SI, R10
13922	SHRQ  $0x32, R10
13923	SHLQ  $0x10, R13
13924	IMULQ SI, R13
13925	SHRQ  $0x32, R13
13926	SHLQ  $0x20, R11
13927	IMULQ R8, R11
13928	SHRQ  $0x34, R11
13929	SHLQ  $0x20, R12
13930	IMULQ R8, R12
13931	SHRQ  $0x34, R12
13932	MOVL  DI, 24(SP)(R10*4)
13933	MOVL  R14, 24(SP)(R13*4)
13934	MOVL  R14, 65560(SP)(R11*4)
13935	MOVL  R15, 65560(SP)(R12*4)
13936	MOVQ  R9, R10
13937	MOVQ  R9, R11
13938	SHRQ  $0x08, R11
13939	MOVQ  R11, R13
13940	LEAL  -2(CX), R9
13941	LEAL  -1(CX), DI
13942	SHLQ  $0x10, R10
13943	IMULQ SI, R10
13944	SHRQ  $0x32, R10
13945	SHLQ  $0x20, R11
13946	IMULQ R8, R11
13947	SHRQ  $0x34, R11
13948	SHLQ  $0x10, R13
13949	IMULQ SI, R13
13950	SHRQ  $0x32, R13
13951	MOVL  R9, 24(SP)(R10*4)
13952	MOVL  DI, 65560(SP)(R11*4)
13953	MOVL  DI, 24(SP)(R13*4)
13954	JMP   search_loop_encodeSnappyBetterBlockAsm12B
13955
13956emit_remainder_encodeSnappyBetterBlockAsm12B:
13957	MOVQ src_len+32(FP), CX
13958	SUBL 12(SP), CX
13959	LEAQ 3(AX)(CX*1), CX
13960	CMPQ CX, (SP)
13961	JL   emit_remainder_ok_encodeSnappyBetterBlockAsm12B
13962	MOVQ $0x00000000, ret+48(FP)
13963	RET
13964
13965emit_remainder_ok_encodeSnappyBetterBlockAsm12B:
13966	MOVQ src_len+32(FP), CX
13967	MOVL 12(SP), BX
13968	CMPL BX, CX
13969	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
13970	MOVL CX, SI
13971	MOVL CX, 12(SP)
13972	LEAQ (DX)(BX*1), CX
13973	SUBL BX, SI
13974	LEAL -1(SI), DX
13975	CMPL DX, $0x3c
13976	JLT  one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B
13977	CMPL DX, $0x00000100
13978	JLT  two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B
13979	MOVB $0xf4, (AX)
13980	MOVW DX, 1(AX)
13981	ADDQ $0x03, AX
13982	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
13983
13984two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
13985	MOVB $0xf0, (AX)
13986	MOVB DL, 1(AX)
13987	ADDQ $0x02, AX
13988	CMPL DX, $0x40
13989	JL   memmove_emit_remainder_encodeSnappyBetterBlockAsm12B
13990	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
13991
13992one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B:
13993	SHLB $0x02, DL
13994	MOVB DL, (AX)
13995	ADDQ $0x01, AX
13996
13997memmove_emit_remainder_encodeSnappyBetterBlockAsm12B:
13998	LEAQ (AX)(SI*1), DX
13999	MOVL SI, BX
14000
14001	// genMemMoveShort
14002	CMPQ BX, $0x08
14003	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8
14004	CMPQ BX, $0x10
14005	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
14006	CMPQ BX, $0x20
14007	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
14008	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
14009
14010emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8:
14011	MOVQ (CX), SI
14012	MOVQ SI, (AX)
14013	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
14014
14015emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
14016	MOVQ (CX), SI
14017	MOVQ -8(CX)(BX*1), CX
14018	MOVQ SI, (AX)
14019	MOVQ CX, -8(AX)(BX*1)
14020	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
14021
14022emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
14023	MOVOU (CX), X0
14024	MOVOU -16(CX)(BX*1), X1
14025	MOVOU X0, (AX)
14026	MOVOU X1, -16(AX)(BX*1)
14027	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
14028
14029emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
14030	MOVOU (CX), X0
14031	MOVOU 16(CX), X1
14032	MOVOU -32(CX)(BX*1), X2
14033	MOVOU -16(CX)(BX*1), X3
14034	MOVOU X0, (AX)
14035	MOVOU X1, 16(AX)
14036	MOVOU X2, -32(AX)(BX*1)
14037	MOVOU X3, -16(AX)(BX*1)
14038
14039memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B:
14040	MOVQ DX, AX
14041	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
14042
14043memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B:
14044	LEAQ (AX)(SI*1), DX
14045	MOVL SI, BX
14046
14047	// genMemMoveLong
14048	MOVOU (CX), X0
14049	MOVOU 16(CX), X1
14050	MOVOU -32(CX)(BX*1), X2
14051	MOVOU -16(CX)(BX*1), X3
14052	MOVQ  BX, DI
14053	SHRQ  $0x05, DI
14054	MOVQ  AX, SI
14055	ANDL  $0x0000001f, SI
14056	MOVQ  $0x00000040, R8
14057	SUBQ  SI, R8
14058	DECQ  DI
14059	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
14060	LEAQ  -32(CX)(R8*1), SI
14061	LEAQ  -32(AX)(R8*1), R9
14062
14063emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
14064	MOVOU (SI), X4
14065	MOVOU 16(SI), X5
14066	MOVOA X4, (R9)
14067	MOVOA X5, 16(R9)
14068	ADDQ  $0x20, R9
14069	ADDQ  $0x20, SI
14070	ADDQ  $0x20, R8
14071	DECQ  DI
14072	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
14073
14074emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
14075	MOVOU -32(CX)(R8*1), X4
14076	MOVOU -16(CX)(R8*1), X5
14077	MOVOA X4, -32(AX)(R8*1)
14078	MOVOA X5, -16(AX)(R8*1)
14079	ADDQ  $0x20, R8
14080	CMPQ  BX, R8
14081	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
14082	MOVOU X0, (AX)
14083	MOVOU X1, 16(AX)
14084	MOVOU X2, -32(AX)(BX*1)
14085	MOVOU X3, -16(AX)(BX*1)
14086	MOVQ  DX, AX
14087
14088emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B:
14089	MOVQ dst_base+0(FP), CX
14090	SUBQ CX, AX
14091	MOVQ AX, ret+48(FP)
14092	RET
14093
// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst as a sequence of literal runs and back-reference
// copies and returns the number of bytes written to dst. It returns 0 when
// the output would reach the destination limit computed below.
// NOTE(review): judging by the name this is the Snappy-compatible "better"
// encoder variant for small inputs; confirm against the generator (gen.go).
//
// Stack frame ($20504 bytes of locals, 56 bytes of arguments/result):
//      0(SP)  8 bytes  dst limit pointer: dst_base + src_len - 9 - (src_len >> 5)
//      8(SP)  4 bytes  search limit: src_len - 8
//     12(SP)  4 bytes  nextEmit: src index of the first not yet emitted literal
//     16(SP)  4 bytes  offset of the most recent match (written, not read, here)
//     20(SP)  4 bytes  next search position for the main loop
//     24(SP)  16384 bytes  table of 4096 uint32 src positions (12-bit hash of 6 bytes)
//  16408(SP)   4096 bytes  table of 1024 uint32 src positions (10-bit hash of 4 bytes)
//
// Long-lived registers:
//   AX  current write pointer into dst
//   CX  current src index (s) inside the search loop
//   DX  src base pointer (reused as scratch once the remainder is emitted)
TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56
	MOVQ dst_base+0(FP), AX
	// 0xa0 = 160 iterations of 128 bytes = 20480 bytes, i.e. both hash tables.
	MOVQ $0x000000a0, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear both hash tables, 128 bytes per iteration.
zero_loop_encodeSnappyBetterBlockAsm10B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBetterBlockAsm10B
	// nextEmit = 0.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -9(CX), DX
	LEAQ  -8(CX), SI
	// 8(SP) = src_len - 8: the search loop stops once it reaches this index.
	MOVL  SI, 8(SP)
	// (SP) = dst_base + src_len - 9 - (src_len >> 5): output limit pointer.
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// Start searching at src index 1; last match offset starts at 0.
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

	// Main search loop: CX = s, the src index being probed.
search_loop_encodeSnappyBetterBlockAsm10B:
	// Next probe position = s + 1 + ((s - nextEmit) >> 5), so the step grows
	// with the number of pending literal bytes.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  1(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBetterBlockAsm10B
	// DI = 8 bytes of src at s; remember the next probe position in 20(SP).
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	// R9 / SI = multipliers for the 6-byte and 4-byte hashes.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	// R10 = 12-bit hash of the low 6 bytes (shift out the top 2 bytes, multiply,
	// keep the top 12 bits).
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	// R11 = 10-bit hash of the low 4 bytes.
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x36, R11
	// Fetch the previous positions stored for both hashes (SI from the 6-byte
	// table, R8 from the 4-byte table), then record s in both tables.
	MOVL  24(SP)(R10*4), SI
	MOVL  16408(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 16408(SP)(R11*4)
	// A candidate matches when its first 4 src bytes equal the low 4 bytes of DI.
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm10B
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeSnappyBetterBlockAsm10B
	// No match: continue at the saved next probe position.
	MOVL  20(SP), CX
	JMP   search_loop_encodeSnappyBetterBlockAsm10B

	// Only the 4-byte-hash candidate (R8) matched. Before using it, probe the
	// 6-byte table with the bytes at s+1 and prefer that candidate if it matches.
candidateS_match_encodeSnappyBetterBlockAsm10B:
	// DI = src bytes starting at s+1; R10 = their 12-bit hash.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	MOVL  24(SP)(R10*4), SI
	// Tentatively advance s to s+1 and record it in the 6-byte table.
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeSnappyBetterBlockAsm10B
	// The s+1 probe failed: restore s and use the 4-byte-hash candidate.
	DECL  CX
	MOVL  R8, SI

	// SI = candidate src index, CX = s. Extend the match backwards.
candidate_match_encodeSnappyBetterBlockAsm10B:
	MOVL  12(SP), DI
	// A candidate at index 0 cannot be extended backwards.
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm10B

	// Step both positions back while the preceding bytes are equal, s is still
	// above nextEmit (DI) and the candidate index has not reached 0.
match_extend_back_loop_encodeSnappyBetterBlockAsm10B:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBetterBlockAsm10B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm10B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm10B
	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm10B

	// Return 0 unless AX + 3 + (s - nextEmit), i.e. the pending literals plus
	// 3 bytes, stays below the dst limit in (SP).
match_extend_back_end_encodeSnappyBetterBlockAsm10B:
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm10B:
	// DI = match start in src. The first 4 bytes are already known to match,
	// so skip them on both sides before measuring the rest.
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	// R8 = src bytes remaining after s; R9 / R10 = pointers to the current and
	// candidate positions.
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	// R12 = number of additional matching bytes found so far.
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B

	// Compare 8 bytes at a time. On a difference, the index of the lowest set
	// bit of the XOR divided by 8 is the number of equal leading bytes.
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeSnappyBetterBlockAsm10B

	// All 8 bytes matched: account for them and continue while at least 8 remain.
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B:
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B

	// Fewer than 8 bytes remain: finish byte by byte.
matchlen_single_match_nolit_encodeSnappyBetterBlockAsm10B:
	TESTL R8, R8
	JZ    match_nolit_end_encodeSnappyBetterBlockAsm10B

matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeSnappyBetterBlockAsm10B
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm10B

match_nolit_end_encodeSnappyBetterBlockAsm10B:
	// R8 = match offset (current position minus candidate position).
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	// The offset is only stored to 16(SP) here; nothing in this function reads
	// it back or compares it.
	MOVL R8, 16(SP)
	// Emit the pending literals src[nextEmit:DI]; skip when there are none.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
	MOVL DI, R9
	MOVL DI, 12(SP)
	// R10 = literal source pointer, R9 = literal length, SI = length - 1.
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Header size depends on length-1: < 60 one byte, < 256 two bytes,
	// otherwise tag 0xf4 followed by a 16-bit length-1.
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBetterBlockAsm10B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBetterBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm10B

	// Tag 0xf0 followed by one byte holding length-1.
two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	// length <= 64 uses the short copy, anything larger the long copy.
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeSnappyBetterBlockAsm10B
	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm10B

	// Single tag byte: (length-1) << 2.
one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm10B:
	// SI = dst pointer after the literals.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Copy R9 (<= 64) bytes from (R10) to (AX) using overlapping fixed-size moves.
	CMPQ R9, $0x08
	JLE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64

	// Length <= 8: always moves a full 8 bytes, so it may read and write past
	// the literal run; AX is still advanced by the exact length afterwards.
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
	MOVQ (R10), R11
	MOVQ R11, (AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

	// Length 9..16: first 8 and last 8 bytes (the two moves may overlap).
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

	// Length 17..32: first 16 and last 16 bytes.
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B

	// Length 33..64: first 32 and last 32 bytes.
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B

memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
	// SI = dst pointer after the literals.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// Preload the first and last 32 source bytes; they are stored last to cover
	// the unaligned head and tail around the aligned middle copy.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	// R13 = number of whole 32-byte chunks in the length.
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	// R14 = 64 - (dst & 31), so dst + R14 - 32 is 32-byte aligned.
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	// DECQ leaves CF untouched, so JA sees CF from the SUBQ above and ZF from
	// the DECQ.
	// NOTE(review): with length >= 65 on entry (R13 >= 2) and R11 <= 31 this
	// branch looks always taken, making the loop below unreachable - confirm.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back

	// Copy 32 bytes per iteration with aligned stores, R14 being the end offset
	// of the chunk just copied; repeat while length >= R14.
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	// Finally store the preloaded head and tail.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
	// CX = end of the match in src; R12 = full match length (including the 4
	// bytes verified up front); nextEmit = end of the match.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
	// R8 = offset, R12 = remaining length. While more than 64 bytes remain, emit
	// a 3-byte copy of 60 bytes: tag 0xee followed by the 16-bit offset.
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B

	// Remaining length <= 64. The 2-byte form is used only when length < 12 and
	// offset < 2048; otherwise the 3-byte form.
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
	CMPL R8, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
	// Tag = ((length-4) << 2) | 1 | ((offset >> 8) << 5), followed by the low
	// offset byte. Only the low byte of R12 is stored, so the upper bits of BX
	// do not affect the output.
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B

	// 3-byte form: tag = ((length-1) << 2) | 2, followed by the 16-bit offset.
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
	// Past the search limit: only the remainder is left to emit.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeSnappyBetterBlockAsm10B
	// Return 0 when the write pointer has reached the dst limit.
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

	// Record a few positions from inside the match in the hash tables before
	// resuming the search: start+1 and start+2, then end-2 and end-1, where
	// start (DI) and end (CX) delimit the match in src.
match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
	// SI / R8 = multipliers for the 6-byte and 4-byte hashes.
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8
	// DI = start+1; R9 = 8 src bytes there.
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	// R10 = bytes at start+1, R11 and R13 = bytes at start+2,
	// R12 = bytes at start+3.
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	// R14 = start+2, R15 = start+3.
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	// R9 = 8 src bytes at end-2 (used by the second group of stores).
	MOVQ  -2(DX)(CX*1), R9
	// 12-bit 6-byte hashes in R10 and R13, 10-bit 4-byte hashes in R11 and R12.
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x34, R10
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x34, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x36, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x36, R12
	// 6-byte table: start+1 and start+2. 4-byte table: start+2 and start+3.
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 16408(SP)(R11*4)
	MOVL  R15, 16408(SP)(R12*4)
	// R10 = bytes at end-2, R11 and R13 = bytes at end-1.
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	// R9 = end-2, DI = end-1.
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x34, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x36, R11
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x34, R13
	// 6-byte table: end-2 and end-1. 4-byte table: end-1.
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 16408(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeSnappyBetterBlockAsm10B

	// Emit src[nextEmit:] as one final literal run. Return 0 unless
	// AX + 3 + (src_len - nextEmit) stays below the dst limit.
emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeSnappyBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm10B:
	// Nothing to emit when nextEmit already equals src_len.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
	MOVL CX, SI
	MOVL CX, 12(SP)
	// CX = literal source pointer, SI = literal length, DX = length - 1.
	// DX no longer holds the src base from here on.
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Same header selection as for the literals emitted before a match.
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B

	// Tag 0xf0 followed by one byte holding length-1.
two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeSnappyBetterBlockAsm10B
	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B

	// Single tag byte: (length-1) << 2.
one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
	// DX = dst pointer after the literals, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Copy BX (<= 64) bytes from (CX) to (AX) using overlapping fixed-size moves.
	CMPQ BX, $0x08
	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64

	// Length <= 8: always moves a full 8 bytes.
	// NOTE(review): for a remainder shorter than 8 bytes this loads up to 7
	// bytes beyond src_len; confirm the caller guarantees that memory is
	// readable, or that the generator handles short lengths exactly.
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8:
	MOVQ (CX), SI
	MOVQ SI, (AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

	// Length 9..16: first 8 and last 8 bytes (the two moves may overlap).
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

	// Length 17..32: first 16 and last 16 bytes.
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B

	// Length 33..64: first 32 and last 32 bytes.
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B:
	// DX = dst pointer after the literals, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Preload the first and last 32 source bytes; they are stored last to cover
	// the unaligned head and tail around the aligned middle copy.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	// DI = number of whole 32-byte chunks in the length.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	// R8 = 64 - (dst & 31), so dst + R8 - 32 is 32-byte aligned.
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// DECQ leaves CF untouched, so JA sees CF from the SUBQ above and ZF from
	// the DECQ.
	// NOTE(review): with length >= 65 on entry (DI >= 2) and SI <= 31 this
	// branch looks always taken, making the loop below unreachable - confirm.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back

	// Copy 32 bytes per iteration with aligned stores, R8 being the end offset
	// of the chunk just copied; repeat while length >= R8.
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
	// Finally store the preloaded head and tail.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

	// Result = number of bytes written = write pointer - dst_base.
emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET
14596
14597// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
14598// Requires: SSE2
14599TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56
14600	MOVQ dst_base+0(FP), AX
14601	MOVQ $0x00000028, CX
14602	LEAQ 24(SP), DX
14603	PXOR X0, X0
14604
14605zero_loop_encodeSnappyBetterBlockAsm8B:
14606	MOVOU X0, (DX)
14607	MOVOU X0, 16(DX)
14608	MOVOU X0, 32(DX)
14609	MOVOU X0, 48(DX)
14610	MOVOU X0, 64(DX)
14611	MOVOU X0, 80(DX)
14612	MOVOU X0, 96(DX)
14613	MOVOU X0, 112(DX)
14614	ADDQ  $0x80, DX
14615	DECQ  CX
14616	JNZ   zero_loop_encodeSnappyBetterBlockAsm8B
14617	MOVL  $0x00000000, 12(SP)
14618	MOVQ  src_len+32(FP), CX
14619	LEAQ  -9(CX), DX
14620	LEAQ  -8(CX), SI
14621	MOVL  SI, 8(SP)
14622	SHRQ  $0x05, CX
14623	SUBL  CX, DX
14624	LEAQ  (AX)(DX*1), DX
14625	MOVQ  DX, (SP)
14626	MOVL  $0x00000001, CX
14627	MOVL  $0x00000000, 16(SP)
14628	MOVQ  src_base+24(FP), DX
14629
14630search_loop_encodeSnappyBetterBlockAsm8B:
14631	MOVL  CX, SI
14632	SUBL  12(SP), SI
14633	SHRL  $0x04, SI
14634	LEAL  1(CX)(SI*1), SI
14635	CMPL  SI, 8(SP)
14636	JGE   emit_remainder_encodeSnappyBetterBlockAsm8B
14637	MOVQ  (DX)(CX*1), DI
14638	MOVL  SI, 20(SP)
14639	MOVQ  $0x0000cf1bbcdcbf9b, R9
14640	MOVQ  $0x9e3779b1, SI
14641	MOVQ  DI, R10
14642	MOVQ  DI, R11
14643	SHLQ  $0x10, R10
14644	IMULQ R9, R10
14645	SHRQ  $0x36, R10
14646	SHLQ  $0x20, R11
14647	IMULQ SI, R11
14648	SHRQ  $0x38, R11
14649	MOVL  24(SP)(R10*4), SI
14650	MOVL  4120(SP)(R11*4), R8
14651	MOVL  CX, 24(SP)(R10*4)
14652	MOVL  CX, 4120(SP)(R11*4)
14653	CMPL  (DX)(SI*1), DI
14654	JEQ   candidate_match_encodeSnappyBetterBlockAsm8B
14655	CMPL  (DX)(R8*1), DI
14656	JEQ   candidateS_match_encodeSnappyBetterBlockAsm8B
14657	MOVL  20(SP), CX
14658	JMP   search_loop_encodeSnappyBetterBlockAsm8B
14659
14660candidateS_match_encodeSnappyBetterBlockAsm8B:
14661	SHRQ  $0x08, DI
14662	MOVQ  DI, R10
14663	SHLQ  $0x10, R10
14664	IMULQ R9, R10
14665	SHRQ  $0x36, R10
14666	MOVL  24(SP)(R10*4), SI
14667	INCL  CX
14668	MOVL  CX, 24(SP)(R10*4)
14669	CMPL  (DX)(SI*1), DI
14670	JEQ   candidate_match_encodeSnappyBetterBlockAsm8B
14671	DECL  CX
14672	MOVL  R8, SI
14673
14674candidate_match_encodeSnappyBetterBlockAsm8B:
14675	MOVL  12(SP), DI
14676	TESTL SI, SI
14677	JZ    match_extend_back_end_encodeSnappyBetterBlockAsm8B
14678
14679match_extend_back_loop_encodeSnappyBetterBlockAsm8B:
14680	CMPL CX, DI
14681	JLE  match_extend_back_end_encodeSnappyBetterBlockAsm8B
14682	MOVB -1(DX)(SI*1), BL
14683	MOVB -1(DX)(CX*1), R8
14684	CMPB BL, R8
14685	JNE  match_extend_back_end_encodeSnappyBetterBlockAsm8B
14686	LEAL -1(CX), CX
14687	DECL SI
14688	JZ   match_extend_back_end_encodeSnappyBetterBlockAsm8B
14689	JMP  match_extend_back_loop_encodeSnappyBetterBlockAsm8B
14690
14691match_extend_back_end_encodeSnappyBetterBlockAsm8B:
14692	MOVL CX, DI
14693	SUBL 12(SP), DI
14694	LEAQ 3(AX)(DI*1), DI
14695	CMPQ DI, (SP)
14696	JL   match_dst_size_check_encodeSnappyBetterBlockAsm8B
14697	MOVQ $0x00000000, ret+48(FP)
14698	RET
14699
14700match_dst_size_check_encodeSnappyBetterBlockAsm8B:
14701	MOVL CX, DI
14702	ADDL $0x04, CX
14703	ADDL $0x04, SI
14704	MOVQ src_len+32(FP), R8
14705	SUBL CX, R8
14706	LEAQ (DX)(CX*1), R9
14707	LEAQ (DX)(SI*1), R10
14708
14709	// matchLen
14710	XORL R12, R12
14711	CMPL R8, $0x08
14712	JL   matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B
14713
14714matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
14715	MOVQ  (R9)(R12*1), R11
14716	XORQ  (R10)(R12*1), R11
14717	TESTQ R11, R11
14718	JZ    matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B
14719	BSFQ  R11, R11
14720	SARQ  $0x03, R11
14721	LEAL  (R12)(R11*1), R12
14722	JMP   match_nolit_end_encodeSnappyBetterBlockAsm8B
14723
14724matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B:
14725	LEAL -8(R8), R8
14726	LEAL 8(R12), R12
14727	CMPL R8, $0x08
14728	JGE  matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
14729
14730matchlen_single_match_nolit_encodeSnappyBetterBlockAsm8B:
14731	TESTL R8, R8
14732	JZ    match_nolit_end_encodeSnappyBetterBlockAsm8B
14733
14734matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
14735	MOVB (R9)(R12*1), R11
14736	CMPB (R10)(R12*1), R11
14737	JNE  match_nolit_end_encodeSnappyBetterBlockAsm8B
14738	LEAL 1(R12), R12
14739	DECL R8
14740	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
14741
14742match_nolit_end_encodeSnappyBetterBlockAsm8B:
14743	MOVL CX, R8
14744	SUBL SI, R8
14745
14746	// Check if repeat
14747	MOVL R8, 16(SP)
14748	MOVL 12(SP), SI
14749	CMPL SI, DI
14750	JEQ  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
14751	MOVL DI, R9
14752	MOVL DI, 12(SP)
14753	LEAQ (DX)(SI*1), R10
14754	SUBL SI, R9
14755	LEAL -1(R9), SI
14756	CMPL SI, $0x3c
14757	JLT  one_byte_match_emit_encodeSnappyBetterBlockAsm8B
14758	CMPL SI, $0x00000100
14759	JLT  two_bytes_match_emit_encodeSnappyBetterBlockAsm8B
14760	MOVB $0xf4, (AX)
14761	MOVW SI, 1(AX)
14762	ADDQ $0x03, AX
14763	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
14764
14765two_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
14766	MOVB $0xf0, (AX)
14767	MOVB SI, 1(AX)
14768	ADDQ $0x02, AX
14769	CMPL SI, $0x40
14770	JL   memmove_match_emit_encodeSnappyBetterBlockAsm8B
14771	JMP  memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
14772
14773one_byte_match_emit_encodeSnappyBetterBlockAsm8B:
14774	SHLB $0x02, SI
14775	MOVB SI, (AX)
14776	ADDQ $0x01, AX
14777
14778memmove_match_emit_encodeSnappyBetterBlockAsm8B:
14779	LEAQ (AX)(R9*1), SI
14780
14781	// genMemMoveShort
14782	CMPQ R9, $0x08
14783	JLE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
14784	CMPQ R9, $0x10
14785	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
14786	CMPQ R9, $0x20
14787	JBE  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
14788	JMP  emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
14789
14790emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8:
14791	MOVQ (R10), R11
14792	MOVQ R11, (AX)
14793	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
14794
14795emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
14796	MOVQ (R10), R11
14797	MOVQ -8(R10)(R9*1), R10
14798	MOVQ R11, (AX)
14799	MOVQ R10, -8(AX)(R9*1)
14800	JMP  memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
14801
14802emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
14803	MOVOU (R10), X0
14804	MOVOU -16(R10)(R9*1), X1
14805	MOVOU X0, (AX)
14806	MOVOU X1, -16(AX)(R9*1)
14807	JMP   memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
14808
14809emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
14810	MOVOU (R10), X0
14811	MOVOU 16(R10), X1
14812	MOVOU -32(R10)(R9*1), X2
14813	MOVOU -16(R10)(R9*1), X3
14814	MOVOU X0, (AX)
14815	MOVOU X1, 16(AX)
14816	MOVOU X2, -32(AX)(R9*1)
14817	MOVOU X3, -16(AX)(R9*1)
14818
14819memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
14820	MOVQ SI, AX
14821	JMP  emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
14822
14823memmove_long_match_emit_encodeSnappyBetterBlockAsm8B:
14824	LEAQ (AX)(R9*1), SI
14825
14826	// genMemMoveLong
14827	MOVOU (R10), X0
14828	MOVOU 16(R10), X1
14829	MOVOU -32(R10)(R9*1), X2
14830	MOVOU -16(R10)(R9*1), X3
14831	MOVQ  R9, R13
14832	SHRQ  $0x05, R13
14833	MOVQ  AX, R11
14834	ANDL  $0x0000001f, R11
14835	MOVQ  $0x00000040, R14
14836	SUBQ  R11, R14
14837	DECQ  R13
14838	JA    emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
14839	LEAQ  -32(R10)(R14*1), R11
14840	LEAQ  -32(AX)(R14*1), R15
14841
14842emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
14843	MOVOU (R11), X4
14844	MOVOU 16(R11), X5
14845	MOVOA X4, (R15)
14846	MOVOA X5, 16(R15)
14847	ADDQ  $0x20, R15
14848	ADDQ  $0x20, R11
14849	ADDQ  $0x20, R14
14850	DECQ  R13
14851	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
14852
14853emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
14854	MOVOU -32(R10)(R14*1), X4
14855	MOVOU -16(R10)(R14*1), X5
14856	MOVOA X4, -32(AX)(R14*1)
14857	MOVOA X5, -16(AX)(R14*1)
14858	ADDQ  $0x20, R14
14859	CMPQ  R9, R14
14860	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
14861	MOVOU X0, (AX)
14862	MOVOU X1, 16(AX)
14863	MOVOU X2, -32(AX)(R9*1)
14864	MOVOU X3, -16(AX)(R9*1)
14865	MOVQ  SI, AX
14866
14867emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B:
14868	ADDL R12, CX
14869	ADDL $0x04, R12
14870	MOVL CX, 12(SP)
14871
14872	// emitCopy
14873two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B:
14874	CMPL R12, $0x40
14875	JLE  two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
14876	MOVB $0xee, (AX)
14877	MOVW R8, 1(AX)
14878	LEAL -60(R12), R12
14879	ADDQ $0x03, AX
14880	JMP  two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B
14881
14882two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B:
14883	CMPL R12, $0x0c
14884	JGE  emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B
14885	MOVB $0x01, BL
14886	LEAL -16(BX)(R12*4), R12
14887	MOVB R8, 1(AX)
14888	SHRL $0x08, R8
14889	SHLL $0x05, R8
14890	ORL  R8, R12
14891	MOVB R12, (AX)
14892	ADDQ $0x02, AX
14893	JMP  match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B
14894
14895emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B:
14896	MOVB $0x02, BL
14897	LEAL -4(BX)(R12*4), R12
14898	MOVB R12, (AX)
14899	MOVW R8, 1(AX)
14900	ADDQ $0x03, AX
14901
14902match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
14903	CMPL CX, 8(SP)
14904	JGE  emit_remainder_encodeSnappyBetterBlockAsm8B
14905	CMPQ AX, (SP)
14906	JL   match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B
14907	MOVQ $0x00000000, ret+48(FP)
14908	RET
14909
14910match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
14911	MOVQ  $0x0000cf1bbcdcbf9b, SI
14912	MOVQ  $0x9e3779b1, R8
14913	INCL  DI
14914	MOVQ  (DX)(DI*1), R9
14915	MOVQ  R9, R10
14916	MOVQ  R9, R11
14917	MOVQ  R9, R12
14918	SHRQ  $0x08, R11
14919	MOVQ  R11, R13
14920	SHRQ  $0x10, R12
14921	LEAL  1(DI), R14
14922	LEAL  2(DI), R15
14923	MOVQ  -2(DX)(CX*1), R9
14924	SHLQ  $0x10, R10
14925	IMULQ SI, R10
14926	SHRQ  $0x36, R10
14927	SHLQ  $0x10, R13
14928	IMULQ SI, R13
14929	SHRQ  $0x36, R13
14930	SHLQ  $0x20, R11
14931	IMULQ R8, R11
14932	SHRQ  $0x38, R11
14933	SHLQ  $0x20, R12
14934	IMULQ R8, R12
14935	SHRQ  $0x38, R12
14936	MOVL  DI, 24(SP)(R10*4)
14937	MOVL  R14, 24(SP)(R13*4)
14938	MOVL  R14, 4120(SP)(R11*4)
14939	MOVL  R15, 4120(SP)(R12*4)
14940	MOVQ  R9, R10
14941	MOVQ  R9, R11
14942	SHRQ  $0x08, R11
14943	MOVQ  R11, R13
14944	LEAL  -2(CX), R9
14945	LEAL  -1(CX), DI
14946	SHLQ  $0x10, R10
14947	IMULQ SI, R10
14948	SHRQ  $0x36, R10
14949	SHLQ  $0x20, R11
14950	IMULQ R8, R11
14951	SHRQ  $0x38, R11
14952	SHLQ  $0x10, R13
14953	IMULQ SI, R13
14954	SHRQ  $0x36, R13
14955	MOVL  R9, 24(SP)(R10*4)
14956	MOVL  DI, 4120(SP)(R11*4)
14957	MOVL  DI, 24(SP)(R13*4)
14958	JMP   search_loop_encodeSnappyBetterBlockAsm8B
14959
14960emit_remainder_encodeSnappyBetterBlockAsm8B:
14961	MOVQ src_len+32(FP), CX
14962	SUBL 12(SP), CX
14963	LEAQ 3(AX)(CX*1), CX
14964	CMPQ CX, (SP)
14965	JL   emit_remainder_ok_encodeSnappyBetterBlockAsm8B
14966	MOVQ $0x00000000, ret+48(FP)
14967	RET
14968
14969emit_remainder_ok_encodeSnappyBetterBlockAsm8B:
14970	MOVQ src_len+32(FP), CX
14971	MOVL 12(SP), BX
14972	CMPL BX, CX
14973	JEQ  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
14974	MOVL CX, SI
14975	MOVL CX, 12(SP)
14976	LEAQ (DX)(BX*1), CX
14977	SUBL BX, SI
14978	LEAL -1(SI), DX
14979	CMPL DX, $0x3c
14980	JLT  one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B
14981	CMPL DX, $0x00000100
14982	JLT  two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B
14983	MOVB $0xf4, (AX)
14984	MOVW DX, 1(AX)
14985	ADDQ $0x03, AX
14986	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
14987
14988two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
14989	MOVB $0xf0, (AX)
14990	MOVB DL, 1(AX)
14991	ADDQ $0x02, AX
14992	CMPL DX, $0x40
14993	JL   memmove_emit_remainder_encodeSnappyBetterBlockAsm8B
14994	JMP  memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
14995
14996one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B:
14997	SHLB $0x02, DL
14998	MOVB DL, (AX)
14999	ADDQ $0x01, AX
15000
15001memmove_emit_remainder_encodeSnappyBetterBlockAsm8B:
15002	LEAQ (AX)(SI*1), DX
15003	MOVL SI, BX
15004
15005	// genMemMoveShort
15006	CMPQ BX, $0x08
15007	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8
15008	CMPQ BX, $0x10
15009	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
15010	CMPQ BX, $0x20
15011	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
15012	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
15013
15014emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8:
15015	MOVQ (CX), SI
15016	MOVQ SI, (AX)
15017	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
15018
15019emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
15020	MOVQ (CX), SI
15021	MOVQ -8(CX)(BX*1), CX
15022	MOVQ SI, (AX)
15023	MOVQ CX, -8(AX)(BX*1)
15024	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
15025
15026emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
15027	MOVOU (CX), X0
15028	MOVOU -16(CX)(BX*1), X1
15029	MOVOU X0, (AX)
15030	MOVOU X1, -16(AX)(BX*1)
15031	JMP   memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
15032
15033emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
15034	MOVOU (CX), X0
15035	MOVOU 16(CX), X1
15036	MOVOU -32(CX)(BX*1), X2
15037	MOVOU -16(CX)(BX*1), X3
15038	MOVOU X0, (AX)
15039	MOVOU X1, 16(AX)
15040	MOVOU X2, -32(AX)(BX*1)
15041	MOVOU X3, -16(AX)(BX*1)
15042
15043memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B:
15044	MOVQ DX, AX
15045	JMP  emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
15046
15047memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B:
15048	LEAQ (AX)(SI*1), DX
15049	MOVL SI, BX
15050
15051	// genMemMoveLong
15052	MOVOU (CX), X0
15053	MOVOU 16(CX), X1
15054	MOVOU -32(CX)(BX*1), X2
15055	MOVOU -16(CX)(BX*1), X3
15056	MOVQ  BX, DI
15057	SHRQ  $0x05, DI
15058	MOVQ  AX, SI
15059	ANDL  $0x0000001f, SI
15060	MOVQ  $0x00000040, R8
15061	SUBQ  SI, R8
15062	DECQ  DI
15063	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
15064	LEAQ  -32(CX)(R8*1), SI
15065	LEAQ  -32(AX)(R8*1), R9
15066
15067emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
15068	MOVOU (SI), X4
15069	MOVOU 16(SI), X5
15070	MOVOA X4, (R9)
15071	MOVOA X5, 16(R9)
15072	ADDQ  $0x20, R9
15073	ADDQ  $0x20, SI
15074	ADDQ  $0x20, R8
15075	DECQ  DI
15076	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
15077
15078emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
15079	MOVOU -32(CX)(R8*1), X4
15080	MOVOU -16(CX)(R8*1), X5
15081	MOVOA X4, -32(AX)(R8*1)
15082	MOVOA X5, -16(AX)(R8*1)
15083	ADDQ  $0x20, R8
15084	CMPQ  BX, R8
15085	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
15086	MOVOU X0, (AX)
15087	MOVOU X1, 16(AX)
15088	MOVOU X2, -32(AX)(BX*1)
15089	MOVOU X3, -16(AX)(BX*1)
15090	MOVQ  DX, AX
15091
15092emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B:
15093	MOVQ dst_base+0(FP), CX
15094	SUBQ CX, AX
15095	MOVQ AX, ret+48(FP)
15096	RET
15097
// func emitLiteral(dst []byte, lit []byte) int
// Requires: SSE2
//
// emitLiteral writes a literal-length header for lit at the start of dst,
// copies the bytes of lit directly after the header, and returns the total
// number of bytes written (header size + len(lit)). When lit is empty it
// writes nothing and returns 0.
//
// Header layout, with n = len(lit)-1 (compared as a signed 32-bit value):
//   n < 60       1 byte:  n<<2
//   n < 1<<8     2 bytes: 0xf0, n
//   n < 1<<16    3 bytes: 0xf4, n stored as a 16-bit word
//   n < 1<<24    4 bytes: 0xf8, low 16 bits of n as a word, then bits 16..23
//   otherwise    5 bytes: 0xfc, n stored as a 32-bit word
//
// dst_len and dst_cap are never read, so the caller must guarantee that dst
// is large enough for the header plus the literal.
//
// Register use: AX = write cursor in dst (advanced past the header),
// CX = read pointer into lit, DX = len(lit), BX = return value,
// SI/DI/R8/R9 and X0-X5 = scratch.
TEXT ·emitLiteral(SB), NOSPLIT, $0-56
	MOVQ  lit_len+32(FP), DX
	MOVQ  dst_base+0(FP), AX
	MOVQ  lit_base+24(FP), CX
	TESTQ DX, DX
	JZ    emit_literal_end_standalone_skip

	// BX starts at len(lit); each header path adds its own header size.
	// SI = n = len(lit)-1 selects the header form.
	MOVL  DX, BX
	LEAL  -1(DX), SI
	CMPL  SI, $0x3c
	JLT   one_byte_standalone
	CMPL  SI, $0x00000100
	JLT   two_bytes_standalone
	CMPL  SI, $0x00010000
	JLT   three_bytes_standalone
	CMPL  SI, $0x01000000
	JLT   four_bytes_standalone

	// 5-byte header: 0xfc followed by n as a 32-bit word.
	MOVB  $0xfc, (AX)
	MOVL  SI, 1(AX)
	ADDQ  $0x05, BX
	ADDQ  $0x05, AX
	JMP   memmove_long_standalone

four_bytes_standalone:
	// 4-byte header: 0xf8, low word of n, then bits 16..23 of n (via DI).
	MOVL SI, DI
	SHRL $0x10, DI
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB DI, 3(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  memmove_long_standalone

three_bytes_standalone:
	// 3-byte header: 0xf4 followed by n as a 16-bit word.
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  memmove_long_standalone

two_bytes_standalone:
	// 2-byte header: 0xf0 followed by n as a single byte.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX

	// n < 64 means len(lit) <= 64, which the short copy below can handle.
	CMPL SI, $0x40
	JL   memmove_standalone
	JMP  memmove_long_standalone

one_byte_standalone:
	// 1-byte header: n<<2 (n < 60 here, so the shift cannot overflow a byte).
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, BX
	ADDQ $0x01, AX

memmove_standalone:
	// genMemMoveShort
	// Copies DX bytes (1..64 on every path that reaches this label) from CX
	// to AX. Each size class loads the head and the tail of the source and
	// stores both; the two stores may overlap, which covers every length in
	// the class without a loop.
	CMPQ DX, $0x03
	JB   emit_lit_memmove_standalone_memmove_move_1or2
	JE   emit_lit_memmove_standalone_memmove_move_3
	CMPQ DX, $0x08
	JB   emit_lit_memmove_standalone_memmove_move_4through7
	CMPQ DX, $0x10
	JBE  emit_lit_memmove_standalone_memmove_move_8through16
	CMPQ DX, $0x20
	JBE  emit_lit_memmove_standalone_memmove_move_17through32
	JMP  emit_lit_memmove_standalone_memmove_move_33through64

emit_lit_memmove_standalone_memmove_move_1or2:
	// First byte and last byte (the same byte when DX == 1).
	MOVB (CX), SI
	MOVB -1(CX)(DX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_3:
	// One word plus one byte.
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_4through7:
	// First 4 bytes and last 4 bytes.
	MOVL (CX), SI
	MOVL -4(CX)(DX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_8through16:
	// First 8 bytes and last 8 bytes.
	MOVQ (CX), SI
	MOVQ -8(CX)(DX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_17through32:
	// First 16 bytes and last 16 bytes.
	MOVOU (CX), X0
	MOVOU -16(CX)(DX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_33through64:
	// First 32 bytes and last 32 bytes.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

memmove_long_standalone:
	// genMemMoveLong
	// Copies DX bytes (at least 65 on every path that reaches this label)
	// from CX to AX. The first and last 32 source bytes are loaded up front
	// into X0-X3 and stored last with unaligned stores; the middle is copied
	// in 32-byte chunks using aligned stores.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3

	// DI = number of whole 32-byte chunks.
	// R8 = 64 - (AX & 31), so AX+R8-32 is 32-byte aligned for the MOVOA stores.
	MOVQ  DX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8

	// DECQ leaves CF untouched, so JA tests CF from the SUBQ above (clear,
	// because R8 = 64 - (0..31) cannot borrow) together with ZF from DECQ:
	// the jump is taken whenever DI is non-zero after the decrement.
	DECQ  DI
	JA    emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_standalonelarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_standalonelarge_big_loop_back

emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R8, then advance R8 by 32; repeat
	// while DX >= R8 (unsigned).
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  DX, R8
	JAE   emit_lit_memmove_long_standalonelarge_forward_sse_loop_32

	// Store the saved head and tail; they cover the unaligned start and
	// whatever the chunk loop did not reach at the end.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

emit_literal_end_standalone_skip:
	// Empty literal: nothing written, return 0.
	XORQ BX, BX

emit_literal_end_standalone:
	MOVQ BX, ret+48(FP)
	RET
15263
// func emitRepeat(dst []byte, offset int, length int) int
//
// emitRepeat writes one or more repeat codes for `length` into dst and returns
// the number of bytes written. `offset` is only consulted for the two-byte
// "with offset" form below. dst_len and dst_cap are never read, so the caller
// must guarantee enough room in dst.
//
// Register use: AX = write cursor in dst, BX = bytes written (return value),
// CX = offset (reused as scratch in the five-byte form), DX = length, which
// is reduced by 4 on entry to each pass, SI = scratch.
// All comparisons are signed and use the low 32 bits only.
TEXT ·emitRepeat(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitRepeat
	// Each pass: SI = length as it was on entry to the pass, DX = length-4.
emit_repeat_again_standalone:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JLE  repeat_two_standalone
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_standalone
	CMPL CX, $0x00000800
	JLT  repeat_two_offset_standalone

cant_repeat_two_offset_standalone:
	// Pick the smallest form whose length field can hold DX (= length-4).
	CMPL DX, $0x00000104
	JLT  repeat_three_standalone
	CMPL DX, $0x00010100
	JLT  repeat_four_standalone
	CMPL DX, $0x0100ffff
	JLT  repeat_five_standalone

	// Too long for a single code: write the five bytes 1d 00 fb ff ff,
	// subtract 16842747 (0x0100fffb) from DX and run another pass with the
	// remainder as the new length.
	LEAL -16842747(DX), DX
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  emit_repeat_again_standalone

repeat_five_standalone:
	// Five bytes: 1d 00, then (length-4-65536) as 24 bits: low word at
	// offset 2, bits 16..23 at offset 4. CX (offset) is overwritten here.
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  gen_emit_repeat_end

repeat_four_standalone:
	// Four bytes: 19 00, then (length-4-256) as a 16-bit word.
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  gen_emit_repeat_end

repeat_three_standalone:
	// Three bytes: 15 00, then (length-8) as a single byte.
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  gen_emit_repeat_end

repeat_two_standalone:
	// length <= 8: two bytes holding (length-4)<<2 | 1 as a 16-bit word.
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_repeat_end

repeat_two_offset_standalone:
	// 8 < length < 12 and offset < 2048: two bytes carrying the offset.
	// Byte 0 = (offset>>8)<<5 | (length-4)<<2 | 1, byte 1 = low byte of offset.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX

gen_emit_repeat_end:
	MOVQ BX, ret+40(FP)
	RET
15346
// func emitCopy(dst []byte, offset int, length int) int
//
// emitCopy encodes a copy operation with the given offset and length into dst
// and returns the number of bytes written. Lengths above 64 are encoded as
// one fixed-length copy followed by an inlined emitRepeat sequence for the
// remainder. dst_len and dst_cap are never read, so the caller must guarantee
// enough room in dst.
//
// Encodings produced (tag byte first; only the low byte of the tag
// computation is stored, so upper bits of the scratch register are
// irrelevant):
//   offset >= 65536:                   (length-1)<<2 | 3, offset as 32-bit word
//   length >= 12 or offset >= 2048:    (length-1)<<2 | 2, offset as 16-bit word
//   otherwise:  (offset>>8)<<5 | (length-4)<<2 | 1, low byte of offset
//
// Register use: AX = write cursor in dst, BX = bytes written (return value),
// CX = offset, DX = remaining length, SI = scratch.
// All comparisons are signed and use the low 32 bits only.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	CMPL CX, $0x00010000
	JL   two_byte_offset_standalone

four_bytes_loop_back_standalone:
	CMPL DX, $0x40
	JLE  four_bytes_remain_standalone

	// length > 64: emit a length-64 copy (tag 0xff = 63<<2 | 3) with a
	// 32-bit offset, then encode what is left.
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX

	// Fewer than 4 bytes left: finish with a plain copy (or nothing).
	CMPL DX, $0x04
	JL   four_bytes_remain_standalone

	// emitRepeat
	// Encode the remaining length as repeat code(s).
	// Each pass: SI = length on entry to the pass, DX = length-4.
emit_repeat_again_standalone_emit_copy:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JLE  repeat_two_standalone_emit_copy
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_standalone_emit_copy
	CMPL CX, $0x00000800
	JLT  repeat_two_offset_standalone_emit_copy

cant_repeat_two_offset_standalone_emit_copy:
	CMPL DX, $0x00000104
	JLT  repeat_three_standalone_emit_copy
	CMPL DX, $0x00010100
	JLT  repeat_four_standalone_emit_copy
	CMPL DX, $0x0100ffff
	JLT  repeat_five_standalone_emit_copy

	// Too long for one code: write 1d 00 fb ff ff, subtract 16842747 from
	// DX and run another pass on the remainder.
	LEAL -16842747(DX), DX
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  emit_repeat_again_standalone_emit_copy

repeat_five_standalone_emit_copy:
	// Five bytes: 1d 00, then (length-4-65536) as 24 bits. CX is overwritten.
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  gen_emit_copy_end

repeat_four_standalone_emit_copy:
	// Four bytes: 19 00, then (length-4-256) as a 16-bit word.
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  gen_emit_copy_end

repeat_three_standalone_emit_copy:
	// Three bytes: 15 00, then (length-8) as a single byte.
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  gen_emit_copy_end

repeat_two_standalone_emit_copy:
	// length <= 8: two bytes holding (length-4)<<2 | 1 as a 16-bit word.
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

repeat_two_offset_standalone_emit_copy:
	// 8 < length < 12 and offset < 2048: two bytes carrying the offset.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

four_bytes_remain_standalone:
	// Final copy with a 32-bit offset; nothing is written when length is 0.
	// Tag = (length-1)<<2 | 3, computed as 3 - 4 + length*4.
	TESTL DX, DX
	JZ    gen_emit_copy_end
	MOVB  $0x03, SI
	LEAL  -4(SI)(DX*4), DX
	MOVB  DL, (AX)
	MOVL  CX, 1(AX)
	ADDQ  $0x05, BX
	ADDQ  $0x05, AX
	JMP   gen_emit_copy_end

two_byte_offset_standalone:
	CMPL DX, $0x40
	JLE  two_byte_offset_short_standalone

	// length > 64: emit a length-60 copy (tag 0xee = 59<<2 | 2) with a
	// 16-bit offset; the rest is encoded as repeat code(s) below.
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX

	// emitRepeat
	// Each pass: SI = length on entry to the pass, DX = length-4.
emit_repeat_again_standalone_emit_copy_short:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JLE  repeat_two_standalone_emit_copy_short
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_standalone_emit_copy_short
	CMPL CX, $0x00000800
	JLT  repeat_two_offset_standalone_emit_copy_short

cant_repeat_two_offset_standalone_emit_copy_short:
	CMPL DX, $0x00000104
	JLT  repeat_three_standalone_emit_copy_short
	CMPL DX, $0x00010100
	JLT  repeat_four_standalone_emit_copy_short
	CMPL DX, $0x0100ffff
	JLT  repeat_five_standalone_emit_copy_short

	// Too long for one code: write 1d 00 fb ff ff, subtract 16842747 from
	// DX and run another pass on the remainder.
	LEAL -16842747(DX), DX
	MOVW $0x001d, (AX)
	MOVW $0xfffb, 2(AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP  emit_repeat_again_standalone_emit_copy_short

repeat_five_standalone_emit_copy_short:
	// Five bytes: 1d 00, then (length-4-65536) as 24 bits. CX is overwritten.
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  gen_emit_copy_end

repeat_four_standalone_emit_copy_short:
	// Four bytes: 19 00, then (length-4-256) as a 16-bit word.
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  gen_emit_copy_end

repeat_three_standalone_emit_copy_short:
	// Three bytes: 15 00, then (length-8) as a single byte.
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  gen_emit_copy_end

repeat_two_standalone_emit_copy_short:
	// length <= 8: two bytes holding (length-4)<<2 | 1 as a 16-bit word.
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

repeat_two_offset_standalone_emit_copy_short:
	// 8 < length < 12 and offset < 2048: two bytes carrying the offset.
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

two_byte_offset_short_standalone:
	// length <= 64 with offset < 65536. The 2-byte form is used only when
	// length < 12 and offset < 2048; everything else takes the 3-byte form.
	CMPL DX, $0x0c
	JGE  emit_copy_three_standalone
	CMPL CX, $0x00000800
	JGE  emit_copy_three_standalone

	// Tag = (offset>>8)<<5 | (length-4)<<2 | 1, where (length-4)<<2 | 1 is
	// computed as 1 - 16 + length*4; byte 1 = low byte of offset.
	MOVB $0x01, SI
	LEAL -16(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end

emit_copy_three_standalone:
	// Tag = (length-1)<<2 | 2, computed as 2 - 4 + length*4, followed by
	// the offset as a 16-bit word.
	MOVB $0x02, SI
	LEAL -4(SI)(DX*4), DX
	MOVB DL, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

gen_emit_copy_end:
	MOVQ BX, ret+40(FP)
	RET
15564
// func emitCopyNoRepeat(dst []byte, offset int, length int) int
//
// emitCopyNoRepeat encodes a copy operation with the given offset and length
// into dst using copy codes only (no repeat codes) and returns the number of
// bytes written. Lengths above 64 are split into several copy codes.
// dst_len and dst_cap are never read, so the caller must guarantee enough
// room in dst.
//
// Register use: AX = write cursor in dst, BX = bytes written (return value),
// CX = offset, DX = remaining length, SI = scratch (only its low byte is
// ever stored, via DL after the LEAL).
// All comparisons are signed and use the low 32 bits only.
TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	CMPL CX, $0x00010000
	JL   two_byte_offset_standalone_snappy

four_bytes_loop_back_standalone_snappy:
	// offset >= 65536. While more than 64 bytes remain, emit a length-64
	// copy (tag 0xff = 63<<2 | 3) followed by the offset as a 32-bit word.
	CMPL DX, $0x40
	JLE  four_bytes_remain_standalone_snappy
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	CMPL DX, $0x04
	JL   four_bytes_remain_standalone_snappy
	JMP  four_bytes_loop_back_standalone_snappy

four_bytes_remain_standalone_snappy:
	// Final copy with a 32-bit offset; nothing is written when length is 0.
	// Tag = (length-1)<<2 | 3, computed as 3 - 4 + length*4.
	TESTL DX, DX
	JZ    gen_emit_copy_end_snappy
	MOVB  $0x03, SI
	LEAL  -4(SI)(DX*4), DX
	MOVB  DL, (AX)
	MOVL  CX, 1(AX)
	ADDQ  $0x05, BX
	ADDQ  $0x05, AX
	JMP   gen_emit_copy_end_snappy

two_byte_offset_standalone_snappy:
	// offset < 65536. While more than 64 bytes remain, emit a length-60
	// copy (tag 0xee = 59<<2 | 2) followed by the offset as a 16-bit word.
	CMPL DX, $0x40
	JLE  two_byte_offset_short_standalone_snappy
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX
	JMP  two_byte_offset_standalone_snappy

two_byte_offset_short_standalone_snappy:
	// length <= 64. The 2-byte form is used only when length < 12 and
	// offset < 2048; everything else takes the 3-byte form.
	CMPL DX, $0x0c
	JGE  emit_copy_three_standalone_snappy
	CMPL CX, $0x00000800
	JGE  emit_copy_three_standalone_snappy

	// Tag = (offset>>8)<<5 | (length-4)<<2 | 1, where (length-4)<<2 | 1 is
	// computed as 1 - 16 + length*4; byte 1 = low byte of offset.
	MOVB $0x01, SI
	LEAL -16(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  gen_emit_copy_end_snappy

emit_copy_three_standalone_snappy:
	// Tag = (length-1)<<2 | 2, computed as 2 - 4 + length*4, followed by
	// the offset as a 16-bit word.
	MOVB $0x02, SI
	LEAL -4(SI)(DX*4), DX
	MOVB DL, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

gen_emit_copy_end_snappy:
	MOVQ BX, ret+40(FP)
	RET
15636
// func matchLen(a []byte, b []byte) int
//
// matchLen returns the number of leading bytes that are equal in a and b,
// examining at most len(a) bytes. b_len is never read, so the caller must
// guarantee that b holds at least len(a) bytes.
//
// Register use: AX = a base, CX = b base, DX = bytes left to compare,
// SI = number of matching bytes so far (also the index into both slices),
// BX = scratch.
// Length comparisons are signed and use the low 32 bits of len(a) only.
TEXT ·matchLen(SB), NOSPLIT, $0-56
	MOVQ a_base+0(FP), AX
	MOVQ b_base+24(FP), CX
	MOVQ a_len+8(FP), DX

	// matchLen
	XORL SI, SI
	CMPL DX, $0x08
	JL   matchlen_single_standalone

matchlen_loopback_standalone:
	// Compare 8 bytes at once: a zero XOR result means all 8 are equal.
	MOVQ  (AX)(SI*1), BX
	XORQ  (CX)(SI*1), BX
	TESTQ BX, BX
	JZ    matchlen_loop_standalone

	// Mismatch: the index of the lowest set bit divided by 8 is the number
	// of equal bytes inside this 8-byte word (loads are little-endian).
	BSFQ  BX, BX
	SARQ  $0x03, BX
	LEAL  (SI)(BX*1), SI
	JMP   gen_match_len_end

matchlen_loop_standalone:
	// All 8 bytes matched: advance and continue while at least 8 remain.
	LEAL -8(DX), DX
	LEAL 8(SI), SI
	CMPL DX, $0x08
	JGE  matchlen_loopback_standalone

matchlen_single_standalone:
	// Fewer than 8 bytes left: finish byte by byte.
	TESTL DX, DX
	JZ    gen_match_len_end

matchlen_single_loopback_standalone:
	MOVB (AX)(SI*1), BL
	CMPB (CX)(SI*1), BL
	JNE  gen_match_len_end
	LEAL 1(SI), SI
	DECL DX
	JNZ  matchlen_single_loopback_standalone

gen_match_len_end:
	MOVQ SI, ret+48(FP)
	RET
15679