1// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
2
3// +build !appengine
4// +build !noasm
5// +build gc
6
7#include "textflag.h"
8
9// func encodeBlockAsm(dst []byte, src []byte) int
10// Requires: SSE2
11TEXT ·encodeBlockAsm(SB), $65560-56
12	MOVQ dst_base+0(FP), AX
13	MOVQ $0x00000200, CX
14	LEAQ 24(SP), DX
15	PXOR X0, X0
16
17zero_loop_encodeBlockAsm:
18	MOVOU X0, (DX)
19	MOVOU X0, 16(DX)
20	MOVOU X0, 32(DX)
21	MOVOU X0, 48(DX)
22	MOVOU X0, 64(DX)
23	MOVOU X0, 80(DX)
24	MOVOU X0, 96(DX)
25	MOVOU X0, 112(DX)
26	ADDQ  $0x80, DX
27	DECQ  CX
28	JNZ   zero_loop_encodeBlockAsm
29	MOVL  $0x00000000, 12(SP)
30	MOVQ  src_len+32(FP), CX
31	LEAQ  -5(CX), DX
32	LEAQ  -8(CX), SI
33	MOVL  SI, 8(SP)
34	SHRQ  $0x05, CX
35	SUBL  CX, DX
36	LEAQ  (AX)(DX*1), DX
37	MOVQ  DX, (SP)
38	MOVL  $0x00000001, CX
39	MOVL  CX, 16(SP)
40	MOVQ  src_base+24(FP), DX
41
42search_loop_encodeBlockAsm:
43	MOVL  CX, SI
44	SUBL  12(SP), SI
45	SHRL  $0x06, SI
46	LEAL  4(CX)(SI*1), SI
47	CMPL  SI, 8(SP)
48	JGE   emit_remainder_encodeBlockAsm
49	MOVQ  (DX)(CX*1), DI
50	MOVL  SI, 20(SP)
51	MOVQ  $0x0000cf1bbcdcbf9b, R9
52	MOVQ  DI, R10
53	MOVQ  DI, R11
54	SHRQ  $0x08, R11
55	SHLQ  $0x10, R10
56	IMULQ R9, R10
57	SHRQ  $0x32, R10
58	SHLQ  $0x10, R11
59	IMULQ R9, R11
60	SHRQ  $0x32, R11
61	MOVL  24(SP)(R10*4), SI
62	MOVL  24(SP)(R11*4), R8
63	MOVL  CX, 24(SP)(R10*4)
64	LEAL  1(CX), R10
65	MOVL  R10, 24(SP)(R11*4)
66	MOVQ  DI, R10
67	SHRQ  $0x10, R10
68	SHLQ  $0x10, R10
69	IMULQ R9, R10
70	SHRQ  $0x32, R10
71	MOVL  CX, R9
72	SUBL  16(SP), R9
73	MOVL  1(DX)(R9*1), R11
74	MOVQ  DI, R9
75	SHRQ  $0x08, R9
76	CMPL  R9, R11
77	JNE   no_repeat_found_encodeBlockAsm
78	LEAL  1(CX), DI
79	MOVL  12(SP), R8
80	MOVL  DI, SI
81	SUBL  16(SP), SI
82	JZ    repeat_extend_back_end_encodeBlockAsm
83
84repeat_extend_back_loop_encodeBlockAsm:
85	CMPL DI, R8
86	JLE  repeat_extend_back_end_encodeBlockAsm
87	MOVB -1(DX)(SI*1), BL
88	MOVB -1(DX)(DI*1), R9
89	CMPB BL, R9
90	JNE  repeat_extend_back_end_encodeBlockAsm
91	LEAL -1(DI), DI
92	DECL SI
93	JNZ  repeat_extend_back_loop_encodeBlockAsm
94
95repeat_extend_back_end_encodeBlockAsm:
96	MOVL 12(SP), SI
97	CMPL SI, DI
98	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm
99	MOVL DI, R9
100	MOVL DI, 12(SP)
101	LEAQ (DX)(SI*1), R10
102	SUBL SI, R9
103	LEAL -1(R9), SI
104	CMPL SI, $0x3c
105	JLT  one_byte_repeat_emit_encodeBlockAsm
106	CMPL SI, $0x00000100
107	JLT  two_bytes_repeat_emit_encodeBlockAsm
108	CMPL SI, $0x00010000
109	JLT  three_bytes_repeat_emit_encodeBlockAsm
110	CMPL SI, $0x01000000
111	JLT  four_bytes_repeat_emit_encodeBlockAsm
112	MOVB $0xfc, (AX)
113	MOVL SI, 1(AX)
114	ADDQ $0x05, AX
115	JMP  memmove_long_repeat_emit_encodeBlockAsm
116
117four_bytes_repeat_emit_encodeBlockAsm:
118	MOVL SI, R11
119	SHRL $0x10, R11
120	MOVB $0xf8, (AX)
121	MOVW SI, 1(AX)
122	MOVB R11, 3(AX)
123	ADDQ $0x04, AX
124	JMP  memmove_long_repeat_emit_encodeBlockAsm
125
126three_bytes_repeat_emit_encodeBlockAsm:
127	MOVB $0xf4, (AX)
128	MOVW SI, 1(AX)
129	ADDQ $0x03, AX
130	JMP  memmove_long_repeat_emit_encodeBlockAsm
131
132two_bytes_repeat_emit_encodeBlockAsm:
133	MOVB $0xf0, (AX)
134	MOVB SI, 1(AX)
135	ADDQ $0x02, AX
136	CMPL SI, $0x40
137	JL   memmove_repeat_emit_encodeBlockAsm
138	JMP  memmove_long_repeat_emit_encodeBlockAsm
139
140one_byte_repeat_emit_encodeBlockAsm:
141	SHLB $0x02, SI
142	MOVB SI, (AX)
143	ADDQ $0x01, AX
144
145memmove_repeat_emit_encodeBlockAsm:
146	LEAQ (AX)(R9*1), SI
147
148	// genMemMoveShort
149	CMPQ R9, $0x03
150	JB   emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_1or2
151	JE   emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_3
152	CMPQ R9, $0x08
153	JB   emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_4through7
154	CMPQ R9, $0x10
155	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
156	CMPQ R9, $0x20
157	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
158	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64
159
160emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_1or2:
161	MOVB (R10), R11
162	MOVB -1(R10)(R9*1), R10
163	MOVB R11, (AX)
164	MOVB R10, -1(AX)(R9*1)
165	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm
166
167emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_3:
168	MOVW (R10), R11
169	MOVB 2(R10), R10
170	MOVW R11, (AX)
171	MOVB R10, 2(AX)
172	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm
173
174emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_4through7:
175	MOVL (R10), R11
176	MOVL -4(R10)(R9*1), R10
177	MOVL R11, (AX)
178	MOVL R10, -4(AX)(R9*1)
179	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm
180
181emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
182	MOVQ (R10), R11
183	MOVQ -8(R10)(R9*1), R10
184	MOVQ R11, (AX)
185	MOVQ R10, -8(AX)(R9*1)
186	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm
187
188emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
189	MOVOU (R10), X0
190	MOVOU -16(R10)(R9*1), X1
191	MOVOU X0, (AX)
192	MOVOU X1, -16(AX)(R9*1)
193	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm
194
195emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
196	MOVOU (R10), X0
197	MOVOU 16(R10), X1
198	MOVOU -32(R10)(R9*1), X2
199	MOVOU -16(R10)(R9*1), X3
200	MOVOU X0, (AX)
201	MOVOU X1, 16(AX)
202	MOVOU X2, -32(AX)(R9*1)
203	MOVOU X3, -16(AX)(R9*1)
204
205memmove_end_copy_repeat_emit_encodeBlockAsm:
206	MOVQ SI, AX
207	JMP  emit_literal_done_repeat_emit_encodeBlockAsm
208
209memmove_long_repeat_emit_encodeBlockAsm:
210	LEAQ (AX)(R9*1), SI
211
212	// genMemMoveLong
213	MOVOU (R10), X0
214	MOVOU 16(R10), X1
215	MOVOU -32(R10)(R9*1), X2
216	MOVOU -16(R10)(R9*1), X3
217	MOVQ  R9, R12
218	SHRQ  $0x05, R12
219	MOVQ  AX, R11
220	ANDL  $0x0000001f, R11
221	MOVQ  $0x00000040, R13
222	SUBQ  R11, R13
223	DECQ  R12
224	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
225	LEAQ  -32(R10)(R13*1), R11
226	LEAQ  -32(AX)(R13*1), R14
227
228emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
229	MOVOU (R11), X4
230	MOVOU 16(R11), X5
231	MOVOA X4, (R14)
232	MOVOA X5, 16(R14)
233	ADDQ  $0x20, R14
234	ADDQ  $0x20, R11
235	ADDQ  $0x20, R13
236	DECQ  R12
237	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back
238
239emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
240	MOVOU -32(R10)(R13*1), X4
241	MOVOU -16(R10)(R13*1), X5
242	MOVOA X4, -32(AX)(R13*1)
243	MOVOA X5, -16(AX)(R13*1)
244	ADDQ  $0x20, R13
245	CMPQ  R9, R13
246	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
247	MOVOU X0, (AX)
248	MOVOU X1, 16(AX)
249	MOVOU X2, -32(AX)(R9*1)
250	MOVOU X3, -16(AX)(R9*1)
251	MOVQ  SI, AX
252
253emit_literal_done_repeat_emit_encodeBlockAsm:
254	ADDL $0x05, CX
255	MOVL CX, SI
256	SUBL 16(SP), SI
257	MOVQ src_len+32(FP), R9
258	SUBL CX, R9
259	LEAQ (DX)(CX*1), R10
260	LEAQ (DX)(SI*1), SI
261
262	// matchLen
263	XORL R12, R12
264	CMPL R9, $0x08
265	JL   matchlen_single_repeat_extend_encodeBlockAsm
266
267matchlen_loopback_repeat_extend_encodeBlockAsm:
268	MOVQ  (R10)(R12*1), R11
269	XORQ  (SI)(R12*1), R11
270	TESTQ R11, R11
271	JZ    matchlen_loop_repeat_extend_encodeBlockAsm
272	BSFQ  R11, R11
273	SARQ  $0x03, R11
274	LEAL  (R12)(R11*1), R12
275	JMP   repeat_extend_forward_end_encodeBlockAsm
276
277matchlen_loop_repeat_extend_encodeBlockAsm:
278	LEAL -8(R9), R9
279	LEAL 8(R12), R12
280	CMPL R9, $0x08
281	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm
282
283matchlen_single_repeat_extend_encodeBlockAsm:
284	TESTL R9, R9
285	JZ    repeat_extend_forward_end_encodeBlockAsm
286
287matchlen_single_loopback_repeat_extend_encodeBlockAsm:
288	MOVB (R10)(R12*1), R11
289	CMPB (SI)(R12*1), R11
290	JNE  repeat_extend_forward_end_encodeBlockAsm
291	LEAL 1(R12), R12
292	DECL R9
293	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm
294
295repeat_extend_forward_end_encodeBlockAsm:
296	ADDL  R12, CX
297	MOVL  CX, SI
298	SUBL  DI, SI
299	MOVL  16(SP), DI
300	TESTL R8, R8
301	JZ    repeat_as_copy_encodeBlockAsm
302
303	// emitRepeat
304emit_repeat_again_match_repeat_encodeBlockAsm:
305	MOVL SI, R8
306	LEAL -4(SI), SI
307	CMPL R8, $0x08
308	JLE  repeat_two_match_repeat_encodeBlockAsm
309	CMPL R8, $0x0c
310	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm
311	CMPL DI, $0x00000800
312	JLT  repeat_two_offset_match_repeat_encodeBlockAsm
313
314cant_repeat_two_offset_match_repeat_encodeBlockAsm:
315	CMPL SI, $0x00000104
316	JLT  repeat_three_match_repeat_encodeBlockAsm
317	CMPL SI, $0x00010100
318	JLT  repeat_four_match_repeat_encodeBlockAsm
319	CMPL SI, $0x0100ffff
320	JLT  repeat_five_match_repeat_encodeBlockAsm
321	LEAL -16842747(SI), SI
322	MOVW $0x001d, (AX)
323	MOVW $0xfffb, 2(AX)
324	MOVB $0xff, 4(AX)
325	ADDQ $0x05, AX
326	JMP  emit_repeat_again_match_repeat_encodeBlockAsm
327
328repeat_five_match_repeat_encodeBlockAsm:
329	LEAL -65536(SI), SI
330	MOVL SI, DI
331	MOVW $0x001d, (AX)
332	MOVW SI, 2(AX)
333	SARL $0x10, DI
334	MOVB DI, 4(AX)
335	ADDQ $0x05, AX
336	JMP  repeat_end_emit_encodeBlockAsm
337
338repeat_four_match_repeat_encodeBlockAsm:
339	LEAL -256(SI), SI
340	MOVW $0x0019, (AX)
341	MOVW SI, 2(AX)
342	ADDQ $0x04, AX
343	JMP  repeat_end_emit_encodeBlockAsm
344
345repeat_three_match_repeat_encodeBlockAsm:
346	LEAL -4(SI), SI
347	MOVW $0x0015, (AX)
348	MOVB SI, 2(AX)
349	ADDQ $0x03, AX
350	JMP  repeat_end_emit_encodeBlockAsm
351
352repeat_two_match_repeat_encodeBlockAsm:
353	SHLL $0x02, SI
354	ORL  $0x01, SI
355	MOVW SI, (AX)
356	ADDQ $0x02, AX
357	JMP  repeat_end_emit_encodeBlockAsm
358
359repeat_two_offset_match_repeat_encodeBlockAsm:
360	XORQ R8, R8
361	LEAL 1(R8)(SI*4), SI
362	MOVB DI, 1(AX)
363	SARL $0x08, DI
364	SHLL $0x05, DI
365	ORL  DI, SI
366	MOVB SI, (AX)
367	ADDQ $0x02, AX
368	JMP  repeat_end_emit_encodeBlockAsm
369
370repeat_as_copy_encodeBlockAsm:
371	// emitCopy
372	CMPL DI, $0x00010000
373	JL   two_byte_offset_repeat_as_copy_encodeBlockAsm
374
375four_bytes_loop_back_repeat_as_copy_encodeBlockAsm:
376	CMPL SI, $0x40
377	JLE  four_bytes_remain_repeat_as_copy_encodeBlockAsm
378	MOVB $0xff, (AX)
379	MOVL DI, 1(AX)
380	LEAL -64(SI), SI
381	ADDQ $0x05, AX
382	CMPL SI, $0x04
383	JL   four_bytes_remain_repeat_as_copy_encodeBlockAsm
384
385	// emitRepeat
386emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
387	MOVL SI, R8
388	LEAL -4(SI), SI
389	CMPL R8, $0x08
390	JLE  repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
391	CMPL R8, $0x0c
392	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
393	CMPL DI, $0x00000800
394	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
395
396cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
397	CMPL SI, $0x00000104
398	JLT  repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
399	CMPL SI, $0x00010100
400	JLT  repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
401	CMPL SI, $0x0100ffff
402	JLT  repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
403	LEAL -16842747(SI), SI
404	MOVW $0x001d, (AX)
405	MOVW $0xfffb, 2(AX)
406	MOVB $0xff, 4(AX)
407	ADDQ $0x05, AX
408	JMP  emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy
409
410repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
411	LEAL -65536(SI), SI
412	MOVL SI, DI
413	MOVW $0x001d, (AX)
414	MOVW SI, 2(AX)
415	SARL $0x10, DI
416	MOVB DI, 4(AX)
417	ADDQ $0x05, AX
418	JMP  repeat_end_emit_encodeBlockAsm
419
420repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
421	LEAL -256(SI), SI
422	MOVW $0x0019, (AX)
423	MOVW SI, 2(AX)
424	ADDQ $0x04, AX
425	JMP  repeat_end_emit_encodeBlockAsm
426
427repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
428	LEAL -4(SI), SI
429	MOVW $0x0015, (AX)
430	MOVB SI, 2(AX)
431	ADDQ $0x03, AX
432	JMP  repeat_end_emit_encodeBlockAsm
433
434repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
435	SHLL $0x02, SI
436	ORL  $0x01, SI
437	MOVW SI, (AX)
438	ADDQ $0x02, AX
439	JMP  repeat_end_emit_encodeBlockAsm
440
441repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
442	XORQ R8, R8
443	LEAL 1(R8)(SI*4), SI
444	MOVB DI, 1(AX)
445	SARL $0x08, DI
446	SHLL $0x05, DI
447	ORL  DI, SI
448	MOVB SI, (AX)
449	ADDQ $0x02, AX
450	JMP  repeat_end_emit_encodeBlockAsm
451	JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm
452
453four_bytes_remain_repeat_as_copy_encodeBlockAsm:
454	TESTL SI, SI
455	JZ    repeat_end_emit_encodeBlockAsm
456	MOVB  $0x03, BL
457	LEAL  -4(BX)(SI*4), SI
458	MOVB  SI, (AX)
459	MOVL  DI, 1(AX)
460	ADDQ  $0x05, AX
461	JMP   repeat_end_emit_encodeBlockAsm
462
463two_byte_offset_repeat_as_copy_encodeBlockAsm:
464	CMPL SI, $0x40
465	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm
466	MOVB $0xee, (AX)
467	MOVW DI, 1(AX)
468	LEAL -60(SI), SI
469	ADDQ $0x03, AX
470
471	// emitRepeat
472emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
473	MOVL SI, R8
474	LEAL -4(SI), SI
475	CMPL R8, $0x08
476	JLE  repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
477	CMPL R8, $0x0c
478	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
479	CMPL DI, $0x00000800
480	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
481
482cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
483	CMPL SI, $0x00000104
484	JLT  repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
485	CMPL SI, $0x00010100
486	JLT  repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
487	CMPL SI, $0x0100ffff
488	JLT  repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
489	LEAL -16842747(SI), SI
490	MOVW $0x001d, (AX)
491	MOVW $0xfffb, 2(AX)
492	MOVB $0xff, 4(AX)
493	ADDQ $0x05, AX
494	JMP  emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short
495
496repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
497	LEAL -65536(SI), SI
498	MOVL SI, DI
499	MOVW $0x001d, (AX)
500	MOVW SI, 2(AX)
501	SARL $0x10, DI
502	MOVB DI, 4(AX)
503	ADDQ $0x05, AX
504	JMP  repeat_end_emit_encodeBlockAsm
505
506repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
507	LEAL -256(SI), SI
508	MOVW $0x0019, (AX)
509	MOVW SI, 2(AX)
510	ADDQ $0x04, AX
511	JMP  repeat_end_emit_encodeBlockAsm
512
513repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
514	LEAL -4(SI), SI
515	MOVW $0x0015, (AX)
516	MOVB SI, 2(AX)
517	ADDQ $0x03, AX
518	JMP  repeat_end_emit_encodeBlockAsm
519
520repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
521	SHLL $0x02, SI
522	ORL  $0x01, SI
523	MOVW SI, (AX)
524	ADDQ $0x02, AX
525	JMP  repeat_end_emit_encodeBlockAsm
526
527repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
528	XORQ R8, R8
529	LEAL 1(R8)(SI*4), SI
530	MOVB DI, 1(AX)
531	SARL $0x08, DI
532	SHLL $0x05, DI
533	ORL  DI, SI
534	MOVB SI, (AX)
535	ADDQ $0x02, AX
536	JMP  repeat_end_emit_encodeBlockAsm
537	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm
538
539two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
540	CMPL SI, $0x0c
541	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm
542	CMPL DI, $0x00000800
543	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm
544	MOVB $0x01, BL
545	LEAL -16(BX)(SI*4), SI
546	MOVB DI, 1(AX)
547	SHRL $0x08, DI
548	SHLL $0x05, DI
549	ORL  DI, SI
550	MOVB SI, (AX)
551	ADDQ $0x02, AX
552	JMP  repeat_end_emit_encodeBlockAsm
553
554emit_copy_three_repeat_as_copy_encodeBlockAsm:
555	MOVB $0x02, BL
556	LEAL -4(BX)(SI*4), SI
557	MOVB SI, (AX)
558	MOVW DI, 1(AX)
559	ADDQ $0x03, AX
560
561repeat_end_emit_encodeBlockAsm:
562	MOVL CX, 12(SP)
563	JMP  search_loop_encodeBlockAsm
564
565no_repeat_found_encodeBlockAsm:
566	CMPL (DX)(SI*1), DI
567	JEQ  candidate_match_encodeBlockAsm
568	SHRQ $0x08, DI
569	MOVL 24(SP)(R10*4), SI
570	LEAL 2(CX), R9
571	CMPL (DX)(R8*1), DI
572	JEQ  candidate2_match_encodeBlockAsm
573	MOVL R9, 24(SP)(R10*4)
574	SHRQ $0x08, DI
575	CMPL (DX)(SI*1), DI
576	JEQ  candidate3_match_encodeBlockAsm
577	MOVL 20(SP), CX
578	JMP  search_loop_encodeBlockAsm
579
580candidate3_match_encodeBlockAsm:
581	ADDL $0x02, CX
582	JMP  candidate_match_encodeBlockAsm
583
584candidate2_match_encodeBlockAsm:
585	MOVL R9, 24(SP)(R10*4)
586	INCL CX
587	MOVL R8, SI
588
589candidate_match_encodeBlockAsm:
590	MOVL  12(SP), DI
591	TESTL SI, SI
592	JZ    match_extend_back_end_encodeBlockAsm
593
594match_extend_back_loop_encodeBlockAsm:
595	CMPL CX, DI
596	JLE  match_extend_back_end_encodeBlockAsm
597	MOVB -1(DX)(SI*1), BL
598	MOVB -1(DX)(CX*1), R8
599	CMPB BL, R8
600	JNE  match_extend_back_end_encodeBlockAsm
601	LEAL -1(CX), CX
602	DECL SI
603	JZ   match_extend_back_end_encodeBlockAsm
604	JMP  match_extend_back_loop_encodeBlockAsm
605
606match_extend_back_end_encodeBlockAsm:
607	MOVL CX, DI
608	SUBL 12(SP), DI
609	LEAQ 5(AX)(DI*1), DI
610	CMPQ DI, (SP)
611	JL   match_dst_size_check_encodeBlockAsm
612	MOVQ $0x00000000, ret+48(FP)
613	RET
614
615match_dst_size_check_encodeBlockAsm:
616	MOVL CX, DI
617	MOVL 12(SP), R8
618	CMPL R8, DI
619	JEQ  emit_literal_done_match_emit_encodeBlockAsm
620	MOVL DI, R9
621	MOVL DI, 12(SP)
622	LEAQ (DX)(R8*1), DI
623	SUBL R8, R9
624	LEAL -1(R9), R8
625	CMPL R8, $0x3c
626	JLT  one_byte_match_emit_encodeBlockAsm
627	CMPL R8, $0x00000100
628	JLT  two_bytes_match_emit_encodeBlockAsm
629	CMPL R8, $0x00010000
630	JLT  three_bytes_match_emit_encodeBlockAsm
631	CMPL R8, $0x01000000
632	JLT  four_bytes_match_emit_encodeBlockAsm
633	MOVB $0xfc, (AX)
634	MOVL R8, 1(AX)
635	ADDQ $0x05, AX
636	JMP  memmove_long_match_emit_encodeBlockAsm
637
638four_bytes_match_emit_encodeBlockAsm:
639	MOVL R8, R10
640	SHRL $0x10, R10
641	MOVB $0xf8, (AX)
642	MOVW R8, 1(AX)
643	MOVB R10, 3(AX)
644	ADDQ $0x04, AX
645	JMP  memmove_long_match_emit_encodeBlockAsm
646
647three_bytes_match_emit_encodeBlockAsm:
648	MOVB $0xf4, (AX)
649	MOVW R8, 1(AX)
650	ADDQ $0x03, AX
651	JMP  memmove_long_match_emit_encodeBlockAsm
652
653two_bytes_match_emit_encodeBlockAsm:
654	MOVB $0xf0, (AX)
655	MOVB R8, 1(AX)
656	ADDQ $0x02, AX
657	CMPL R8, $0x40
658	JL   memmove_match_emit_encodeBlockAsm
659	JMP  memmove_long_match_emit_encodeBlockAsm
660
661one_byte_match_emit_encodeBlockAsm:
662	SHLB $0x02, R8
663	MOVB R8, (AX)
664	ADDQ $0x01, AX
665
666memmove_match_emit_encodeBlockAsm:
667	LEAQ (AX)(R9*1), R8
668
669	// genMemMoveShort
670	CMPQ R9, $0x03
671	JB   emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_1or2
672	JE   emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_3
673	CMPQ R9, $0x08
674	JB   emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_4through7
675	CMPQ R9, $0x10
676	JBE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
677	CMPQ R9, $0x20
678	JBE  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
679	JMP  emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64
680
681emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_1or2:
682	MOVB (DI), R10
683	MOVB -1(DI)(R9*1), DI
684	MOVB R10, (AX)
685	MOVB DI, -1(AX)(R9*1)
686	JMP  memmove_end_copy_match_emit_encodeBlockAsm
687
688emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_3:
689	MOVW (DI), R10
690	MOVB 2(DI), DI
691	MOVW R10, (AX)
692	MOVB DI, 2(AX)
693	JMP  memmove_end_copy_match_emit_encodeBlockAsm
694
695emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_4through7:
696	MOVL (DI), R10
697	MOVL -4(DI)(R9*1), DI
698	MOVL R10, (AX)
699	MOVL DI, -4(AX)(R9*1)
700	JMP  memmove_end_copy_match_emit_encodeBlockAsm
701
702emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
703	MOVQ (DI), R10
704	MOVQ -8(DI)(R9*1), DI
705	MOVQ R10, (AX)
706	MOVQ DI, -8(AX)(R9*1)
707	JMP  memmove_end_copy_match_emit_encodeBlockAsm
708
709emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
710	MOVOU (DI), X0
711	MOVOU -16(DI)(R9*1), X1
712	MOVOU X0, (AX)
713	MOVOU X1, -16(AX)(R9*1)
714	JMP   memmove_end_copy_match_emit_encodeBlockAsm
715
716emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
717	MOVOU (DI), X0
718	MOVOU 16(DI), X1
719	MOVOU -32(DI)(R9*1), X2
720	MOVOU -16(DI)(R9*1), X3
721	MOVOU X0, (AX)
722	MOVOU X1, 16(AX)
723	MOVOU X2, -32(AX)(R9*1)
724	MOVOU X3, -16(AX)(R9*1)
725
726memmove_end_copy_match_emit_encodeBlockAsm:
727	MOVQ R8, AX
728	JMP  emit_literal_done_match_emit_encodeBlockAsm
729
730memmove_long_match_emit_encodeBlockAsm:
731	LEAQ (AX)(R9*1), R8
732
733	// genMemMoveLong
734	MOVOU (DI), X0
735	MOVOU 16(DI), X1
736	MOVOU -32(DI)(R9*1), X2
737	MOVOU -16(DI)(R9*1), X3
738	MOVQ  R9, R11
739	SHRQ  $0x05, R11
740	MOVQ  AX, R10
741	ANDL  $0x0000001f, R10
742	MOVQ  $0x00000040, R12
743	SUBQ  R10, R12
744	DECQ  R11
745	JA    emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
746	LEAQ  -32(DI)(R12*1), R10
747	LEAQ  -32(AX)(R12*1), R13
748
749emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
750	MOVOU (R10), X4
751	MOVOU 16(R10), X5
752	MOVOA X4, (R13)
753	MOVOA X5, 16(R13)
754	ADDQ  $0x20, R13
755	ADDQ  $0x20, R10
756	ADDQ  $0x20, R12
757	DECQ  R11
758	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back
759
760emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
761	MOVOU -32(DI)(R12*1), X4
762	MOVOU -16(DI)(R12*1), X5
763	MOVOA X4, -32(AX)(R12*1)
764	MOVOA X5, -16(AX)(R12*1)
765	ADDQ  $0x20, R12
766	CMPQ  R9, R12
767	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
768	MOVOU X0, (AX)
769	MOVOU X1, 16(AX)
770	MOVOU X2, -32(AX)(R9*1)
771	MOVOU X3, -16(AX)(R9*1)
772	MOVQ  R8, AX
773
774emit_literal_done_match_emit_encodeBlockAsm:
775match_nolit_loop_encodeBlockAsm:
776	MOVL CX, DI
777	SUBL SI, DI
778	MOVL DI, 16(SP)
779	ADDL $0x04, CX
780	ADDL $0x04, SI
781	MOVQ src_len+32(FP), DI
782	SUBL CX, DI
783	LEAQ (DX)(CX*1), R8
784	LEAQ (DX)(SI*1), SI
785
786	// matchLen
787	XORL R10, R10
788	CMPL DI, $0x08
789	JL   matchlen_single_match_nolit_encodeBlockAsm
790
791matchlen_loopback_match_nolit_encodeBlockAsm:
792	MOVQ  (R8)(R10*1), R9
793	XORQ  (SI)(R10*1), R9
794	TESTQ R9, R9
795	JZ    matchlen_loop_match_nolit_encodeBlockAsm
796	BSFQ  R9, R9
797	SARQ  $0x03, R9
798	LEAL  (R10)(R9*1), R10
799	JMP   match_nolit_end_encodeBlockAsm
800
801matchlen_loop_match_nolit_encodeBlockAsm:
802	LEAL -8(DI), DI
803	LEAL 8(R10), R10
804	CMPL DI, $0x08
805	JGE  matchlen_loopback_match_nolit_encodeBlockAsm
806
807matchlen_single_match_nolit_encodeBlockAsm:
808	TESTL DI, DI
809	JZ    match_nolit_end_encodeBlockAsm
810
811matchlen_single_loopback_match_nolit_encodeBlockAsm:
812	MOVB (R8)(R10*1), R9
813	CMPB (SI)(R10*1), R9
814	JNE  match_nolit_end_encodeBlockAsm
815	LEAL 1(R10), R10
816	DECL DI
817	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm
818
819match_nolit_end_encodeBlockAsm:
820	ADDL R10, CX
821	MOVL 16(SP), SI
822	ADDL $0x04, R10
823	MOVL CX, 12(SP)
824
825	// emitCopy
826	CMPL SI, $0x00010000
827	JL   two_byte_offset_match_nolit_encodeBlockAsm
828
829four_bytes_loop_back_match_nolit_encodeBlockAsm:
830	CMPL R10, $0x40
831	JLE  four_bytes_remain_match_nolit_encodeBlockAsm
832	MOVB $0xff, (AX)
833	MOVL SI, 1(AX)
834	LEAL -64(R10), R10
835	ADDQ $0x05, AX
836	CMPL R10, $0x04
837	JL   four_bytes_remain_match_nolit_encodeBlockAsm
838
839	// emitRepeat
840emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
841	MOVL R10, DI
842	LEAL -4(R10), R10
843	CMPL DI, $0x08
844	JLE  repeat_two_match_nolit_encodeBlockAsm_emit_copy
845	CMPL DI, $0x0c
846	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
847	CMPL SI, $0x00000800
848	JLT  repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
849
850cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
851	CMPL R10, $0x00000104
852	JLT  repeat_three_match_nolit_encodeBlockAsm_emit_copy
853	CMPL R10, $0x00010100
854	JLT  repeat_four_match_nolit_encodeBlockAsm_emit_copy
855	CMPL R10, $0x0100ffff
856	JLT  repeat_five_match_nolit_encodeBlockAsm_emit_copy
857	LEAL -16842747(R10), R10
858	MOVW $0x001d, (AX)
859	MOVW $0xfffb, 2(AX)
860	MOVB $0xff, 4(AX)
861	ADDQ $0x05, AX
862	JMP  emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy
863
864repeat_five_match_nolit_encodeBlockAsm_emit_copy:
865	LEAL -65536(R10), R10
866	MOVL R10, SI
867	MOVW $0x001d, (AX)
868	MOVW R10, 2(AX)
869	SARL $0x10, SI
870	MOVB SI, 4(AX)
871	ADDQ $0x05, AX
872	JMP  match_nolit_emitcopy_end_encodeBlockAsm
873
874repeat_four_match_nolit_encodeBlockAsm_emit_copy:
875	LEAL -256(R10), R10
876	MOVW $0x0019, (AX)
877	MOVW R10, 2(AX)
878	ADDQ $0x04, AX
879	JMP  match_nolit_emitcopy_end_encodeBlockAsm
880
881repeat_three_match_nolit_encodeBlockAsm_emit_copy:
882	LEAL -4(R10), R10
883	MOVW $0x0015, (AX)
884	MOVB R10, 2(AX)
885	ADDQ $0x03, AX
886	JMP  match_nolit_emitcopy_end_encodeBlockAsm
887
888repeat_two_match_nolit_encodeBlockAsm_emit_copy:
889	SHLL $0x02, R10
890	ORL  $0x01, R10
891	MOVW R10, (AX)
892	ADDQ $0x02, AX
893	JMP  match_nolit_emitcopy_end_encodeBlockAsm
894
895repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
896	XORQ DI, DI
897	LEAL 1(DI)(R10*4), R10
898	MOVB SI, 1(AX)
899	SARL $0x08, SI
900	SHLL $0x05, SI
901	ORL  SI, R10
902	MOVB R10, (AX)
903	ADDQ $0x02, AX
904	JMP  match_nolit_emitcopy_end_encodeBlockAsm
905	JMP four_bytes_loop_back_match_nolit_encodeBlockAsm
906
907four_bytes_remain_match_nolit_encodeBlockAsm:
908	TESTL R10, R10
909	JZ    match_nolit_emitcopy_end_encodeBlockAsm
910	MOVB  $0x03, BL
911	LEAL  -4(BX)(R10*4), R10
912	MOVB  R10, (AX)
913	MOVL  SI, 1(AX)
914	ADDQ  $0x05, AX
915	JMP   match_nolit_emitcopy_end_encodeBlockAsm
916
917two_byte_offset_match_nolit_encodeBlockAsm:
918	CMPL R10, $0x40
919	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm
920	MOVB $0xee, (AX)
921	MOVW SI, 1(AX)
922	LEAL -60(R10), R10
923	ADDQ $0x03, AX
924
925	// emitRepeat
926emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
927	MOVL R10, DI
928	LEAL -4(R10), R10
929	CMPL DI, $0x08
930	JLE  repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
931	CMPL DI, $0x0c
932	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
933	CMPL SI, $0x00000800
934	JLT  repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
935
936cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
937	CMPL R10, $0x00000104
938	JLT  repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
939	CMPL R10, $0x00010100
940	JLT  repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
941	CMPL R10, $0x0100ffff
942	JLT  repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
943	LEAL -16842747(R10), R10
944	MOVW $0x001d, (AX)
945	MOVW $0xfffb, 2(AX)
946	MOVB $0xff, 4(AX)
947	ADDQ $0x05, AX
948	JMP  emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short
949
950repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
951	LEAL -65536(R10), R10
952	MOVL R10, SI
953	MOVW $0x001d, (AX)
954	MOVW R10, 2(AX)
955	SARL $0x10, SI
956	MOVB SI, 4(AX)
957	ADDQ $0x05, AX
958	JMP  match_nolit_emitcopy_end_encodeBlockAsm
959
960repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
961	LEAL -256(R10), R10
962	MOVW $0x0019, (AX)
963	MOVW R10, 2(AX)
964	ADDQ $0x04, AX
965	JMP  match_nolit_emitcopy_end_encodeBlockAsm
966
967repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
968	LEAL -4(R10), R10
969	MOVW $0x0015, (AX)
970	MOVB R10, 2(AX)
971	ADDQ $0x03, AX
972	JMP  match_nolit_emitcopy_end_encodeBlockAsm
973
974repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
975	SHLL $0x02, R10
976	ORL  $0x01, R10
977	MOVW R10, (AX)
978	ADDQ $0x02, AX
979	JMP  match_nolit_emitcopy_end_encodeBlockAsm
980
981repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
982	XORQ DI, DI
983	LEAL 1(DI)(R10*4), R10
984	MOVB SI, 1(AX)
985	SARL $0x08, SI
986	SHLL $0x05, SI
987	ORL  SI, R10
988	MOVB R10, (AX)
989	ADDQ $0x02, AX
990	JMP  match_nolit_emitcopy_end_encodeBlockAsm
991	JMP two_byte_offset_match_nolit_encodeBlockAsm
992
993two_byte_offset_short_match_nolit_encodeBlockAsm:
994	CMPL R10, $0x0c
995	JGE  emit_copy_three_match_nolit_encodeBlockAsm
996	CMPL SI, $0x00000800
997	JGE  emit_copy_three_match_nolit_encodeBlockAsm
998	MOVB $0x01, BL
999	LEAL -16(BX)(R10*4), R10
1000	MOVB SI, 1(AX)
1001	SHRL $0x08, SI
1002	SHLL $0x05, SI
1003	ORL  SI, R10
1004	MOVB R10, (AX)
1005	ADDQ $0x02, AX
1006	JMP  match_nolit_emitcopy_end_encodeBlockAsm
1007
1008emit_copy_three_match_nolit_encodeBlockAsm:
1009	MOVB $0x02, BL
1010	LEAL -4(BX)(R10*4), R10
1011	MOVB R10, (AX)
1012	MOVW SI, 1(AX)
1013	ADDQ $0x03, AX
1014
1015match_nolit_emitcopy_end_encodeBlockAsm:
1016	CMPL CX, 8(SP)
1017	JGE  emit_remainder_encodeBlockAsm
1018	MOVQ -2(DX)(CX*1), DI
1019	CMPQ AX, (SP)
1020	JL   match_nolit_dst_ok_encodeBlockAsm
1021	MOVQ $0x00000000, ret+48(FP)
1022	RET
1023
1024match_nolit_dst_ok_encodeBlockAsm:
1025	MOVQ  $0x0000cf1bbcdcbf9b, R9
1026	MOVQ  DI, R8
1027	SHRQ  $0x10, DI
1028	MOVQ  DI, SI
1029	SHLQ  $0x10, R8
1030	IMULQ R9, R8
1031	SHRQ  $0x32, R8
1032	SHLQ  $0x10, SI
1033	IMULQ R9, SI
1034	SHRQ  $0x32, SI
1035	LEAL  -2(CX), R9
1036	LEAQ  24(SP)(SI*4), R10
1037	MOVL  (R10), SI
1038	MOVL  R9, 24(SP)(R8*4)
1039	MOVL  CX, (R10)
1040	CMPL  (DX)(SI*1), DI
1041	JEQ   match_nolit_loop_encodeBlockAsm
1042	INCL  CX
1043	JMP   search_loop_encodeBlockAsm
1044
1045emit_remainder_encodeBlockAsm:
1046	MOVQ src_len+32(FP), CX
1047	SUBL 12(SP), CX
1048	LEAQ 5(AX)(CX*1), CX
1049	CMPQ CX, (SP)
1050	JL   emit_remainder_ok_encodeBlockAsm
1051	MOVQ $0x00000000, ret+48(FP)
1052	RET
1053
1054emit_remainder_ok_encodeBlockAsm:
1055	MOVQ src_len+32(FP), CX
1056	MOVL 12(SP), BX
1057	CMPL BX, CX
1058	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm
1059	MOVL CX, SI
1060	MOVL CX, 12(SP)
1061	LEAQ (DX)(BX*1), CX
1062	SUBL BX, SI
1063	LEAL -1(SI), DX
1064	CMPL DX, $0x3c
1065	JLT  one_byte_emit_remainder_encodeBlockAsm
1066	CMPL DX, $0x00000100
1067	JLT  two_bytes_emit_remainder_encodeBlockAsm
1068	CMPL DX, $0x00010000
1069	JLT  three_bytes_emit_remainder_encodeBlockAsm
1070	CMPL DX, $0x01000000
1071	JLT  four_bytes_emit_remainder_encodeBlockAsm
1072	MOVB $0xfc, (AX)
1073	MOVL DX, 1(AX)
1074	ADDQ $0x05, AX
1075	JMP  memmove_long_emit_remainder_encodeBlockAsm
1076
1077four_bytes_emit_remainder_encodeBlockAsm:
1078	MOVL DX, BX
1079	SHRL $0x10, BX
1080	MOVB $0xf8, (AX)
1081	MOVW DX, 1(AX)
1082	MOVB BL, 3(AX)
1083	ADDQ $0x04, AX
1084	JMP  memmove_long_emit_remainder_encodeBlockAsm
1085
1086three_bytes_emit_remainder_encodeBlockAsm:
1087	MOVB $0xf4, (AX)
1088	MOVW DX, 1(AX)
1089	ADDQ $0x03, AX
1090	JMP  memmove_long_emit_remainder_encodeBlockAsm
1091
1092two_bytes_emit_remainder_encodeBlockAsm:
1093	MOVB $0xf0, (AX)
1094	MOVB DL, 1(AX)
1095	ADDQ $0x02, AX
1096	CMPL DX, $0x40
1097	JL   memmove_emit_remainder_encodeBlockAsm
1098	JMP  memmove_long_emit_remainder_encodeBlockAsm
1099
1100one_byte_emit_remainder_encodeBlockAsm:
1101	SHLB $0x02, DL
1102	MOVB DL, (AX)
1103	ADDQ $0x01, AX
1104
1105memmove_emit_remainder_encodeBlockAsm:
1106	LEAQ (AX)(SI*1), DX
1107	MOVL SI, BX
1108
1109	// genMemMoveShort
1110	CMPQ BX, $0x03
1111	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2
1112	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3
1113	CMPQ BX, $0x08
1114	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7
1115	CMPQ BX, $0x10
1116	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
1117	CMPQ BX, $0x20
1118	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
1119	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64
1120
1121emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2:
1122	MOVB (CX), SI
1123	MOVB -1(CX)(BX*1), CL
1124	MOVB SI, (AX)
1125	MOVB CL, -1(AX)(BX*1)
1126	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm
1127
1128emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3:
1129	MOVW (CX), SI
1130	MOVB 2(CX), CL
1131	MOVW SI, (AX)
1132	MOVB CL, 2(AX)
1133	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm
1134
1135emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7:
1136	MOVL (CX), SI
1137	MOVL -4(CX)(BX*1), CX
1138	MOVL SI, (AX)
1139	MOVL CX, -4(AX)(BX*1)
1140	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm
1141
1142emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
1143	MOVQ (CX), SI
1144	MOVQ -8(CX)(BX*1), CX
1145	MOVQ SI, (AX)
1146	MOVQ CX, -8(AX)(BX*1)
1147	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm
1148
1149emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
1150	MOVOU (CX), X0
1151	MOVOU -16(CX)(BX*1), X1
1152	MOVOU X0, (AX)
1153	MOVOU X1, -16(AX)(BX*1)
1154	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm
1155
1156emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
1157	MOVOU (CX), X0
1158	MOVOU 16(CX), X1
1159	MOVOU -32(CX)(BX*1), X2
1160	MOVOU -16(CX)(BX*1), X3
1161	MOVOU X0, (AX)
1162	MOVOU X1, 16(AX)
1163	MOVOU X2, -32(AX)(BX*1)
1164	MOVOU X3, -16(AX)(BX*1)
1165
1166memmove_end_copy_emit_remainder_encodeBlockAsm:
1167	MOVQ DX, AX
1168	JMP  emit_literal_done_emit_remainder_encodeBlockAsm
1169
1170memmove_long_emit_remainder_encodeBlockAsm:
1171	LEAQ (AX)(SI*1), DX
1172	MOVL SI, BX
1173
1174	// genMemMoveLong
1175	MOVOU (CX), X0
1176	MOVOU 16(CX), X1
1177	MOVOU -32(CX)(BX*1), X2
1178	MOVOU -16(CX)(BX*1), X3
1179	MOVQ  BX, DI
1180	SHRQ  $0x05, DI
1181	MOVQ  AX, SI
1182	ANDL  $0x0000001f, SI
1183	MOVQ  $0x00000040, R8
1184	SUBQ  SI, R8
1185	DECQ  DI
1186	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
1187	LEAQ  -32(CX)(R8*1), SI
1188	LEAQ  -32(AX)(R8*1), R9
1189
1190emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
1191	MOVOU (SI), X4
1192	MOVOU 16(SI), X5
1193	MOVOA X4, (R9)
1194	MOVOA X5, 16(R9)
1195	ADDQ  $0x20, R9
1196	ADDQ  $0x20, SI
1197	ADDQ  $0x20, R8
1198	DECQ  DI
1199	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back
1200
1201emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
1202	MOVOU -32(CX)(R8*1), X4
1203	MOVOU -16(CX)(R8*1), X5
1204	MOVOA X4, -32(AX)(R8*1)
1205	MOVOA X5, -16(AX)(R8*1)
1206	ADDQ  $0x20, R8
1207	CMPQ  BX, R8
1208	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
1209	MOVOU X0, (AX)
1210	MOVOU X1, 16(AX)
1211	MOVOU X2, -32(AX)(BX*1)
1212	MOVOU X3, -16(AX)(BX*1)
1213	MOVQ  DX, AX
1214
1215emit_literal_done_emit_remainder_encodeBlockAsm:
1216	MOVQ dst_base+0(FP), CX
1217	SUBQ CX, AX
1218	MOVQ AX, ret+48(FP)
1219	RET
1220
1221// func encodeBlockAsm4MB(dst []byte, src []byte) int
1222// Requires: SSE2
1223TEXT ·encodeBlockAsm4MB(SB), $65560-56
1224	MOVQ dst_base+0(FP), AX
1225	MOVQ $0x00000200, CX
1226	LEAQ 24(SP), DX
1227	PXOR X0, X0
1228
1229zero_loop_encodeBlockAsm4MB:
1230	MOVOU X0, (DX)
1231	MOVOU X0, 16(DX)
1232	MOVOU X0, 32(DX)
1233	MOVOU X0, 48(DX)
1234	MOVOU X0, 64(DX)
1235	MOVOU X0, 80(DX)
1236	MOVOU X0, 96(DX)
1237	MOVOU X0, 112(DX)
1238	ADDQ  $0x80, DX
1239	DECQ  CX
1240	JNZ   zero_loop_encodeBlockAsm4MB
1241	MOVL  $0x00000000, 12(SP)
1242	MOVQ  src_len+32(FP), CX
1243	LEAQ  -5(CX), DX
1244	LEAQ  -8(CX), SI
1245	MOVL  SI, 8(SP)
1246	SHRQ  $0x05, CX
1247	SUBL  CX, DX
1248	LEAQ  (AX)(DX*1), DX
1249	MOVQ  DX, (SP)
1250	MOVL  $0x00000001, CX
1251	MOVL  CX, 16(SP)
1252	MOVQ  src_base+24(FP), DX
1253
1254search_loop_encodeBlockAsm4MB:
1255	MOVL  CX, SI
1256	SUBL  12(SP), SI
1257	SHRL  $0x06, SI
1258	LEAL  4(CX)(SI*1), SI
1259	CMPL  SI, 8(SP)
1260	JGE   emit_remainder_encodeBlockAsm4MB
1261	MOVQ  (DX)(CX*1), DI
1262	MOVL  SI, 20(SP)
1263	MOVQ  $0x0000cf1bbcdcbf9b, R9
1264	MOVQ  DI, R10
1265	MOVQ  DI, R11
1266	SHRQ  $0x08, R11
1267	SHLQ  $0x10, R10
1268	IMULQ R9, R10
1269	SHRQ  $0x32, R10
1270	SHLQ  $0x10, R11
1271	IMULQ R9, R11
1272	SHRQ  $0x32, R11
1273	MOVL  24(SP)(R10*4), SI
1274	MOVL  24(SP)(R11*4), R8
1275	MOVL  CX, 24(SP)(R10*4)
1276	LEAL  1(CX), R10
1277	MOVL  R10, 24(SP)(R11*4)
1278	MOVQ  DI, R10
1279	SHRQ  $0x10, R10
1280	SHLQ  $0x10, R10
1281	IMULQ R9, R10
1282	SHRQ  $0x32, R10
1283	MOVL  CX, R9
1284	SUBL  16(SP), R9
1285	MOVL  1(DX)(R9*1), R11
1286	MOVQ  DI, R9
1287	SHRQ  $0x08, R9
1288	CMPL  R9, R11
1289	JNE   no_repeat_found_encodeBlockAsm4MB
1290	LEAL  1(CX), DI
1291	MOVL  12(SP), R8
1292	MOVL  DI, SI
1293	SUBL  16(SP), SI
1294	JZ    repeat_extend_back_end_encodeBlockAsm4MB
1295
1296repeat_extend_back_loop_encodeBlockAsm4MB:
1297	CMPL DI, R8
1298	JLE  repeat_extend_back_end_encodeBlockAsm4MB
1299	MOVB -1(DX)(SI*1), BL
1300	MOVB -1(DX)(DI*1), R9
1301	CMPB BL, R9
1302	JNE  repeat_extend_back_end_encodeBlockAsm4MB
1303	LEAL -1(DI), DI
1304	DECL SI
1305	JNZ  repeat_extend_back_loop_encodeBlockAsm4MB
1306
1307repeat_extend_back_end_encodeBlockAsm4MB:
1308	MOVL 12(SP), SI
1309	CMPL SI, DI
1310	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm4MB
1311	MOVL DI, R9
1312	MOVL DI, 12(SP)
1313	LEAQ (DX)(SI*1), R10
1314	SUBL SI, R9
1315	LEAL -1(R9), SI
1316	CMPL SI, $0x3c
1317	JLT  one_byte_repeat_emit_encodeBlockAsm4MB
1318	CMPL SI, $0x00000100
1319	JLT  two_bytes_repeat_emit_encodeBlockAsm4MB
1320	CMPL SI, $0x00010000
1321	JLT  three_bytes_repeat_emit_encodeBlockAsm4MB
1322	MOVL SI, R11
1323	SHRL $0x10, R11
1324	MOVB $0xf8, (AX)
1325	MOVW SI, 1(AX)
1326	MOVB R11, 3(AX)
1327	ADDQ $0x04, AX
1328	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB
1329
1330three_bytes_repeat_emit_encodeBlockAsm4MB:
1331	MOVB $0xf4, (AX)
1332	MOVW SI, 1(AX)
1333	ADDQ $0x03, AX
1334	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB
1335
1336two_bytes_repeat_emit_encodeBlockAsm4MB:
1337	MOVB $0xf0, (AX)
1338	MOVB SI, 1(AX)
1339	ADDQ $0x02, AX
1340	CMPL SI, $0x40
1341	JL   memmove_repeat_emit_encodeBlockAsm4MB
1342	JMP  memmove_long_repeat_emit_encodeBlockAsm4MB
1343
1344one_byte_repeat_emit_encodeBlockAsm4MB:
1345	SHLB $0x02, SI
1346	MOVB SI, (AX)
1347	ADDQ $0x01, AX
1348
1349memmove_repeat_emit_encodeBlockAsm4MB:
1350	LEAQ (AX)(R9*1), SI
1351
1352	// genMemMoveShort
1353	CMPQ R9, $0x03
1354	JB   emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_1or2
1355	JE   emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_3
1356	CMPQ R9, $0x08
1357	JB   emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_4through7
1358	CMPQ R9, $0x10
1359	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
1360	CMPQ R9, $0x20
1361	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
1362	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64
1363
1364emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_1or2:
1365	MOVB (R10), R11
1366	MOVB -1(R10)(R9*1), R10
1367	MOVB R11, (AX)
1368	MOVB R10, -1(AX)(R9*1)
1369	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm4MB
1370
1371emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_3:
1372	MOVW (R10), R11
1373	MOVB 2(R10), R10
1374	MOVW R11, (AX)
1375	MOVB R10, 2(AX)
1376	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm4MB
1377
1378emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_4through7:
1379	MOVL (R10), R11
1380	MOVL -4(R10)(R9*1), R10
1381	MOVL R11, (AX)
1382	MOVL R10, -4(AX)(R9*1)
1383	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm4MB
1384
1385emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
1386	MOVQ (R10), R11
1387	MOVQ -8(R10)(R9*1), R10
1388	MOVQ R11, (AX)
1389	MOVQ R10, -8(AX)(R9*1)
1390	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm4MB
1391
1392emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
1393	MOVOU (R10), X0
1394	MOVOU -16(R10)(R9*1), X1
1395	MOVOU X0, (AX)
1396	MOVOU X1, -16(AX)(R9*1)
1397	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm4MB
1398
1399emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
1400	MOVOU (R10), X0
1401	MOVOU 16(R10), X1
1402	MOVOU -32(R10)(R9*1), X2
1403	MOVOU -16(R10)(R9*1), X3
1404	MOVOU X0, (AX)
1405	MOVOU X1, 16(AX)
1406	MOVOU X2, -32(AX)(R9*1)
1407	MOVOU X3, -16(AX)(R9*1)
1408
1409memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
1410	MOVQ SI, AX
1411	JMP  emit_literal_done_repeat_emit_encodeBlockAsm4MB
1412
1413memmove_long_repeat_emit_encodeBlockAsm4MB:
1414	LEAQ (AX)(R9*1), SI
1415
1416	// genMemMoveLong
1417	MOVOU (R10), X0
1418	MOVOU 16(R10), X1
1419	MOVOU -32(R10)(R9*1), X2
1420	MOVOU -16(R10)(R9*1), X3
1421	MOVQ  R9, R12
1422	SHRQ  $0x05, R12
1423	MOVQ  AX, R11
1424	ANDL  $0x0000001f, R11
1425	MOVQ  $0x00000040, R13
1426	SUBQ  R11, R13
1427	DECQ  R12
1428	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
1429	LEAQ  -32(R10)(R13*1), R11
1430	LEAQ  -32(AX)(R13*1), R14
1431
1432emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
1433	MOVOU (R11), X4
1434	MOVOU 16(R11), X5
1435	MOVOA X4, (R14)
1436	MOVOA X5, 16(R14)
1437	ADDQ  $0x20, R14
1438	ADDQ  $0x20, R11
1439	ADDQ  $0x20, R13
1440	DECQ  R12
1441	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back
1442
1443emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
1444	MOVOU -32(R10)(R13*1), X4
1445	MOVOU -16(R10)(R13*1), X5
1446	MOVOA X4, -32(AX)(R13*1)
1447	MOVOA X5, -16(AX)(R13*1)
1448	ADDQ  $0x20, R13
1449	CMPQ  R9, R13
1450	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
1451	MOVOU X0, (AX)
1452	MOVOU X1, 16(AX)
1453	MOVOU X2, -32(AX)(R9*1)
1454	MOVOU X3, -16(AX)(R9*1)
1455	MOVQ  SI, AX
1456
1457emit_literal_done_repeat_emit_encodeBlockAsm4MB:
1458	ADDL $0x05, CX
1459	MOVL CX, SI
1460	SUBL 16(SP), SI
1461	MOVQ src_len+32(FP), R9
1462	SUBL CX, R9
1463	LEAQ (DX)(CX*1), R10
1464	LEAQ (DX)(SI*1), SI
1465
1466	// matchLen
1467	XORL R12, R12
1468	CMPL R9, $0x08
1469	JL   matchlen_single_repeat_extend_encodeBlockAsm4MB
1470
1471matchlen_loopback_repeat_extend_encodeBlockAsm4MB:
1472	MOVQ  (R10)(R12*1), R11
1473	XORQ  (SI)(R12*1), R11
1474	TESTQ R11, R11
1475	JZ    matchlen_loop_repeat_extend_encodeBlockAsm4MB
1476	BSFQ  R11, R11
1477	SARQ  $0x03, R11
1478	LEAL  (R12)(R11*1), R12
1479	JMP   repeat_extend_forward_end_encodeBlockAsm4MB
1480
1481matchlen_loop_repeat_extend_encodeBlockAsm4MB:
1482	LEAL -8(R9), R9
1483	LEAL 8(R12), R12
1484	CMPL R9, $0x08
1485	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm4MB
1486
1487matchlen_single_repeat_extend_encodeBlockAsm4MB:
1488	TESTL R9, R9
1489	JZ    repeat_extend_forward_end_encodeBlockAsm4MB
1490
1491matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB:
1492	MOVB (R10)(R12*1), R11
1493	CMPB (SI)(R12*1), R11
1494	JNE  repeat_extend_forward_end_encodeBlockAsm4MB
1495	LEAL 1(R12), R12
1496	DECL R9
1497	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm4MB
1498
1499repeat_extend_forward_end_encodeBlockAsm4MB:
1500	ADDL  R12, CX
1501	MOVL  CX, SI
1502	SUBL  DI, SI
1503	MOVL  16(SP), DI
1504	TESTL R8, R8
1505	JZ    repeat_as_copy_encodeBlockAsm4MB
1506
1507	// emitRepeat
1508	MOVL SI, R8
1509	LEAL -4(SI), SI
1510	CMPL R8, $0x08
1511	JLE  repeat_two_match_repeat_encodeBlockAsm4MB
1512	CMPL R8, $0x0c
1513	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
1514	CMPL DI, $0x00000800
1515	JLT  repeat_two_offset_match_repeat_encodeBlockAsm4MB
1516
1517cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
1518	CMPL SI, $0x00000104
1519	JLT  repeat_three_match_repeat_encodeBlockAsm4MB
1520	CMPL SI, $0x00010100
1521	JLT  repeat_four_match_repeat_encodeBlockAsm4MB
1522	LEAL -65536(SI), SI
1523	MOVL SI, DI
1524	MOVW $0x001d, (AX)
1525	MOVW SI, 2(AX)
1526	SARL $0x10, DI
1527	MOVB DI, 4(AX)
1528	ADDQ $0x05, AX
1529	JMP  repeat_end_emit_encodeBlockAsm4MB
1530
1531repeat_four_match_repeat_encodeBlockAsm4MB:
1532	LEAL -256(SI), SI
1533	MOVW $0x0019, (AX)
1534	MOVW SI, 2(AX)
1535	ADDQ $0x04, AX
1536	JMP  repeat_end_emit_encodeBlockAsm4MB
1537
1538repeat_three_match_repeat_encodeBlockAsm4MB:
1539	LEAL -4(SI), SI
1540	MOVW $0x0015, (AX)
1541	MOVB SI, 2(AX)
1542	ADDQ $0x03, AX
1543	JMP  repeat_end_emit_encodeBlockAsm4MB
1544
1545repeat_two_match_repeat_encodeBlockAsm4MB:
1546	SHLL $0x02, SI
1547	ORL  $0x01, SI
1548	MOVW SI, (AX)
1549	ADDQ $0x02, AX
1550	JMP  repeat_end_emit_encodeBlockAsm4MB
1551
1552repeat_two_offset_match_repeat_encodeBlockAsm4MB:
1553	XORQ R8, R8
1554	LEAL 1(R8)(SI*4), SI
1555	MOVB DI, 1(AX)
1556	SARL $0x08, DI
1557	SHLL $0x05, DI
1558	ORL  DI, SI
1559	MOVB SI, (AX)
1560	ADDQ $0x02, AX
1561	JMP  repeat_end_emit_encodeBlockAsm4MB
1562
1563repeat_as_copy_encodeBlockAsm4MB:
1564	// emitCopy
1565	CMPL DI, $0x00010000
1566	JL   two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
1567
1568four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB:
1569	CMPL SI, $0x40
1570	JLE  four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
1571	MOVB $0xff, (AX)
1572	MOVL DI, 1(AX)
1573	LEAL -64(SI), SI
1574	ADDQ $0x05, AX
1575	CMPL SI, $0x04
1576	JL   four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
1577
1578	// emitRepeat
1579	MOVL SI, R8
1580	LEAL -4(SI), SI
1581	CMPL R8, $0x08
1582	JLE  repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1583	CMPL R8, $0x0c
1584	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1585	CMPL DI, $0x00000800
1586	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1587
1588cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1589	CMPL SI, $0x00000104
1590	JLT  repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1591	CMPL SI, $0x00010100
1592	JLT  repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1593	LEAL -65536(SI), SI
1594	MOVL SI, DI
1595	MOVW $0x001d, (AX)
1596	MOVW SI, 2(AX)
1597	SARL $0x10, DI
1598	MOVB DI, 4(AX)
1599	ADDQ $0x05, AX
1600	JMP  repeat_end_emit_encodeBlockAsm4MB
1601
1602repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1603	LEAL -256(SI), SI
1604	MOVW $0x0019, (AX)
1605	MOVW SI, 2(AX)
1606	ADDQ $0x04, AX
1607	JMP  repeat_end_emit_encodeBlockAsm4MB
1608
1609repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1610	LEAL -4(SI), SI
1611	MOVW $0x0015, (AX)
1612	MOVB SI, 2(AX)
1613	ADDQ $0x03, AX
1614	JMP  repeat_end_emit_encodeBlockAsm4MB
1615
1616repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1617	SHLL $0x02, SI
1618	ORL  $0x01, SI
1619	MOVW SI, (AX)
1620	ADDQ $0x02, AX
1621	JMP  repeat_end_emit_encodeBlockAsm4MB
1622
1623repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1624	XORQ R8, R8
1625	LEAL 1(R8)(SI*4), SI
1626	MOVB DI, 1(AX)
1627	SARL $0x08, DI
1628	SHLL $0x05, DI
1629	ORL  DI, SI
1630	MOVB SI, (AX)
1631	ADDQ $0x02, AX
1632	JMP  repeat_end_emit_encodeBlockAsm4MB
1633	JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB
1634
1635four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
1636	TESTL SI, SI
1637	JZ    repeat_end_emit_encodeBlockAsm4MB
1638	MOVB  $0x03, BL
1639	LEAL  -4(BX)(SI*4), SI
1640	MOVB  SI, (AX)
1641	MOVL  DI, 1(AX)
1642	ADDQ  $0x05, AX
1643	JMP   repeat_end_emit_encodeBlockAsm4MB
1644
1645two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
1646	CMPL SI, $0x40
1647	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
1648	MOVB $0xee, (AX)
1649	MOVW DI, 1(AX)
1650	LEAL -60(SI), SI
1651	ADDQ $0x03, AX
1652
1653	// emitRepeat
1654	MOVL SI, R8
1655	LEAL -4(SI), SI
1656	CMPL R8, $0x08
1657	JLE  repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1658	CMPL R8, $0x0c
1659	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1660	CMPL DI, $0x00000800
1661	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1662
1663cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
1664	CMPL SI, $0x00000104
1665	JLT  repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1666	CMPL SI, $0x00010100
1667	JLT  repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1668	LEAL -65536(SI), SI
1669	MOVL SI, DI
1670	MOVW $0x001d, (AX)
1671	MOVW SI, 2(AX)
1672	SARL $0x10, DI
1673	MOVB DI, 4(AX)
1674	ADDQ $0x05, AX
1675	JMP  repeat_end_emit_encodeBlockAsm4MB
1676
1677repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
1678	LEAL -256(SI), SI
1679	MOVW $0x0019, (AX)
1680	MOVW SI, 2(AX)
1681	ADDQ $0x04, AX
1682	JMP  repeat_end_emit_encodeBlockAsm4MB
1683
1684repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
1685	LEAL -4(SI), SI
1686	MOVW $0x0015, (AX)
1687	MOVB SI, 2(AX)
1688	ADDQ $0x03, AX
1689	JMP  repeat_end_emit_encodeBlockAsm4MB
1690
1691repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
1692	SHLL $0x02, SI
1693	ORL  $0x01, SI
1694	MOVW SI, (AX)
1695	ADDQ $0x02, AX
1696	JMP  repeat_end_emit_encodeBlockAsm4MB
1697
1698repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
1699	XORQ R8, R8
1700	LEAL 1(R8)(SI*4), SI
1701	MOVB DI, 1(AX)
1702	SARL $0x08, DI
1703	SHLL $0x05, DI
1704	ORL  DI, SI
1705	MOVB SI, (AX)
1706	ADDQ $0x02, AX
1707	JMP  repeat_end_emit_encodeBlockAsm4MB
1708	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
1709
1710two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
1711	CMPL SI, $0x0c
1712	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
1713	CMPL DI, $0x00000800
1714	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
1715	MOVB $0x01, BL
1716	LEAL -16(BX)(SI*4), SI
1717	MOVB DI, 1(AX)
1718	SHRL $0x08, DI
1719	SHLL $0x05, DI
1720	ORL  DI, SI
1721	MOVB SI, (AX)
1722	ADDQ $0x02, AX
1723	JMP  repeat_end_emit_encodeBlockAsm4MB
1724
1725emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
1726	MOVB $0x02, BL
1727	LEAL -4(BX)(SI*4), SI
1728	MOVB SI, (AX)
1729	MOVW DI, 1(AX)
1730	ADDQ $0x03, AX
1731
1732repeat_end_emit_encodeBlockAsm4MB:
1733	MOVL CX, 12(SP)
1734	JMP  search_loop_encodeBlockAsm4MB
1735
1736no_repeat_found_encodeBlockAsm4MB:
1737	CMPL (DX)(SI*1), DI
1738	JEQ  candidate_match_encodeBlockAsm4MB
1739	SHRQ $0x08, DI
1740	MOVL 24(SP)(R10*4), SI
1741	LEAL 2(CX), R9
1742	CMPL (DX)(R8*1), DI
1743	JEQ  candidate2_match_encodeBlockAsm4MB
1744	MOVL R9, 24(SP)(R10*4)
1745	SHRQ $0x08, DI
1746	CMPL (DX)(SI*1), DI
1747	JEQ  candidate3_match_encodeBlockAsm4MB
1748	MOVL 20(SP), CX
1749	JMP  search_loop_encodeBlockAsm4MB
1750
1751candidate3_match_encodeBlockAsm4MB:
1752	ADDL $0x02, CX
1753	JMP  candidate_match_encodeBlockAsm4MB
1754
1755candidate2_match_encodeBlockAsm4MB:
1756	MOVL R9, 24(SP)(R10*4)
1757	INCL CX
1758	MOVL R8, SI
1759
1760candidate_match_encodeBlockAsm4MB:
1761	MOVL  12(SP), DI
1762	TESTL SI, SI
1763	JZ    match_extend_back_end_encodeBlockAsm4MB
1764
1765match_extend_back_loop_encodeBlockAsm4MB:
1766	CMPL CX, DI
1767	JLE  match_extend_back_end_encodeBlockAsm4MB
1768	MOVB -1(DX)(SI*1), BL
1769	MOVB -1(DX)(CX*1), R8
1770	CMPB BL, R8
1771	JNE  match_extend_back_end_encodeBlockAsm4MB
1772	LEAL -1(CX), CX
1773	DECL SI
1774	JZ   match_extend_back_end_encodeBlockAsm4MB
1775	JMP  match_extend_back_loop_encodeBlockAsm4MB
1776
1777match_extend_back_end_encodeBlockAsm4MB:
1778	MOVL CX, DI
1779	SUBL 12(SP), DI
1780	LEAQ 4(AX)(DI*1), DI
1781	CMPQ DI, (SP)
1782	JL   match_dst_size_check_encodeBlockAsm4MB
1783	MOVQ $0x00000000, ret+48(FP)
1784	RET
1785
1786match_dst_size_check_encodeBlockAsm4MB:
1787	MOVL CX, DI
1788	MOVL 12(SP), R8
1789	CMPL R8, DI
1790	JEQ  emit_literal_done_match_emit_encodeBlockAsm4MB
1791	MOVL DI, R9
1792	MOVL DI, 12(SP)
1793	LEAQ (DX)(R8*1), DI
1794	SUBL R8, R9
1795	LEAL -1(R9), R8
1796	CMPL R8, $0x3c
1797	JLT  one_byte_match_emit_encodeBlockAsm4MB
1798	CMPL R8, $0x00000100
1799	JLT  two_bytes_match_emit_encodeBlockAsm4MB
1800	CMPL R8, $0x00010000
1801	JLT  three_bytes_match_emit_encodeBlockAsm4MB
1802	MOVL R8, R10
1803	SHRL $0x10, R10
1804	MOVB $0xf8, (AX)
1805	MOVW R8, 1(AX)
1806	MOVB R10, 3(AX)
1807	ADDQ $0x04, AX
1808	JMP  memmove_long_match_emit_encodeBlockAsm4MB
1809
1810three_bytes_match_emit_encodeBlockAsm4MB:
1811	MOVB $0xf4, (AX)
1812	MOVW R8, 1(AX)
1813	ADDQ $0x03, AX
1814	JMP  memmove_long_match_emit_encodeBlockAsm4MB
1815
1816two_bytes_match_emit_encodeBlockAsm4MB:
1817	MOVB $0xf0, (AX)
1818	MOVB R8, 1(AX)
1819	ADDQ $0x02, AX
1820	CMPL R8, $0x40
1821	JL   memmove_match_emit_encodeBlockAsm4MB
1822	JMP  memmove_long_match_emit_encodeBlockAsm4MB
1823
1824one_byte_match_emit_encodeBlockAsm4MB:
1825	SHLB $0x02, R8
1826	MOVB R8, (AX)
1827	ADDQ $0x01, AX
1828
1829memmove_match_emit_encodeBlockAsm4MB:
1830	LEAQ (AX)(R9*1), R8
1831
1832	// genMemMoveShort
1833	CMPQ R9, $0x03
1834	JB   emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_1or2
1835	JE   emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_3
1836	CMPQ R9, $0x08
1837	JB   emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_4through7
1838	CMPQ R9, $0x10
1839	JBE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
1840	CMPQ R9, $0x20
1841	JBE  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
1842	JMP  emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64
1843
1844emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_1or2:
1845	MOVB (DI), R10
1846	MOVB -1(DI)(R9*1), DI
1847	MOVB R10, (AX)
1848	MOVB DI, -1(AX)(R9*1)
1849	JMP  memmove_end_copy_match_emit_encodeBlockAsm4MB
1850
1851emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_3:
1852	MOVW (DI), R10
1853	MOVB 2(DI), DI
1854	MOVW R10, (AX)
1855	MOVB DI, 2(AX)
1856	JMP  memmove_end_copy_match_emit_encodeBlockAsm4MB
1857
1858emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_4through7:
1859	MOVL (DI), R10
1860	MOVL -4(DI)(R9*1), DI
1861	MOVL R10, (AX)
1862	MOVL DI, -4(AX)(R9*1)
1863	JMP  memmove_end_copy_match_emit_encodeBlockAsm4MB
1864
1865emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
1866	MOVQ (DI), R10
1867	MOVQ -8(DI)(R9*1), DI
1868	MOVQ R10, (AX)
1869	MOVQ DI, -8(AX)(R9*1)
1870	JMP  memmove_end_copy_match_emit_encodeBlockAsm4MB
1871
1872emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
1873	MOVOU (DI), X0
1874	MOVOU -16(DI)(R9*1), X1
1875	MOVOU X0, (AX)
1876	MOVOU X1, -16(AX)(R9*1)
1877	JMP   memmove_end_copy_match_emit_encodeBlockAsm4MB
1878
1879emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
1880	MOVOU (DI), X0
1881	MOVOU 16(DI), X1
1882	MOVOU -32(DI)(R9*1), X2
1883	MOVOU -16(DI)(R9*1), X3
1884	MOVOU X0, (AX)
1885	MOVOU X1, 16(AX)
1886	MOVOU X2, -32(AX)(R9*1)
1887	MOVOU X3, -16(AX)(R9*1)
1888
1889memmove_end_copy_match_emit_encodeBlockAsm4MB:
1890	MOVQ R8, AX
1891	JMP  emit_literal_done_match_emit_encodeBlockAsm4MB
1892
1893memmove_long_match_emit_encodeBlockAsm4MB:
1894	LEAQ (AX)(R9*1), R8
1895
1896	// genMemMoveLong
1897	MOVOU (DI), X0
1898	MOVOU 16(DI), X1
1899	MOVOU -32(DI)(R9*1), X2
1900	MOVOU -16(DI)(R9*1), X3
1901	MOVQ  R9, R11
1902	SHRQ  $0x05, R11
1903	MOVQ  AX, R10
1904	ANDL  $0x0000001f, R10
1905	MOVQ  $0x00000040, R12
1906	SUBQ  R10, R12
1907	DECQ  R11
1908	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
1909	LEAQ  -32(DI)(R12*1), R10
1910	LEAQ  -32(AX)(R12*1), R13
1911
1912emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
1913	MOVOU (R10), X4
1914	MOVOU 16(R10), X5
1915	MOVOA X4, (R13)
1916	MOVOA X5, 16(R13)
1917	ADDQ  $0x20, R13
1918	ADDQ  $0x20, R10
1919	ADDQ  $0x20, R12
1920	DECQ  R11
1921	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back
1922
1923emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
1924	MOVOU -32(DI)(R12*1), X4
1925	MOVOU -16(DI)(R12*1), X5
1926	MOVOA X4, -32(AX)(R12*1)
1927	MOVOA X5, -16(AX)(R12*1)
1928	ADDQ  $0x20, R12
1929	CMPQ  R9, R12
1930	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
1931	MOVOU X0, (AX)
1932	MOVOU X1, 16(AX)
1933	MOVOU X2, -32(AX)(R9*1)
1934	MOVOU X3, -16(AX)(R9*1)
1935	MOVQ  R8, AX
1936
1937emit_literal_done_match_emit_encodeBlockAsm4MB:
1938match_nolit_loop_encodeBlockAsm4MB:
1939	MOVL CX, DI
1940	SUBL SI, DI
1941	MOVL DI, 16(SP)
1942	ADDL $0x04, CX
1943	ADDL $0x04, SI
1944	MOVQ src_len+32(FP), DI
1945	SUBL CX, DI
1946	LEAQ (DX)(CX*1), R8
1947	LEAQ (DX)(SI*1), SI
1948
1949	// matchLen
1950	XORL R10, R10
1951	CMPL DI, $0x08
1952	JL   matchlen_single_match_nolit_encodeBlockAsm4MB
1953
1954matchlen_loopback_match_nolit_encodeBlockAsm4MB:
1955	MOVQ  (R8)(R10*1), R9
1956	XORQ  (SI)(R10*1), R9
1957	TESTQ R9, R9
1958	JZ    matchlen_loop_match_nolit_encodeBlockAsm4MB
1959	BSFQ  R9, R9
1960	SARQ  $0x03, R9
1961	LEAL  (R10)(R9*1), R10
1962	JMP   match_nolit_end_encodeBlockAsm4MB
1963
1964matchlen_loop_match_nolit_encodeBlockAsm4MB:
1965	LEAL -8(DI), DI
1966	LEAL 8(R10), R10
1967	CMPL DI, $0x08
1968	JGE  matchlen_loopback_match_nolit_encodeBlockAsm4MB
1969
1970matchlen_single_match_nolit_encodeBlockAsm4MB:
1971	TESTL DI, DI
1972	JZ    match_nolit_end_encodeBlockAsm4MB
1973
1974matchlen_single_loopback_match_nolit_encodeBlockAsm4MB:
1975	MOVB (R8)(R10*1), R9
1976	CMPB (SI)(R10*1), R9
1977	JNE  match_nolit_end_encodeBlockAsm4MB
1978	LEAL 1(R10), R10
1979	DECL DI
1980	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm4MB
1981
1982match_nolit_end_encodeBlockAsm4MB:
1983	ADDL R10, CX
1984	MOVL 16(SP), SI
1985	ADDL $0x04, R10
1986	MOVL CX, 12(SP)
1987
1988	// emitCopy
1989	CMPL SI, $0x00010000
1990	JL   two_byte_offset_match_nolit_encodeBlockAsm4MB
1991
1992four_bytes_loop_back_match_nolit_encodeBlockAsm4MB:
1993	CMPL R10, $0x40
1994	JLE  four_bytes_remain_match_nolit_encodeBlockAsm4MB
1995	MOVB $0xff, (AX)
1996	MOVL SI, 1(AX)
1997	LEAL -64(R10), R10
1998	ADDQ $0x05, AX
1999	CMPL R10, $0x04
2000	JL   four_bytes_remain_match_nolit_encodeBlockAsm4MB
2001
2002	// emitRepeat
2003	MOVL R10, DI
2004	LEAL -4(R10), R10
2005	CMPL DI, $0x08
2006	JLE  repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
2007	CMPL DI, $0x0c
2008	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
2009	CMPL SI, $0x00000800
2010	JLT  repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
2011
2012cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
2013	CMPL R10, $0x00000104
2014	JLT  repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
2015	CMPL R10, $0x00010100
2016	JLT  repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
2017	LEAL -65536(R10), R10
2018	MOVL R10, SI
2019	MOVW $0x001d, (AX)
2020	MOVW R10, 2(AX)
2021	SARL $0x10, SI
2022	MOVB SI, 4(AX)
2023	ADDQ $0x05, AX
2024	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
2025
2026repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
2027	LEAL -256(R10), R10
2028	MOVW $0x0019, (AX)
2029	MOVW R10, 2(AX)
2030	ADDQ $0x04, AX
2031	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
2032
2033repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
2034	LEAL -4(R10), R10
2035	MOVW $0x0015, (AX)
2036	MOVB R10, 2(AX)
2037	ADDQ $0x03, AX
2038	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
2039
2040repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
2041	SHLL $0x02, R10
2042	ORL  $0x01, R10
2043	MOVW R10, (AX)
2044	ADDQ $0x02, AX
2045	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
2046
2047repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
2048	XORQ DI, DI
2049	LEAL 1(DI)(R10*4), R10
2050	MOVB SI, 1(AX)
2051	SARL $0x08, SI
2052	SHLL $0x05, SI
2053	ORL  SI, R10
2054	MOVB R10, (AX)
2055	ADDQ $0x02, AX
2056	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
2057	JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB
2058
2059four_bytes_remain_match_nolit_encodeBlockAsm4MB:
2060	TESTL R10, R10
2061	JZ    match_nolit_emitcopy_end_encodeBlockAsm4MB
2062	MOVB  $0x03, BL
2063	LEAL  -4(BX)(R10*4), R10
2064	MOVB  R10, (AX)
2065	MOVL  SI, 1(AX)
2066	ADDQ  $0x05, AX
2067	JMP   match_nolit_emitcopy_end_encodeBlockAsm4MB
2068
2069two_byte_offset_match_nolit_encodeBlockAsm4MB:
2070	CMPL R10, $0x40
2071	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm4MB
2072	MOVB $0xee, (AX)
2073	MOVW SI, 1(AX)
2074	LEAL -60(R10), R10
2075	ADDQ $0x03, AX
2076
2077	// emitRepeat
2078	MOVL R10, DI
2079	LEAL -4(R10), R10
2080	CMPL DI, $0x08
2081	JLE  repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
2082	CMPL DI, $0x0c
2083	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
2084	CMPL SI, $0x00000800
2085	JLT  repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
2086
2087cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2088	CMPL R10, $0x00000104
2089	JLT  repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
2090	CMPL R10, $0x00010100
2091	JLT  repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
2092	LEAL -65536(R10), R10
2093	MOVL R10, SI
2094	MOVW $0x001d, (AX)
2095	MOVW R10, 2(AX)
2096	SARL $0x10, SI
2097	MOVB SI, 4(AX)
2098	ADDQ $0x05, AX
2099	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
2100
2101repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2102	LEAL -256(R10), R10
2103	MOVW $0x0019, (AX)
2104	MOVW R10, 2(AX)
2105	ADDQ $0x04, AX
2106	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
2107
2108repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2109	LEAL -4(R10), R10
2110	MOVW $0x0015, (AX)
2111	MOVB R10, 2(AX)
2112	ADDQ $0x03, AX
2113	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
2114
2115repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2116	SHLL $0x02, R10
2117	ORL  $0x01, R10
2118	MOVW R10, (AX)
2119	ADDQ $0x02, AX
2120	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
2121
2122repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2123	XORQ DI, DI
2124	LEAL 1(DI)(R10*4), R10
2125	MOVB SI, 1(AX)
2126	SARL $0x08, SI
2127	SHLL $0x05, SI
2128	ORL  SI, R10
2129	MOVB R10, (AX)
2130	ADDQ $0x02, AX
2131	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
2132	JMP two_byte_offset_match_nolit_encodeBlockAsm4MB
2133
2134two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
2135	CMPL R10, $0x0c
2136	JGE  emit_copy_three_match_nolit_encodeBlockAsm4MB
2137	CMPL SI, $0x00000800
2138	JGE  emit_copy_three_match_nolit_encodeBlockAsm4MB
2139	MOVB $0x01, BL
2140	LEAL -16(BX)(R10*4), R10
2141	MOVB SI, 1(AX)
2142	SHRL $0x08, SI
2143	SHLL $0x05, SI
2144	ORL  SI, R10
2145	MOVB R10, (AX)
2146	ADDQ $0x02, AX
2147	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB
2148
2149emit_copy_three_match_nolit_encodeBlockAsm4MB:
2150	MOVB $0x02, BL
2151	LEAL -4(BX)(R10*4), R10
2152	MOVB R10, (AX)
2153	MOVW SI, 1(AX)
2154	ADDQ $0x03, AX
2155
2156match_nolit_emitcopy_end_encodeBlockAsm4MB:
2157	CMPL CX, 8(SP)
2158	JGE  emit_remainder_encodeBlockAsm4MB
2159	MOVQ -2(DX)(CX*1), DI
2160	CMPQ AX, (SP)
2161	JL   match_nolit_dst_ok_encodeBlockAsm4MB
2162	MOVQ $0x00000000, ret+48(FP)
2163	RET
2164
2165match_nolit_dst_ok_encodeBlockAsm4MB:
2166	MOVQ  $0x0000cf1bbcdcbf9b, R9
2167	MOVQ  DI, R8
2168	SHRQ  $0x10, DI
2169	MOVQ  DI, SI
2170	SHLQ  $0x10, R8
2171	IMULQ R9, R8
2172	SHRQ  $0x32, R8
2173	SHLQ  $0x10, SI
2174	IMULQ R9, SI
2175	SHRQ  $0x32, SI
2176	LEAL  -2(CX), R9
2177	LEAQ  24(SP)(SI*4), R10
2178	MOVL  (R10), SI
2179	MOVL  R9, 24(SP)(R8*4)
2180	MOVL  CX, (R10)
2181	CMPL  (DX)(SI*1), DI
2182	JEQ   match_nolit_loop_encodeBlockAsm4MB
2183	INCL  CX
2184	JMP   search_loop_encodeBlockAsm4MB
2185
2186emit_remainder_encodeBlockAsm4MB:
2187	MOVQ src_len+32(FP), CX
2188	SUBL 12(SP), CX
2189	LEAQ 4(AX)(CX*1), CX
2190	CMPQ CX, (SP)
2191	JL   emit_remainder_ok_encodeBlockAsm4MB
2192	MOVQ $0x00000000, ret+48(FP)
2193	RET
2194
2195emit_remainder_ok_encodeBlockAsm4MB:
2196	MOVQ src_len+32(FP), CX
2197	MOVL 12(SP), BX
2198	CMPL BX, CX
2199	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm4MB
2200	MOVL CX, SI
2201	MOVL CX, 12(SP)
2202	LEAQ (DX)(BX*1), CX
2203	SUBL BX, SI
2204	LEAL -1(SI), DX
2205	CMPL DX, $0x3c
2206	JLT  one_byte_emit_remainder_encodeBlockAsm4MB
2207	CMPL DX, $0x00000100
2208	JLT  two_bytes_emit_remainder_encodeBlockAsm4MB
2209	CMPL DX, $0x00010000
2210	JLT  three_bytes_emit_remainder_encodeBlockAsm4MB
2211	MOVL DX, BX
2212	SHRL $0x10, BX
2213	MOVB $0xf8, (AX)
2214	MOVW DX, 1(AX)
2215	MOVB BL, 3(AX)
2216	ADDQ $0x04, AX
2217	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB
2218
2219three_bytes_emit_remainder_encodeBlockAsm4MB:
2220	MOVB $0xf4, (AX)
2221	MOVW DX, 1(AX)
2222	ADDQ $0x03, AX
2223	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB
2224
2225two_bytes_emit_remainder_encodeBlockAsm4MB:
2226	MOVB $0xf0, (AX)
2227	MOVB DL, 1(AX)
2228	ADDQ $0x02, AX
2229	CMPL DX, $0x40
2230	JL   memmove_emit_remainder_encodeBlockAsm4MB
2231	JMP  memmove_long_emit_remainder_encodeBlockAsm4MB
2232
2233one_byte_emit_remainder_encodeBlockAsm4MB:
2234	SHLB $0x02, DL
2235	MOVB DL, (AX)
2236	ADDQ $0x01, AX
2237
2238memmove_emit_remainder_encodeBlockAsm4MB:
2239	LEAQ (AX)(SI*1), DX
2240	MOVL SI, BX
2241
2242	// genMemMoveShort
2243	CMPQ BX, $0x03
2244	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2
2245	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3
2246	CMPQ BX, $0x08
2247	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7
2248	CMPQ BX, $0x10
2249	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
2250	CMPQ BX, $0x20
2251	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
2252	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64
2253
2254emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2:
2255	MOVB (CX), SI
2256	MOVB -1(CX)(BX*1), CL
2257	MOVB SI, (AX)
2258	MOVB CL, -1(AX)(BX*1)
2259	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2260
2261emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3:
2262	MOVW (CX), SI
2263	MOVB 2(CX), CL
2264	MOVW SI, (AX)
2265	MOVB CL, 2(AX)
2266	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2267
2268emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7:
2269	MOVL (CX), SI
2270	MOVL -4(CX)(BX*1), CX
2271	MOVL SI, (AX)
2272	MOVL CX, -4(AX)(BX*1)
2273	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2274
2275emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
2276	MOVQ (CX), SI
2277	MOVQ -8(CX)(BX*1), CX
2278	MOVQ SI, (AX)
2279	MOVQ CX, -8(AX)(BX*1)
2280	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2281
2282emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32:
2283	MOVOU (CX), X0
2284	MOVOU -16(CX)(BX*1), X1
2285	MOVOU X0, (AX)
2286	MOVOU X1, -16(AX)(BX*1)
2287	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2288
2289emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64:
2290	MOVOU (CX), X0
2291	MOVOU 16(CX), X1
2292	MOVOU -32(CX)(BX*1), X2
2293	MOVOU -16(CX)(BX*1), X3
2294	MOVOU X0, (AX)
2295	MOVOU X1, 16(AX)
2296	MOVOU X2, -32(AX)(BX*1)
2297	MOVOU X3, -16(AX)(BX*1)
2298
2299memmove_end_copy_emit_remainder_encodeBlockAsm4MB:
2300	MOVQ DX, AX
2301	JMP  emit_literal_done_emit_remainder_encodeBlockAsm4MB
2302
2303memmove_long_emit_remainder_encodeBlockAsm4MB:
2304	LEAQ (AX)(SI*1), DX
2305	MOVL SI, BX
2306
2307	// genMemMoveLong
2308	MOVOU (CX), X0
2309	MOVOU 16(CX), X1
2310	MOVOU -32(CX)(BX*1), X2
2311	MOVOU -16(CX)(BX*1), X3
2312	MOVQ  BX, DI
2313	SHRQ  $0x05, DI
2314	MOVQ  AX, SI
2315	ANDL  $0x0000001f, SI
2316	MOVQ  $0x00000040, R8
2317	SUBQ  SI, R8
2318	DECQ  DI
2319	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
2320	LEAQ  -32(CX)(R8*1), SI
2321	LEAQ  -32(AX)(R8*1), R9
2322
2323emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back:
2324	MOVOU (SI), X4
2325	MOVOU 16(SI), X5
2326	MOVOA X4, (R9)
2327	MOVOA X5, 16(R9)
2328	ADDQ  $0x20, R9
2329	ADDQ  $0x20, SI
2330	ADDQ  $0x20, R8
2331	DECQ  DI
2332	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back
2333
2334emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32:
2335	MOVOU -32(CX)(R8*1), X4
2336	MOVOU -16(CX)(R8*1), X5
2337	MOVOA X4, -32(AX)(R8*1)
2338	MOVOA X5, -16(AX)(R8*1)
2339	ADDQ  $0x20, R8
2340	CMPQ  BX, R8
2341	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
2342	MOVOU X0, (AX)
2343	MOVOU X1, 16(AX)
2344	MOVOU X2, -32(AX)(BX*1)
2345	MOVOU X3, -16(AX)(BX*1)
2346	MOVQ  DX, AX
2347
2348emit_literal_done_emit_remainder_encodeBlockAsm4MB:
2349	MOVQ dst_base+0(FP), CX
2350	SUBQ CX, AX
2351	MOVQ AX, ret+48(FP)
2352	RET
2353
// func encodeBlockAsm12B(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst as a sequence of literal, copy and repeat operations
// and returns the number of bytes written (final dst pointer minus dst_base).
// Returns 0 as soon as one of the dst size checks below fails.
//
// Stack frame:
//   0(SP)   8 bytes  dst limit pointer: dst_base + src_len - 5 - (src_len >> 5)
//   8(SP)   4 bytes  src_len - 8; searching stops once the position reaches it
//   12(SP)  4 bytes  src index of the first byte that has not been emitted yet
//   16(SP)  4 bytes  current match (repeat) offset, initialised to 1
//   20(SP)  4 bytes  src index to resume searching at when nothing matches
//   24(SP)  16384 B  hash table of 4096 32-bit src indexes
//
// Long-lived registers (until the remainder is emitted):
//   AX = dst write pointer, CX = current src index, DX = src base pointer.
//
// NOTE(review): the code reads 8 bytes at src+CX and assumes src_len - 8 is a
// sensible limit; presumably callers guarantee a minimum src length and a
// large enough dst - confirm against the Go caller.
TEXT ·encodeBlockAsm12B(SB), $16408-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000080, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Zero the hash table: 0x80 iterations of 0x80 bytes = 16384 bytes.
zero_loop_encodeBlockAsm12B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBlockAsm12B
	// Initialise the frame slots described in the header.
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -5(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	// Start searching at src index 1 with a repeat offset of 1.
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBlockAsm12B:
	// SI = CX + 4 + ((CX - 12(SP)) >> 5): the next position to try. The step
	// grows with the number of bytes that are still waiting to be emitted.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBlockAsm12B
	// DI = 8 bytes at src[CX]; remember the next position in 20(SP).
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	// R10 = hash of the 5 bytes at CX, R11 = hash of the 5 bytes at CX+1.
	// SHLQ $0x18 drops the top 3 bytes, SHRQ $0x34 keeps a 12-bit index.
	MOVQ  $0x000000cf1bbcdcbb, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x18, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	SHLQ  $0x18, R11
	IMULQ R9, R11
	SHRQ  $0x34, R11
	// SI / R8 = previous table entries (candidates) for CX / CX+1; the table
	// is then updated with CX and CX+1.
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)
	// R10 = hash of the 5 bytes at CX+2 (used in no_repeat_found).
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x18, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes located
	// 16(SP) bytes earlier.
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeBlockAsm12B
	// Repeat found. DI = CX+1 (match start), R8 = 12(SP), SI = DI - offset.
	// The JZ uses the flags of SUBL: skip back-extension when SI is 0.
	LEAL  1(CX), DI
	MOVL  12(SP), R8
	MOVL  DI, SI
	SUBL  16(SP), SI
	JZ    repeat_extend_back_end_encodeBlockAsm12B

	// Extend the match backwards while DI > R8, SI > 0 and the preceding
	// bytes are equal.
repeat_extend_back_loop_encodeBlockAsm12B:
	CMPL DI, R8
	JLE  repeat_extend_back_end_encodeBlockAsm12B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeBlockAsm12B
	LEAL -1(DI), DI
	DECL SI
	JNZ  repeat_extend_back_loop_encodeBlockAsm12B

repeat_extend_back_end_encodeBlockAsm12B:
	// Emit src[12(SP):DI] as a literal run, if it is not empty.
	// R9 = literal length, R10 = source pointer, SI = length - 1.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm12B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// Literal header: 1 byte when length-1 < 60, 2 bytes when < 256,
	// otherwise 3 bytes (tag 0xf4 followed by a 16-bit length-1).
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeBlockAsm12B
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeBlockAsm12B

two_bytes_repeat_emit_encodeBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	// Lengths up to 64 use the short copy, longer ones the long copy.
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeBlockAsm12B
	JMP  memmove_long_repeat_emit_encodeBlockAsm12B

one_byte_repeat_emit_encodeBlockAsm12B:
	// Tag byte is (length-1) << 2.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeBlockAsm12B:
	// SI = dst pointer after the copy.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Copies R9 bytes (at most 64 on this path) from R10 to AX. Each size
	// class loads everything first and then stores possibly overlapping
	// head and tail pieces.
	CMPQ R9, $0x03
	JB   emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2
	JE   emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3
	CMPQ R9, $0x08
	JB   emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2:
	MOVB (R10), R11
	MOVB -1(R10)(R9*1), R10
	MOVB R11, (AX)
	MOVB R10, -1(AX)(R9*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3:
	MOVW (R10), R11
	MOVB 2(R10), R10
	MOVW R11, (AX)
	MOVB R10, 2(AX)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm12B

emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_repeat_emit_encodeBlockAsm12B:
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeBlockAsm12B

memmove_long_repeat_emit_encodeBlockAsm12B:
	// SI = dst pointer after the copy.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// Copies R9 bytes (more than 64 on this path) from R10 to AX.
	// X0-X3 keep the first and last 32 source bytes; they are stored last
	// with unaligned stores. R13 = 64 - (AX & 31), so AX + R13 - 32 is
	// 32-byte aligned and the loop below can use aligned stores (MOVOA).
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R12
	SHRQ  $0x05, R12
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R13
	SUBQ  R11, R13
	// DECQ leaves CF untouched, so JA tests CF from SUBQ and ZF from DECQ.
	// NOTE(review): with R9 > 64 here, R12 is at least 2 before DECQ and
	// SUBQ cannot borrow, so this JA looks always taken and the
	// large_big_loop_back section below looks unreachable - confirm.
	DECQ  R12
	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R13*1), R11
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R11
	ADDQ  $0x20, R13
	DECQ  R12
	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back

	// Copy 32 bytes per iteration at offsets [R13-32, R13) while R9 >= R13
	// after the increment, then store the saved head and tail.
emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R13*1), X4
	MOVOU -16(R10)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R9, R13
	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_repeat_emit_encodeBlockAsm12B:
	// CX += 5 moves past the 4 bytes already compared at CX+1.
	// R9 = bytes left in src, R10 = src+CX, SI = src+CX-offset.
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R9
	SUBL CX, R9
	LEAQ (DX)(CX*1), R10
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R12 = number of leading bytes equal between R10 and SI, at most R9.
	XORL R12, R12
	CMPL R9, $0x08
	JL   matchlen_single_repeat_extend_encodeBlockAsm12B

	// Compare 8 bytes at a time; on a difference, BSFQ / 8 gives the index
	// of the first differing byte.
matchlen_loopback_repeat_extend_encodeBlockAsm12B:
	MOVQ  (R10)(R12*1), R11
	XORQ  (SI)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_repeat_extend_encodeBlockAsm12B
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   repeat_extend_forward_end_encodeBlockAsm12B

matchlen_loop_repeat_extend_encodeBlockAsm12B:
	LEAL -8(R9), R9
	LEAL 8(R12), R12
	CMPL R9, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm12B

	// Fewer than 8 bytes left: compare byte by byte.
matchlen_single_repeat_extend_encodeBlockAsm12B:
	TESTL R9, R9
	JZ    repeat_extend_forward_end_encodeBlockAsm12B

matchlen_single_loopback_repeat_extend_encodeBlockAsm12B:
	MOVB (R10)(R12*1), R11
	CMPB (SI)(R12*1), R11
	JNE  repeat_extend_forward_end_encodeBlockAsm12B
	LEAL 1(R12), R12
	DECL R9
	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm12B

repeat_extend_forward_end_encodeBlockAsm12B:
	// CX = end of match, SI = match length (CX - DI), DI = offset.
	// R8 still holds the 12(SP) value loaded before the literals were
	// emitted; when it is 0 the match is encoded as a copy, not a repeat.
	ADDL  R12, CX
	MOVL  CX, SI
	SUBL  DI, SI
	MOVL  16(SP), DI
	TESTL R8, R8
	JZ    repeat_as_copy_encodeBlockAsm12B

	// emitRepeat
	// R8 = length, SI = length - 4. Encoding chosen:
	//   length <= 8                      -> 2 bytes
	//   length < 12 and offset < 2048    -> 2 bytes including the offset
	//   length - 4 < 260                 -> 3 bytes
	//   otherwise                        -> 4 bytes
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_match_repeat_encodeBlockAsm12B
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_match_repeat_encodeBlockAsm12B

cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
	CMPL SI, $0x00000104
	JLT  repeat_three_match_repeat_encodeBlockAsm12B
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_three_match_repeat_encodeBlockAsm12B:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_match_repeat_encodeBlockAsm12B:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_offset_match_repeat_encodeBlockAsm12B:
	// Byte 0 = 1 + SI*4 + ((offset >> 8) << 5), byte 1 = low offset byte.
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_as_copy_encodeBlockAsm12B:
	// emitCopy
	// SI = length, DI = offset. Lengths above 64 emit a 3-byte copy
	// (tag 0xee + 16-bit offset), subtract 60 from the length, and encode
	// the rest with the emitRepeat sequence that follows.
two_byte_offset_repeat_as_copy_encodeBlockAsm12B:
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX

	// emitRepeat
	// Same selection logic as the emitRepeat sequence above.
	MOVL SI, R8
	LEAL -4(SI), SI
	CMPL R8, $0x08
	JLE  repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	CMPL R8, $0x0c
	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	CMPL DI, $0x00000800
	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	CMPL SI, $0x00000104
	JLT  repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
	LEAL -256(SI), SI
	MOVW $0x0019, (AX)
	MOVW SI, 2(AX)
	ADDQ $0x04, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	LEAL -4(SI), SI
	MOVW $0x0015, (AX)
	MOVB SI, 2(AX)
	ADDQ $0x03, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	SHLL $0x02, SI
	ORL  $0x01, SI
	MOVW SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(SI*4), SI
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B
	// The next JMP directly follows an unconditional JMP and has no label,
	// so it cannot be reached.
	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B

two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B:
	// Length <= 64: use the 2-byte copy when length < 12 and offset < 2048,
	// otherwise the 3-byte copy.
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm12B
	CMPL DI, $0x00000800
	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm12B
	// Only BL is set; the upper bits of BX do not matter because just the
	// low byte of the result is stored.
	// Byte 0 = 1 + (length-4)*4 + ((offset >> 8) << 5), byte 1 = low offset.
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeBlockAsm12B

emit_copy_three_repeat_as_copy_encodeBlockAsm12B:
	// Byte 0 = 2 + (length-1)*4, followed by the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeBlockAsm12B:
	// Everything up to CX has been emitted; continue searching from CX.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeBlockAsm12B

no_repeat_found_encodeBlockAsm12B:
	// Try the hash candidates in turn, comparing 4 bytes each:
	//   SI (candidate for CX)   against the bytes at CX,
	//   R8 (candidate for CX+1) against the bytes at CX+1,
	//   table[R10] (candidate for CX+2) against the bytes at CX+2.
	// The table slot R10 is updated with CX+2 on both paths.
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeBlockAsm12B
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeBlockAsm12B
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeBlockAsm12B
	// No candidate matched: jump ahead to the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeBlockAsm12B

candidate3_match_encodeBlockAsm12B:
	ADDL $0x02, CX
	JMP  candidate_match_encodeBlockAsm12B

candidate2_match_encodeBlockAsm12B:
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

candidate_match_encodeBlockAsm12B:
	// CX = match position, SI = candidate position. Extend the match
	// backwards while CX > 12(SP), SI > 0 and the preceding bytes are equal.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBlockAsm12B

match_extend_back_loop_encodeBlockAsm12B:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBlockAsm12B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBlockAsm12B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBlockAsm12B
	JMP  match_extend_back_loop_encodeBlockAsm12B

match_extend_back_end_encodeBlockAsm12B:
	// dst size check: AX + 3 + pending literal bytes must stay below the
	// limit in (SP), otherwise return 0.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBlockAsm12B:
	// Emit src[12(SP):CX] as a literal run, if it is not empty.
	// R9 = literal length, DI = source pointer, R8 = length - 1.
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeBlockAsm12B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	// Literal header selection, same scheme as in the repeat path.
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeBlockAsm12B
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBlockAsm12B

two_bytes_match_emit_encodeBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeBlockAsm12B
	JMP  memmove_long_match_emit_encodeBlockAsm12B

one_byte_match_emit_encodeBlockAsm12B:
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBlockAsm12B:
	// R8 = dst pointer after the copy.
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	// Copies R9 bytes (at most 64 on this path) from DI to AX.
	CMPQ R9, $0x03
	JB   emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2
	JE   emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2:
	MOVB (DI), R10
	MOVB -1(DI)(R9*1), DI
	MOVB R10, (AX)
	MOVB DI, -1(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3:
	MOVW (DI), R10
	MOVB 2(DI), DI
	MOVW R10, (AX)
	MOVB DI, 2(AX)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4through7:
	MOVL (DI), R10
	MOVL -4(DI)(R9*1), DI
	MOVL R10, (AX)
	MOVL DI, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBlockAsm12B

emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBlockAsm12B:
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeBlockAsm12B

memmove_long_match_emit_encodeBlockAsm12B:
	// R8 = dst pointer after the copy.
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	// Copies R9 bytes (more than 64 on this path) from DI to AX; same
	// structure as the long copy in the repeat path, with R12 as the
	// running offset and R11 as the block counter.
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

emit_literal_done_match_emit_encodeBlockAsm12B:
match_nolit_loop_encodeBlockAsm12B:
	// A 4-byte match between src[CX] and src[SI] is known here.
	// Store the new offset CX - SI in 16(SP), skip the 4 matched bytes and
	// measure how far the match continues.
	// DI = bytes left in src, R8 = src+CX, SI = src+candidate.
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R10 = number of leading bytes equal between R8 and SI, at most DI.
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeBlockAsm12B

matchlen_loopback_match_nolit_encodeBlockAsm12B:
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeBlockAsm12B
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeBlockAsm12B

matchlen_loop_match_nolit_encodeBlockAsm12B:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBlockAsm12B

matchlen_single_match_nolit_encodeBlockAsm12B:
	TESTL DI, DI
	JZ    match_nolit_end_encodeBlockAsm12B

matchlen_single_loopback_match_nolit_encodeBlockAsm12B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeBlockAsm12B
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm12B

match_nolit_end_encodeBlockAsm12B:
	// CX = end of match, SI = offset, R10 = total match length (the
	// extension plus the initial 4 bytes). 12(SP) is advanced to CX.
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
	// R10 = length, SI = offset. Lengths above 64 emit a 3-byte copy
	// (tag 0xee + 16-bit offset), subtract 60 from the length, and encode
	// the rest with the emitRepeat sequence that follows.
two_byte_offset_match_nolit_encodeBlockAsm12B:
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm12B
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
	// DI = length, R10 = length - 4; selection as in the repeat path.
	MOVL R10, DI
	LEAL -4(R10), R10
	CMPL DI, $0x08
	JLE  repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
	CMPL DI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
	CMPL SI, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
	CMPL R10, $0x00000104
	JLT  repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short:
	SHLL $0x02, R10
	ORL  $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
	XORQ DI, DI
	LEAL 1(DI)(R10*4), R10
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B
	// The next JMP directly follows an unconditional JMP and has no label,
	// so it cannot be reached.
	JMP two_byte_offset_match_nolit_encodeBlockAsm12B

two_byte_offset_short_match_nolit_encodeBlockAsm12B:
	// Length <= 64: use the 2-byte copy when length < 12 and offset < 2048,
	// otherwise the 3-byte copy.
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBlockAsm12B
	CMPL SI, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeBlockAsm12B
	// Only BL is set; just the low byte of the LEAL result is stored.
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B

emit_copy_three_match_nolit_encodeBlockAsm12B:
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeBlockAsm12B:
	// Stop searching once CX reaches the limit in 8(SP). Otherwise load the
	// 8 bytes at CX-2 and verify that AX is still below the dst limit,
	// returning 0 if it is not.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBlockAsm12B
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm12B:
	// R8 = hash of the 5 bytes at CX-2, SI = hash of the 5 bytes at CX
	// (DI is shifted right by 16 bits so that it starts at CX).
	MOVQ  $0x000000cf1bbcdcbb, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x18, R8
	IMULQ R9, R8
	SHRQ  $0x34, R8
	SHLQ  $0x18, SI
	IMULQ R9, SI
	SHRQ  $0x34, SI
	// Record CX-2 and CX in the table; SI = previous entry for CX's hash.
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	// If the candidate's 4 bytes equal the 4 bytes at CX, handle another
	// match immediately; otherwise advance one byte and search again.
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeBlockAsm12B
	INCL  CX
	JMP   search_loop_encodeBlockAsm12B

emit_remainder_encodeBlockAsm12B:
	// dst size check for the tail: AX + 3 + (src_len - 12(SP)) must stay
	// below the limit in (SP), otherwise return 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBlockAsm12B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm12B:
	// Emit src[12(SP):src_len] as a final literal run, if it is not empty.
	// From here on CX = source pointer, SI = literal length, and DX is
	// reused for length - 1 (the src base pointer is no longer needed).
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm12B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBlockAsm12B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBlockAsm12B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBlockAsm12B

two_bytes_emit_remainder_encodeBlockAsm12B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBlockAsm12B
	JMP  memmove_long_emit_remainder_encodeBlockAsm12B

one_byte_emit_remainder_encodeBlockAsm12B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBlockAsm12B:
	// DX = dst pointer after the copy, BX = byte count.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Copies BX bytes (at most 64 on this path) from CX to AX.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm12B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBlockAsm12B

memmove_long_emit_remainder_encodeBlockAsm12B:
	// DX = dst pointer after the copy, BX = byte count.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Copies BX bytes (more than 64 on this path) from CX to AX; same
	// structure as the long copy in the repeat path, with R8 as the running
	// offset and DI as the block counter.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm12B:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET
3257
3258// func encodeBlockAsm10B(dst []byte, src []byte) int
3259// Requires: SSE2
3260TEXT ·encodeBlockAsm10B(SB), $4120-56
3261	MOVQ dst_base+0(FP), AX
3262	MOVQ $0x00000020, CX
3263	LEAQ 24(SP), DX
3264	PXOR X0, X0
3265
3266zero_loop_encodeBlockAsm10B:
3267	MOVOU X0, (DX)
3268	MOVOU X0, 16(DX)
3269	MOVOU X0, 32(DX)
3270	MOVOU X0, 48(DX)
3271	MOVOU X0, 64(DX)
3272	MOVOU X0, 80(DX)
3273	MOVOU X0, 96(DX)
3274	MOVOU X0, 112(DX)
3275	ADDQ  $0x80, DX
3276	DECQ  CX
3277	JNZ   zero_loop_encodeBlockAsm10B
3278	MOVL  $0x00000000, 12(SP)
3279	MOVQ  src_len+32(FP), CX
3280	LEAQ  -5(CX), DX
3281	LEAQ  -8(CX), SI
3282	MOVL  SI, 8(SP)
3283	SHRQ  $0x05, CX
3284	SUBL  CX, DX
3285	LEAQ  (AX)(DX*1), DX
3286	MOVQ  DX, (SP)
3287	MOVL  $0x00000001, CX
3288	MOVL  CX, 16(SP)
3289	MOVQ  src_base+24(FP), DX
3290
3291search_loop_encodeBlockAsm10B:
3292	MOVL  CX, SI
3293	SUBL  12(SP), SI
3294	SHRL  $0x05, SI
3295	LEAL  4(CX)(SI*1), SI
3296	CMPL  SI, 8(SP)
3297	JGE   emit_remainder_encodeBlockAsm10B
3298	MOVQ  (DX)(CX*1), DI
3299	MOVL  SI, 20(SP)
3300	MOVQ  $0x9e3779b1, R9
3301	MOVQ  DI, R10
3302	MOVQ  DI, R11
3303	SHRQ  $0x08, R11
3304	SHLQ  $0x20, R10
3305	IMULQ R9, R10
3306	SHRQ  $0x36, R10
3307	SHLQ  $0x20, R11
3308	IMULQ R9, R11
3309	SHRQ  $0x36, R11
3310	MOVL  24(SP)(R10*4), SI
3311	MOVL  24(SP)(R11*4), R8
3312	MOVL  CX, 24(SP)(R10*4)
3313	LEAL  1(CX), R10
3314	MOVL  R10, 24(SP)(R11*4)
3315	MOVQ  DI, R10
3316	SHRQ  $0x10, R10
3317	SHLQ  $0x20, R10
3318	IMULQ R9, R10
3319	SHRQ  $0x36, R10
3320	MOVL  CX, R9
3321	SUBL  16(SP), R9
3322	MOVL  1(DX)(R9*1), R11
3323	MOVQ  DI, R9
3324	SHRQ  $0x08, R9
3325	CMPL  R9, R11
3326	JNE   no_repeat_found_encodeBlockAsm10B
3327	LEAL  1(CX), DI
3328	MOVL  12(SP), R8
3329	MOVL  DI, SI
3330	SUBL  16(SP), SI
3331	JZ    repeat_extend_back_end_encodeBlockAsm10B
3332
3333repeat_extend_back_loop_encodeBlockAsm10B:
3334	CMPL DI, R8
3335	JLE  repeat_extend_back_end_encodeBlockAsm10B
3336	MOVB -1(DX)(SI*1), BL
3337	MOVB -1(DX)(DI*1), R9
3338	CMPB BL, R9
3339	JNE  repeat_extend_back_end_encodeBlockAsm10B
3340	LEAL -1(DI), DI
3341	DECL SI
3342	JNZ  repeat_extend_back_loop_encodeBlockAsm10B
3343
3344repeat_extend_back_end_encodeBlockAsm10B:
3345	MOVL 12(SP), SI
3346	CMPL SI, DI
3347	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm10B
3348	MOVL DI, R9
3349	MOVL DI, 12(SP)
3350	LEAQ (DX)(SI*1), R10
3351	SUBL SI, R9
3352	LEAL -1(R9), SI
3353	CMPL SI, $0x3c
3354	JLT  one_byte_repeat_emit_encodeBlockAsm10B
3355	CMPL SI, $0x00000100
3356	JLT  two_bytes_repeat_emit_encodeBlockAsm10B
3357	MOVB $0xf4, (AX)
3358	MOVW SI, 1(AX)
3359	ADDQ $0x03, AX
3360	JMP  memmove_long_repeat_emit_encodeBlockAsm10B
3361
3362two_bytes_repeat_emit_encodeBlockAsm10B:
3363	MOVB $0xf0, (AX)
3364	MOVB SI, 1(AX)
3365	ADDQ $0x02, AX
3366	CMPL SI, $0x40
3367	JL   memmove_repeat_emit_encodeBlockAsm10B
3368	JMP  memmove_long_repeat_emit_encodeBlockAsm10B
3369
3370one_byte_repeat_emit_encodeBlockAsm10B:
3371	SHLB $0x02, SI
3372	MOVB SI, (AX)
3373	ADDQ $0x01, AX
3374
3375memmove_repeat_emit_encodeBlockAsm10B:
3376	LEAQ (AX)(R9*1), SI
3377
3378	// genMemMoveShort
3379	CMPQ R9, $0x03
3380	JB   emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_1or2
3381	JE   emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_3
3382	CMPQ R9, $0x08
3383	JB   emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_4through7
3384	CMPQ R9, $0x10
3385	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
3386	CMPQ R9, $0x20
3387	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
3388	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64
3389
3390emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_1or2:
3391	MOVB (R10), R11
3392	MOVB -1(R10)(R9*1), R10
3393	MOVB R11, (AX)
3394	MOVB R10, -1(AX)(R9*1)
3395	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm10B
3396
3397emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_3:
3398	MOVW (R10), R11
3399	MOVB 2(R10), R10
3400	MOVW R11, (AX)
3401	MOVB R10, 2(AX)
3402	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm10B
3403
3404emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_4through7:
3405	MOVL (R10), R11
3406	MOVL -4(R10)(R9*1), R10
3407	MOVL R11, (AX)
3408	MOVL R10, -4(AX)(R9*1)
3409	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm10B
3410
3411emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
3412	MOVQ (R10), R11
3413	MOVQ -8(R10)(R9*1), R10
3414	MOVQ R11, (AX)
3415	MOVQ R10, -8(AX)(R9*1)
3416	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm10B
3417
3418emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
3419	MOVOU (R10), X0
3420	MOVOU -16(R10)(R9*1), X1
3421	MOVOU X0, (AX)
3422	MOVOU X1, -16(AX)(R9*1)
3423	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm10B
3424
3425emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
3426	MOVOU (R10), X0
3427	MOVOU 16(R10), X1
3428	MOVOU -32(R10)(R9*1), X2
3429	MOVOU -16(R10)(R9*1), X3
3430	MOVOU X0, (AX)
3431	MOVOU X1, 16(AX)
3432	MOVOU X2, -32(AX)(R9*1)
3433	MOVOU X3, -16(AX)(R9*1)
3434
3435memmove_end_copy_repeat_emit_encodeBlockAsm10B:
3436	MOVQ SI, AX
3437	JMP  emit_literal_done_repeat_emit_encodeBlockAsm10B
3438
3439memmove_long_repeat_emit_encodeBlockAsm10B:
3440	LEAQ (AX)(R9*1), SI
3441
3442	// genMemMoveLong
3443	MOVOU (R10), X0
3444	MOVOU 16(R10), X1
3445	MOVOU -32(R10)(R9*1), X2
3446	MOVOU -16(R10)(R9*1), X3
3447	MOVQ  R9, R12
3448	SHRQ  $0x05, R12
3449	MOVQ  AX, R11
3450	ANDL  $0x0000001f, R11
3451	MOVQ  $0x00000040, R13
3452	SUBQ  R11, R13
3453	DECQ  R12
3454	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
3455	LEAQ  -32(R10)(R13*1), R11
3456	LEAQ  -32(AX)(R13*1), R14
3457
3458emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
3459	MOVOU (R11), X4
3460	MOVOU 16(R11), X5
3461	MOVOA X4, (R14)
3462	MOVOA X5, 16(R14)
3463	ADDQ  $0x20, R14
3464	ADDQ  $0x20, R11
3465	ADDQ  $0x20, R13
3466	DECQ  R12
3467	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back
3468
3469emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
3470	MOVOU -32(R10)(R13*1), X4
3471	MOVOU -16(R10)(R13*1), X5
3472	MOVOA X4, -32(AX)(R13*1)
3473	MOVOA X5, -16(AX)(R13*1)
3474	ADDQ  $0x20, R13
3475	CMPQ  R9, R13
3476	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
3477	MOVOU X0, (AX)
3478	MOVOU X1, 16(AX)
3479	MOVOU X2, -32(AX)(R9*1)
3480	MOVOU X3, -16(AX)(R9*1)
3481	MOVQ  SI, AX
3482
3483emit_literal_done_repeat_emit_encodeBlockAsm10B:
3484	ADDL $0x05, CX
3485	MOVL CX, SI
3486	SUBL 16(SP), SI
3487	MOVQ src_len+32(FP), R9
3488	SUBL CX, R9
3489	LEAQ (DX)(CX*1), R10
3490	LEAQ (DX)(SI*1), SI
3491
3492	// matchLen
3493	XORL R12, R12
3494	CMPL R9, $0x08
3495	JL   matchlen_single_repeat_extend_encodeBlockAsm10B
3496
3497matchlen_loopback_repeat_extend_encodeBlockAsm10B:
3498	MOVQ  (R10)(R12*1), R11
3499	XORQ  (SI)(R12*1), R11
3500	TESTQ R11, R11
3501	JZ    matchlen_loop_repeat_extend_encodeBlockAsm10B
3502	BSFQ  R11, R11
3503	SARQ  $0x03, R11
3504	LEAL  (R12)(R11*1), R12
3505	JMP   repeat_extend_forward_end_encodeBlockAsm10B
3506
3507matchlen_loop_repeat_extend_encodeBlockAsm10B:
3508	LEAL -8(R9), R9
3509	LEAL 8(R12), R12
3510	CMPL R9, $0x08
3511	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm10B
3512
3513matchlen_single_repeat_extend_encodeBlockAsm10B:
3514	TESTL R9, R9
3515	JZ    repeat_extend_forward_end_encodeBlockAsm10B
3516
3517matchlen_single_loopback_repeat_extend_encodeBlockAsm10B:
3518	MOVB (R10)(R12*1), R11
3519	CMPB (SI)(R12*1), R11
3520	JNE  repeat_extend_forward_end_encodeBlockAsm10B
3521	LEAL 1(R12), R12
3522	DECL R9
3523	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm10B
3524
3525repeat_extend_forward_end_encodeBlockAsm10B:
3526	ADDL  R12, CX
3527	MOVL  CX, SI
3528	SUBL  DI, SI
3529	MOVL  16(SP), DI
3530	TESTL R8, R8
3531	JZ    repeat_as_copy_encodeBlockAsm10B
3532
3533	// emitRepeat
3534	MOVL SI, R8
3535	LEAL -4(SI), SI
3536	CMPL R8, $0x08
3537	JLE  repeat_two_match_repeat_encodeBlockAsm10B
3538	CMPL R8, $0x0c
3539	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
3540	CMPL DI, $0x00000800
3541	JLT  repeat_two_offset_match_repeat_encodeBlockAsm10B
3542
3543cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
3544	CMPL SI, $0x00000104
3545	JLT  repeat_three_match_repeat_encodeBlockAsm10B
3546	LEAL -256(SI), SI
3547	MOVW $0x0019, (AX)
3548	MOVW SI, 2(AX)
3549	ADDQ $0x04, AX
3550	JMP  repeat_end_emit_encodeBlockAsm10B
3551
3552repeat_three_match_repeat_encodeBlockAsm10B:
3553	LEAL -4(SI), SI
3554	MOVW $0x0015, (AX)
3555	MOVB SI, 2(AX)
3556	ADDQ $0x03, AX
3557	JMP  repeat_end_emit_encodeBlockAsm10B
3558
3559repeat_two_match_repeat_encodeBlockAsm10B:
3560	SHLL $0x02, SI
3561	ORL  $0x01, SI
3562	MOVW SI, (AX)
3563	ADDQ $0x02, AX
3564	JMP  repeat_end_emit_encodeBlockAsm10B
3565
3566repeat_two_offset_match_repeat_encodeBlockAsm10B:
3567	XORQ R8, R8
3568	LEAL 1(R8)(SI*4), SI
3569	MOVB DI, 1(AX)
3570	SARL $0x08, DI
3571	SHLL $0x05, DI
3572	ORL  DI, SI
3573	MOVB SI, (AX)
3574	ADDQ $0x02, AX
3575	JMP  repeat_end_emit_encodeBlockAsm10B
3576
3577repeat_as_copy_encodeBlockAsm10B:
3578	// emitCopy
3579two_byte_offset_repeat_as_copy_encodeBlockAsm10B:
3580	CMPL SI, $0x40
3581	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
3582	MOVB $0xee, (AX)
3583	MOVW DI, 1(AX)
3584	LEAL -60(SI), SI
3585	ADDQ $0x03, AX
3586
3587	// emitRepeat
3588	MOVL SI, R8
3589	LEAL -4(SI), SI
3590	CMPL R8, $0x08
3591	JLE  repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
3592	CMPL R8, $0x0c
3593	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
3594	CMPL DI, $0x00000800
3595	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
3596
3597cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
3598	CMPL SI, $0x00000104
3599	JLT  repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
3600	LEAL -256(SI), SI
3601	MOVW $0x0019, (AX)
3602	MOVW SI, 2(AX)
3603	ADDQ $0x04, AX
3604	JMP  repeat_end_emit_encodeBlockAsm10B
3605
3606repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
3607	LEAL -4(SI), SI
3608	MOVW $0x0015, (AX)
3609	MOVB SI, 2(AX)
3610	ADDQ $0x03, AX
3611	JMP  repeat_end_emit_encodeBlockAsm10B
3612
3613repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
3614	SHLL $0x02, SI
3615	ORL  $0x01, SI
3616	MOVW SI, (AX)
3617	ADDQ $0x02, AX
3618	JMP  repeat_end_emit_encodeBlockAsm10B
3619
3620repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
3621	XORQ R8, R8
3622	LEAL 1(R8)(SI*4), SI
3623	MOVB DI, 1(AX)
3624	SARL $0x08, DI
3625	SHLL $0x05, DI
3626	ORL  DI, SI
3627	MOVB SI, (AX)
3628	ADDQ $0x02, AX
3629	JMP  repeat_end_emit_encodeBlockAsm10B
3630	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B
3631
3632two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
3633	CMPL SI, $0x0c
3634	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm10B
3635	CMPL DI, $0x00000800
3636	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm10B
3637	MOVB $0x01, BL
3638	LEAL -16(BX)(SI*4), SI
3639	MOVB DI, 1(AX)
3640	SHRL $0x08, DI
3641	SHLL $0x05, DI
3642	ORL  DI, SI
3643	MOVB SI, (AX)
3644	ADDQ $0x02, AX
3645	JMP  repeat_end_emit_encodeBlockAsm10B
3646
3647emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
3648	MOVB $0x02, BL
3649	LEAL -4(BX)(SI*4), SI
3650	MOVB SI, (AX)
3651	MOVW DI, 1(AX)
3652	ADDQ $0x03, AX
3653
3654repeat_end_emit_encodeBlockAsm10B:
3655	MOVL CX, 12(SP)
3656	JMP  search_loop_encodeBlockAsm10B
3657
3658no_repeat_found_encodeBlockAsm10B:
3659	CMPL (DX)(SI*1), DI
3660	JEQ  candidate_match_encodeBlockAsm10B
3661	SHRQ $0x08, DI
3662	MOVL 24(SP)(R10*4), SI
3663	LEAL 2(CX), R9
3664	CMPL (DX)(R8*1), DI
3665	JEQ  candidate2_match_encodeBlockAsm10B
3666	MOVL R9, 24(SP)(R10*4)
3667	SHRQ $0x08, DI
3668	CMPL (DX)(SI*1), DI
3669	JEQ  candidate3_match_encodeBlockAsm10B
3670	MOVL 20(SP), CX
3671	JMP  search_loop_encodeBlockAsm10B
3672
3673candidate3_match_encodeBlockAsm10B:
3674	ADDL $0x02, CX
3675	JMP  candidate_match_encodeBlockAsm10B
3676
3677candidate2_match_encodeBlockAsm10B:
3678	MOVL R9, 24(SP)(R10*4)
3679	INCL CX
3680	MOVL R8, SI
3681
3682candidate_match_encodeBlockAsm10B:
3683	MOVL  12(SP), DI
3684	TESTL SI, SI
3685	JZ    match_extend_back_end_encodeBlockAsm10B
3686
3687match_extend_back_loop_encodeBlockAsm10B:
3688	CMPL CX, DI
3689	JLE  match_extend_back_end_encodeBlockAsm10B
3690	MOVB -1(DX)(SI*1), BL
3691	MOVB -1(DX)(CX*1), R8
3692	CMPB BL, R8
3693	JNE  match_extend_back_end_encodeBlockAsm10B
3694	LEAL -1(CX), CX
3695	DECL SI
3696	JZ   match_extend_back_end_encodeBlockAsm10B
3697	JMP  match_extend_back_loop_encodeBlockAsm10B
3698
3699match_extend_back_end_encodeBlockAsm10B:
3700	MOVL CX, DI
3701	SUBL 12(SP), DI
3702	LEAQ 3(AX)(DI*1), DI
3703	CMPQ DI, (SP)
3704	JL   match_dst_size_check_encodeBlockAsm10B
3705	MOVQ $0x00000000, ret+48(FP)
3706	RET
3707
3708match_dst_size_check_encodeBlockAsm10B:
3709	MOVL CX, DI
3710	MOVL 12(SP), R8
3711	CMPL R8, DI
3712	JEQ  emit_literal_done_match_emit_encodeBlockAsm10B
3713	MOVL DI, R9
3714	MOVL DI, 12(SP)
3715	LEAQ (DX)(R8*1), DI
3716	SUBL R8, R9
3717	LEAL -1(R9), R8
3718	CMPL R8, $0x3c
3719	JLT  one_byte_match_emit_encodeBlockAsm10B
3720	CMPL R8, $0x00000100
3721	JLT  two_bytes_match_emit_encodeBlockAsm10B
3722	MOVB $0xf4, (AX)
3723	MOVW R8, 1(AX)
3724	ADDQ $0x03, AX
3725	JMP  memmove_long_match_emit_encodeBlockAsm10B
3726
3727two_bytes_match_emit_encodeBlockAsm10B:
3728	MOVB $0xf0, (AX)
3729	MOVB R8, 1(AX)
3730	ADDQ $0x02, AX
3731	CMPL R8, $0x40
3732	JL   memmove_match_emit_encodeBlockAsm10B
3733	JMP  memmove_long_match_emit_encodeBlockAsm10B
3734
3735one_byte_match_emit_encodeBlockAsm10B:
3736	SHLB $0x02, R8
3737	MOVB R8, (AX)
3738	ADDQ $0x01, AX
3739
3740memmove_match_emit_encodeBlockAsm10B:
3741	LEAQ (AX)(R9*1), R8
3742
3743	// genMemMoveShort
3744	CMPQ R9, $0x03
3745	JB   emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_1or2
3746	JE   emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_3
3747	CMPQ R9, $0x08
3748	JB   emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_4through7
3749	CMPQ R9, $0x10
3750	JBE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
3751	CMPQ R9, $0x20
3752	JBE  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
3753	JMP  emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64
3754
3755emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_1or2:
3756	MOVB (DI), R10
3757	MOVB -1(DI)(R9*1), DI
3758	MOVB R10, (AX)
3759	MOVB DI, -1(AX)(R9*1)
3760	JMP  memmove_end_copy_match_emit_encodeBlockAsm10B
3761
3762emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_3:
3763	MOVW (DI), R10
3764	MOVB 2(DI), DI
3765	MOVW R10, (AX)
3766	MOVB DI, 2(AX)
3767	JMP  memmove_end_copy_match_emit_encodeBlockAsm10B
3768
3769emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_4through7:
3770	MOVL (DI), R10
3771	MOVL -4(DI)(R9*1), DI
3772	MOVL R10, (AX)
3773	MOVL DI, -4(AX)(R9*1)
3774	JMP  memmove_end_copy_match_emit_encodeBlockAsm10B
3775
3776emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
3777	MOVQ (DI), R10
3778	MOVQ -8(DI)(R9*1), DI
3779	MOVQ R10, (AX)
3780	MOVQ DI, -8(AX)(R9*1)
3781	JMP  memmove_end_copy_match_emit_encodeBlockAsm10B
3782
3783emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
3784	MOVOU (DI), X0
3785	MOVOU -16(DI)(R9*1), X1
3786	MOVOU X0, (AX)
3787	MOVOU X1, -16(AX)(R9*1)
3788	JMP   memmove_end_copy_match_emit_encodeBlockAsm10B
3789
3790emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
3791	MOVOU (DI), X0
3792	MOVOU 16(DI), X1
3793	MOVOU -32(DI)(R9*1), X2
3794	MOVOU -16(DI)(R9*1), X3
3795	MOVOU X0, (AX)
3796	MOVOU X1, 16(AX)
3797	MOVOU X2, -32(AX)(R9*1)
3798	MOVOU X3, -16(AX)(R9*1)
3799
3800memmove_end_copy_match_emit_encodeBlockAsm10B:
3801	MOVQ R8, AX
3802	JMP  emit_literal_done_match_emit_encodeBlockAsm10B
3803
3804memmove_long_match_emit_encodeBlockAsm10B:
3805	LEAQ (AX)(R9*1), R8
3806
3807	// genMemMoveLong
3808	MOVOU (DI), X0
3809	MOVOU 16(DI), X1
3810	MOVOU -32(DI)(R9*1), X2
3811	MOVOU -16(DI)(R9*1), X3
3812	MOVQ  R9, R11
3813	SHRQ  $0x05, R11
3814	MOVQ  AX, R10
3815	ANDL  $0x0000001f, R10
3816	MOVQ  $0x00000040, R12
3817	SUBQ  R10, R12
3818	DECQ  R11
3819	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
3820	LEAQ  -32(DI)(R12*1), R10
3821	LEAQ  -32(AX)(R12*1), R13
3822
3823emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
3824	MOVOU (R10), X4
3825	MOVOU 16(R10), X5
3826	MOVOA X4, (R13)
3827	MOVOA X5, 16(R13)
3828	ADDQ  $0x20, R13
3829	ADDQ  $0x20, R10
3830	ADDQ  $0x20, R12
3831	DECQ  R11
3832	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back
3833
3834emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
3835	MOVOU -32(DI)(R12*1), X4
3836	MOVOU -16(DI)(R12*1), X5
3837	MOVOA X4, -32(AX)(R12*1)
3838	MOVOA X5, -16(AX)(R12*1)
3839	ADDQ  $0x20, R12
3840	CMPQ  R9, R12
3841	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
3842	MOVOU X0, (AX)
3843	MOVOU X1, 16(AX)
3844	MOVOU X2, -32(AX)(R9*1)
3845	MOVOU X3, -16(AX)(R9*1)
3846	MOVQ  R8, AX
3847
3848emit_literal_done_match_emit_encodeBlockAsm10B:
3849match_nolit_loop_encodeBlockAsm10B:
3850	MOVL CX, DI
3851	SUBL SI, DI
3852	MOVL DI, 16(SP)
3853	ADDL $0x04, CX
3854	ADDL $0x04, SI
3855	MOVQ src_len+32(FP), DI
3856	SUBL CX, DI
3857	LEAQ (DX)(CX*1), R8
3858	LEAQ (DX)(SI*1), SI
3859
3860	// matchLen
3861	XORL R10, R10
3862	CMPL DI, $0x08
3863	JL   matchlen_single_match_nolit_encodeBlockAsm10B
3864
3865matchlen_loopback_match_nolit_encodeBlockAsm10B:
3866	MOVQ  (R8)(R10*1), R9
3867	XORQ  (SI)(R10*1), R9
3868	TESTQ R9, R9
3869	JZ    matchlen_loop_match_nolit_encodeBlockAsm10B
3870	BSFQ  R9, R9
3871	SARQ  $0x03, R9
3872	LEAL  (R10)(R9*1), R10
3873	JMP   match_nolit_end_encodeBlockAsm10B
3874
3875matchlen_loop_match_nolit_encodeBlockAsm10B:
3876	LEAL -8(DI), DI
3877	LEAL 8(R10), R10
3878	CMPL DI, $0x08
3879	JGE  matchlen_loopback_match_nolit_encodeBlockAsm10B
3880
3881matchlen_single_match_nolit_encodeBlockAsm10B:
3882	TESTL DI, DI
3883	JZ    match_nolit_end_encodeBlockAsm10B
3884
3885matchlen_single_loopback_match_nolit_encodeBlockAsm10B:
3886	MOVB (R8)(R10*1), R9
3887	CMPB (SI)(R10*1), R9
3888	JNE  match_nolit_end_encodeBlockAsm10B
3889	LEAL 1(R10), R10
3890	DECL DI
3891	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm10B
3892
3893match_nolit_end_encodeBlockAsm10B:
3894	ADDL R10, CX
3895	MOVL 16(SP), SI
3896	ADDL $0x04, R10
3897	MOVL CX, 12(SP)
3898
3899	// emitCopy
3900two_byte_offset_match_nolit_encodeBlockAsm10B:
3901	CMPL R10, $0x40
3902	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm10B
3903	MOVB $0xee, (AX)
3904	MOVW SI, 1(AX)
3905	LEAL -60(R10), R10
3906	ADDQ $0x03, AX
3907
3908	// emitRepeat
3909	MOVL R10, DI
3910	LEAL -4(R10), R10
3911	CMPL DI, $0x08
3912	JLE  repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
3913	CMPL DI, $0x0c
3914	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
3915	CMPL SI, $0x00000800
3916	JLT  repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
3917
3918cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
3919	CMPL R10, $0x00000104
3920	JLT  repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
3921	LEAL -256(R10), R10
3922	MOVW $0x0019, (AX)
3923	MOVW R10, 2(AX)
3924	ADDQ $0x04, AX
3925	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
3926
3927repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
3928	LEAL -4(R10), R10
3929	MOVW $0x0015, (AX)
3930	MOVB R10, 2(AX)
3931	ADDQ $0x03, AX
3932	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
3933
3934repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
3935	SHLL $0x02, R10
3936	ORL  $0x01, R10
3937	MOVW R10, (AX)
3938	ADDQ $0x02, AX
3939	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
3940
3941repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
3942	XORQ DI, DI
3943	LEAL 1(DI)(R10*4), R10
3944	MOVB SI, 1(AX)
3945	SARL $0x08, SI
3946	SHLL $0x05, SI
3947	ORL  SI, R10
3948	MOVB R10, (AX)
3949	ADDQ $0x02, AX
3950	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
3951	JMP two_byte_offset_match_nolit_encodeBlockAsm10B
3952
3953two_byte_offset_short_match_nolit_encodeBlockAsm10B:
3954	CMPL R10, $0x0c
3955	JGE  emit_copy_three_match_nolit_encodeBlockAsm10B
3956	CMPL SI, $0x00000800
3957	JGE  emit_copy_three_match_nolit_encodeBlockAsm10B
3958	MOVB $0x01, BL
3959	LEAL -16(BX)(R10*4), R10
3960	MOVB SI, 1(AX)
3961	SHRL $0x08, SI
3962	SHLL $0x05, SI
3963	ORL  SI, R10
3964	MOVB R10, (AX)
3965	ADDQ $0x02, AX
3966	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B
3967
3968emit_copy_three_match_nolit_encodeBlockAsm10B:
3969	MOVB $0x02, BL
3970	LEAL -4(BX)(R10*4), R10
3971	MOVB R10, (AX)
3972	MOVW SI, 1(AX)
3973	ADDQ $0x03, AX
3974
3975match_nolit_emitcopy_end_encodeBlockAsm10B:
3976	CMPL CX, 8(SP)
3977	JGE  emit_remainder_encodeBlockAsm10B
3978	MOVQ -2(DX)(CX*1), DI
3979	CMPQ AX, (SP)
3980	JL   match_nolit_dst_ok_encodeBlockAsm10B
3981	MOVQ $0x00000000, ret+48(FP)
3982	RET
3983
3984match_nolit_dst_ok_encodeBlockAsm10B:
3985	MOVQ  $0x9e3779b1, R9
3986	MOVQ  DI, R8
3987	SHRQ  $0x10, DI
3988	MOVQ  DI, SI
3989	SHLQ  $0x20, R8
3990	IMULQ R9, R8
3991	SHRQ  $0x36, R8
3992	SHLQ  $0x20, SI
3993	IMULQ R9, SI
3994	SHRQ  $0x36, SI
3995	LEAL  -2(CX), R9
3996	LEAQ  24(SP)(SI*4), R10
3997	MOVL  (R10), SI
3998	MOVL  R9, 24(SP)(R8*4)
3999	MOVL  CX, (R10)
4000	CMPL  (DX)(SI*1), DI
4001	JEQ   match_nolit_loop_encodeBlockAsm10B
4002	INCL  CX
4003	JMP   search_loop_encodeBlockAsm10B
4004
4005emit_remainder_encodeBlockAsm10B:
4006	MOVQ src_len+32(FP), CX
4007	SUBL 12(SP), CX
4008	LEAQ 3(AX)(CX*1), CX
4009	CMPQ CX, (SP)
4010	JL   emit_remainder_ok_encodeBlockAsm10B
4011	MOVQ $0x00000000, ret+48(FP)
4012	RET
4013
4014emit_remainder_ok_encodeBlockAsm10B:
4015	MOVQ src_len+32(FP), CX
4016	MOVL 12(SP), BX
4017	CMPL BX, CX
4018	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm10B
4019	MOVL CX, SI
4020	MOVL CX, 12(SP)
4021	LEAQ (DX)(BX*1), CX
4022	SUBL BX, SI
4023	LEAL -1(SI), DX
4024	CMPL DX, $0x3c
4025	JLT  one_byte_emit_remainder_encodeBlockAsm10B
4026	CMPL DX, $0x00000100
4027	JLT  two_bytes_emit_remainder_encodeBlockAsm10B
4028	MOVB $0xf4, (AX)
4029	MOVW DX, 1(AX)
4030	ADDQ $0x03, AX
4031	JMP  memmove_long_emit_remainder_encodeBlockAsm10B
4032
4033two_bytes_emit_remainder_encodeBlockAsm10B:
4034	MOVB $0xf0, (AX)
4035	MOVB DL, 1(AX)
4036	ADDQ $0x02, AX
4037	CMPL DX, $0x40
4038	JL   memmove_emit_remainder_encodeBlockAsm10B
4039	JMP  memmove_long_emit_remainder_encodeBlockAsm10B
4040
4041one_byte_emit_remainder_encodeBlockAsm10B:
4042	SHLB $0x02, DL
4043	MOVB DL, (AX)
4044	ADDQ $0x01, AX
4045
4046memmove_emit_remainder_encodeBlockAsm10B:
4047	LEAQ (AX)(SI*1), DX
4048	MOVL SI, BX
4049
4050	// genMemMoveShort
4051	CMPQ BX, $0x03
4052	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2
4053	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3
4054	CMPQ BX, $0x08
4055	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7
4056	CMPQ BX, $0x10
4057	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16
4058	CMPQ BX, $0x20
4059	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32
4060	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64
4061
4062emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2:
4063	MOVB (CX), SI
4064	MOVB -1(CX)(BX*1), CL
4065	MOVB SI, (AX)
4066	MOVB CL, -1(AX)(BX*1)
4067	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B
4068
4069emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3:
4070	MOVW (CX), SI
4071	MOVB 2(CX), CL
4072	MOVW SI, (AX)
4073	MOVB CL, 2(AX)
4074	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B
4075
4076emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7:
4077	MOVL (CX), SI
4078	MOVL -4(CX)(BX*1), CX
4079	MOVL SI, (AX)
4080	MOVL CX, -4(AX)(BX*1)
4081	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B
4082
4083emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16:
4084	MOVQ (CX), SI
4085	MOVQ -8(CX)(BX*1), CX
4086	MOVQ SI, (AX)
4087	MOVQ CX, -8(AX)(BX*1)
4088	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B
4089
4090emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32:
4091	MOVOU (CX), X0
4092	MOVOU -16(CX)(BX*1), X1
4093	MOVOU X0, (AX)
4094	MOVOU X1, -16(AX)(BX*1)
4095	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm10B
4096
4097emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64:
4098	MOVOU (CX), X0
4099	MOVOU 16(CX), X1
4100	MOVOU -32(CX)(BX*1), X2
4101	MOVOU -16(CX)(BX*1), X3
4102	MOVOU X0, (AX)
4103	MOVOU X1, 16(AX)
4104	MOVOU X2, -32(AX)(BX*1)
4105	MOVOU X3, -16(AX)(BX*1)
4106
4107memmove_end_copy_emit_remainder_encodeBlockAsm10B:
4108	MOVQ DX, AX
4109	JMP  emit_literal_done_emit_remainder_encodeBlockAsm10B
4110
4111memmove_long_emit_remainder_encodeBlockAsm10B:
4112	LEAQ (AX)(SI*1), DX
4113	MOVL SI, BX
4114
4115	// genMemMoveLong
4116	MOVOU (CX), X0
4117	MOVOU 16(CX), X1
4118	MOVOU -32(CX)(BX*1), X2
4119	MOVOU -16(CX)(BX*1), X3
4120	MOVQ  BX, DI
4121	SHRQ  $0x05, DI
4122	MOVQ  AX, SI
4123	ANDL  $0x0000001f, SI
4124	MOVQ  $0x00000040, R8
4125	SUBQ  SI, R8
4126	DECQ  DI
4127	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
4128	LEAQ  -32(CX)(R8*1), SI
4129	LEAQ  -32(AX)(R8*1), R9
4130
4131emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back:
4132	MOVOU (SI), X4
4133	MOVOU 16(SI), X5
4134	MOVOA X4, (R9)
4135	MOVOA X5, 16(R9)
4136	ADDQ  $0x20, R9
4137	ADDQ  $0x20, SI
4138	ADDQ  $0x20, R8
4139	DECQ  DI
4140	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back
4141
4142emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32:
4143	MOVOU -32(CX)(R8*1), X4
4144	MOVOU -16(CX)(R8*1), X5
4145	MOVOA X4, -32(AX)(R8*1)
4146	MOVOA X5, -16(AX)(R8*1)
4147	ADDQ  $0x20, R8
4148	CMPQ  BX, R8
4149	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
4150	MOVOU X0, (AX)
4151	MOVOU X1, 16(AX)
4152	MOVOU X2, -32(AX)(BX*1)
4153	MOVOU X3, -16(AX)(BX*1)
4154	MOVQ  DX, AX
4155
4156emit_literal_done_emit_remainder_encodeBlockAsm10B:
4157	MOVQ dst_base+0(FP), CX
4158	SUBQ CX, AX
4159	MOVQ AX, ret+48(FP)
4160	RET
4161
4162// func encodeBlockAsm8B(dst []byte, src []byte) int
4163// Requires: SSE2
4164TEXT ·encodeBlockAsm8B(SB), $1048-56
4165	MOVQ dst_base+0(FP), AX
4166	MOVQ $0x00000008, CX
4167	LEAQ 24(SP), DX
4168	PXOR X0, X0
4169
4170zero_loop_encodeBlockAsm8B:
4171	MOVOU X0, (DX)
4172	MOVOU X0, 16(DX)
4173	MOVOU X0, 32(DX)
4174	MOVOU X0, 48(DX)
4175	MOVOU X0, 64(DX)
4176	MOVOU X0, 80(DX)
4177	MOVOU X0, 96(DX)
4178	MOVOU X0, 112(DX)
4179	ADDQ  $0x80, DX
4180	DECQ  CX
4181	JNZ   zero_loop_encodeBlockAsm8B
4182	MOVL  $0x00000000, 12(SP)
4183	MOVQ  src_len+32(FP), CX
4184	LEAQ  -5(CX), DX
4185	LEAQ  -8(CX), SI
4186	MOVL  SI, 8(SP)
4187	SHRQ  $0x05, CX
4188	SUBL  CX, DX
4189	LEAQ  (AX)(DX*1), DX
4190	MOVQ  DX, (SP)
4191	MOVL  $0x00000001, CX
4192	MOVL  CX, 16(SP)
4193	MOVQ  src_base+24(FP), DX
4194
4195search_loop_encodeBlockAsm8B:
4196	MOVL  CX, SI
4197	SUBL  12(SP), SI
4198	SHRL  $0x04, SI
4199	LEAL  4(CX)(SI*1), SI
4200	CMPL  SI, 8(SP)
4201	JGE   emit_remainder_encodeBlockAsm8B
4202	MOVQ  (DX)(CX*1), DI
4203	MOVL  SI, 20(SP)
4204	MOVQ  $0x9e3779b1, R9
4205	MOVQ  DI, R10
4206	MOVQ  DI, R11
4207	SHRQ  $0x08, R11
4208	SHLQ  $0x20, R10
4209	IMULQ R9, R10
4210	SHRQ  $0x38, R10
4211	SHLQ  $0x20, R11
4212	IMULQ R9, R11
4213	SHRQ  $0x38, R11
4214	MOVL  24(SP)(R10*4), SI
4215	MOVL  24(SP)(R11*4), R8
4216	MOVL  CX, 24(SP)(R10*4)
4217	LEAL  1(CX), R10
4218	MOVL  R10, 24(SP)(R11*4)
4219	MOVQ  DI, R10
4220	SHRQ  $0x10, R10
4221	SHLQ  $0x20, R10
4222	IMULQ R9, R10
4223	SHRQ  $0x38, R10
4224	MOVL  CX, R9
4225	SUBL  16(SP), R9
4226	MOVL  1(DX)(R9*1), R11
4227	MOVQ  DI, R9
4228	SHRQ  $0x08, R9
4229	CMPL  R9, R11
4230	JNE   no_repeat_found_encodeBlockAsm8B
4231	LEAL  1(CX), DI
4232	MOVL  12(SP), R8
4233	MOVL  DI, SI
4234	SUBL  16(SP), SI
4235	JZ    repeat_extend_back_end_encodeBlockAsm8B
4236
4237repeat_extend_back_loop_encodeBlockAsm8B:
4238	CMPL DI, R8
4239	JLE  repeat_extend_back_end_encodeBlockAsm8B
4240	MOVB -1(DX)(SI*1), BL
4241	MOVB -1(DX)(DI*1), R9
4242	CMPB BL, R9
4243	JNE  repeat_extend_back_end_encodeBlockAsm8B
4244	LEAL -1(DI), DI
4245	DECL SI
4246	JNZ  repeat_extend_back_loop_encodeBlockAsm8B
4247
4248repeat_extend_back_end_encodeBlockAsm8B:
4249	MOVL 12(SP), SI
4250	CMPL SI, DI
4251	JEQ  emit_literal_done_repeat_emit_encodeBlockAsm8B
4252	MOVL DI, R9
4253	MOVL DI, 12(SP)
4254	LEAQ (DX)(SI*1), R10
4255	SUBL SI, R9
4256	LEAL -1(R9), SI
4257	CMPL SI, $0x3c
4258	JLT  one_byte_repeat_emit_encodeBlockAsm8B
4259	CMPL SI, $0x00000100
4260	JLT  two_bytes_repeat_emit_encodeBlockAsm8B
4261	MOVB $0xf4, (AX)
4262	MOVW SI, 1(AX)
4263	ADDQ $0x03, AX
4264	JMP  memmove_long_repeat_emit_encodeBlockAsm8B
4265
4266two_bytes_repeat_emit_encodeBlockAsm8B:
4267	MOVB $0xf0, (AX)
4268	MOVB SI, 1(AX)
4269	ADDQ $0x02, AX
4270	CMPL SI, $0x40
4271	JL   memmove_repeat_emit_encodeBlockAsm8B
4272	JMP  memmove_long_repeat_emit_encodeBlockAsm8B
4273
4274one_byte_repeat_emit_encodeBlockAsm8B:
4275	SHLB $0x02, SI
4276	MOVB SI, (AX)
4277	ADDQ $0x01, AX
4278
4279memmove_repeat_emit_encodeBlockAsm8B:
4280	LEAQ (AX)(R9*1), SI
4281
4282	// genMemMoveShort
4283	CMPQ R9, $0x03
4284	JB   emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_1or2
4285	JE   emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_3
4286	CMPQ R9, $0x08
4287	JB   emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_4through7
4288	CMPQ R9, $0x10
4289	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
4290	CMPQ R9, $0x20
4291	JBE  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
4292	JMP  emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64
4293
4294emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_1or2:
4295	MOVB (R10), R11
4296	MOVB -1(R10)(R9*1), R10
4297	MOVB R11, (AX)
4298	MOVB R10, -1(AX)(R9*1)
4299	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm8B
4300
4301emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_3:
4302	MOVW (R10), R11
4303	MOVB 2(R10), R10
4304	MOVW R11, (AX)
4305	MOVB R10, 2(AX)
4306	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm8B
4307
4308emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_4through7:
4309	MOVL (R10), R11
4310	MOVL -4(R10)(R9*1), R10
4311	MOVL R11, (AX)
4312	MOVL R10, -4(AX)(R9*1)
4313	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm8B
4314
4315emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
4316	MOVQ (R10), R11
4317	MOVQ -8(R10)(R9*1), R10
4318	MOVQ R11, (AX)
4319	MOVQ R10, -8(AX)(R9*1)
4320	JMP  memmove_end_copy_repeat_emit_encodeBlockAsm8B
4321
4322emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
4323	MOVOU (R10), X0
4324	MOVOU -16(R10)(R9*1), X1
4325	MOVOU X0, (AX)
4326	MOVOU X1, -16(AX)(R9*1)
4327	JMP   memmove_end_copy_repeat_emit_encodeBlockAsm8B
4328
4329emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
4330	MOVOU (R10), X0
4331	MOVOU 16(R10), X1
4332	MOVOU -32(R10)(R9*1), X2
4333	MOVOU -16(R10)(R9*1), X3
4334	MOVOU X0, (AX)
4335	MOVOU X1, 16(AX)
4336	MOVOU X2, -32(AX)(R9*1)
4337	MOVOU X3, -16(AX)(R9*1)
4338
4339memmove_end_copy_repeat_emit_encodeBlockAsm8B:
4340	MOVQ SI, AX
4341	JMP  emit_literal_done_repeat_emit_encodeBlockAsm8B
4342
4343memmove_long_repeat_emit_encodeBlockAsm8B:
4344	LEAQ (AX)(R9*1), SI
4345
4346	// genMemMoveLong
4347	MOVOU (R10), X0
4348	MOVOU 16(R10), X1
4349	MOVOU -32(R10)(R9*1), X2
4350	MOVOU -16(R10)(R9*1), X3
4351	MOVQ  R9, R12
4352	SHRQ  $0x05, R12
4353	MOVQ  AX, R11
4354	ANDL  $0x0000001f, R11
4355	MOVQ  $0x00000040, R13
4356	SUBQ  R11, R13
4357	DECQ  R12
4358	JA    emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
4359	LEAQ  -32(R10)(R13*1), R11
4360	LEAQ  -32(AX)(R13*1), R14
4361
4362emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
4363	MOVOU (R11), X4
4364	MOVOU 16(R11), X5
4365	MOVOA X4, (R14)
4366	MOVOA X5, 16(R14)
4367	ADDQ  $0x20, R14
4368	ADDQ  $0x20, R11
4369	ADDQ  $0x20, R13
4370	DECQ  R12
4371	JNA   emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back
4372
4373emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
4374	MOVOU -32(R10)(R13*1), X4
4375	MOVOU -16(R10)(R13*1), X5
4376	MOVOA X4, -32(AX)(R13*1)
4377	MOVOA X5, -16(AX)(R13*1)
4378	ADDQ  $0x20, R13
4379	CMPQ  R9, R13
4380	JAE   emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
4381	MOVOU X0, (AX)
4382	MOVOU X1, 16(AX)
4383	MOVOU X2, -32(AX)(R9*1)
4384	MOVOU X3, -16(AX)(R9*1)
4385	MOVQ  SI, AX
4386
4387emit_literal_done_repeat_emit_encodeBlockAsm8B:
4388	ADDL $0x05, CX
4389	MOVL CX, SI
4390	SUBL 16(SP), SI
4391	MOVQ src_len+32(FP), R9
4392	SUBL CX, R9
4393	LEAQ (DX)(CX*1), R10
4394	LEAQ (DX)(SI*1), SI
4395
4396	// matchLen
4397	XORL R12, R12
4398	CMPL R9, $0x08
4399	JL   matchlen_single_repeat_extend_encodeBlockAsm8B
4400
4401matchlen_loopback_repeat_extend_encodeBlockAsm8B:
4402	MOVQ  (R10)(R12*1), R11
4403	XORQ  (SI)(R12*1), R11
4404	TESTQ R11, R11
4405	JZ    matchlen_loop_repeat_extend_encodeBlockAsm8B
4406	BSFQ  R11, R11
4407	SARQ  $0x03, R11
4408	LEAL  (R12)(R11*1), R12
4409	JMP   repeat_extend_forward_end_encodeBlockAsm8B
4410
4411matchlen_loop_repeat_extend_encodeBlockAsm8B:
4412	LEAL -8(R9), R9
4413	LEAL 8(R12), R12
4414	CMPL R9, $0x08
4415	JGE  matchlen_loopback_repeat_extend_encodeBlockAsm8B
4416
4417matchlen_single_repeat_extend_encodeBlockAsm8B:
4418	TESTL R9, R9
4419	JZ    repeat_extend_forward_end_encodeBlockAsm8B
4420
4421matchlen_single_loopback_repeat_extend_encodeBlockAsm8B:
4422	MOVB (R10)(R12*1), R11
4423	CMPB (SI)(R12*1), R11
4424	JNE  repeat_extend_forward_end_encodeBlockAsm8B
4425	LEAL 1(R12), R12
4426	DECL R9
4427	JNZ  matchlen_single_loopback_repeat_extend_encodeBlockAsm8B
4428
// Forward extension finished with R12 matching bytes. Advance CX past the
// match, derive the value to encode in SI, then emit either a repeat
// (below) or a regular copy (repeat_as_copy) depending on R8.
// NOTE(review): DI and R8 are set before this chunk. DI presumably holds the
// position where the repeat match started (making SI the match length) and
// R8 presumably a copy of 12(SP) taken at that time - confirm upstream.
4429repeat_extend_forward_end_encodeBlockAsm8B:
4430	ADDL  R12, CX
4431	MOVL  CX, SI
4432	SUBL  DI, SI // SI = CX - DI
4433	MOVL  16(SP), DI // DI = value of 16(SP), used as the offset by repeat_as_copy
4434	TESTL R8, R8
4435	JZ    repeat_as_copy_encodeBlockAsm8B // R8 == 0: encode as a copy instead of a repeat
4436
4437	// emitRepeat: DI = length, SI = length - 4.
4438	MOVL SI, DI
4439	LEAL -4(SI), SI
4440	CMPL DI, $0x08
4441	JLE  repeat_two_match_repeat_encodeBlockAsm8B // length <= 8: 2-byte form
4442	CMPL DI, $0x0c
4443	JGE  cant_repeat_two_offset_match_repeat_encodeBlockAsm8B // target is also the fall-through, so this branch has no effect
4444
4445cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
4446	CMPL SI, $0x00000104
4447	JLT  repeat_three_match_repeat_encodeBlockAsm8B // length-4 < 260: 3-byte form
4448	LEAL -256(SI), SI
4449	MOVW $0x0019, (AX) // bytes 0x19, 0x00
4450	MOVW SI, 2(AX) // low 16 bits of (length - 4 - 256)
4451	ADDQ $0x04, AX // 4 bytes written
4452	JMP  repeat_end_emit_encodeBlockAsm8B
4453
4454repeat_three_match_repeat_encodeBlockAsm8B:
4455	LEAL -4(SI), SI // SI = length - 8
4456	MOVW $0x0015, (AX) // bytes 0x15, 0x00
4457	MOVB SI, 2(AX)
4458	ADDQ $0x03, AX // 3 bytes written
4459	JMP  repeat_end_emit_encodeBlockAsm8B
4460
4461repeat_two_match_repeat_encodeBlockAsm8B:
4462	SHLL $0x02, SI
4463	ORL  $0x01, SI // SI = ((length - 4) << 2) | 1
4464	MOVW SI, (AX) // 16-bit store: low byte is the tag, second byte is the high byte of SI
4465	ADDQ $0x02, AX // 2 bytes written
4466	JMP  repeat_end_emit_encodeBlockAsm8B
	// NOTE: the instructions from here to the next label are unreachable:
	// they follow an unconditional JMP and carry no label of their own.
4467	XORQ R8, R8
4468	LEAL 1(R8)(SI*4), SI
4469	MOVB DI, 1(AX)
4470	SARL $0x08, DI
4471	SHLL $0x05, DI
4472	ORL  DI, SI
4473	MOVB SI, (AX)
4474	ADDQ $0x02, AX
4475	JMP  repeat_end_emit_encodeBlockAsm8B
4476
4477repeat_as_copy_encodeBlockAsm8B:
4478	// emitCopy
4479two_byte_offset_repeat_as_copy_encodeBlockAsm8B:
4480	CMPL SI, $0x40
4481	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
4482	MOVB $0xee, (AX)
4483	MOVW DI, 1(AX)
4484	LEAL -60(SI), SI
4485	ADDQ $0x03, AX
4486
4487	// emitRepeat
4488	MOVL SI, DI
4489	LEAL -4(SI), SI
4490	CMPL DI, $0x08
4491	JLE  repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
4492	CMPL DI, $0x0c
4493	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
4494
4495cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
4496	CMPL SI, $0x00000104
4497	JLT  repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
4498	LEAL -256(SI), SI
4499	MOVW $0x0019, (AX)
4500	MOVW SI, 2(AX)
4501	ADDQ $0x04, AX
4502	JMP  repeat_end_emit_encodeBlockAsm8B
4503
4504repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
4505	LEAL -4(SI), SI
4506	MOVW $0x0015, (AX)
4507	MOVB SI, 2(AX)
4508	ADDQ $0x03, AX
4509	JMP  repeat_end_emit_encodeBlockAsm8B
4510
4511repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
4512	SHLL $0x02, SI
4513	ORL  $0x01, SI
4514	MOVW SI, (AX)
4515	ADDQ $0x02, AX
4516	JMP  repeat_end_emit_encodeBlockAsm8B
4517	XORQ R8, R8
4518	LEAL 1(R8)(SI*4), SI
4519	MOVB DI, 1(AX)
4520	SARL $0x08, DI
4521	SHLL $0x05, DI
4522	ORL  DI, SI
4523	MOVB SI, (AX)
4524	ADDQ $0x02, AX
4525	JMP  repeat_end_emit_encodeBlockAsm8B
4526	JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B
4527
4528two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
4529	CMPL SI, $0x0c
4530	JGE  emit_copy_three_repeat_as_copy_encodeBlockAsm8B
4531	MOVB $0x01, BL
4532	LEAL -16(BX)(SI*4), SI
4533	MOVB DI, 1(AX)
4534	SHRL $0x08, DI
4535	SHLL $0x05, DI
4536	ORL  DI, SI
4537	MOVB SI, (AX)
4538	ADDQ $0x02, AX
4539	JMP  repeat_end_emit_encodeBlockAsm8B
4540
4541emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
4542	MOVB $0x02, BL
4543	LEAL -4(BX)(SI*4), SI
4544	MOVB SI, (AX)
4545	MOVW DI, 1(AX)
4546	ADDQ $0x03, AX
4547
4548repeat_end_emit_encodeBlockAsm8B:
4549	MOVL CX, 12(SP)
4550	JMP  search_loop_encodeBlockAsm8B
4551
// No repeat at this position: probe up to three table candidates, comparing
// 4 source bytes at each candidate with the low 32 bits of DI.
// NOTE(review): SI, R8, R10 and DI are loaded by the search loop before this
// chunk. Presumably SI/R8 are the table candidates for CX and CX+1, R10 the
// table index for CX+2 and DI the 8 source bytes at CX (as in the visible
// search loop of encodeBlockAsm at the top of the file) - confirm upstream.
4552no_repeat_found_encodeBlockAsm8B:
4553	CMPL (DX)(SI*1), DI
4554	JEQ  candidate_match_encodeBlockAsm8B // first candidate matches at CX
4555	SHRQ $0x08, DI // drop the lowest byte of DI
4556	MOVL 24(SP)(R10*4), SI // SI = third candidate, read from table slot R10
4557	LEAL 2(CX), R9 // R9 = CX + 2, the position to store in slot R10
4558	CMPL (DX)(R8*1), DI
4559	JEQ  candidate2_match_encodeBlockAsm8B // second candidate matches (CX + 1 after INCL below)
4560	MOVL R9, 24(SP)(R10*4)
4561	SHRQ $0x08, DI // drop one more byte
4562	CMPL (DX)(SI*1), DI
4563	JEQ  candidate3_match_encodeBlockAsm8B // third candidate matches (CX + 2 after ADDL below)
4564	MOVL 20(SP), CX // no match: continue at the position saved in 20(SP)
4565	JMP  search_loop_encodeBlockAsm8B
4566
4567candidate3_match_encodeBlockAsm8B:
4568	ADDL $0x02, CX
4569	JMP  candidate_match_encodeBlockAsm8B
4570
4571candidate2_match_encodeBlockAsm8B:
4572	MOVL R9, 24(SP)(R10*4) // this path skipped the table store above, so do it here
4573	INCL CX
4574	MOVL R8, SI // SI = matching candidate; falls through to candidate_match
4575
// A 4-byte match between position CX and candidate SI was found.
// Extend it backwards over bytes that are still pending as literals, then
// check that the literals fit in the output buffer.
// 12(SP) is the index of the first source byte not yet emitted.
4576candidate_match_encodeBlockAsm8B:
4577	MOVL  12(SP), DI
4578	TESTL SI, SI
4579	JZ    match_extend_back_end_encodeBlockAsm8B // candidate at index 0: nothing before it
4580
// While CX > 12(SP) and the bytes just before both positions are equal,
// step both positions back by one. Stops when SI reaches 0.
4581match_extend_back_loop_encodeBlockAsm8B:
4582	CMPL CX, DI
4583	JLE  match_extend_back_end_encodeBlockAsm8B
4584	MOVB -1(DX)(SI*1), BL
4585	MOVB -1(DX)(CX*1), R8
4586	CMPB BL, R8
4587	JNE  match_extend_back_end_encodeBlockAsm8B
4588	LEAL -1(CX), CX // LEAL leaves flags untouched
4589	DECL SI
4590	JZ   match_extend_back_end_encodeBlockAsm8B
4591	JMP  match_extend_back_loop_encodeBlockAsm8B
4592
// Output bound check: AX + (CX - 12(SP)) + 3 must be below the limit
// pointer stored at (SP) (set up before this chunk); otherwise return 0.
4593match_extend_back_end_encodeBlockAsm8B:
4594	MOVL CX, DI
4595	SUBL 12(SP), DI // DI = number of pending literal bytes
4596	LEAQ 3(AX)(DI*1), DI
4597	CMPQ DI, (SP)
4598	JL   match_dst_size_check_encodeBlockAsm8B
4599	MOVQ $0x00000000, ret+48(FP)
4600	RET
4601
// Emit the header for the literal run [12(SP), CX) and pick a copy routine.
// On exit towards the memmove code: DI = source address of the literals,
// R9 = literal length, AX = destination just past the header, and 12(SP)
// has been advanced to CX.
4602match_dst_size_check_encodeBlockAsm8B:
4603	MOVL CX, DI
4604	MOVL 12(SP), R8
4605	CMPL R8, DI
4606	JEQ  emit_literal_done_match_emit_encodeBlockAsm8B // no pending literals
4607	MOVL DI, R9
4608	MOVL DI, 12(SP) // literals up to CX are now accounted for
4609	LEAQ (DX)(R8*1), DI // DI = DX + old 12(SP): first literal byte
4610	SUBL R8, R9 // R9 = literal length
4611	LEAL -1(R9), R8 // R8 = length - 1, the value stored in the header
4612	CMPL R8, $0x3c
4613	JLT  one_byte_match_emit_encodeBlockAsm8B // length-1 < 60: 1-byte header
4614	CMPL R8, $0x00000100
4615	JLT  two_bytes_match_emit_encodeBlockAsm8B // length-1 < 256: 2-byte header
4616	MOVB $0xf4, (AX) // 0xf4 = 61<<2, followed by 16-bit length-1
4617	MOVW R8, 1(AX)
4618	ADDQ $0x03, AX
4619	JMP  memmove_long_match_emit_encodeBlockAsm8B
4620
4621two_bytes_match_emit_encodeBlockAsm8B:
4622	MOVB $0xf0, (AX) // 0xf0 = 60<<2, followed by 8-bit length-1
4623	MOVB R8, 1(AX)
4624	ADDQ $0x02, AX
4625	CMPL R8, $0x40
4626	JL   memmove_match_emit_encodeBlockAsm8B // length <= 64: short copy
4627	JMP  memmove_long_match_emit_encodeBlockAsm8B
4628
4629one_byte_match_emit_encodeBlockAsm8B:
4630	SHLB $0x02, R8 // header byte = (length-1) << 2
4631	MOVB R8, (AX)
4632	ADDQ $0x01, AX // falls through to memmove_match_emit
4633
4634memmove_match_emit_encodeBlockAsm8B:
4635	LEAQ (AX)(R9*1), R8
4636
4637	// genMemMoveShort
4638	CMPQ R9, $0x03
4639	JB   emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_1or2
4640	JE   emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_3
4641	CMPQ R9, $0x08
4642	JB   emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_4through7
4643	CMPQ R9, $0x10
4644	JBE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
4645	CMPQ R9, $0x20
4646	JBE  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
4647	JMP  emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64
4648
4649emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_1or2:
4650	MOVB (DI), R10
4651	MOVB -1(DI)(R9*1), DI
4652	MOVB R10, (AX)
4653	MOVB DI, -1(AX)(R9*1)
4654	JMP  memmove_end_copy_match_emit_encodeBlockAsm8B
4655
4656emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_3:
4657	MOVW (DI), R10
4658	MOVB 2(DI), DI
4659	MOVW R10, (AX)
4660	MOVB DI, 2(AX)
4661	JMP  memmove_end_copy_match_emit_encodeBlockAsm8B
4662
4663emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_4through7:
4664	MOVL (DI), R10
4665	MOVL -4(DI)(R9*1), DI
4666	MOVL R10, (AX)
4667	MOVL DI, -4(AX)(R9*1)
4668	JMP  memmove_end_copy_match_emit_encodeBlockAsm8B
4669
4670emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
4671	MOVQ (DI), R10
4672	MOVQ -8(DI)(R9*1), DI
4673	MOVQ R10, (AX)
4674	MOVQ DI, -8(AX)(R9*1)
4675	JMP  memmove_end_copy_match_emit_encodeBlockAsm8B
4676
4677emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
4678	MOVOU (DI), X0
4679	MOVOU -16(DI)(R9*1), X1
4680	MOVOU X0, (AX)
4681	MOVOU X1, -16(AX)(R9*1)
4682	JMP   memmove_end_copy_match_emit_encodeBlockAsm8B
4683
4684emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
4685	MOVOU (DI), X0
4686	MOVOU 16(DI), X1
4687	MOVOU -32(DI)(R9*1), X2
4688	MOVOU -16(DI)(R9*1), X3
4689	MOVOU X0, (AX)
4690	MOVOU X1, 16(AX)
4691	MOVOU X2, -32(AX)(R9*1)
4692	MOVOU X3, -16(AX)(R9*1)
4693
4694memmove_end_copy_match_emit_encodeBlockAsm8B:
4695	MOVQ R8, AX
4696	JMP  emit_literal_done_match_emit_encodeBlockAsm8B
4697
// Copy R9 literal bytes from (DI) to (AX) for runs longer than 64 bytes
// (see the callers' length checks), then set AX to the end of the copy.
// Strategy: the first and last 32 bytes are loaded up front and stored
// unaligned at the very end; the middle is copied in 32-byte steps using
// aligned stores to the destination.
4698memmove_long_match_emit_encodeBlockAsm8B:
4699	LEAQ (AX)(R9*1), R8 // R8 = destination end, becomes AX when done
4700
4701	// genMemMoveLong
4702	MOVOU (DI), X0 // first 32 source bytes -> X0, X1
4703	MOVOU 16(DI), X1
4704	MOVOU -32(DI)(R9*1), X2 // last 32 source bytes -> X2, X3
4705	MOVOU -16(DI)(R9*1), X3
4706	MOVQ  R9, R11
4707	SHRQ  $0x05, R11 // R11 = length / 32
4708	MOVQ  AX, R10
4709	ANDL  $0x0000001f, R10 // R10 = AX mod 32
4710	MOVQ  $0x00000040, R12
4711	SUBQ  R10, R12 // R12 = 64 - (AX mod 32); AX + R12 - 32 is 32-byte aligned
4712	DECQ  R11 // DEC does not modify CF; CF is still 0 from the SUBQ above (no borrow)
4713	JA    emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 // taken when R11 != 0 after the decrement
4714	LEAQ  -32(DI)(R12*1), R10
4715	LEAQ  -32(AX)(R12*1), R13
4716
// Only reached by fall-through when the decremented block count was 0.
4717emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
4718	MOVOU (R10), X4
4719	MOVOU 16(R10), X5
4720	MOVOA X4, (R13) // aligned store: R13 is 32-byte aligned (see R12 above)
4721	MOVOA X5, 16(R13)
4722	ADDQ  $0x20, R13
4723	ADDQ  $0x20, R10
4724	ADDQ  $0x20, R12
4725	DECQ  R11
4726	JNA   emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back // JNA = CF or ZF; repeats while DECQ yields 0
4727
// Main loop: copy 32 bytes ending at offset R12 while R12 <= R9 (unsigned).
4728emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
4729	MOVOU -32(DI)(R12*1), X4
4730	MOVOU -16(DI)(R12*1), X5
4731	MOVOA X4, -32(AX)(R12*1) // AX + R12 - 32 stays 32-byte aligned as R12 grows by 32
4732	MOVOA X5, -16(AX)(R12*1)
4733	ADDQ  $0x20, R12
4734	CMPQ  R9, R12
4735	JAE   emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
4736	MOVOU X0, (AX) // unaligned head: covers bytes before the first aligned store
4737	MOVOU X1, 16(AX)
4738	MOVOU X2, -32(AX)(R9*1) // unaligned tail: covers bytes after the last aligned store
4739	MOVOU X3, -16(AX)(R9*1)
4740	MOVQ  R8, AX // AX = end of the copied literals
4741
4742emit_literal_done_match_emit_encodeBlockAsm8B:
4743match_nolit_loop_encodeBlockAsm8B:
4744	MOVL CX, DI
4745	SUBL SI, DI
4746	MOVL DI, 16(SP)
4747	ADDL $0x04, CX
4748	ADDL $0x04, SI
4749	MOVQ src_len+32(FP), DI
4750	SUBL CX, DI
4751	LEAQ (DX)(CX*1), R8
4752	LEAQ (DX)(SI*1), SI
4753
4754	// matchLen
4755	XORL R10, R10
4756	CMPL DI, $0x08
4757	JL   matchlen_single_match_nolit_encodeBlockAsm8B
4758
4759matchlen_loopback_match_nolit_encodeBlockAsm8B:
4760	MOVQ  (R8)(R10*1), R9
4761	XORQ  (SI)(R10*1), R9
4762	TESTQ R9, R9
4763	JZ    matchlen_loop_match_nolit_encodeBlockAsm8B
4764	BSFQ  R9, R9
4765	SARQ  $0x03, R9
4766	LEAL  (R10)(R9*1), R10
4767	JMP   match_nolit_end_encodeBlockAsm8B
4768
4769matchlen_loop_match_nolit_encodeBlockAsm8B:
4770	LEAL -8(DI), DI
4771	LEAL 8(R10), R10
4772	CMPL DI, $0x08
4773	JGE  matchlen_loopback_match_nolit_encodeBlockAsm8B
4774
4775matchlen_single_match_nolit_encodeBlockAsm8B:
4776	TESTL DI, DI
4777	JZ    match_nolit_end_encodeBlockAsm8B
4778
4779matchlen_single_loopback_match_nolit_encodeBlockAsm8B:
4780	MOVB (R8)(R10*1), R9
4781	CMPB (SI)(R10*1), R9
4782	JNE  match_nolit_end_encodeBlockAsm8B
4783	LEAL 1(R10), R10
4784	DECL DI
4785	JNZ  matchlen_single_loopback_match_nolit_encodeBlockAsm8B
4786
4787match_nolit_end_encodeBlockAsm8B:
4788	ADDL R10, CX
4789	MOVL 16(SP), SI
4790	ADDL $0x04, R10
4791	MOVL CX, 12(SP)
4792
4793	// emitCopy
4794two_byte_offset_match_nolit_encodeBlockAsm8B:
4795	CMPL R10, $0x40
4796	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm8B
4797	MOVB $0xee, (AX)
4798	MOVW SI, 1(AX)
4799	LEAL -60(R10), R10
4800	ADDQ $0x03, AX
4801
4802	// emitRepeat
4803	MOVL R10, SI
4804	LEAL -4(R10), R10
4805	CMPL SI, $0x08
4806	JLE  repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
4807	CMPL SI, $0x0c
4808	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short
4809
4810cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
4811	CMPL R10, $0x00000104
4812	JLT  repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
4813	LEAL -256(R10), R10
4814	MOVW $0x0019, (AX)
4815	MOVW R10, 2(AX)
4816	ADDQ $0x04, AX
4817	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
4818
4819repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
4820	LEAL -4(R10), R10
4821	MOVW $0x0015, (AX)
4822	MOVB R10, 2(AX)
4823	ADDQ $0x03, AX
4824	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
4825
4826repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
4827	SHLL $0x02, R10
4828	ORL  $0x01, R10
4829	MOVW R10, (AX)
4830	ADDQ $0x02, AX
4831	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
4832	XORQ DI, DI
4833	LEAL 1(DI)(R10*4), R10
4834	MOVB SI, 1(AX)
4835	SARL $0x08, SI
4836	SHLL $0x05, SI
4837	ORL  SI, R10
4838	MOVB R10, (AX)
4839	ADDQ $0x02, AX
4840	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
4841	JMP two_byte_offset_match_nolit_encodeBlockAsm8B
4842
4843two_byte_offset_short_match_nolit_encodeBlockAsm8B:
4844	CMPL R10, $0x0c
4845	JGE  emit_copy_three_match_nolit_encodeBlockAsm8B
4846	MOVB $0x01, BL
4847	LEAL -16(BX)(R10*4), R10
4848	MOVB SI, 1(AX)
4849	SHRL $0x08, SI
4850	SHLL $0x05, SI
4851	ORL  SI, R10
4852	MOVB R10, (AX)
4853	ADDQ $0x02, AX
4854	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B
4855
4856emit_copy_three_match_nolit_encodeBlockAsm8B:
4857	MOVB $0x02, BL
4858	LEAL -4(BX)(R10*4), R10
4859	MOVB R10, (AX)
4860	MOVW SI, 1(AX)
4861	ADDQ $0x03, AX
4862
// A copy has just been emitted and CX points past the match.
// Stop if CX reached the search limit in 8(SP), verify there is still
// output room, index positions CX-2 and CX in the hash table, and check
// whether another match starts immediately at CX.
4863match_nolit_emitcopy_end_encodeBlockAsm8B:
4864	CMPL CX, 8(SP)
4865	JGE  emit_remainder_encodeBlockAsm8B
4866	MOVQ -2(DX)(CX*1), DI // DI = 8 source bytes starting at CX-2
4867	CMPQ AX, (SP)
4868	JL   match_nolit_dst_ok_encodeBlockAsm8B
4869	MOVQ $0x00000000, ret+48(FP) // output pointer reached the limit at (SP): return 0
4870	RET
4871
// Hash = ((x << 32) * 0x9e3779b1) >> 56: only the low 4 bytes of x take part
// and the result is an 8-bit table index.
4872match_nolit_dst_ok_encodeBlockAsm8B:
4873	MOVQ  $0x9e3779b1, R9
4874	MOVQ  DI, R8 // R8 = bytes at CX-2
4875	SHRQ  $0x10, DI // DI = bytes at CX
4876	MOVQ  DI, SI
4877	SHLQ  $0x20, R8
4878	IMULQ R9, R8
4879	SHRQ  $0x38, R8 // R8 = hash of the 4 bytes at CX-2
4880	SHLQ  $0x20, SI
4881	IMULQ R9, SI
4882	SHRQ  $0x38, SI // SI = hash of the 4 bytes at CX
4883	LEAL  -2(CX), R9
4884	LEAQ  24(SP)(SI*4), R10 // R10 = address of the table slot for CX
4885	MOVL  (R10), SI // SI = previous position stored in that slot (candidate)
4886	MOVL  R9, 24(SP)(R8*4) // table[hash(CX-2)] = CX-2
4887	MOVL  CX, (R10) // table[hash(CX)] = CX
4888	CMPL  (DX)(SI*1), DI // 4 bytes at the candidate vs. 4 bytes at CX
4889	JEQ   match_nolit_loop_encodeBlockAsm8B // immediate match with no literals in between
4890	INCL  CX
4891	JMP   search_loop_encodeBlockAsm8B
4892
// End of input: emit everything from 12(SP) to src_len as one literal run.
// First check that header (up to 3 bytes) plus literals stay below the
// output limit stored at (SP); if not, return 0.
4893emit_remainder_encodeBlockAsm8B:
4894	MOVQ src_len+32(FP), CX
4895	SUBL 12(SP), CX // CX = number of remaining literal bytes
4896	LEAQ 3(AX)(CX*1), CX
4897	CMPQ CX, (SP)
4898	JL   emit_remainder_ok_encodeBlockAsm8B
4899	MOVQ $0x00000000, ret+48(FP)
4900	RET
4901
// Write the literal header. Afterwards CX = source address of the literals,
// SI = literal length and AX = destination just past the header.
4902emit_remainder_ok_encodeBlockAsm8B:
4903	MOVQ src_len+32(FP), CX
4904	MOVL 12(SP), BX
4905	CMPL BX, CX
4906	JEQ  emit_literal_done_emit_remainder_encodeBlockAsm8B // nothing left to emit
4907	MOVL CX, SI
4908	MOVL CX, 12(SP)
4909	LEAQ (DX)(BX*1), CX // CX = DX + old 12(SP): first literal byte
4910	SUBL BX, SI // SI = literal length
4911	LEAL -1(SI), DX // DX = length - 1; DX no longer addresses the source from here on
4912	CMPL DX, $0x3c
4913	JLT  one_byte_emit_remainder_encodeBlockAsm8B // length-1 < 60: 1-byte header
4914	CMPL DX, $0x00000100
4915	JLT  two_bytes_emit_remainder_encodeBlockAsm8B // length-1 < 256: 2-byte header
4916	MOVB $0xf4, (AX) // 0xf4 = 61<<2, followed by 16-bit length-1
4917	MOVW DX, 1(AX)
4918	ADDQ $0x03, AX
4919	JMP  memmove_long_emit_remainder_encodeBlockAsm8B
4920
4921two_bytes_emit_remainder_encodeBlockAsm8B:
4922	MOVB $0xf0, (AX) // 0xf0 = 60<<2, followed by 8-bit length-1
4923	MOVB DL, 1(AX)
4924	ADDQ $0x02, AX
4925	CMPL DX, $0x40
4926	JL   memmove_emit_remainder_encodeBlockAsm8B // length <= 64: short copy
4927	JMP  memmove_long_emit_remainder_encodeBlockAsm8B
4928
4929one_byte_emit_remainder_encodeBlockAsm8B:
4930	SHLB $0x02, DL // header byte = (length-1) << 2
4931	MOVB DL, (AX)
4932	ADDQ $0x01, AX // falls through to memmove_emit_remainder
4933
4934memmove_emit_remainder_encodeBlockAsm8B:
4935	LEAQ (AX)(SI*1), DX
4936	MOVL SI, BX
4937
4938	// genMemMoveShort
4939	CMPQ BX, $0x03
4940	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2
4941	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3
4942	CMPQ BX, $0x08
4943	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7
4944	CMPQ BX, $0x10
4945	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
4946	CMPQ BX, $0x20
4947	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
4948	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64
4949
4950emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2:
4951	MOVB (CX), SI
4952	MOVB -1(CX)(BX*1), CL
4953	MOVB SI, (AX)
4954	MOVB CL, -1(AX)(BX*1)
4955	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B
4956
4957emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3:
4958	MOVW (CX), SI
4959	MOVB 2(CX), CL
4960	MOVW SI, (AX)
4961	MOVB CL, 2(AX)
4962	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B
4963
4964emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7:
4965	MOVL (CX), SI
4966	MOVL -4(CX)(BX*1), CX
4967	MOVL SI, (AX)
4968	MOVL CX, -4(AX)(BX*1)
4969	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B
4970
4971emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
4972	MOVQ (CX), SI
4973	MOVQ -8(CX)(BX*1), CX
4974	MOVQ SI, (AX)
4975	MOVQ CX, -8(AX)(BX*1)
4976	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B
4977
4978emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
4979	MOVOU (CX), X0
4980	MOVOU -16(CX)(BX*1), X1
4981	MOVOU X0, (AX)
4982	MOVOU X1, -16(AX)(BX*1)
4983	JMP   memmove_end_copy_emit_remainder_encodeBlockAsm8B
4984
4985emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
4986	MOVOU (CX), X0
4987	MOVOU 16(CX), X1
4988	MOVOU -32(CX)(BX*1), X2
4989	MOVOU -16(CX)(BX*1), X3
4990	MOVOU X0, (AX)
4991	MOVOU X1, 16(AX)
4992	MOVOU X2, -32(AX)(BX*1)
4993	MOVOU X3, -16(AX)(BX*1)
4994
4995memmove_end_copy_emit_remainder_encodeBlockAsm8B:
4996	MOVQ DX, AX
4997	JMP  emit_literal_done_emit_remainder_encodeBlockAsm8B
4998
4999memmove_long_emit_remainder_encodeBlockAsm8B:
5000	LEAQ (AX)(SI*1), DX
5001	MOVL SI, BX
5002
5003	// genMemMoveLong
5004	MOVOU (CX), X0
5005	MOVOU 16(CX), X1
5006	MOVOU -32(CX)(BX*1), X2
5007	MOVOU -16(CX)(BX*1), X3
5008	MOVQ  BX, DI
5009	SHRQ  $0x05, DI
5010	MOVQ  AX, SI
5011	ANDL  $0x0000001f, SI
5012	MOVQ  $0x00000040, R8
5013	SUBQ  SI, R8
5014	DECQ  DI
5015	JA    emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
5016	LEAQ  -32(CX)(R8*1), SI
5017	LEAQ  -32(AX)(R8*1), R9
5018
5019emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
5020	MOVOU (SI), X4
5021	MOVOU 16(SI), X5
5022	MOVOA X4, (R9)
5023	MOVOA X5, 16(R9)
5024	ADDQ  $0x20, R9
5025	ADDQ  $0x20, SI
5026	ADDQ  $0x20, R8
5027	DECQ  DI
5028	JNA   emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back
5029
5030emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
5031	MOVOU -32(CX)(R8*1), X4
5032	MOVOU -16(CX)(R8*1), X5
5033	MOVOA X4, -32(AX)(R8*1)
5034	MOVOA X5, -16(AX)(R8*1)
5035	ADDQ  $0x20, R8
5036	CMPQ  BX, R8
5037	JAE   emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
5038	MOVOU X0, (AX)
5039	MOVOU X1, 16(AX)
5040	MOVOU X2, -32(AX)(BX*1)
5041	MOVOU X3, -16(AX)(BX*1)
5042	MOVQ  DX, AX
5043
// Success exit: the return value is the number of bytes written, i.e. the
// current output pointer AX minus the start of dst.
5044emit_literal_done_emit_remainder_encodeBlockAsm8B:
5045	MOVQ dst_base+0(FP), CX
5046	SUBQ CX, AX
5047	MOVQ AX, ret+48(FP)
5048	RET
5049
5050// func encodeBetterBlockAsm(dst []byte, src []byte) int
5051// Requires: SSE2
// Prologue. Stack frame layout established here:
//   0(SP)   destination limit pointer (checked before emitting)
//   8(SP)   src_len - 8, the position limit for the search loop
//   12(SP)  index of the first source byte not yet emitted (starts at 0)
//   16(SP)  last match offset, compared against by the repeat check (starts at 0)
//   20(SP)  next search position, written by the search loop
//   24(SP)+ hash table area, zeroed below
// Registers on entry to the search loop: AX = output pointer (dst_base),
// DX = src_base, CX = current source position (starts at 1).
5052TEXT ·encodeBetterBlockAsm(SB), $327704-56
5053	MOVQ dst_base+0(FP), AX
5054	MOVQ $0x00000a00, CX // 0xa00 iterations * 128 bytes = 327680 bytes = frame size - 24
5055	LEAQ 24(SP), DX
5056	PXOR X0, X0
5057
// Zero the table area, 128 bytes per iteration.
5058zero_loop_encodeBetterBlockAsm:
5059	MOVOU X0, (DX)
5060	MOVOU X0, 16(DX)
5061	MOVOU X0, 32(DX)
5062	MOVOU X0, 48(DX)
5063	MOVOU X0, 64(DX)
5064	MOVOU X0, 80(DX)
5065	MOVOU X0, 96(DX)
5066	MOVOU X0, 112(DX)
5067	ADDQ  $0x80, DX
5068	DECQ  CX
5069	JNZ   zero_loop_encodeBetterBlockAsm
5070	MOVL  $0x00000000, 12(SP)
5071	MOVQ  src_len+32(FP), CX
5072	LEAQ  -6(CX), DX
5073	LEAQ  -8(CX), SI
5074	MOVL  SI, 8(SP) // search limit = src_len - 8
5075	SHRQ  $0x05, CX
5076	SUBL  CX, DX // DX = src_len - 6 - src_len/32 (32-bit arithmetic)
5077	LEAQ  (AX)(DX*1), DX
5078	MOVQ  DX, (SP) // destination limit = dst_base + DX
5079	MOVL  $0x00000001, CX
5080	MOVL  $0x00000000, 16(SP)
5081	MOVQ  src_base+24(FP), DX
5082
// Main search loop. For the position in CX, look up two hash tables:
//   24(SP):     indexed by a 16-bit hash of the low 7 bytes at CX
//               (65536 4-byte entries end exactly at 262168(SP))
//   262168(SP): indexed by a 14-bit hash of the low 4 bytes at CX
// Both slots are overwritten with CX. A candidate is accepted when its
// first 4 source bytes equal the 4 bytes at CX.
5083search_loop_encodeBetterBlockAsm:
5084	MOVL  CX, SI
5085	SUBL  12(SP), SI
5086	SHRL  $0x07, SI
5087	LEAL  1(CX)(SI*1), SI // next position = CX + 1 + (CX - 12(SP))/128: skips faster the longer nothing matched
5088	CMPL  SI, 8(SP)
5089	JGE   emit_remainder_encodeBetterBlockAsm // next position would pass the search limit
5090	MOVQ  (DX)(CX*1), DI // DI = 8 source bytes at CX
5091	MOVL  SI, 20(SP) // remember the next position
5092	MOVQ  $0x00cf1bbcdcbfa563, R9 // multiplier for the 7-byte hash
5093	MOVQ  $0x9e3779b1, SI // multiplier for the 4-byte hash
5094	MOVQ  DI, R10
5095	MOVQ  DI, R11
5096	SHLQ  $0x08, R10 // discard the top byte: 7 bytes take part
5097	IMULQ R9, R10
5098	SHRQ  $0x30, R10 // keep the top 16 bits
5099	SHLQ  $0x20, R11 // discard the top 4 bytes: 4 bytes take part
5100	IMULQ SI, R11
5101	SHRQ  $0x32, R11 // keep the top 14 bits
5102	MOVL  24(SP)(R10*4), SI // SI = candidate from the 7-byte-hash table
5103	MOVL  262168(SP)(R11*4), R8 // R8 = candidate from the 4-byte-hash table
5104	MOVL  CX, 24(SP)(R10*4)
5105	MOVL  CX, 262168(SP)(R11*4)
5106	CMPL  (DX)(SI*1), DI
5107	JEQ   candidate_match_encodeBetterBlockAsm
5108	CMPL  (DX)(R8*1), DI
5109	JEQ   candidateS_match_encodeBetterBlockAsm
5110	MOVL  20(SP), CX // neither candidate matches: move to the saved next position
5111	JMP   search_loop_encodeBetterBlockAsm
5112
// The 4-byte-hash candidate (R8) matched at CX. Before using it, try the
// 7-byte-hash table at CX+1: if that candidate matches there, prefer it
// (CX stays incremented); otherwise restore CX and use R8.
// R9 still holds the 7-byte hash multiplier loaded by the search loop.
5113candidateS_match_encodeBetterBlockAsm:
5114	SHRQ  $0x08, DI // DI = source bytes starting at CX+1
5115	MOVQ  DI, R10
5116	SHLQ  $0x08, R10
5117	IMULQ R9, R10
5118	SHRQ  $0x30, R10 // 16-bit hash for position CX+1
5119	MOVL  24(SP)(R10*4), SI // SI = candidate for CX+1
5120	INCL  CX
5121	MOVL  CX, 24(SP)(R10*4) // table slot now records CX+1
5122	CMPL  (DX)(SI*1), DI
5123	JEQ   candidate_match_encodeBetterBlockAsm
5124	DECL  CX // no match at CX+1: go back to CX
5125	MOVL  R8, SI // use the 4-byte-hash candidate; falls through to candidate_match
5126
5127candidate_match_encodeBetterBlockAsm:
5128	MOVL  12(SP), DI
5129	TESTL SI, SI
5130	JZ    match_extend_back_end_encodeBetterBlockAsm
5131
5132match_extend_back_loop_encodeBetterBlockAsm:
5133	CMPL CX, DI
5134	JLE  match_extend_back_end_encodeBetterBlockAsm
5135	MOVB -1(DX)(SI*1), BL
5136	MOVB -1(DX)(CX*1), R8
5137	CMPB BL, R8
5138	JNE  match_extend_back_end_encodeBetterBlockAsm
5139	LEAL -1(CX), CX
5140	DECL SI
5141	JZ   match_extend_back_end_encodeBetterBlockAsm
5142	JMP  match_extend_back_loop_encodeBetterBlockAsm
5143
5144match_extend_back_end_encodeBetterBlockAsm:
5145	MOVL CX, DI
5146	SUBL 12(SP), DI
5147	LEAQ 5(AX)(DI*1), DI
5148	CMPQ DI, (SP)
5149	JL   match_dst_size_check_encodeBetterBlockAsm
5150	MOVQ $0x00000000, ret+48(FP)
5151	RET
5152
5153match_dst_size_check_encodeBetterBlockAsm:
5154	MOVL CX, DI
5155	ADDL $0x04, CX
5156	ADDL $0x04, SI
5157	MOVQ src_len+32(FP), R8
5158	SUBL CX, R8
5159	LEAQ (DX)(CX*1), R9
5160	LEAQ (DX)(SI*1), R10
5161
5162	// matchLen
5163	XORL R12, R12
5164	CMPL R8, $0x08
5165	JL   matchlen_single_match_nolit_encodeBetterBlockAsm
5166
5167matchlen_loopback_match_nolit_encodeBetterBlockAsm:
5168	MOVQ  (R9)(R12*1), R11
5169	XORQ  (R10)(R12*1), R11
5170	TESTQ R11, R11
5171	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm
5172	BSFQ  R11, R11
5173	SARQ  $0x03, R11
5174	LEAL  (R12)(R11*1), R12
5175	JMP   match_nolit_end_encodeBetterBlockAsm
5176
5177matchlen_loop_match_nolit_encodeBetterBlockAsm:
5178	LEAL -8(R8), R8
5179	LEAL 8(R12), R12
5180	CMPL R8, $0x08
5181	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm
5182
5183matchlen_single_match_nolit_encodeBetterBlockAsm:
5184	TESTL R8, R8
5185	JZ    match_nolit_end_encodeBetterBlockAsm
5186
5187matchlen_single_loopback_match_nolit_encodeBetterBlockAsm:
5188	MOVB (R9)(R12*1), R11
5189	CMPB (R10)(R12*1), R11
5190	JNE  match_nolit_end_encodeBetterBlockAsm
5191	LEAL 1(R12), R12
5192	DECL R8
5193	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm
5194
// matchLen is done: R12 = matching bytes beyond the first 4. CX and SI were
// both advanced by 4 before matchLen, so CX - SI is the match offset.
5195match_nolit_end_encodeBetterBlockAsm:
5196	MOVL CX, R8
5197	SUBL SI, R8 // R8 = offset
5198
5199	// Check if repeat: same offset as the previous match stored in 16(SP).
5200	CMPL 16(SP), R8
5201	JEQ  match_is_repeat_encodeBetterBlockAsm
5202	CMPL R12, $0x01
5203	JG   match_length_ok_encodeBetterBlockAsm // more than 5 matching bytes in total: accept
5204	CMPL R8, $0x0000ffff
5205	JLE  match_length_ok_encodeBetterBlockAsm // offset fits in 16 bits: accept
	// Match of at most 5 bytes with an offset above 65535: discard it and
	// resume searching one past the saved next position.
5206	MOVL 20(SP), CX
5207	INCL CX
5208	JMP  search_loop_encodeBetterBlockAsm
5209
5210match_length_ok_encodeBetterBlockAsm:
5211	MOVL R8, 16(SP)
5212	MOVL 12(SP), SI
5213	CMPL SI, DI
5214	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm
5215	MOVL DI, R9
5216	MOVL DI, 12(SP)
5217	LEAQ (DX)(SI*1), R10
5218	SUBL SI, R9
5219	LEAL -1(R9), SI
5220	CMPL SI, $0x3c
5221	JLT  one_byte_match_emit_encodeBetterBlockAsm
5222	CMPL SI, $0x00000100
5223	JLT  two_bytes_match_emit_encodeBetterBlockAsm
5224	CMPL SI, $0x00010000
5225	JLT  three_bytes_match_emit_encodeBetterBlockAsm
5226	CMPL SI, $0x01000000
5227	JLT  four_bytes_match_emit_encodeBetterBlockAsm
5228	MOVB $0xfc, (AX)
5229	MOVL SI, 1(AX)
5230	ADDQ $0x05, AX
5231	JMP  memmove_long_match_emit_encodeBetterBlockAsm
5232
5233four_bytes_match_emit_encodeBetterBlockAsm:
5234	MOVL SI, R11
5235	SHRL $0x10, R11
5236	MOVB $0xf8, (AX)
5237	MOVW SI, 1(AX)
5238	MOVB R11, 3(AX)
5239	ADDQ $0x04, AX
5240	JMP  memmove_long_match_emit_encodeBetterBlockAsm
5241
5242three_bytes_match_emit_encodeBetterBlockAsm:
5243	MOVB $0xf4, (AX)
5244	MOVW SI, 1(AX)
5245	ADDQ $0x03, AX
5246	JMP  memmove_long_match_emit_encodeBetterBlockAsm
5247
5248two_bytes_match_emit_encodeBetterBlockAsm:
5249	MOVB $0xf0, (AX)
5250	MOVB SI, 1(AX)
5251	ADDQ $0x02, AX
5252	CMPL SI, $0x40
5253	JL   memmove_match_emit_encodeBetterBlockAsm
5254	JMP  memmove_long_match_emit_encodeBetterBlockAsm
5255
5256one_byte_match_emit_encodeBetterBlockAsm:
5257	SHLB $0x02, SI
5258	MOVB SI, (AX)
5259	ADDQ $0x01, AX
5260
5261memmove_match_emit_encodeBetterBlockAsm:
5262	LEAQ (AX)(R9*1), SI
5263
5264	// genMemMoveShort
5265	CMPQ R9, $0x03
5266	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2
5267	JE   emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3
5268	CMPQ R9, $0x08
5269	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
5270	CMPQ R9, $0x10
5271	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
5272	CMPQ R9, $0x20
5273	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
5274	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64
5275
5276emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_1or2:
5277	MOVB (R10), R11
5278	MOVB -1(R10)(R9*1), R10
5279	MOVB R11, (AX)
5280	MOVB R10, -1(AX)(R9*1)
5281	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm
5282
5283emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_3:
5284	MOVW (R10), R11
5285	MOVB 2(R10), R10
5286	MOVW R11, (AX)
5287	MOVB R10, 2(AX)
5288	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm
5289
5290emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
5291	MOVL (R10), R11
5292	MOVL -4(R10)(R9*1), R10
5293	MOVL R11, (AX)
5294	MOVL R10, -4(AX)(R9*1)
5295	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm
5296
5297emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
5298	MOVQ (R10), R11
5299	MOVQ -8(R10)(R9*1), R10
5300	MOVQ R11, (AX)
5301	MOVQ R10, -8(AX)(R9*1)
5302	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm
5303
5304emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
5305	MOVOU (R10), X0
5306	MOVOU -16(R10)(R9*1), X1
5307	MOVOU X0, (AX)
5308	MOVOU X1, -16(AX)(R9*1)
5309	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm
5310
5311emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
5312	MOVOU (R10), X0
5313	MOVOU 16(R10), X1
5314	MOVOU -32(R10)(R9*1), X2
5315	MOVOU -16(R10)(R9*1), X3
5316	MOVOU X0, (AX)
5317	MOVOU X1, 16(AX)
5318	MOVOU X2, -32(AX)(R9*1)
5319	MOVOU X3, -16(AX)(R9*1)
5320
5321memmove_end_copy_match_emit_encodeBetterBlockAsm:
5322	MOVQ SI, AX
5323	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm
5324
5325memmove_long_match_emit_encodeBetterBlockAsm:
5326	LEAQ (AX)(R9*1), SI
5327
5328	// genMemMoveLong
5329	MOVOU (R10), X0
5330	MOVOU 16(R10), X1
5331	MOVOU -32(R10)(R9*1), X2
5332	MOVOU -16(R10)(R9*1), X3
5333	MOVQ  R9, R13
5334	SHRQ  $0x05, R13
5335	MOVQ  AX, R11
5336	ANDL  $0x0000001f, R11
5337	MOVQ  $0x00000040, R14
5338	SUBQ  R11, R14
5339	DECQ  R13
5340	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
5341	LEAQ  -32(R10)(R14*1), R11
5342	LEAQ  -32(AX)(R14*1), R15
5343
5344emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
5345	MOVOU (R11), X4
5346	MOVOU 16(R11), X5
5347	MOVOA X4, (R15)
5348	MOVOA X5, 16(R15)
5349	ADDQ  $0x20, R15
5350	ADDQ  $0x20, R11
5351	ADDQ  $0x20, R14
5352	DECQ  R13
5353	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back
5354
5355emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
5356	MOVOU -32(R10)(R14*1), X4
5357	MOVOU -16(R10)(R14*1), X5
5358	MOVOA X4, -32(AX)(R14*1)
5359	MOVOA X5, -16(AX)(R14*1)
5360	ADDQ  $0x20, R14
5361	CMPQ  R9, R14
5362	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
5363	MOVOU X0, (AX)
5364	MOVOU X1, 16(AX)
5365	MOVOU X2, -32(AX)(R9*1)
5366	MOVOU X3, -16(AX)(R9*1)
5367	MOVQ  SI, AX
5368
5369emit_literal_done_match_emit_encodeBetterBlockAsm:
5370	ADDL R12, CX
5371	ADDL $0x04, R12
5372	MOVL CX, 12(SP)
5373
5374	// emitCopy
5375	CMPL R8, $0x00010000
5376	JL   two_byte_offset_match_nolit_encodeBetterBlockAsm
5377
5378four_bytes_loop_back_match_nolit_encodeBetterBlockAsm:
5379	CMPL R12, $0x40
5380	JLE  four_bytes_remain_match_nolit_encodeBetterBlockAsm
5381	MOVB $0xff, (AX)
5382	MOVL R8, 1(AX)
5383	LEAL -64(R12), R12
5384	ADDQ $0x05, AX
5385	CMPL R12, $0x04
5386	JL   four_bytes_remain_match_nolit_encodeBetterBlockAsm
5387
5388	// emitRepeat
5389emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
5390	MOVL R12, SI
5391	LEAL -4(R12), R12
5392	CMPL SI, $0x08
5393	JLE  repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
5394	CMPL SI, $0x0c
5395	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
5396	CMPL R8, $0x00000800
5397	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
5398
5399cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
5400	CMPL R12, $0x00000104
5401	JLT  repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
5402	CMPL R12, $0x00010100
5403	JLT  repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
5404	CMPL R12, $0x0100ffff
5405	JLT  repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
5406	LEAL -16842747(R12), R12
5407	MOVW $0x001d, (AX)
5408	MOVW $0xfffb, 2(AX)
5409	MOVB $0xff, 4(AX)
5410	ADDQ $0x05, AX
5411	JMP  emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy
5412
5413repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
5414	LEAL -65536(R12), R12
5415	MOVL R12, R8
5416	MOVW $0x001d, (AX)
5417	MOVW R12, 2(AX)
5418	SARL $0x10, R8
5419	MOVB R8, 4(AX)
5420	ADDQ $0x05, AX
5421	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5422
5423repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
5424	LEAL -256(R12), R12
5425	MOVW $0x0019, (AX)
5426	MOVW R12, 2(AX)
5427	ADDQ $0x04, AX
5428	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5429
5430repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
5431	LEAL -4(R12), R12
5432	MOVW $0x0015, (AX)
5433	MOVB R12, 2(AX)
5434	ADDQ $0x03, AX
5435	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5436
5437repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
5438	SHLL $0x02, R12
5439	ORL  $0x01, R12
5440	MOVW R12, (AX)
5441	ADDQ $0x02, AX
5442	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5443
5444repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
5445	XORQ SI, SI
5446	LEAL 1(SI)(R12*4), R12
5447	MOVB R8, 1(AX)
5448	SARL $0x08, R8
5449	SHLL $0x05, R8
5450	ORL  R8, R12
5451	MOVB R12, (AX)
5452	ADDQ $0x02, AX
5453	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5454	JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm
5455
5456four_bytes_remain_match_nolit_encodeBetterBlockAsm:
5457	TESTL R12, R12
5458	JZ    match_nolit_emitcopy_end_encodeBetterBlockAsm
5459	MOVB  $0x03, BL
5460	LEAL  -4(BX)(R12*4), R12
5461	MOVB  R12, (AX)
5462	MOVL  R8, 1(AX)
5463	ADDQ  $0x05, AX
5464	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm
5465
5466two_byte_offset_match_nolit_encodeBetterBlockAsm:
5467	CMPL R12, $0x40
5468	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm
5469	MOVB $0xee, (AX)
5470	MOVW R8, 1(AX)
5471	LEAL -60(R12), R12
5472	ADDQ $0x03, AX
5473
5474	// emitRepeat
	// Writes a repeat element for the value in R12 (presumably the remaining
	// match length - confirm). At the loop label SI receives R12 and R12 is
	// reduced by 4. Form selection, all comparisons signed:
	//   SI <= 8                  -> repeat_two        (2 bytes)
	//   SI < 12 and R8 < 0x800   -> repeat_two_offset (2 bytes, carries R8)
	//   R12 < 0x104              -> repeat_three      (3 bytes)
	//   R12 < 0x10100            -> repeat_four       (4 bytes)
	//   R12 < 0x100ffff          -> repeat_five       (5 bytes)
	//   otherwise                -> write bytes 1d 00 fb ff ff, subtract
	//                               0x100fffb from R12 and select again.
	// Every terminal form jumps to match_nolit_emitcopy_end.
5475emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
5476	MOVL R12, SI
5477	LEAL -4(R12), R12
5478	CMPL SI, $0x08
5479	JLE  repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
5480	CMPL SI, $0x0c
5481	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
5482	CMPL R8, $0x00000800
5483	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
5484
5485cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
5486	CMPL R12, $0x00000104
5487	JLT  repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
5488	CMPL R12, $0x00010100
5489	JLT  repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
5490	CMPL R12, $0x0100ffff
5491	JLT  repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
	// Too large for a single element: emit the largest 5-byte form and loop.
5492	LEAL -16842747(R12), R12
5493	MOVW $0x001d, (AX)
5494	MOVW $0xfffb, 2(AX)
5495	MOVB $0xff, 4(AX)
5496	ADDQ $0x05, AX
5497	JMP  emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short
5498
	// 5-byte form: 1d 00, then the low 24 bits of (R12 - 0x10000). R8 is
	// overwritten as scratch.
5499repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
5500	LEAL -65536(R12), R12
5501	MOVL R12, R8
5502	MOVW $0x001d, (AX)
5503	MOVW R12, 2(AX)
5504	SARL $0x10, R8
5505	MOVB R8, 4(AX)
5506	ADDQ $0x05, AX
5507	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5508
	// 4-byte form: 19 00, then the low 16 bits of (R12 - 256).
5509repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
5510	LEAL -256(R12), R12
5511	MOVW $0x0019, (AX)
5512	MOVW R12, 2(AX)
5513	ADDQ $0x04, AX
5514	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5515
	// 3-byte form: 15 00, then the low byte of (R12 - 4).
5516repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
5517	LEAL -4(R12), R12
5518	MOVW $0x0015, (AX)
5519	MOVB R12, 2(AX)
5520	ADDQ $0x03, AX
5521	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5522
	// 2-byte form: the 16-bit value (R12 << 2) | 1.
5523repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
5524	SHLL $0x02, R12
5525	ORL  $0x01, R12
5526	MOVW R12, (AX)
5527	ADDQ $0x02, AX
5528	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5529
	// 2-byte form carrying R8: first byte is 1 + 4*R12 OR'ed with
	// (R8 >> 8) << 5, second byte is the low byte of R8.
5530repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
5531	XORQ SI, SI
5532	LEAL 1(SI)(R12*4), R12
5533	MOVB R8, 1(AX)
5534	SARL $0x08, R8
5535	SHLL $0x05, R8
5536	ORL  R8, R12
5537	MOVB R12, (AX)
5538	ADDQ $0x02, AX
5539	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
	// NOTE(review): the next JMP is unreachable - it directly follows an
	// unconditional JMP and carries no label.
5540	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm
5541
// Short copy encodings (reached from two_byte_offset when R12 <= 64).
//   R12 < 12 and R8 < 0x800: 2-byte element. The first byte is
//       1 + 4*(R12-4) OR'ed with (R8 >> 8) << 5; the second byte is the low
//       byte of R8.
//   otherwise (emit_copy_three): 3-byte element. The first byte is
//       2 + 4*(R12-1); it is followed by the low 16 bits of R8.
// Both comparisons are signed. Only the low byte of each LEAL result is
// stored, so upper bits of BX do not matter.
5542two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
5543	CMPL R12, $0x0c
5544	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm
5545	CMPL R8, $0x00000800
5546	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm
5547	MOVB $0x01, BL
5548	LEAL -16(BX)(R12*4), R12
5549	MOVB R8, 1(AX)
5550	SHRL $0x08, R8
5551	SHLL $0x05, R8
5552	ORL  R8, R12
5553	MOVB R12, (AX)
5554	ADDQ $0x02, AX
5555	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5556
5557emit_copy_three_match_nolit_encodeBetterBlockAsm:
5558	MOVB $0x02, BL
5559	LEAL -4(BX)(R12*4), R12
5560	MOVB R12, (AX)
5561	MOVW R8, 1(AX)
5562	ADDQ $0x03, AX
5563	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5564
// Repeat path: first flush the pending literal run [12(SP), DI).
// If 12(SP) == DI there is nothing to flush. Otherwise:
//   R9  = DI - 12(SP)   number of literal bytes
//   R10 = DX + 12(SP)   address of the first literal byte
//   12(SP) is set to DI
// and a literal header is written, chosen by SI = R9 - 1 (signed compares):
//   SI < 60         1 byte:  SI << 2
//   SI < 0x100      2 bytes: f0, SI
//   SI < 0x10000    3 bytes: f4, SI (16 bit)
//   SI < 0x1000000  4 bytes: f8, SI (24 bit)
//   otherwise       5 bytes: fc, SI (32 bit)
// The 1-byte form, and the 2-byte form when SI < 64, continue with the short
// copy; every other form uses the long copy.
// NOTE(review): DX is presumably the src base pointer, as in the 4MB variant.
5565match_is_repeat_encodeBetterBlockAsm:
5566	MOVL 12(SP), SI
5567	CMPL SI, DI
5568	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
5569	MOVL DI, R9
5570	MOVL DI, 12(SP)
5571	LEAQ (DX)(SI*1), R10
5572	SUBL SI, R9
5573	LEAL -1(R9), SI
5574	CMPL SI, $0x3c
5575	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm
5576	CMPL SI, $0x00000100
5577	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm
5578	CMPL SI, $0x00010000
5579	JLT  three_bytes_match_emit_repeat_encodeBetterBlockAsm
5580	CMPL SI, $0x01000000
5581	JLT  four_bytes_match_emit_repeat_encodeBetterBlockAsm
5582	MOVB $0xfc, (AX)
5583	MOVL SI, 1(AX)
5584	ADDQ $0x05, AX
5585	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm
5586
5587four_bytes_match_emit_repeat_encodeBetterBlockAsm:
5588	MOVL SI, R11
5589	SHRL $0x10, R11
5590	MOVB $0xf8, (AX)
5591	MOVW SI, 1(AX)
5592	MOVB R11, 3(AX)
5593	ADDQ $0x04, AX
5594	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm
5595
5596three_bytes_match_emit_repeat_encodeBetterBlockAsm:
5597	MOVB $0xf4, (AX)
5598	MOVW SI, 1(AX)
5599	ADDQ $0x03, AX
5600	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm
5601
5602two_bytes_match_emit_repeat_encodeBetterBlockAsm:
5603	MOVB $0xf0, (AX)
5604	MOVB SI, 1(AX)
5605	ADDQ $0x02, AX
5606	CMPL SI, $0x40
5607	JL   memmove_match_emit_repeat_encodeBetterBlockAsm
5608	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm
5609
5610one_byte_match_emit_repeat_encodeBetterBlockAsm:
5611	SHLB $0x02, SI
5612	MOVB SI, (AX)
5613	ADDQ $0x01, AX
5614
// Short literal copy: R9 bytes from (R10) to (AX).
// SI = AX + R9 is computed first and becomes the new AX at the end.
// Each size class loads from the start of the range and from its end, then
// stores both pieces; the pieces may overlap, which is how a range of sizes is
// covered without a loop:
//   R9 < 3    first byte + last byte
//   R9 == 3   2 bytes + third byte
//   R9 < 8    4 bytes + last 4 bytes
//   R9 <= 16  8 bytes + last 8 bytes
//   R9 <= 32  16 bytes + last 16 bytes
//   else      first 32 bytes + last 32 bytes
// R10, R11 and X0-X3 are overwritten as scratch.
5615memmove_match_emit_repeat_encodeBetterBlockAsm:
5616	LEAQ (AX)(R9*1), SI
5617
5618	// genMemMoveShort
5619	CMPQ R9, $0x03
5620	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_1or2
5621	JE   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_3
5622	CMPQ R9, $0x08
5623	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
5624	CMPQ R9, $0x10
5625	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
5626	CMPQ R9, $0x20
5627	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
5628	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
5629
5630emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_1or2:
5631	MOVB (R10), R11
5632	MOVB -1(R10)(R9*1), R10
5633	MOVB R11, (AX)
5634	MOVB R10, -1(AX)(R9*1)
5635	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
5636
5637emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_3:
5638	MOVW (R10), R11
5639	MOVB 2(R10), R10
5640	MOVW R11, (AX)
5641	MOVB R10, 2(AX)
5642	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
5643
5644emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
5645	MOVL (R10), R11
5646	MOVL -4(R10)(R9*1), R10
5647	MOVL R11, (AX)
5648	MOVL R10, -4(AX)(R9*1)
5649	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
5650
5651emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
5652	MOVQ (R10), R11
5653	MOVQ -8(R10)(R9*1), R10
5654	MOVQ R11, (AX)
5655	MOVQ R10, -8(AX)(R9*1)
5656	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
5657
5658emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
5659	MOVOU (R10), X0
5660	MOVOU -16(R10)(R9*1), X1
5661	MOVOU X0, (AX)
5662	MOVOU X1, -16(AX)(R9*1)
5663	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
5664
5665emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
5666	MOVOU (R10), X0
5667	MOVOU 16(R10), X1
5668	MOVOU -32(R10)(R9*1), X2
5669	MOVOU -16(R10)(R9*1), X3
5670	MOVOU X0, (AX)
5671	MOVOU X1, 16(AX)
5672	MOVOU X2, -32(AX)(R9*1)
5673	MOVOU X3, -16(AX)(R9*1)
5674
	// Advance the output pointer past the copied literals.
5675memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
5676	MOVQ SI, AX
5677	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
5678
// Long literal copy: R9 bytes from (R10) to (AX); SI = AX + R9 becomes the
// new AX once the copy is done.
// The first 32 and last 32 source bytes are loaded into X0-X3 up front and
// stored last with unaligned stores. The bytes in between are moved 32 at a
// time using aligned stores (MOVOA): R14 starts at 64 - (AX & 31), so
// AX + R14 - 32 is a multiple of 32.
// R13 = (R9 >> 5) - 1 after the first DECQ.
// NOTE(review): DECQ does not write CF, so the JA/JNA around big_loop_back
// combine ZF from DECQ with the CF left by the SUBQ above (clear, because
// R11 <= 31 < 64). On the entry paths visible in this function R9 >= 65, so
// R13 is non-zero after the first DECQ and the JA is taken - confirm that
// big_loop_back is intentionally cold.
5679memmove_long_match_emit_repeat_encodeBetterBlockAsm:
5680	LEAQ (AX)(R9*1), SI
5681
5682	// genMemMoveLong
5683	MOVOU (R10), X0
5684	MOVOU 16(R10), X1
5685	MOVOU -32(R10)(R9*1), X2
5686	MOVOU -16(R10)(R9*1), X3
5687	MOVQ  R9, R13
5688	SHRQ  $0x05, R13
5689	MOVQ  AX, R11
5690	ANDL  $0x0000001f, R11
5691	MOVQ  $0x00000040, R14
5692	SUBQ  R11, R14
5693	DECQ  R13
5694	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
5695	LEAQ  -32(R10)(R14*1), R11
5696	LEAQ  -32(AX)(R14*1), R15
5697
5698emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
5699	MOVOU (R11), X4
5700	MOVOU 16(R11), X5
5701	MOVOA X4, (R15)
5702	MOVOA X5, 16(R15)
5703	ADDQ  $0x20, R15
5704	ADDQ  $0x20, R11
5705	ADDQ  $0x20, R14
5706	DECQ  R13
5707	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back
5708
	// Copy 32 bytes per iteration at offset R14-32 while R9 >= R14 (unsigned,
	// tested after R14 has been advanced).
5709emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
5710	MOVOU -32(R10)(R14*1), X4
5711	MOVOU -16(R10)(R14*1), X5
5712	MOVOA X4, -32(AX)(R14*1)
5713	MOVOA X5, -16(AX)(R14*1)
5714	ADDQ  $0x20, R14
5715	CMPQ  R9, R14
5716	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
	// Finally store the preloaded head and tail, then advance AX.
5717	MOVOU X0, (AX)
5718	MOVOU X1, 16(AX)
5719	MOVOU X2, -32(AX)(R9*1)
5720	MOVOU X3, -16(AX)(R9*1)
5721	MOVQ  SI, AX
5722
// Literals are flushed; now emit the repeat itself.
// CX advances by R12, R12 grows by 4, and 12(SP) is set to the new CX, so the
// next literal run starts right after this match.
// NOTE(review): presumably R12 is the match length beyond the first 4 bytes
// and CX already points 4 bytes into the match, as in the 4MB variant where
// both are set up before matchLen - confirm.
5723emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
5724	ADDL R12, CX
5725	ADDL $0x04, R12
5726	MOVL CX, 12(SP)
5727
5728	// emitRepeat
	// At the loop label SI receives R12 and R12 is reduced by 4. Form
	// selection, all comparisons signed:
	//   SI <= 8                  -> repeat_two        (2 bytes)
	//   SI < 12 and R8 < 0x800   -> repeat_two_offset (2 bytes, carries R8)
	//   R12 < 0x104              -> repeat_three      (3 bytes)
	//   R12 < 0x10100            -> repeat_four       (4 bytes)
	//   R12 < 0x100ffff          -> repeat_five       (5 bytes)
	//   otherwise                -> write bytes 1d 00 fb ff ff, subtract
	//                               0x100fffb from R12 and select again.
5729emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
5730	MOVL R12, SI
5731	LEAL -4(R12), R12
5732	CMPL SI, $0x08
5733	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm
5734	CMPL SI, $0x0c
5735	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
5736	CMPL R8, $0x00000800
5737	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
5738
5739cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
5740	CMPL R12, $0x00000104
5741	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm
5742	CMPL R12, $0x00010100
5743	JLT  repeat_four_match_nolit_repeat_encodeBetterBlockAsm
5744	CMPL R12, $0x0100ffff
5745	JLT  repeat_five_match_nolit_repeat_encodeBetterBlockAsm
	// Too large for a single element: emit the largest 5-byte form and loop.
5746	LEAL -16842747(R12), R12
5747	MOVW $0x001d, (AX)
5748	MOVW $0xfffb, 2(AX)
5749	MOVB $0xff, 4(AX)
5750	ADDQ $0x05, AX
5751	JMP  emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm
5752
	// 5-byte form: 1d 00, then the low 24 bits of (R12 - 0x10000). R8 is
	// overwritten as scratch.
5753repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
5754	LEAL -65536(R12), R12
5755	MOVL R12, R8
5756	MOVW $0x001d, (AX)
5757	MOVW R12, 2(AX)
5758	SARL $0x10, R8
5759	MOVB R8, 4(AX)
5760	ADDQ $0x05, AX
5761	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5762
	// 4-byte form: 19 00, then the low 16 bits of (R12 - 256).
5763repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
5764	LEAL -256(R12), R12
5765	MOVW $0x0019, (AX)
5766	MOVW R12, 2(AX)
5767	ADDQ $0x04, AX
5768	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5769
	// 3-byte form: 15 00, then the low byte of (R12 - 4).
5770repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
5771	LEAL -4(R12), R12
5772	MOVW $0x0015, (AX)
5773	MOVB R12, 2(AX)
5774	ADDQ $0x03, AX
5775	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5776
	// 2-byte form: the 16-bit value (R12 << 2) | 1.
5777repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
5778	SHLL $0x02, R12
5779	ORL  $0x01, R12
5780	MOVW R12, (AX)
5781	ADDQ $0x02, AX
5782	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm
5783
	// 2-byte form carrying R8: first byte is 1 + 4*R12 OR'ed with
	// (R8 >> 8) << 5, second byte is the low byte of R8. Execution then falls
	// through to the label that follows this block.
5784repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
5785	XORQ SI, SI
5786	LEAL 1(SI)(R12*4), R12
5787	MOVB R8, 1(AX)
5788	SARL $0x08, R8
5789	SHLL $0x05, R8
5790	ORL  R8, R12
5791	MOVB R12, (AX)
5792	ADDQ $0x02, AX
5793
// Common join point after a copy or repeat element has been written.
//   CX >= 8(SP) (signed): stop searching and emit the remaining bytes.
//   AX <  (SP)  (signed): continue at match_nolit_dst_ok.
//   otherwise:            return 0.
// NOTE(review): (SP) presumably holds the dst limit pointer and 8(SP) the
// search limit, set in the prologue above this chunk - confirm.
5794match_nolit_emitcopy_end_encodeBetterBlockAsm:
5795	CMPL CX, 8(SP)
5796	JGE  emit_remainder_encodeBetterBlockAsm
5797	CMPQ AX, (SP)
5798	JL   match_nolit_dst_ok_encodeBetterBlockAsm
5799	MOVQ $0x00000000, ret+48(FP)
5800	RET
5801
// Record several positions in the two tables, then resume the search loop.
// SI is the multiplier for the table at 24(SP):
//     index = ((x << 8) * SI) >> 48        (16 bits)
// R8 is the multiplier for the table at 262168(SP):
//     index = ((x << 32) * R8) >> 50       (14 bits)
// With s = DI + 1 (DI is incremented first) and x = the 8 bytes at DX + s:
//     24(SP) table:      hash(x)     <- s      hash(x>>8)  <- s+1
//     262168(SP) table:  hash(x>>8)  <- s+1    hash(x>>16) <- s+2
// With y = the 8 bytes at DX + CX - 2:
//     24(SP) table:      hash(y)     <- CX-2   hash(y>>8)  <- CX-1
//     262168(SP) table:  hash(y>>8)  <- CX-1
// DI holds CX-1 when control returns to search_loop.
// NOTE(review): DX is presumably the src base and DI the start of the match
// just emitted, as in the 4MB variant - confirm.
5802match_nolit_dst_ok_encodeBetterBlockAsm:
5803	MOVQ  $0x00cf1bbcdcbfa563, SI
5804	MOVQ  $0x9e3779b1, R8
5805	INCL  DI
5806	MOVQ  (DX)(DI*1), R9
5807	MOVQ  R9, R10
5808	MOVQ  R9, R11
5809	MOVQ  R9, R12
5810	SHRQ  $0x08, R11
5811	MOVQ  R11, R13
5812	SHRQ  $0x10, R12
5813	LEAL  1(DI), R14
5814	LEAL  2(DI), R15
5815	MOVQ  -2(DX)(CX*1), R9
5816	SHLQ  $0x08, R10
5817	IMULQ SI, R10
5818	SHRQ  $0x30, R10
5819	SHLQ  $0x08, R13
5820	IMULQ SI, R13
5821	SHRQ  $0x30, R13
5822	SHLQ  $0x20, R11
5823	IMULQ R8, R11
5824	SHRQ  $0x32, R11
5825	SHLQ  $0x20, R12
5826	IMULQ R8, R12
5827	SHRQ  $0x32, R12
5828	MOVL  DI, 24(SP)(R10*4)
5829	MOVL  R14, 24(SP)(R13*4)
5830	MOVL  R14, 262168(SP)(R11*4)
5831	MOVL  R15, 262168(SP)(R12*4)
5832	MOVQ  R9, R10
5833	MOVQ  R9, R11
5834	SHRQ  $0x08, R11
5835	MOVQ  R11, R13
5836	LEAL  -2(CX), R9
5837	LEAL  -1(CX), DI
5838	SHLQ  $0x08, R10
5839	IMULQ SI, R10
5840	SHRQ  $0x30, R10
5841	SHLQ  $0x20, R11
5842	IMULQ R8, R11
5843	SHRQ  $0x32, R11
5844	SHLQ  $0x08, R13
5845	IMULQ SI, R13
5846	SHRQ  $0x30, R13
5847	MOVL  R9, 24(SP)(R10*4)
5848	MOVL  DI, 262168(SP)(R11*4)
5849	MOVL  DI, 24(SP)(R13*4)
5850	JMP   search_loop_encodeBetterBlockAsm
5851
// End of search: check that the trailing literals fit before writing them.
// CX = src_len - 12(SP) is the number of bytes not yet emitted (32-bit
// subtract). The function returns 0 unless AX + CX + 5 is below the limit
// stored in (SP) (signed compare).
5852emit_remainder_encodeBetterBlockAsm:
5853	MOVQ src_len+32(FP), CX
5854	SUBL 12(SP), CX
5855	LEAQ 5(AX)(CX*1), CX
5856	CMPQ CX, (SP)
5857	JL   emit_remainder_ok_encodeBetterBlockAsm
5858	MOVQ $0x00000000, ret+48(FP)
5859	RET
5860
// Emit the final literal run [12(SP), src_len).
// If 12(SP) == src_len nothing is left and the function returns. Otherwise:
//   SI = src_len - 12(SP)   number of literal bytes
//   CX = DX + 12(SP)        address of the first literal byte
//   12(SP) is set to src_len
// DX is then reused as the header value SI - 1, and the literal header is
// chosen by signed compares:
//   DX < 60         1 byte:  DX << 2
//   DX < 0x100      2 bytes: f0, DX
//   DX < 0x10000    3 bytes: f4, DX (16 bit)
//   DX < 0x1000000  4 bytes: f8, DX (24 bit)
//   otherwise       5 bytes: fc, DX (32 bit)
// The 1-byte form, and the 2-byte form when DX < 64, continue with the short
// copy; every other form uses the long copy.
5861emit_remainder_ok_encodeBetterBlockAsm:
5862	MOVQ src_len+32(FP), CX
5863	MOVL 12(SP), BX
5864	CMPL BX, CX
5865	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm
5866	MOVL CX, SI
5867	MOVL CX, 12(SP)
5868	LEAQ (DX)(BX*1), CX
5869	SUBL BX, SI
5870	LEAL -1(SI), DX
5871	CMPL DX, $0x3c
5872	JLT  one_byte_emit_remainder_encodeBetterBlockAsm
5873	CMPL DX, $0x00000100
5874	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm
5875	CMPL DX, $0x00010000
5876	JLT  three_bytes_emit_remainder_encodeBetterBlockAsm
5877	CMPL DX, $0x01000000
5878	JLT  four_bytes_emit_remainder_encodeBetterBlockAsm
5879	MOVB $0xfc, (AX)
5880	MOVL DX, 1(AX)
5881	ADDQ $0x05, AX
5882	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm
5883
5884four_bytes_emit_remainder_encodeBetterBlockAsm:
5885	MOVL DX, BX
5886	SHRL $0x10, BX
5887	MOVB $0xf8, (AX)
5888	MOVW DX, 1(AX)
5889	MOVB BL, 3(AX)
5890	ADDQ $0x04, AX
5891	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm
5892
5893three_bytes_emit_remainder_encodeBetterBlockAsm:
5894	MOVB $0xf4, (AX)
5895	MOVW DX, 1(AX)
5896	ADDQ $0x03, AX
5897	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm
5898
5899two_bytes_emit_remainder_encodeBetterBlockAsm:
5900	MOVB $0xf0, (AX)
5901	MOVB DL, 1(AX)
5902	ADDQ $0x02, AX
5903	CMPL DX, $0x40
5904	JL   memmove_emit_remainder_encodeBetterBlockAsm
5905	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm
5906
5907one_byte_emit_remainder_encodeBetterBlockAsm:
5908	SHLB $0x02, DL
5909	MOVB DL, (AX)
5910	ADDQ $0x01, AX
5911
// Short copy of the final literals: BX (= SI) bytes from (CX) to (AX).
// DX = AX + SI is computed first and becomes the new AX at the end.
// Each size class loads from the start of the range and from its end, then
// stores both pieces; the pieces may overlap:
//   BX < 3    first byte + last byte
//   BX == 3   2 bytes + third byte
//   BX < 8    4 bytes + last 4 bytes
//   BX <= 16  8 bytes + last 8 bytes
//   BX <= 32  16 bytes + last 16 bytes
//   else      first 32 bytes + last 32 bytes
// SI, CX and X0-X3 are overwritten as scratch.
5912memmove_emit_remainder_encodeBetterBlockAsm:
5913	LEAQ (AX)(SI*1), DX
5914	MOVL SI, BX
5915
5916	// genMemMoveShort
5917	CMPQ BX, $0x03
5918	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2
5919	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3
5920	CMPQ BX, $0x08
5921	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
5922	CMPQ BX, $0x10
5923	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
5924	CMPQ BX, $0x20
5925	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
5926	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64
5927
5928emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2:
5929	MOVB (CX), SI
5930	MOVB -1(CX)(BX*1), CL
5931	MOVB SI, (AX)
5932	MOVB CL, -1(AX)(BX*1)
5933	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm
5934
5935emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3:
5936	MOVW (CX), SI
5937	MOVB 2(CX), CL
5938	MOVW SI, (AX)
5939	MOVB CL, 2(AX)
5940	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm
5941
5942emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
5943	MOVL (CX), SI
5944	MOVL -4(CX)(BX*1), CX
5945	MOVL SI, (AX)
5946	MOVL CX, -4(AX)(BX*1)
5947	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm
5948
5949emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
5950	MOVQ (CX), SI
5951	MOVQ -8(CX)(BX*1), CX
5952	MOVQ SI, (AX)
5953	MOVQ CX, -8(AX)(BX*1)
5954	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm
5955
5956emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
5957	MOVOU (CX), X0
5958	MOVOU -16(CX)(BX*1), X1
5959	MOVOU X0, (AX)
5960	MOVOU X1, -16(AX)(BX*1)
5961	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm
5962
5963emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
5964	MOVOU (CX), X0
5965	MOVOU 16(CX), X1
5966	MOVOU -32(CX)(BX*1), X2
5967	MOVOU -16(CX)(BX*1), X3
5968	MOVOU X0, (AX)
5969	MOVOU X1, 16(AX)
5970	MOVOU X2, -32(AX)(BX*1)
5971	MOVOU X3, -16(AX)(BX*1)
5972
	// Advance the output pointer past the copied literals.
5973memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
5974	MOVQ DX, AX
5975	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm
5976
// Long copy of the final literals: BX (= SI) bytes from (CX) to (AX);
// DX = AX + SI becomes the new AX once the copy is done.
// The first 32 and last 32 source bytes are loaded into X0-X3 up front and
// stored last with unaligned stores. The bytes in between are moved 32 at a
// time using aligned stores (MOVOA): R8 starts at 64 - (AX & 31), so
// AX + R8 - 32 is a multiple of 32.
// DI = (BX >> 5) - 1 after the first DECQ.
// NOTE(review): DECQ does not write CF, so the JA/JNA around big_loop_back
// combine ZF from DECQ with the CF left by the SUBQ above (clear, because
// SI <= 31 < 64). On the entry paths visible in this function BX >= 65, so
// DI is non-zero after the first DECQ and the JA is taken - confirm that
// big_loop_back is intentionally cold.
5977memmove_long_emit_remainder_encodeBetterBlockAsm:
5978	LEAQ (AX)(SI*1), DX
5979	MOVL SI, BX
5980
5981	// genMemMoveLong
5982	MOVOU (CX), X0
5983	MOVOU 16(CX), X1
5984	MOVOU -32(CX)(BX*1), X2
5985	MOVOU -16(CX)(BX*1), X3
5986	MOVQ  BX, DI
5987	SHRQ  $0x05, DI
5988	MOVQ  AX, SI
5989	ANDL  $0x0000001f, SI
5990	MOVQ  $0x00000040, R8
5991	SUBQ  SI, R8
5992	DECQ  DI
5993	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
5994	LEAQ  -32(CX)(R8*1), SI
5995	LEAQ  -32(AX)(R8*1), R9
5996
5997emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
5998	MOVOU (SI), X4
5999	MOVOU 16(SI), X5
6000	MOVOA X4, (R9)
6001	MOVOA X5, 16(R9)
6002	ADDQ  $0x20, R9
6003	ADDQ  $0x20, SI
6004	ADDQ  $0x20, R8
6005	DECQ  DI
6006	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back
6007
	// Copy 32 bytes per iteration at offset R8-32 while BX >= R8 (unsigned,
	// tested after R8 has been advanced).
6008emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
6009	MOVOU -32(CX)(R8*1), X4
6010	MOVOU -16(CX)(R8*1), X5
6011	MOVOA X4, -32(AX)(R8*1)
6012	MOVOA X5, -16(AX)(R8*1)
6013	ADDQ  $0x20, R8
6014	CMPQ  BX, R8
6015	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
	// Finally store the preloaded head and tail, then advance AX.
6016	MOVOU X0, (AX)
6017	MOVOU X1, 16(AX)
6018	MOVOU X2, -32(AX)(BX*1)
6019	MOVOU X3, -16(AX)(BX*1)
6020	MOVQ  DX, AX
6021
// Successful exit: the return value is AX - dst_base, i.e. the distance the
// write pointer has moved from the start of dst.
6022emit_literal_done_emit_remainder_encodeBetterBlockAsm:
6023	MOVQ dst_base+0(FP), CX
6024	SUBQ CX, AX
6025	MOVQ AX, ret+48(FP)
6026	RET
6027
6028// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
6029// Requires: SSE2
//
// Prologue. Stack slots as initialised here and used by the code that follows:
//   0(SP)       dst limit pointer: dst_base + (src_len - 6 - (src_len >> 5))
//   8(SP)       src_len - 8, the bound tested by the search loop
//   12(SP)      start of the literal run not yet emitted (starts at 0)
//   16(SP)      offset compared by the "Check if repeat" test (starts at 0)
//   20(SP)      written by the search loop (next position to try)
//   24(SP)      table of 32-bit positions, 16-bit index (262144 bytes)
//   262168(SP)  table of 32-bit positions, 14-bit index (65536 bytes)
// The zero loop clears 0xa00 * 128 = 327680 bytes starting at 24(SP), which
// is exactly the two tables.
// Registers on leaving the prologue: AX = dst write pointer, DX = src base,
// CX = 1 (first position examined by the search loop).
6030TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56
6031	MOVQ dst_base+0(FP), AX
6032	MOVQ $0x00000a00, CX
6033	LEAQ 24(SP), DX
6034	PXOR X0, X0
6035
6036zero_loop_encodeBetterBlockAsm4MB:
6037	MOVOU X0, (DX)
6038	MOVOU X0, 16(DX)
6039	MOVOU X0, 32(DX)
6040	MOVOU X0, 48(DX)
6041	MOVOU X0, 64(DX)
6042	MOVOU X0, 80(DX)
6043	MOVOU X0, 96(DX)
6044	MOVOU X0, 112(DX)
6045	ADDQ  $0x80, DX
6046	DECQ  CX
6047	JNZ   zero_loop_encodeBetterBlockAsm4MB
6048	MOVL  $0x00000000, 12(SP)
6049	MOVQ  src_len+32(FP), CX
6050	LEAQ  -6(CX), DX
6051	LEAQ  -8(CX), SI
6052	MOVL  SI, 8(SP)
6053	SHRQ  $0x05, CX
6054	SUBL  CX, DX
6055	LEAQ  (AX)(DX*1), DX
6056	MOVQ  DX, (SP)
6057	MOVL  $0x00000001, CX
6058	MOVL  $0x00000000, 16(SP)
6059	MOVQ  src_base+24(FP), DX
6060
// Main search loop; CX is the current src position.
//   SI = CX + 1 + ((CX - 12(SP)) >> 7) is the next position to try. If it is
//   >= 8(SP) (signed) the search ends; otherwise it is saved in 20(SP).
//   DI = the 8 bytes at src[CX].
//   R10 = ((DI << 8) * R9) >> 48   16-bit index into the table at 24(SP)
//   R11 = ((DI << 32) * SI) >> 50  14-bit index into the table at 262168(SP)
//   SI and R8 receive the previous entries of the two tables, and both
//   entries are overwritten with CX.
// The low 4 bytes of DI are then compared with the 4 bytes at src[SI] and,
// failing that, with the 4 bytes at src[R8]. With no match CX is reloaded
// from 20(SP) and the loop repeats.
6061search_loop_encodeBetterBlockAsm4MB:
6062	MOVL  CX, SI
6063	SUBL  12(SP), SI
6064	SHRL  $0x07, SI
6065	LEAL  1(CX)(SI*1), SI
6066	CMPL  SI, 8(SP)
6067	JGE   emit_remainder_encodeBetterBlockAsm4MB
6068	MOVQ  (DX)(CX*1), DI
6069	MOVL  SI, 20(SP)
6070	MOVQ  $0x00cf1bbcdcbfa563, R9
6071	MOVQ  $0x9e3779b1, SI
6072	MOVQ  DI, R10
6073	MOVQ  DI, R11
6074	SHLQ  $0x08, R10
6075	IMULQ R9, R10
6076	SHRQ  $0x30, R10
6077	SHLQ  $0x20, R11
6078	IMULQ SI, R11
6079	SHRQ  $0x32, R11
6080	MOVL  24(SP)(R10*4), SI
6081	MOVL  262168(SP)(R11*4), R8
6082	MOVL  CX, 24(SP)(R10*4)
6083	MOVL  CX, 262168(SP)(R11*4)
6084	CMPL  (DX)(SI*1), DI
6085	JEQ   candidate_match_encodeBetterBlockAsm4MB
6086	CMPL  (DX)(R8*1), DI
6087	JEQ   candidateS_match_encodeBetterBlockAsm4MB
6088	MOVL  20(SP), CX
6089	JMP   search_loop_encodeBetterBlockAsm4MB
6090
// The candidate from the 262168(SP) table (R8) matched 4 bytes.
// Before accepting it, look one byte ahead: DI is shifted right by 8 so it
// holds the bytes starting at src[CX+1], that value is hashed into the
// 24(SP) table, the previous entry is read into SI, and the entry is replaced
// with CX+1. If the 4 bytes at src[SI] equal the low 4 bytes of DI, the match
// at CX+1 is taken instead (CX stays incremented). Otherwise CX is restored
// and SI is set to the original candidate R8.
6091candidateS_match_encodeBetterBlockAsm4MB:
6092	SHRQ  $0x08, DI
6093	MOVQ  DI, R10
6094	SHLQ  $0x08, R10
6095	IMULQ R9, R10
6096	SHRQ  $0x30, R10
6097	MOVL  24(SP)(R10*4), SI
6098	INCL  CX
6099	MOVL  CX, 24(SP)(R10*4)
6100	CMPL  (DX)(SI*1), DI
6101	JEQ   candidate_match_encodeBetterBlockAsm4MB
6102	DECL  CX
6103	MOVL  R8, SI
6104
// A match between src[SI:] and src[CX:] was found. Extend it backwards:
// while CX > 12(SP) (signed), SI > 0 and the bytes just before both positions
// are equal, decrement CX and SI. DI holds 12(SP) during the loop; BL and R8
// are scratch for the two bytes compared.
6105candidate_match_encodeBetterBlockAsm4MB:
6106	MOVL  12(SP), DI
6107	TESTL SI, SI
6108	JZ    match_extend_back_end_encodeBetterBlockAsm4MB
6109
6110match_extend_back_loop_encodeBetterBlockAsm4MB:
6111	CMPL CX, DI
6112	JLE  match_extend_back_end_encodeBetterBlockAsm4MB
6113	MOVB -1(DX)(SI*1), BL
6114	MOVB -1(DX)(CX*1), R8
6115	CMPB BL, R8
6116	JNE  match_extend_back_end_encodeBetterBlockAsm4MB
6117	LEAL -1(CX), CX
6118	DECL SI
6119	JZ   match_extend_back_end_encodeBetterBlockAsm4MB
6120	JMP  match_extend_back_loop_encodeBetterBlockAsm4MB
6121
// Destination space check before emitting: with DI = CX - 12(SP) (the pending
// literal count), the function returns 0 unless AX + DI + 4 is below the
// limit stored in (SP) (signed compare).
6122match_extend_back_end_encodeBetterBlockAsm4MB:
6123	MOVL CX, DI
6124	SUBL 12(SP), DI
6125	LEAQ 4(AX)(DI*1), DI
6126	CMPQ DI, (SP)
6127	JL   match_dst_size_check_encodeBetterBlockAsm4MB
6128	MOVQ $0x00000000, ret+48(FP)
6129	RET
6130
// Measure the match. DI keeps the match start (old CX); CX and SI both skip
// the 4 bytes already known to be equal.
//   R8  = src_len - CX   bytes left in src after those 4
//   R9  = DX + CX        current-position pointer
//   R10 = DX + SI        candidate pointer
// matchLen leaves in R12 the number of further bytes that are equal, bounded
// by R8. It compares 8 bytes at a time: on a difference BSFQ finds the lowest
// differing bit and the shift by 3 turns that into a byte count. Fewer than 8
// remaining bytes are compared one at a time. R11 is scratch.
6131match_dst_size_check_encodeBetterBlockAsm4MB:
6132	MOVL CX, DI
6133	ADDL $0x04, CX
6134	ADDL $0x04, SI
6135	MOVQ src_len+32(FP), R8
6136	SUBL CX, R8
6137	LEAQ (DX)(CX*1), R9
6138	LEAQ (DX)(SI*1), R10
6139
6140	// matchLen
6141	XORL R12, R12
6142	CMPL R8, $0x08
6143	JL   matchlen_single_match_nolit_encodeBetterBlockAsm4MB
6144
6145matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB:
6146	MOVQ  (R9)(R12*1), R11
6147	XORQ  (R10)(R12*1), R11
6148	TESTQ R11, R11
6149	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm4MB
6150	BSFQ  R11, R11
6151	SARQ  $0x03, R11
6152	LEAL  (R12)(R11*1), R12
6153	JMP   match_nolit_end_encodeBetterBlockAsm4MB
6154
6155matchlen_loop_match_nolit_encodeBetterBlockAsm4MB:
6156	LEAL -8(R8), R8
6157	LEAL 8(R12), R12
6158	CMPL R8, $0x08
6159	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB
6160
6161matchlen_single_match_nolit_encodeBetterBlockAsm4MB:
6162	TESTL R8, R8
6163	JZ    match_nolit_end_encodeBetterBlockAsm4MB
6164
6165matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB:
6166	MOVB (R9)(R12*1), R11
6167	CMPB (R10)(R12*1), R11
6168	JNE  match_nolit_end_encodeBetterBlockAsm4MB
6169	LEAL 1(R12), R12
6170	DECL R8
6171	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm4MB
6172
// R8 = CX - SI is the match offset (distance between the two positions).
//   R8 == 16(SP):               same offset as stored in 16(SP) -> repeat path.
//   R12 > 1 or R8 <= 0xffff:    accept the match (signed compares).
//   otherwise:                  reject it - a match this short at an offset
//                               above 0xffff is skipped - and resume searching
//                               at 20(SP) + 1.
6173match_nolit_end_encodeBetterBlockAsm4MB:
6174	MOVL CX, R8
6175	SUBL SI, R8
6176
6177	// Check if repeat
6178	CMPL 16(SP), R8
6179	JEQ  match_is_repeat_encodeBetterBlockAsm4MB
6180	CMPL R12, $0x01
6181	JG   match_length_ok_encodeBetterBlockAsm4MB
6182	CMPL R8, $0x0000ffff
6183	JLE  match_length_ok_encodeBetterBlockAsm4MB
6184	MOVL 20(SP), CX
6185	INCL CX
6186	JMP  search_loop_encodeBetterBlockAsm4MB
6187
// Match accepted: remember its offset in 16(SP), then flush the pending
// literal run [12(SP), DI). If 12(SP) == DI there is nothing to flush.
//   R9  = DI - 12(SP)   number of literal bytes
//   R10 = DX + 12(SP)   address of the first literal byte
//   12(SP) is set to DI
// The literal header is chosen by SI = R9 - 1 (signed compares):
//   SI < 60       1 byte:  SI << 2
//   SI < 0x100    2 bytes: f0, SI
//   SI < 0x10000  3 bytes: f4, SI (16 bit)
//   otherwise     4 bytes: f8, SI (24 bit)
// The 1-byte form, and the 2-byte form when SI < 64, continue with the short
// copy; every other form uses the long copy.
6188match_length_ok_encodeBetterBlockAsm4MB:
6189	MOVL R8, 16(SP)
6190	MOVL 12(SP), SI
6191	CMPL SI, DI
6192	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm4MB
6193	MOVL DI, R9
6194	MOVL DI, 12(SP)
6195	LEAQ (DX)(SI*1), R10
6196	SUBL SI, R9
6197	LEAL -1(R9), SI
6198	CMPL SI, $0x3c
6199	JLT  one_byte_match_emit_encodeBetterBlockAsm4MB
6200	CMPL SI, $0x00000100
6201	JLT  two_bytes_match_emit_encodeBetterBlockAsm4MB
6202	CMPL SI, $0x00010000
6203	JLT  three_bytes_match_emit_encodeBetterBlockAsm4MB
6204	MOVL SI, R11
6205	SHRL $0x10, R11
6206	MOVB $0xf8, (AX)
6207	MOVW SI, 1(AX)
6208	MOVB R11, 3(AX)
6209	ADDQ $0x04, AX
6210	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB
6211
6212three_bytes_match_emit_encodeBetterBlockAsm4MB:
6213	MOVB $0xf4, (AX)
6214	MOVW SI, 1(AX)
6215	ADDQ $0x03, AX
6216	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB
6217
6218two_bytes_match_emit_encodeBetterBlockAsm4MB:
6219	MOVB $0xf0, (AX)
6220	MOVB SI, 1(AX)
6221	ADDQ $0x02, AX
6222	CMPL SI, $0x40
6223	JL   memmove_match_emit_encodeBetterBlockAsm4MB
6224	JMP  memmove_long_match_emit_encodeBetterBlockAsm4MB
6225
6226one_byte_match_emit_encodeBetterBlockAsm4MB:
6227	SHLB $0x02, SI
6228	MOVB SI, (AX)
6229	ADDQ $0x01, AX
6230
// Short literal copy: R9 bytes from (R10) to (AX).
// SI = AX + R9 is computed first and becomes the new AX at the end.
// Each size class loads from the start of the range and from its end, then
// stores both pieces; the pieces may overlap:
//   R9 < 3    first byte + last byte
//   R9 == 3   2 bytes + third byte
//   R9 < 8    4 bytes + last 4 bytes
//   R9 <= 16  8 bytes + last 8 bytes
//   R9 <= 32  16 bytes + last 16 bytes
//   else      first 32 bytes + last 32 bytes
// R10, R11 and X0-X3 are overwritten as scratch.
6231memmove_match_emit_encodeBetterBlockAsm4MB:
6232	LEAQ (AX)(R9*1), SI
6233
6234	// genMemMoveShort
6235	CMPQ R9, $0x03
6236	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_1or2
6237	JE   emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_3
6238	CMPQ R9, $0x08
6239	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
6240	CMPQ R9, $0x10
6241	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
6242	CMPQ R9, $0x20
6243	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
6244	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64
6245
6246emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_1or2:
6247	MOVB (R10), R11
6248	MOVB -1(R10)(R9*1), R10
6249	MOVB R11, (AX)
6250	MOVB R10, -1(AX)(R9*1)
6251	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
6252
6253emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_3:
6254	MOVW (R10), R11
6255	MOVB 2(R10), R10
6256	MOVW R11, (AX)
6257	MOVB R10, 2(AX)
6258	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
6259
6260emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
6261	MOVL (R10), R11
6262	MOVL -4(R10)(R9*1), R10
6263	MOVL R11, (AX)
6264	MOVL R10, -4(AX)(R9*1)
6265	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
6266
6267emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
6268	MOVQ (R10), R11
6269	MOVQ -8(R10)(R9*1), R10
6270	MOVQ R11, (AX)
6271	MOVQ R10, -8(AX)(R9*1)
6272	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
6273
6274emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
6275	MOVOU (R10), X0
6276	MOVOU -16(R10)(R9*1), X1
6277	MOVOU X0, (AX)
6278	MOVOU X1, -16(AX)(R9*1)
6279	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
6280
6281emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
6282	MOVOU (R10), X0
6283	MOVOU 16(R10), X1
6284	MOVOU -32(R10)(R9*1), X2
6285	MOVOU -16(R10)(R9*1), X3
6286	MOVOU X0, (AX)
6287	MOVOU X1, 16(AX)
6288	MOVOU X2, -32(AX)(R9*1)
6289	MOVOU X3, -16(AX)(R9*1)
6290
	// Advance the output pointer past the copied literals.
6291memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
6292	MOVQ SI, AX
6293	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm4MB
6294
// Long literal copy: R9 bytes from (R10) to (AX); SI = AX + R9 becomes the
// new AX once the copy is done.
// The first 32 and last 32 source bytes are loaded into X0-X3 up front and
// stored last with unaligned stores. The bytes in between are moved 32 at a
// time using aligned stores (MOVOA): R14 starts at 64 - (AX & 31), so
// AX + R14 - 32 is a multiple of 32.
// R13 = (R9 >> 5) - 1 after the first DECQ.
// NOTE(review): DECQ does not write CF, so the JA/JNA around big_loop_back
// combine ZF from DECQ with the CF left by the SUBQ above (clear, because
// R11 <= 31 < 64). On the entry paths visible in this function R9 >= 65, so
// R13 is non-zero after the first DECQ and the JA is taken - confirm that
// big_loop_back is intentionally cold.
6295memmove_long_match_emit_encodeBetterBlockAsm4MB:
6296	LEAQ (AX)(R9*1), SI
6297
6298	// genMemMoveLong
6299	MOVOU (R10), X0
6300	MOVOU 16(R10), X1
6301	MOVOU -32(R10)(R9*1), X2
6302	MOVOU -16(R10)(R9*1), X3
6303	MOVQ  R9, R13
6304	SHRQ  $0x05, R13
6305	MOVQ  AX, R11
6306	ANDL  $0x0000001f, R11
6307	MOVQ  $0x00000040, R14
6308	SUBQ  R11, R14
6309	DECQ  R13
6310	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
6311	LEAQ  -32(R10)(R14*1), R11
6312	LEAQ  -32(AX)(R14*1), R15
6313
6314emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
6315	MOVOU (R11), X4
6316	MOVOU 16(R11), X5
6317	MOVOA X4, (R15)
6318	MOVOA X5, 16(R15)
6319	ADDQ  $0x20, R15
6320	ADDQ  $0x20, R11
6321	ADDQ  $0x20, R14
6322	DECQ  R13
6323	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back
6324
	// Copy 32 bytes per iteration at offset R14-32 while R9 >= R14 (unsigned,
	// tested after R14 has been advanced).
6325emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
6326	MOVOU -32(R10)(R14*1), X4
6327	MOVOU -16(R10)(R14*1), X5
6328	MOVOA X4, -32(AX)(R14*1)
6329	MOVOA X5, -16(AX)(R14*1)
6330	ADDQ  $0x20, R14
6331	CMPQ  R9, R14
6332	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	// Finally store the preloaded head and tail, then advance AX.
6333	MOVOU X0, (AX)
6334	MOVOU X1, 16(AX)
6335	MOVOU X2, -32(AX)(R9*1)
6336	MOVOU X3, -16(AX)(R9*1)
6337	MOVQ  SI, AX
6338
// Literals are flushed; emit the copy for the match.
// CX advances by R12 (the length found by matchLen), R12 grows by 4 to cover
// the 4 bytes skipped before matchLen, and 12(SP) is set to the new CX so the
// next literal run starts right after this match.
6339emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
6340	ADDL R12, CX
6341	ADDL $0x04, R12
6342	MOVL CX, 12(SP)
6343
6344	// emitCopy
	// Offsets below 0x10000 (signed compare) use the two-byte-offset path.
6345	CMPL R8, $0x00010000
6346	JL   two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
6347
	// Four-byte-offset path. When R12 <= 64 the whole length fits in one
	// element (four_bytes_remain). Otherwise a 5-byte element is written
	// (tag 0xff, then R8 as 32 bits; 0xff equals the four_bytes_remain tag
	// formula 3 + 4*(R12-1) for R12 = 64), R12 is reduced by 64, and what is
	// left goes to four_bytes_remain when below 4, or else falls through into
	// the emitRepeat sequence that follows this block.
6348four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB:
6349	CMPL R12, $0x40
6350	JLE  four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
6351	MOVB $0xff, (AX)
6352	MOVL R8, 1(AX)
6353	LEAL -64(R12), R12
6354	ADDQ $0x05, AX
6355	CMPL R12, $0x04
6356	JL   four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
6357
6358	// emitRepeat
	// Writes a repeat element for the length left in R12 after the 64-byte
	// copy emitted just before this sequence. SI receives R12 and R12 is
	// reduced by 4. Form selection, all comparisons signed:
	//   SI <= 8                  -> repeat_two        (2 bytes)
	//   SI < 12 and R8 < 0x800   -> repeat_two_offset (2 bytes, carries R8)
	//   R12 < 0x104              -> repeat_three      (3 bytes)
	//   R12 < 0x10100            -> repeat_four       (4 bytes)
	//   otherwise                -> 5-byte form (falls through below)
	// Unlike the emitRepeat in encodeBetterBlockAsm there is no loop for
	// lengths beyond the 5-byte form.
6359	MOVL R12, SI
6360	LEAL -4(R12), R12
6361	CMPL SI, $0x08
6362	JLE  repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
6363	CMPL SI, $0x0c
6364	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
6365	CMPL R8, $0x00000800
6366	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
6367
6368cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
6369	CMPL R12, $0x00000104
6370	JLT  repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
6371	CMPL R12, $0x00010100
6372	JLT  repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
	// 5-byte form: 1d 00, then the low 24 bits of (R12 - 0x10000). R8 is
	// overwritten as scratch.
6373	LEAL -65536(R12), R12
6374	MOVL R12, R8
6375	MOVW $0x001d, (AX)
6376	MOVW R12, 2(AX)
6377	SARL $0x10, R8
6378	MOVB R8, 4(AX)
6379	ADDQ $0x05, AX
6380	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6381
	// 4-byte form: 19 00, then the low 16 bits of (R12 - 256).
6382repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
6383	LEAL -256(R12), R12
6384	MOVW $0x0019, (AX)
6385	MOVW R12, 2(AX)
6386	ADDQ $0x04, AX
6387	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6388
	// 3-byte form: 15 00, then the low byte of (R12 - 4).
6389repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
6390	LEAL -4(R12), R12
6391	MOVW $0x0015, (AX)
6392	MOVB R12, 2(AX)
6393	ADDQ $0x03, AX
6394	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6395
	// 2-byte form: the 16-bit value (R12 << 2) | 1.
6396repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
6397	SHLL $0x02, R12
6398	ORL  $0x01, R12
6399	MOVW R12, (AX)
6400	ADDQ $0x02, AX
6401	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6402
	// 2-byte form carrying R8: first byte is 1 + 4*R12 OR'ed with
	// (R8 >> 8) << 5, second byte is the low byte of R8.
6403repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
6404	XORQ SI, SI
6405	LEAL 1(SI)(R12*4), R12
6406	MOVB R8, 1(AX)
6407	SARL $0x08, R8
6408	SHLL $0x05, R8
6409	ORL  R8, R12
6410	MOVB R12, (AX)
6411	ADDQ $0x02, AX
6412	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
	// NOTE(review): the next JMP is unreachable - it directly follows an
	// unconditional JMP and carries no label.
6413	JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB
6414
// Tail of the 4-byte-offset copy path. When R12 is zero nothing is written.
// Otherwise a 5-byte element is stored at AX: a tag byte equal to
// 3 + 4*(R12-1), followed by R8 (the match offset) as 32 bits, and AX
// advances by 5.
// Only the low byte of the LEAL result is stored, so the upper bits of BX
// (MOVB writes BL only) cannot influence the output.
6415four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
6416	TESTL R12, R12
6417	JZ    match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6418	MOVB  $0x03, BL
6419	LEAL  -4(BX)(R12*4), R12
6420	MOVB  R12, (AX)
6421	MOVL  R8, 1(AX)
6422	ADDQ  $0x05, AX
6423	JMP   match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6424
// Copy path for offsets below 0x10000 (R8 is stored as 16 bits).
// When R12 <= 64 (signed) the short encodings at two_byte_offset_short are
// used. Otherwise a 3-byte element is written (tag 0xee, then the low 16 bits
// of R8), 60 is subtracted from R12, AX advances by 3, and execution falls
// through into the emitRepeat sequence that follows this block.
// 0xee equals 2 + 4*(60-1), i.e. the emit_copy_three tag formula for R12 = 60.
6425two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
6426	CMPL R12, $0x40
6427	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
6428	MOVB $0xee, (AX)
6429	MOVW R8, 1(AX)
6430	LEAL -60(R12), R12
6431	ADDQ $0x03, AX
6432
6433	// emitRepeat
	// Writes a repeat element for the length left in R12 after the 60-byte
	// copy emitted just before this sequence. SI receives R12 and R12 is
	// reduced by 4. Form selection, all comparisons signed:
	//   SI <= 8                  -> repeat_two        (2 bytes)
	//   SI < 12 and R8 < 0x800   -> repeat_two_offset (2 bytes, carries R8)
	//   R12 < 0x104              -> repeat_three      (3 bytes)
	//   R12 < 0x10100            -> repeat_four       (4 bytes)
	//   otherwise                -> 5-byte form (falls through below)
6434	MOVL R12, SI
6435	LEAL -4(R12), R12
6436	CMPL SI, $0x08
6437	JLE  repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
6438	CMPL SI, $0x0c
6439	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
6440	CMPL R8, $0x00000800
6441	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
6442
6443cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
6444	CMPL R12, $0x00000104
6445	JLT  repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
6446	CMPL R12, $0x00010100
6447	JLT  repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
	// 5-byte form: 1d 00, then the low 24 bits of (R12 - 0x10000). R8 is
	// overwritten as scratch.
6448	LEAL -65536(R12), R12
6449	MOVL R12, R8
6450	MOVW $0x001d, (AX)
6451	MOVW R12, 2(AX)
6452	SARL $0x10, R8
6453	MOVB R8, 4(AX)
6454	ADDQ $0x05, AX
6455	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6456
	// 4-byte form: 19 00, then the low 16 bits of (R12 - 256).
6457repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
6458	LEAL -256(R12), R12
6459	MOVW $0x0019, (AX)
6460	MOVW R12, 2(AX)
6461	ADDQ $0x04, AX
6462	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6463
	// 3-byte form: 15 00, then the low byte of (R12 - 4).
6464repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
6465	LEAL -4(R12), R12
6466	MOVW $0x0015, (AX)
6467	MOVB R12, 2(AX)
6468	ADDQ $0x03, AX
6469	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6470
	// 2-byte form: the 16-bit value (R12 << 2) | 1.
6471repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
6472	SHLL $0x02, R12
6473	ORL  $0x01, R12
6474	MOVW R12, (AX)
6475	ADDQ $0x02, AX
6476	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6477
	// 2-byte form carrying R8: first byte is 1 + 4*R12 OR'ed with
	// (R8 >> 8) << 5, second byte is the low byte of R8.
6478repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
6479	XORQ SI, SI
6480	LEAL 1(SI)(R12*4), R12
6481	MOVB R8, 1(AX)
6482	SARL $0x08, R8
6483	SHLL $0x05, R8
6484	ORL  R8, R12
6485	MOVB R12, (AX)
6486	ADDQ $0x02, AX
6487	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
	// NOTE(review): the next JMP is unreachable - it directly follows an
	// unconditional JMP and carries no label.
6488	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
6489
// Short copy encodings (reached from two_byte_offset when R12 <= 64).
//   R12 < 12 and R8 < 0x800: 2-byte element. The first byte is
//       1 + 4*(R12-4) OR'ed with (R8 >> 8) << 5; the second byte is the low
//       byte of R8.
//   otherwise (emit_copy_three): 3-byte element. The first byte is
//       2 + 4*(R12-1); it is followed by the low 16 bits of R8.
// Both comparisons are signed. Only the low byte of each LEAL result is
// stored, so upper bits of BX do not matter.
6490two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
6491	CMPL R12, $0x0c
6492	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
6493	CMPL R8, $0x00000800
6494	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
6495	MOVB $0x01, BL
6496	LEAL -16(BX)(R12*4), R12
6497	MOVB R8, 1(AX)
6498	SHRL $0x08, R8
6499	SHLL $0x05, R8
6500	ORL  R8, R12
6501	MOVB R12, (AX)
6502	ADDQ $0x02, AX
6503	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6504
6505emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
6506	MOVB $0x02, BL
6507	LEAL -4(BX)(R12*4), R12
6508	MOVB R12, (AX)
6509	MOVW R8, 1(AX)
6510	ADDQ $0x03, AX
6511	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6512
// Repeat path (the match offset equals 16(SP)): first flush the pending
// literal run [12(SP), DI). If 12(SP) == DI there is nothing to flush.
//   R9  = DI - 12(SP)   number of literal bytes
//   R10 = DX + 12(SP)   address of the first literal byte
//   12(SP) is set to DI
// The literal header is chosen by SI = R9 - 1 (signed compares):
//   SI < 60       1 byte:  SI << 2
//   SI < 0x100    2 bytes: f0, SI
//   SI < 0x10000  3 bytes: f4, SI (16 bit)
//   otherwise     4 bytes: f8, SI (24 bit)
// The 1-byte form, and the 2-byte form when SI < 64, continue with the short
// copy; every other form uses the long copy.
6513match_is_repeat_encodeBetterBlockAsm4MB:
6514	MOVL 12(SP), SI
6515	CMPL SI, DI
6516	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
6517	MOVL DI, R9
6518	MOVL DI, 12(SP)
6519	LEAQ (DX)(SI*1), R10
6520	SUBL SI, R9
6521	LEAL -1(R9), SI
6522	CMPL SI, $0x3c
6523	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
6524	CMPL SI, $0x00000100
6525	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
6526	CMPL SI, $0x00010000
6527	JLT  three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
6528	MOVL SI, R11
6529	SHRL $0x10, R11
6530	MOVB $0xf8, (AX)
6531	MOVW SI, 1(AX)
6532	MOVB R11, 3(AX)
6533	ADDQ $0x04, AX
6534	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
6535
6536three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
6537	MOVB $0xf4, (AX)
6538	MOVW SI, 1(AX)
6539	ADDQ $0x03, AX
6540	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
6541
6542two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
6543	MOVB $0xf0, (AX)
6544	MOVB SI, 1(AX)
6545	ADDQ $0x02, AX
6546	CMPL SI, $0x40
6547	JL   memmove_match_emit_repeat_encodeBetterBlockAsm4MB
6548	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
6549
6550one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
6551	SHLB $0x02, SI
6552	MOVB SI, (AX)
6553	ADDQ $0x01, AX
6554
// Short literal copy: R9 bytes from (R10) to (AX).
// SI = AX + R9 is computed first; the memmove_end_copy label jumped to by
// each case lies after this block.
// Each size class loads from the start of the range and from its end, then
// stores both pieces; the pieces may overlap:
//   R9 < 3    first byte + last byte
//   R9 == 3   2 bytes + third byte
//   R9 < 8    4 bytes + last 4 bytes
//   R9 <= 16  8 bytes + last 8 bytes
//   R9 <= 32  16 bytes + last 16 bytes
//   else      handled at the 33through64 label that follows this block
// R10, R11, X0 and X1 are overwritten as scratch.
6555memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
6556	LEAQ (AX)(R9*1), SI
6557
6558	// genMemMoveShort
6559	CMPQ R9, $0x03
6560	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_1or2
6561	JE   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_3
6562	CMPQ R9, $0x08
6563	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
6564	CMPQ R9, $0x10
6565	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
6566	CMPQ R9, $0x20
6567	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
6568	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64
6569
6570emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_1or2:
6571	MOVB (R10), R11
6572	MOVB -1(R10)(R9*1), R10
6573	MOVB R11, (AX)
6574	MOVB R10, -1(AX)(R9*1)
6575	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
6576
6577emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_3:
6578	MOVW (R10), R11
6579	MOVB 2(R10), R10
6580	MOVW R11, (AX)
6581	MOVB R10, 2(AX)
6582	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
6583
6584emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
6585	MOVL (R10), R11
6586	MOVL -4(R10)(R9*1), R10
6587	MOVL R11, (AX)
6588	MOVL R10, -4(AX)(R9*1)
6589	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
6590
6591emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
6592	MOVQ (R10), R11
6593	MOVQ -8(R10)(R9*1), R10
6594	MOVQ R11, (AX)
6595	MOVQ R10, -8(AX)(R9*1)
6596	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
6597
6598emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
6599	MOVOU (R10), X0
6600	MOVOU -16(R10)(R9*1), X1
6601	MOVOU X0, (AX)
6602	MOVOU X1, -16(AX)(R9*1)
6603	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
6604
6605emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
6606	MOVOU (R10), X0
6607	MOVOU 16(R10), X1
6608	MOVOU -32(R10)(R9*1), X2
6609	MOVOU -16(R10)(R9*1), X3
6610	MOVOU X0, (AX)
6611	MOVOU X1, 16(AX)
6612	MOVOU X2, -32(AX)(R9*1)
6613	MOVOU X3, -16(AX)(R9*1)
6614
6615memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
6616	MOVQ SI, AX
6617	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
6618
6619memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
	// Copy R9 bytes from (R10) to (AX). SI = AX + R9 becomes the new AX.
	// The jumps to this label above are taken only when R9-1 >= 64.
6620	LEAQ (AX)(R9*1), SI
6621
6622	// genMemMoveLong
	// X0..X3 = first 32 and last 32 source bytes. They are stored last with
	// unaligned stores, so the aligned loop does not have to handle the edges.
6623	MOVOU (R10), X0
6624	MOVOU 16(R10), X1
6625	MOVOU -32(R10)(R9*1), X2
6626	MOVOU -16(R10)(R9*1), X3
	// R13 = R9 / 32. R14 = 64 - (AX & 31), so AX+R14-32 is 32-byte aligned,
	// which is what permits the MOVOA (aligned) stores below.
6627	MOVQ  R9, R13
6628	SHRQ  $0x05, R13
6629	MOVQ  AX, R11
6630	ANDL  $0x0000001f, R11
6631	MOVQ  $0x00000040, R14
6632	SUBQ  R11, R14
	// DECQ does not modify CF, so JA tests the CF left by SUBQ together with
	// the ZF produced by DECQ.
6633	DECQ  R13
6634	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
6635	LEAQ  -32(R10)(R14*1), R11
6636	LEAQ  -32(AX)(R14*1), R15
6637
6638emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
	// 32 bytes per iteration via the R11/R15 pointers; R14 is kept in step.
	// JNA repeats while CF or ZF is set after DECQ.
6639	MOVOU (R11), X4
6640	MOVOU 16(R11), X5
6641	MOVOA X4, (R15)
6642	MOVOA X5, 16(R15)
6643	ADDQ  $0x20, R15
6644	ADDQ  $0x20, R11
6645	ADDQ  $0x20, R14
6646	DECQ  R13
6647	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back
6648
6649emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	// 32 bytes per iteration at offset R14-32; repeats while R9 >= R14.
6650	MOVOU -32(R10)(R14*1), X4
6651	MOVOU -16(R10)(R14*1), X5
6652	MOVOA X4, -32(AX)(R14*1)
6653	MOVOA X5, -16(AX)(R14*1)
6654	ADDQ  $0x20, R14
6655	CMPQ  R9, R14
6656	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	// Store the saved head and tail, then advance the output pointer.
6657	MOVOU X0, (AX)
6658	MOVOU X1, 16(AX)
6659	MOVOU X2, -32(AX)(R9*1)
6660	MOVOU X3, -16(AX)(R9*1)
6661	MOVQ  SI, AX
6662
6663emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
	// Advance CX by R12, add 4 to R12, and record the new CX in 12(SP).
	// NOTE(review): R12 looks like the match length minus the 4 bytes already
	// accounted for in CX -- confirm against the code preceding this chunk.
6664	ADDL R12, CX
6665	ADDL $0x04, R12
6666	MOVL CX, 12(SP)
6667
6668	// emitRepeat
	// SI = length, R12 = length - 4. Encoding selection:
	//   length <= 8                  -> 2 bytes (repeat_two)
	//   length < 12 and R8 < 2048    -> 2 bytes carrying R8 (repeat_two_offset)
	//   R12 < 0x104                  -> 3 bytes
	//   R12 < 0x10100                -> 4 bytes
	//   otherwise                    -> 5 bytes
6669	MOVL R12, SI
6670	LEAL -4(R12), R12
6671	CMPL SI, $0x08
6672	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
6673	CMPL SI, $0x0c
6674	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
6675	CMPL R8, $0x00000800
6676	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
6677
6678cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
6679	CMPL R12, $0x00000104
6680	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
6681	CMPL R12, $0x00010100
6682	JLT  repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
	// 5 bytes: word 0x001d, then R12-65536 as 24 bits (16 low bits, then
	// bits 16-23).
6683	LEAL -65536(R12), R12
6684	MOVL R12, R8
6685	MOVW $0x001d, (AX)
6686	MOVW R12, 2(AX)
6687	SARL $0x10, R8
6688	MOVB R8, 4(AX)
6689	ADDQ $0x05, AX
6690	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6691
6692repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
	// 4 bytes: word 0x0019, then R12-256 as 16 bits.
6693	LEAL -256(R12), R12
6694	MOVW $0x0019, (AX)
6695	MOVW R12, 2(AX)
6696	ADDQ $0x04, AX
6697	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6698
6699repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
	// 3 bytes: word 0x0015, then R12-4 as 8 bits.
6700	LEAL -4(R12), R12
6701	MOVW $0x0015, (AX)
6702	MOVB R12, 2(AX)
6703	ADDQ $0x03, AX
6704	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6705
6706repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
	// 2 bytes: (R12 << 2) | 1 written as a 16-bit word.
6707	SHLL $0x02, R12
6708	ORL  $0x01, R12
6709	MOVW R12, (AX)
6710	ADDQ $0x02, AX
6711	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
6712
6713repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
	// 2 bytes: first byte = (R12*4 + 1) | ((R8 >> 8) << 5), second byte =
	// low 8 bits of R8. SI is zeroed only to serve as the LEAL base.
	// Falls through to match_nolit_emitcopy_end.
6714	XORQ SI, SI
6715	LEAL 1(SI)(R12*4), R12
6716	MOVB R8, 1(AX)
6717	SARL $0x08, R8
6718	SHLL $0x05, R8
6719	ORL  R8, R12
6720	MOVB R12, (AX)
6721	ADDQ $0x02, AX
6722
6723match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
	// Stop searching once CX reaches the limit in 8(SP). Return 0 if the
	// output pointer AX has reached the limit stored in (SP).
	// NOTE(review): both limits are initialised before this chunk; the 0
	// return presumably tells the Go caller the block did not fit -- confirm.
6724	CMPL CX, 8(SP)
6725	JGE  emit_remainder_encodeBetterBlockAsm4MB
6726	CMPQ AX, (SP)
6727	JL   match_nolit_dst_ok_encodeBetterBlockAsm4MB
6728	MOVQ $0x00000000, ret+48(FP)
6729	RET
6730
6731match_nolit_dst_ok_encodeBetterBlockAsm4MB:
	// Refresh both index tables around the match just emitted.
	//   table at 24(SP):     index = ((v << 8)  * SI) >> 48
	//   table at 262168(SP): index = ((v << 32) * R8) >> 50
	// where v is the 8 bytes loaded from DX + position (shifted right by 8 or
	// 16 bits to obtain the value at position+1 / position+2).
6732	MOVQ  $0x00cf1bbcdcbfa563, SI
6733	MOVQ  $0x9e3779b1, R8
	// First load: v at DI+1 (DI is incremented here). R14 / R15 are the two
	// following positions.
6734	INCL  DI
6735	MOVQ  (DX)(DI*1), R9
6736	MOVQ  R9, R10
6737	MOVQ  R9, R11
6738	MOVQ  R9, R12
6739	SHRQ  $0x08, R11
6740	MOVQ  R11, R13
6741	SHRQ  $0x10, R12
6742	LEAL  1(DI), R14
6743	LEAL  2(DI), R15
	// Second load (into R9, consumed further down): 8 bytes at DX + CX - 2.
6744	MOVQ  -2(DX)(CX*1), R9
6745	SHLQ  $0x08, R10
6746	IMULQ SI, R10
6747	SHRQ  $0x30, R10
6748	SHLQ  $0x08, R13
6749	IMULQ SI, R13
6750	SHRQ  $0x30, R13
6751	SHLQ  $0x20, R11
6752	IMULQ R8, R11
6753	SHRQ  $0x32, R11
6754	SHLQ  $0x20, R12
6755	IMULQ R8, R12
6756	SHRQ  $0x32, R12
	// Stores for the first load: DI and DI+1 into the 24(SP) table, DI+1 and
	// DI+2 into the 262168(SP) table.
6757	MOVL  DI, 24(SP)(R10*4)
6758	MOVL  R14, 24(SP)(R13*4)
6759	MOVL  R14, 262168(SP)(R11*4)
6760	MOVL  R15, 262168(SP)(R12*4)
	// Same for the second load: R9 becomes CX-2 and DI becomes CX-1.
6761	MOVQ  R9, R10
6762	MOVQ  R9, R11
6763	SHRQ  $0x08, R11
6764	MOVQ  R11, R13
6765	LEAL  -2(CX), R9
6766	LEAL  -1(CX), DI
6767	SHLQ  $0x08, R10
6768	IMULQ SI, R10
6769	SHRQ  $0x30, R10
6770	SHLQ  $0x20, R11
6771	IMULQ R8, R11
6772	SHRQ  $0x32, R11
6773	SHLQ  $0x08, R13
6774	IMULQ SI, R13
6775	SHRQ  $0x30, R13
	// CX-2 and CX-1 into the 24(SP) table, CX-1 into the 262168(SP) table.
6776	MOVL  R9, 24(SP)(R10*4)
6777	MOVL  DI, 262168(SP)(R11*4)
6778	MOVL  DI, 24(SP)(R13*4)
6779	JMP   search_loop_encodeBetterBlockAsm4MB
6780
6781emit_remainder_encodeBetterBlockAsm4MB:
	// Emit everything from the position in 12(SP) to the end of src as one
	// literal run. First check that AX + remaining + 4 stays below the output
	// limit in (SP); otherwise return 0.
6782	MOVQ src_len+32(FP), CX
6783	SUBL 12(SP), CX
6784	LEAQ 4(AX)(CX*1), CX
6785	CMPQ CX, (SP)
6786	JL   emit_remainder_ok_encodeBetterBlockAsm4MB
6787	MOVQ $0x00000000, ret+48(FP)
6788	RET
6789
6790emit_remainder_ok_encodeBetterBlockAsm4MB:
	// Nothing to emit when 12(SP) already equals src_len.
6791	MOVQ src_len+32(FP), CX
6792	MOVL 12(SP), BX
6793	CMPL BX, CX
6794	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
	// SI = byte count, CX = DX + old 12(SP) (copy source), 12(SP) = src_len.
	// DX is then reused for count-1, the value encoded in the header.
6795	MOVL CX, SI
6796	MOVL CX, 12(SP)
6797	LEAQ (DX)(BX*1), CX
6798	SUBL BX, SI
6799	LEAL -1(SI), DX
	// Header size by DX: < 60 one byte, < 256 two, < 65536 three, else four.
6800	CMPL DX, $0x3c
6801	JLT  one_byte_emit_remainder_encodeBetterBlockAsm4MB
6802	CMPL DX, $0x00000100
6803	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm4MB
6804	CMPL DX, $0x00010000
6805	JLT  three_bytes_emit_remainder_encodeBetterBlockAsm4MB
	// Four-byte header: 0xf8, then DX as 24 bits.
6806	MOVL DX, BX
6807	SHRL $0x10, BX
6808	MOVB $0xf8, (AX)
6809	MOVW DX, 1(AX)
6810	MOVB BL, 3(AX)
6811	ADDQ $0x04, AX
6812	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB
6813
6814three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
	// Three-byte header: 0xf4, then DX as 16 bits.
6815	MOVB $0xf4, (AX)
6816	MOVW DX, 1(AX)
6817	ADDQ $0x03, AX
6818	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB
6819
6820two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
	// Two-byte header: 0xf0, then DX as 8 bits. DX < 64 uses the short copy.
6821	MOVB $0xf0, (AX)
6822	MOVB DL, 1(AX)
6823	ADDQ $0x02, AX
6824	CMPL DX, $0x40
6825	JL   memmove_emit_remainder_encodeBetterBlockAsm4MB
6826	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm4MB
6827
6828one_byte_emit_remainder_encodeBetterBlockAsm4MB:
	// One-byte header: DX << 2. Falls through into the short copy routine.
6829	SHLB $0x02, DL
6830	MOVB DL, (AX)
6831	ADDQ $0x01, AX
6832
6833memmove_emit_remainder_encodeBetterBlockAsm4MB:
	// Copy SI bytes from (CX) to (AX). DX = AX + SI becomes the new AX and
	// BX holds the byte count for the dispatch below.
6834	LEAQ (AX)(SI*1), DX
6835	MOVL SI, BX
6836
6837	// genMemMoveShort
	// Dispatch on BX: <3, ==3, <8, <=16, <=32, otherwise the 33..64 case.
6838	CMPQ BX, $0x03
6839	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2
6840	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3
6841	CMPQ BX, $0x08
6842	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
6843	CMPQ BX, $0x10
6844	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16
6845	CMPQ BX, $0x20
6846	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
6847	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64
6848
6849emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2:
	// First byte and last byte; for BX == 1 both refer to the same byte.
6850	MOVB (CX), SI
6851	MOVB -1(CX)(BX*1), CL
6852	MOVB SI, (AX)
6853	MOVB CL, -1(AX)(BX*1)
6854	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
6855
6856emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3:
	// Two bytes plus one byte.
6857	MOVW (CX), SI
6858	MOVB 2(CX), CL
6859	MOVW SI, (AX)
6860	MOVB CL, 2(AX)
6861	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
6862
6863emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
	// Leading and trailing 4 bytes; the two stores may overlap.
6864	MOVL (CX), SI
6865	MOVL -4(CX)(BX*1), CX
6866	MOVL SI, (AX)
6867	MOVL CX, -4(AX)(BX*1)
6868	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
6869
6870emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16:
	// Leading and trailing 8 bytes; the two stores may overlap.
6871	MOVQ (CX), SI
6872	MOVQ -8(CX)(BX*1), CX
6873	MOVQ SI, (AX)
6874	MOVQ CX, -8(AX)(BX*1)
6875	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
6876
6877emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32:
	// Leading and trailing 16 bytes (unaligned SSE); the stores may overlap.
6878	MOVOU (CX), X0
6879	MOVOU -16(CX)(BX*1), X1
6880	MOVOU X0, (AX)
6881	MOVOU X1, -16(AX)(BX*1)
6882	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
6883
6884emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64:
	// Leading 32 and trailing 32 bytes; all loads happen before any store.
6885	MOVOU (CX), X0
6886	MOVOU 16(CX), X1
6887	MOVOU -32(CX)(BX*1), X2
6888	MOVOU -16(CX)(BX*1), X3
6889	MOVOU X0, (AX)
6890	MOVOU X1, 16(AX)
6891	MOVOU X2, -32(AX)(BX*1)
6892	MOVOU X3, -16(AX)(BX*1)
6893
6894memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB:
	// Advance the output pointer past the copied bytes.
6895	MOVQ DX, AX
6896	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
6897
6898memmove_long_emit_remainder_encodeBetterBlockAsm4MB:
	// Copy SI bytes from (CX) to (AX). DX = AX + SI becomes the new AX and
	// BX holds the byte count.
6899	LEAQ (AX)(SI*1), DX
6900	MOVL SI, BX
6901
6902	// genMemMoveLong
	// X0..X3 = first 32 and last 32 source bytes. They are stored last with
	// unaligned stores, so the aligned loop does not have to handle the edges.
6903	MOVOU (CX), X0
6904	MOVOU 16(CX), X1
6905	MOVOU -32(CX)(BX*1), X2
6906	MOVOU -16(CX)(BX*1), X3
	// DI = BX / 32. R8 = 64 - (AX & 31), so AX+R8-32 is 32-byte aligned,
	// which is what permits the MOVOA (aligned) stores below.
6907	MOVQ  BX, DI
6908	SHRQ  $0x05, DI
6909	MOVQ  AX, SI
6910	ANDL  $0x0000001f, SI
6911	MOVQ  $0x00000040, R8
6912	SUBQ  SI, R8
	// DECQ does not modify CF, so JA tests the CF left by SUBQ together with
	// the ZF produced by DECQ.
6913	DECQ  DI
6914	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
6915	LEAQ  -32(CX)(R8*1), SI
6916	LEAQ  -32(AX)(R8*1), R9
6917
6918emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back:
	// 32 bytes per iteration via the SI/R9 pointers; R8 is kept in step.
	// JNA repeats while CF or ZF is set after DECQ.
6919	MOVOU (SI), X4
6920	MOVOU 16(SI), X5
6921	MOVOA X4, (R9)
6922	MOVOA X5, 16(R9)
6923	ADDQ  $0x20, R9
6924	ADDQ  $0x20, SI
6925	ADDQ  $0x20, R8
6926	DECQ  DI
6927	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back
6928
6929emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
	// 32 bytes per iteration at offset R8-32; repeats while BX >= R8.
6930	MOVOU -32(CX)(R8*1), X4
6931	MOVOU -16(CX)(R8*1), X5
6932	MOVOA X4, -32(AX)(R8*1)
6933	MOVOA X5, -16(AX)(R8*1)
6934	ADDQ  $0x20, R8
6935	CMPQ  BX, R8
6936	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
	// Store the saved head and tail, then advance the output pointer.
6937	MOVOU X0, (AX)
6938	MOVOU X1, 16(AX)
6939	MOVOU X2, -32(AX)(BX*1)
6940	MOVOU X3, -16(AX)(BX*1)
6941	MOVQ  DX, AX
6942
6943emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
	// Return value = AX - dst_base, i.e. the number of bytes written to dst.
6944	MOVQ dst_base+0(FP), CX
6945	SUBQ CX, AX
6946	MOVQ AX, ret+48(FP)
6947	RET
6948
6949// func encodeBetterBlockAsm12B(dst []byte, src []byte) int
6950// Requires: SSE2
//
// Stack frame layout as used by the visible code:
//   (SP)       output limit: dst_base + src_len - 6 - src_len/32
//   8(SP)      search limit: src_len - 8
//   12(SP)     start of the bytes not yet emitted (initially 0)
//   16(SP)     offset of the previous match, used by the repeat check
//   20(SP)     next search position
//   24(SP)     index table, 65536 bytes
//   65560(SP)  second index table, 16384 bytes
// Registers: AX = output pointer, DX = src_base, CX = current position.
6951TEXT ·encodeBetterBlockAsm12B(SB), $81944-56
6952	MOVQ dst_base+0(FP), AX
	// Zero both tables: 0x280 iterations * 128 bytes = 81920 bytes at 24(SP).
6953	MOVQ $0x00000280, CX
6954	LEAQ 24(SP), DX
6955	PXOR X0, X0
6956
6957zero_loop_encodeBetterBlockAsm12B:
6958	MOVOU X0, (DX)
6959	MOVOU X0, 16(DX)
6960	MOVOU X0, 32(DX)
6961	MOVOU X0, 48(DX)
6962	MOVOU X0, 64(DX)
6963	MOVOU X0, 80(DX)
6964	MOVOU X0, 96(DX)
6965	MOVOU X0, 112(DX)
6966	ADDQ  $0x80, DX
6967	DECQ  CX
6968	JNZ   zero_loop_encodeBetterBlockAsm12B
	// 12(SP) = 0; 8(SP) = src_len - 8;
	// (SP) = dst_base + (src_len - 6 - (src_len >> 5)).
6969	MOVL  $0x00000000, 12(SP)
6970	MOVQ  src_len+32(FP), CX
6971	LEAQ  -6(CX), DX
6972	LEAQ  -8(CX), SI
6973	MOVL  SI, 8(SP)
6974	SHRQ  $0x05, CX
6975	SUBL  CX, DX
6976	LEAQ  (AX)(DX*1), DX
6977	MOVQ  DX, (SP)
	// Searching starts at position 1 with no previous offset recorded.
6978	MOVL  $0x00000001, CX
6979	MOVL  $0x00000000, 16(SP)
6980	MOVQ  src_base+24(FP), DX
6981
6982search_loop_encodeBetterBlockAsm12B:
	// Next position SI = CX + 1 + ((CX - 12(SP)) >> 6): the step grows with
	// the distance from the last emitted byte. Leave the loop once SI reaches
	// the limit in 8(SP).
6983	MOVL  CX, SI
6984	SUBL  12(SP), SI
6985	SHRL  $0x06, SI
6986	LEAL  1(CX)(SI*1), SI
6987	CMPL  SI, 8(SP)
6988	JGE   emit_remainder_encodeBetterBlockAsm12B
	// DI = 8 bytes at src + CX; the next position is parked in 20(SP).
6989	MOVQ  (DX)(CX*1), DI
6990	MOVL  SI, 20(SP)
	// Two table indexes from DI:
	//   R10 = ((DI << 16) * R9) >> 50  (14 bits, table at 24(SP))
	//   R11 = ((DI << 32) * SI) >> 52  (12 bits, table at 65560(SP))
6991	MOVQ  $0x0000cf1bbcdcbf9b, R9
6992	MOVQ  $0x9e3779b1, SI
6993	MOVQ  DI, R10
6994	MOVQ  DI, R11
6995	SHLQ  $0x10, R10
6996	IMULQ R9, R10
6997	SHRQ  $0x32, R10
6998	SHLQ  $0x20, R11
6999	IMULQ SI, R11
7000	SHRQ  $0x34, R11
	// Fetch the previous entries (SI, R8) and replace both with CX.
7001	MOVL  24(SP)(R10*4), SI
7002	MOVL  65560(SP)(R11*4), R8
7003	MOVL  CX, 24(SP)(R10*4)
7004	MOVL  CX, 65560(SP)(R11*4)
	// Compare 4 bytes at each candidate with the low 32 bits of DI; the
	// candidate from the 24(SP) table is tried first.
7005	CMPL  (DX)(SI*1), DI
7006	JEQ   candidate_match_encodeBetterBlockAsm12B
7007	CMPL  (DX)(R8*1), DI
7008	JEQ   candidateS_match_encodeBetterBlockAsm12B
	// No match: continue at the position parked in 20(SP).
7009	MOVL  20(SP), CX
7010	JMP   search_loop_encodeBetterBlockAsm12B
7011
7012candidateS_match_encodeBetterBlockAsm12B:
	// The 65560(SP) candidate matched. Before using it, look up position CX+1
	// (DI >> 8) in the 24(SP) table; if that entry matches 4 bytes, use it
	// with CX advanced by one. Otherwise restore CX and use R8.
7013	SHRQ  $0x08, DI
7014	MOVQ  DI, R10
7015	SHLQ  $0x10, R10
7016	IMULQ R9, R10
7017	SHRQ  $0x32, R10
7018	MOVL  24(SP)(R10*4), SI
7019	INCL  CX
7020	MOVL  CX, 24(SP)(R10*4)
7021	CMPL  (DX)(SI*1), DI
7022	JEQ   candidate_match_encodeBetterBlockAsm12B
7023	DECL  CX
7024	MOVL  R8, SI
7025
7026candidate_match_encodeBetterBlockAsm12B:
	// Extend the match backwards: CX = current position, SI = candidate
	// position, DI = 12(SP) = lower bound for CX. Skipped when SI is 0.
7027	MOVL  12(SP), DI
7028	TESTL SI, SI
7029	JZ    match_extend_back_end_encodeBetterBlockAsm12B
7030
7031match_extend_back_loop_encodeBetterBlockAsm12B:
	// While CX > DI and src[SI-1] == src[CX-1], step both back by one byte;
	// stop as soon as SI reaches 0.
7032	CMPL CX, DI
7033	JLE  match_extend_back_end_encodeBetterBlockAsm12B
7034	MOVB -1(DX)(SI*1), BL
7035	MOVB -1(DX)(CX*1), R8
7036	CMPB BL, R8
7037	JNE  match_extend_back_end_encodeBetterBlockAsm12B
7038	LEAL -1(CX), CX
7039	DECL SI
7040	JZ   match_extend_back_end_encodeBetterBlockAsm12B
7041	JMP  match_extend_back_loop_encodeBetterBlockAsm12B
7042
7043match_extend_back_end_encodeBetterBlockAsm12B:
	// Return 0 unless AX + (CX - 12(SP)) + 3 is below the output limit in (SP).
7044	MOVL CX, DI
7045	SUBL 12(SP), DI
7046	LEAQ 3(AX)(DI*1), DI
7047	CMPQ DI, (SP)
7048	JL   match_dst_size_check_encodeBetterBlockAsm12B
7049	MOVQ $0x00000000, ret+48(FP)
7050	RET
7051
7052match_dst_size_check_encodeBetterBlockAsm12B:
	// DI keeps the match start. Skip 4 bytes on both sides (the 4-byte compare
	// in the search loop), then measure how much further the match extends.
	// R8 = bytes left in src after CX, R9 = src + CX, R10 = src + SI.
7053	MOVL CX, DI
7054	ADDL $0x04, CX
7055	ADDL $0x04, SI
7056	MOVQ src_len+32(FP), R8
7057	SUBL CX, R8
7058	LEAQ (DX)(CX*1), R9
7059	LEAQ (DX)(SI*1), R10
7060
7061	// matchLen
	// R12 = number of equal bytes found so far.
7062	XORL R12, R12
7063	CMPL R8, $0x08
7064	JL   matchlen_single_match_nolit_encodeBetterBlockAsm12B
7065
7066matchlen_loopback_match_nolit_encodeBetterBlockAsm12B:
	// Compare 8 bytes at a time. On a difference, BSFQ gives the index of
	// the lowest differing bit; >> 3 converts it to a count of equal bytes.
7067	MOVQ  (R9)(R12*1), R11
7068	XORQ  (R10)(R12*1), R11
7069	TESTQ R11, R11
7070	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm12B
7071	BSFQ  R11, R11
7072	SARQ  $0x03, R11
7073	LEAL  (R12)(R11*1), R12
7074	JMP   match_nolit_end_encodeBetterBlockAsm12B
7075
7076matchlen_loop_match_nolit_encodeBetterBlockAsm12B:
	// All 8 bytes equal: account for them and continue while >= 8 remain.
7077	LEAL -8(R8), R8
7078	LEAL 8(R12), R12
7079	CMPL R8, $0x08
7080	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm12B
7081
7082matchlen_single_match_nolit_encodeBetterBlockAsm12B:
	// Fewer than 8 bytes left: compare byte by byte.
7083	TESTL R8, R8
7084	JZ    match_nolit_end_encodeBetterBlockAsm12B
7085
7086matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B:
7087	MOVB (R9)(R12*1), R11
7088	CMPB (R10)(R12*1), R11
7089	JNE  match_nolit_end_encodeBetterBlockAsm12B
7090	LEAL 1(R12), R12
7091	DECL R8
7092	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm12B
7093
7094match_nolit_end_encodeBetterBlockAsm12B:
	// R8 = CX - SI = match offset.
7095	MOVL CX, R8
7096	SUBL SI, R8
7097
7098	// Check if repeat
	// Same offset as the previous match (16(SP)) takes the repeat path;
	// otherwise remember the new offset.
7099	CMPL 16(SP), R8
7100	JEQ  match_is_repeat_encodeBetterBlockAsm12B
7101	MOVL R8, 16(SP)
	// Emit the bytes between 12(SP) and the match start DI as a literal run;
	// nothing is emitted if they are equal.
7102	MOVL 12(SP), SI
7103	CMPL SI, DI
7104	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm12B
	// R9 = DI - 12(SP) (byte count), R10 = DX + old 12(SP) (copy source),
	// 12(SP) = DI, SI = R9 - 1 (the value encoded in the header).
7105	MOVL DI, R9
7106	MOVL DI, 12(SP)
7107	LEAQ (DX)(SI*1), R10
7108	SUBL SI, R9
7109	LEAL -1(R9), SI
	// Header size by SI: < 60 one byte, < 256 two bytes, otherwise three
	// bytes (0xf4 followed by SI as 16 bits).
7110	CMPL SI, $0x3c
7111	JLT  one_byte_match_emit_encodeBetterBlockAsm12B
7112	CMPL SI, $0x00000100
7113	JLT  two_bytes_match_emit_encodeBetterBlockAsm12B
7114	MOVB $0xf4, (AX)
7115	MOVW SI, 1(AX)
7116	ADDQ $0x03, AX
7117	JMP  memmove_long_match_emit_encodeBetterBlockAsm12B
7118
7119two_bytes_match_emit_encodeBetterBlockAsm12B:
	// Two-byte header: 0xf0, then SI as 8 bits. SI < 64 uses the short copy.
7120	MOVB $0xf0, (AX)
7121	MOVB SI, 1(AX)
7122	ADDQ $0x02, AX
7123	CMPL SI, $0x40
7124	JL   memmove_match_emit_encodeBetterBlockAsm12B
7125	JMP  memmove_long_match_emit_encodeBetterBlockAsm12B
7126
7127one_byte_match_emit_encodeBetterBlockAsm12B:
	// One-byte header: SI << 2. Falls through into the short copy routine.
7128	SHLB $0x02, SI
7129	MOVB SI, (AX)
7130	ADDQ $0x01, AX
7131
7132memmove_match_emit_encodeBetterBlockAsm12B:
	// Copy R9 bytes from (R10) to (AX). SI = AX + R9 becomes the new AX.
	// The paths that reach this label in the code above arrive with
	// R9-1 < 64, which is the range the size classes below cover.
7133	LEAQ (AX)(R9*1), SI
7134
7135	// genMemMoveShort
	// Dispatch on R9: <3, ==3, <8, <=16, <=32, otherwise the 33..64 case.
7136	CMPQ R9, $0x03
7137	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_1or2
7138	JE   emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_3
7139	CMPQ R9, $0x08
7140	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
7141	CMPQ R9, $0x10
7142	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
7143	CMPQ R9, $0x20
7144	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
7145	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64
7146
7147emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_1or2:
	// First byte and last byte; for R9 == 1 both refer to the same byte.
7148	MOVB (R10), R11
7149	MOVB -1(R10)(R9*1), R10
7150	MOVB R11, (AX)
7151	MOVB R10, -1(AX)(R9*1)
7152	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B
7153
7154emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_3:
	// Two bytes plus one byte.
7155	MOVW (R10), R11
7156	MOVB 2(R10), R10
7157	MOVW R11, (AX)
7158	MOVB R10, 2(AX)
7159	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B
7160
7161emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
	// Leading and trailing 4 bytes; the two stores may overlap.
7162	MOVL (R10), R11
7163	MOVL -4(R10)(R9*1), R10
7164	MOVL R11, (AX)
7165	MOVL R10, -4(AX)(R9*1)
7166	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B
7167
7168emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
	// Leading and trailing 8 bytes; the two stores may overlap.
7169	MOVQ (R10), R11
7170	MOVQ -8(R10)(R9*1), R10
7171	MOVQ R11, (AX)
7172	MOVQ R10, -8(AX)(R9*1)
7173	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm12B
7174
7175emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
	// Leading and trailing 16 bytes (unaligned SSE); the stores may overlap.
7176	MOVOU (R10), X0
7177	MOVOU -16(R10)(R9*1), X1
7178	MOVOU X0, (AX)
7179	MOVOU X1, -16(AX)(R9*1)
7180	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm12B
7181
7182emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
	// Leading 32 and trailing 32 bytes; all loads happen before any store.
7183	MOVOU (R10), X0
7184	MOVOU 16(R10), X1
7185	MOVOU -32(R10)(R9*1), X2
7186	MOVOU -16(R10)(R9*1), X3
7187	MOVOU X0, (AX)
7188	MOVOU X1, 16(AX)
7189	MOVOU X2, -32(AX)(R9*1)
7190	MOVOU X3, -16(AX)(R9*1)
7191
7192memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
	// Advance the output pointer past the copied bytes.
7193	MOVQ SI, AX
7194	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm12B
7195
7196memmove_long_match_emit_encodeBetterBlockAsm12B:
	// Copy R9 bytes from (R10) to (AX). SI = AX + R9 becomes the new AX.
	// The jumps to this label above are taken only when R9-1 >= 64.
7197	LEAQ (AX)(R9*1), SI
7198
7199	// genMemMoveLong
	// X0..X3 = first 32 and last 32 source bytes. They are stored last with
	// unaligned stores, so the aligned loop does not have to handle the edges.
7200	MOVOU (R10), X0
7201	MOVOU 16(R10), X1
7202	MOVOU -32(R10)(R9*1), X2
7203	MOVOU -16(R10)(R9*1), X3
	// R13 = R9 / 32. R14 = 64 - (AX & 31), so AX+R14-32 is 32-byte aligned,
	// which is what permits the MOVOA (aligned) stores below.
7204	MOVQ  R9, R13
7205	SHRQ  $0x05, R13
7206	MOVQ  AX, R11
7207	ANDL  $0x0000001f, R11
7208	MOVQ  $0x00000040, R14
7209	SUBQ  R11, R14
	// DECQ does not modify CF, so JA tests the CF left by SUBQ together with
	// the ZF produced by DECQ.
7210	DECQ  R13
7211	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
7212	LEAQ  -32(R10)(R14*1), R11
7213	LEAQ  -32(AX)(R14*1), R15
7214
7215emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
	// 32 bytes per iteration via the R11/R15 pointers; R14 is kept in step.
	// JNA repeats while CF or ZF is set after DECQ.
7216	MOVOU (R11), X4
7217	MOVOU 16(R11), X5
7218	MOVOA X4, (R15)
7219	MOVOA X5, 16(R15)
7220	ADDQ  $0x20, R15
7221	ADDQ  $0x20, R11
7222	ADDQ  $0x20, R14
7223	DECQ  R13
7224	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back
7225
7226emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
	// 32 bytes per iteration at offset R14-32; repeats while R9 >= R14.
7227	MOVOU -32(R10)(R14*1), X4
7228	MOVOU -16(R10)(R14*1), X5
7229	MOVOA X4, -32(AX)(R14*1)
7230	MOVOA X5, -16(AX)(R14*1)
7231	ADDQ  $0x20, R14
7232	CMPQ  R9, R14
7233	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	// Store the saved head and tail, then advance the output pointer.
7234	MOVOU X0, (AX)
7235	MOVOU X1, 16(AX)
7236	MOVOU X2, -32(AX)(R9*1)
7237	MOVOU X3, -16(AX)(R9*1)
7238	MOVQ  SI, AX
7239
7240emit_literal_done_match_emit_encodeBetterBlockAsm12B:
	// R12 holds the count from matchLen (bytes matched beyond the first 4).
	// Advance CX past the match, make R12 the full length (+4) and record the
	// new CX in 12(SP).
7241	ADDL R12, CX
7242	ADDL $0x04, R12
7243	MOVL CX, 12(SP)
7244
7245	// emitCopy
7246two_byte_offset_match_nolit_encodeBetterBlockAsm12B:
	// Lengths <= 64 are handled by the short forms. Longer matches first
	// write the 3 bytes 0xee + 16-bit offset (R8) and subtract 60 from the
	// length; the rest is written by the emitRepeat sequence below.
	// NOTE(review): the -60 suggests tag 0xee stands for a 60-byte copy --
	// confirm against the format specification.
7247	CMPL R12, $0x40
7248	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
7249	MOVB $0xee, (AX)
7250	MOVW R8, 1(AX)
7251	LEAL -60(R12), R12
7252	ADDQ $0x03, AX
7253
7254	// emitRepeat
	// SI = length, R12 = length - 4. Encoding selection:
	//   length <= 8                  -> 2 bytes (repeat_two)
	//   length < 12 and R8 < 2048    -> 2 bytes carrying R8 (repeat_two_offset)
	//   R12 < 0x104                  -> 3 bytes
	//   otherwise                    -> 4 bytes
7255	MOVL R12, SI
7256	LEAL -4(R12), R12
7257	CMPL SI, $0x08
7258	JLE  repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
7259	CMPL SI, $0x0c
7260	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
7261	CMPL R8, $0x00000800
7262	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
7263
7264cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
7265	CMPL R12, $0x00000104
7266	JLT  repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
	// 4 bytes: word 0x0019, then R12-256 as 16 bits.
7267	LEAL -256(R12), R12
7268	MOVW $0x0019, (AX)
7269	MOVW R12, 2(AX)
7270	ADDQ $0x04, AX
7271	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
7272
7273repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	// 3 bytes: word 0x0015, then R12-4 as 8 bits.
7274	LEAL -4(R12), R12
7275	MOVW $0x0015, (AX)
7276	MOVB R12, 2(AX)
7277	ADDQ $0x03, AX
7278	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
7279
7280repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	// 2 bytes: (R12 << 2) | 1 written as a 16-bit word.
7281	SHLL $0x02, R12
7282	ORL  $0x01, R12
7283	MOVW R12, (AX)
7284	ADDQ $0x02, AX
7285	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
7286
7287repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
	// 2 bytes: first byte = (R12*4 + 1) | ((R8 >> 8) << 5), second byte =
	// low 8 bits of R8. SI is zeroed only to serve as the LEAL base.
7288	XORQ SI, SI
7289	LEAL 1(SI)(R12*4), R12
7290	MOVB R8, 1(AX)
7291	SARL $0x08, R8
7292	SHLL $0x05, R8
7293	ORL  R8, R12
7294	MOVB R12, (AX)
7295	ADDQ $0x02, AX
7296	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
	// NOTE(review): the JMP below is unreachable -- it directly follows an
	// unconditional JMP and carries no label. It is harmless; since this file
	// is generated, any cleanup belongs in the generator, not here.
7297	JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B
7298
7299two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
	// emitCopy, short forms: R12 = match length (<= 64 here), R8 = offset.
	// The 2-byte form is only used when R12 < 12 and R8 < 2048.
7300	CMPL R12, $0x0c
7301	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm12B
7302	CMPL R8, $0x00000800
7303	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm12B
	// Only the low byte of R12 is stored below, so only BL (not the rest of
	// BX) matters: low byte = (R12-4)*4 + 1.
7304	MOVB $0x01, BL
7305	LEAL -16(BX)(R12*4), R12
	// Second output byte = low 8 bits of R8; bits 8 and up of R8 are moved to
	// bit 5 and up of the first byte.
7306	MOVB R8, 1(AX)
7307	SHRL $0x08, R8
7308	SHLL $0x05, R8
7309	ORL  R8, R12
7310	MOVB R12, (AX)
7311	ADDQ $0x02, AX
7312	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
7313
7314emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
	// 3-byte form: first byte = low byte of (R12-1)*4 + 2, followed by the
	// low 16 bits of R8.
7315	MOVB $0x02, BL
7316	LEAL -4(BX)(R12*4), R12
7317	MOVB R12, (AX)
7318	MOVW R8, 1(AX)
7319	ADDQ $0x03, AX
7320	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
7321
7322match_is_repeat_encodeBetterBlockAsm12B:
	// Reached when the match offset equals 16(SP). Emit the bytes between
	// 12(SP) and the match start DI as a literal run, then continue with
	// emitRepeat. Nothing is emitted if they are equal.
7323	MOVL 12(SP), SI
7324	CMPL SI, DI
7325	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
	// R9 = DI - 12(SP) (byte count), R10 = DX + old 12(SP) (copy source),
	// 12(SP) = DI, SI = R9 - 1 (the value encoded in the header).
7326	MOVL DI, R9
7327	MOVL DI, 12(SP)
7328	LEAQ (DX)(SI*1), R10
7329	SUBL SI, R9
7330	LEAL -1(R9), SI
	// Header size by SI: < 60 one byte, < 256 two bytes, otherwise three
	// bytes (0xf4 followed by SI as 16 bits).
7331	CMPL SI, $0x3c
7332	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm12B
7333	CMPL SI, $0x00000100
7334	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
7335	MOVB $0xf4, (AX)
7336	MOVW SI, 1(AX)
7337	ADDQ $0x03, AX
7338	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
7339
7340two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
	// Two-byte header: 0xf0, then SI as 8 bits. SI < 64 uses the short copy.
7341	MOVB $0xf0, (AX)
7342	MOVB SI, 1(AX)
7343	ADDQ $0x02, AX
7344	CMPL SI, $0x40
7345	JL   memmove_match_emit_repeat_encodeBetterBlockAsm12B
7346	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
7347
7348one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
	// One-byte header: SI << 2. Falls through into the short copy routine.
7349	SHLB $0x02, SI
7350	MOVB SI, (AX)
7351	ADDQ $0x01, AX
7352
7353memmove_match_emit_repeat_encodeBetterBlockAsm12B:
	// Copy R9 bytes from (R10) to (AX). SI = AX + R9 becomes the new AX.
	// The paths that reach this label in the code above arrive with
	// R9-1 < 64, which is the range the size classes below cover.
7354	LEAQ (AX)(R9*1), SI
7355
7356	// genMemMoveShort
	// Dispatch on R9: <3, ==3, <8, <=16, <=32, otherwise the 33..64 case.
7357	CMPQ R9, $0x03
7358	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_1or2
7359	JE   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_3
7360	CMPQ R9, $0x08
7361	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
7362	CMPQ R9, $0x10
7363	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
7364	CMPQ R9, $0x20
7365	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
7366	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64
7367
7368emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_1or2:
	// First byte and last byte; for R9 == 1 both refer to the same byte.
7369	MOVB (R10), R11
7370	MOVB -1(R10)(R9*1), R10
7371	MOVB R11, (AX)
7372	MOVB R10, -1(AX)(R9*1)
7373	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
7374
7375emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_3:
	// Two bytes plus one byte.
7376	MOVW (R10), R11
7377	MOVB 2(R10), R10
7378	MOVW R11, (AX)
7379	MOVB R10, 2(AX)
7380	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
7381
7382emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
	// Leading and trailing 4 bytes; the two stores may overlap.
7383	MOVL (R10), R11
7384	MOVL -4(R10)(R9*1), R10
7385	MOVL R11, (AX)
7386	MOVL R10, -4(AX)(R9*1)
7387	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
7388
7389emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
	// Leading and trailing 8 bytes; the two stores may overlap.
7390	MOVQ (R10), R11
7391	MOVQ -8(R10)(R9*1), R10
7392	MOVQ R11, (AX)
7393	MOVQ R10, -8(AX)(R9*1)
7394	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
7395
7396emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
	// Leading and trailing 16 bytes (unaligned SSE); the stores may overlap.
7397	MOVOU (R10), X0
7398	MOVOU -16(R10)(R9*1), X1
7399	MOVOU X0, (AX)
7400	MOVOU X1, -16(AX)(R9*1)
7401	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
7402
7403emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
	// Leading 32 and trailing 32 bytes; all loads happen before any store.
7404	MOVOU (R10), X0
7405	MOVOU 16(R10), X1
7406	MOVOU -32(R10)(R9*1), X2
7407	MOVOU -16(R10)(R9*1), X3
7408	MOVOU X0, (AX)
7409	MOVOU X1, 16(AX)
7410	MOVOU X2, -32(AX)(R9*1)
7411	MOVOU X3, -16(AX)(R9*1)
7412
7413memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
	// Advance the output pointer past the copied bytes.
7414	MOVQ SI, AX
7415	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
7416
7417memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
	// Copy R9 bytes from (R10) to (AX). SI = AX + R9 becomes the new AX.
	// The jumps to this label above are taken only when R9-1 >= 64.
7418	LEAQ (AX)(R9*1), SI
7419
7420	// genMemMoveLong
	// X0..X3 = first 32 and last 32 source bytes. They are stored last with
	// unaligned stores, so the aligned loop does not have to handle the edges.
7421	MOVOU (R10), X0
7422	MOVOU 16(R10), X1
7423	MOVOU -32(R10)(R9*1), X2
7424	MOVOU -16(R10)(R9*1), X3
	// R13 = R9 / 32. R14 = 64 - (AX & 31), so AX+R14-32 is 32-byte aligned,
	// which is what permits the MOVOA (aligned) stores below.
7425	MOVQ  R9, R13
7426	SHRQ  $0x05, R13
7427	MOVQ  AX, R11
7428	ANDL  $0x0000001f, R11
7429	MOVQ  $0x00000040, R14
7430	SUBQ  R11, R14
	// DECQ does not modify CF, so JA tests the CF left by SUBQ together with
	// the ZF produced by DECQ.
7431	DECQ  R13
7432	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
7433	LEAQ  -32(R10)(R14*1), R11
7434	LEAQ  -32(AX)(R14*1), R15
7435
7436emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
	// 32 bytes per iteration via the R11/R15 pointers; R14 is kept in step.
	// JNA repeats while CF or ZF is set after DECQ.
7437	MOVOU (R11), X4
7438	MOVOU 16(R11), X5
7439	MOVOA X4, (R15)
7440	MOVOA X5, 16(R15)
7441	ADDQ  $0x20, R15
7442	ADDQ  $0x20, R11
7443	ADDQ  $0x20, R14
7444	DECQ  R13
7445	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back
7446
7447emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
	// 32 bytes per iteration at offset R14-32; repeats while R9 >= R14.
7448	MOVOU -32(R10)(R14*1), X4
7449	MOVOU -16(R10)(R14*1), X5
7450	MOVOA X4, -32(AX)(R14*1)
7451	MOVOA X5, -16(AX)(R14*1)
7452	ADDQ  $0x20, R14
7453	CMPQ  R9, R14
7454	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	// Store the saved head and tail, then advance the output pointer.
7455	MOVOU X0, (AX)
7456	MOVOU X1, 16(AX)
7457	MOVOU X2, -32(AX)(R9*1)
7458	MOVOU X3, -16(AX)(R9*1)
7459	MOVQ  SI, AX
7460
7461emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
	// R12 holds the count from matchLen (bytes matched beyond the first 4).
	// Advance CX past the match, make R12 the full length (+4) and record the
	// new CX in 12(SP).
7462	ADDL R12, CX
7463	ADDL $0x04, R12
7464	MOVL CX, 12(SP)
7465
7466	// emitRepeat
	// SI = length, R12 = length - 4. Encoding selection:
	//   length <= 8                  -> 2 bytes (repeat_two)
	//   length < 12 and R8 < 2048    -> 2 bytes carrying R8 (repeat_two_offset)
	//   R12 < 0x104                  -> 3 bytes
	//   otherwise                    -> 4 bytes
7467	MOVL R12, SI
7468	LEAL -4(R12), R12
7469	CMPL SI, $0x08
7470	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
7471	CMPL SI, $0x0c
7472	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
7473	CMPL R8, $0x00000800
7474	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
7475
7476cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
7477	CMPL R12, $0x00000104
7478	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
	// 4 bytes: word 0x0019, then R12-256 as 16 bits.
7479	LEAL -256(R12), R12
7480	MOVW $0x0019, (AX)
7481	MOVW R12, 2(AX)
7482	ADDQ $0x04, AX
7483	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
7484
7485repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
	// 3 bytes: word 0x0015, then R12-4 as 8 bits.
7486	LEAL -4(R12), R12
7487	MOVW $0x0015, (AX)
7488	MOVB R12, 2(AX)
7489	ADDQ $0x03, AX
7490	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
7491
7492repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
	// 2 bytes: (R12 << 2) | 1 written as a 16-bit word.
7493	SHLL $0x02, R12
7494	ORL  $0x01, R12
7495	MOVW R12, (AX)
7496	ADDQ $0x02, AX
7497	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B
7498
7499repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
	// 2 bytes: first byte = (R12*4 + 1) | ((R8 >> 8) << 5), second byte =
	// low 8 bits of R8. SI is zeroed only to serve as the LEAL base.
	// Falls through to match_nolit_emitcopy_end.
7500	XORQ SI, SI
7501	LEAL 1(SI)(R12*4), R12
7502	MOVB R8, 1(AX)
7503	SARL $0x08, R8
7504	SHLL $0x05, R8
7505	ORL  R8, R12
7506	MOVB R12, (AX)
7507	ADDQ $0x02, AX
7508
7509match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
	// Stop searching once CX reaches the search limit in 8(SP). Return 0 if
	// the output pointer AX has reached the output limit in (SP).
	// NOTE(review): the 0 return presumably tells the Go caller the block did
	// not fit -- confirm against the caller.
7510	CMPL CX, 8(SP)
7511	JGE  emit_remainder_encodeBetterBlockAsm12B
7512	CMPQ AX, (SP)
7513	JL   match_nolit_dst_ok_encodeBetterBlockAsm12B
7514	MOVQ $0x00000000, ret+48(FP)
7515	RET
7516
7517match_nolit_dst_ok_encodeBetterBlockAsm12B:
	// Refresh both index tables around the match just emitted, using the same
	// index functions as the search loop:
	//   table at 24(SP):    index = ((v << 16) * SI) >> 50
	//   table at 65560(SP): index = ((v << 32) * R8) >> 52
	// where v is the 8 bytes loaded from DX + position (shifted right by 8 or
	// 16 bits to obtain the value at position+1 / position+2).
7518	MOVQ  $0x0000cf1bbcdcbf9b, SI
7519	MOVQ  $0x9e3779b1, R8
	// First load: v at DI+1 (DI, the match start, is incremented here).
	// R14 / R15 are the two following positions.
7520	INCL  DI
7521	MOVQ  (DX)(DI*1), R9
7522	MOVQ  R9, R10
7523	MOVQ  R9, R11
7524	MOVQ  R9, R12
7525	SHRQ  $0x08, R11
7526	MOVQ  R11, R13
7527	SHRQ  $0x10, R12
7528	LEAL  1(DI), R14
7529	LEAL  2(DI), R15
	// Second load (into R9, consumed further down): 8 bytes at DX + CX - 2.
7530	MOVQ  -2(DX)(CX*1), R9
7531	SHLQ  $0x10, R10
7532	IMULQ SI, R10
7533	SHRQ  $0x32, R10
7534	SHLQ  $0x10, R13
7535	IMULQ SI, R13
7536	SHRQ  $0x32, R13
7537	SHLQ  $0x20, R11
7538	IMULQ R8, R11
7539	SHRQ  $0x34, R11
7540	SHLQ  $0x20, R12
7541	IMULQ R8, R12
7542	SHRQ  $0x34, R12
	// Stores for the first load: DI and DI+1 into the 24(SP) table, DI+1 and
	// DI+2 into the 65560(SP) table.
7543	MOVL  DI, 24(SP)(R10*4)
7544	MOVL  R14, 24(SP)(R13*4)
7545	MOVL  R14, 65560(SP)(R11*4)
7546	MOVL  R15, 65560(SP)(R12*4)
	// Same for the second load: R9 becomes CX-2 and DI becomes CX-1.
7547	MOVQ  R9, R10
7548	MOVQ  R9, R11
7549	SHRQ  $0x08, R11
7550	MOVQ  R11, R13
7551	LEAL  -2(CX), R9
7552	LEAL  -1(CX), DI
7553	SHLQ  $0x10, R10
7554	IMULQ SI, R10
7555	SHRQ  $0x32, R10
7556	SHLQ  $0x20, R11
7557	IMULQ R8, R11
7558	SHRQ  $0x34, R11
7559	SHLQ  $0x10, R13
7560	IMULQ SI, R13
7561	SHRQ  $0x32, R13
	// CX-2 and CX-1 into the 24(SP) table, CX-1 into the 65560(SP) table.
7562	MOVL  R9, 24(SP)(R10*4)
7563	MOVL  DI, 65560(SP)(R11*4)
7564	MOVL  DI, 24(SP)(R13*4)
7565	JMP   search_loop_encodeBetterBlockAsm12B
7566
7567emit_remainder_encodeBetterBlockAsm12B:
	// Emit everything from the position in 12(SP) to the end of src as one
	// literal run. First check that AX + remaining + 3 stays below the output
	// limit in (SP); otherwise return 0.
7568	MOVQ src_len+32(FP), CX
7569	SUBL 12(SP), CX
7570	LEAQ 3(AX)(CX*1), CX
7571	CMPQ CX, (SP)
7572	JL   emit_remainder_ok_encodeBetterBlockAsm12B
7573	MOVQ $0x00000000, ret+48(FP)
7574	RET
7575
7576emit_remainder_ok_encodeBetterBlockAsm12B:
	// Nothing to emit when 12(SP) already equals src_len.
7577	MOVQ src_len+32(FP), CX
7578	MOVL 12(SP), BX
7579	CMPL BX, CX
7580	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
	// SI = byte count, CX = DX + old 12(SP) (copy source), 12(SP) = src_len.
	// DX (src_base until now) is then reused for count-1, the value encoded
	// in the header.
7581	MOVL CX, SI
7582	MOVL CX, 12(SP)
7583	LEAQ (DX)(BX*1), CX
7584	SUBL BX, SI
7585	LEAL -1(SI), DX
	// Header size by DX: < 60 one byte, < 256 two bytes, otherwise three
	// bytes (0xf4 followed by DX as 16 bits).
7586	CMPL DX, $0x3c
7587	JLT  one_byte_emit_remainder_encodeBetterBlockAsm12B
7588	CMPL DX, $0x00000100
7589	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm12B
7590	MOVB $0xf4, (AX)
7591	MOVW DX, 1(AX)
7592	ADDQ $0x03, AX
7593	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm12B
7594
7595two_bytes_emit_remainder_encodeBetterBlockAsm12B:
	// Two-byte header: 0xf0, then DX as 8 bits. DX < 64 uses the short copy.
7596	MOVB $0xf0, (AX)
7597	MOVB DL, 1(AX)
7598	ADDQ $0x02, AX
7599	CMPL DX, $0x40
7600	JL   memmove_emit_remainder_encodeBetterBlockAsm12B
7601	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm12B
7602
7603one_byte_emit_remainder_encodeBetterBlockAsm12B:
	// One-byte header: DX << 2. Falls through into the short copy routine.
7604	SHLB $0x02, DL
7605	MOVB DL, (AX)
7606	ADDQ $0x01, AX
7607
7608memmove_emit_remainder_encodeBetterBlockAsm12B:
7609	LEAQ (AX)(SI*1), DX
7610	MOVL SI, BX
7611
7612	// genMemMoveShort
7613	CMPQ BX, $0x03
7614	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2
7615	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3
7616	CMPQ BX, $0x08
7617	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7
7618	CMPQ BX, $0x10
7619	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16
7620	CMPQ BX, $0x20
7621	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32
7622	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64
7623
7624emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2:
7625	MOVB (CX), SI
7626	MOVB -1(CX)(BX*1), CL
7627	MOVB SI, (AX)
7628	MOVB CL, -1(AX)(BX*1)
7629	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
7630
7631emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3:
7632	MOVW (CX), SI
7633	MOVB 2(CX), CL
7634	MOVW SI, (AX)
7635	MOVB CL, 2(AX)
7636	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
7637
7638emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7:
7639	MOVL (CX), SI
7640	MOVL -4(CX)(BX*1), CX
7641	MOVL SI, (AX)
7642	MOVL CX, -4(AX)(BX*1)
7643	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
7644
7645emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16:
7646	MOVQ (CX), SI
7647	MOVQ -8(CX)(BX*1), CX
7648	MOVQ SI, (AX)
7649	MOVQ CX, -8(AX)(BX*1)
7650	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
7651
7652emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32:
7653	MOVOU (CX), X0
7654	MOVOU -16(CX)(BX*1), X1
7655	MOVOU X0, (AX)
7656	MOVOU X1, -16(AX)(BX*1)
7657	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
7658
7659emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64:
7660	MOVOU (CX), X0
7661	MOVOU 16(CX), X1
7662	MOVOU -32(CX)(BX*1), X2
7663	MOVOU -16(CX)(BX*1), X3
7664	MOVOU X0, (AX)
7665	MOVOU X1, 16(AX)
7666	MOVOU X2, -32(AX)(BX*1)
7667	MOVOU X3, -16(AX)(BX*1)
7668
7669memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B:
7670	MOVQ DX, AX
7671	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
7672
7673memmove_long_emit_remainder_encodeBetterBlockAsm12B:
7674	LEAQ (AX)(SI*1), DX
7675	MOVL SI, BX
7676
7677	// genMemMoveLong
7678	MOVOU (CX), X0
7679	MOVOU 16(CX), X1
7680	MOVOU -32(CX)(BX*1), X2
7681	MOVOU -16(CX)(BX*1), X3
7682	MOVQ  BX, DI
7683	SHRQ  $0x05, DI
7684	MOVQ  AX, SI
7685	ANDL  $0x0000001f, SI
7686	MOVQ  $0x00000040, R8
7687	SUBQ  SI, R8
7688	DECQ  DI
7689	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
7690	LEAQ  -32(CX)(R8*1), SI
7691	LEAQ  -32(AX)(R8*1), R9
7692
7693emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back:
7694	MOVOU (SI), X4
7695	MOVOU 16(SI), X5
7696	MOVOA X4, (R9)
7697	MOVOA X5, 16(R9)
7698	ADDQ  $0x20, R9
7699	ADDQ  $0x20, SI
7700	ADDQ  $0x20, R8
7701	DECQ  DI
7702	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back
7703
7704emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
7705	MOVOU -32(CX)(R8*1), X4
7706	MOVOU -16(CX)(R8*1), X5
7707	MOVOA X4, -32(AX)(R8*1)
7708	MOVOA X5, -16(AX)(R8*1)
7709	ADDQ  $0x20, R8
7710	CMPQ  BX, R8
7711	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
7712	MOVOU X0, (AX)
7713	MOVOU X1, 16(AX)
7714	MOVOU X2, -32(AX)(BX*1)
7715	MOVOU X3, -16(AX)(BX*1)
7716	MOVQ  DX, AX
7717
7718emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
7719	MOVQ dst_base+0(FP), CX
7720	SUBQ CX, AX
7721	MOVQ AX, ret+48(FP)
7722	RET
7723
// func encodeBetterBlockAsm10B(dst []byte, src []byte) int
// Requires: SSE2
//
// encodeBetterBlockAsm10B encodes src into dst and returns the number of bytes
// written. It returns 0 as soon as the output pointer would reach the limit
// computed at entry: dst_base + len(src) - len(src)>>5 - 6.
//
// Stack frame (20504 bytes):
//   0(SP)      8 bytes      dst limit pointer
//   8(SP)      4 bytes      search limit, len(src)-8
//   12(SP)     4 bytes      nextEmit: src offset of the first byte not yet emitted
//   16(SP)     4 bytes      offset used by the previous copy (0 until the first match)
//   20(SP)     4 bytes      next probe position, saved while the current one is tested
//   24(SP)     16384 bytes  long table: 4096 uint32 entries, 12-bit hash of 6 bytes
//   16408(SP)  4096 bytes   short table: 1024 uint32 entries, 10-bit hash of 4 bytes
//
// Register roles in the main loop: AX = dst write pointer, DX = src base,
// CX = current src offset (s).
//
// NOTE(review): the limit is built with a 32-bit SUBL, so for len(src) < 6 the
// negative intermediate is zero-extended and the limit is not meaningful.
// Callers presumably guarantee a minimum block size - confirm.
// NOTE(review): this file is generated (see the header at the top of the file);
// mirror any change made here in the generator.
TEXT ·encodeBetterBlockAsm10B(SB), $20504-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x000000a0, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm10B:
	// Clear both hash tables: 0xa0 iterations of 128 bytes = 20480 bytes from 24(SP).
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeBetterBlockAsm10B

	// nextEmit = 0
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -6(CX), DX
	LEAQ  -8(CX), SI

	// 8(SP) = len(src)-8: every search position below it allows an 8-byte load.
	MOVL  SI, 8(SP)

	// (SP) = dst_base + len(src) - 6 - len(src)>>5
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)

	// s = 1, previous copy offset = 0, DX = src base
	MOVL  $0x00000001, CX
	MOVL  $0x00000000, 16(SP)
	MOVQ  src_base+24(FP), DX

search_loop_encodeBetterBlockAsm10B:
	// SI = s + ((s - nextEmit) >> 5) + 1: the step grows with the distance
	// from the last emitted byte. Stop searching once it reaches the limit.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x05, SI
	LEAL  1(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeBetterBlockAsm10B

	// DI = 8 bytes at s. R10 = long hash (low 6 bytes, 12 bits),
	// R11 = short hash (low 4 bytes, 10 bits).
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  $0x9e3779b1, SI
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	SHLQ  $0x20, R11
	IMULQ SI, R11
	SHRQ  $0x36, R11

	// Fetch the previous occupants (SI = long candidate, R8 = short candidate)
	// and record s in both tables.
	MOVL  24(SP)(R10*4), SI
	MOVL  16408(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	MOVL  CX, 16408(SP)(R11*4)

	// A candidate is accepted when its first 4 bytes equal the 4 bytes at s.
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm10B
	CMPL  (DX)(R8*1), DI
	JEQ   candidateS_match_encodeBetterBlockAsm10B

	// No match: continue at the saved probe position.
	MOVL  20(SP), CX
	JMP   search_loop_encodeBetterBlockAsm10B

candidateS_match_encodeBetterBlockAsm10B:
	// Short-table hit. First probe the long table for position s+1 (hash of
	// the loaded value shifted right by one byte) and record s+1 there.
	// If that candidate matches, use it with s advanced by one; otherwise
	// restore s and fall back to the short candidate in R8.
	SHRQ  $0x08, DI
	MOVQ  DI, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x34, R10
	MOVL  24(SP)(R10*4), SI
	INCL  CX
	MOVL  CX, 24(SP)(R10*4)
	CMPL  (DX)(SI*1), DI
	JEQ   candidate_match_encodeBetterBlockAsm10B
	DECL  CX
	MOVL  R8, SI

candidate_match_encodeBetterBlockAsm10B:
	// Extend the match backwards. DI = nextEmit is the lower bound for s,
	// and the candidate offset in SI must not go below 0.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeBetterBlockAsm10B

match_extend_back_loop_encodeBetterBlockAsm10B:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeBetterBlockAsm10B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeBetterBlockAsm10B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeBetterBlockAsm10B
	JMP  match_extend_back_loop_encodeBetterBlockAsm10B

match_extend_back_end_encodeBetterBlockAsm10B:
	// Return 0 unless dst + 3 + (s - nextEmit) stays below the dst limit,
	// i.e. there is room for the pending bytes plus a 3-byte header.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm10B:
	// DI = match start. Step both positions over the 4 bytes already compared;
	// R8 = bytes left in src after s, R9/R10 = pointers at s and the candidate.
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	// R12 = number of further equal bytes, compared 8 at a time while at
	// least 8 bytes remain and then byte by byte.
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeBetterBlockAsm10B

matchlen_loopback_match_nolit_encodeBetterBlockAsm10B:
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm10B

	// Index of the lowest differing bit divided by 8 = equal bytes in this word.
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeBetterBlockAsm10B

matchlen_loop_match_nolit_encodeBetterBlockAsm10B:
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm10B

matchlen_single_match_nolit_encodeBetterBlockAsm10B:
	TESTL R8, R8
	JZ    match_nolit_end_encodeBetterBlockAsm10B

matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeBetterBlockAsm10B
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm10B

match_nolit_end_encodeBetterBlockAsm10B:
	// R8 = offset = s - candidate
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	CMPL 16(SP), R8
	JEQ  match_is_repeat_encodeBetterBlockAsm10B
	MOVL R8, 16(SP)

	// Emit src[nextEmit:match start] as a literal, if non-empty.
	// R9 = literal length, R10 = literal source, SI = length-1.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm10B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeBetterBlockAsm10B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeBetterBlockAsm10B

	// Header: tag 0xf4 followed by length-1 as 16 bits.
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm10B

two_bytes_match_emit_encodeBetterBlockAsm10B:
	// Header: tag 0xf0 followed by length-1 as one byte. The short copy
	// below covers at most 64 bytes, so longer literals take the long copy.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeBetterBlockAsm10B
	JMP  memmove_long_match_emit_encodeBetterBlockAsm10B

one_byte_match_emit_encodeBetterBlockAsm10B:
	// Header: single byte (length-1)<<2.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBetterBlockAsm10B:
	// SI = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Each case loads the head and the tail of the range (they may overlap)
	// before storing, so every length in the case's range is covered.
	CMPQ R9, $0x03
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_1or2
	JE   emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_3
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_1or2:
	MOVB (R10), R11
	MOVB -1(R10)(R9*1), R10
	MOVB R11, (AX)
	MOVB R10, -1(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_3:
	MOVW (R10), R11
	MOVB 2(R10), R10
	MOVW R11, (AX)
	MOVB R10, 2(AX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm10B

memmove_long_match_emit_encodeBetterBlockAsm10B:
	// SI = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// X0-X3 keep the first and last 32 bytes and are stored last. The middle
	// is copied in 32-byte steps; R14 = 64 - (AX & 31) makes AX+R14-32 a
	// multiple of 32, which the MOVOA stores rely on.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14

	// SUBQ above cannot borrow (R11 <= 31) and DECQ leaves CF untouched, so
	// JA branches unless the decremented count is zero.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_encodeBetterBlockAsm10B:
	// s += extra matched bytes; R12 = full match length (extra + 4);
	// nextEmit = s.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
	// Matches longer than 64 bytes: write tag 0xee plus the 16-bit offset,
	// subtract 60 from the length and encode the rest as a repeat.
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX

	// emitRepeat
	// SI = remaining length, R12 = remaining length - 4.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short

	// 4 bytes: 0x19, 0x00, then (length-4-256) as 16 bits.
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	// 3 bytes: 0x15, 0x00, then (length-8) as one byte.
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	// 2 bytes: (length-4)<<2 | 1 written as a 16-bit value.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	// 2 bytes: (offset>>8)<<5 | (length-4)<<2 | 1, then the low offset byte.
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
	// Length <= 64. Lengths below 12 with offsets below 2048 use the 2-byte
	// form; everything else uses the 3-byte form. Only the low byte of the
	// LEAL result is stored, so loading BL alone is sufficient.
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm10B
	CMPL R8, $0x00000800
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm10B

	// 2 bytes: (offset>>8)<<5 | (length-4)<<2 | 1, then the low offset byte.
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
	// 3 bytes: (length-1)<<2 | 2, then the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

match_is_repeat_encodeBetterBlockAsm10B:
	// Same offset as the previous copy: emit the pending literal exactly as
	// in the non-repeat path, then encode the match as a repeat.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm10B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm10B

two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_repeat_encodeBetterBlockAsm10B
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm10B

one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_repeat_encodeBetterBlockAsm10B:
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	CMPQ R9, $0x03
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_1or2
	JE   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_3
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_1or2:
	MOVB (R10), R11
	MOVB -1(R10)(R9*1), R10
	MOVB R11, (AX)
	MOVB R10, -1(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_3:
	MOVW (R10), R11
	MOVB 2(R10), R10
	MOVW R11, (AX)
	MOVB R10, 2(AX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B

memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  SI, AX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
	// s += extra matched bytes; R12 = full match length; nextEmit = s.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitRepeat
	// Same four encodings as the repeat emitted after a long copy above.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
	CMPL SI, $0x0c
	JGE  cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
	CMPL R8, $0x00000800
	JLT  repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
	XORQ SI, SI
	LEAL 1(SI)(R12*4), R12
	MOVB R8, 1(AX)
	SARL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX

match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
	// Finish with the remainder once s reaches the search limit; otherwise
	// return 0 if the output pointer has reached the dst limit.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBetterBlockAsm10B
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm10B:
	// Index positions around the match before searching again. DI still
	// holds the match start, CX the position after the match.
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8

	// R9 = 8 bytes at start+1. R10/R13 become long hashes for start+1 and
	// start+2, R11/R12 short hashes for start+2 and start+3.
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x34, R10
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x34, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x36, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x36, R12
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 16408(SP)(R11*4)
	MOVL  R15, 16408(SP)(R12*4)

	// R9 = 8 bytes at s-2. Long table gets s-2 and s-1, short table gets s-1.
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x34, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x36, R11
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x34, R13
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 16408(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeBetterBlockAsm10B

emit_remainder_encodeBetterBlockAsm10B:
	// Return 0 unless dst + 3 + (len(src) - nextEmit) stays below the limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm10B:
	// Emit src[nextEmit:] as a final literal, if non-empty. From here on
	// CX = literal source pointer, SI = literal length and DX = length-1
	// (the src base in DX is no longer needed).
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBetterBlockAsm10B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm10B

two_bytes_emit_remainder_encodeBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBetterBlockAsm10B
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm10B

one_byte_emit_remainder_encodeBetterBlockAsm10B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBetterBlockAsm10B:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm10B

memmove_long_emit_remainder_encodeBetterBlockAsm10B:
	// DX = dst pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
	// Return the number of bytes written: dst write pointer - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET
8498
8499// func encodeBetterBlockAsm8B(dst []byte, src []byte) int
8500// Requires: SSE2
8501TEXT ·encodeBetterBlockAsm8B(SB), $5144-56
8502	MOVQ dst_base+0(FP), AX
8503	MOVQ $0x00000028, CX
8504	LEAQ 24(SP), DX
8505	PXOR X0, X0
8506
8507zero_loop_encodeBetterBlockAsm8B:
8508	MOVOU X0, (DX)
8509	MOVOU X0, 16(DX)
8510	MOVOU X0, 32(DX)
8511	MOVOU X0, 48(DX)
8512	MOVOU X0, 64(DX)
8513	MOVOU X0, 80(DX)
8514	MOVOU X0, 96(DX)
8515	MOVOU X0, 112(DX)
8516	ADDQ  $0x80, DX
8517	DECQ  CX
8518	JNZ   zero_loop_encodeBetterBlockAsm8B
8519	MOVL  $0x00000000, 12(SP)
8520	MOVQ  src_len+32(FP), CX
8521	LEAQ  -6(CX), DX
8522	LEAQ  -8(CX), SI
8523	MOVL  SI, 8(SP)
8524	SHRQ  $0x05, CX
8525	SUBL  CX, DX
8526	LEAQ  (AX)(DX*1), DX
8527	MOVQ  DX, (SP)
8528	MOVL  $0x00000001, CX
8529	MOVL  $0x00000000, 16(SP)
8530	MOVQ  src_base+24(FP), DX
8531
8532search_loop_encodeBetterBlockAsm8B:
8533	MOVL  CX, SI
8534	SUBL  12(SP), SI
8535	SHRL  $0x04, SI
8536	LEAL  1(CX)(SI*1), SI
8537	CMPL  SI, 8(SP)
8538	JGE   emit_remainder_encodeBetterBlockAsm8B
8539	MOVQ  (DX)(CX*1), DI
8540	MOVL  SI, 20(SP)
8541	MOVQ  $0x0000cf1bbcdcbf9b, R9
8542	MOVQ  $0x9e3779b1, SI
8543	MOVQ  DI, R10
8544	MOVQ  DI, R11
8545	SHLQ  $0x10, R10
8546	IMULQ R9, R10
8547	SHRQ  $0x36, R10
8548	SHLQ  $0x20, R11
8549	IMULQ SI, R11
8550	SHRQ  $0x38, R11
8551	MOVL  24(SP)(R10*4), SI
8552	MOVL  4120(SP)(R11*4), R8
8553	MOVL  CX, 24(SP)(R10*4)
8554	MOVL  CX, 4120(SP)(R11*4)
8555	CMPL  (DX)(SI*1), DI
8556	JEQ   candidate_match_encodeBetterBlockAsm8B
8557	CMPL  (DX)(R8*1), DI
8558	JEQ   candidateS_match_encodeBetterBlockAsm8B
8559	MOVL  20(SP), CX
8560	JMP   search_loop_encodeBetterBlockAsm8B
8561
8562candidateS_match_encodeBetterBlockAsm8B:
8563	SHRQ  $0x08, DI
8564	MOVQ  DI, R10
8565	SHLQ  $0x10, R10
8566	IMULQ R9, R10
8567	SHRQ  $0x36, R10
8568	MOVL  24(SP)(R10*4), SI
8569	INCL  CX
8570	MOVL  CX, 24(SP)(R10*4)
8571	CMPL  (DX)(SI*1), DI
8572	JEQ   candidate_match_encodeBetterBlockAsm8B
8573	DECL  CX
8574	MOVL  R8, SI
8575
8576candidate_match_encodeBetterBlockAsm8B:
8577	MOVL  12(SP), DI
8578	TESTL SI, SI
8579	JZ    match_extend_back_end_encodeBetterBlockAsm8B
8580
8581match_extend_back_loop_encodeBetterBlockAsm8B:
8582	CMPL CX, DI
8583	JLE  match_extend_back_end_encodeBetterBlockAsm8B
8584	MOVB -1(DX)(SI*1), BL
8585	MOVB -1(DX)(CX*1), R8
8586	CMPB BL, R8
8587	JNE  match_extend_back_end_encodeBetterBlockAsm8B
8588	LEAL -1(CX), CX
8589	DECL SI
8590	JZ   match_extend_back_end_encodeBetterBlockAsm8B
8591	JMP  match_extend_back_loop_encodeBetterBlockAsm8B
8592
8593match_extend_back_end_encodeBetterBlockAsm8B:
8594	MOVL CX, DI
8595	SUBL 12(SP), DI
8596	LEAQ 3(AX)(DI*1), DI
8597	CMPQ DI, (SP)
8598	JL   match_dst_size_check_encodeBetterBlockAsm8B
8599	MOVQ $0x00000000, ret+48(FP)
8600	RET
8601
match_dst_size_check_encodeBetterBlockAsm8B:
	// DI keeps the match start. Both positions are advanced by 4 before the
	// forward extension begins.
	// R8 = bytes left in src after CX, R9 = &src[CX], R10 = &src[SI].
	MOVL CX, DI
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), R10

	// matchLen
	// R12 counts the number of equal bytes at R9 and R10, limited by R8.
	XORL R12, R12
	CMPL R8, $0x08
	JL   matchlen_single_match_nolit_encodeBetterBlockAsm8B

matchlen_loopback_match_nolit_encodeBetterBlockAsm8B:
	// Compare 8 bytes at a time; a non-zero XOR means a mismatch in this word.
	MOVQ  (R9)(R12*1), R11
	XORQ  (R10)(R12*1), R11
	TESTQ R11, R11
	JZ    matchlen_loop_match_nolit_encodeBetterBlockAsm8B
	// Index of the lowest set bit divided by 8 = number of equal leading bytes.
	BSFQ  R11, R11
	SARQ  $0x03, R11
	LEAL  (R12)(R11*1), R12
	JMP   match_nolit_end_encodeBetterBlockAsm8B

matchlen_loop_match_nolit_encodeBetterBlockAsm8B:
	// All 8 bytes matched: consume them and continue while 8 or more remain.
	LEAL -8(R8), R8
	LEAL 8(R12), R12
	CMPL R8, $0x08
	JGE  matchlen_loopback_match_nolit_encodeBetterBlockAsm8B

matchlen_single_match_nolit_encodeBetterBlockAsm8B:
	// Fewer than 8 bytes remain: finish byte by byte.
	TESTL R8, R8
	JZ    match_nolit_end_encodeBetterBlockAsm8B

matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B:
	MOVB (R9)(R12*1), R11
	CMPB (R10)(R12*1), R11
	JNE  match_nolit_end_encodeBetterBlockAsm8B
	LEAL 1(R12), R12
	DECL R8
	JNZ  matchlen_single_loopback_match_nolit_encodeBetterBlockAsm8B
8643
match_nolit_end_encodeBetterBlockAsm8B:
	// R8 = match offset (CX - SI; both were advanced by the same 4 bytes).
	MOVL CX, R8
	SUBL SI, R8

	// Check if repeat
	// An offset equal to the one stored at 16(SP) takes the repeat path;
	// otherwise it becomes the new stored offset.
	CMPL 16(SP), R8
	JEQ  match_is_repeat_encodeBetterBlockAsm8B
	MOVL R8, 16(SP)
	// Emit the pending literals src[12(SP):DI]; skipped when the range is empty.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_encodeBetterBlockAsm8B
	// R9 = literal length, R10 = literal source pointer, SI = length-1.
	// 12(SP) is advanced to DI.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R10
	SUBL SI, R9
	LEAL -1(R9), SI
	// The header size depends on length-1: below 60 it takes one byte, below
	// 256 two bytes, otherwise three bytes (0xf4 plus 16-bit length-1).
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_encodeBetterBlockAsm8B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_encodeBetterBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeBetterBlockAsm8B

two_bytes_match_emit_encodeBetterBlockAsm8B:
	// Two-byte header: 0xf0 followed by the low byte of length-1.
	// Lengths with length-1 below 64 use the short copy routine, the rest the long one.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_encodeBetterBlockAsm8B
	JMP  memmove_long_match_emit_encodeBetterBlockAsm8B

one_byte_match_emit_encodeBetterBlockAsm8B:
	// One-byte header: (length-1) << 2. Falls through into the short copy routine.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX
8681
memmove_match_emit_encodeBetterBlockAsm8B:
	// SI = output cursor after the copy (AX + length in R9).
	LEAQ (AX)(R9*1), SI

	// genMemMoveShort
	// Copy R9 bytes from (R10) to (AX). Each size class loads a chunk from the
	// start and a chunk ending at the last byte (they may overlap) before
	// storing either, so no byte loop is needed.
	CMPQ R9, $0x03
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_1or2
	JE   emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_3
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_1or2:
	// First byte and last byte (the same byte when the length is 1).
	MOVB (R10), R11
	MOVB -1(R10)(R9*1), R10
	MOVB R11, (AX)
	MOVB R10, -1(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_3:
	// A 2-byte word followed by the third byte.
	MOVW (R10), R11
	MOVB 2(R10), R10
	MOVW R11, (AX)
	MOVB R10, 2(AX)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7:
	// First 4 bytes and last 4 bytes.
	MOVL (R10), R11
	MOVL -4(R10)(R9*1), R10
	MOVL R11, (AX)
	MOVL R10, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16:
	// First 8 bytes and last 8 bytes.
	MOVQ (R10), R11
	MOVQ -8(R10)(R9*1), R10
	MOVQ R11, (AX)
	MOVQ R10, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32:
	// First 16 bytes and last 16 bytes.
	MOVOU (R10), X0
	MOVOU -16(R10)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64:
	// First 32 bytes and last 32 bytes.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
	// Advance the output cursor past the copied bytes.
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_encodeBetterBlockAsm8B
8745
memmove_long_match_emit_encodeBetterBlockAsm8B:
	// SI = output cursor after the copy (AX + length in R9).
	LEAQ (AX)(R9*1), SI

	// genMemMoveLong
	// Copy R9 bytes from (R10) to (AX). The first and last 32 bytes are loaded
	// up front and stored last with unaligned moves; the middle is copied in
	// 32-byte steps with aligned stores.
	MOVOU (R10), X0
	MOVOU 16(R10), X1
	MOVOU -32(R10)(R9*1), X2
	MOVOU -16(R10)(R9*1), X3
	// R13 = length / 32.
	MOVQ  R9, R13
	SHRQ  $0x05, R13
	// R14 = 64 - (AX & 31), so that AX+R14-32 is a multiple of 32 and the
	// MOVOA stores below hit aligned addresses.
	MOVQ  AX, R11
	ANDL  $0x0000001f, R11
	MOVQ  $0x00000040, R14
	SUBQ  R11, R14
	// NOTE(review): both jumps into this routine happen with length-1 >= 0x40,
	// so R13 is at least 2 here and the JA below looks always taken, leaving
	// the big_loop_back block unused on this path — confirm.
	DECQ  R13
	JA    emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R10)(R14*1), R11
	LEAQ  -32(AX)(R14*1), R15

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ  $0x20, R15
	ADDQ  $0x20, R11
	ADDQ  $0x20, R14
	DECQ  R13
	JNA   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R14, then advance R14 by 32;
	// repeat while length >= R14.
	MOVOU -32(R10)(R14*1), X4
	MOVOU -16(R10)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ  $0x20, R14
	CMPQ  R9, R14
	JAE   emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	// Store the saved head and tail, covering whatever the aligned loop left out.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	// Advance the output cursor past the copied bytes.
	MOVQ  SI, AX
8789
emit_literal_done_match_emit_encodeBetterBlockAsm8B:
	// CX = end of the match, R12 = total match length (the 4 bytes skipped
	// before the forward extension plus the extension). 12(SP) is advanced to
	// the end of the match.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitCopy
	// R8 = offset, R12 = length. Lengths up to 64 go to the short forms.
	// Longer matches emit a 3-byte copy (tag 0xee, 16-bit offset), subtract 60
	// from the length and encode the rest as a repeat.
two_byte_offset_match_nolit_encodeBetterBlockAsm8B:
	CMPL R12, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R12), R12
	ADDQ $0x03, AX

	// emitRepeat
	// SI = remaining length, R12 = remaining length - 4.
	// Remaining lengths up to 8 take the 2-byte form. The generator also emitted
	// a "CMPL SI, $0x0c / JGE" pair whose target was the very next instruction;
	// it had no effect and has been dropped.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	// length-4 below 260: 3-byte form. Otherwise 4-byte form: the 16-bit word
	// 0x0019 followed by the 16-bit value length-4-256.
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	// 3-byte form: the 16-bit word 0x0015 followed by the byte length-8.
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
	// 2-byte form: the 16-bit word ((length-4) << 2) | 1.
	// The instructions the generator placed after this JMP had no label and
	// could never execute; they have been removed.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
8844
two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B:
	// Lengths of 12 or more take the 3-byte form below.
	CMPL R12, $0x0c
	JGE  emit_copy_three_match_nolit_encodeBetterBlockAsm8B
	// 2-byte form: tag = 1 + 4*(length-4), OR-ed with (offset >> 8) << 5,
	// followed by the low byte of the offset. Only BL is initialised; that is
	// sufficient because just the low byte of R12 is stored.
	MOVB $0x01, BL
	LEAL -16(BX)(R12*4), R12
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL  R8, R12
	MOVB R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
	// 3-byte form: tag = 2 + 4*(length-1), followed by the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(R12*4), R12
	MOVB R12, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
8865
match_is_repeat_encodeBetterBlockAsm8B:
	// The match offset equals the one stored at 16(SP).
	// Emit the pending literals src[12(SP):DI] first; skipped when the range is empty.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
	// R8 = literal length, R9 = literal source pointer, SI = length-1.
	// 12(SP) is advanced to DI.
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	// The header size depends on length-1: below 60 it takes one byte, below
	// 256 two bytes, otherwise three bytes (0xf4 plus 16-bit length-1).
	CMPL SI, $0x3c
	JLT  one_byte_match_emit_repeat_encodeBetterBlockAsm8B
	CMPL SI, $0x00000100
	JLT  two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm8B

two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
	// Two-byte header: 0xf0 followed by the low byte of length-1.
	// Lengths with length-1 below 64 use the short copy routine, the rest the long one.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_match_emit_repeat_encodeBetterBlockAsm8B
	JMP  memmove_long_match_emit_repeat_encodeBetterBlockAsm8B

one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
	// One-byte header: (length-1) << 2. Falls through into the short copy routine.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX
8896
memmove_match_emit_repeat_encodeBetterBlockAsm8B:
	// SI = output cursor after the copy (AX + length in R8).
	LEAQ (AX)(R8*1), SI

	// genMemMoveShort
	// Copy R8 bytes from (R9) to (AX). Each size class loads a chunk from the
	// start and a chunk ending at the last byte (they may overlap) before
	// storing either.
	CMPQ R8, $0x03
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_1or2
	JE   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_3
	CMPQ R8, $0x08
	JB   emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_1or2:
	// First byte and last byte (the same byte when the length is 1).
	MOVB (R9), R10
	MOVB -1(R9)(R8*1), R9
	MOVB R10, (AX)
	MOVB R9, -1(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_3:
	// A 2-byte word followed by the third byte.
	MOVW (R9), R10
	MOVB 2(R9), R9
	MOVW R10, (AX)
	MOVB R9, 2(AX)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7:
	// First 4 bytes and last 4 bytes.
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (AX)
	MOVL R9, -4(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16:
	// First 8 bytes and last 8 bytes.
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32:
	// First 16 bytes and last 16 bytes.
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64:
	// First 32 bytes and last 32 bytes.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
	// Advance the output cursor past the copied bytes.
	MOVQ SI, AX
	JMP  emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
8960
memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
	// SI = output cursor after the copy (AX + length in R8).
	LEAQ (AX)(R8*1), SI

	// genMemMoveLong
	// Copy R8 bytes from (R9) to (AX). The first and last 32 bytes are loaded
	// up front and stored last with unaligned moves; the middle is copied in
	// 32-byte steps with aligned stores.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	// R11 = length / 32.
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	// R13 = 64 - (AX & 31), so that AX+R13-32 is a multiple of 32 and the
	// MOVOA stores below hit aligned addresses.
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R13
	SUBQ  R10, R13
	// NOTE(review): both jumps into this routine happen with length-1 >= 0x40,
	// so R11 is at least 2 here and the JA below looks always taken, leaving
	// the big_loop_back block unused on this path — confirm.
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R13*1), R10
	LEAQ  -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ  $0x20, R14
	ADDQ  $0x20, R10
	ADDQ  $0x20, R13
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R13, then advance R13 by 32;
	// repeat while length >= R13.
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ  $0x20, R13
	CMPQ  R8, R13
	JAE   emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	// Store the saved head and tail, covering whatever the aligned loop left out.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	// Advance the output cursor past the copied bytes.
	MOVQ  SI, AX
9004
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
	// CX = end of the match, R12 = total match length (the 4 bytes skipped
	// before the forward extension plus the extension). 12(SP) is advanced to
	// the end of the match.
	ADDL R12, CX
	ADDL $0x04, R12
	MOVL CX, 12(SP)

	// emitRepeat
	// SI = length, R12 = length - 4. Lengths up to 8 take the 2-byte form.
	// The generator also emitted a "CMPL SI, $0x0c / JGE" pair whose target was
	// the very next instruction; it had no effect and has been dropped.
	MOVL R12, SI
	LEAL -4(R12), R12
	CMPL SI, $0x08
	JLE  repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
	// length-4 below 260: 3-byte form. Otherwise 4-byte form: the 16-bit word
	// 0x0019 followed by the 16-bit value length-4-256.
	CMPL R12, $0x00000104
	JLT  repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
	LEAL -256(R12), R12
	MOVW $0x0019, (AX)
	MOVW R12, 2(AX)
	ADDQ $0x04, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
	// 3-byte form: the 16-bit word 0x0015 followed by the byte length-8.
	LEAL -4(R12), R12
	MOVW $0x0015, (AX)
	MOVB R12, 2(AX)
	ADDQ $0x03, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B

repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
	// 2-byte form: the 16-bit word ((length-4) << 2) | 1.
	// The instructions the generator placed after this JMP had no label and
	// could never execute; they have been removed.
	SHLL $0x02, R12
	ORL  $0x01, R12
	MOVW R12, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B
9048
match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
	// Once CX has reached the limit stored at 8(SP), only the remainder is
	// left to emit.
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeBetterBlockAsm8B
	// Return 0 when the output cursor is not below the limit pointer at (SP).
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET
9056
match_nolit_dst_ok_encodeBetterBlockAsm8B:
	// Index positions around the match just emitted. Two tables are updated:
	//   24(SP):   indexed by a 10-bit hash of 6 source bytes
	//             (value << 16, times SI, top 10 bits);
	//   4120(SP): indexed by an 8-bit hash of 4 source bytes
	//             (value << 32, times R8, top 8 bits). 4120 = 24 + 1024*4,
	//             i.e. it starts right after the first table.
	MOVQ  $0x0000cf1bbcdcbf9b, SI
	MOVQ  $0x9e3779b1, R8
	// DI = match start + 1; R9 = the 8 bytes at src[DI].
	INCL  DI
	MOVQ  (DX)(DI*1), R9
	// R10 = bytes at DI, R11 and R13 = bytes at DI+1, R12 = bytes at DI+2.
	MOVQ  R9, R10
	MOVQ  R9, R11
	MOVQ  R9, R12
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	SHRQ  $0x10, R12
	LEAL  1(DI), R14
	LEAL  2(DI), R15
	// R9 is reloaded with the 8 bytes at src[CX-2] for the second group below.
	MOVQ  -2(DX)(CX*1), R9
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x36, R10
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x36, R13
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x38, R11
	SHLQ  $0x20, R12
	IMULQ R8, R12
	SHRQ  $0x38, R12
	// 6-byte table: DI and DI+1. 4-byte table: DI+1 and DI+2.
	MOVL  DI, 24(SP)(R10*4)
	MOVL  R14, 24(SP)(R13*4)
	MOVL  R14, 4120(SP)(R11*4)
	MOVL  R15, 4120(SP)(R12*4)
	// Second group: R10 = bytes at CX-2, R11 and R13 = bytes at CX-1.
	MOVQ  R9, R10
	MOVQ  R9, R11
	SHRQ  $0x08, R11
	MOVQ  R11, R13
	LEAL  -2(CX), R9
	LEAL  -1(CX), DI
	SHLQ  $0x10, R10
	IMULQ SI, R10
	SHRQ  $0x36, R10
	SHLQ  $0x20, R11
	IMULQ R8, R11
	SHRQ  $0x38, R11
	SHLQ  $0x10, R13
	IMULQ SI, R13
	SHRQ  $0x36, R13
	// 6-byte table: CX-2 and CX-1. 4-byte table: CX-1.
	MOVL  R9, 24(SP)(R10*4)
	MOVL  DI, 4120(SP)(R11*4)
	MOVL  DI, 24(SP)(R13*4)
	JMP   search_loop_encodeBetterBlockAsm8B
9106
emit_remainder_encodeBetterBlockAsm8B:
	// Destination bound check for the trailing literals:
	// CX = AX + (src_len - 12(SP)) + 3. Return 0 if that is not below the limit
	// pointer stored at (SP).
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeBetterBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET
9115
emit_remainder_ok_encodeBetterBlockAsm8B:
	// Emit src[12(SP):src_len] as one literal run; nothing to do when empty.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
	// SI = literal length, CX = literal source pointer, DX = length-1.
	// DX no longer holds the source base from here on.
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// The header size depends on length-1: below 60 it takes one byte, below
	// 256 two bytes, otherwise three bytes (0xf4 plus 16-bit length-1).
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeBetterBlockAsm8B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeBetterBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm8B

two_bytes_emit_remainder_encodeBetterBlockAsm8B:
	// Two-byte header: 0xf0 followed by the low byte of length-1.
	// Lengths with length-1 below 64 use the short copy routine, the rest the long one.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeBetterBlockAsm8B
	JMP  memmove_long_emit_remainder_encodeBetterBlockAsm8B

one_byte_emit_remainder_encodeBetterBlockAsm8B:
	// One-byte header: (length-1) << 2. Falls through into the short copy routine.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX
9147
memmove_emit_remainder_encodeBetterBlockAsm8B:
	// DX = output cursor after the copy (AX + length); BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Copy BX bytes from (CX) to (AX). Each size class loads a chunk from the
	// start and a chunk ending at the last byte (they may overlap) before
	// storing either.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2:
	// First byte and last byte (the same byte when the length is 1).
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3:
	// A 2-byte word followed by the third byte.
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7:
	// First 4 bytes and last 4 bytes.
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16:
	// First 8 bytes and last 8 bytes.
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32:
	// First 16 bytes and last 16 bytes.
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64:
	// First 32 bytes and last 32 bytes.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B:
	// Advance the output cursor past the copied bytes.
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
9212
memmove_long_emit_remainder_encodeBetterBlockAsm8B:
	// DX = output cursor after the copy (AX + length); BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Copy BX bytes from (CX) to (AX). The first and last 32 bytes are loaded
	// up front and stored last with unaligned moves; the middle is copied in
	// 32-byte steps with aligned stores.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	// DI = length / 32.
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	// R8 = 64 - (AX & 31), so that AX+R8-32 is a multiple of 32 and the
	// MOVOA stores below hit aligned addresses.
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	// NOTE(review): both jumps into this routine happen with length-1 >= 0x40,
	// so DI is at least 2 here and the JA below looks always taken, leaving
	// the big_loop_back block unused on this path — confirm.
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R8, then advance R8 by 32;
	// repeat while length >= R8.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
	// Store the saved head and tail, covering whatever the aligned loop left out.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	// Advance the output cursor past the copied bytes.
	MOVQ  DX, AX
9257
emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
	// Return the number of bytes written: output cursor minus dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET
9263
// func encodeSnappyBlockAsm(dst []byte, src []byte) int
// Requires: SSE2
//
// Stack slots set up by this prologue:
//   (SP)    pointer limit for the output cursor: dst_base + (src_len - 5 - src_len>>5)
//   8(SP)   src_len - 8, the limit for the search position
//   12(SP)  start of the bytes not yet emitted (initially 0)
//   16(SP)  last match offset (initially 1)
//   24(SP)  65536-byte table, cleared to zero
// Registers: AX = output cursor, CX = search position (initially 1),
// DX = src_base.
TEXT ·encodeSnappyBlockAsm(SB), $65560-56
	MOVQ dst_base+0(FP), AX
	// Clear the table: 0x200 iterations of 128 bytes each.
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBlockAsm
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -5(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX
9296
search_loop_encodeSnappyBlockAsm:
	// SI = CX + 4 + ((CX - 12(SP)) >> 6): the position to try next if nothing
	// matches here. Once it reaches the limit at 8(SP), emit the remainder.
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x06, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBlockAsm
	// DI = the 8 bytes at src[CX]; the next position is saved at 20(SP).
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	// 14-bit hash of 6 bytes: value << 16, times R9, top 14 bits.
	// R10 = hash of the bytes at CX, R11 = hash of the bytes at CX+1.
	MOVQ  $0x0000cf1bbcdcbf9b, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	SHLQ  $0x10, R11
	IMULQ R9, R11
	SHRQ  $0x32, R11
	// SI and R8 = previous table entries (candidates); the slots are then
	// overwritten with CX and CX+1.
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)
	// R10 = hash of the bytes at CX+2, used later for a third candidate.
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x10, R10
	IMULQ R9, R10
	SHRQ  $0x32, R10
	// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes at
	// CX+1-16(SP), i.e. one stored offset back.
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeSnappyBlockAsm
	// Repeat found at CX+1. DI = match start, SI = 12(SP),
	// R8 = candidate position (DI - offset); skip the backward extension when
	// the candidate is at index 0.
	LEAL  1(CX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm
9338
repeat_extend_back_loop_encodeSnappyBlockAsm:
	// Extend the repeat match backwards while DI stays above the lower bound
	// in SI, the candidate index R8 is non-zero and the preceding bytes are equal.
	CMPL DI, SI
	JLE  repeat_extend_back_end_encodeSnappyBlockAsm
	MOVB -1(DX)(R8*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm
9349
repeat_extend_back_end_encodeSnappyBlockAsm:
	// Emit the pending literals src[12(SP):DI]; skipped when the range is empty.
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm
	// R8 = literal length, R9 = literal source pointer, SI = length-1.
	// 12(SP) is advanced to DI.
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	// The header size depends on length-1: below 60 one byte, below 2^8 two,
	// below 2^16 three, below 2^24 four, otherwise five (0xfc plus 32-bit length-1).
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm
	CMPL SI, $0x00010000
	JLT  three_bytes_repeat_emit_encodeSnappyBlockAsm
	CMPL SI, $0x01000000
	JLT  four_bytes_repeat_emit_encodeSnappyBlockAsm
	MOVB $0xfc, (AX)
	MOVL SI, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

four_bytes_repeat_emit_encodeSnappyBlockAsm:
	// Four-byte header: 0xf8, the low 16 bits of length-1, then bits 16..23.
	MOVL SI, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

three_bytes_repeat_emit_encodeSnappyBlockAsm:
	// Three-byte header: 0xf4 followed by the 16-bit length-1.
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

two_bytes_repeat_emit_encodeSnappyBlockAsm:
	// Two-byte header: 0xf0 followed by the low byte of length-1.
	// Lengths with length-1 below 64 use the short copy routine, the rest the long one.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeSnappyBlockAsm
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm

one_byte_repeat_emit_encodeSnappyBlockAsm:
	// One-byte header: (length-1) << 2. Falls through into the short copy routine.
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX
9399
memmove_repeat_emit_encodeSnappyBlockAsm:
	// SI = output cursor after the copy (AX + length in R8).
	LEAQ (AX)(R8*1), SI

	// genMemMoveShort
	// Copy R8 bytes from (R9) to (AX). Each size class loads a chunk from the
	// start and a chunk ending at the last byte (they may overlap) before
	// storing either.
	CMPQ R8, $0x03
	JB   emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_1or2
	JE   emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_3
	CMPQ R8, $0x08
	JB   emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_4through7
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_1or2:
	// First byte and last byte (the same byte when the length is 1).
	MOVB (R9), R10
	MOVB -1(R9)(R8*1), R9
	MOVB R10, (AX)
	MOVB R9, -1(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_3:
	// A 2-byte word followed by the third byte.
	MOVW (R9), R10
	MOVB 2(R9), R9
	MOVW R10, (AX)
	MOVB R9, 2(AX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_4through7:
	// First 4 bytes and last 4 bytes.
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (AX)
	MOVL R9, -4(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
	// First 8 bytes and last 8 bytes.
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
	// First 16 bytes and last 16 bytes.
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
	// First 32 bytes and last 32 bytes.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
	// Advance the output cursor past the copied bytes.
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm
9463
memmove_long_repeat_emit_encodeSnappyBlockAsm:
	// SI = output cursor after the copy (AX + length in R8).
	LEAQ (AX)(R8*1), SI

	// genMemMoveLong
	// Copy R8 bytes from (R9) to (AX). The first and last 32 bytes are loaded
	// up front and stored last with unaligned moves; the middle is copied in
	// 32-byte steps with aligned stores.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	// R11 = length / 32.
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	// R12 = 64 - (AX & 31), so that AX+R12-32 is a multiple of 32 and the
	// MOVOA stores below hit aligned addresses.
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	// NOTE(review): every jump into this routine happens with length-1 >= 0x40,
	// so R11 is at least 2 here and the JA below looks always taken, leaving
	// the big_loop_back block unused on this path — confirm.
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
	// Copy the 32 bytes ending at offset R12, then advance R12 by 32;
	// repeat while length >= R12.
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
	// Store the saved head and tail, covering whatever the aligned loop left out.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	// Advance the output cursor past the copied bytes.
	MOVQ  SI, AX
9507
emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
	// Skip the byte before the repeat position plus the 4 bytes already
	// compared, then measure how far the match continues.
	// R8 = bytes left in src after CX, R9 = &src[CX], SI = &src[CX - 16(SP)].
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R11 counts the number of equal bytes at R9 and SI, limited by R8.
	XORL R11, R11
	CMPL R8, $0x08
	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm

matchlen_loopback_repeat_extend_encodeSnappyBlockAsm:
	// Compare 8 bytes at a time; a non-zero XOR means a mismatch in this word.
	MOVQ  (R9)(R11*1), R10
	XORQ  (SI)(R11*1), R10
	TESTQ R10, R10
	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm
	// Index of the lowest set bit divided by 8 = number of equal leading bytes.
	BSFQ  R10, R10
	SARQ  $0x03, R10
	LEAL  (R11)(R10*1), R11
	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm

matchlen_loop_repeat_extend_encodeSnappyBlockAsm:
	// All 8 bytes matched: consume them and continue while 8 or more remain.
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	CMPL R8, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm

matchlen_single_repeat_extend_encodeSnappyBlockAsm:
	// Fewer than 8 bytes remain: finish byte by byte.
	TESTL R8, R8
	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm

matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm
	LEAL 1(R11), R11
	DECL R8
	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm
9549
repeat_extend_forward_end_encodeSnappyBlockAsm:
	// CX = end of the match, SI = match length (CX - match start in DI),
	// then DI = offset reloaded from 16(SP).
	ADDL R11, CX
	MOVL CX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
	// Offsets below 65536 use the forms with a 16-bit (or shorter) offset;
	// larger offsets use the 5-byte form with a 32-bit offset.
	CMPL DI, $0x00010000
	JL   two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm

four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
	// While more than 64 bytes remain, emit a copy with tag 0xff and the
	// 32-bit offset and subtract 64 from the length.
	CMPL SI, $0x40
	JLE  four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
	MOVB $0xff, (AX)
	MOVL DI, 1(AX)
	LEAL -64(SI), SI
	ADDQ $0x05, AX
	CMPL SI, $0x04
	JL   four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
	JMP  four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm

four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
	// Final copy, unless nothing is left: tag = 3 + 4*(length-1), followed by
	// the 32-bit offset. Only BL is initialised; that is sufficient because
	// just the low byte of SI is stored.
	TESTL SI, SI
	JZ    repeat_end_emit_encodeSnappyBlockAsm
	MOVB  $0x03, BL
	LEAL  -4(BX)(SI*4), SI
	MOVB  SI, (AX)
	MOVL  DI, 1(AX)
	ADDQ  $0x05, AX
	JMP   repeat_end_emit_encodeSnappyBlockAsm

two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
	// While more than 64 bytes remain, emit a 3-byte copy (tag 0xee, 16-bit
	// offset) and subtract 60 from the length.
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
	// The 2-byte form is used only for lengths below 12 with offsets below 2048.
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
	CMPL DI, $0x00000800
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
	// tag = 1 + 4*(length-4), OR-ed with (offset >> 8) << 5, followed by the
	// low byte of the offset.
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeSnappyBlockAsm

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
	// 3-byte form: tag = 2 + 4*(length-1), followed by the 16-bit offset.
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeSnappyBlockAsm:
	// Everything up to CX has been emitted; resume searching there.
	MOVL CX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm
9615
no_repeat_found_encodeSnappyBlockAsm:
	// First candidate (SI): compare its 4 bytes with the 4 bytes at CX.
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeSnappyBlockAsm
	// Second candidate (R8): compare with the 4 bytes at CX+1. SI is loaded
	// with the table entry for the hash in R10 and R9 = CX+2 for the third try.
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeSnappyBlockAsm
	// Third candidate (SI): store CX+2 in its table slot and compare with the
	// 4 bytes at CX+2.
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeSnappyBlockAsm
	// Nothing matched: continue at the position saved at 20(SP).
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBlockAsm

candidate3_match_encodeSnappyBlockAsm:
	// The match is at CX+2.
	ADDL $0x02, CX
	JMP  candidate_match_encodeSnappyBlockAsm

candidate2_match_encodeSnappyBlockAsm:
	// The match is at CX+1 with candidate R8; the table slot for the hash in
	// R10 still receives CX+2.
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI
9639
candidate_match_encodeSnappyBlockAsm:
	// Extend the match backwards. CX = current position, SI = candidate
	// position, DI = value stored at 12(SP) (lower bound for CX).
	// Nothing to extend when the candidate is at index 0.
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBlockAsm

match_extend_back_loop_encodeSnappyBlockAsm:
	// Stop once CX has reached the lower bound in DI.
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBlockAsm
	// Compare the bytes immediately preceding both positions.
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBlockAsm
	// Bytes are equal: move both positions back by one; stop if the
	// candidate reaches index 0.
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBlockAsm
	JMP  match_extend_back_loop_encodeSnappyBlockAsm
9656
match_extend_back_end_encodeSnappyBlockAsm:
	// Destination bound check: DI = AX + (CX - 12(SP)) + 5, i.e. the output
	// cursor plus the number of pending bytes between 12(SP) and CX plus 5.
	// If that is not below the limit pointer stored at (SP), return 0.
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 5(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET
9665
match_dst_size_check_encodeSnappyBlockAsm:
	// Emit the pending literals src[12(SP):CX]; skipped when the range is empty.
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm
	// R9 = literal length, DI = literal source pointer, R8 = length-1.
	// 12(SP) is advanced to the match start.
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	// The header size depends on length-1: below 60 one byte, below 2^8 two,
	// below 2^16 three, below 2^24 four, otherwise five (0xfc plus 32-bit length-1).
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBlockAsm
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBlockAsm
	CMPL R8, $0x00010000
	JLT  three_bytes_match_emit_encodeSnappyBlockAsm
	CMPL R8, $0x01000000
	JLT  four_bytes_match_emit_encodeSnappyBlockAsm
	MOVB $0xfc, (AX)
	MOVL R8, 1(AX)
	ADDQ $0x05, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

four_bytes_match_emit_encodeSnappyBlockAsm:
	// Four-byte header: 0xf8, the low 16 bits of length-1, then bits 16..23.
	MOVL R8, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW R8, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

three_bytes_match_emit_encodeSnappyBlockAsm:
	// Three-byte header: 0xf4 followed by the 16-bit length-1.
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

two_bytes_match_emit_encodeSnappyBlockAsm:
	// Two-byte header: 0xf0 followed by the low byte of length-1.
	// Lengths with length-1 below 64 use the short copy routine, the rest the long one.
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeSnappyBlockAsm
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm

one_byte_match_emit_encodeSnappyBlockAsm:
	// One-byte header: (length-1) << 2. Falls through into the short copy routine.
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX
9716
9717memmove_match_emit_encodeSnappyBlockAsm:
9718	LEAQ (AX)(R9*1), R8
9719
9720	// genMemMoveShort
9721	CMPQ R9, $0x03
9722	JB   emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_1or2
9723	JE   emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_3
9724	CMPQ R9, $0x08
9725	JB   emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_4through7
9726	CMPQ R9, $0x10
9727	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
9728	CMPQ R9, $0x20
9729	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
9730	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64
9731
9732emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_1or2:
9733	MOVB (DI), R10
9734	MOVB -1(DI)(R9*1), DI
9735	MOVB R10, (AX)
9736	MOVB DI, -1(AX)(R9*1)
9737	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm
9738
9739emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_3:
9740	MOVW (DI), R10
9741	MOVB 2(DI), DI
9742	MOVW R10, (AX)
9743	MOVB DI, 2(AX)
9744	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm
9745
9746emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_4through7:
9747	MOVL (DI), R10
9748	MOVL -4(DI)(R9*1), DI
9749	MOVL R10, (AX)
9750	MOVL DI, -4(AX)(R9*1)
9751	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm
9752
9753emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
9754	MOVQ (DI), R10
9755	MOVQ -8(DI)(R9*1), DI
9756	MOVQ R10, (AX)
9757	MOVQ DI, -8(AX)(R9*1)
9758	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm
9759
9760emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
9761	MOVOU (DI), X0
9762	MOVOU -16(DI)(R9*1), X1
9763	MOVOU X0, (AX)
9764	MOVOU X1, -16(AX)(R9*1)
9765	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm
9766
9767emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
9768	MOVOU (DI), X0
9769	MOVOU 16(DI), X1
9770	MOVOU -32(DI)(R9*1), X2
9771	MOVOU -16(DI)(R9*1), X3
9772	MOVOU X0, (AX)
9773	MOVOU X1, 16(AX)
9774	MOVOU X2, -32(AX)(R9*1)
9775	MOVOU X3, -16(AX)(R9*1)
9776
9777memmove_end_copy_match_emit_encodeSnappyBlockAsm:
9778	MOVQ R8, AX
9779	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm
9780
9781memmove_long_match_emit_encodeSnappyBlockAsm:
9782	LEAQ (AX)(R9*1), R8
9783
9784	// genMemMoveLong
9785	MOVOU (DI), X0
9786	MOVOU 16(DI), X1
9787	MOVOU -32(DI)(R9*1), X2
9788	MOVOU -16(DI)(R9*1), X3
9789	MOVQ  R9, R11
9790	SHRQ  $0x05, R11
9791	MOVQ  AX, R10
9792	ANDL  $0x0000001f, R10
9793	MOVQ  $0x00000040, R12
9794	SUBQ  R10, R12
9795	DECQ  R11
9796	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
9797	LEAQ  -32(DI)(R12*1), R10
9798	LEAQ  -32(AX)(R12*1), R13
9799
9800emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
9801	MOVOU (R10), X4
9802	MOVOU 16(R10), X5
9803	MOVOA X4, (R13)
9804	MOVOA X5, 16(R13)
9805	ADDQ  $0x20, R13
9806	ADDQ  $0x20, R10
9807	ADDQ  $0x20, R12
9808	DECQ  R11
9809	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back
9810
9811emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
9812	MOVOU -32(DI)(R12*1), X4
9813	MOVOU -16(DI)(R12*1), X5
9814	MOVOA X4, -32(AX)(R12*1)
9815	MOVOA X5, -16(AX)(R12*1)
9816	ADDQ  $0x20, R12
9817	CMPQ  R9, R12
9818	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
9819	MOVOU X0, (AX)
9820	MOVOU X1, 16(AX)
9821	MOVOU X2, -32(AX)(R9*1)
9822	MOVOU X3, -16(AX)(R9*1)
9823	MOVQ  R8, AX
9824
9825emit_literal_done_match_emit_encodeSnappyBlockAsm:
9826match_nolit_loop_encodeSnappyBlockAsm:
9827	MOVL CX, DI
9828	SUBL SI, DI
9829	MOVL DI, 16(SP)
9830	ADDL $0x04, CX
9831	ADDL $0x04, SI
9832	MOVQ src_len+32(FP), DI
9833	SUBL CX, DI
9834	LEAQ (DX)(CX*1), R8
9835	LEAQ (DX)(SI*1), SI
9836
9837	// matchLen
9838	XORL R10, R10
9839	CMPL DI, $0x08
9840	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm
9841
9842matchlen_loopback_match_nolit_encodeSnappyBlockAsm:
9843	MOVQ  (R8)(R10*1), R9
9844	XORQ  (SI)(R10*1), R9
9845	TESTQ R9, R9
9846	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm
9847	BSFQ  R9, R9
9848	SARQ  $0x03, R9
9849	LEAL  (R10)(R9*1), R10
9850	JMP   match_nolit_end_encodeSnappyBlockAsm
9851
9852matchlen_loop_match_nolit_encodeSnappyBlockAsm:
9853	LEAL -8(DI), DI
9854	LEAL 8(R10), R10
9855	CMPL DI, $0x08
9856	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm
9857
9858matchlen_single_match_nolit_encodeSnappyBlockAsm:
9859	TESTL DI, DI
9860	JZ    match_nolit_end_encodeSnappyBlockAsm
9861
9862matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm:
9863	MOVB (R8)(R10*1), R9
9864	CMPB (SI)(R10*1), R9
9865	JNE  match_nolit_end_encodeSnappyBlockAsm
9866	LEAL 1(R10), R10
9867	DECL DI
9868	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm
9869
9870match_nolit_end_encodeSnappyBlockAsm:
9871	ADDL R10, CX
9872	MOVL 16(SP), SI
9873	ADDL $0x04, R10
9874	MOVL CX, 12(SP)
9875
9876	// emitCopy
9877	CMPL SI, $0x00010000
9878	JL   two_byte_offset_match_nolit_encodeSnappyBlockAsm
9879
9880four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
9881	CMPL R10, $0x40
9882	JLE  four_bytes_remain_match_nolit_encodeSnappyBlockAsm
9883	MOVB $0xff, (AX)
9884	MOVL SI, 1(AX)
9885	LEAL -64(R10), R10
9886	ADDQ $0x05, AX
9887	CMPL R10, $0x04
9888	JL   four_bytes_remain_match_nolit_encodeSnappyBlockAsm
9889	JMP  four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm
9890
9891four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
9892	TESTL R10, R10
9893	JZ    match_nolit_emitcopy_end_encodeSnappyBlockAsm
9894	MOVB  $0x03, BL
9895	LEAL  -4(BX)(R10*4), R10
9896	MOVB  R10, (AX)
9897	MOVL  SI, 1(AX)
9898	ADDQ  $0x05, AX
9899	JMP   match_nolit_emitcopy_end_encodeSnappyBlockAsm
9900
9901two_byte_offset_match_nolit_encodeSnappyBlockAsm:
9902	CMPL R10, $0x40
9903	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
9904	MOVB $0xee, (AX)
9905	MOVW SI, 1(AX)
9906	LEAL -60(R10), R10
9907	ADDQ $0x03, AX
9908	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm
9909
9910two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
9911	CMPL R10, $0x0c
9912	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm
9913	CMPL SI, $0x00000800
9914	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm
9915	MOVB $0x01, BL
9916	LEAL -16(BX)(R10*4), R10
9917	MOVB SI, 1(AX)
9918	SHRL $0x08, SI
9919	SHLL $0x05, SI
9920	ORL  SI, R10
9921	MOVB R10, (AX)
9922	ADDQ $0x02, AX
9923	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm
9924
9925emit_copy_three_match_nolit_encodeSnappyBlockAsm:
9926	MOVB $0x02, BL
9927	LEAL -4(BX)(R10*4), R10
9928	MOVB R10, (AX)
9929	MOVW SI, 1(AX)
9930	ADDQ $0x03, AX
9931
9932match_nolit_emitcopy_end_encodeSnappyBlockAsm:
9933	CMPL CX, 8(SP)
9934	JGE  emit_remainder_encodeSnappyBlockAsm
9935	MOVQ -2(DX)(CX*1), DI
9936	CMPQ AX, (SP)
9937	JL   match_nolit_dst_ok_encodeSnappyBlockAsm
9938	MOVQ $0x00000000, ret+48(FP)
9939	RET
9940
9941match_nolit_dst_ok_encodeSnappyBlockAsm:
9942	MOVQ  $0x0000cf1bbcdcbf9b, R9
9943	MOVQ  DI, R8
9944	SHRQ  $0x10, DI
9945	MOVQ  DI, SI
9946	SHLQ  $0x10, R8
9947	IMULQ R9, R8
9948	SHRQ  $0x32, R8
9949	SHLQ  $0x10, SI
9950	IMULQ R9, SI
9951	SHRQ  $0x32, SI
9952	LEAL  -2(CX), R9
9953	LEAQ  24(SP)(SI*4), R10
9954	MOVL  (R10), SI
9955	MOVL  R9, 24(SP)(R8*4)
9956	MOVL  CX, (R10)
9957	CMPL  (DX)(SI*1), DI
9958	JEQ   match_nolit_loop_encodeSnappyBlockAsm
9959	INCL  CX
9960	JMP   search_loop_encodeSnappyBlockAsm
9961
9962emit_remainder_encodeSnappyBlockAsm:
9963	MOVQ src_len+32(FP), CX
9964	SUBL 12(SP), CX
9965	LEAQ 5(AX)(CX*1), CX
9966	CMPQ CX, (SP)
9967	JL   emit_remainder_ok_encodeSnappyBlockAsm
9968	MOVQ $0x00000000, ret+48(FP)
9969	RET
9970
9971emit_remainder_ok_encodeSnappyBlockAsm:
9972	MOVQ src_len+32(FP), CX
9973	MOVL 12(SP), BX
9974	CMPL BX, CX
9975	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm
9976	MOVL CX, SI
9977	MOVL CX, 12(SP)
9978	LEAQ (DX)(BX*1), CX
9979	SUBL BX, SI
9980	LEAL -1(SI), DX
9981	CMPL DX, $0x3c
9982	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm
9983	CMPL DX, $0x00000100
9984	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm
9985	CMPL DX, $0x00010000
9986	JLT  three_bytes_emit_remainder_encodeSnappyBlockAsm
9987	CMPL DX, $0x01000000
9988	JLT  four_bytes_emit_remainder_encodeSnappyBlockAsm
9989	MOVB $0xfc, (AX)
9990	MOVL DX, 1(AX)
9991	ADDQ $0x05, AX
9992	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm
9993
9994four_bytes_emit_remainder_encodeSnappyBlockAsm:
9995	MOVL DX, BX
9996	SHRL $0x10, BX
9997	MOVB $0xf8, (AX)
9998	MOVW DX, 1(AX)
9999	MOVB BL, 3(AX)
10000	ADDQ $0x04, AX
10001	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm
10002
10003three_bytes_emit_remainder_encodeSnappyBlockAsm:
10004	MOVB $0xf4, (AX)
10005	MOVW DX, 1(AX)
10006	ADDQ $0x03, AX
10007	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm
10008
10009two_bytes_emit_remainder_encodeSnappyBlockAsm:
10010	MOVB $0xf0, (AX)
10011	MOVB DL, 1(AX)
10012	ADDQ $0x02, AX
10013	CMPL DX, $0x40
10014	JL   memmove_emit_remainder_encodeSnappyBlockAsm
10015	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm
10016
10017one_byte_emit_remainder_encodeSnappyBlockAsm:
10018	SHLB $0x02, DL
10019	MOVB DL, (AX)
10020	ADDQ $0x01, AX
10021
10022memmove_emit_remainder_encodeSnappyBlockAsm:
10023	LEAQ (AX)(SI*1), DX
10024	MOVL SI, BX
10025
10026	// genMemMoveShort
10027	CMPQ BX, $0x03
10028	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2
10029	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3
10030	CMPQ BX, $0x08
10031	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7
10032	CMPQ BX, $0x10
10033	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
10034	CMPQ BX, $0x20
10035	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
10036	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64
10037
10038emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2:
10039	MOVB (CX), SI
10040	MOVB -1(CX)(BX*1), CL
10041	MOVB SI, (AX)
10042	MOVB CL, -1(AX)(BX*1)
10043	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
10044
10045emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3:
10046	MOVW (CX), SI
10047	MOVB 2(CX), CL
10048	MOVW SI, (AX)
10049	MOVB CL, 2(AX)
10050	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
10051
10052emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7:
10053	MOVL (CX), SI
10054	MOVL -4(CX)(BX*1), CX
10055	MOVL SI, (AX)
10056	MOVL CX, -4(AX)(BX*1)
10057	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
10058
10059emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
10060	MOVQ (CX), SI
10061	MOVQ -8(CX)(BX*1), CX
10062	MOVQ SI, (AX)
10063	MOVQ CX, -8(AX)(BX*1)
10064	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
10065
10066emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32:
10067	MOVOU (CX), X0
10068	MOVOU -16(CX)(BX*1), X1
10069	MOVOU X0, (AX)
10070	MOVOU X1, -16(AX)(BX*1)
10071	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
10072
10073emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64:
10074	MOVOU (CX), X0
10075	MOVOU 16(CX), X1
10076	MOVOU -32(CX)(BX*1), X2
10077	MOVOU -16(CX)(BX*1), X3
10078	MOVOU X0, (AX)
10079	MOVOU X1, 16(AX)
10080	MOVOU X2, -32(AX)(BX*1)
10081	MOVOU X3, -16(AX)(BX*1)
10082
10083memmove_end_copy_emit_remainder_encodeSnappyBlockAsm:
10084	MOVQ DX, AX
10085	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm
10086
10087memmove_long_emit_remainder_encodeSnappyBlockAsm:
10088	LEAQ (AX)(SI*1), DX
10089	MOVL SI, BX
10090
10091	// genMemMoveLong
10092	MOVOU (CX), X0
10093	MOVOU 16(CX), X1
10094	MOVOU -32(CX)(BX*1), X2
10095	MOVOU -16(CX)(BX*1), X3
10096	MOVQ  BX, DI
10097	SHRQ  $0x05, DI
10098	MOVQ  AX, SI
10099	ANDL  $0x0000001f, SI
10100	MOVQ  $0x00000040, R8
10101	SUBQ  SI, R8
10102	DECQ  DI
10103	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
10104	LEAQ  -32(CX)(R8*1), SI
10105	LEAQ  -32(AX)(R8*1), R9
10106
10107emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back:
10108	MOVOU (SI), X4
10109	MOVOU 16(SI), X5
10110	MOVOA X4, (R9)
10111	MOVOA X5, 16(R9)
10112	ADDQ  $0x20, R9
10113	ADDQ  $0x20, SI
10114	ADDQ  $0x20, R8
10115	DECQ  DI
10116	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back
10117
10118emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
10119	MOVOU -32(CX)(R8*1), X4
10120	MOVOU -16(CX)(R8*1), X5
10121	MOVOA X4, -32(AX)(R8*1)
10122	MOVOA X5, -16(AX)(R8*1)
10123	ADDQ  $0x20, R8
10124	CMPQ  BX, R8
10125	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
10126	MOVOU X0, (AX)
10127	MOVOU X1, 16(AX)
10128	MOVOU X2, -32(AX)(BX*1)
10129	MOVOU X3, -16(AX)(BX*1)
10130	MOVQ  DX, AX
10131
10132emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
10133	MOVQ dst_base+0(FP), CX
10134	SUBQ CX, AX
10135	MOVQ AX, ret+48(FP)
10136	RET
10137
10138// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
10139// Requires: SSE2
10140TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56
10141	MOVQ dst_base+0(FP), AX
10142	MOVQ $0x00000080, CX
10143	LEAQ 24(SP), DX
10144	PXOR X0, X0
10145
10146zero_loop_encodeSnappyBlockAsm12B:
10147	MOVOU X0, (DX)
10148	MOVOU X0, 16(DX)
10149	MOVOU X0, 32(DX)
10150	MOVOU X0, 48(DX)
10151	MOVOU X0, 64(DX)
10152	MOVOU X0, 80(DX)
10153	MOVOU X0, 96(DX)
10154	MOVOU X0, 112(DX)
10155	ADDQ  $0x80, DX
10156	DECQ  CX
10157	JNZ   zero_loop_encodeSnappyBlockAsm12B
10158	MOVL  $0x00000000, 12(SP)
10159	MOVQ  src_len+32(FP), CX
10160	LEAQ  -5(CX), DX
10161	LEAQ  -8(CX), SI
10162	MOVL  SI, 8(SP)
10163	SHRQ  $0x05, CX
10164	SUBL  CX, DX
10165	LEAQ  (AX)(DX*1), DX
10166	MOVQ  DX, (SP)
10167	MOVL  $0x00000001, CX
10168	MOVL  CX, 16(SP)
10169	MOVQ  src_base+24(FP), DX
10170
10171search_loop_encodeSnappyBlockAsm12B:
10172	MOVL  CX, SI
10173	SUBL  12(SP), SI
10174	SHRL  $0x05, SI
10175	LEAL  4(CX)(SI*1), SI
10176	CMPL  SI, 8(SP)
10177	JGE   emit_remainder_encodeSnappyBlockAsm12B
10178	MOVQ  (DX)(CX*1), DI
10179	MOVL  SI, 20(SP)
10180	MOVQ  $0x000000cf1bbcdcbb, R9
10181	MOVQ  DI, R10
10182	MOVQ  DI, R11
10183	SHRQ  $0x08, R11
10184	SHLQ  $0x18, R10
10185	IMULQ R9, R10
10186	SHRQ  $0x34, R10
10187	SHLQ  $0x18, R11
10188	IMULQ R9, R11
10189	SHRQ  $0x34, R11
10190	MOVL  24(SP)(R10*4), SI
10191	MOVL  24(SP)(R11*4), R8
10192	MOVL  CX, 24(SP)(R10*4)
10193	LEAL  1(CX), R10
10194	MOVL  R10, 24(SP)(R11*4)
10195	MOVQ  DI, R10
10196	SHRQ  $0x10, R10
10197	SHLQ  $0x18, R10
10198	IMULQ R9, R10
10199	SHRQ  $0x34, R10
10200	MOVL  CX, R9
10201	SUBL  16(SP), R9
10202	MOVL  1(DX)(R9*1), R11
10203	MOVQ  DI, R9
10204	SHRQ  $0x08, R9
10205	CMPL  R9, R11
10206	JNE   no_repeat_found_encodeSnappyBlockAsm12B
10207	LEAL  1(CX), DI
10208	MOVL  12(SP), SI
10209	MOVL  DI, R8
10210	SUBL  16(SP), R8
10211	JZ    repeat_extend_back_end_encodeSnappyBlockAsm12B
10212
10213repeat_extend_back_loop_encodeSnappyBlockAsm12B:
10214	CMPL DI, SI
10215	JLE  repeat_extend_back_end_encodeSnappyBlockAsm12B
10216	MOVB -1(DX)(R8*1), BL
10217	MOVB -1(DX)(DI*1), R9
10218	CMPB BL, R9
10219	JNE  repeat_extend_back_end_encodeSnappyBlockAsm12B
10220	LEAL -1(DI), DI
10221	DECL R8
10222	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm12B
10223
10224repeat_extend_back_end_encodeSnappyBlockAsm12B:
10225	MOVL 12(SP), SI
10226	CMPL SI, DI
10227	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
10228	MOVL DI, R8
10229	MOVL DI, 12(SP)
10230	LEAQ (DX)(SI*1), R9
10231	SUBL SI, R8
10232	LEAL -1(R8), SI
10233	CMPL SI, $0x3c
10234	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm12B
10235	CMPL SI, $0x00000100
10236	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm12B
10237	MOVB $0xf4, (AX)
10238	MOVW SI, 1(AX)
10239	ADDQ $0x03, AX
10240	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm12B
10241
10242two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
10243	MOVB $0xf0, (AX)
10244	MOVB SI, 1(AX)
10245	ADDQ $0x02, AX
10246	CMPL SI, $0x40
10247	JL   memmove_repeat_emit_encodeSnappyBlockAsm12B
10248	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm12B
10249
10250one_byte_repeat_emit_encodeSnappyBlockAsm12B:
10251	SHLB $0x02, SI
10252	MOVB SI, (AX)
10253	ADDQ $0x01, AX
10254
10255memmove_repeat_emit_encodeSnappyBlockAsm12B:
10256	LEAQ (AX)(R8*1), SI
10257
10258	// genMemMoveShort
10259	CMPQ R8, $0x03
10260	JB   emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_1or2
10261	JE   emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_3
10262	CMPQ R8, $0x08
10263	JB   emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_4through7
10264	CMPQ R8, $0x10
10265	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
10266	CMPQ R8, $0x20
10267	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
10268	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
10269
10270emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_1or2:
10271	MOVB (R9), R10
10272	MOVB -1(R9)(R8*1), R9
10273	MOVB R10, (AX)
10274	MOVB R9, -1(AX)(R8*1)
10275	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
10276
10277emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_3:
10278	MOVW (R9), R10
10279	MOVB 2(R9), R9
10280	MOVW R10, (AX)
10281	MOVB R9, 2(AX)
10282	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
10283
10284emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_4through7:
10285	MOVL (R9), R10
10286	MOVL -4(R9)(R8*1), R9
10287	MOVL R10, (AX)
10288	MOVL R9, -4(AX)(R8*1)
10289	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
10290
10291emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
10292	MOVQ (R9), R10
10293	MOVQ -8(R9)(R8*1), R9
10294	MOVQ R10, (AX)
10295	MOVQ R9, -8(AX)(R8*1)
10296	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
10297
10298emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
10299	MOVOU (R9), X0
10300	MOVOU -16(R9)(R8*1), X1
10301	MOVOU X0, (AX)
10302	MOVOU X1, -16(AX)(R8*1)
10303	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
10304
10305emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
10306	MOVOU (R9), X0
10307	MOVOU 16(R9), X1
10308	MOVOU -32(R9)(R8*1), X2
10309	MOVOU -16(R9)(R8*1), X3
10310	MOVOU X0, (AX)
10311	MOVOU X1, 16(AX)
10312	MOVOU X2, -32(AX)(R8*1)
10313	MOVOU X3, -16(AX)(R8*1)
10314
10315memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
10316	MOVQ SI, AX
10317	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
10318
10319memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
10320	LEAQ (AX)(R8*1), SI
10321
10322	// genMemMoveLong
10323	MOVOU (R9), X0
10324	MOVOU 16(R9), X1
10325	MOVOU -32(R9)(R8*1), X2
10326	MOVOU -16(R9)(R8*1), X3
10327	MOVQ  R8, R11
10328	SHRQ  $0x05, R11
10329	MOVQ  AX, R10
10330	ANDL  $0x0000001f, R10
10331	MOVQ  $0x00000040, R12
10332	SUBQ  R10, R12
10333	DECQ  R11
10334	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
10335	LEAQ  -32(R9)(R12*1), R10
10336	LEAQ  -32(AX)(R12*1), R13
10337
10338emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
10339	MOVOU (R10), X4
10340	MOVOU 16(R10), X5
10341	MOVOA X4, (R13)
10342	MOVOA X5, 16(R13)
10343	ADDQ  $0x20, R13
10344	ADDQ  $0x20, R10
10345	ADDQ  $0x20, R12
10346	DECQ  R11
10347	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
10348
10349emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
10350	MOVOU -32(R9)(R12*1), X4
10351	MOVOU -16(R9)(R12*1), X5
10352	MOVOA X4, -32(AX)(R12*1)
10353	MOVOA X5, -16(AX)(R12*1)
10354	ADDQ  $0x20, R12
10355	CMPQ  R8, R12
10356	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
10357	MOVOU X0, (AX)
10358	MOVOU X1, 16(AX)
10359	MOVOU X2, -32(AX)(R8*1)
10360	MOVOU X3, -16(AX)(R8*1)
10361	MOVQ  SI, AX
10362
10363emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
10364	ADDL $0x05, CX
10365	MOVL CX, SI
10366	SUBL 16(SP), SI
10367	MOVQ src_len+32(FP), R8
10368	SUBL CX, R8
10369	LEAQ (DX)(CX*1), R9
10370	LEAQ (DX)(SI*1), SI
10371
10372	// matchLen
10373	XORL R11, R11
10374	CMPL R8, $0x08
10375	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm12B
10376
10377matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B:
10378	MOVQ  (R9)(R11*1), R10
10379	XORQ  (SI)(R11*1), R10
10380	TESTQ R10, R10
10381	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B
10382	BSFQ  R10, R10
10383	SARQ  $0x03, R10
10384	LEAL  (R11)(R10*1), R11
10385	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm12B
10386
10387matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B:
10388	LEAL -8(R8), R8
10389	LEAL 8(R11), R11
10390	CMPL R8, $0x08
10391	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B
10392
10393matchlen_single_repeat_extend_encodeSnappyBlockAsm12B:
10394	TESTL R8, R8
10395	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm12B
10396
10397matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B:
10398	MOVB (R9)(R11*1), R10
10399	CMPB (SI)(R11*1), R10
10400	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm12B
10401	LEAL 1(R11), R11
10402	DECL R8
10403	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm12B
10404
10405repeat_extend_forward_end_encodeSnappyBlockAsm12B:
10406	ADDL R11, CX
10407	MOVL CX, SI
10408	SUBL DI, SI
10409	MOVL 16(SP), DI
10410
10411	// emitCopy
10412two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
10413	CMPL SI, $0x40
10414	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
10415	MOVB $0xee, (AX)
10416	MOVW DI, 1(AX)
10417	LEAL -60(SI), SI
10418	ADDQ $0x03, AX
10419	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B
10420
10421two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
10422	CMPL SI, $0x0c
10423	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
10424	CMPL DI, $0x00000800
10425	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
10426	MOVB $0x01, BL
10427	LEAL -16(BX)(SI*4), SI
10428	MOVB DI, 1(AX)
10429	SHRL $0x08, DI
10430	SHLL $0x05, DI
10431	ORL  DI, SI
10432	MOVB SI, (AX)
10433	ADDQ $0x02, AX
10434	JMP  repeat_end_emit_encodeSnappyBlockAsm12B
10435
10436emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
10437	MOVB $0x02, BL
10438	LEAL -4(BX)(SI*4), SI
10439	MOVB SI, (AX)
10440	MOVW DI, 1(AX)
10441	ADDQ $0x03, AX
10442
10443repeat_end_emit_encodeSnappyBlockAsm12B:
10444	MOVL CX, 12(SP)
10445	JMP  search_loop_encodeSnappyBlockAsm12B
10446
10447no_repeat_found_encodeSnappyBlockAsm12B:
10448	CMPL (DX)(SI*1), DI
10449	JEQ  candidate_match_encodeSnappyBlockAsm12B
10450	SHRQ $0x08, DI
10451	MOVL 24(SP)(R10*4), SI
10452	LEAL 2(CX), R9
10453	CMPL (DX)(R8*1), DI
10454	JEQ  candidate2_match_encodeSnappyBlockAsm12B
10455	MOVL R9, 24(SP)(R10*4)
10456	SHRQ $0x08, DI
10457	CMPL (DX)(SI*1), DI
10458	JEQ  candidate3_match_encodeSnappyBlockAsm12B
10459	MOVL 20(SP), CX
10460	JMP  search_loop_encodeSnappyBlockAsm12B
10461
10462candidate3_match_encodeSnappyBlockAsm12B:
10463	ADDL $0x02, CX
10464	JMP  candidate_match_encodeSnappyBlockAsm12B
10465
10466candidate2_match_encodeSnappyBlockAsm12B:
10467	MOVL R9, 24(SP)(R10*4)
10468	INCL CX
10469	MOVL R8, SI
10470
10471candidate_match_encodeSnappyBlockAsm12B:
10472	MOVL  12(SP), DI
10473	TESTL SI, SI
10474	JZ    match_extend_back_end_encodeSnappyBlockAsm12B
10475
10476match_extend_back_loop_encodeSnappyBlockAsm12B:
10477	CMPL CX, DI
10478	JLE  match_extend_back_end_encodeSnappyBlockAsm12B
10479	MOVB -1(DX)(SI*1), BL
10480	MOVB -1(DX)(CX*1), R8
10481	CMPB BL, R8
10482	JNE  match_extend_back_end_encodeSnappyBlockAsm12B
10483	LEAL -1(CX), CX
10484	DECL SI
10485	JZ   match_extend_back_end_encodeSnappyBlockAsm12B
10486	JMP  match_extend_back_loop_encodeSnappyBlockAsm12B
10487
10488match_extend_back_end_encodeSnappyBlockAsm12B:
10489	MOVL CX, DI
10490	SUBL 12(SP), DI
10491	LEAQ 3(AX)(DI*1), DI
10492	CMPQ DI, (SP)
10493	JL   match_dst_size_check_encodeSnappyBlockAsm12B
10494	MOVQ $0x00000000, ret+48(FP)
10495	RET
10496
10497match_dst_size_check_encodeSnappyBlockAsm12B:
10498	MOVL CX, DI
10499	MOVL 12(SP), R8
10500	CMPL R8, DI
10501	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm12B
10502	MOVL DI, R9
10503	MOVL DI, 12(SP)
10504	LEAQ (DX)(R8*1), DI
10505	SUBL R8, R9
10506	LEAL -1(R9), R8
10507	CMPL R8, $0x3c
10508	JLT  one_byte_match_emit_encodeSnappyBlockAsm12B
10509	CMPL R8, $0x00000100
10510	JLT  two_bytes_match_emit_encodeSnappyBlockAsm12B
10511	MOVB $0xf4, (AX)
10512	MOVW R8, 1(AX)
10513	ADDQ $0x03, AX
10514	JMP  memmove_long_match_emit_encodeSnappyBlockAsm12B
10515
10516two_bytes_match_emit_encodeSnappyBlockAsm12B:
10517	MOVB $0xf0, (AX)
10518	MOVB R8, 1(AX)
10519	ADDQ $0x02, AX
10520	CMPL R8, $0x40
10521	JL   memmove_match_emit_encodeSnappyBlockAsm12B
10522	JMP  memmove_long_match_emit_encodeSnappyBlockAsm12B
10523
10524one_byte_match_emit_encodeSnappyBlockAsm12B:
10525	SHLB $0x02, R8
10526	MOVB R8, (AX)
10527	ADDQ $0x01, AX
10528
10529memmove_match_emit_encodeSnappyBlockAsm12B:
10530	LEAQ (AX)(R9*1), R8
10531
10532	// genMemMoveShort
10533	CMPQ R9, $0x03
10534	JB   emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_1or2
10535	JE   emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_3
10536	CMPQ R9, $0x08
10537	JB   emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_4through7
10538	CMPQ R9, $0x10
10539	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
10540	CMPQ R9, $0x20
10541	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
10542	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
10543
10544emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_1or2:
10545	MOVB (DI), R10
10546	MOVB -1(DI)(R9*1), DI
10547	MOVB R10, (AX)
10548	MOVB DI, -1(AX)(R9*1)
10549	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
10550
10551emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_3:
10552	MOVW (DI), R10
10553	MOVB 2(DI), DI
10554	MOVW R10, (AX)
10555	MOVB DI, 2(AX)
10556	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
10557
10558emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_4through7:
10559	MOVL (DI), R10
10560	MOVL -4(DI)(R9*1), DI
10561	MOVL R10, (AX)
10562	MOVL DI, -4(AX)(R9*1)
10563	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
10564
10565emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
10566	MOVQ (DI), R10
10567	MOVQ -8(DI)(R9*1), DI
10568	MOVQ R10, (AX)
10569	MOVQ DI, -8(AX)(R9*1)
10570	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
10571
10572emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
10573	MOVOU (DI), X0
10574	MOVOU -16(DI)(R9*1), X1
10575	MOVOU X0, (AX)
10576	MOVOU X1, -16(AX)(R9*1)
10577	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
10578
10579emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
10580	MOVOU (DI), X0
10581	MOVOU 16(DI), X1
10582	MOVOU -32(DI)(R9*1), X2
10583	MOVOU -16(DI)(R9*1), X3
10584	MOVOU X0, (AX)
10585	MOVOU X1, 16(AX)
10586	MOVOU X2, -32(AX)(R9*1)
10587	MOVOU X3, -16(AX)(R9*1)
10588
10589memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
10590	MOVQ R8, AX
10591	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm12B
10592
10593memmove_long_match_emit_encodeSnappyBlockAsm12B:
10594	LEAQ (AX)(R9*1), R8
10595
10596	// genMemMoveLong
10597	MOVOU (DI), X0
10598	MOVOU 16(DI), X1
10599	MOVOU -32(DI)(R9*1), X2
10600	MOVOU -16(DI)(R9*1), X3
10601	MOVQ  R9, R11
10602	SHRQ  $0x05, R11
10603	MOVQ  AX, R10
10604	ANDL  $0x0000001f, R10
10605	MOVQ  $0x00000040, R12
10606	SUBQ  R10, R12
10607	DECQ  R11
10608	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
10609	LEAQ  -32(DI)(R12*1), R10
10610	LEAQ  -32(AX)(R12*1), R13
10611
10612emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
10613	MOVOU (R10), X4
10614	MOVOU 16(R10), X5
10615	MOVOA X4, (R13)
10616	MOVOA X5, 16(R13)
10617	ADDQ  $0x20, R13
10618	ADDQ  $0x20, R10
10619	ADDQ  $0x20, R12
10620	DECQ  R11
10621	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
10622
10623emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
10624	MOVOU -32(DI)(R12*1), X4
10625	MOVOU -16(DI)(R12*1), X5
10626	MOVOA X4, -32(AX)(R12*1)
10627	MOVOA X5, -16(AX)(R12*1)
10628	ADDQ  $0x20, R12
10629	CMPQ  R9, R12
10630	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
10631	MOVOU X0, (AX)
10632	MOVOU X1, 16(AX)
10633	MOVOU X2, -32(AX)(R9*1)
10634	MOVOU X3, -16(AX)(R9*1)
10635	MOVQ  R8, AX
10636
10637emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
10638match_nolit_loop_encodeSnappyBlockAsm12B:
10639	MOVL CX, DI
10640	SUBL SI, DI
10641	MOVL DI, 16(SP)
10642	ADDL $0x04, CX
10643	ADDL $0x04, SI
10644	MOVQ src_len+32(FP), DI
10645	SUBL CX, DI
10646	LEAQ (DX)(CX*1), R8
10647	LEAQ (DX)(SI*1), SI
10648
10649	// matchLen
10650	XORL R10, R10
10651	CMPL DI, $0x08
10652	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm12B
10653
10654matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B:
10655	MOVQ  (R8)(R10*1), R9
10656	XORQ  (SI)(R10*1), R9
10657	TESTQ R9, R9
10658	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm12B
10659	BSFQ  R9, R9
10660	SARQ  $0x03, R9
10661	LEAL  (R10)(R9*1), R10
10662	JMP   match_nolit_end_encodeSnappyBlockAsm12B
10663
10664matchlen_loop_match_nolit_encodeSnappyBlockAsm12B:
10665	LEAL -8(DI), DI
10666	LEAL 8(R10), R10
10667	CMPL DI, $0x08
10668	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B
10669
10670matchlen_single_match_nolit_encodeSnappyBlockAsm12B:
10671	TESTL DI, DI
10672	JZ    match_nolit_end_encodeSnappyBlockAsm12B
10673
10674matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B:
10675	MOVB (R8)(R10*1), R9
10676	CMPB (SI)(R10*1), R9
10677	JNE  match_nolit_end_encodeSnappyBlockAsm12B
10678	LEAL 1(R10), R10
10679	DECL DI
10680	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B
10681
10682match_nolit_end_encodeSnappyBlockAsm12B:
10683	ADDL R10, CX
10684	MOVL 16(SP), SI
10685	ADDL $0x04, R10
10686	MOVL CX, 12(SP)
10687
10688	// emitCopy
10689two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
10690	CMPL R10, $0x40
10691	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
10692	MOVB $0xee, (AX)
10693	MOVW SI, 1(AX)
10694	LEAL -60(R10), R10
10695	ADDQ $0x03, AX
10696	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm12B
10697
10698two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
10699	CMPL R10, $0x0c
10700	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
10701	CMPL SI, $0x00000800
10702	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
10703	MOVB $0x01, BL
10704	LEAL -16(BX)(R10*4), R10
10705	MOVB SI, 1(AX)
10706	SHRL $0x08, SI
10707	SHLL $0x05, SI
10708	ORL  SI, R10
10709	MOVB R10, (AX)
10710	ADDQ $0x02, AX
10711	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm12B
10712
10713emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
10714	MOVB $0x02, BL
10715	LEAL -4(BX)(R10*4), R10
10716	MOVB R10, (AX)
10717	MOVW SI, 1(AX)
10718	ADDQ $0x03, AX
10719
10720match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
10721	CMPL CX, 8(SP)
10722	JGE  emit_remainder_encodeSnappyBlockAsm12B
10723	MOVQ -2(DX)(CX*1), DI
10724	CMPQ AX, (SP)
10725	JL   match_nolit_dst_ok_encodeSnappyBlockAsm12B
10726	MOVQ $0x00000000, ret+48(FP)
10727	RET
10728
10729match_nolit_dst_ok_encodeSnappyBlockAsm12B:
10730	MOVQ  $0x000000cf1bbcdcbb, R9
10731	MOVQ  DI, R8
10732	SHRQ  $0x10, DI
10733	MOVQ  DI, SI
10734	SHLQ  $0x18, R8
10735	IMULQ R9, R8
10736	SHRQ  $0x34, R8
10737	SHLQ  $0x18, SI
10738	IMULQ R9, SI
10739	SHRQ  $0x34, SI
10740	LEAL  -2(CX), R9
10741	LEAQ  24(SP)(SI*4), R10
10742	MOVL  (R10), SI
10743	MOVL  R9, 24(SP)(R8*4)
10744	MOVL  CX, (R10)
10745	CMPL  (DX)(SI*1), DI
10746	JEQ   match_nolit_loop_encodeSnappyBlockAsm12B
10747	INCL  CX
10748	JMP   search_loop_encodeSnappyBlockAsm12B
10749
10750emit_remainder_encodeSnappyBlockAsm12B:
10751	MOVQ src_len+32(FP), CX
10752	SUBL 12(SP), CX
10753	LEAQ 3(AX)(CX*1), CX
10754	CMPQ CX, (SP)
10755	JL   emit_remainder_ok_encodeSnappyBlockAsm12B
10756	MOVQ $0x00000000, ret+48(FP)
10757	RET
10758
10759emit_remainder_ok_encodeSnappyBlockAsm12B:
10760	MOVQ src_len+32(FP), CX
10761	MOVL 12(SP), BX
10762	CMPL BX, CX
10763	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
10764	MOVL CX, SI
10765	MOVL CX, 12(SP)
10766	LEAQ (DX)(BX*1), CX
10767	SUBL BX, SI
10768	LEAL -1(SI), DX
10769	CMPL DX, $0x3c
10770	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm12B
10771	CMPL DX, $0x00000100
10772	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm12B
10773	MOVB $0xf4, (AX)
10774	MOVW DX, 1(AX)
10775	ADDQ $0x03, AX
10776	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm12B
10777
10778two_bytes_emit_remainder_encodeSnappyBlockAsm12B:
10779	MOVB $0xf0, (AX)
10780	MOVB DL, 1(AX)
10781	ADDQ $0x02, AX
10782	CMPL DX, $0x40
10783	JL   memmove_emit_remainder_encodeSnappyBlockAsm12B
10784	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm12B
10785
10786one_byte_emit_remainder_encodeSnappyBlockAsm12B:
10787	SHLB $0x02, DL
10788	MOVB DL, (AX)
10789	ADDQ $0x01, AX
10790
10791memmove_emit_remainder_encodeSnappyBlockAsm12B:
10792	LEAQ (AX)(SI*1), DX
10793	MOVL SI, BX
10794
10795	// genMemMoveShort
10796	CMPQ BX, $0x03
10797	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2
10798	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3
10799	CMPQ BX, $0x08
10800	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7
10801	CMPQ BX, $0x10
10802	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
10803	CMPQ BX, $0x20
10804	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
10805	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64
10806
10807emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2:
10808	MOVB (CX), SI
10809	MOVB -1(CX)(BX*1), CL
10810	MOVB SI, (AX)
10811	MOVB CL, -1(AX)(BX*1)
10812	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
10813
10814emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3:
10815	MOVW (CX), SI
10816	MOVB 2(CX), CL
10817	MOVW SI, (AX)
10818	MOVB CL, 2(AX)
10819	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
10820
10821emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7:
10822	MOVL (CX), SI
10823	MOVL -4(CX)(BX*1), CX
10824	MOVL SI, (AX)
10825	MOVL CX, -4(AX)(BX*1)
10826	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
10827
10828emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16:
10829	MOVQ (CX), SI
10830	MOVQ -8(CX)(BX*1), CX
10831	MOVQ SI, (AX)
10832	MOVQ CX, -8(AX)(BX*1)
10833	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
10834
10835emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32:
10836	MOVOU (CX), X0
10837	MOVOU -16(CX)(BX*1), X1
10838	MOVOU X0, (AX)
10839	MOVOU X1, -16(AX)(BX*1)
10840	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
10841
10842emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64:
10843	MOVOU (CX), X0
10844	MOVOU 16(CX), X1
10845	MOVOU -32(CX)(BX*1), X2
10846	MOVOU -16(CX)(BX*1), X3
10847	MOVOU X0, (AX)
10848	MOVOU X1, 16(AX)
10849	MOVOU X2, -32(AX)(BX*1)
10850	MOVOU X3, -16(AX)(BX*1)
10851
10852memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B:
10853	MOVQ DX, AX
10854	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
10855
10856memmove_long_emit_remainder_encodeSnappyBlockAsm12B:
10857	LEAQ (AX)(SI*1), DX
10858	MOVL SI, BX
10859
10860	// genMemMoveLong
10861	MOVOU (CX), X0
10862	MOVOU 16(CX), X1
10863	MOVOU -32(CX)(BX*1), X2
10864	MOVOU -16(CX)(BX*1), X3
10865	MOVQ  BX, DI
10866	SHRQ  $0x05, DI
10867	MOVQ  AX, SI
10868	ANDL  $0x0000001f, SI
10869	MOVQ  $0x00000040, R8
10870	SUBQ  SI, R8
10871	DECQ  DI
10872	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
10873	LEAQ  -32(CX)(R8*1), SI
10874	LEAQ  -32(AX)(R8*1), R9
10875
10876emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back:
10877	MOVOU (SI), X4
10878	MOVOU 16(SI), X5
10879	MOVOA X4, (R9)
10880	MOVOA X5, 16(R9)
10881	ADDQ  $0x20, R9
10882	ADDQ  $0x20, SI
10883	ADDQ  $0x20, R8
10884	DECQ  DI
10885	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back
10886
10887emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
10888	MOVOU -32(CX)(R8*1), X4
10889	MOVOU -16(CX)(R8*1), X5
10890	MOVOA X4, -32(AX)(R8*1)
10891	MOVOA X5, -16(AX)(R8*1)
10892	ADDQ  $0x20, R8
10893	CMPQ  BX, R8
10894	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
10895	MOVOU X0, (AX)
10896	MOVOU X1, 16(AX)
10897	MOVOU X2, -32(AX)(BX*1)
10898	MOVOU X3, -16(AX)(BX*1)
10899	MOVQ  DX, AX
10900
10901emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B:
10902	MOVQ dst_base+0(FP), CX
10903	SUBQ CX, AX
10904	MOVQ AX, ret+48(FP)
10905	RET
10906
10907// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
10908// Requires: SSE2
//
// Encodes src into dst as a sequence of Snappy literal and copy elements and
// returns the number of bytes written (final dst cursor minus dst_base).
// Returns 0 whenever one of the output-size checks against the limit stored
// at 0(SP) fails.
// NOTE(review): the caller presumably treats 0 as "does not fit, store
// uncompressed" - confirm against the Go wrapper.
// NOTE(review): nothing here guards against a very short src (8-byte loads,
// sLimit = len(src)-8); presumably the caller enforces a minimum length - confirm.
//
// Frame layout ($4120 bytes = 24 bytes of scalars + 1024 * 4-byte table):
//    0(SP)  8 bytes  dst limit  = dst_base + (len(src) - 5 - len(src)>>5)
//    8(SP)  4 bytes  sLimit     = len(src) - 8
//   12(SP)  4 bytes  nextEmit   = src index of the first byte not yet emitted
//   16(SP)  4 bytes  repeat     = offset of the most recent match (starts at 1)
//   20(SP)  4 bytes  nextS      = src index to resume searching at after a miss
//   24(SP)  4096 B   hash table = 1024 uint32 src indices, indexed by a 10-bit hash
//
// Long-lived registers:
//   AX = dst write cursor
//   CX = current src index (s)
//   DX = src base pointer (reused as scratch only inside emit_remainder)
// All other registers are short-lived scratch; their roles are noted inline.
10909TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56
10910	MOVQ dst_base+0(FP), AX // AX = dst write cursor
10911	MOVQ $0x00000020, CX // 32 iterations * 128 bytes = 4096-byte table
10912	LEAQ 24(SP), DX // DX = start of hash table
10913	PXOR X0, X0 // X0 = 16 zero bytes
10914
10915zero_loop_encodeSnappyBlockAsm10B: // clear the hash table, 128 bytes per iteration
10916	MOVOU X0, (DX)
10917	MOVOU X0, 16(DX)
10918	MOVOU X0, 32(DX)
10919	MOVOU X0, 48(DX)
10920	MOVOU X0, 64(DX)
10921	MOVOU X0, 80(DX)
10922	MOVOU X0, 96(DX)
10923	MOVOU X0, 112(DX)
10924	ADDQ  $0x80, DX
10925	DECQ  CX
10926	JNZ   zero_loop_encodeSnappyBlockAsm10B
10927	MOVL  $0x00000000, 12(SP) // nextEmit = 0
10928	MOVQ  src_len+32(FP), CX // CX = len(src)
10929	LEAQ  -5(CX), DX // DX = len(src) - 5
10930	LEAQ  -8(CX), SI // SI = len(src) - 8
10931	MOVL  SI, 8(SP) // sLimit = len(src) - 8
10932	SHRQ  $0x05, CX // CX = len(src) >> 5
10933	SUBL  CX, DX // DX = len(src) - 5 - len(src)>>5
10934	LEAQ  (AX)(DX*1), DX // DX = dst_base + that amount
10935	MOVQ  DX, (SP) // dst limit: output reaching this pointer aborts with 0
10936	MOVL  $0x00000001, CX // s = 1
10937	MOVL  CX, 16(SP) // repeat = 1
10938	MOVQ  src_base+24(FP), DX // DX = src base pointer
10939
10940search_loop_encodeSnappyBlockAsm10B: // probe positions s, s+1, s+2 for a repeat or table match
10941	MOVL  CX, SI
10942	SUBL  12(SP), SI // SI = s - nextEmit
10943	SHRL  $0x05, SI // skip grows by 1 for every 32 unmatched bytes
10944	LEAL  4(CX)(SI*1), SI // nextS = s + 4 + (s - nextEmit)>>5
10945	CMPL  SI, 8(SP)
10946	JGE   emit_remainder_encodeSnappyBlockAsm10B // nextS >= sLimit (signed): stop searching
10947	MOVQ  (DX)(CX*1), DI // DI = 8 bytes at src[s]
10948	MOVL  SI, 20(SP) // save nextS
10949	MOVQ  $0x9e3779b1, R9 // hash multiplier
10950	MOVQ  DI, R10
10951	MOVQ  DI, R11
10952	SHRQ  $0x08, R11 // R11 = bytes starting at s+1
10953	SHLQ  $0x20, R10 // keep only the low 4 bytes (moved to the high half)
10954	IMULQ R9, R10
10955	SHRQ  $0x36, R10 // R10 = top 10 bits = hash of the 4 bytes at s
10956	SHLQ  $0x20, R11
10957	IMULQ R9, R11
10958	SHRQ  $0x36, R11 // R11 = hash of the 4 bytes at s+1
10959	MOVL  24(SP)(R10*4), SI // SI = candidate for s
10960	MOVL  24(SP)(R11*4), R8 // R8 = candidate for s+1
10961	MOVL  CX, 24(SP)(R10*4) // table[hash(s)] = s
10962	LEAL  1(CX), R10
10963	MOVL  R10, 24(SP)(R11*4) // table[hash(s+1)] = s+1
10964	MOVQ  DI, R10
10965	SHRQ  $0x10, R10 // bytes starting at s+2
10966	SHLQ  $0x20, R10
10967	IMULQ R9, R10
10968	SHRQ  $0x36, R10 // R10 = hash of the 4 bytes at s+2 (used in no_repeat_found)
10969	MOVL  CX, R9
10970	SUBL  16(SP), R9 // R9 = s - repeat
10971	MOVL  1(DX)(R9*1), R11 // R11 = 4 bytes at src[s + 1 - repeat]
10972	MOVQ  DI, R9
10973	SHRQ  $0x08, R9 // low 32 bits of R9 = 4 bytes at src[s+1]
10974	CMPL  R9, R11
10975	JNE   no_repeat_found_encodeSnappyBlockAsm10B
10976	LEAL  1(CX), DI // repeat hit: DI = match start (s+1)
10977	MOVL  12(SP), SI // SI = nextEmit (lower bound for backward extension)
10978	MOVL  DI, R8
10979	SUBL  16(SP), R8 // R8 = source position of the repeat (s + 1 - repeat)
10980	JZ    repeat_extend_back_end_encodeSnappyBlockAsm10B // source position is 0: cannot extend backwards
10981
10982repeat_extend_back_loop_encodeSnappyBlockAsm10B: // grow the match backwards while the preceding bytes agree
10983	CMPL DI, SI
10984	JLE  repeat_extend_back_end_encodeSnappyBlockAsm10B // reached nextEmit
10985	MOVB -1(DX)(R8*1), BL
10986	MOVB -1(DX)(DI*1), R9
10987	CMPB BL, R9
10988	JNE  repeat_extend_back_end_encodeSnappyBlockAsm10B
10989	LEAL -1(DI), DI // LEAL leaves flags alone
10990	DECL R8
10991	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm10B // stop once the source position hits 0
10992
10993repeat_extend_back_end_encodeSnappyBlockAsm10B: // emit pending literals src[nextEmit:DI]
10994	MOVL 12(SP), SI
10995	CMPL SI, DI
10996	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B // nothing pending
10997	MOVL DI, R8
10998	MOVL DI, 12(SP) // nextEmit = DI
10999	LEAQ (DX)(SI*1), R9 // R9 = pointer to the literal bytes
11000	SUBL SI, R8 // R8 = literal length
11001	LEAL -1(R8), SI // SI = length - 1 (value stored in the tag)
11002	CMPL SI, $0x3c
11003	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm10B // length-1 < 60: fits in the tag byte
11004	CMPL SI, $0x00000100
11005	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm10B // length-1 < 256: tag + 1 length byte
11006	MOVB $0xf4, (AX) // tag 0xf4 followed by a 16-bit length-1
11007	MOVW SI, 1(AX)
11008	ADDQ $0x03, AX
11009	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm10B
11010
11011two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
11012	MOVB $0xf0, (AX) // tag 0xf0 followed by an 8-bit length-1
11013	MOVB SI, 1(AX)
11014	ADDQ $0x02, AX
11015	CMPL SI, $0x40
11016	JL   memmove_repeat_emit_encodeSnappyBlockAsm10B // length <= 64: short copy
11017	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm10B
11018
11019one_byte_repeat_emit_encodeSnappyBlockAsm10B:
11020	SHLB $0x02, SI // tag = (length-1) << 2
11021	MOVB SI, (AX)
11022	ADDQ $0x01, AX
11023
11024memmove_repeat_emit_encodeSnappyBlockAsm10B:
11025	LEAQ (AX)(R8*1), SI // SI = dst cursor after the literal bytes
11026
11027	// genMemMoveShort
	// Copies R8 (1..64) bytes from (R9) to (AX) using a head load and a tail
	// load that may overlap; all loads are done before the stores.
11028	CMPQ R8, $0x03
11029	JB   emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_1or2
11030	JE   emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_3
11031	CMPQ R8, $0x08
11032	JB   emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_4through7
11033	CMPQ R8, $0x10
11034	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
11035	CMPQ R8, $0x20
11036	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
11037	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
11038
11039emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_1or2:
11040	MOVB (R9), R10
11041	MOVB -1(R9)(R8*1), R9
11042	MOVB R10, (AX)
11043	MOVB R9, -1(AX)(R8*1)
11044	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
11045
11046emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_3:
11047	MOVW (R9), R10
11048	MOVB 2(R9), R9
11049	MOVW R10, (AX)
11050	MOVB R9, 2(AX)
11051	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
11052
11053emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_4through7:
11054	MOVL (R9), R10
11055	MOVL -4(R9)(R8*1), R9
11056	MOVL R10, (AX)
11057	MOVL R9, -4(AX)(R8*1)
11058	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
11059
11060emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
11061	MOVQ (R9), R10
11062	MOVQ -8(R9)(R8*1), R9
11063	MOVQ R10, (AX)
11064	MOVQ R9, -8(AX)(R8*1)
11065	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
11066
11067emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
11068	MOVOU (R9), X0
11069	MOVOU -16(R9)(R8*1), X1
11070	MOVOU X0, (AX)
11071	MOVOU X1, -16(AX)(R8*1)
11072	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
11073
11074emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
11075	MOVOU (R9), X0
11076	MOVOU 16(R9), X1
11077	MOVOU -32(R9)(R8*1), X2
11078	MOVOU -16(R9)(R8*1), X3
11079	MOVOU X0, (AX)
11080	MOVOU X1, 16(AX)
11081	MOVOU X2, -32(AX)(R8*1)
11082	MOVOU X3, -16(AX)(R8*1)
11083
11084memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
11085	MOVQ SI, AX // advance dst cursor past the literal
11086	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
11087
11088memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
11089	LEAQ (AX)(R8*1), SI // SI = dst cursor after the literal bytes
11090
11091	// genMemMoveLong
	// Copies R8 (> 64) bytes from (R9) to (AX). The first and last 32 bytes are
	// loaded up front (X0-X3) and stored last with unaligned stores; the middle
	// is copied in 32-byte steps with aligned stores.
11092	MOVOU (R9), X0
11093	MOVOU 16(R9), X1
11094	MOVOU -32(R9)(R8*1), X2
11095	MOVOU -16(R9)(R8*1), X3
11096	MOVQ  R8, R11
11097	SHRQ  $0x05, R11 // R11 = length / 32
11098	MOVQ  AX, R10
11099	ANDL  $0x0000001f, R10 // R10 = dst & 31
11100	MOVQ  $0x00000040, R12
11101	SUBQ  R10, R12 // R12 = 64 - (dst & 31), so AX + R12 - 32 is 32-byte aligned
11102	DECQ  R11
	// DECQ does not modify CF, so JA sees CF from the SUBQ above and ZF from the DECQ.
	// NOTE(review): these loop conditions are kept exactly as generated - confirm against gen.go before changing.
11103	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
11104	LEAQ  -32(R9)(R12*1), R10 // R10 = src pointer matching the aligned dst pointer
11105	LEAQ  -32(AX)(R12*1), R13 // R13 = first 32-byte aligned dst address past AX
11106
11107emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
11108	MOVOU (R10), X4
11109	MOVOU 16(R10), X5
11110	MOVOA X4, (R13)
11111	MOVOA X5, 16(R13)
11112	ADDQ  $0x20, R13
11113	ADDQ  $0x20, R10
11114	ADDQ  $0x20, R12
11115	DECQ  R11
11116	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back // CF comes from the ADDQ, ZF from the DECQ
11117
11118emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
11119	MOVOU -32(R9)(R12*1), X4
11120	MOVOU -16(R9)(R12*1), X5
11121	MOVOA X4, -32(AX)(R12*1)
11122	MOVOA X5, -16(AX)(R12*1)
11123	ADDQ  $0x20, R12
11124	CMPQ  R8, R12
11125	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 // continue while length >= R12
11126	MOVOU X0, (AX) // finally write the saved head ...
11127	MOVOU X1, 16(AX)
11128	MOVOU X2, -32(AX)(R8*1) // ... and the saved tail
11129	MOVOU X3, -16(AX)(R8*1)
11130	MOVQ  SI, AX // advance dst cursor past the literal
11131
11132emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: // extend the repeat match forwards
11133	ADDL $0x05, CX // s += 5: the match started at s+1 and its first 4 bytes already compared equal
11134	MOVL CX, SI
11135	SUBL 16(SP), SI // SI = s - repeat (candidate index)
11136	MOVQ src_len+32(FP), R8
11137	SUBL CX, R8 // R8 = bytes left in src from s
11138	LEAQ (DX)(CX*1), R9 // R9 = &src[s]
11139	LEAQ (DX)(SI*1), SI // SI = &src[candidate]
11140
11141	// matchLen
	// R11 = number of equal bytes at R9 and SI, at most R8.
11142	XORL R11, R11
11143	CMPL R8, $0x08
11144	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm10B
11145
11146matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: // compare 8 bytes at a time
11147	MOVQ  (R9)(R11*1), R10
11148	XORQ  (SI)(R11*1), R10 // non-zero bits mark differing positions
11149	TESTQ R10, R10
11150	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B
11151	BSFQ  R10, R10 // index of the lowest differing bit
11152	SARQ  $0x03, R10 // bit index / 8 = number of equal leading bytes
11153	LEAL  (R11)(R10*1), R11
11154	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm10B
11155
11156matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B:
11157	LEAL -8(R8), R8
11158	LEAL 8(R11), R11
11159	CMPL R8, $0x08
11160	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B
11161
11162matchlen_single_repeat_extend_encodeSnappyBlockAsm10B: // fewer than 8 bytes left: compare byte by byte
11163	TESTL R8, R8
11164	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm10B
11165
11166matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B:
11167	MOVB (R9)(R11*1), R10
11168	CMPB (SI)(R11*1), R10
11169	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm10B
11170	LEAL 1(R11), R11
11171	DECL R8
11172	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm10B
11173
11174repeat_extend_forward_end_encodeSnappyBlockAsm10B:
11175	ADDL R11, CX // s = end of the match
11176	MOVL CX, SI
11177	SUBL DI, SI // SI = match length (DI = match start after backward extension)
11178	MOVL 16(SP), DI // DI = offset (repeat)
11179
11180	// emitCopy
	// Emits copy elements for length SI at offset DI.
11181two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
11182	CMPL SI, $0x40
11183	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
11184	MOVB $0xee, (AX) // 0xee = (60-1)<<2 | 2: length-60 copy with 16-bit offset
11185	MOVW DI, 1(AX)
11186	LEAL -60(SI), SI // 60 bytes consumed; loop until length <= 64
11187	ADDQ $0x03, AX
11188	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B
11189
11190two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
11191	CMPL SI, $0x0c
11192	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B // length >= 12 needs the 3-byte form
11193	CMPL DI, $0x00000800
11194	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B // offset >= 2048 needs the 3-byte form
11195	MOVB $0x01, BL
11196	LEAL -16(BX)(SI*4), SI // low byte = (length-4)<<2 | 1; only the low byte is stored below
11197	MOVB DI, 1(AX) // low 8 bits of the offset
11198	SHRL $0x08, DI
11199	SHLL $0x05, DI // offset bits above bit 7 go into tag bits 5 and up
11200	ORL  DI, SI
11201	MOVB SI, (AX)
11202	ADDQ $0x02, AX
11203	JMP  repeat_end_emit_encodeSnappyBlockAsm10B
11204
11205emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
11206	MOVB $0x02, BL
11207	LEAL -4(BX)(SI*4), SI // low byte = (length-1)<<2 | 2
11208	MOVB SI, (AX)
11209	MOVW DI, 1(AX) // 16-bit offset
11210	ADDQ $0x03, AX
11211
11212repeat_end_emit_encodeSnappyBlockAsm10B:
11213	MOVL CX, 12(SP) // nextEmit = s
11214	JMP  search_loop_encodeSnappyBlockAsm10B
11215
11216no_repeat_found_encodeSnappyBlockAsm10B: // here SI/R8 = candidates for s/s+1, R10 = hash(s+2), DI = 8 bytes at src[s]
11217	CMPL (DX)(SI*1), DI // 4 bytes at the candidate vs 4 bytes at s
11218	JEQ  candidate_match_encodeSnappyBlockAsm10B
11219	SHRQ $0x08, DI // DI = bytes starting at s+1
11220	MOVL 24(SP)(R10*4), SI // SI = candidate for s+2
11221	LEAL 2(CX), R9
11222	CMPL (DX)(R8*1), DI // candidate for s+1 vs 4 bytes at s+1
11223	JEQ  candidate2_match_encodeSnappyBlockAsm10B
11224	MOVL R9, 24(SP)(R10*4) // table[hash(s+2)] = s+2
11225	SHRQ $0x08, DI // DI = bytes starting at s+2
11226	CMPL (DX)(SI*1), DI // candidate for s+2 vs 4 bytes at s+2
11227	JEQ  candidate3_match_encodeSnappyBlockAsm10B
11228	MOVL 20(SP), CX // no match: s = nextS
11229	JMP  search_loop_encodeSnappyBlockAsm10B
11230
11231candidate3_match_encodeSnappyBlockAsm10B:
11232	ADDL $0x02, CX // match is at s+2
11233	JMP  candidate_match_encodeSnappyBlockAsm10B
11234
11235candidate2_match_encodeSnappyBlockAsm10B:
11236	MOVL R9, 24(SP)(R10*4) // table[hash(s+2)] = s+2
11237	INCL CX // match is at s+1
11238	MOVL R8, SI // SI = its candidate
11239
11240candidate_match_encodeSnappyBlockAsm10B: // CX = match position, SI = candidate position
11241	MOVL  12(SP), DI // DI = nextEmit
11242	TESTL SI, SI
11243	JZ    match_extend_back_end_encodeSnappyBlockAsm10B // candidate at index 0: cannot extend backwards
11244
11245match_extend_back_loop_encodeSnappyBlockAsm10B: // grow the match backwards while the preceding bytes agree
11246	CMPL CX, DI
11247	JLE  match_extend_back_end_encodeSnappyBlockAsm10B // reached nextEmit
11248	MOVB -1(DX)(SI*1), BL
11249	MOVB -1(DX)(CX*1), R8
11250	CMPB BL, R8
11251	JNE  match_extend_back_end_encodeSnappyBlockAsm10B
11252	LEAL -1(CX), CX
11253	DECL SI
11254	JZ   match_extend_back_end_encodeSnappyBlockAsm10B
11255	JMP  match_extend_back_loop_encodeSnappyBlockAsm10B
11256
11257match_extend_back_end_encodeSnappyBlockAsm10B: // make sure the pending literal fits before writing it
11258	MOVL CX, DI
11259	SUBL 12(SP), DI // DI = literal length (s - nextEmit)
11260	LEAQ 3(AX)(DI*1), DI // dst cursor + 3 header bytes + literal length
11261	CMPQ DI, (SP)
11262	JL   match_dst_size_check_encodeSnappyBlockAsm10B
11263	MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
11264	RET
11265
11266match_dst_size_check_encodeSnappyBlockAsm10B: // emit pending literals src[nextEmit:s]
11267	MOVL CX, DI
11268	MOVL 12(SP), R8
11269	CMPL R8, DI
11270	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm10B // nothing pending
11271	MOVL DI, R9
11272	MOVL DI, 12(SP) // nextEmit = s
11273	LEAQ (DX)(R8*1), DI // DI = pointer to the literal bytes
11274	SUBL R8, R9 // R9 = literal length
11275	LEAL -1(R9), R8 // R8 = length - 1 (value stored in the tag)
11276	CMPL R8, $0x3c
11277	JLT  one_byte_match_emit_encodeSnappyBlockAsm10B // length-1 < 60: fits in the tag byte
11278	CMPL R8, $0x00000100
11279	JLT  two_bytes_match_emit_encodeSnappyBlockAsm10B // length-1 < 256: tag + 1 length byte
11280	MOVB $0xf4, (AX) // tag 0xf4 followed by a 16-bit length-1
11281	MOVW R8, 1(AX)
11282	ADDQ $0x03, AX
11283	JMP  memmove_long_match_emit_encodeSnappyBlockAsm10B
11284
11285two_bytes_match_emit_encodeSnappyBlockAsm10B:
11286	MOVB $0xf0, (AX) // tag 0xf0 followed by an 8-bit length-1
11287	MOVB R8, 1(AX)
11288	ADDQ $0x02, AX
11289	CMPL R8, $0x40
11290	JL   memmove_match_emit_encodeSnappyBlockAsm10B // length <= 64: short copy
11291	JMP  memmove_long_match_emit_encodeSnappyBlockAsm10B
11292
11293one_byte_match_emit_encodeSnappyBlockAsm10B:
11294	SHLB $0x02, R8 // tag = (length-1) << 2
11295	MOVB R8, (AX)
11296	ADDQ $0x01, AX
11297
11298memmove_match_emit_encodeSnappyBlockAsm10B:
11299	LEAQ (AX)(R9*1), R8 // R8 = dst cursor after the literal bytes
11300
11301	// genMemMoveShort
	// Copies R9 (1..64) bytes from (DI) to (AX) using a head load and a tail
	// load that may overlap; all loads are done before the stores.
11302	CMPQ R9, $0x03
11303	JB   emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_1or2
11304	JE   emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_3
11305	CMPQ R9, $0x08
11306	JB   emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_4through7
11307	CMPQ R9, $0x10
11308	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
11309	CMPQ R9, $0x20
11310	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
11311	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
11312
11313emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_1or2:
11314	MOVB (DI), R10
11315	MOVB -1(DI)(R9*1), DI
11316	MOVB R10, (AX)
11317	MOVB DI, -1(AX)(R9*1)
11318	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
11319
11320emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_3:
11321	MOVW (DI), R10
11322	MOVB 2(DI), DI
11323	MOVW R10, (AX)
11324	MOVB DI, 2(AX)
11325	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
11326
11327emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_4through7:
11328	MOVL (DI), R10
11329	MOVL -4(DI)(R9*1), DI
11330	MOVL R10, (AX)
11331	MOVL DI, -4(AX)(R9*1)
11332	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
11333
11334emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
11335	MOVQ (DI), R10
11336	MOVQ -8(DI)(R9*1), DI
11337	MOVQ R10, (AX)
11338	MOVQ DI, -8(AX)(R9*1)
11339	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
11340
11341emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
11342	MOVOU (DI), X0
11343	MOVOU -16(DI)(R9*1), X1
11344	MOVOU X0, (AX)
11345	MOVOU X1, -16(AX)(R9*1)
11346	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
11347
11348emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
11349	MOVOU (DI), X0
11350	MOVOU 16(DI), X1
11351	MOVOU -32(DI)(R9*1), X2
11352	MOVOU -16(DI)(R9*1), X3
11353	MOVOU X0, (AX)
11354	MOVOU X1, 16(AX)
11355	MOVOU X2, -32(AX)(R9*1)
11356	MOVOU X3, -16(AX)(R9*1)
11357
11358memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
11359	MOVQ R8, AX // advance dst cursor past the literal
11360	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm10B
11361
11362memmove_long_match_emit_encodeSnappyBlockAsm10B:
11363	LEAQ (AX)(R9*1), R8 // R8 = dst cursor after the literal bytes
11364
11365	// genMemMoveLong
	// Copies R9 (> 64) bytes from (DI) to (AX). The first and last 32 bytes are
	// loaded up front (X0-X3) and stored last with unaligned stores; the middle
	// is copied in 32-byte steps with aligned stores.
11366	MOVOU (DI), X0
11367	MOVOU 16(DI), X1
11368	MOVOU -32(DI)(R9*1), X2
11369	MOVOU -16(DI)(R9*1), X3
11370	MOVQ  R9, R11
11371	SHRQ  $0x05, R11 // R11 = length / 32
11372	MOVQ  AX, R10
11373	ANDL  $0x0000001f, R10 // R10 = dst & 31
11374	MOVQ  $0x00000040, R12
11375	SUBQ  R10, R12 // R12 = 64 - (dst & 31), so AX + R12 - 32 is 32-byte aligned
11376	DECQ  R11
	// DECQ does not modify CF, so JA sees CF from the SUBQ above and ZF from the DECQ.
11377	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
11378	LEAQ  -32(DI)(R12*1), R10 // R10 = src pointer matching the aligned dst pointer
11379	LEAQ  -32(AX)(R12*1), R13 // R13 = first 32-byte aligned dst address past AX
11380
11381emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
11382	MOVOU (R10), X4
11383	MOVOU 16(R10), X5
11384	MOVOA X4, (R13)
11385	MOVOA X5, 16(R13)
11386	ADDQ  $0x20, R13
11387	ADDQ  $0x20, R10
11388	ADDQ  $0x20, R12
11389	DECQ  R11
11390	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back // CF comes from the ADDQ, ZF from the DECQ
11391
11392emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
11393	MOVOU -32(DI)(R12*1), X4
11394	MOVOU -16(DI)(R12*1), X5
11395	MOVOA X4, -32(AX)(R12*1)
11396	MOVOA X5, -16(AX)(R12*1)
11397	ADDQ  $0x20, R12
11398	CMPQ  R9, R12
11399	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 // continue while length >= R12
11400	MOVOU X0, (AX) // finally write the saved head ...
11401	MOVOU X1, 16(AX)
11402	MOVOU X2, -32(AX)(R9*1) // ... and the saved tail
11403	MOVOU X3, -16(AX)(R9*1)
11404	MOVQ  R8, AX // advance dst cursor past the literal
11405
11406emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
11407match_nolit_loop_encodeSnappyBlockAsm10B: // CX = match start, SI = candidate start; first 4 bytes already compared equal
11408	MOVL CX, DI
11409	SUBL SI, DI // DI = offset = s - candidate
11410	MOVL DI, 16(SP) // repeat = offset
11411	ADDL $0x04, CX // skip the 4 bytes already known to match
11412	ADDL $0x04, SI
11413	MOVQ src_len+32(FP), DI
11414	SUBL CX, DI // DI = bytes left in src from s
11415	LEAQ (DX)(CX*1), R8 // R8 = &src[s]
11416	LEAQ (DX)(SI*1), SI // SI = &src[candidate]
11417
11418	// matchLen
	// R10 = number of equal bytes at R8 and SI, at most DI.
11419	XORL R10, R10
11420	CMPL DI, $0x08
11421	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm10B
11422
11423matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: // compare 8 bytes at a time
11424	MOVQ  (R8)(R10*1), R9
11425	XORQ  (SI)(R10*1), R9 // non-zero bits mark differing positions
11426	TESTQ R9, R9
11427	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm10B
11428	BSFQ  R9, R9 // index of the lowest differing bit
11429	SARQ  $0x03, R9 // bit index / 8 = number of equal leading bytes
11430	LEAL  (R10)(R9*1), R10
11431	JMP   match_nolit_end_encodeSnappyBlockAsm10B
11432
11433matchlen_loop_match_nolit_encodeSnappyBlockAsm10B:
11434	LEAL -8(DI), DI
11435	LEAL 8(R10), R10
11436	CMPL DI, $0x08
11437	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B
11438
11439matchlen_single_match_nolit_encodeSnappyBlockAsm10B: // fewer than 8 bytes left: compare byte by byte
11440	TESTL DI, DI
11441	JZ    match_nolit_end_encodeSnappyBlockAsm10B
11442
11443matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B:
11444	MOVB (R8)(R10*1), R9
11445	CMPB (SI)(R10*1), R9
11446	JNE  match_nolit_end_encodeSnappyBlockAsm10B
11447	LEAL 1(R10), R10
11448	DECL DI
11449	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B
11450
11451match_nolit_end_encodeSnappyBlockAsm10B:
11452	ADDL R10, CX // s = end of the match
11453	MOVL 16(SP), SI // SI = offset
11454	ADDL $0x04, R10 // R10 = total match length including the first 4 bytes
11455	MOVL CX, 12(SP) // nextEmit = s
11456
11457	// emitCopy
	// Emits copy elements for length R10 at offset SI.
11458two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
11459	CMPL R10, $0x40
11460	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
11461	MOVB $0xee, (AX) // 0xee = (60-1)<<2 | 2: length-60 copy with 16-bit offset
11462	MOVW SI, 1(AX)
11463	LEAL -60(R10), R10 // 60 bytes consumed; loop until length <= 64
11464	ADDQ $0x03, AX
11465	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm10B
11466
11467two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
11468	CMPL R10, $0x0c
11469	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm10B // length >= 12 needs the 3-byte form
11470	CMPL SI, $0x00000800
11471	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm10B // offset >= 2048 needs the 3-byte form
11472	MOVB $0x01, BL
11473	LEAL -16(BX)(R10*4), R10 // low byte = (length-4)<<2 | 1; only the low byte is stored below
11474	MOVB SI, 1(AX) // low 8 bits of the offset
11475	SHRL $0x08, SI
11476	SHLL $0x05, SI // offset bits above bit 7 go into tag bits 5 and up
11477	ORL  SI, R10
11478	MOVB R10, (AX)
11479	ADDQ $0x02, AX
11480	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm10B
11481
11482emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
11483	MOVB $0x02, BL
11484	LEAL -4(BX)(R10*4), R10 // low byte = (length-1)<<2 | 2
11485	MOVB R10, (AX)
11486	MOVW SI, 1(AX) // 16-bit offset
11487	ADDQ $0x03, AX
11488
11489match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
11490	CMPL CX, 8(SP)
11491	JGE  emit_remainder_encodeSnappyBlockAsm10B // s >= sLimit: finish up
11492	MOVQ -2(DX)(CX*1), DI // DI = 8 bytes at src[s-2]
11493	CMPQ AX, (SP)
11494	JL   match_nolit_dst_ok_encodeSnappyBlockAsm10B
11495	MOVQ $0x00000000, ret+48(FP) // dst cursor reached the limit: return 0
11496	RET
11497
11498match_nolit_dst_ok_encodeSnappyBlockAsm10B: // index s-2 and s, then test s for an immediate follow-on match
11499	MOVQ  $0x9e3779b1, R9 // hash multiplier
11500	MOVQ  DI, R8 // R8 = bytes starting at s-2
11501	SHRQ  $0x10, DI // DI = bytes starting at s
11502	MOVQ  DI, SI
11503	SHLQ  $0x20, R8
11504	IMULQ R9, R8
11505	SHRQ  $0x36, R8 // R8 = hash of the 4 bytes at s-2
11506	SHLQ  $0x20, SI
11507	IMULQ R9, SI
11508	SHRQ  $0x36, SI // SI = hash of the 4 bytes at s
11509	LEAL  -2(CX), R9
11510	LEAQ  24(SP)(SI*4), R10 // R10 = &table[hash(s)]
11511	MOVL  (R10), SI // SI = previous candidate for s
11512	MOVL  R9, 24(SP)(R8*4) // table[hash(s-2)] = s-2
11513	MOVL  CX, (R10) // table[hash(s)] = s
11514	CMPL  (DX)(SI*1), DI // 4 bytes at the candidate vs 4 bytes at s
11515	JEQ   match_nolit_loop_encodeSnappyBlockAsm10B // another match with no literals in between
11516	INCL  CX // s++
11517	JMP   search_loop_encodeSnappyBlockAsm10B
11518
11519emit_remainder_encodeSnappyBlockAsm10B: // flush src[nextEmit:] as one final literal
11520	MOVQ src_len+32(FP), CX
11521	SUBL 12(SP), CX // CX = len(src) - nextEmit
11522	LEAQ 3(AX)(CX*1), CX // dst cursor + 3 header bytes + literal length
11523	CMPQ CX, (SP)
11524	JL   emit_remainder_ok_encodeSnappyBlockAsm10B
11525	MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
11526	RET
11527
11528emit_remainder_ok_encodeSnappyBlockAsm10B:
11529	MOVQ src_len+32(FP), CX
11530	MOVL 12(SP), BX // BX = nextEmit
11531	CMPL BX, CX
11532	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B // nothing left to emit
11533	MOVL CX, SI
11534	MOVL CX, 12(SP) // nextEmit = len(src)
11535	LEAQ (DX)(BX*1), CX // CX = pointer to the literal bytes
11536	SUBL BX, SI // SI = literal length
11537	LEAL -1(SI), DX // DX = length - 1; the src base in DX is no longer needed
11538	CMPL DX, $0x3c
11539	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm10B // length-1 < 60: fits in the tag byte
11540	CMPL DX, $0x00000100
11541	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm10B // length-1 < 256: tag + 1 length byte
11542	MOVB $0xf4, (AX) // tag 0xf4 followed by a 16-bit length-1
11543	MOVW DX, 1(AX)
11544	ADDQ $0x03, AX
11545	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm10B
11546
11547two_bytes_emit_remainder_encodeSnappyBlockAsm10B:
11548	MOVB $0xf0, (AX) // tag 0xf0 followed by an 8-bit length-1
11549	MOVB DL, 1(AX)
11550	ADDQ $0x02, AX
11551	CMPL DX, $0x40
11552	JL   memmove_emit_remainder_encodeSnappyBlockAsm10B // length <= 64: short copy
11553	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm10B
11554
11555one_byte_emit_remainder_encodeSnappyBlockAsm10B:
11556	SHLB $0x02, DL // tag = (length-1) << 2
11557	MOVB DL, (AX)
11558	ADDQ $0x01, AX
11559
11560memmove_emit_remainder_encodeSnappyBlockAsm10B:
11561	LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal bytes
11562	MOVL SI, BX // BX = literal length
11563
11564	// genMemMoveShort
	// Copies BX (1..64) bytes from (CX) to (AX) using a head load and a tail
	// load that may overlap; all loads are done before the stores.
11565	CMPQ BX, $0x03
11566	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2
11567	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3
11568	CMPQ BX, $0x08
11569	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7
11570	CMPQ BX, $0x10
11571	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
11572	CMPQ BX, $0x20
11573	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
11574	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64
11575
11576emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2:
11577	MOVB (CX), SI
11578	MOVB -1(CX)(BX*1), CL
11579	MOVB SI, (AX)
11580	MOVB CL, -1(AX)(BX*1)
11581	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
11582
11583emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3:
11584	MOVW (CX), SI
11585	MOVB 2(CX), CL
11586	MOVW SI, (AX)
11587	MOVB CL, 2(AX)
11588	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
11589
11590emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7:
11591	MOVL (CX), SI
11592	MOVL -4(CX)(BX*1), CX
11593	MOVL SI, (AX)
11594	MOVL CX, -4(AX)(BX*1)
11595	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
11596
11597emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16:
11598	MOVQ (CX), SI
11599	MOVQ -8(CX)(BX*1), CX
11600	MOVQ SI, (AX)
11601	MOVQ CX, -8(AX)(BX*1)
11602	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
11603
11604emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32:
11605	MOVOU (CX), X0
11606	MOVOU -16(CX)(BX*1), X1
11607	MOVOU X0, (AX)
11608	MOVOU X1, -16(AX)(BX*1)
11609	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
11610
11611emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64:
11612	MOVOU (CX), X0
11613	MOVOU 16(CX), X1
11614	MOVOU -32(CX)(BX*1), X2
11615	MOVOU -16(CX)(BX*1), X3
11616	MOVOU X0, (AX)
11617	MOVOU X1, 16(AX)
11618	MOVOU X2, -32(AX)(BX*1)
11619	MOVOU X3, -16(AX)(BX*1)
11620
11621memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B:
11622	MOVQ DX, AX // advance dst cursor past the literal
11623	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
11624
11625memmove_long_emit_remainder_encodeSnappyBlockAsm10B:
11626	LEAQ (AX)(SI*1), DX // DX = dst cursor after the literal bytes
11627	MOVL SI, BX // BX = literal length
11628
11629	// genMemMoveLong
	// Copies BX (> 64) bytes from (CX) to (AX). The first and last 32 bytes are
	// loaded up front (X0-X3) and stored last with unaligned stores; the middle
	// is copied in 32-byte steps with aligned stores.
11630	MOVOU (CX), X0
11631	MOVOU 16(CX), X1
11632	MOVOU -32(CX)(BX*1), X2
11633	MOVOU -16(CX)(BX*1), X3
11634	MOVQ  BX, DI
11635	SHRQ  $0x05, DI // DI = length / 32
11636	MOVQ  AX, SI
11637	ANDL  $0x0000001f, SI // SI = dst & 31
11638	MOVQ  $0x00000040, R8
11639	SUBQ  SI, R8 // R8 = 64 - (dst & 31), so AX + R8 - 32 is 32-byte aligned
11640	DECQ  DI
	// DECQ does not modify CF, so JA sees CF from the SUBQ above and ZF from the DECQ.
11641	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
11642	LEAQ  -32(CX)(R8*1), SI // SI = src pointer matching the aligned dst pointer
11643	LEAQ  -32(AX)(R8*1), R9 // R9 = first 32-byte aligned dst address past AX
11644
11645emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back:
11646	MOVOU (SI), X4
11647	MOVOU 16(SI), X5
11648	MOVOA X4, (R9)
11649	MOVOA X5, 16(R9)
11650	ADDQ  $0x20, R9
11651	ADDQ  $0x20, SI
11652	ADDQ  $0x20, R8
11653	DECQ  DI
11654	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back // CF comes from the ADDQ, ZF from the DECQ
11655
11656emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
11657	MOVOU -32(CX)(R8*1), X4
11658	MOVOU -16(CX)(R8*1), X5
11659	MOVOA X4, -32(AX)(R8*1)
11660	MOVOA X5, -16(AX)(R8*1)
11661	ADDQ  $0x20, R8
11662	CMPQ  BX, R8
11663	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 // continue while length >= R8
11664	MOVOU X0, (AX) // finally write the saved head ...
11665	MOVOU X1, 16(AX)
11666	MOVOU X2, -32(AX)(BX*1) // ... and the saved tail
11667	MOVOU X3, -16(AX)(BX*1)
11668	MOVQ  DX, AX // advance dst cursor past the literal
11669
11670emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
11671	MOVQ dst_base+0(FP), CX
11672	SUBQ CX, AX // AX = bytes written = dst cursor - dst_base
11673	MOVQ AX, ret+48(FP)
11674	RET
11675
// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
// Requires: SSE2
//
// Encodes src into dst as a sequence of literal and copy elements and
// returns the number of bytes written (final write pointer minus dst_base).
// Returns 0 when the output would reach the destination limit computed in
// the prologue. Presumably the caller guarantees a minimum src length and a
// dst large enough for the unchecked stores below; verify against the Go
// wrapper.
//
// Frame layout (1048 bytes of locals):
//   0(SP)   8 bytes     destination limit pointer
//   8(SP)   4 bytes     search limit (src_len - 8)
//   12(SP)  4 bytes     nextEmit: start of the source bytes not yet written out
//   16(SP)  4 bytes     repeat: offset of the most recent match (starts at 1)
//   20(SP)  4 bytes     position the search resumes from when nothing matches
//   24(SP)  1024 bytes  table of 256 32-bit source positions, indexed by an 8-bit hash
//
// Register roles in the main loop: AX = dst write pointer, CX = current
// source position (s), DX = src base pointer. From emit_remainder_ok onward
// CX and DX are reused as scratch. BX is only ever written through BL.
TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000008, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear the table at 24(SP): 8 iterations of 128 bytes = 1024 bytes.
zero_loop_encodeSnappyBlockAsm8B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ  $0x80, DX
	DECQ  CX
	JNZ   zero_loop_encodeSnappyBlockAsm8B
	// Initialise state:
	//   nextEmit     = 0
	//   search limit = src_len - 8
	//   dst limit    = dst + (src_len - 5 - src_len>>5)
	//   s = 1, repeat = 1
	MOVL  $0x00000000, 12(SP)
	MOVQ  src_len+32(FP), CX
	LEAQ  -5(CX), DX
	LEAQ  -8(CX), SI
	MOVL  SI, 8(SP)
	SHRQ  $0x05, CX
	SUBL  CX, DX
	LEAQ  (AX)(DX*1), DX
	MOVQ  DX, (SP)
	MOVL  $0x00000001, CX
	MOVL  CX, 16(SP)
	MOVQ  src_base+24(FP), DX

	// Main search loop. nextS = s + 4 + ((s - nextEmit) >> 4); once nextS
	// reaches the search limit only the trailing literals remain.
search_loop_encodeSnappyBlockAsm8B:
	MOVL  CX, SI
	SUBL  12(SP), SI
	SHRL  $0x04, SI
	LEAL  4(CX)(SI*1), SI
	CMPL  SI, 8(SP)
	JGE   emit_remainder_encodeSnappyBlockAsm8B
	// DI = 8 source bytes at s; nextS is parked in 20(SP).
	// hash(x) = ((x << 32) * 0x9e3779b1) >> 56, an 8-bit hash of the low
	// 4 bytes of x. R10 = hash of bytes at s, R11 = hash of bytes at s+1.
	MOVQ  (DX)(CX*1), DI
	MOVL  SI, 20(SP)
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R10
	MOVQ  DI, R11
	SHRQ  $0x08, R11
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x38, R10
	SHLQ  $0x20, R11
	IMULQ R9, R11
	SHRQ  $0x38, R11
	// SI = candidate for s, R8 = candidate for s+1; then record s and s+1
	// in their table slots.
	MOVL  24(SP)(R10*4), SI
	MOVL  24(SP)(R11*4), R8
	MOVL  CX, 24(SP)(R10*4)
	LEAL  1(CX), R10
	MOVL  R10, 24(SP)(R11*4)
	// R10 = hash of the bytes at s+2 (used by no_repeat_found).
	MOVQ  DI, R10
	SHRQ  $0x10, R10
	SHLQ  $0x20, R10
	IMULQ R9, R10
	SHRQ  $0x38, R10
	// Repeat check: compare the 4 bytes at s+1 with the 4 bytes at
	// s+1-repeat.
	MOVL  CX, R9
	SUBL  16(SP), R9
	MOVL  1(DX)(R9*1), R11
	MOVQ  DI, R9
	SHRQ  $0x08, R9
	CMPL  R9, R11
	JNE   no_repeat_found_encodeSnappyBlockAsm8B
	// Repeat found. DI = base = s+1, SI = nextEmit, R8 = base - repeat.
	LEAL  1(CX), DI
	MOVL  12(SP), SI
	MOVL  DI, R8
	SUBL  16(SP), R8
	JZ    repeat_extend_back_end_encodeSnappyBlockAsm8B

	// Extend the repeat backwards while base > nextEmit, the candidate
	// position is > 0 and the preceding bytes are equal.
repeat_extend_back_loop_encodeSnappyBlockAsm8B:
	CMPL DI, SI
	JLE  repeat_extend_back_end_encodeSnappyBlockAsm8B
	MOVB -1(DX)(R8*1), BL
	MOVB -1(DX)(DI*1), R9
	CMPB BL, R9
	JNE  repeat_extend_back_end_encodeSnappyBlockAsm8B
	LEAL -1(DI), DI
	DECL R8
	JNZ  repeat_extend_back_loop_encodeSnappyBlockAsm8B

	// Emit src[nextEmit:base] as a literal (skipped when empty) and set
	// nextEmit = base. R9 = source pointer, R8 = length, SI = length-1.
	// Literal header: length-1 < 60 -> one byte ((length-1)<<2);
	// length-1 < 256 -> 0xf0 plus one length byte; otherwise 0xf4 plus a
	// 16-bit length.
repeat_extend_back_end_encodeSnappyBlockAsm8B:
	MOVL 12(SP), SI
	CMPL SI, DI
	JEQ  emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
	MOVL DI, R8
	MOVL DI, 12(SP)
	LEAQ (DX)(SI*1), R9
	SUBL SI, R8
	LEAL -1(R8), SI
	CMPL SI, $0x3c
	JLT  one_byte_repeat_emit_encodeSnappyBlockAsm8B
	CMPL SI, $0x00000100
	JLT  two_bytes_repeat_emit_encodeSnappyBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm8B

	// Two-byte header; lengths up to 64 still use the short copy.
two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_repeat_emit_encodeSnappyBlockAsm8B
	JMP  memmove_long_repeat_emit_encodeSnappyBlockAsm8B

one_byte_repeat_emit_encodeSnappyBlockAsm8B:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, AX

	// SI = dst pointer after the literal body.
memmove_repeat_emit_encodeSnappyBlockAsm8B:
	LEAQ (AX)(R8*1), SI

	// genMemMoveShort
	// Copies R8 bytes from (R9) to (AX), dispatching on size class. Each
	// class loads the first and last chunk before storing, so the two
	// chunks may overlap.
	CMPQ R8, $0x03
	JB   emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_1or2
	JE   emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_3
	CMPQ R8, $0x08
	JB   emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_4through7
	CMPQ R8, $0x10
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_1or2:
	MOVB (R9), R10
	MOVB -1(R9)(R8*1), R9
	MOVB R10, (AX)
	MOVB R9, -1(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_3:
	MOVW (R9), R10
	MOVB 2(R9), R9
	MOVW R10, (AX)
	MOVB R9, 2(AX)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_4through7:
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (AX)
	MOVL R9, -4(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP  memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP   memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
	MOVQ SI, AX
	JMP  emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B

memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
	LEAQ (AX)(R8*1), SI

	// genMemMoveLong
	// Copies R8 bytes from (R9) to (AX). The first and last 32 bytes are
	// held in X0-X3 and stored last; the middle is copied in 32-byte
	// steps with MOVOA stores starting at offset R12 = 64 - (AX & 31), so
	// that AX + R12 - 32 is 32-byte aligned.
	// NOTE(review): this path is entered with R8 >= 65, so R11 >= 2 and
	// the JA after DECQ appears to be always taken, leaving the
	// big_loop_back block unreached from here - confirm against the
	// generator.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ  R8, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(R9)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back

	// Copy [R12-32, R12) per iteration while R12 <= R8, then write the
	// saved head and tail.
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R8, R12
	JAE   emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ  SI, AX

	// Literals are out. Step s past the 4 bytes already known to match
	// (s += 5, since the match started at s+1), then extend forward:
	// R9 = &src[s], SI = &src[s - repeat], R8 = src_len - s.
emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
	ADDL $0x05, CX
	MOVL CX, SI
	SUBL 16(SP), SI
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R11 = number of equal leading bytes of (R9) and (SI), at most R8.
	XORL R11, R11
	CMPL R8, $0x08
	JL   matchlen_single_repeat_extend_encodeSnappyBlockAsm8B

	// Compare 8 bytes at a time; on a difference the lowest set bit of
	// the XOR, divided by 8, is the number of matching bytes.
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B:
	MOVQ  (R9)(R11*1), R10
	XORQ  (SI)(R11*1), R10
	TESTQ R10, R10
	JZ    matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B
	BSFQ  R10, R10
	SARQ  $0x03, R10
	LEAL  (R11)(R10*1), R11
	JMP   repeat_extend_forward_end_encodeSnappyBlockAsm8B

matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B:
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	CMPL R8, $0x08
	JGE  matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B

	// Fewer than 8 bytes left: finish byte by byte.
matchlen_single_repeat_extend_encodeSnappyBlockAsm8B:
	TESTL R8, R8
	JZ    repeat_extend_forward_end_encodeSnappyBlockAsm8B

matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B:
	MOVB (R9)(R11*1), R10
	CMPB (SI)(R11*1), R10
	JNE  repeat_extend_forward_end_encodeSnappyBlockAsm8B
	LEAL 1(R11), R11
	DECL R8
	JNZ  matchlen_single_loopback_repeat_extend_encodeSnappyBlockAsm8B

	// s += matched; SI = length = s - base; DI = offset = repeat.
repeat_extend_forward_end_encodeSnappyBlockAsm8B:
	ADDL R11, CX
	MOVL CX, SI
	SUBL DI, SI
	MOVL 16(SP), DI

	// emitCopy
	// While length > 64: write 0xee followed by the 16-bit offset (3
	// bytes) and subtract 60 from the length.
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
	CMPL SI, $0x40
	JLE  two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(SI), SI
	ADDQ $0x03, AX
	JMP  two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B

	// length < 12: two-byte form, tag = (offset>>8)<<5 | (length-4)<<2 | 1
	// followed by the low offset byte. Only BL is initialised; the LEAL
	// result is consumed through its low byte only.
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
	CMPL SI, $0x0c
	JGE  emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
	MOVB $0x01, BL
	LEAL -16(BX)(SI*4), SI
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL  DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP  repeat_end_emit_encodeSnappyBlockAsm8B

	// length >= 12: three-byte form, tag = (length-1)<<2 | 2 followed by
	// the 16-bit offset.
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
	MOVB $0x02, BL
	LEAL -4(BX)(SI*4), SI
	MOVB SI, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

	// nextEmit = s; resume searching.
repeat_end_emit_encodeSnappyBlockAsm8B:
	MOVL CX, 12(SP)
	JMP  search_loop_encodeSnappyBlockAsm8B

	// No repeat. Try the table candidates for s (SI), s+1 (R8) and s+2
	// (table slot R10) in turn, comparing 4 source bytes each time; DI is
	// shifted right by one byte between attempts. If none match, s = nextS.
no_repeat_found_encodeSnappyBlockAsm8B:
	CMPL (DX)(SI*1), DI
	JEQ  candidate_match_encodeSnappyBlockAsm8B
	SHRQ $0x08, DI
	MOVL 24(SP)(R10*4), SI
	LEAL 2(CX), R9
	CMPL (DX)(R8*1), DI
	JEQ  candidate2_match_encodeSnappyBlockAsm8B
	MOVL R9, 24(SP)(R10*4)
	SHRQ $0x08, DI
	CMPL (DX)(SI*1), DI
	JEQ  candidate3_match_encodeSnappyBlockAsm8B
	MOVL 20(SP), CX
	JMP  search_loop_encodeSnappyBlockAsm8B

	// Match at s+2: SI already holds that candidate.
candidate3_match_encodeSnappyBlockAsm8B:
	ADDL $0x02, CX
	JMP  candidate_match_encodeSnappyBlockAsm8B

	// Match at s+1: still record s+2 in the table, then s++ and
	// candidate = R8.
candidate2_match_encodeSnappyBlockAsm8B:
	MOVL R9, 24(SP)(R10*4)
	INCL CX
	MOVL R8, SI

	// CX = s, SI = candidate. Extend the match backwards while
	// s > nextEmit, candidate > 0 and the preceding bytes are equal.
candidate_match_encodeSnappyBlockAsm8B:
	MOVL  12(SP), DI
	TESTL SI, SI
	JZ    match_extend_back_end_encodeSnappyBlockAsm8B

match_extend_back_loop_encodeSnappyBlockAsm8B:
	CMPL CX, DI
	JLE  match_extend_back_end_encodeSnappyBlockAsm8B
	MOVB -1(DX)(SI*1), BL
	MOVB -1(DX)(CX*1), R8
	CMPB BL, R8
	JNE  match_extend_back_end_encodeSnappyBlockAsm8B
	LEAL -1(CX), CX
	DECL SI
	JZ   match_extend_back_end_encodeSnappyBlockAsm8B
	JMP  match_extend_back_loop_encodeSnappyBlockAsm8B

	// Output space check: AX + (s - nextEmit) + 3 must be below the dst
	// limit, otherwise return 0.
match_extend_back_end_encodeSnappyBlockAsm8B:
	MOVL CX, DI
	SUBL 12(SP), DI
	LEAQ 3(AX)(DI*1), DI
	CMPQ DI, (SP)
	JL   match_dst_size_check_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

	// Emit src[nextEmit:s] as a literal (skipped when empty) and set
	// nextEmit = s. DI = source pointer, R9 = length, R8 = length-1.
	// Header selection is the same as in the repeat path above.
match_dst_size_check_encodeSnappyBlockAsm8B:
	MOVL CX, DI
	MOVL 12(SP), R8
	CMPL R8, DI
	JEQ  emit_literal_done_match_emit_encodeSnappyBlockAsm8B
	MOVL DI, R9
	MOVL DI, 12(SP)
	LEAQ (DX)(R8*1), DI
	SUBL R8, R9
	LEAL -1(R9), R8
	CMPL R8, $0x3c
	JLT  one_byte_match_emit_encodeSnappyBlockAsm8B
	CMPL R8, $0x00000100
	JLT  two_bytes_match_emit_encodeSnappyBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm8B

two_bytes_match_emit_encodeSnappyBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB R8, 1(AX)
	ADDQ $0x02, AX
	CMPL R8, $0x40
	JL   memmove_match_emit_encodeSnappyBlockAsm8B
	JMP  memmove_long_match_emit_encodeSnappyBlockAsm8B

one_byte_match_emit_encodeSnappyBlockAsm8B:
	SHLB $0x02, R8
	MOVB R8, (AX)
	ADDQ $0x01, AX

	// R8 = dst pointer after the literal body.
memmove_match_emit_encodeSnappyBlockAsm8B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveShort
	// Copies R9 bytes from (DI) to (AX), dispatching on size class.
	CMPQ R9, $0x03
	JB   emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_1or2
	JE   emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_3
	CMPQ R9, $0x08
	JB   emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_4through7
	CMPQ R9, $0x10
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ R9, $0x20
	JBE  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_1or2:
	MOVB (DI), R10
	MOVB -1(DI)(R9*1), DI
	MOVB R10, (AX)
	MOVB DI, -1(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_3:
	MOVW (DI), R10
	MOVB 2(DI), DI
	MOVW R10, (AX)
	MOVB DI, 2(AX)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_4through7:
	MOVL (DI), R10
	MOVL -4(DI)(R9*1), DI
	MOVL R10, (AX)
	MOVL DI, -4(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
	MOVQ (DI), R10
	MOVQ -8(DI)(R9*1), DI
	MOVQ R10, (AX)
	MOVQ DI, -8(AX)(R9*1)
	JMP  memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
	MOVOU (DI), X0
	MOVOU -16(DI)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP   memmove_end_copy_match_emit_encodeSnappyBlockAsm8B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
	MOVQ R8, AX
	JMP  emit_literal_done_match_emit_encodeSnappyBlockAsm8B

memmove_long_match_emit_encodeSnappyBlockAsm8B:
	LEAQ (AX)(R9*1), R8

	// genMemMoveLong
	// Copies R9 bytes from (DI) to (AX): head and tail (32 bytes each)
	// are held in X0-X3 and stored last; the middle is copied in 32-byte
	// steps with MOVOA stores from offset R12 = 64 - (AX & 31).
	MOVOU (DI), X0
	MOVOU 16(DI), X1
	MOVOU -32(DI)(R9*1), X2
	MOVOU -16(DI)(R9*1), X3
	MOVQ  R9, R11
	SHRQ  $0x05, R11
	MOVQ  AX, R10
	ANDL  $0x0000001f, R10
	MOVQ  $0x00000040, R12
	SUBQ  R10, R12
	DECQ  R11
	JA    emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(DI)(R12*1), R10
	LEAQ  -32(AX)(R12*1), R13

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ  $0x20, R13
	ADDQ  $0x20, R10
	ADDQ  $0x20, R12
	DECQ  R11
	JNA   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(DI)(R12*1), X4
	MOVOU -16(DI)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ  $0x20, R12
	CMPQ  R9, R12
	JAE   emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ  R8, AX

	// Match without pending literals. CX = s, SI = candidate.
	// repeat = s - candidate; both positions skip the 4 bytes already
	// compared; then R8 = &src[s], SI = &src[candidate], DI = src_len - s.
emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
match_nolit_loop_encodeSnappyBlockAsm8B:
	MOVL CX, DI
	SUBL SI, DI
	MOVL DI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, SI
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(SI*1), SI

	// matchLen
	// R10 = number of equal leading bytes of (R8) and (SI), at most DI.
	XORL R10, R10
	CMPL DI, $0x08
	JL   matchlen_single_match_nolit_encodeSnappyBlockAsm8B

matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B:
	MOVQ  (R8)(R10*1), R9
	XORQ  (SI)(R10*1), R9
	TESTQ R9, R9
	JZ    matchlen_loop_match_nolit_encodeSnappyBlockAsm8B
	BSFQ  R9, R9
	SARQ  $0x03, R9
	LEAL  (R10)(R9*1), R10
	JMP   match_nolit_end_encodeSnappyBlockAsm8B

matchlen_loop_match_nolit_encodeSnappyBlockAsm8B:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE  matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B

matchlen_single_match_nolit_encodeSnappyBlockAsm8B:
	TESTL DI, DI
	JZ    match_nolit_end_encodeSnappyBlockAsm8B

matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B:
	MOVB (R8)(R10*1), R9
	CMPB (SI)(R10*1), R9
	JNE  match_nolit_end_encodeSnappyBlockAsm8B
	LEAL 1(R10), R10
	DECL DI
	JNZ  matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B

	// s += matched; SI = offset = repeat; R10 = length = matched + 4;
	// nextEmit = s.
match_nolit_end_encodeSnappyBlockAsm8B:
	ADDL R10, CX
	MOVL 16(SP), SI
	ADDL $0x04, R10
	MOVL CX, 12(SP)

	// emitCopy
	// Same encoding as in the repeat path: 0xee + 16-bit offset for each
	// 60 bytes while length > 64, then a two- or three-byte copy.
two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
	CMPL R10, $0x40
	JLE  two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX
	JMP  two_byte_offset_match_nolit_encodeSnappyBlockAsm8B

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
	CMPL R10, $0x0c
	JGE  emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
	MOVB $0x01, BL
	LEAL -16(BX)(R10*4), R10
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL  SI, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP  match_nolit_emitcopy_end_encodeSnappyBlockAsm8B

emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
	MOVB $0x02, BL
	LEAL -4(BX)(R10*4), R10
	MOVB R10, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

	// Stop searching once s reaches the search limit. Otherwise load the
	// 8 bytes at s-2 and return 0 if the write pointer has reached the
	// dst limit.
match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
	CMPL CX, 8(SP)
	JGE  emit_remainder_encodeSnappyBlockAsm8B
	MOVQ -2(DX)(CX*1), DI
	CMPQ AX, (SP)
	JL   match_nolit_dst_ok_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

	// R8 = hash of the bytes at s-2, SI = hash of the bytes at s (DI is
	// shifted so that it holds the bytes at s). Record s-2 and s in the
	// table; the previous entry for s becomes the candidate. If its 4
	// bytes equal those at s, continue directly with another match.
match_nolit_dst_ok_encodeSnappyBlockAsm8B:
	MOVQ  $0x9e3779b1, R9
	MOVQ  DI, R8
	SHRQ  $0x10, DI
	MOVQ  DI, SI
	SHLQ  $0x20, R8
	IMULQ R9, R8
	SHRQ  $0x38, R8
	SHLQ  $0x20, SI
	IMULQ R9, SI
	SHRQ  $0x38, SI
	LEAL  -2(CX), R9
	LEAQ  24(SP)(SI*4), R10
	MOVL  (R10), SI
	MOVL  R9, 24(SP)(R8*4)
	MOVL  CX, (R10)
	CMPL  (DX)(SI*1), DI
	JEQ   match_nolit_loop_encodeSnappyBlockAsm8B
	INCL  CX
	JMP   search_loop_encodeSnappyBlockAsm8B

	// Trailing literals. Return 0 unless
	// AX + (src_len - nextEmit) + 3 is below the dst limit.
emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL   emit_remainder_ok_encodeSnappyBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

	// Emit src[nextEmit:src_len] as a literal (skipped when empty).
	// From here CX = source pointer, SI = length, DX = length-1 (the src
	// base in DX is no longer needed and is overwritten).
emit_remainder_ok_encodeSnappyBlockAsm8B:
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ  emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT  one_byte_emit_remainder_encodeSnappyBlockAsm8B
	CMPL DX, $0x00000100
	JLT  two_bytes_emit_remainder_encodeSnappyBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm8B

two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL   memmove_emit_remainder_encodeSnappyBlockAsm8B
	JMP  memmove_long_emit_remainder_encodeSnappyBlockAsm8B

one_byte_emit_remainder_encodeSnappyBlockAsm8B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

	// DX = dst pointer after the literal body, BX = length.
memmove_emit_remainder_encodeSnappyBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Copies BX bytes from (CX) to (AX), dispatching on size class.
	CMPQ BX, $0x03
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2
	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP   memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ DX, AX
	JMP  emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B

memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Copies BX bytes from (CX) to (AX): head and tail (32 bytes each)
	// are held in X0-X3 and stored last; the middle is copied in 32-byte
	// steps with MOVOA stores from offset R8 = 64 - (AX & 31).
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ  BX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  BX, R8
	JAE   emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ  DX, AX

	// Return the number of bytes written: AX - dst_base.
emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET
12440
// func emitLiteral(dst []byte, lit []byte) int
// Requires: SSE2
//
// Writes a literal element (header followed by the bytes of lit) to dst and
// returns the total number of bytes written, i.e. len(lit) plus the header
// size. Returns 0 and writes nothing when lit is empty.
//
// Header, selected on n = len(lit)-1:
//   n < 60       1 byte:  n<<2
//   n < 1<<8     2 bytes: 0xf0, n
//   n < 1<<16    3 bytes: 0xf4, n as 16 bits
//   n < 1<<24    4 bytes: 0xf8, n as 24 bits
//   otherwise    5 bytes: 0xfc, n as 32 bits
//
// No bounds checks are made on dst; presumably the caller guarantees enough
// room for the header plus len(lit) bytes - verify against the Go wrapper.
//
// Registers: AX = dst write pointer, CX = lit pointer, DX = len(lit),
// BX = return value (len(lit) + header size), SI = n.
TEXT ·emitLiteral(SB), NOSPLIT, $0-56
	MOVQ  lit_len+32(FP), DX
	MOVQ  dst_base+0(FP), AX
	MOVQ  lit_base+24(FP), CX
	TESTQ DX, DX
	JZ    emit_literal_end_standalone_skip
	MOVL  DX, BX
	LEAL  -1(DX), SI
	CMPL  SI, $0x3c
	JLT   one_byte_standalone
	CMPL  SI, $0x00000100
	JLT   two_bytes_standalone
	CMPL  SI, $0x00010000
	JLT   three_bytes_standalone
	CMPL  SI, $0x01000000
	JLT   four_bytes_standalone
	MOVB  $0xfc, (AX)
	MOVL  SI, 1(AX)
	ADDQ  $0x05, BX
	ADDQ  $0x05, AX
	JMP   memmove_long_standalone

four_bytes_standalone:
	// 24-bit length: low 16 bits via MOVW, bits 16..23 via DI.
	MOVL SI, DI
	SHRL $0x10, DI
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB DI, 3(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  memmove_long_standalone

three_bytes_standalone:
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  memmove_long_standalone

two_bytes_standalone:
	// Lengths up to 64 (n < 64) still fit the short copy.
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	CMPL SI, $0x40
	JL   memmove_standalone
	JMP  memmove_long_standalone

one_byte_standalone:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, BX
	ADDQ $0x01, AX

memmove_standalone:
	// genMemMoveShort
	// Copies DX bytes (1..64) from (CX) to (AX). Each size class loads
	// the first and last chunk before storing; the chunks may overlap.
	CMPQ DX, $0x03
	JB   emit_lit_memmove_standalone_memmove_move_1or2
	JE   emit_lit_memmove_standalone_memmove_move_3
	CMPQ DX, $0x08
	JB   emit_lit_memmove_standalone_memmove_move_4through7
	CMPQ DX, $0x10
	JBE  emit_lit_memmove_standalone_memmove_move_8through16
	CMPQ DX, $0x20
	JBE  emit_lit_memmove_standalone_memmove_move_17through32
	JMP  emit_lit_memmove_standalone_memmove_move_33through64

emit_lit_memmove_standalone_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(DX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(DX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(DX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(DX*1)
	JMP  emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(DX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

memmove_long_standalone:
	// genMemMoveLong
	// Copies DX bytes (65 or more on every path that reaches here) from
	// (CX) to (AX). The first and last 32 bytes are held in X0-X3 and
	// stored last; the middle is copied in 32-byte steps with MOVOA
	// stores from offset R8 = 64 - (AX & 31), so that AX + R8 - 32 is
	// 32-byte aligned.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVQ  DX, DI
	SHRQ  $0x05, DI
	MOVQ  AX, SI
	ANDL  $0x0000001f, SI
	MOVQ  $0x00000040, R8
	SUBQ  SI, R8
	DECQ  DI
	JA    emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
	LEAQ  -32(CX)(R8*1), SI
	LEAQ  -32(AX)(R8*1), R9

emit_lit_memmove_long_standalonelarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ  $0x20, R9
	ADDQ  $0x20, SI
	ADDQ  $0x20, R8
	DECQ  DI
	JNA   emit_lit_memmove_long_standalonelarge_big_loop_back

emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
	// Copy [R8-32, R8) per iteration while R8 <= DX, then write the
	// saved head and tail.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ  $0x20, R8
	CMPQ  DX, R8
	JAE   emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	JMP   emit_literal_end_standalone

emit_literal_end_standalone_skip:
	// Empty literal: nothing written, return 0.
	XORQ BX, BX

emit_literal_end_standalone:
	MOVQ BX, ret+48(FP)
	RET
12606
// func emitRepeat(dst []byte, offset int, length int) int
//
// Writes a repeat element covering length bytes to dst and returns the
// number of bytes written. The shortest of the 2/3/4/5-byte forms is chosen;
// lengths too large for one 5-byte element are split, emitting a maximal
// element and looping on what is left. dst is not bounds-checked.
//
// Registers: AX = dst write pointer, BX = bytes written so far,
// CX = offset, DX = length (length-4 after selection), R8 = scratch.
TEXT ·emitRepeat(SB), NOSPLIT, $0-48
	MOVQ length+32(FP), DX
	MOVQ offset+24(FP), CX
	MOVQ dst_base+0(FP), AX
	XORL BX, BX

	// Select an encoding. R8 keeps the untouched length, DX = length-4.
emitRepeat_select:
	MOVL DX, R8
	LEAL -4(DX), DX
	CMPL R8, $0x08
	JLE  emitRepeat_two
	CMPL R8, $0x0c
	JGE  emitRepeat_wide
	CMPL CX, $0x00000800
	JLT  emitRepeat_two_offset

emitRepeat_wide:
	CMPL DX, $0x00000104
	JLT  emitRepeat_three
	CMPL DX, $0x00010100
	JLT  emitRepeat_four
	CMPL DX, $0x0100ffff
	JLT  emitRepeat_five

	// Too long for one element: write the maximal 5-byte form
	// (1d 00 fb ff ff), keep the remainder in DX and go around again.
	MOVB $0xff, 4(AX)
	MOVW $0xfffb, 2(AX)
	MOVW $0x001d, (AX)
	LEAL -16842747(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  emitRepeat_select

	// length <= 8: 16-bit store of (length-4)<<2 | 1.
emitRepeat_two:
	XORL R8, R8
	LEAL 1(R8)(DX*4), DX
	MOVW DX, (AX)
	ADDQ $0x02, AX
	ADDQ $0x02, BX
	JMP  emitRepeat_done

	// 8 < length < 12 and offset < 2048: two bytes carrying the offset,
	// tag = (offset>>8)<<5 | (length-4)<<2 | 1, then the low offset byte.
emitRepeat_two_offset:
	MOVB CL, 1(AX)
	SHLL $0x02, DX
	ORL  $0x01, DX
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, AX
	ADDQ $0x02, BX
	JMP  emitRepeat_done

	// Three bytes: 15 00, then (length-8) as one byte.
emitRepeat_three:
	MOVW $0x0015, (AX)
	LEAL -4(DX), DX
	MOVB DL, 2(AX)
	ADDQ $0x03, AX
	ADDQ $0x03, BX
	JMP  emitRepeat_done

	// Four bytes: 19 00, then (length-4-256) as 16 bits.
emitRepeat_four:
	MOVW $0x0019, (AX)
	LEAL -256(DX), DX
	MOVW DX, 2(AX)
	ADDQ $0x04, AX
	ADDQ $0x04, BX
	JMP  emitRepeat_done

	// Five bytes: 1d 00, then (length-4-65536) as 24 bits. CX (offset) is
	// reused as scratch for the top byte.
emitRepeat_five:
	MOVW $0x001d, (AX)
	LEAL -65536(DX), DX
	MOVW DX, 2(AX)
	MOVL DX, CX
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX

emitRepeat_done:
	MOVQ BX, ret+40(FP)
	RET
12689
12690// func emitCopy(dst []byte, offset int, length int) int
12691TEXT ·emitCopy(SB), NOSPLIT, $0-48
12692	XORQ BX, BX
12693	MOVQ dst_base+0(FP), AX
12694	MOVQ offset+24(FP), CX
12695	MOVQ length+32(FP), DX
12696
12697	// emitCopy
12698	CMPL CX, $0x00010000
12699	JL   two_byte_offset_standalone
12700
12701four_bytes_loop_back_standalone:
12702	CMPL DX, $0x40
12703	JLE  four_bytes_remain_standalone
12704	MOVB $0xff, (AX)
12705	MOVL CX, 1(AX)
12706	LEAL -64(DX), DX
12707	ADDQ $0x05, BX
12708	ADDQ $0x05, AX
12709	CMPL DX, $0x04
12710	JL   four_bytes_remain_standalone
12711
12712	// emitRepeat
12713emit_repeat_again_standalone_emit_copy:
12714	MOVL DX, SI
12715	LEAL -4(DX), DX
12716	CMPL SI, $0x08
12717	JLE  repeat_two_standalone_emit_copy
12718	CMPL SI, $0x0c
12719	JGE  cant_repeat_two_offset_standalone_emit_copy
12720	CMPL CX, $0x00000800
12721	JLT  repeat_two_offset_standalone_emit_copy
12722
12723cant_repeat_two_offset_standalone_emit_copy:
12724	CMPL DX, $0x00000104
12725	JLT  repeat_three_standalone_emit_copy
12726	CMPL DX, $0x00010100
12727	JLT  repeat_four_standalone_emit_copy
12728	CMPL DX, $0x0100ffff
12729	JLT  repeat_five_standalone_emit_copy
12730	LEAL -16842747(DX), DX
12731	MOVW $0x001d, (AX)
12732	MOVW $0xfffb, 2(AX)
12733	MOVB $0xff, 4(AX)
12734	ADDQ $0x05, AX
12735	ADDQ $0x05, BX
12736	JMP  emit_repeat_again_standalone_emit_copy
12737
12738repeat_five_standalone_emit_copy:
12739	LEAL -65536(DX), DX
12740	MOVL DX, CX
12741	MOVW $0x001d, (AX)
12742	MOVW DX, 2(AX)
12743	SARL $0x10, CX
12744	MOVB CL, 4(AX)
12745	ADDQ $0x05, BX
12746	ADDQ $0x05, AX
12747	JMP  gen_emit_copy_end
12748
12749repeat_four_standalone_emit_copy:
12750	LEAL -256(DX), DX
12751	MOVW $0x0019, (AX)
12752	MOVW DX, 2(AX)
12753	ADDQ $0x04, BX
12754	ADDQ $0x04, AX
12755	JMP  gen_emit_copy_end
12756
12757repeat_three_standalone_emit_copy:
12758	LEAL -4(DX), DX
12759	MOVW $0x0015, (AX)
12760	MOVB DL, 2(AX)
12761	ADDQ $0x03, BX
12762	ADDQ $0x03, AX
12763	JMP  gen_emit_copy_end
12764
12765repeat_two_standalone_emit_copy:
12766	SHLL $0x02, DX
12767	ORL  $0x01, DX
12768	MOVW DX, (AX)
12769	ADDQ $0x02, BX
12770	ADDQ $0x02, AX
12771	JMP  gen_emit_copy_end
12772
12773repeat_two_offset_standalone_emit_copy:
12774	XORQ SI, SI
12775	LEAL 1(SI)(DX*4), DX
12776	MOVB CL, 1(AX)
12777	SARL $0x08, CX
12778	SHLL $0x05, CX
12779	ORL  CX, DX
12780	MOVB DL, (AX)
12781	ADDQ $0x02, BX
12782	ADDQ $0x02, AX
12783	JMP  gen_emit_copy_end
12784	JMP four_bytes_loop_back_standalone
12785
12786four_bytes_remain_standalone:
12787	TESTL DX, DX
12788	JZ    gen_emit_copy_end
12789	MOVB  $0x03, SI
12790	LEAL  -4(SI)(DX*4), DX
12791	MOVB  DL, (AX)
12792	MOVL  CX, 1(AX)
12793	ADDQ  $0x05, BX
12794	ADDQ  $0x05, AX
12795	JMP   gen_emit_copy_end
12796
12797two_byte_offset_standalone:
12798	CMPL DX, $0x40
12799	JLE  two_byte_offset_short_standalone
12800	MOVB $0xee, (AX)
12801	MOVW CX, 1(AX)
12802	LEAL -60(DX), DX
12803	ADDQ $0x03, AX
12804	ADDQ $0x03, BX
12805
12806	// emitRepeat
12807emit_repeat_again_standalone_emit_copy_short:
12808	MOVL DX, SI
12809	LEAL -4(DX), DX
12810	CMPL SI, $0x08
12811	JLE  repeat_two_standalone_emit_copy_short
12812	CMPL SI, $0x0c
12813	JGE  cant_repeat_two_offset_standalone_emit_copy_short
12814	CMPL CX, $0x00000800
12815	JLT  repeat_two_offset_standalone_emit_copy_short
12816
12817cant_repeat_two_offset_standalone_emit_copy_short:
12818	CMPL DX, $0x00000104
12819	JLT  repeat_three_standalone_emit_copy_short
12820	CMPL DX, $0x00010100
12821	JLT  repeat_four_standalone_emit_copy_short
12822	CMPL DX, $0x0100ffff
12823	JLT  repeat_five_standalone_emit_copy_short
12824	LEAL -16842747(DX), DX
12825	MOVW $0x001d, (AX)
12826	MOVW $0xfffb, 2(AX)
12827	MOVB $0xff, 4(AX)
12828	ADDQ $0x05, AX
12829	ADDQ $0x05, BX
12830	JMP  emit_repeat_again_standalone_emit_copy_short
12831
12832repeat_five_standalone_emit_copy_short:
12833	LEAL -65536(DX), DX
12834	MOVL DX, CX
12835	MOVW $0x001d, (AX)
12836	MOVW DX, 2(AX)
12837	SARL $0x10, CX
12838	MOVB CL, 4(AX)
12839	ADDQ $0x05, BX
12840	ADDQ $0x05, AX
12841	JMP  gen_emit_copy_end
12842
12843repeat_four_standalone_emit_copy_short:
12844	LEAL -256(DX), DX
12845	MOVW $0x0019, (AX)
12846	MOVW DX, 2(AX)
12847	ADDQ $0x04, BX
12848	ADDQ $0x04, AX
12849	JMP  gen_emit_copy_end
12850
12851repeat_three_standalone_emit_copy_short:
12852	LEAL -4(DX), DX
12853	MOVW $0x0015, (AX)
12854	MOVB DL, 2(AX)
12855	ADDQ $0x03, BX
12856	ADDQ $0x03, AX
12857	JMP  gen_emit_copy_end
12858
12859repeat_two_standalone_emit_copy_short:
12860	SHLL $0x02, DX
12861	ORL  $0x01, DX
12862	MOVW DX, (AX)
12863	ADDQ $0x02, BX
12864	ADDQ $0x02, AX
12865	JMP  gen_emit_copy_end
12866
12867repeat_two_offset_standalone_emit_copy_short:
12868	XORQ SI, SI
12869	LEAL 1(SI)(DX*4), DX
12870	MOVB CL, 1(AX)
12871	SARL $0x08, CX
12872	SHLL $0x05, CX
12873	ORL  CX, DX
12874	MOVB DL, (AX)
12875	ADDQ $0x02, BX
12876	ADDQ $0x02, AX
12877	JMP  gen_emit_copy_end
12878	JMP two_byte_offset_standalone
12879
12880two_byte_offset_short_standalone:
12881	CMPL DX, $0x0c
12882	JGE  emit_copy_three_standalone
12883	CMPL CX, $0x00000800
12884	JGE  emit_copy_three_standalone
12885	MOVB $0x01, SI
12886	LEAL -16(SI)(DX*4), DX
12887	MOVB CL, 1(AX)
12888	SHRL $0x08, CX
12889	SHLL $0x05, CX
12890	ORL  CX, DX
12891	MOVB DL, (AX)
12892	ADDQ $0x02, BX
12893	ADDQ $0x02, AX
12894	JMP  gen_emit_copy_end
12895
12896emit_copy_three_standalone:
12897	MOVB $0x02, SI
12898	LEAL -4(SI)(DX*4), DX
12899	MOVB DL, (AX)
12900	MOVW CX, 1(AX)
12901	ADDQ $0x03, BX
12902	ADDQ $0x03, AX
12903
12904gen_emit_copy_end:
12905	MOVQ BX, ret+40(FP)
12906	RET
12907
// func emitCopyNoRepeat(dst []byte, offset int, length int) int
//
// Encodes a copy of `length` bytes at distance `offset` into dst and returns
// the number of bytes written. Three encodings are produced:
//   - 5 bytes: tag (length-1)<<2|3 followed by the 32-bit offset
//   - 3 bytes: tag (length-1)<<2|2 followed by the 16-bit offset
//   - 2 bytes: tag (length-4)<<2|1 with offset bits 8..10 in tag bits 5..7,
//     followed by the low offset byte
// Long lengths are split into several such elements.
//
// Only the low 32 bits of offset and length are examined. dst is written
// without any bounds check, so the caller must supply enough room.
// NOTE(review): the 2-byte form computes length-4, so it presumably expects
// length >= 4 on that path - confirm against callers.
//
// Register use:
//   AX = write cursor into dst
//   BX = bytes written so far (the return value)
//   CX = offset
//   DX = length still to be encoded
//   SI = constant zero, used only as LEAL base register
TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX
	XORL BX, BX
	XORL SI, SI

	// emitCopy
	CMPL CX, $0x00010000
	JL   copy_norepeat_offset16 // offset < 65536: 2- and 3-byte forms suffice

copy_norepeat_offset32_loop:
	CMPL DX, $0x40
	JLE  copy_norepeat_offset32_tail // 64 or fewer left: one final element
	MOVB $0xff, (AX)                 // 0xff = (64-1)<<2|3: a 64-byte copy
	MOVL CX, 1(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	LEAL -64(DX), DX
	CMPL DX, $0x04
	JGE  copy_norepeat_offset32_loop // 4 or more left: go round again

copy_norepeat_offset32_tail:
	TESTL DX, DX
	JZ    copy_norepeat_done
	LEAL  -1(SI)(DX*4), DX // low byte = (length-1)<<2 | 3
	MOVB  DL, (AX)
	MOVL  CX, 1(AX)
	ADDQ  $0x05, AX
	ADDQ  $0x05, BX
	JMP   copy_norepeat_done

copy_norepeat_offset16:
	CMPL DX, $0x40
	JLE  copy_norepeat_offset16_short
	MOVB $0xee, (AX) // 0xee = (60-1)<<2|2: a 60-byte copy
	MOVW CX, 1(AX)
	ADDQ $0x03, AX
	ADDQ $0x03, BX
	LEAL -60(DX), DX
	JMP  copy_norepeat_offset16

copy_norepeat_offset16_short:
	// The 2-byte form is only used for length < 12 and offset < 2048.
	CMPL DX, $0x0c
	JGE  copy_norepeat_three_byte
	CMPL CX, $0x00000800
	JGE  copy_norepeat_three_byte
	LEAL -15(SI)(DX*4), DX // low byte = (length-4)<<2 | 1
	MOVB CL, 1(AX)         // low 8 bits of the offset
	SHRL $0x08, CX
	SHLL $0x05, CX         // offset bits 8..10 moved to tag bits 5..7
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, AX
	ADDQ $0x02, BX
	JMP  copy_norepeat_done

copy_norepeat_three_byte:
	LEAL -2(SI)(DX*4), DX // low byte = (length-1)<<2 | 2
	MOVB DL, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, AX
	ADDQ $0x03, BX

copy_norepeat_done:
	MOVQ BX, ret+40(FP)
	RET
12979
// func matchLen(a []byte, b []byte) int
//
// Returns the number of leading bytes that are equal in a and b. Only len(a)
// is consulted; b is read at the same indices without any length check, so
// the caller must guarantee len(b) >= len(a).
//
// The remaining count and the index are kept as full 64-bit values and
// compared unsigned, so lengths of 2^31 and above are handled the same way
// as small ones.
//
// Register use:
//   AX = base of a
//   CX = base of b
//   DX = bytes of a not yet compared
//   SI = index of the next byte to compare (the return value)
//   BX = scratch
TEXT ·matchLen(SB), NOSPLIT, $0-56
	MOVQ a_base+0(FP), AX
	MOVQ b_base+24(FP), CX
	MOVQ a_len+8(FP), DX

	// matchLen
	XORL SI, SI // a 32-bit write zeroes the full 64-bit register
	CMPQ DX, $0x08
	JB   matchlen_single_standalone

matchlen_loopback_standalone:
	// Compare 8 bytes at a time; a non-zero XOR marks the differing bits.
	MOVQ  (AX)(SI*1), BX
	XORQ  (CX)(SI*1), BX
	TESTQ BX, BX
	JZ    matchlen_loop_standalone
	BSFQ  BX, BX     // lowest differing bit (the load is little-endian)
	SARQ  $0x03, BX  // bit index -> byte index within this word
	LEAQ  (SI)(BX*1), SI
	JMP   gen_match_len_end

matchlen_loop_standalone:
	LEAQ -8(DX), DX
	LEAQ 8(SI), SI
	CMPQ DX, $0x08
	JAE  matchlen_loopback_standalone

matchlen_single_standalone:
	// Fewer than 8 bytes remain: finish one byte at a time.
	TESTQ DX, DX
	JZ    gen_match_len_end

matchlen_single_loopback_standalone:
	MOVB (AX)(SI*1), BL
	CMPB (CX)(SI*1), BL
	JNE  gen_match_len_end
	LEAQ 1(SI), SI
	DECQ DX
	JNZ  matchlen_single_loopback_standalone

gen_match_len_end:
	MOVQ SI, ret+48(FP)
	RET
13022