1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "go_asm.h"
6#include "textflag.h"
7
// Compare entry point. Frame is $0-56: a's base/len live at 0/8(FP), b's
// base/len at 24/32(FP), and the result word at 48(FP).
// NOTE(review): this layout looks like two slice headers followed by an int
// result (func Compare(a, b []byte) int) — confirm against the Go declaration.
//
// The routine only sets up the register arguments of cmpbody<> and then
// tail-jumps to it; cmpbody<> writes -1/0/+1 through R9 and executes the RET.
TEXT ·Compare(SB),NOSPLIT,$0-56
	LEAQ	ret+48(FP), R9		// R9 = address of the result slot
	MOVQ	b_len+32(FP), DX	// DX = length of b
	MOVQ	b_base+24(FP), DI	// DI = pointer to b's bytes
	MOVQ	a_len+8(FP), BX		// BX = length of a
	MOVQ	a_base+0(FP), SI	// SI = pointer to a's bytes
	JMP	cmpbody<>(SB)
15
// cmpstring entry point. Frame is $0-40: a's base/len live at 0/8(FP), b's
// base/len at 16/24(FP), and the result word at 32(FP).
// NOTE(review): this layout looks like two string headers followed by an int
// result (func cmpstring(a, b string) int) — confirm against the Go declaration.
//
// Like ·Compare, this only populates the registers cmpbody<> expects and
// tail-jumps; cmpbody<> stores -1/0/+1 through R9 and returns to our caller.
TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
	LEAQ	ret+32(FP), R9		// R9 = address of the result slot
	MOVQ	b_len+24(FP), DX	// DX = length of b
	MOVQ	b_base+16(FP), DI	// DI = pointer to b's bytes
	MOVQ	a_len+8(FP), BX		// BX = length of a
	MOVQ	a_base+0(FP), SI	// SI = pointer to a's bytes
	JMP	cmpbody<>(SB)
23
// cmpbody<> performs a three-way lexicographic comparison of two byte
// sequences and stores the result (-1, 0 or +1) through R9.
//
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
//   R9 = address of output word (stores -1/0/1 here)
//
// internal state:
//   R8 = min(alen, blen), then the number of bytes still to compare
//
// clobbers: AX, BX, CX, SI, DI, R8, X0, X1, flags; on the AVX2 path also
// Y0 and Y2-Y6 (VZEROUPPER is issued before leaving that path).
//
// Every exit path stores the result to (R9) and executes RET, so the entry
// points above can reach this body with a plain JMP.
TEXT cmpbody<>(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	allsame		// same start address: only the lengths can differ
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8 // R8 = min(alen, blen) = # of bytes to compare
	CMPQ	R8, $8
	JB	small

	CMPQ	R8, $63
	JBE	loop
	// 64 or more bytes in common: use a 64-bytes-per-iteration loop, the
	// AVX2 flavour when the HasAVX2 byte of internal/cpu's X86 variable is 1.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	big_loop_avx2
	JMP	big_loop

	// 16 bytes per iteration while more than 16 bytes remain.
loop:
	CMPQ	R8, $16
	JBE	_0through16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// convert EQ to NE
	JNE	diff16	// branch if at least one byte is not equal
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop

	// diffNN: a difference was found in the 16-byte lane (or, coming from
	// the AVX2 loop, 32-byte lane) that starts at a fixed offset from SI/DI.
	// Advance both pointers to the start of that lane and fall into diff16.
diff64:
	ADDQ	$48, SI
	ADDQ	$48, DI
	JMP	diff16
diff48:
	ADDQ	$32, SI
	ADDQ	$32, DI
	JMP	diff16
diff32:
	ADDQ	$16, SI
	ADDQ	$16, DI
	// AX = bit mask of differences (bit i set => byte i differs)
diff16:
	BSFQ	AX, BX	// index of first byte that differs
	XORQ	AX, AX
	MOVB	(SI)(BX*1), CX
	CMPB	CX, (DI)(BX*1)
	SETHI	AX	// unsigned compare: AX = 1 if a's byte > b's byte
	LEAQ	-1(AX*2), AX	// convert 1/0 to +1/-1
	MOVQ	AX, (R9)
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8
_0through16:
	CMPQ	R8, $8
	JBE	_0through8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	diff8
	// Compare the 8 bytes that end at the end of the common region. This
	// load may overlap bytes that were already found equal, which is harmless.
_0through8:
	MOVQ	-8(SI)(R8*1), AX
	MOVQ	-8(DI)(R8*1), CX
	CMPQ	AX, CX
	JEQ	allsame

	// AX and CX contain parts of a and b that differ.
diff8:
	BSWAPQ	AX	// reverse order of bytes
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of highest bit difference
	SHRQ	CX, AX	// move a's bit to bottom
	ANDQ	$1, AX	// mask bit
	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// 0-7 bytes in common
small:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		//  - bits left (== 64 - bits left mod 64)
	JEQ	allsame		// nothing in common to compare

	// Load bytes of a into the high bytes of SI.
	// If the low byte of the pointer is above 0xf8, an 8-byte load at (SI)
	// would cross a 256-byte boundary; in that case load the 8 bytes that
	// end at a+R8 instead and shift the unwanted leading bytes out.
	// NOTE(review): presumably this guards against reading into an unmapped
	// page past the end of a — confirm.
	CMPB	SI, $0xf8
	JA	si_high
	MOVQ	(SI), SI
	JMP	si_finish
si_high:
	MOVQ	-8(SI)(R8*1), SI
	SHRQ	CX, SI
si_finish:
	SHLQ	CX, SI

	// Load bytes of b into the high bytes of DI (same scheme as for a).
	CMPB	DI, $0xf8
	JA	di_high
	MOVQ	(DI), DI
	JMP	di_finish
di_high:
	MOVQ	-8(DI)(R8*1), DI
	SHRQ	CX, DI
di_finish:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse order of bytes
	BSWAPQ	DI
	XORQ	SI, DI	// find bit differences
	JEQ	allsame
	BSRQ	DI, CX	// index of highest bit difference
	SHRQ	CX, SI	// move a's bit to bottom
	ANDQ	$1, SI	// mask bit
	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
	MOVQ	AX, (R9)
	RET

	// All compared bytes are equal: the result depends only on the lengths.
	// BX and DX still hold alen and blen on every path that reaches here.
allsame:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	MOVQ	AX, (R9)
	RET

	// this works for >= 64 bytes of data.
	// Four 16-byte SSE compares per iteration.
big_loop:
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff16

	MOVOU	16(SI), X0
	MOVOU	16(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff32

	MOVOU	32(SI), X0
	MOVOU	32(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff48

	MOVOU	48(SI), X0
	MOVOU	48(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX
	JNE	diff64

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JBE	loop		// 64 or fewer bytes left: finish in the 16-byte loop
	JMP	big_loop

	// Compare 64-bytes per loop iteration.
	// Loop is unrolled and uses AVX2.
big_loop_avx2:
	VMOVDQU	(SI), Y2
	VMOVDQU	(DI), Y3
	VMOVDQU	32(SI), Y4
	VMOVDQU	32(DI), Y5
	VPCMPEQB Y2, Y3, Y0
	VPMOVMSKB Y0, AX
	XORL	$0xffffffff, AX	// convert EQ to NE; 32-bit mask, one bit per byte
	JNE	diff32_avx2
	VPCMPEQB Y4, Y5, Y6
	VPMOVMSKB Y6, AX
	XORL	$0xffffffff, AX
	JNE	diff64_avx2

	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, R8
	CMPQ	R8, $64
	JB	big_loop_avx2_exit
	JMP	big_loop_avx2

	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
	// AX holds a 32-bit difference mask relative to the unadvanced SI/DI, so
	// diff16's bit scan yields the correct byte index directly.
diff32_avx2:
	VZEROUPPER
	JMP diff16

	// Same as diff32_avx2, but for last 32 bytes.
	// diff48 advances SI/DI by 32 before scanning the 32-bit mask in AX.
diff64_avx2:
	VZEROUPPER
	JMP diff48

	// For <64 bytes remainder jump to normal loop.
big_loop_avx2_exit:
	VZEROUPPER
	JMP loop
229