// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !plan9

#include "go_asm.h"
#include "textflag.h"

// NOTE: Windows externalthreadhandler expects memclr to preserve DX.

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
// memclrNoHeapPointers zeroes n bytes starting at ptr.
// The memory must not contain heap pointers unless the whole range is
// cleared in a pointer-aligned, pointer-sized manner (the _8 case below
// guarantees an atomic 8-byte store for exactly-pointer-sized clears).
//
// Register roles throughout:
//   DI = current destination pointer
//   BX = bytes remaining
//   AX = 0 (scalar zero source)
//   X0 / Y0 = vector zero source (initialized lazily on the paths that need it)
// DX is deliberately never touched (see NOTE above about Windows
// externalthreadhandler).
TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16
	MOVQ	ptr+0(FP), DI
	MOVQ	n+8(FP), BX
	XORQ	AX, AX

	// MOVOU seems always faster than REP STOSQ.
tail:
	// Dispatch on size. Small sizes are handled with straight-line
	// (possibly overlapping) stores; >256 bytes falls through to the
	// SSE loop, or the AVX2 paths when the CPU supports them.
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	_0
	CMPQ	BX, $2
	JBE	_1or2
	CMPQ	BX, $4
	JBE	_3or4
	CMPQ	BX, $8
	JB	_5through7
	JE	_8
	CMPQ	BX, $16
	JBE	_9through16
	PXOR	X0, X0
	CMPQ	BX, $32
	JBE	_17through32
	CMPQ	BX, $64
	JBE	_33through64
	CMPQ	BX, $128
	JBE	_65through128
	CMPQ	BX, $256
	JBE	_129through256
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JE	loop_preheader_avx2
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

loop:
	// SSE path: clear 256 bytes per iteration with unaligned 16-byte
	// stores, then re-dispatch through tail for the remainder.
	MOVOU	X0, 0(DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, 128(DI)
	MOVOU	X0, 144(DI)
	MOVOU	X0, 160(DI)
	MOVOU	X0, 176(DI)
	MOVOU	X0, 192(DI)
	MOVOU	X0, 208(DI)
	MOVOU	X0, 224(DI)
	MOVOU	X0, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	loop
	JMP	tail

loop_preheader_avx2:
	VPXOR Y0, Y0, Y0
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
	CMPQ    BX, $0x2000000
	JAE     loop_preheader_avx2_huge
loop_avx2:
	// AVX2 path: 128 bytes per iteration with unaligned 32-byte stores.
	VMOVDQU	Y0, 0(DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2
	// Clear the final 1..128 bytes with four stores anchored at the end
	// of the buffer; they may overlap bytes already cleared above.
	VMOVDQU  Y0, -32(DI)(BX*1)
	VMOVDQU  Y0, -64(DI)(BX*1)
	VMOVDQU  Y0, -96(DI)(BX*1)
	VMOVDQU  Y0, -128(DI)(BX*1)
	// Restore SSE state before returning to non-AVX code.
	VZEROUPPER
	RET
loop_preheader_avx2_huge:
	// Align to 32 byte boundary: clear the first (unaligned) 32 bytes,
	// round DI up to the next 32-byte boundary, and shrink BX by the
	// number of bytes skipped (SI ends up holding a non-positive delta).
	VMOVDQU  Y0, 0(DI)
	MOVQ	DI, SI
	ADDQ	$32, DI
	ANDQ	$~31, DI
	SUBQ	DI, SI
	ADDQ	SI, BX
loop_avx2_huge:
	// Huge clears use non-temporal stores to avoid polluting the cache;
	// DI is 32-byte aligned here as VMOVNTDQ requires.
	VMOVNTDQ	Y0, 0(DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	loop_avx2_huge
	// In the description of MOVNTDQ in [1]
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	// End-anchored (possibly overlapping) stores for the last 1..128 bytes.
	VMOVDQU  Y0, -32(DI)(BX*1)
	VMOVDQU  Y0, -64(DI)(BX*1)
	VMOVDQU  Y0, -96(DI)(BX*1)
	VMOVDQU  Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

	// Small-size cases: each clears its range with a pair (or ladder) of
	// stores anchored at the start and at the end of the buffer, so they
	// may overlap in the middle. BX is within the labeled range on entry.
_1or2:
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
_0:
	RET
_3or4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
_5through7:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
_8:
	// We need a separate case for 8 to make sure we clear pointers atomically.
	MOVQ	AX, (DI)
	RET
_9through16:
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
_17through32:
	MOVOU	X0, (DI)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_33through64:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_65through128:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
_129through256:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, -128(DI)(BX*1)
	MOVOU	X0, -112(DI)(BX*1)
	MOVOU	X0, -96(DI)(BX*1)
	MOVOU	X0, -80(DI)(BX*1)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
179