// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !plan9

#include "go_asm.h"
#include "textflag.h"

// NOTE: Windows externalthreadhandler expects memclr to preserve DX.

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
//
// Stores zero to the n bytes starting at ptr.
//
// Register roles:
//	DI     address of the next byte to clear
//	BX     number of bytes still to clear
//	AX     zero, source for stores of up to 8 bytes
//	X0/Y0  zero, source for 16- and 32-byte stores
//	SI     scratch, written only on the non-temporal AVX2 path
//
// Sizes up to 256 bytes are handled by straight-line code that issues one
// group of stores anchored at the start of the region and one group anchored
// at its end; the two groups overlap whenever the size is not an exact fit.
TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16
	MOVQ	ptr+0(FP), DI
	MOVQ	n+8(FP), BX
	XORQ	AX, AX

	// MOVOU has always measured faster than REP STOSQ.
dispatch:
	// A BSR+branch table makes almost all memmove/memclr benchmarks worse,
	// so the size class is picked with a plain compare chain instead.
	TESTQ	BX, BX
	JEQ	clear0
	CMPQ	BX, $2
	JBE	clear1or2
	CMPQ	BX, $4
	JBE	clear3or4
	CMPQ	BX, $8
	JB	clear5through7
	JEQ	clear8
	CMPQ	BX, $16
	JBE	clear9through16
	// Everything from here on stores from X0, so zero it now.
	PXOR	X0, X0
	CMPQ	BX, $32
	JBE	clear17through32
	CMPQ	BX, $64
	JBE	clear33through64
	CMPQ	BX, $128
	JBE	clear65through128
	CMPQ	BX, $256
	JBE	clear129through256
	// More than 256 bytes remain: take the AVX2 path when the CPU has it.
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JEQ	avx2_setup
	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

	// SSE path: clear 256 bytes per iteration.
sse_loop:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, 128(DI)
	MOVOU	X0, 144(DI)
	MOVOU	X0, 160(DI)
	MOVOU	X0, 176(DI)
	MOVOU	X0, 192(DI)
	MOVOU	X0, 208(DI)
	MOVOU	X0, 224(DI)
	MOVOU	X0, 240(DI)
	SUBQ	$256, BX
	ADDQ	$256, DI
	CMPQ	BX, $256
	JAE	sse_loop
	// Fewer than 256 bytes are left (possibly none); let the size
	// dispatch above finish the job.
	JMP	dispatch

avx2_setup:
	VPXOR	Y0, Y0, Y0
	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
	// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
	// The cutoff below is 0x2000000 bytes (32 MiB).
	CMPQ	BX, $0x2000000
	JAE	avx2_nt_setup
	// AVX2 path with ordinary stores: clear 128 bytes per iteration.
avx2_loop:
	VMOVDQU	Y0, (DI)
	VMOVDQU	Y0, 32(DI)
	VMOVDQU	Y0, 64(DI)
	VMOVDQU	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	avx2_loop
	// Fewer than 128 bytes remain. Clear the final 128 bytes of the region,
	// i.e. the ones ending at DI+BX; this re-clears some bytes the loop has
	// already written.
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

avx2_nt_setup:
	// Align to a 32 byte boundary: clear the first 32 bytes with an
	// unaligned store, round DI up to the next multiple of 32, and shrink
	// BX by the number of bytes DI moved forward (SI = old DI - new DI,
	// which is negative).
	VMOVDQU	Y0, (DI)
	MOVQ	DI, SI
	ADDQ	$32, DI
	ANDQ	$~31, DI
	SUBQ	DI, SI
	ADDQ	SI, BX
	// Non-temporal AVX2 path: clear 128 bytes per iteration.
avx2_nt_loop:
	VMOVNTDQ	Y0, (DI)
	VMOVNTDQ	Y0, 32(DI)
	VMOVNTDQ	Y0, 64(DI)
	VMOVNTDQ	Y0, 96(DI)
	SUBQ	$128, BX
	ADDQ	$128, DI
	CMPQ	BX, $128
	JAE	avx2_nt_loop
	// In the description of MOVNTDQ in [1]
	// "... fencing operation implemented with the SFENCE or MFENCE instruction
	// should be used in conjunction with MOVNTDQ instructions..."
	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
	SFENCE
	// Same tail handling as the ordinary AVX2 loop: clear the final
	// 128 bytes ending at DI+BX with regular stores.
	VMOVDQU	Y0, -32(DI)(BX*1)
	VMOVDQU	Y0, -64(DI)(BX*1)
	VMOVDQU	Y0, -96(DI)(BX*1)
	VMOVDQU	Y0, -128(DI)(BX*1)
	VZEROUPPER
	RET

	// Straight-line handlers for n <= 256. Each one writes from the start
	// of the region and from its end, so every size in the handler's range
	// is fully covered.
clear0:
	RET
clear1or2:
	MOVB	AX, (DI)
	MOVB	AX, -1(DI)(BX*1)
	RET
clear3or4:
	MOVW	AX, (DI)
	MOVW	AX, -2(DI)(BX*1)
	RET
clear5through7:
	MOVL	AX, (DI)
	MOVL	AX, -4(DI)(BX*1)
	RET
clear8:
	// Exactly 8 bytes gets its own case, with a single 8-byte store,
	// to make sure we clear pointers atomically.
	MOVQ	AX, (DI)
	RET
clear9through16:
	MOVQ	AX, (DI)
	MOVQ	AX, -8(DI)(BX*1)
	RET
clear17through32:
	MOVOU	X0, (DI)
	MOVOU	X0, -16(DI)(BX*1)
	RET
clear33through64:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
clear65through128:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET
clear129through256:
	MOVOU	X0, (DI)
	MOVOU	X0, 16(DI)
	MOVOU	X0, 32(DI)
	MOVOU	X0, 48(DI)
	MOVOU	X0, 64(DI)
	MOVOU	X0, 80(DI)
	MOVOU	X0, 96(DI)
	MOVOU	X0, 112(DI)
	MOVOU	X0, -128(DI)(BX*1)
	MOVOU	X0, -112(DI)(BX*1)
	MOVOU	X0, -96(DI)(BX*1)
	MOVOU	X0, -80(DI)(BX*1)
	MOVOU	X0, -64(DI)(BX*1)
	MOVOU	X0, -48(DI)(BX*1)
	MOVOU	X0, -32(DI)(BX*1)
	MOVOU	X0, -16(DI)(BX*1)
	RET