1// Copyright 2017 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build 386 amd64 amd64p32
6
7package cpu
8
9import (
10	"fmt"
11	"strings"
12)
13
// CacheLineSize is the cache line size used for the architectures this file
// is built for (386, amd64, amd64p32).
// NOTE(review): presumably expressed in bytes — confirm against its users.
const CacheLineSize = 64
15
// cpuid is implemented in cpu_x86.s.
//
// Callers in this file pass the leaf number as eaxArg and the sub-leaf as
// ecxArg, and read feature bits out of the four returned register values.
// NOTE(review): the assembly body is not visible here; presumably it executes
// the CPUID instruction with EAX=eaxArg and ECX=ecxArg — confirm in cpu_x86.s.
func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
18
// xgetbv with ecx = 0 is implemented in cpu_x86.s.
//
// doinit only calls it when the OSXSAVE feature bit is set, and inspects the
// returned eax value to decide whether the operating system supports the
// XMM/YMM (AVX) and ZMM (AVX-512) registers.
func xgetbv() (eax, edx uint32)
21
// Bit masks for the feature flags read by doinit and hasInvariantTSC.
// Each group is tested against one register returned by a specific cpuid call.
const (
	// edx bits of cpuid(1, 0)
	cpuid_SSE2 = 1 << 26

	// ecx bits of cpuid(1, 0)
	cpuid_SSE3      = 1 << 0
	cpuid_PCLMULQDQ = 1 << 1
	cpuid_SSSE3     = 1 << 9
	cpuid_FMA       = 1 << 12
	cpuid_SSE41     = 1 << 19
	cpuid_SSE42     = 1 << 20
	cpuid_POPCNT    = 1 << 23
	cpuid_AES       = 1 << 25
	cpuid_OSXSAVE   = 1 << 27
	cpuid_AVX       = 1 << 28

	// ebx bits of cpuid(7, 0)
	cpuid_BMI1     = 1 << 3
	cpuid_AVX2     = 1 << 5
	cpuid_BMI2     = 1 << 8
	cpuid_ERMS     = 1 << 9
	cpuid_ADX      = 1 << 19
	cpuid_AVX512F  = 1 << 16
	cpuid_AVX512DQ = 1 << 17
	cpuid_AVX512BW = 1 << 30
	cpuid_AVX512VL = 1 << 31

	// edx bits of cpuid(0x80000007, 0)
	cpuid_Invariant_TSC = 1 << 8
)
52
53func doinit() {
54	options = []option{
55		{"adx", &X86.HasADX},
56		{"aes", &X86.HasAES},
57		{"avx", &X86.HasAVX},
58		{"avx2", &X86.HasAVX2},
59		{"bmi1", &X86.HasBMI1},
60		{"bmi2", &X86.HasBMI2},
61		{"erms", &X86.HasERMS},
62		{"fma", &X86.HasFMA},
63		{"pclmulqdq", &X86.HasPCLMULQDQ},
64		{"popcnt", &X86.HasPOPCNT},
65		{"sse3", &X86.HasSSE3},
66		{"sse41", &X86.HasSSE41},
67		{"sse42", &X86.HasSSE42},
68		{"ssse3", &X86.HasSSSE3},
69		{"avx512f", &X86.HasAVX512F},
70		{"avx512dq", &X86.HasAVX512DQ},
71		{"avx512bw", &X86.HasAVX512BW},
72		{"avx512vl", &X86.HasAVX512VL},
73		{"invariant_tsc", &X86.HasInvariantTSC},
74
75		// sse2 set as last element so it can easily be removed again. See code below.
76		{"sse2", &X86.HasSSE2},
77	}
78
79	// Remove sse2 from options on amd64(p32) because SSE2 is a mandatory feature for these GOARCHs.
80	if GOARCH == "amd64" || GOARCH == "amd64p32" {
81		options = options[:len(options)-1]
82	}
83
84	maxID, _, _, _ := cpuid(0, 0)
85
86	if maxID < 1 {
87		return
88	}
89
90	_, _, ecx1, edx1 := cpuid(1, 0)
91	X86.HasSSE2 = isSet(edx1, cpuid_SSE2)
92
93	X86.HasSSE3 = isSet(ecx1, cpuid_SSE3)
94	X86.HasPCLMULQDQ = isSet(ecx1, cpuid_PCLMULQDQ)
95	X86.HasSSSE3 = isSet(ecx1, cpuid_SSSE3)
96	X86.HasFMA = isSet(ecx1, cpuid_FMA)
97	X86.HasSSE41 = isSet(ecx1, cpuid_SSE41)
98	X86.HasSSE42 = isSet(ecx1, cpuid_SSE42)
99	X86.HasPOPCNT = isSet(ecx1, cpuid_POPCNT)
100	X86.HasAES = isSet(ecx1, cpuid_AES)
101	X86.HasOSXSAVE = isSet(ecx1, cpuid_OSXSAVE)
102
103	osSupportsAVX := false
104	osSupportsAVX512 := false
105	// For XGETBV, OSXSAVE bit is required and sufficient.
106	if X86.HasOSXSAVE {
107		eax, _ := xgetbv()
108		// Check if XMM and YMM registers have OS support.
109		osSupportsAVX = isSet(eax, 1<<1) && isSet(eax, 1<<2)
110		// Check is ZMM registers have OS support.
111		osSupportsAVX512 = isSet(eax>>5, 7) && isSet(eax>>1, 3)
112	}
113
114	X86.HasAVX = isSet(ecx1, cpuid_AVX) && osSupportsAVX
115
116	if maxID < 7 {
117		return
118	}
119
120	_, ebx7, _, _ := cpuid(7, 0)
121	X86.HasBMI1 = isSet(ebx7, cpuid_BMI1)
122	X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX
123	X86.HasAVX512F = isSet(ebx7, cpuid_AVX512F) && osSupportsAVX512
124	X86.HasAVX512DQ = isSet(ebx7, cpuid_AVX512DQ) && osSupportsAVX512
125	X86.HasAVX512BW = isSet(ebx7, cpuid_AVX512BW) && osSupportsAVX512
126	X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL) && osSupportsAVX512
127	X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
128	X86.HasERMS = isSet(ebx7, cpuid_ERMS)
129	X86.HasADX = isSet(ebx7, cpuid_ADX)
130
131	X86.Cache = getCacheSize()
132
133	X86.HasInvariantTSC = hasInvariantTSC()
134
135	X86.Family, X86.Model = getFamilyModel()
136
137	X86.Signature = makeSignature(X86.Family, X86.Model)
138
139	X86.Name = getName()
140
141	X86.TSCFrequency = getNativeTSCFrequency(X86.Name, X86.Signature)
142}
143
// isSet reports whether hwc and value share at least one set bit.
// With a single-bit value this is a plain "is this flag set" test.
func isSet(hwc uint32, value uint32) bool {
	common := hwc & value
	return common != 0
}
147
148func hasInvariantTSC() bool {
149	if maxExtendedFunction() < 0x80000007 {
150		return false
151	}
152	_, _, _, edx := cpuid(0x80000007, 0)
153	return isSet(edx, cpuid_Invariant_TSC)
154}
155
156func getName() string {
157	if maxExtendedFunction() >= 0x80000004 {
158		v := make([]uint32, 0, 48)
159		for i := uint32(0); i < 3; i++ {
160			a, b, c, d := cpuid(0x80000002+i, 0)
161			v = append(v, a, b, c, d)
162		}
163		return strings.Trim(string(valAsString(v...)), " ")
164	}
165	return "unknown"
166}
167
168// getNativeTSCFrequency gets TSC frequency from CPUID,
169// only supports Intel (Skylake or later microarchitecture) & key information is from Intel manual & kernel codes
170// (especially this commit: https://github.com/torvalds/linux/commit/604dc9170f2435d27da5039a3efd757dceadc684).
171func getNativeTSCFrequency(name, sign string) uint64 {
172
173	if vendorID() != Intel {
174		return 0
175	}
176
177	if maxFunctionID() < 0x15 {
178		return 0
179	}
180
181	// ApolloLake, GeminiLake, CannonLake (and presumably all new chipsets
182	// from this point) report the crystal frequency directly via CPUID.0x15.
183	// That's definitive data that we can rely upon.
184	eax, ebx, ecx, _ := cpuid(0x15, 0)
185
186	// If ebx is 0, the TSC/”core crystal clock” ratio is not enumerated.
187	// We won't provide TSC frequency detection in this situation.
188	if eax == 0 || ebx == 0 {
189		return 0
190	}
191
192	// Skylake, Kabylake and all variants of those two chipsets report a
193	// crystal frequency of zero.
194	if ecx == 0 { // Crystal clock frequency is not enumerated.
195		ecx = getCrystalClockFrequency(sign)
196	}
197
198	// TSC frequency = “core crystal clock frequency” * EBX/EAX.
199	return uint64(ecx) * (uint64(ebx) / uint64(eax))
200}
201
// CPU signatures of the Intel family 6 processors for which
// getCrystalClockFrequency has a hardcoded crystal clock frequency.
// The strings use the "XX_XXH" (family_model, hexadecimal) form that
// makeSignature is documented to produce.
//
// Copied from: CPUID Signature values of DisplayFamily and DisplayModel,
// in Intel® 64 and IA-32 Architectures Software Developer’s Manual
// Volume 4: Model-Specific Registers
// & https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/intel-family.h
const (
	IntelFam6SkylakeL  = "06_4EH"
	IntelFam6Skylake   = "06_5EH"
	IntelFam6SkylakeX  = "06_55H"
	IntelFam6KabylakeL = "06_8EH"
	IntelFam6Kabylake  = "06_9EH"
)
213
214// getCrystalClockFrequency gets crystal clock frequency
215// for Intel processors in which CPUID.15H.EBX[31:0] ÷ CPUID.0x15.EAX[31:0] is enumerated
216// but CPUID.15H.ECX is not enumerated using this function to get nominal core crystal clock frequency.
217//
218// Actually these crystal clock frequencies provided by Intel hardcoded tables are not so accurate in some cases,
219// e.g. SkyLake server CPU may have issue (All SKX subject the crystal to an EMI reduction circuit that
220//reduces its actual frequency by (approximately) -0.25%):
221// see https://lore.kernel.org/lkml/ff6dcea166e8ff8f2f6a03c17beab2cb436aa779.1513920414.git.len.brown@intel.com/
222// for more details.
223// With this report, I set a coefficient (0.9975) for IntelFam6SkyLakeX.
224//
225// Unlike the kernel way (mentioned in https://github.com/torvalds/linux/commit/604dc9170f2435d27da5039a3efd757dceadc684),
226// I prefer the Intel hardcoded tables,
227// because after some testing (comparing with wall clock, see https://github.com/templexxx/tsc/tsc_test.go for more details),
228// I found hardcoded tables are more accurate.
229func getCrystalClockFrequency(sign string) uint32 {
230
231	if maxFunctionID() < 0x16 {
232		return 0
233	}
234
235	switch sign {
236	case IntelFam6SkylakeL:
237		return 24 * 1000 * 1000
238	case IntelFam6Skylake:
239		return 24 * 1000 * 1000
240	case IntelFam6SkylakeX:
241		return 25 * 1000 * 1000 * 0.9975
242	case IntelFam6KabylakeL:
243		return 24 * 1000 * 1000
244	case IntelFam6Kabylake:
245		return 24 * 1000 * 1000
246	}
247
248	return 0
249}
250
251func getFamilyModel() (uint32, uint32) {
252	if maxFunctionID() < 0x1 {
253		return 0, 0
254	}
255	eax, _, _, _ := cpuid(1, 0)
256	family := (eax >> 8) & 0xf
257	displayFamily := family
258	if family == 0xf {
259		displayFamily = ((eax >> 20) & 0xff) + family
260	}
261	model := (eax >> 4) & 0xf
262	displayModel := model
263	if family == 0x6 || family == 0xf {
264		displayModel = ((eax >> 12) & 0xf0) + model
265	}
266	return displayFamily, displayModel
267}
268
// makeSignature formats family and model as a CPU signature of the form
// XX_XXH: both values in upper-case hexadecimal, each zero-padded to at least
// two digits, with a trailing "H" (for example family 0x6, model 0x4e gives
// "06_4EH" and family 0x6, model 0x5 gives "06_05H").
//
// Padding is done by the format verb itself so that single-digit models keep
// their leading zero and stay in the documented XX_XXH shape.
func makeSignature(family, model uint32) string {
	return fmt.Sprintf("%02X_%02XH", family, model)
}
282
283// getCacheSize is from
284// https://github.com/klauspost/cpuid/blob/5a626f7029c910cc8329dae5405ee4f65034bce5/cpuid.go#L723
285func getCacheSize() Cache {
286	c := Cache{
287		L1I: -1,
288		L1D: -1,
289		L2:  -1,
290		L3:  -1,
291	}
292
293	vendor := vendorID()
294	switch vendor {
295	case Intel:
296		if maxFunctionID() < 4 {
297			return c
298		}
299		for i := uint32(0); ; i++ {
300			eax, ebx, ecx, _ := cpuid(4, i)
301			cacheType := eax & 15
302			if cacheType == 0 {
303				break
304			}
305			cacheLevel := (eax >> 5) & 7
306			coherency := int(ebx&0xfff) + 1
307			partitions := int((ebx>>12)&0x3ff) + 1
308			associativity := int((ebx>>22)&0x3ff) + 1
309			sets := int(ecx) + 1
310			size := associativity * partitions * coherency * sets
311			switch cacheLevel {
312			case 1:
313				if cacheType == 1 {
314					// 1 = Data Cache
315					c.L1D = size
316				} else if cacheType == 2 {
317					// 2 = Instruction Cache
318					c.L1I = size
319				} else {
320					if c.L1D < 0 {
321						c.L1I = size
322					}
323					if c.L1I < 0 {
324						c.L1I = size
325					}
326				}
327			case 2:
328				c.L2 = size
329			case 3:
330				c.L3 = size
331			}
332		}
333	case AMD, Hygon:
334		// Untested.
335		if maxExtendedFunction() < 0x80000005 {
336			return c
337		}
338		_, _, ecx, edx := cpuid(0x80000005, 0)
339		c.L1D = int(((ecx >> 24) & 0xFF) * 1024)
340		c.L1I = int(((edx >> 24) & 0xFF) * 1024)
341
342		if maxExtendedFunction() < 0x80000006 {
343			return c
344		}
345		_, _, ecx, _ = cpuid(0x80000006, 0)
346		c.L2 = int(((ecx >> 16) & 0xFFFF) * 1024)
347	}
348
349	return c
350}
351
352func maxFunctionID() uint32 {
353	a, _, _, _ := cpuid(0, 0)
354	return a
355}
356
357func maxExtendedFunction() uint32 {
358	eax, _, _, _ := cpuid(0x80000000, 0)
359	return eax
360}
361
// Vendor identifiers returned by vendorID. Other is the zero value and is
// used for any vendor string that is not listed in vendorMapping.
const (
	Other = iota
	Intel
	AMD
	VIA
	Transmeta
	NSC
	KVM  // Kernel-based Virtual Machine
	MSVM // Microsoft Hyper-V or Windows Virtual PC
	VMware
	XenHVM
	Bhyve
	Hygon
)
376
// vendorMapping maps the vendor string that vendorID assembles from
// cpuid(0, 0) to one of the vendor constants.
//
// Excerpt from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID
var vendorMapping = map[string]int{
	"AMDisbetter!": AMD,
	"AuthenticAMD": AMD,
	"CentaurHauls": VIA,
	"GenuineIntel": Intel,
	"TransmetaCPU": Transmeta,
	"GenuineTMx86": Transmeta,
	"Geode by NSC": NSC,
	"VIA VIA VIA ": VIA,
	"KVMKVMKVMKVM": KVM,
	"Microsoft Hv": MSVM,
	"VMwareVMware": VMware,
	"XenVMMXenVMM": XenHVM,
	"bhyve bhyve ": Bhyve,
	"HygonGenuine": Hygon,
}
394
395func vendorID() int {
396	_, b, c, d := cpuid(0, 0)
397	v := valAsString(b, d, c)
398	vend, ok := vendorMapping[string(v)]
399	if !ok {
400		return Other
401	}
402	return vend
403}
404
// valAsString unpacks each value into four bytes, least significant byte
// first, and concatenates them in argument order. The result stops just
// before the first zero byte, if there is one.
func valAsString(values ...uint32) []byte {
	out := make([]byte, 0, 4*len(values))
	for _, word := range values {
		for shift := uint(0); shift < 32; shift += 8 {
			b := byte(word >> shift)
			if b == 0 {
				// A zero byte terminates the string.
				return out
			}
			out = append(out, b)
		}
	}
	return out
}
426