1 /* Copyright (C) 2018 Wildfire Games.
2  *
3  * Permission is hereby granted, free of charge, to any person obtaining
4  * a copy of this software and associated documentation files (the
5  * "Software"), to deal in the Software without restriction, including
6  * without limitation the rights to use, copy, modify, merge, publish,
7  * distribute, sublicense, and/or sell copies of the Software, and to
8  * permit persons to whom the Software is furnished to do so, subject to
9  * the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "precompiled.h"
24 #include "lib/sysdep/numa.h"
25 
26 #include "lib/bits.h"	// PopulationCount
27 #include "lib/alignment.h"
28 #include "lib/lib.h"
29 #include "lib/timer.h"
30 #include "lib/module_init.h"
31 #include "lib/sysdep/vm.h"
32 #include "lib/sysdep/acpi.h"
33 #include "lib/sysdep/os_cpu.h"
34 #include "lib/sysdep/os/win/win.h"
35 #include "lib/sysdep/os/win/wutil.h"
36 #include "lib/sysdep/os/win/wcpu.h"
37 #include <Psapi.h>
38 
39 #if ARCH_X86_X64
40 #include "lib/sysdep/arch/x86_x64/apic.h"	// ProcessorFromApicId
41 #endif
42 
43 
44 //-----------------------------------------------------------------------------
45 // nodes
46 
// describes one NUMA node; instances live in nodes[].
struct Node	// POD
{
	// (Windows doesn't guarantee node numbers are contiguous, so
	// we associate them with contiguous indices in nodes[])
	// node number as passed to the Windows GetNuma* functions.
	UCHAR nodeNumber;

	// ACPI proximity domain number; assigned by
	// PopulateNodesFromProximityDomains (or zero for the dummy node).
	u32 proximityDomainNumber;
	// bit mask of the processors belonging to this node
	// (bit i set <=> processor i is part of the node).
	uintptr_t processorMask;
};
56 
// storage for all known nodes. entries are handed out by AddNode;
// only the first numNodes entries are in use.
static Node nodes[os_cpu_MaxProcessors];
// number of entries of nodes[] that have been handed out so far.
static size_t numNodes;
59 
AddNode()60 static Node* AddNode()
61 {
62 	ENSURE(numNodes < ARRAY_SIZE(nodes));
63 	return &nodes[numNodes++];
64 }
65 
FindNodeWithProcessorMask(uintptr_t processorMask)66 static Node* FindNodeWithProcessorMask(uintptr_t processorMask)
67 {
68 	for(size_t node = 0; node < numNodes; node++)
69 	{
70 		if(nodes[node].processorMask == processorMask)
71 			return &nodes[node];
72 	}
73 
74 	return 0;
75 }
76 
FindNodeWithProcessor(size_t processor)77 static Node* FindNodeWithProcessor(size_t processor)
78 {
79 	for(size_t node = 0; node < numNodes; node++)
80 	{
81 		if(IsBitSet(nodes[node].processorMask, processor))
82 			return &nodes[node];
83 	}
84 
85 	return 0;
86 }
87 
88 
89 //-----------------------------------------------------------------------------
90 // Windows topology
91 
HighestNodeNumber()92 static UCHAR HighestNodeNumber()
93 {
94 	WUTIL_FUNC(pGetNumaHighestNodeNumber, BOOL, (PULONG));
95 	WUTIL_IMPORT_KERNEL32(GetNumaHighestNodeNumber, pGetNumaHighestNodeNumber);
96 	if(!pGetNumaHighestNodeNumber)
97 		return 0;	// NUMA not supported => only one node
98 
99 	ULONG highestNodeNumber;
100 	const BOOL ok = pGetNumaHighestNodeNumber(&highestNodeNumber);
101 	WARN_IF_FALSE(ok);
102 	return (UCHAR)highestNodeNumber;
103 }
104 
PopulateNodes()105 static void PopulateNodes()
106 {
107 	WUTIL_FUNC(pGetNumaNodeProcessorMask, BOOL, (UCHAR, PULONGLONG));
108 	WUTIL_IMPORT_KERNEL32(GetNumaNodeProcessorMask, pGetNumaNodeProcessorMask);
109 	if(!pGetNumaNodeProcessorMask)
110 		return;
111 
112 	DWORD_PTR processAffinity, systemAffinity;
113 	{
114 		const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
115 		WARN_IF_FALSE(ok);
116 	}
117 	ENSURE(PopulationCount(processAffinity) <= PopulationCount(systemAffinity));
118 
119 	for(UCHAR nodeNumber = 0; nodeNumber <= HighestNodeNumber(); nodeNumber++)
120 	{
121 		ULONGLONG affinity;
122 		{
123 			const BOOL ok = pGetNumaNodeProcessorMask(nodeNumber, &affinity);
124 			WARN_IF_FALSE(ok);
125 		}
126 		if(!affinity)
127 			continue;	// empty node, skip
128 
129 		Node* node = AddNode();
130 		node->nodeNumber = nodeNumber;
131 		node->processorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, (DWORD_PTR)affinity);
132 	}
133 }
134 
135 
136 //-----------------------------------------------------------------------------
137 // ACPI SRAT topology
138 
139 #if ARCH_X86_X64
140 
141 #pragma pack(push, 1)
142 
// fields common to Affinity* structures.
// (each SRAT entry begins with this header; `type` selects the
// concrete structure and `length` is used to step to the next entry)
struct AffinityHeader
{
	u8 type;
	u8 length;	// size [bytes], including this header
};
149 
150 struct AffinityAPIC
151 {
152 	static const u8 type = 0;
153 
154 	AffinityHeader header;
155 	u8 proximityDomainNumber0;
156 	u8 apicId;
157 	u32 flags;
158 	u8 sapicId;
159 	u8 proximityDomainNumber123[3];
160 	u32 clockDomain;
161 
ProximityDomainNumberAffinityAPIC162 	u32 ProximityDomainNumber() const
163 	{
164 		// (this is the apparent result of backwards compatibility, ugh.)
165 		u32 proximityDomainNumber;
166 		memcpy(&proximityDomainNumber, &proximityDomainNumber123[0]-1, sizeof(proximityDomainNumber));
167 		proximityDomainNumber &= ~0xFF;
168 		proximityDomainNumber |= proximityDomainNumber0;
169 		return proximityDomainNumber;
170 	}
171 };
172 
// layout of the SRAT entry with type 1.
// (declared for completeness; none of its fields are read by the
// code in this file)
struct AffinityMemory
{
	// value of header.type that identifies this kind of entry
	static const u8 type = 1;

	AffinityHeader header;
	u32 proximityDomainNumber;
	u16 reserved1;
	u64 baseAddress;
	u64 length;
	u32 reserved2;
	u32 flags;
	u64 reserved3;
};
186 
187 // AffinityX2APIC omitted, since the APIC ID is sufficient for our purposes
188 
// Static Resource Affinity Table
// (header.size gives the total table size in bytes; the entries that
// follow the fixed part are traversed via their AffinityHeader::length)
struct SRAT
{
	AcpiTable header;
	u32 reserved1;
	u8 reserved2[8];
	// first of a variable number of variable-length entries
	AffinityHeader affinities[1];
};
197 
198 #pragma pack(pop)
199 
200 template<class Affinity>
DynamicCastFromHeader(const AffinityHeader * header)201 static const Affinity* DynamicCastFromHeader(const AffinityHeader* header)
202 {
203 	if(header->type != Affinity::type)
204 		return 0;
205 
206 	// sanity check: ensure no padding was inserted
207 	ENSURE(header->length == sizeof(Affinity));
208 
209 	const Affinity* affinity = (const Affinity*)header;
210 	if(!IsBitSet(affinity->flags, 0))	// not enabled
211 		return 0;
212 
213 	return affinity;
214 }
215 
// information gathered from the SRAT about one proximity domain.
struct ProximityDomain
{
	// bit mask of the processors whose SRAT entries name this domain
	uintptr_t processorMask;
	// (AffinityMemory's fields are not currently needed)
};

// maps proximity domain number -> information about that domain.
typedef std::map<u32, ProximityDomain> ProximityDomains;
223 
ExtractProximityDomainsFromSRAT(const SRAT * srat)224 static ProximityDomains ExtractProximityDomainsFromSRAT(const SRAT* srat)
225 {
226 	ProximityDomains proximityDomains;
227 
228 	for(const AffinityHeader* header = srat->affinities;
229 		header < (const AffinityHeader*)(uintptr_t(srat)+srat->header.size);
230 		header = (const AffinityHeader*)(uintptr_t(header) + header->length))
231 	{
232 		const AffinityAPIC* affinityAPIC = DynamicCastFromHeader<AffinityAPIC>(header);
233 		if(affinityAPIC)
234 		{
235 			const size_t processor = ProcessorFromApicId(affinityAPIC->apicId);
236 			const u32 proximityDomainNumber = affinityAPIC->ProximityDomainNumber();
237 			ProximityDomain& proximityDomain = proximityDomains[proximityDomainNumber];
238 			proximityDomain.processorMask |= Bit<uintptr_t>(processor);
239 		}
240 	}
241 
242 	return proximityDomains;
243 }
244 
PopulateNodesFromProximityDomains(const ProximityDomains & proximityDomains)245 static void PopulateNodesFromProximityDomains(const ProximityDomains& proximityDomains)
246 {
247 	for(ProximityDomains::const_iterator it = proximityDomains.begin(); it != proximityDomains.end(); ++it)
248 	{
249 		const u32 proximityDomainNumber = it->first;
250 		const ProximityDomain& proximityDomain = it->second;
251 
252 		Node* node = FindNodeWithProcessorMask(proximityDomain.processorMask);
253 		if(!node)
254 			node = AddNode();
255 		// (we don't know Windows' nodeNumber; it has hopefully already been set)
256 		node->proximityDomainNumber = proximityDomainNumber;
257 		node->processorMask = proximityDomain.processorMask;
258 	}
259 }
260 
261 #endif	// #if ARCH_X86_X64
262 
263 
264 //-----------------------------------------------------------------------------
265 
// state passed to ModuleInit together with InitTopology by the
// accessors below.
static ModuleInitState initState;
267 
InitTopology()268 static Status InitTopology()
269 {
270 	PopulateNodes();
271 
272 #if ARCH_X86_X64
273 	const SRAT* srat = (const SRAT*)acpi_GetTable("SRAT");
274 	if(srat && AreApicIdsReliable())
275 	{
276 		const ProximityDomains proximityDomains = ExtractProximityDomainsFromSRAT(srat);
277 		PopulateNodesFromProximityDomains(proximityDomains);
278 	}
279 #endif
280 
281 	// neither OS nor ACPI information is available
282 	if(numNodes == 0)
283 	{
284 		// add dummy node that contains all system processors
285 		Node* node = AddNode();
286 		node->nodeNumber = 0;
287 		node->proximityDomainNumber = 0;
288 		node->processorMask = os_cpu_ProcessorMask();
289 	}
290 
291 	return INFO::OK;
292 }
293 
numa_NumNodes()294 size_t numa_NumNodes()
295 {
296 	UNUSED2(ModuleInit(&initState, InitTopology));
297 	return numNodes;
298 }
299 
numa_NodeFromProcessor(size_t processor)300 size_t numa_NodeFromProcessor(size_t processor)
301 {
302 	UNUSED2(ModuleInit(&initState, InitTopology));
303 	ENSURE(processor < os_cpu_NumProcessors());
304 	Node* node = FindNodeWithProcessor(processor);
305 	ENSURE(node);
306 	return nodes-node;
307 }
308 
numa_ProcessorMaskFromNode(size_t node)309 uintptr_t numa_ProcessorMaskFromNode(size_t node)
310 {
311 	UNUSED2(ModuleInit(&initState, InitTopology));
312 	ENSURE(node < numNodes);
313 	return nodes[node].processorMask;
314 }
315 
NodeNumberFromNode(size_t node)316 static UCHAR NodeNumberFromNode(size_t node)
317 {
318 	UNUSED2(ModuleInit(&initState, InitTopology));
319 	ENSURE(node < numa_NumNodes());
320 	return nodes[node].nodeNumber;
321 }
322 
323 
324 //-----------------------------------------------------------------------------
325 // memory info
326 
numa_AvailableMemory(size_t node)327 size_t numa_AvailableMemory(size_t node)
328 {
329 	// note: it is said that GetNumaAvailableMemoryNode sometimes incorrectly
330 	// reports zero bytes. the actual cause may however be unexpected
331 	// RAM configuration, e.g. not all slots filled.
332 	WUTIL_FUNC(pGetNumaAvailableMemoryNode, BOOL, (UCHAR, PULONGLONG));
333 	WUTIL_IMPORT_KERNEL32(GetNumaAvailableMemoryNode, pGetNumaAvailableMemoryNode);
334 	if(pGetNumaAvailableMemoryNode)
335 	{
336 		const UCHAR nodeNumber = NodeNumberFromNode(node);
337 		ULONGLONG availableBytes;
338 		const BOOL ok = pGetNumaAvailableMemoryNode(nodeNumber, &availableBytes);
339 		WARN_IF_FALSE(ok);
340 		const size_t availableMiB = size_t(availableBytes / MiB);
341 		return availableMiB;
342 	}
343 	// NUMA not supported - return available system memory
344 	else
345 		return os_cpu_MemoryAvailable();
346 }
347 
348 
349 #pragma pack(push, 1)
350 
// ACPI System Locality Information Table
// (System Locality == Proximity Domain)
// (entries is a square matrix stored row by row; its values are
// relative distances multiplied by 10, the diagonal being 10)
struct SLIT
{
	AcpiTable header;
	u64 numSystemLocalities;
	u8 entries[1];		// numSystemLocalities*numSystemLocalities entries
};
359 
360 #pragma pack(pop)
361 
ReadRelativeDistanceFromSLIT(const SLIT * slit)362 static double ReadRelativeDistanceFromSLIT(const SLIT* slit)
363 {
364 	const size_t n = slit->numSystemLocalities;
365 	ENSURE(slit->header.size == sizeof(SLIT)-sizeof(slit->entries)+n*n);
366 	// diagonals are specified to be 10
367 	for(size_t i = 0; i < n; i++)
368 		ENSURE(slit->entries[i*n+i] == 10);
369 	// entries = relativeDistance * 10
370 	return *std::max_element(slit->entries, slit->entries+n*n) / 10.0;
371 }
372 
373 // @return ratio between max/min time required to access one node's
374 // memory from each processor.
MeasureRelativeDistance()375 static double MeasureRelativeDistance()
376 {
377 	const size_t size = 32*MiB;
378 	void* mem = vm::Allocate(size);
379 	ASSUME_ALIGNED(mem, pageSize);
380 
381 	const uintptr_t previousProcessorMask = os_cpu_SetThreadAffinityMask(os_cpu_ProcessorMask());
382 
383 	double minTime = 1e10, maxTime = 0.0;
384 	for(size_t node = 0; node < numa_NumNodes(); node++)
385 	{
386 		const uintptr_t processorMask = numa_ProcessorMaskFromNode(node);
387 		os_cpu_SetThreadAffinityMask(processorMask);
388 
389 		const double startTime = timer_Time();
390 		memset(mem, 0, size);
391 		const double elapsedTime = timer_Time() - startTime;
392 
393 		minTime = std::min(minTime, elapsedTime);
394 		maxTime = std::max(maxTime, elapsedTime);
395 	}
396 
397 	UNUSED2(os_cpu_SetThreadAffinityMask(previousProcessorMask));
398 
399 	vm::Free(mem, size);
400 
401 	return maxTime / minTime;
402 }
403 
// cached result for numa_Factor; written by InitRelativeDistance.
static double relativeDistance;
405 
InitRelativeDistance()406 static Status InitRelativeDistance()
407 {
408 	// early-out for non-NUMA systems (saves some time)
409 	if(numa_NumNodes() == 1)
410 	{
411 		relativeDistance = 1.0;
412 		return INFO::OK;
413 	}
414 
415 	// trust values reported by the BIOS, if available
416 	const SLIT* slit = (const SLIT*)acpi_GetTable("SLIT");
417 	if(slit)
418 		relativeDistance = ReadRelativeDistanceFromSLIT(slit);
419 	else
420 		relativeDistance = MeasureRelativeDistance();
421 
422 	ENSURE(relativeDistance >= 1.0);
423 	ENSURE(relativeDistance <= 4.0);
424 	return INFO::OK;
425 }
426 
numa_Factor()427 double numa_Factor()
428 {
429 	static ModuleInitState _initState;
430 	UNUSED2(ModuleInit(&_initState, InitRelativeDistance));
431 	return relativeDistance;
432 }
433 
434 
IsMemoryInterleaved()435 static bool IsMemoryInterleaved()
436 {
437 	if(numa_NumNodes() == 1)
438 		return false;
439 
440 	if(!acpi_GetTable("FACP"))	// no ACPI tables available
441 		return false;	// indeterminate, assume not interleaved
442 
443 	if(acpi_GetTable("SRAT"))	// present iff not interleaved
444 		return false;
445 
446 	return true;
447 }
448 
// cached result for numa_IsMemoryInterleaved; written by InitMemoryInterleaved.
static bool isMemoryInterleaved;
450 
InitMemoryInterleaved()451 static Status InitMemoryInterleaved()
452 {
453 	isMemoryInterleaved = IsMemoryInterleaved();
454 	return INFO::OK;
455 }
456 
numa_IsMemoryInterleaved()457 bool numa_IsMemoryInterleaved()
458 {
459 	static ModuleInitState _initState;
460 	UNUSED2(ModuleInit(&_initState, InitMemoryInterleaved));
461 	return isMemoryInterleaved;
462 }
463 
464 
465 //-----------------------------------------------------------------------------
466 
467 #if 0
468 
469 static bool VerifyPages(void* mem, size_t size, size_t pageSize, size_t node)
470 {
471 	WUTIL_FUNC(pQueryWorkingSetEx, BOOL, (HANDLE, PVOID, DWORD));
472 	WUTIL_IMPORT_KERNEL32(QueryWorkingSetEx, pQueryWorkingSetEx);
473 	if(!pQueryWorkingSetEx)
474 		return true;	// can't do anything
475 
476 #if WINVER >= 0x600
477 	size_t largePageSize = os_cpu_LargePageSize();
478 	ENSURE(largePageSize != 0); // this value is needed for later
479 
480 	// retrieve attributes of all pages constituting mem
481 	const size_t numPages = (size + pageSize-1) / pageSize;
482 	PSAPI_WORKING_SET_EX_INFORMATION* wsi = new PSAPI_WORKING_SET_EX_INFORMATION[numPages];
483 	for(size_t i = 0; i < numPages; i++)
484 		wsi[i].VirtualAddress = (u8*)mem + i*pageSize;
485 	pQueryWorkingSetEx(GetCurrentProcess(), wsi, DWORD(sizeof(PSAPI_WORKING_SET_EX_INFORMATION)*numPages));
486 
487 	// ensure each is valid and allocated on the correct node
488 	for(size_t i = 0; i < numPages; i++)
489 	{
490 		const PSAPI_WORKING_SET_EX_BLOCK& attributes = wsi[i].VirtualAttributes;
491 		if(!attributes.Valid)
492 			return false;
493 		if((attributes.LargePage != 0) != (pageSize == largePageSize))
494 		{
495 			debug_printf("NUMA: is not a large page\n");
496 			return false;
497 		}
498 		if(attributes.Node != node)
499 		{
500 			debug_printf("NUMA: allocated from remote node\n");
501 			return false;
502 		}
503 	}
504 
505 	delete[] wsi;
506 #else
507 	UNUSED2(mem);
508 	UNUSED2(size);
509 	UNUSED2(pageSize);
510 	UNUSED2(node);
511 #endif
512 
513 	return true;
514 }
515 
516 #endif
517