1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4
5 //
6 // Implementation of the Redhawk Platform Abstraction Layer (PAL) library when MinWin is the platform. In this
7 // case most or all of the import requirements which Redhawk has can be satisfied via a forwarding export to
8 // some native MinWin library. Therefore most of the work is done in the .def file and there is very little
9 // code here.
10 //
11 // Note that in general we don't want to assume that Windows and Redhawk global definitions can co-exist.
12 // Since this code must include Windows headers to do its job we can't therefore safely include general
13 // Redhawk header files.
14 //
15 #include "common.h"
16 #include <windows.h>
17 #include <stdio.h>
18 #include <errno.h>
19 #include <evntprov.h>
20 #ifdef PROJECTN
21 #include <roapi.h>
22 #endif
23
24 #include "holder.h"
25
26 #define PalRaiseFailFastException RaiseFailFastException
27
PalEventWrite(REGHANDLE arg1,const EVENT_DESCRIPTOR * arg2,uint32_t arg3,EVENT_DATA_DESCRIPTOR * arg4)28 uint32_t PalEventWrite(REGHANDLE arg1, const EVENT_DESCRIPTOR * arg2, uint32_t arg3, EVENT_DATA_DESCRIPTOR * arg4)
29 {
30 return EventWrite(arg1, arg2, arg3, arg4);
31 }
32
33 #include "gcenv.h"
34
35
36 #define REDHAWK_PALEXPORT extern "C"
37 #define REDHAWK_PALAPI __stdcall
38
#ifndef RUNTIME_SERVICES_ONLY
// Index for the fiber local storage of the attached thread pointer
static UInt32 g_flsIndex = FLS_OUT_OF_INDEXES;
#endif

// Bitmask of PalCapability flags supported by this PAL; set once in PalInit.
static DWORD g_dwPALCapabilities;

// Cached basic machine parameters consumed by the GC; filled in by InitializeSystemInfo.
GCSystemInfo g_SystemInfo;
47
InitializeSystemInfo()48 bool InitializeSystemInfo()
49 {
50 SYSTEM_INFO systemInfo;
51 GetSystemInfo(&systemInfo);
52
53 g_SystemInfo.dwNumberOfProcessors = systemInfo.dwNumberOfProcessors;
54 g_SystemInfo.dwPageSize = systemInfo.dwPageSize;
55 g_SystemInfo.dwAllocationGranularity = systemInfo.dwAllocationGranularity;
56
57 return true;
58 }
59
60 extern bool PalQueryProcessorTopology();
61
62 #ifndef RUNTIME_SERVICES_ONLY
63 // This is called when each *fiber* is destroyed. When the home fiber of a thread is destroyed,
64 // it means that the thread itself is destroyed.
65 // Since we receive that notification outside of the Loader Lock, it allows us to safely acquire
66 // the ThreadStore lock in the RuntimeThreadShutdown.
FiberDetachCallback(void * lpFlsData)67 void __stdcall FiberDetachCallback(void* lpFlsData)
68 {
69 ASSERT(g_flsIndex != FLS_OUT_OF_INDEXES);
70 ASSERT(lpFlsData == FlsGetValue(g_flsIndex));
71
72 if (lpFlsData != NULL)
73 {
74 // The current fiber is the home fiber of a thread, so the thread is shutting down
75 RuntimeThreadShutdown(lpFlsData);
76 }
77 }
78 #endif
79
80 // The Redhawk PAL must be initialized before any of its exports can be called. Returns true for a successful
81 // initialization and false on failure.
PalInit()82 REDHAWK_PALEXPORT bool REDHAWK_PALAPI PalInit()
83 {
84 g_dwPALCapabilities = WriteWatchCapability | GetCurrentProcessorNumberCapability | LowMemoryNotificationCapability;
85
86 if (!PalQueryProcessorTopology())
87 return false;
88
89 #ifndef RUNTIME_SERVICES_ONLY
90 // We use fiber detach callbacks to run our thread shutdown code because the fiber detach
91 // callback is made without the OS loader lock
92 g_flsIndex = FlsAlloc(FiberDetachCallback);
93 if (g_flsIndex == FLS_OUT_OF_INDEXES)
94 {
95 return false;
96 }
97 #endif
98
99 return true;
100 }
101
102 // Given a mask of capabilities return true if all of them are supported by the current PAL.
PalHasCapability(PalCapability capability)103 REDHAWK_PALEXPORT bool REDHAWK_PALAPI PalHasCapability(PalCapability capability)
104 {
105 return (g_dwPALCapabilities & (DWORD)capability) == (DWORD)capability;
106 }
107
108 #ifndef RUNTIME_SERVICES_ONLY
109 // Attach thread to PAL.
110 // It can be called multiple times for the same thread.
111 // It fails fast if a different thread was already registered with the current fiber
112 // or if the thread was already registered with a different fiber.
113 // Parameters:
114 // thread - thread to attach
PalAttachThread(void * thread)115 REDHAWK_PALEXPORT void REDHAWK_PALAPI PalAttachThread(void* thread)
116 {
117 void* threadFromCurrentFiber = FlsGetValue(g_flsIndex);
118
119 if (threadFromCurrentFiber != NULL)
120 {
121 ASSERT_UNCONDITIONALLY("Multiple threads encountered from a single fiber");
122 RhFailFast();
123 }
124
125 // Associate the current fiber with the current thread. This makes the current fiber the thread's "home"
126 // fiber. This fiber is the only fiber allowed to execute managed code on this thread. When this fiber
127 // is destroyed, we consider the thread to be destroyed.
128 FlsSetValue(g_flsIndex, thread);
129 }
130
131 // Detach thread from PAL.
132 // It fails fast if some other thread value was attached to PAL.
133 // Parameters:
134 // thread - thread to detach
135 // Return:
136 // true if the thread was detached, false if there was no attached thread
PalDetachThread(void * thread)137 REDHAWK_PALEXPORT bool REDHAWK_PALAPI PalDetachThread(void* thread)
138 {
139 ASSERT(g_flsIndex != FLS_OUT_OF_INDEXES);
140 void* threadFromCurrentFiber = FlsGetValue(g_flsIndex);
141
142 if (threadFromCurrentFiber == NULL)
143 {
144 // we've seen this thread, but not this fiber. It must be a "foreign" fiber that was
145 // borrowing this thread.
146 return false;
147 }
148
149 if (threadFromCurrentFiber != thread)
150 {
151 ASSERT_UNCONDITIONALLY("Detaching a thread from the wrong fiber");
152 RhFailFast();
153 }
154
155 FlsSetValue(g_flsIndex, NULL);
156 return true;
157 }
158 #endif // RUNTIME_SERVICES_ONLY
159
PalGetCurrentThreadIdForLogging()160 extern "C" UInt64 PalGetCurrentThreadIdForLogging()
161 {
162 return GetCurrentThreadId();
163 }
164
165 #if !defined(USE_PORTABLE_HELPERS) && !defined(FEATURE_RX_THUNKS)
PalAllocateThunksFromTemplate(_In_ HANDLE hTemplateModule,UInt32 templateRva,size_t templateSize,_Outptr_result_bytebuffer_ (templateSize)void ** newThunksOut)166 REDHAWK_PALEXPORT UInt32_BOOL REDHAWK_PALAPI PalAllocateThunksFromTemplate(_In_ HANDLE hTemplateModule, UInt32 templateRva, size_t templateSize, _Outptr_result_bytebuffer_(templateSize) void** newThunksOut)
167 {
168 #ifdef XBOX_ONE
169 return E_NOTIMPL;
170 #else
171 BOOL success = FALSE;
172 HANDLE hMap = NULL, hFile = INVALID_HANDLE_VALUE;
173
174 const WCHAR * wszModuleFileName = NULL;
175 if (PalGetModuleFileName(&wszModuleFileName, hTemplateModule) == 0 || wszModuleFileName == NULL)
176 return FALSE;
177
178 hFile = CreateFileW(wszModuleFileName, GENERIC_READ | GENERIC_EXECUTE, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
179 if (hFile == INVALID_HANDLE_VALUE)
180 goto cleanup;
181
182 hMap = CreateFileMapping(hFile, NULL, SEC_IMAGE | PAGE_READONLY, 0, 0, NULL);
183 if (hMap == NULL)
184 goto cleanup;
185
186 *newThunksOut = MapViewOfFile(hMap, 0, 0, templateRva, templateSize);
187 success = ((*newThunksOut) != NULL);
188
189 cleanup:
190 CloseHandle(hMap);
191 CloseHandle(hFile);
192
193 return success;
194 #endif
195 }
196
PalFreeThunksFromTemplate(_In_ void * pBaseAddress)197 REDHAWK_PALEXPORT UInt32_BOOL REDHAWK_PALAPI PalFreeThunksFromTemplate(_In_ void *pBaseAddress)
198 {
199 #ifdef XBOX_ONE
200 return TRUE;
201 #else
202 return UnmapViewOfFile(pBaseAddress);
203 #endif
204 }
205 #endif // !USE_PORTABLE_HELPERS && !FEATURE_RX_THUNKS
206
PalMarkThunksAsValidCallTargets(void * virtualAddress,int thunkSize,int thunksPerBlock,int thunkBlockSize,int thunkBlocksPerMapping)207 REDHAWK_PALEXPORT UInt32_BOOL REDHAWK_PALAPI PalMarkThunksAsValidCallTargets(
208 void *virtualAddress,
209 int thunkSize,
210 int thunksPerBlock,
211 int thunkBlockSize,
212 int thunkBlocksPerMapping)
213 {
214 // For CoreRT we are using RWX pages so there is no need for this API for now.
215 // Once we have a scenario for non-RWX pages we should be able to put the implementation here
216 return TRUE;
217 }
218
// Wait for any one of the given handles to be signaled, using
// CoWaitForMultipleHandles so COM/WinRT message pumping can occur during the wait.
// Parameters:
//  alertable          - if nonzero, the wait is alertable (APCs may run)
//  timeout            - timeout in milliseconds
//  handleCount        - number of entries in pHandles
//  pHandles           - handles to wait on
//  allowReentrantWait - NOTE(review): unused in this implementation -- the COM
//                       wait decides re-entrancy itself; confirm intent.
// Return: index of the signaled handle, WAIT_TIMEOUT on timeout, or WAIT_FAILED
// with the last error set from the failing HRESULT.
REDHAWK_PALEXPORT UInt32 REDHAWK_PALAPI PalCompatibleWaitAny(UInt32_BOOL alertable, UInt32 timeout, UInt32 handleCount, HANDLE* pHandles, UInt32_BOOL allowReentrantWait)
{
    DWORD index;
    SetLastError(ERROR_SUCCESS); // recommended by MSDN.
    HRESULT hr = CoWaitForMultipleHandles(alertable ? COWAIT_ALERTABLE : 0, timeout, handleCount, pHandles, &index);

    switch (hr)
    {
    case S_OK:
        return index;

    case RPC_S_CALLPENDING:
        // CoWaitForMultipleHandles reports a timeout as RPC_S_CALLPENDING.
        return WAIT_TIMEOUT;

    default:
        // Surface the failure the way the Win32 wait APIs do.
        SetLastError(HRESULT_CODE(hr));
        return WAIT_FAILED;
    }
}
238
PalSleep(UInt32 milliseconds)239 REDHAWK_PALEXPORT void REDHAWK_PALAPI PalSleep(UInt32 milliseconds)
240 {
241 return Sleep(milliseconds);
242 }
243
PalSwitchToThread()244 REDHAWK_PALEXPORT UInt32_BOOL REDHAWK_PALAPI PalSwitchToThread()
245 {
246 return SwitchToThread();
247 }
248
PalCreateEventW(_In_opt_ LPSECURITY_ATTRIBUTES pEventAttributes,UInt32_BOOL manualReset,UInt32_BOOL initialState,_In_opt_z_ LPCWSTR pName)249 REDHAWK_PALEXPORT HANDLE REDHAWK_PALAPI PalCreateEventW(_In_opt_ LPSECURITY_ATTRIBUTES pEventAttributes, UInt32_BOOL manualReset, UInt32_BOOL initialState, _In_opt_z_ LPCWSTR pName)
250 {
251 return CreateEventW(pEventAttributes, manualReset, initialState, pName);
252 }
253
// Capture a PAL_LIMITED_CONTEXT (IP, stack and the callee-saved/integer
// registers the runtime cares about) for the given thread via GetThreadContext.
// Returns false when the OS cannot supply the context or when the thread was
// stopped at a point where the context is not reliable -- the caller is
// expected to resume the thread and retry.
REDHAWK_PALEXPORT _Success_(return) bool REDHAWK_PALAPI PalGetThreadContext(HANDLE hThread, _Out_ PAL_LIMITED_CONTEXT * pCtx)
{
    CONTEXT win32ctx;

    // CONTEXT_EXCEPTION_REQUEST asks the kernel to report whether the thread is
    // inside a system service or exception dispatch (see the flags check below).
    win32ctx.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_EXCEPTION_REQUEST;

    if (!GetThreadContext(hThread, &win32ctx))
        return false;

    // The CONTEXT_SERVICE_ACTIVE and CONTEXT_EXCEPTION_ACTIVE output flags indicate we suspended the thread
    // at a point where the kernel cannot guarantee a completely accurate context. We'll fail the request in
    // this case (which should force our caller to resume the thread and try again -- since this is a fairly
    // narrow window we're highly likely to succeed next time).
    // Note: in some cases (x86 WOW64, ARM32 on ARM64) the OS will not set the CONTEXT_EXCEPTION_REPORTING flag
    // if the thread is executing in kernel mode (i.e. in the middle of a syscall or exception handling).
    // Therefore, we should treat the absence of the CONTEXT_EXCEPTION_REPORTING flag as an indication that
    // it is not safe to manipulate with the current state of the thread context.
    if ((win32ctx.ContextFlags & CONTEXT_EXCEPTION_REPORTING) == 0 ||
        (win32ctx.ContextFlags & (CONTEXT_SERVICE_ACTIVE | CONTEXT_EXCEPTION_ACTIVE)))
        return false;

    // Copy the architecture-specific register set into the PAL's limited context.
#ifdef _X86_
    pCtx->IP = win32ctx.Eip;
    pCtx->Rsp = win32ctx.Esp;
    pCtx->Rbp = win32ctx.Ebp;
    pCtx->Rdi = win32ctx.Edi;
    pCtx->Rsi = win32ctx.Esi;
    pCtx->Rax = win32ctx.Eax;
    pCtx->Rbx = win32ctx.Ebx;
#elif defined(_AMD64_)
    pCtx->IP = win32ctx.Rip;
    pCtx->Rsp = win32ctx.Rsp;
    pCtx->Rbp = win32ctx.Rbp;
    pCtx->Rdi = win32ctx.Rdi;
    pCtx->Rsi = win32ctx.Rsi;
    pCtx->Rax = win32ctx.Rax;
    pCtx->Rbx = win32ctx.Rbx;
    pCtx->R12 = win32ctx.R12;
    pCtx->R13 = win32ctx.R13;
    pCtx->R14 = win32ctx.R14;
    pCtx->R15 = win32ctx.R15;
#elif defined(_ARM_)
    pCtx->IP = win32ctx.Pc;
    pCtx->R0 = win32ctx.R0;
    pCtx->R4 = win32ctx.R4;
    pCtx->R5 = win32ctx.R5;
    pCtx->R6 = win32ctx.R6;
    pCtx->R7 = win32ctx.R7;
    pCtx->R8 = win32ctx.R8;
    pCtx->R9 = win32ctx.R9;
    pCtx->R10 = win32ctx.R10;
    pCtx->R11 = win32ctx.R11;
    pCtx->SP = win32ctx.Sp;
    pCtx->LR = win32ctx.Lr;
#elif defined(_ARM64_)
    pCtx->IP = win32ctx.Pc;
    pCtx->X0 = win32ctx.X0;
    pCtx->X1 = win32ctx.X1;
    // TODO: Copy X2-X7 when we start supporting HVA's
    pCtx->X19 = win32ctx.X19;
    pCtx->X20 = win32ctx.X20;
    pCtx->X21 = win32ctx.X21;
    pCtx->X22 = win32ctx.X22;
    pCtx->X23 = win32ctx.X23;
    pCtx->X24 = win32ctx.X24;
    pCtx->X25 = win32ctx.X25;
    pCtx->X26 = win32ctx.X26;
    pCtx->X27 = win32ctx.X27;
    pCtx->X28 = win32ctx.X28;
    pCtx->SP = win32ctx.Sp;
    pCtx->LR = win32ctx.Lr;
    pCtx->FP = win32ctx.Fp;
#else
#error Unsupported platform
#endif
    return true;
}
331
332
PalHijack(HANDLE hThread,_In_ PalHijackCallback callback,_In_opt_ void * pCallbackContext)333 REDHAWK_PALEXPORT UInt32 REDHAWK_PALAPI PalHijack(HANDLE hThread, _In_ PalHijackCallback callback, _In_opt_ void* pCallbackContext)
334 {
335 if (hThread == INVALID_HANDLE_VALUE)
336 {
337 return (UInt32)E_INVALIDARG;
338 }
339
340 if (SuspendThread(hThread) == (DWORD)-1)
341 {
342 return HRESULT_FROM_WIN32(GetLastError());
343 }
344
345 PAL_LIMITED_CONTEXT ctx;
346 HRESULT result;
347 if (!PalGetThreadContext(hThread, &ctx))
348 {
349 result = HRESULT_FROM_WIN32(GetLastError());
350 }
351 else
352 {
353 result = callback(hThread, &ctx, pCallbackContext) ? S_OK : E_FAIL;
354 }
355
356 ResumeThread(hThread);
357
358 return result;
359 }
360
PalStartBackgroundWork(_In_ BackgroundCallback callback,_In_opt_ void * pCallbackContext,BOOL highPriority)361 REDHAWK_PALEXPORT HANDLE REDHAWK_PALAPI PalStartBackgroundWork(_In_ BackgroundCallback callback, _In_opt_ void* pCallbackContext, BOOL highPriority)
362 {
363 HANDLE hThread = CreateThread(
364 NULL,
365 0,
366 (LPTHREAD_START_ROUTINE)callback,
367 pCallbackContext,
368 highPriority ? CREATE_SUSPENDED : 0,
369 NULL);
370
371 if (hThread == NULL)
372 return NULL;
373
374 if (highPriority)
375 {
376 SetThreadPriority(hThread, THREAD_PRIORITY_HIGHEST);
377 ResumeThread(hThread);
378 }
379
380 return hThread;
381 }
382
PalStartBackgroundGCThread(_In_ BackgroundCallback callback,_In_opt_ void * pCallbackContext)383 REDHAWK_PALEXPORT bool REDHAWK_PALAPI PalStartBackgroundGCThread(_In_ BackgroundCallback callback, _In_opt_ void* pCallbackContext)
384 {
385 return PalStartBackgroundWork(callback, pCallbackContext, FALSE) != NULL;
386 }
387
PalStartFinalizerThread(_In_ BackgroundCallback callback,_In_opt_ void * pCallbackContext)388 REDHAWK_PALEXPORT bool REDHAWK_PALAPI PalStartFinalizerThread(_In_ BackgroundCallback callback, _In_opt_ void* pCallbackContext)
389 {
390 return PalStartBackgroundWork(callback, pCallbackContext, TRUE) != NULL;
391 }
392
// Return the 32-bit millisecond tick count. Note: GetTickCount wraps roughly
// every 49.7 days; callers must tolerate wraparound, hence the deliberate
// suppression of warning 28159 that recommends GetTickCount64.
REDHAWK_PALEXPORT UInt32 REDHAWK_PALAPI PalGetTickCount()
{
#pragma warning(push)
#pragma warning(disable: 28159) // Consider GetTickCount64 instead
    return GetTickCount();
#pragma warning(pop)
}
400
PalEventEnabled(REGHANDLE regHandle,_In_ const EVENT_DESCRIPTOR * eventDescriptor)401 REDHAWK_PALEXPORT bool REDHAWK_PALAPI PalEventEnabled(REGHANDLE regHandle, _In_ const EVENT_DESCRIPTOR* eventDescriptor)
402 {
403 return !!EventEnabled(regHandle, eventDescriptor);
404 }
405
PalCreateFileW(_In_z_ LPCWSTR pFileName,uint32_t desiredAccess,uint32_t shareMode,_In_opt_ void * pSecurityAttributes,uint32_t creationDisposition,uint32_t flagsAndAttributes,HANDLE hTemplateFile)406 REDHAWK_PALEXPORT HANDLE REDHAWK_PALAPI PalCreateFileW(
407 _In_z_ LPCWSTR pFileName,
408 uint32_t desiredAccess,
409 uint32_t shareMode,
410 _In_opt_ void* pSecurityAttributes,
411 uint32_t creationDisposition,
412 uint32_t flagsAndAttributes,
413 HANDLE hTemplateFile)
414 {
415 return CreateFileW(pFileName, desiredAccess, shareMode, (LPSECURITY_ATTRIBUTES)pSecurityAttributes,
416 creationDisposition, flagsAndAttributes, hTemplateFile);
417 }
418
PalCreateLowMemoryNotification()419 REDHAWK_PALEXPORT HANDLE REDHAWK_PALAPI PalCreateLowMemoryNotification()
420 {
421 return CreateMemoryResourceNotification(LowMemoryResourceNotification);
422 }
423
PalGetModuleHandleFromPointer(_In_ void * pointer)424 REDHAWK_PALEXPORT HANDLE REDHAWK_PALAPI PalGetModuleHandleFromPointer(_In_ void* pointer)
425 {
426 HMODULE module;
427 if (!GetModuleHandleExW(
428 GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
429 (LPCWSTR)pointer,
430 &module))
431 {
432 return NULL;
433 }
434
435 return (HANDLE)module;
436 }
437
PalAddVectoredExceptionHandler(UInt32 firstHandler,_In_ PVECTORED_EXCEPTION_HANDLER vectoredHandler)438 REDHAWK_PALEXPORT void* REDHAWK_PALAPI PalAddVectoredExceptionHandler(UInt32 firstHandler, _In_ PVECTORED_EXCEPTION_HANDLER vectoredHandler)
439 {
440 return AddVectoredExceptionHandler(firstHandler, vectoredHandler);
441 }
442
PalPrintFatalError(const char * message)443 REDHAWK_PALEXPORT void PalPrintFatalError(const char* message)
444 {
445 // Write the message using lowest-level OS API available. This is used to print the stack overflow
446 // message, so there is not much that can be done here.
447 DWORD dwBytesWritten;
448 WriteFile(GetStdHandle(STD_ERROR_HANDLE), message, (DWORD)strlen(message), &dwBytesWritten, NULL);
449 }
450
451 //
452 // -----------------------------------------------------------------------------------------------------------
453 //
454 // Some more globally initialized data (in InitializeSubsystems), this time internal and used to cache
455 // information returned by various GC support routines.
456 //
// Cached logical CPU count; 0 until computed.
static UInt32 g_cLogicalCpus = 0;
// Largest on-die cache size in bytes, and the variant adjusted for GC use;
// 0 until computed.
static size_t g_cbLargestOnDieCache = 0;
static size_t g_cbLargestOnDieCacheAdjusted = 0;
460
461
462
463 #if (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)
464 EXTERN_C DWORD __fastcall getcpuid(DWORD arg, unsigned char result[16]);
465 EXTERN_C DWORD __fastcall getextcpuid(DWORD arg1, DWORD arg2, unsigned char result[16]);
466
QueryAMDCacheInfo(_Out_ UInt32 * pcbCache,_Out_ UInt32 * pcbCacheAdjusted)467 void QueryAMDCacheInfo(_Out_ UInt32* pcbCache, _Out_ UInt32* pcbCacheAdjusted)
468 {
469 unsigned char buffer[16];
470
471 if (getcpuid(0x80000000, buffer) >= 0x80000006)
472 {
473 UInt32* pdwBuffer = (UInt32*)buffer;
474
475 getcpuid(0x80000006, buffer);
476
477 UInt32 dwL2CacheBits = pdwBuffer[2];
478 UInt32 dwL3CacheBits = pdwBuffer[3];
479
480 *pcbCache = (size_t)((dwL2CacheBits >> 16) * 1024); // L2 cache size in ECX bits 31-16
481
482 getcpuid(0x1, buffer);
483 UInt32 dwBaseFamily = (pdwBuffer[0] & (0xF << 8)) >> 8;
484 UInt32 dwExtFamily = (pdwBuffer[0] & (0xFF << 20)) >> 20;
485 UInt32 dwFamily = dwBaseFamily >= 0xF ? dwBaseFamily + dwExtFamily : dwBaseFamily;
486
487 if (dwFamily >= 0x10)
488 {
489 BOOL bSkipAMDL3 = FALSE;
490
491 if (dwFamily == 0x10) // are we running on a Barcelona (Family 10h) processor?
492 {
493 // check model
494 UInt32 dwBaseModel = (pdwBuffer[0] & (0xF << 4)) >> 4;
495 UInt32 dwExtModel = (pdwBuffer[0] & (0xF << 16)) >> 16;
496 UInt32 dwModel = dwBaseFamily >= 0xF ? (dwExtModel << 4) | dwBaseModel : dwBaseModel;
497
498 switch (dwModel)
499 {
500 case 0x2:
501 // 65nm parts do not benefit from larger Gen0
502 bSkipAMDL3 = TRUE;
503 break;
504
505 case 0x4:
506 default:
507 bSkipAMDL3 = FALSE;
508 }
509 }
510
511 if (!bSkipAMDL3)
512 {
513 // 45nm Greyhound parts (and future parts based on newer northbridge) benefit
514 // from increased gen0 size, taking L3 into account
515 getcpuid(0x80000008, buffer);
516 UInt32 dwNumberOfCores = (pdwBuffer[2] & (0xFF)) + 1; // NC is in ECX bits 7-0
517
518 UInt32 dwL3CacheSize = (size_t)((dwL3CacheBits >> 18) * 512 * 1024); // L3 size in EDX bits 31-18 * 512KB
519 // L3 is shared between cores
520 dwL3CacheSize = dwL3CacheSize / dwNumberOfCores;
521 *pcbCache += dwL3CacheSize; // due to exclusive caches, add L3 size (possibly zero) to L2
522 // L1 is too small to worry about, so ignore it
523 }
524 }
525 }
526 *pcbCacheAdjusted = *pcbCache;
527 }
528
529 #ifdef _DEBUG
530 #define CACHE_WAY_BITS 0xFFC00000 // number of cache WAYS-Associativity is returned in EBX[31:22] (10 bits) using cpuid function 4
531 #define CACHE_PARTITION_BITS 0x003FF000 // number of cache Physical Partitions is returned in EBX[21:12] (10 bits) using cpuid function 4
532 #define CACHE_LINESIZE_BITS 0x00000FFF // Linesize returned in EBX[11:0] (12 bits) using cpuid function 4
533 #define LIMITED_METHOD_CONTRACT
534
// Use CPUID leaf 4 (deterministic cache parameters) to find the size in bytes
// of the largest cache level reported by the processor. Returns 0 when the
// deterministic cache enumeration is not supported (or the loop safety limit
// is hit).
size_t CLR_GetIntelDeterministicCacheEnum()
{
    LIMITED_METHOD_CONTRACT;
    size_t retVal = 0;
    unsigned char buffer[16];

    // CPUID leaf 0: EAX reports the maximum supported standard leaf.
    DWORD maxCpuid = getextcpuid(0, 0, buffer);

    DWORD* dwBuffer = (DWORD*)buffer;

    if ((maxCpuid > 3) && (maxCpuid < 0x80000000)) // Deterministic Cache Enum is Supported
    {
        DWORD dwCacheWays, dwCachePartitions, dwLineSize, dwSets;
        DWORD retEAX = 0;
        DWORD loopECX = 0;
        size_t maxSize = 0;
        size_t curSize = 0;

        // Make First call to getextcpuid with loopECX=0. loopECX provides an index indicating which level to return information about.
        // The second parameter is input EAX=4, to specify we want deterministic cache parameter leaf information.
        // getextcpuid with EAX=4 should be executed with loopECX = 0,1, ... until retEAX [4:0] contains 00000b, indicating no more
        // cache levels are supported.

        getextcpuid(loopECX, 4, buffer);
        retEAX = dwBuffer[0]; // get EAX

        int i = 0;
        while (retEAX & 0x1f) // Crack cache enums and loop while EAX > 0
        {
            // Cache size = ways * partitions * line size * sets; each EBX/ECX
            // field is encoded minus one, hence the +1 on every factor.
            dwCacheWays = (dwBuffer[1] & CACHE_WAY_BITS) >> 22;
            dwCachePartitions = (dwBuffer[1] & CACHE_PARTITION_BITS) >> 12;
            dwLineSize = dwBuffer[1] & CACHE_LINESIZE_BITS;
            dwSets = dwBuffer[2]; // ECX

            curSize = (dwCacheWays + 1)*(dwCachePartitions + 1)*(dwLineSize + 1)*(dwSets + 1);

            if (maxSize < curSize)
                maxSize = curSize;

            loopECX++;
            getextcpuid(loopECX, 4, buffer);
            retEAX = dwBuffer[0]; // get EAX[4:0];
            i++;
            if (i > 16) // prevent infinite looping
                return 0;
        }
        retVal = maxSize;
    }

    return retVal;
}
587
588 // The following function uses CPUID function 2 with descriptor values to determine the cache size. This requires a-priori
589 // knowledge of the descriptor values. This works on gallatin and prior processors (already released processors).
590 // If successful, this function returns the cache size in bytes of the highest level on-die cache. Returns 0 on failure.
591
// Legacy cache-size detection via CPUID leaf 2 descriptor bytes; returns the
// size in bytes of the largest recognized cache descriptor, or 0 if none match.
size_t CLR_GetIntelDescriptorValuesCache()
{
    LIMITED_METHOD_CONTRACT;
    size_t size = 0;
    size_t maxSize = 0;
    unsigned char buffer[16];

    getextcpuid(0, 2, buffer); // call CPUID with EAX function 2H to obtain cache descriptor values

    // AL (buffer[0]) gives the number of times CPUID(2) must be executed to
    // retrieve the complete descriptor set.
    for (int i = buffer[0]; --i >= 0;)
    {
        int j;
        for (j = 3; j < 16; j += 4)
        {
            // if the information in a register is marked invalid, set to null descriptors
            if (buffer[j] & 0x80)
            {
                buffer[j - 3] = 0;
                buffer[j - 2] = 0;
                buffer[j - 1] = 0;
                buffer[j - 0] = 0;
            }
        }

        // Scan every descriptor byte (skipping AL at index 0) and map known
        // descriptors to their cache sizes, remembering the largest seen.
        for (j = 1; j < 16; j++)
        {
            switch (buffer[j]) // need to add descriptor values for 8M and 12M when they become known
            {
            case 0x41:
            case 0x79:
                size = 128 * 1024;
                break;

            case 0x42:
            case 0x7A:
            case 0x82:
                size = 256 * 1024;
                break;

            case 0x22:
            case 0x43:
            case 0x7B:
            case 0x83:
            case 0x86:
                size = 512 * 1024;
                break;

            case 0x23:
            case 0x44:
            case 0x7C:
            case 0x84:
            case 0x87:
                size = 1024 * 1024;
                break;

            case 0x25:
            case 0x45:
            case 0x85:
                size = 2 * 1024 * 1024;
                break;

            case 0x29:
                size = 4 * 1024 * 1024;
                break;
            }
            if (maxSize < size)
                maxSize = size;
        }

        // More descriptor batches remain; query the next one.
        if (i > 0)
            getextcpuid(0, 2, buffer);
    }
    return maxSize;
}
666
// Determine (via CPUID) the size in bytes of the largest on-die cache for
// Intel and AMD processors. bTrueSize selects the actual hardware size
// (maxTrueSize) versus a value adjusted for GC gen0 sizing (maxSize). Results
// are cached in function-local statics so the CPUID probing runs at most once;
// returns 0 for unrecognized vendors or if probing faults.
size_t CLR_GetLargestOnDieCacheSizeX86(UInt32_BOOL bTrueSize)
{

    static size_t maxSize;
    static size_t maxTrueSize;

    if (maxSize)
    {
        // maxSize and maxTrueSize cached
        if (bTrueSize)
        {
            return maxTrueSize;
        }
        else
        {
            return maxSize;
        }
    }

    // CPUID probing is wrapped in SEH; on any fault the sizes simply stay zero.
    __try
    {
        unsigned char buffer[16];
        DWORD* dwBuffer = (DWORD*)buffer;

        DWORD maxCpuId = getcpuid(0, buffer);

        // Vendor string comes back in EBX/EDX/ECX; these little-endian character
        // constants spell "GenuineIntel".
        if (dwBuffer[1] == 'uneG')
        {
            if (dwBuffer[3] == 'Ieni')
            {
                if (dwBuffer[2] == 'letn')
                {
                    size_t tempSize = 0;
                    if (maxCpuId >= 2) // cpuid support for cache size determination is available
                    {
                        tempSize = CLR_GetIntelDeterministicCacheEnum(); // try to use use deterministic cache size enumeration
                        if (!tempSize)
                        { // deterministic enumeration failed, fallback to legacy enumeration using descriptor values
                            tempSize = CLR_GetIntelDescriptorValuesCache();
                        }
                    }

                    // update maxSize once with final value
                    maxTrueSize = tempSize;

#ifdef _WIN64
                    if (maxCpuId >= 2)
                    {
                        // If we're running on a Prescott or greater core, EM64T tests
                        // show that starting with a gen0 larger than LLC improves performance.
                        // Thus, start with a gen0 size that is larger than the cache. The value of
                        // 3 is a reasonable tradeoff between workingset and performance.
                        maxSize = maxTrueSize * 3;
                    }
                    else
#endif
                    {
                        maxSize = maxTrueSize;
                    }
                }
            }
        }

        // These little-endian character constants spell "AuthenticAMD".
        if (dwBuffer[1] == 'htuA') {
            if (dwBuffer[3] == 'itne') {
                if (dwBuffer[2] == 'DMAc') {

                    if (getcpuid(0x80000000, buffer) >= 0x80000006)
                    {
                        getcpuid(0x80000006, buffer);

                        DWORD dwL2CacheBits = dwBuffer[2];
                        DWORD dwL3CacheBits = dwBuffer[3];

                        maxTrueSize = (size_t)((dwL2CacheBits >> 16) * 1024); // L2 cache size in ECX bits 31-16

                        getcpuid(0x1, buffer);
                        DWORD dwBaseFamily = (dwBuffer[0] & (0xF << 8)) >> 8;
                        DWORD dwExtFamily = (dwBuffer[0] & (0xFF << 20)) >> 20;
                        DWORD dwFamily = dwBaseFamily >= 0xF ? dwBaseFamily + dwExtFamily : dwBaseFamily;

                        if (dwFamily >= 0x10)
                        {
                            BOOL bSkipAMDL3 = FALSE;

                            if (dwFamily == 0x10) // are we running on a Barcelona (Family 10h) processor?
                            {
                                // check model
                                DWORD dwBaseModel = (dwBuffer[0] & (0xF << 4)) >> 4;
                                DWORD dwExtModel = (dwBuffer[0] & (0xF << 16)) >> 16;
                                DWORD dwModel = dwBaseFamily >= 0xF ? (dwExtModel << 4) | dwBaseModel : dwBaseModel;

                                switch (dwModel)
                                {
                                case 0x2:
                                    // 65nm parts do not benefit from larger Gen0
                                    bSkipAMDL3 = TRUE;
                                    break;

                                case 0x4:
                                default:
                                    bSkipAMDL3 = FALSE;
                                }
                            }

                            if (!bSkipAMDL3)
                            {
                                // 45nm Greyhound parts (and future parts based on newer northbridge) benefit
                                // from increased gen0 size, taking L3 into account
                                getcpuid(0x80000008, buffer);
                                DWORD dwNumberOfCores = (dwBuffer[2] & (0xFF)) + 1; // NC is in ECX bits 7-0

                                DWORD dwL3CacheSize = (size_t)((dwL3CacheBits >> 18) * 512 * 1024); // L3 size in EDX bits 31-18 * 512KB
                                // L3 is shared between cores
                                dwL3CacheSize = dwL3CacheSize / dwNumberOfCores;
                                maxTrueSize += dwL3CacheSize; // due to exclusive caches, add L3 size (possibly zero) to L2
                                // L1 is too small to worry about, so ignore it
                            }
                        }


                        maxSize = maxTrueSize;
                    }
                }
            }
        }
    }
    __except (1)
    {
        // Swallow any fault raised while probing CPUID; sizes remain zero.
    }

    if (bTrueSize)
        return maxTrueSize;
    else
        return maxSize;
}
803
804 DWORD CLR_GetLogicalCpuCountFromOS(_In_reads_opt_(nEntries) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pslpi, DWORD nEntries);
805
806 // This function returns the number of logical processors on a given physical chip. If it cannot
807 // determine the number of logical cpus, or the machine is not populated uniformly with the same
808 // type of processors, this function returns 1.
DWORD CLR_GetLogicalCpuCountX86(_In_reads_opt_(nEntries) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pslpi, DWORD nEntries)
{
    // No CONTRACT possible because GetLogicalCpuCount uses SEH

    // Cached result from a previous call; 0 means "not yet computed".
    static DWORD val = 0;

    // cache value for later re-use
    if (val)
    {
        return val;
    }

    // Default when enumeration is unsupported or deliberately skipped.
    DWORD retVal = 1;

    // CPUID probing is wrapped in SEH; faults fall through with the default.
    __try
    {
        unsigned char buffer[16];

        DWORD maxCpuId = getcpuid(0, buffer);

        if (maxCpuId < 1)
            goto lDone;

        DWORD* dwBuffer = (DWORD*)buffer;

        // Vendor string in EBX/EDX/ECX; the constants spell "GenuineIntel".
        if (dwBuffer[1] == 'uneG') {
            if (dwBuffer[3] == 'Ieni') {
                if (dwBuffer[2] == 'letn') { // get SMT/multicore enumeration for Intel EM64T

                    // TODO: Currently GetLogicalCpuCountFromOS() and GetLogicalCpuCountFallback() are broken on
                    // multi-core processor, but we never call into those two functions since we don't halve the
                    // gen0size when it's prescott and above processor. We keep the old version here for earlier
                    // generation system(Northwood based), perf data suggests on those systems, halve gen0 size
                    // still boost the performance(ex:Biztalk boosts about 17%). So on earlier systems(Northwood)
                    // based, we still go ahead and halve gen0 size. The logic in GetLogicalCpuCountFromOS()
                    // and GetLogicalCpuCountFallback() works fine for those earlier generation systems.
                    // If it's a Prescott and above processor or Multi-core, perf data suggests not to halve gen0
                    // size at all gives us overall better performance.
                    // This is going to be fixed with a new version in orcas time frame.

                    if ((maxCpuId > 3) && (maxCpuId < 0x80000000))
                        goto lDone;

                    val = CLR_GetLogicalCpuCountFromOS(pslpi, nEntries); //try to obtain HT enumeration from OS API
                    if (val)
                    {
                        retVal = val; // OS API HT enumeration successful, we are Done
                        goto lDone;
                    }

                    // val = GetLogicalCpuCountFallback(); // OS API failed, Fallback to HT enumeration using CPUID
                    // if( val )
                    //     retVal = val;
                }
            }
        }
    lDone:;
    }
    __except (1)
    {
        // Ignore faults raised while probing CPUID; keep the default result.
    }

    if (val == 0)
    {
        // Cache whatever we decided (including the default of 1).
        val = retVal;
    }

    return retVal;
}
878
879 #endif // _DEBUG
880 #endif // (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)
881
882
883 #ifdef _DEBUG
// Derive the logical-processors-per-core count from the OS-provided SLPI array
// by counting set bits in the ProcessorMask of each SMT-flagged
// RelationProcessorCore entry. Returns 1 when the masks differ between cores
// (non-uniform machine) and 0 when the SLPI data is unavailable.
DWORD CLR_GetLogicalCpuCountFromOS(_In_reads_opt_(nEntries) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pslpi, DWORD nEntries)
{
    // No CONTRACT possible because GetLogicalCpuCount uses SEH

    static DWORD val = 0;
    DWORD retVal = 0;

    if (pslpi == NULL)
    {
        // GetLogicalProcessorInformation not supported
        goto lDone;
    }

    DWORD prevcount = 0;
    DWORD count = 1;

    for (DWORD j = 0; j < nEntries; j++)
    {
        if (pslpi[j].Relationship == RelationProcessorCore)
        {
            // LTP_PC_SMT indicates HT or SMT
            if (pslpi[j].ProcessorCore.Flags == LTP_PC_SMT)
            {
                SIZE_T pmask = pslpi[j].ProcessorMask;

                // Count the processors in the mask
                //
                // These are not the fastest bit counters. There may be processor intrinsics
                // (which would be best), but there are variants faster than these:
                // See http://en.wikipedia.org/wiki/Hamming_weight.
                // This is the naive implementation.
#if !_WIN64
                count = (pmask & 0x55555555) + ((pmask >> 1) & 0x55555555);
                count = (count & 0x33333333) + ((count >> 2) & 0x33333333);
                count = (count & 0x0F0F0F0F) + ((count >> 4) & 0x0F0F0F0F);
                count = (count & 0x00FF00FF) + ((count >> 8) & 0x00FF00FF);
                count = (count & 0x0000FFFF) + ((count >> 16) & 0x0000FFFF);
#else
                pmask = (pmask & 0x5555555555555555ull) + ((pmask >> 1) & 0x5555555555555555ull);
                pmask = (pmask & 0x3333333333333333ull) + ((pmask >> 2) & 0x3333333333333333ull);
                pmask = (pmask & 0x0f0f0f0f0f0f0f0full) + ((pmask >> 4) & 0x0f0f0f0f0f0f0f0full);
                pmask = (pmask & 0x00ff00ff00ff00ffull) + ((pmask >> 8) & 0x00ff00ff00ff00ffull);
                pmask = (pmask & 0x0000ffff0000ffffull) + ((pmask >> 16) & 0x0000ffff0000ffffull);
                pmask = (pmask & 0x00000000ffffffffull) + ((pmask >> 32) & 0x00000000ffffffffull);
                count = static_cast<DWORD>(pmask);
#endif // !_WIN64 else
                assert(count > 0);

                if (prevcount)
                {
                    if (count != prevcount)
                    {
                        retVal = 1; // masks are not symmetric
                        goto lDone;
                    }
                }

                prevcount = count;
            }
        }
    }

    retVal = count;

lDone:
    return retVal;
}
951
952 // This function returns the size of highest level cache on the physical chip. If it cannot
953 // determine the cachesize this function returns 0.
CLR_GetLogicalProcessorCacheSizeFromOS(_In_reads_opt_ (nEntries)SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pslpi,DWORD nEntries)954 size_t CLR_GetLogicalProcessorCacheSizeFromOS(_In_reads_opt_(nEntries) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pslpi, DWORD nEntries)
955 {
956 size_t cache_size = 0;
957
958 // Try to use GetLogicalProcessorInformation API and get a valid pointer to the SLPI array if successful. Returns NULL
959 // if API not present or on failure.
960
961 if (pslpi == NULL)
962 {
963 // GetLogicalProcessorInformation not supported or failed.
964 goto Exit;
965 }
966
967 // Crack the information. Iterate through all the SLPI array entries for all processors in system.
968 // Will return the greatest of all the processor cache sizes or zero
969
970 size_t last_cache_size = 0;
971
972 for (DWORD i = 0; i < nEntries; i++)
973 {
974 if (pslpi[i].Relationship == RelationCache)
975 {
976 last_cache_size = max(last_cache_size, pslpi[i].Cache.Size);
977 }
978 }
979 cache_size = last_cache_size;
980 Exit:
981
982 return cache_size;
983 }
984
// Debug-only dispatcher for the CLR reference logical-CPU-count query: on x86/amd64 builds
// (without portable helpers) it defers to the CPUID-based implementation, otherwise to the
// OS-information-based one.
DWORD CLR_GetLogicalCpuCount(_In_reads_opt_(nEntries) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pslpi, DWORD nEntries)
{
#if (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)
    return CLR_GetLogicalCpuCountX86(pslpi, nEntries);
#else
    return CLR_GetLogicalCpuCountFromOS(pslpi, nEntries);
#endif
}
993
// Debug-only dispatcher for the CLR reference cache-size query: on x86/amd64 builds (without
// portable helpers) it defers to the CPUID-based implementation (which ignores pslpi/nEntries),
// otherwise to the OS-information-based one (which ignores bTrueSize).
size_t CLR_GetLargestOnDieCacheSize(UInt32_BOOL bTrueSize, _In_reads_opt_(nEntries) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pslpi, DWORD nEntries)
{
#if (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)
    return CLR_GetLargestOnDieCacheSizeX86(bTrueSize);
#else
    return CLR_GetLogicalProcessorCacheSizeFromOS(pslpi, nEntries);
#endif
}
1002 #endif // _DEBUG
1003
1004
1005
// CPU manufacturer, as determined from the CPUID vendor-identification string by GetCpuVendor().
enum CpuVendor
{
    CpuUnknown,
    CpuIntel,
    CpuAMD,
};
1012
// Identify the CPU manufacturer via the CPUID instruction (x86/amd64 builds only).
// Parameters:
//  puMaxCpuId - receives the value from CPUID leaf 0 (the maximum supported standard leaf);
//               set to 0 on builds where CPUID is not available
// Return:
//  CpuIntel, CpuAMD, or CpuUnknown if the vendor string matches neither.
CpuVendor GetCpuVendor(_Out_ UInt32* puMaxCpuId)
{
#if (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)
    unsigned char buffer[16];
    *puMaxCpuId = getcpuid(0, buffer);

    // Dwords 1, 3 and 2 of the buffer hold the 12-byte vendor string. The multi-character
    // literals below are the little-endian dword encodings of "GenuineIntel" and
    // "AuthenticAMD" respectively (assumes the layout getcpuid writes — EBX:EDX:ECX order;
    // TODO confirm against getcpuid's implementation).
    UInt32* pdwBuffer = (UInt32*)buffer;

    if (pdwBuffer[1] == 'uneG'
        && pdwBuffer[3] == 'Ieni'
        && pdwBuffer[2] == 'letn')
    {
        return CpuIntel;
    }
    else if (pdwBuffer[1] == 'htuA'
        && pdwBuffer[3] == 'itne'
        && pdwBuffer[2] == 'DMAc')
    {
        return CpuAMD;
    }
#else
    *puMaxCpuId = 0;
#endif
    return CpuUnknown;
}
1038
1039 // Count set bits in a bitfield.
CountBits(size_t bfBitfield)1040 UInt32 CountBits(size_t bfBitfield)
1041 {
1042 UInt32 cBits = 0;
1043
1044 // This is not the fastest algorithm possible but it's simple and the performance is not critical.
1045 for (UInt32 i = 0; i < (sizeof(size_t) * 8); i++)
1046 {
1047 cBits += (bfBitfield & 1) ? 1 : 0;
1048 bfBitfield >>= 1;
1049 }
1050
1051 return cBits;
1052 }
1053
1054 //
1055 // Enable TRACE_CACHE_TOPOLOGY to get a dump of the info provided by the OS as well as a comparison of the
1056 // 'answers' between the current implementation and the CLR implementation.
1057 //
1058 //#define TRACE_CACHE_TOPOLOGY
1059 #if defined(_DEBUG) && !defined(_ARM64_)
1060 // ARM64TODO: restore
DumpCacheTopology(_In_reads_ (cRecords)SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pProcInfos,UInt32 cRecords)1061 void DumpCacheTopology(_In_reads_(cRecords) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pProcInfos, UInt32 cRecords)
1062 {
1063 printf("----------------\n");
1064 for (UInt32 i = 0; i < cRecords; i++)
1065 {
1066 switch (pProcInfos[i].Relationship)
1067 {
1068 case RelationProcessorCore:
1069 printf(" [%2d] Core: %d threads 0x%04zx mask, flags = %d\n",
1070 i, CountBits(pProcInfos[i].ProcessorMask), pProcInfos[i].ProcessorMask,
1071 pProcInfos[i].ProcessorCore.Flags);
1072 break;
1073
1074 case RelationCache:
1075 char* pszCacheType;
1076 switch (pProcInfos[i].Cache.Type) {
1077 case CacheUnified: pszCacheType = "[Unified]"; break;
1078 case CacheInstruction: pszCacheType = "[Instr ]"; break;
1079 case CacheData: pszCacheType = "[Data ]"; break;
1080 case CacheTrace: pszCacheType = "[Trace ]"; break;
1081 default: pszCacheType = "[Unk ]"; break;
1082 }
1083 printf(" [%2d] Cache: %s 0x%08x bytes 0x%04zx mask\n", i, pszCacheType,
1084 pProcInfos[i].Cache.Size, pProcInfos[i].ProcessorMask);
1085 break;
1086
1087 case RelationNumaNode:
1088 printf(" [%2d] NumaNode: #%02d 0x%04zx mask\n",
1089 i, pProcInfos[i].NumaNode.NodeNumber, pProcInfos[i].ProcessorMask);
1090 break;
1091 case RelationProcessorPackage:
1092 printf(" [%2d] Package: 0x%04zx mask\n",
1093 i, pProcInfos[i].ProcessorMask);
1094 break;
1095 case RelationAll:
1096 case RelationGroup:
1097 default:
1098 printf(" [%2d] unknown: %d\n", i, pProcInfos[i].Relationship);
1099 break;
1100 }
1101 }
1102 printf("----------------\n");
1103 }
// Debug helper: dump the raw topology records, then print our cached global results side by
// side with the CLR reference implementations so divergences are easy to spot.
void DumpCacheTopologyResults(UInt32 maxCpuId, CpuVendor cpuVendor, _In_reads_(cRecords) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pProcInfos, UInt32 cRecords)
{
    DumpCacheTopology(pProcInfos, cRecords);
    printf("maxCpuId: %d, %s\n", maxCpuId, (cpuVendor == CpuIntel) ? "CpuIntel" : ((cpuVendor == CpuAMD) ? "CpuAMD" : "CpuUnknown"));
    printf("               g_cLogicalCpus: %d %d :CLR_GetLogicalCpuCount\n", g_cLogicalCpus, CLR_GetLogicalCpuCount(pProcInfos, cRecords));
    printf("        g_cbLargestOnDieCache: 0x%08zx 0x%08zx :CLR_LargestOnDieCache(TRUE)\n", g_cbLargestOnDieCache, CLR_GetLargestOnDieCacheSize(TRUE, pProcInfos, cRecords));
    printf("g_cbLargestOnDieCacheAdjusted: 0x%08zx 0x%08zx :CLR_LargestOnDieCache(FALSE)\n", g_cbLargestOnDieCacheAdjusted, CLR_GetLargestOnDieCacheSize(FALSE, pProcInfos, cRecords));
}
1112 #endif // defined(_DEBUG) && !defined(_ARM64_)
1113
1114 // Method used to initialize the above values.
PalQueryProcessorTopology()1115 bool PalQueryProcessorTopology()
1116 {
1117 SYSTEM_LOGICAL_PROCESSOR_INFORMATION *pProcInfos = NULL;
1118 DWORD cbBuffer = 0;
1119 bool fError = false;
1120
1121 for (;;)
1122 {
1123 // Ask for processor information with an insufficient buffer initially. The function will tell us how
1124 // much memory we need and we'll try again.
1125 if (!GetLogicalProcessorInformation(pProcInfos, &cbBuffer))
1126 {
1127 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER)
1128 {
1129 if (pProcInfos)
1130 free(pProcInfos);
1131
1132 pProcInfos = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION*)malloc(cbBuffer);
1133
1134 if (pProcInfos == NULL)
1135 {
1136 // Ran out of memory.
1137 fError = true;
1138 break;
1139 }
1140 }
1141 else
1142 {
1143 // Unexpected error from GetLogicalProcessorInformation().
1144 fError = true;
1145 break;
1146 }
1147 }
1148 else
1149 {
1150 // Successfully read processor information, stop looping.
1151 break;
1152 }
1153 }
1154
1155 // If there was no error retrieving the data parse the result. GetLogicalProcessorInformation() returns an
1156 // array of structures each of which describes some attribute of a given group of logical processors.
1157 // Fields in the structure describe which processors and which attributes are being described and the
1158 // structures come in no particular order. Therefore we just iterate over all of them accumulating the
1159 // data we're interested in as we go.
1160 if (!fError && pProcInfos != NULL)
1161 {
1162 // Some explanation of the following logic is required. The GC queries information via two APIs:
1163 // 1) GetLogicalCpuCount()
1164 // 2) GetLargestOnDieCacheSize()
1165 //
1166 // These were once unambiguous queries; logical CPUs only existed when a physical CPU supported
1167 // threading (e.g. Intel's HyperThreading technology) and caches were always shared across an entire
1168 // physical processor.
1169 //
1170 // Unfortunately for us actual processor topologies are getting ever more complex (and divergent even
1171 // between otherwise near-identical architectures such as Intel and AMD). A single physical processor
1172 // (or package, the thing that fits in a socket on the motherboard) can now have multiple classes of
1173 // logical processors within it with differing relationships between the other logical processors
1174 // (e.g. which share functional units or caches). It's technically feasible to build systems with
1175 // non-symmetric topologies as well (where the number of logical processors or cache differs between
1176 // physical processors for instance).
1177 //
1178 // The GetLogicalProcessorInformation() reflects this in the potential complexity of its output. For
1179 // large-multi CPU systems it can generate quite a few output records effectively drawing a tree of
1180 // logical processors and their relationships within cores and packages and to various levels of
1181 // cache.
1182 //
1183 // Out of this complexity we have to distill the simple answers required above. It may well prove true
1184 // in the future that we will have to ask more complex questions, but until then this function will
1185 // utilize the following semantics for each of the queries:
1186 // 1) We will report logical processors as the average number of threads per core. (For the likely
1187 // case, a symmetric system, this average will be the exact number of threads per core).
1188 // 2) We will report the largest cache on-die as the average largest cache per-core.
1189 //
1190 // We will calculate the first value by counting the number of core records returned and the number of
1191 // threads running on those cores (each core record supplies a bitmask of processors running on that
1192 // core and by definition each of those processor sets must be disjoint, so we can simply accumulate a
1193 // count of processors seen for each core so far). For now we will count all processors on a core as a
1194 // thread (even if the HT/SMT flag is not set for the core) until we have data that suggests we should
1195 // treat non-HT processors as cores in their own right. We can then simply divide the thread total by
1196 // the core total to get a thread per core average.
1197 //
1198 // The second is harder since we have to discard caches that are superceded by a larger cache
1199 // servicing the same logical processor. For instance, on a typical Intel system we wish to sum the
1200 // sizes of all the L2 caches but ignore all the L1 caches. Since performance is not a huge issue here
1201 // (this is a one time operation and we cache the results) we'll use a linear algorithm that, when
1202 // presented with a cache information record, re-scans all the records for another cache entry which
1203 // is of larger size and has at least one logical processor in common. If found, the current cache
1204 // record can be ignored.
1205 //
1206 // Once we have to total sizes of all the largest level caches on the system we can divide it by the
1207 // previously computed total cores to get average largest cache size per core.
1208
1209 // Count info records returned by GetLogicalProcessorInformation().
1210 UInt32 cRecords = cbBuffer / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
1211
1212 UInt32 maxCpuId;
1213 CpuVendor cpuVendor = GetCpuVendor(&maxCpuId);
1214
1215 bool isAsymmetric = false;
1216 UInt32 cLogicalCpus = 0;
1217 UInt32 cbCache = 0;
1218 UInt32 cbCacheAdjusted = 0;
1219
1220 for (UInt32 i = 0; i < cRecords; i++)
1221 {
1222 switch (pProcInfos[i].Relationship)
1223 {
1224 case RelationProcessorCore:
1225 if (pProcInfos[i].ProcessorCore.Flags == LTP_PC_SMT)
1226 {
1227 UInt32 thisCount = CountBits(pProcInfos[i].ProcessorMask);
1228 if (!cLogicalCpus)
1229 cLogicalCpus = thisCount;
1230 else if (thisCount != cLogicalCpus)
1231 isAsymmetric = true;
1232 }
1233 break;
1234
1235 case RelationCache:
1236 cbCache = max(cbCache, pProcInfos[i].Cache.Size);
1237 break;
1238
1239 default:
1240 break;
1241 }
1242 }
1243
1244 cbCacheAdjusted = cbCache;
1245 if (cLogicalCpus == 0)
1246 cLogicalCpus = 1;
1247
1248 #if (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)
1249 // Apply some experimentally-derived policy to the number of logical CPUs in the same way CLR does.
1250 if ((maxCpuId < 1)
1251 || (cpuVendor != CpuIntel)
1252 || ((maxCpuId > 3) && (maxCpuId < 0x80000000)) // This is a strange one.
1253 || isAsymmetric)
1254 {
1255 cLogicalCpus = 1;
1256 }
1257
1258 // Apply some experimentally-derived policy to the cache size in the same way CLR does.
1259 if (cpuVendor == CpuIntel)
1260 {
1261 #ifdef _WIN64
1262 if (maxCpuId >= 2)
1263 {
1264 // If we're running on a Prescott or greater core, EM64T tests
1265 // show that starting with a gen0 larger than LLC improves performance.
1266 // Thus, start with a gen0 size that is larger than the cache. The value of
1267 // 3 is a reasonable tradeoff between workingset and performance.
1268 cbCacheAdjusted = cbCache * 3;
1269 }
1270 #endif // _WIN64
1271 }
1272 else if (cpuVendor == CpuAMD)
1273 {
1274 QueryAMDCacheInfo(&cbCache, &cbCacheAdjusted);
1275 }
1276 #else // (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)
1277 cpuVendor; // avoid unused variable warnings.
1278 maxCpuId;
1279 #endif // (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)
1280
1281 g_cLogicalCpus = cLogicalCpus;
1282 g_cbLargestOnDieCache = cbCache;
1283 g_cbLargestOnDieCacheAdjusted = cbCacheAdjusted;
1284
1285 #if defined(_DEBUG)
1286 #if defined(TRACE_CACHE_TOPOLOGY) && !defined(_ARM64_)
1287 // ARM64TODO: restore
1288 DumpCacheTopologyResults(maxCpuId, cpuVendor, pProcInfos, cRecords);
1289 #endif // defined(TRACE_CACHE_TOPOLOGY) && !defined(_ARM64_)
1290 if ((CLR_GetLargestOnDieCacheSize(TRUE, pProcInfos, cRecords) != g_cbLargestOnDieCache) ||
1291 (CLR_GetLargestOnDieCacheSize(FALSE, pProcInfos, cRecords) != g_cbLargestOnDieCacheAdjusted) ||
1292 (CLR_GetLogicalCpuCount(pProcInfos, cRecords) != g_cLogicalCpus))
1293 {
1294 #if !defined(_ARM64_)
1295 DumpCacheTopologyResults(maxCpuId, cpuVendor, pProcInfos, cRecords);
1296 #endif
1297 assert(!"QueryProcessorTopology doesn't match CLR's results. See stdout for more info.");
1298 }
1299 #endif
1300 }
1301
1302 if (pProcInfos)
1303 delete[](UInt8*)pProcInfos;
1304
1305 return !fError;
1306 }
1307
1308 #ifdef RUNTIME_SERVICES_ONLY
1309 // Functions called by the GC to obtain our cached values for number of logical processors and cache size.
REDHAWK_PALEXPORT UInt32 REDHAWK_PALAPI PalGetLogicalCpuCount()
{
    // Value computed and cached by PalQueryProcessorTopology().
    return g_cLogicalCpus;
}
1314
PalGetLargestOnDieCacheSize(UInt32_BOOL bTrueSize)1315 REDHAWK_PALEXPORT size_t REDHAWK_PALAPI PalGetLargestOnDieCacheSize(UInt32_BOOL bTrueSize)
1316 {
1317 return bTrueSize ? g_cbLargestOnDieCache
1318 : g_cbLargestOnDieCacheAdjusted;
1319 }
1320 #endif // RUNTIME_SERVICES_ONLY
1321
// Reserve and/or commit a region of virtual memory. Thin wrapper over Win32 VirtualAlloc;
// allocationType/protect take the same MEM_*/PAGE_* values. Returns NULL on failure.
REDHAWK_PALEXPORT _Ret_maybenull_ _Post_writable_byte_size_(size) void* REDHAWK_PALAPI PalVirtualAlloc(_In_opt_ void* pAddress, UIntNative size, UInt32 allocationType, UInt32 protect)
{
    return VirtualAlloc(pAddress, size, allocationType, protect);
}
1326
#pragma warning (push)
#pragma warning (disable:28160) // warnings about invalid potential parameter combinations that would cause VirtualFree to fail - those are asserted for below
// Decommit or release a region of virtual memory. Thin wrapper over Win32 VirtualFree.
REDHAWK_PALEXPORT UInt32_BOOL REDHAWK_PALAPI PalVirtualFree(_In_ void* pAddress, UIntNative size, UInt32 freeType)
{
    // The asserts encode VirtualFree's valid parameter combinations: MEM_RELEASE requires
    // size == 0, and MEM_RELEASE and MEM_DECOMMIT are mutually exclusive.
    assert(((freeType & MEM_RELEASE) != MEM_RELEASE) || size == 0);
    assert((freeType & (MEM_RELEASE | MEM_DECOMMIT)) != (MEM_RELEASE | MEM_DECOMMIT));
    assert(freeType != 0);

    return VirtualFree(pAddress, size, freeType);
}
#pragma warning (pop)
1338
// Change the protection of a region of virtual memory. Thin wrapper over Win32 VirtualProtect;
// the previous protection value (which VirtualProtect requires a destination for) is discarded.
REDHAWK_PALEXPORT UInt32_BOOL REDHAWK_PALAPI PalVirtualProtect(_In_ void* pAddress, UIntNative size, UInt32 protect)
{
    DWORD oldProtect;
    return VirtualProtect(pAddress, size, protect, &oldProtect);
}
1344
// Atomically publish a new Windows Error Reporting data buffer pointer and return the
// previous one. The pointer lives in a function-local static so there is a single
// process-wide slot.
REDHAWK_PALEXPORT _Ret_maybenull_ void* REDHAWK_PALAPI PalSetWerDataBuffer(_In_ void* pNewBuffer)
{
    static void* pBuffer;
    return InterlockedExchangePointer(&pBuffer, pNewBuffer);
}
1350
1351 #ifndef RUNTIME_SERVICES_ONLY
1352
// Frequency of the high-resolution performance counter, captured once in
// GCToOSInterface::Initialize() and returned by QueryPerformanceFrequency().
static LARGE_INTEGER g_performanceFrequency;

#ifdef PROJECTN
// Whether this thread's RoInitialize() call in Initialize() succeeded, so Shutdown()
// knows whether a matching RoUninitialize() is required.
static bool g_roInitialized;
#endif
1358
1359 // Initialize the interface implementation
1360 // Return:
1361 // true if it has succeeded, false if it has failed
bool GCToOSInterface::Initialize()
{
    // Cache the performance counter frequency up front; QueryPerformanceFrequency()
    // (the GCToOSInterface method) reads the cached value without error checking.
    if (!::QueryPerformanceFrequency(&g_performanceFrequency))
    {
        return false;
    }

#ifdef PROJECTN
    // TODO: Remove the RoInitialize call when we implement non-WinRT framework for classic apps
    HRESULT hr = RoInitialize(RO_INIT_MULTITHREADED);

    // RPC_E_CHANGED_MODE indicates this thread has been already initialized with a different
    // concurrency model. That is fine; we just need to skip the RoUninitialize call on shutdown.
    if (SUCCEEDED(hr))
    {
        g_roInitialized = true;
    }
    else if (hr != RPC_E_CHANGED_MODE)
    {
        return false;
    }
#endif

    return true;
}
1387
1388 // Shutdown the interface implementation
1389 // Remarks:
1390 // Must be called on the same thread as Initialize.
void GCToOSInterface::Shutdown()
{
#ifdef PROJECTN
    // Only undo the WinRT initialization if our RoInitialize call in Initialize() succeeded
    // (it is skipped when the thread was already initialized with a different model).
    if (g_roInitialized)
    {
        RoUninitialize();
        g_roInitialized = false;
    }
#endif
}
1401
// Get numeric id of the current thread if possible on the
// current platform. It is intended for logging purposes only.
// Return:
//  Numeric id of the current thread or 0 if the id cannot be obtained.
uint64_t GCToOSInterface::GetCurrentThreadIdForLogging()
{
    // OS thread id is sufficient for logging; widened to 64 bits to match the interface.
    return ::GetCurrentThreadId();
}
1410
1411 // Get id of the process
uint32_t GCToOSInterface::GetCurrentProcessId()
{
    // Direct pass-through to the Win32 API of the same name.
    return ::GetCurrentProcessId();
}
1416
1417 // Set ideal affinity for the current thread
1418 // Parameters:
1419 // affinity - ideal processor affinity for the thread
1420 // Return:
1421 // true if it has succeeded, false if it has failed
bool GCToOSInterface::SetCurrentThreadIdealAffinity(GCThreadAffinity* affinity)
{
    bool success = true;

    PROCESSOR_NUMBER proc;

    if (affinity->Group != -1)
    {
        // Caller specified a processor group: build the full PROCESSOR_NUMBER from the
        // affinity and set it directly.
        proc.Group = (WORD)affinity->Group;
        proc.Number = (BYTE)affinity->Processor;
        proc.Reserved = 0;

        success = !!SetThreadIdealProcessorEx(GetCurrentThread(), &proc, NULL);
    }
    else
    {
        // No group specified: keep the thread's current group and only change the
        // processor number within it.
        // NOTE(review): if GetThreadIdealProcessorEx fails here, success stays true —
        // presumably a deliberate best-effort; confirm before changing.
        if (GetThreadIdealProcessorEx(GetCurrentThread(), &proc))
        {
            proc.Number = (BYTE)affinity->Processor;
            success = !!SetThreadIdealProcessorEx(GetCurrentThread(), &proc, NULL);
        }
    }

    return success;
}
1447
1448 // Get the number of the current processor
uint32_t GCToOSInterface::GetCurrentProcessorNumber()
{
    // Callers must first check CanGetCurrentProcessorNumber(); on this platform it is
    // unconditionally supported.
    _ASSERTE(GCToOSInterface::CanGetCurrentProcessorNumber());
    return ::GetCurrentProcessorNumber();
}
1454
1455 // Check if the OS supports getting current processor number
bool GCToOSInterface::CanGetCurrentProcessorNumber()
{
    // GetCurrentProcessorNumber is always available on the Windows versions this PAL targets.
    return true;
}
1460
1461 // Flush write buffers of processors that are executing threads of the current process
void GCToOSInterface::FlushProcessWriteBuffers()
{
    // Direct pass-through to the Win32 API of the same name.
    ::FlushProcessWriteBuffers();
}
1466
1467 // Break into a debugger
void GCToOSInterface::DebugBreak()
{
    // Direct pass-through to the Win32 API of the same name.
    ::DebugBreak();
}
1472
1473 // Get number of logical processors
uint32_t GCToOSInterface::GetLogicalCpuCount()
{
    // Value computed and cached by PalQueryProcessorTopology().
    return g_cLogicalCpus;
}
1478
1479 // Causes the calling thread to sleep for the specified number of milliseconds
1480 // Parameters:
1481 // sleepMSec - time to sleep before switching to another thread
void GCToOSInterface::Sleep(uint32_t sleepMSec)
{
    // Delegates to the PAL sleep primitive.
    PalSleep(sleepMSec);
}
1486
1487 // Causes the calling thread to yield execution to another thread that is ready to run on the current processor.
1488 // Parameters:
1489 // switchCount - number of times the YieldThread was called in a loop
void GCToOSInterface::YieldThread(uint32_t /*switchCount*/)
{
    // switchCount is ignored here; the PAL primitive performs a single yield.
    PalSwitchToThread();
}
1494
1495 // Reserve virtual memory range.
1496 // Parameters:
1497 // address - starting virtual address, it can be NULL to let the function choose the starting address
1498 // size - size of the virtual memory range
1499 // alignment - requested memory alignment
1500 // flags - flags to control special settings like write watching
1501 // Return:
1502 // Starting virtual address of the reserved range
void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags)
{
    // NOTE(review): 'alignment' is ignored — presumably callers only ever need the default
    // VirtualAlloc reservation alignment; confirm against the GC's usage.
    // Only the WriteWatch flag is honored; any other flag bits are ignored.
    DWORD memFlags = (flags & VirtualReserveFlags::WriteWatch) ? (MEM_RESERVE | MEM_WRITE_WATCH) : MEM_RESERVE;
    return ::VirtualAlloc(0, size, memFlags, PAGE_READWRITE);
}
1508
1509 // Release virtual memory range previously reserved using VirtualReserve
1510 // Parameters:
1511 // address - starting virtual address
1512 // size - size of the virtual memory range
1513 // Return:
1514 // true if it has succeeded, false if it has failed
bool GCToOSInterface::VirtualRelease(void* address, size_t size)
{
    // MEM_RELEASE requires a size of 0 (the whole reservation is released), so the size
    // parameter is unused.
    UNREFERENCED_PARAMETER(size);
    return !!::VirtualFree(address, 0, MEM_RELEASE);
}
1520
1521 // Commit virtual memory range. It must be part of a range reserved using VirtualReserve.
1522 // Parameters:
1523 // address - starting virtual address
1524 // size - size of the virtual memory range
1525 // Return:
1526 // true if it has succeeded, false if it has failed
VirtualCommit(void * address,size_t size)1527 bool GCToOSInterface::VirtualCommit(void* address, size_t size)
1528 {
1529 return ::VirtualAlloc(address, size, MEM_COMMIT, PAGE_READWRITE) != NULL;
1530 }
1531
// Decommit virtual memory range.
1533 // Parameters:
1534 // address - starting virtual address
1535 // size - size of the virtual memory range
1536 // Return:
1537 // true if it has succeeded, false if it has failed
bool GCToOSInterface::VirtualDecommit(void* address, size_t size)
{
    // Decommit only; the address range remains reserved.
    return !!::VirtualFree(address, size, MEM_DECOMMIT);
}
1542
1543 // Reset virtual memory range. Indicates that data in the memory range specified by address and size is no
1544 // longer of interest, but it should not be decommitted.
1545 // Parameters:
1546 // address - starting virtual address
1547 // size - size of the virtual memory range
1548 // unlock - true if the memory range should also be unlocked
1549 // Return:
1550 // true if it has succeeded, false if it has failed
bool GCToOSInterface::VirtualReset(void * address, size_t size, bool unlock)
{
    // MEM_RESET tells the OS the contents are no longer needed without decommitting.
    bool success = ::VirtualAlloc(address, size, MEM_RESET, PAGE_READWRITE) != NULL;
    if (success && unlock)
    {
        // Remove the page range from the working set
        // (return value deliberately ignored — best effort).
        ::VirtualUnlock(address, size);
    }

    return success;
}
1562
1563 // Check if the OS supports write watching
bool GCToOSInterface::SupportsWriteWatch()
{
    // Delegates to the PAL capability query.
    return PalHasCapability(WriteWatchCapability);
}
1568
1569 // Reset the write tracking state for the specified virtual memory range.
1570 // Parameters:
1571 // address - starting virtual address
1572 // size - size of the virtual memory range
void GCToOSInterface::ResetWriteWatch(void* address, size_t size)
{
    // Return value deliberately ignored — best effort.
    ::ResetWriteWatch(address, size);
}
1577
1578 // Retrieve addresses of the pages that are written to in a region of virtual memory
1579 // Parameters:
1580 // resetState - true indicates to reset the write tracking state
1581 // address - starting virtual address
1582 // size - size of the virtual memory range
1583 // pageAddresses - buffer that receives an array of page addresses in the memory region
1584 // pageAddressesCount - on input, size of the lpAddresses array, in array elements
1585 // on output, the number of page addresses that are returned in the array.
1586 // Return:
1587 // true if it has succeeded, false if it has failed
bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size, void** pageAddresses, uintptr_t* pageAddressesCount)
{
    // WRITE_WATCH_FLAG_RESET == 1: pass it when the caller wants the watch state cleared.
    uint32_t flags = resetState ? 1 : 0;
    ULONG granularity;

    // Win32 GetWriteWatch returns 0 on success.
    bool success = ::GetWriteWatch(flags, address, size, pageAddresses, (ULONG_PTR*)pageAddressesCount, &granularity) == 0;
    // The GC assumes page-granular write tracking.
    _ASSERTE (granularity == OS_PAGE_SIZE);

    return success;
}
1598
1599 // Get size of the largest cache on the processor die
1600 // Parameters:
1601 // trueSize - true to return true cache size, false to return scaled up size based on
1602 // the processor architecture
1603 // Return:
1604 // Size of the cache
GetLargestOnDieCacheSize(bool trueSize)1605 size_t GCToOSInterface::GetLargestOnDieCacheSize(bool trueSize)
1606 {
1607 return trueSize ? g_cbLargestOnDieCache : g_cbLargestOnDieCacheAdjusted;
1608 }
1609
1610 // Get affinity mask of the current process
1611 // Parameters:
1612 // processMask - affinity mask for the specified process
1613 // systemMask - affinity mask for the system
1614 // Return:
1615 // true if it has succeeded, false if it has failed
1616 // Remarks:
1617 // A process affinity mask is a bit vector in which each bit represents the processors that
1618 // a process is allowed to run on. A system affinity mask is a bit vector in which each bit
1619 // represents the processors that are configured into a system.
1620 // A process affinity mask is a subset of the system affinity mask. A process is only allowed
1621 // to run on the processors configured into a system. Therefore, the process affinity mask cannot
1622 // specify a 1 bit for a processor when the system affinity mask specifies a 0 bit for that processor.
bool GCToOSInterface::GetCurrentProcessAffinityMask(uintptr_t* processMask, uintptr_t* systemMask)
{
    // Direct pass-through to Win32 GetProcessAffinityMask for the current process.
    return !!::GetProcessAffinityMask(GetCurrentProcess(), (PDWORD_PTR)processMask, (PDWORD_PTR)systemMask);
}
1627
1628 // Get number of processors assigned to the current process
1629 // Return:
1630 // The number of processors
uint32_t GCToOSInterface::GetCurrentProcessCpuCount()
{
    // Cached after the first full computation. Note the early-return paths below
    // intentionally do not populate the cache.
    static int cCPUs = 0;

    if (cCPUs != 0)
        return cCPUs;

    DWORD_PTR pmask, smask;

    // On failure fall back to a single CPU (uncached, so it is retried next call).
    if (!GetProcessAffinityMask(GetCurrentProcess(), &pmask, &smask))
        return 1;

    if (pmask == 1)
        return 1;

    // Only count processors that are both in the process mask and the system mask.
    pmask &= smask;

    int count = 0;
    while (pmask)
    {
        if (pmask & 1)
            count++;

        pmask >>= 1;
    }

    // GetProcessAffinityMask can return pmask=0 and smask=0 on systems with more
    // than 64 processors, which would leave us with a count of 0.  Since the GC
    // expects there to be at least one processor to run on (and thus at least one
    // heap), we'll return 64 here if count is 0, since there are likely a ton of
    // processors available in that case.  The GC also cannot (currently) handle
    // the case where there are more than 64 processors, so we will return a
    // maximum of 64 here.
    if (count == 0 || count > 64)
        count = 64;

    cCPUs = count;

    return count;
}
1671
1672 // Return the size of the user-mode portion of the virtual address space of this process.
1673 // Return:
1674 // non zero if it has succeeded, 0 if it has failed
GetVirtualMemoryLimit()1675 size_t GCToOSInterface::GetVirtualMemoryLimit()
1676 {
1677 MEMORYSTATUSEX memStatus;
1678
1679 memStatus.dwLength = sizeof(MEMORYSTATUSEX);
1680
1681 BOOL fRet;
1682 fRet = GlobalMemoryStatusEx(&memStatus);
1683 _ASSERTE(fRet);
1684
1685 return (size_t)memStatus.ullTotalVirtual;
1686 }
1687
1688 // Get the physical memory that this process can use.
1689 // Return:
1690 // non zero if it has succeeded, 0 if it has failed
GetPhysicalMemoryLimit()1691 uint64_t GCToOSInterface::GetPhysicalMemoryLimit()
1692 {
1693 MEMORYSTATUSEX memStatus;
1694
1695 memStatus.dwLength = sizeof(MEMORYSTATUSEX);
1696
1697 BOOL fRet;
1698 fRet = GlobalMemoryStatusEx(&memStatus);
1699 _ASSERTE(fRet);
1700
1701 return memStatus.ullTotalPhys;
1702 }
1703
1704 // Get memory status
1705 // Parameters:
1706 // memory_load - A number between 0 and 100 that specifies the approximate percentage of physical memory
1707 // that is in use (0 indicates no memory use and 100 indicates full memory use).
1708 // available_physical - The amount of physical memory currently available, in bytes.
1709 // available_page_file - The maximum amount of memory the current process can commit, in bytes.
void GCToOSInterface::GetMemoryStatus(uint32_t* memory_load, uint64_t* available_physical, uint64_t* available_page_file)
{
    MEMORYSTATUSEX memStatus;

    memStatus.dwLength = sizeof(MEMORYSTATUSEX);

    BOOL fRet;
    fRet = GlobalMemoryStatusEx(&memStatus);
    _ASSERTE(fRet);

    // If the machine has more RAM than virtual address limit, let us cap it.
    // The GC can never use more than virtual address limit.
    // (Note the substituted value is ullAvailVirtual, not ullTotalVirtual — when available
    // physical memory exceeds the whole virtual address space, the usable amount is bounded
    // by what remains of the virtual address space.)
    if (memStatus.ullAvailPhys > memStatus.ullTotalVirtual)
    {
        memStatus.ullAvailPhys = memStatus.ullAvailVirtual;
    }

    // Each output is optional; only fill in the ones the caller asked for.
    if (memory_load != NULL)
        *memory_load = memStatus.dwMemoryLoad;
    if (available_physical != NULL)
        *available_physical = memStatus.ullAvailPhys;
    if (available_page_file != NULL)
        *available_page_file = memStatus.ullAvailPageFile;
}
1734
1735 // Get a high precision performance counter
1736 // Return:
1737 // The counter value
int64_t GCToOSInterface::QueryPerformanceCounter()
{
    LARGE_INTEGER ts;
    // A failing performance counter leaves the GC with no usable time source, so fail fast
    // rather than return garbage.
    if (!::QueryPerformanceCounter(&ts))
    {
        ASSERT_UNCONDITIONALLY("Fatal Error - cannot query performance counter.");
        RhFailFast();
    }

    return ts.QuadPart;
}
1749
1750 // Get a frequency of the high precision performance counter
1751 // Return:
1752 // The counter frequency
int64_t GCToOSInterface::QueryPerformanceFrequency()
{
    // Returns the frequency cached by Initialize(); valid only after Initialize() succeeded.
    return g_performanceFrequency.QuadPart;
}
1757
1758 // Get a time stamp with a low precision
1759 // Return:
1760 // Time stamp in milliseconds
uint32_t GCToOSInterface::GetLowPrecisionTimeStamp()
{
    // Millisecond tick count; per the Win32 docs GetTickCount wraps roughly every 49.7 days,
    // which callers of a low-precision timestamp are expected to tolerate.
    return ::GetTickCount();
}
1765
// Parameters of the GC thread stub. Heap-allocated by GCToOSInterface::CreateThread
// and freed by GCThreadStub once the values have been copied out.
struct GCThreadStubParam
{
    GCThreadFunction GCThreadFunction;  // GC entry point the new thread should run
    void* GCThreadParam;                // opaque argument forwarded to that entry point
};
1772
1773 // GC thread stub to convert GC thread function to an OS specific thread function
GCThreadStub(void * param)1774 static DWORD GCThreadStub(void* param)
1775 {
1776 GCThreadStubParam *stubParam = (GCThreadStubParam*)param;
1777 GCThreadFunction function = stubParam->GCThreadFunction;
1778 void* threadParam = stubParam->GCThreadParam;
1779
1780 delete stubParam;
1781
1782 function(threadParam);
1783
1784 return 0;
1785 }
1786
// Create a new thread for GC use
// Parameters:
//  function - the function to be executed by the thread
//  param    - opaque argument forwarded to 'function'
//  affinity - processor affinity of the thread
// Return:
//  true if it has succeeded, false if it has failed
bool GCToOSInterface::CreateThread(GCThreadFunction function, void* param, GCThreadAffinity* affinity)
{
    // Heap-allocate the stub parameters because they must outlive this frame.
    // The holder frees them automatically on the early-out failure paths below.
    NewHolder<GCThreadStubParam> stubParam = new (nothrow) GCThreadStubParam();
    if (stubParam == NULL)
    {
        return false;
    }

    stubParam->GCThreadFunction = function;
    stubParam->GCThreadParam = param;

    // Create suspended so priority and affinity are applied before the thread
    // runs. NOTE(review): 4096 here is ::CreateThread's initial stack *commit*
    // size (reserve comes from the image header) — confirm that is intended.
    DWORD thread_id;
    HANDLE gc_thread = ::CreateThread(0, 4096, GCThreadStub, stubParam.GetValue(), CREATE_SUSPENDED, &thread_id);

    if (!gc_thread)
    {
        return false;
    }

    // The new thread now owns the parameter block (GCThreadStub deletes it),
    // so disarm the holder before anything else can fail.
    stubParam.SuppressRelease();

    SetThreadPriority(gc_thread, /* was THREAD_PRIORITY_ABOVE_NORMAL */ THREAD_PRIORITY_HIGHEST );

    if (affinity->Group != GCThreadAffinity::None)
    {
        // @TODO: CPUGroupInfo — processor-group-aware affinity is not yet
        // implemented; the intended code is preserved below for reference.

        // ASSERT(affinity->Processor != GCThreadAffinity::None);
        // GROUP_AFFINITY ga;
        // ga.Group = (WORD)affinity->Group;
        // ga.Reserved[0] = 0;
        // ga.Reserved[1] = 0;
        // ga.Reserved[2] = 0;
        // ga.Mask = (size_t)1 << affinity->Processor;
        // CPUGroupInfo::SetThreadGroupAffinity(gc_thread, &ga, NULL);
    }
    else if (affinity->Processor != GCThreadAffinity::None)
    {
        SetThreadAffinityMask(gc_thread, (DWORD_PTR)1 << affinity->Processor);
    }

    // Let the thread run, then drop our handle; the thread continues on its own.
    ResumeThread(gc_thread);
    CloseHandle(gc_thread);

    return true;
}
1840
1841 // Initialize the critical section
Initialize()1842 void CLRCriticalSection::Initialize()
1843 {
1844 InitializeCriticalSection(&m_cs);
1845 }
1846
1847 // Destroy the critical section
Destroy()1848 void CLRCriticalSection::Destroy()
1849 {
1850 DeleteCriticalSection(&m_cs);
1851 }
1852
1853 // Enter the critical section. Blocks until the section can be entered.
Enter()1854 void CLRCriticalSection::Enter()
1855 {
1856 EnterCriticalSection(&m_cs);
1857 }
1858
1859 // Leave the critical section
Leave()1860 void CLRCriticalSection::Leave()
1861 {
1862 LeaveCriticalSection(&m_cs);
1863 }
1864
1865 #endif // RUNTIME_SERVICES_ONLY
1866