// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

//
// Implementation of the Redhawk Platform Abstraction Layer (PAL) library when MinWin is the platform. In
// this case most or all of Redhawk's import requirements can be satisfied via a forwarding export to some
// native MinWin library. Therefore most of the work is done in the .def file and there is very little code
// here.
//
// Note that in general we don't want to assume that Windows and Redhawk global definitions can co-exist.
// Since this code must include Windows headers to do its job, we therefore can't safely include general
// Redhawk header files.
//
#include "common.h"
#include <windows.h>
#include <stdio.h>
#include <errno.h>
#include <evntprov.h>
#ifdef PROJECTN
#include <roapi.h>
#endif

#include "holder.h"

#define PalRaiseFailFastException RaiseFailFastException

uint32_t PalEventWrite(REGHANDLE arg1, const EVENT_DESCRIPTOR * arg2, uint32_t arg3, EVENT_DATA_DESCRIPTOR * arg4)
{
    return EventWrite(arg1, arg2, arg3, arg4);
}

#include "gcenv.h"


#define REDHAWK_PALEXPORT extern "C"
#define REDHAWK_PALAPI __stdcall

#ifndef RUNTIME_SERVICES_ONLY
// Index for the fiber local storage of the attached thread pointer
static UInt32 g_flsIndex = FLS_OUT_OF_INDEXES;
#endif

static DWORD g_dwPALCapabilities;

GCSystemInfo g_SystemInfo;

bool InitializeSystemInfo()
{
    SYSTEM_INFO systemInfo;
    GetSystemInfo(&systemInfo);

    g_SystemInfo.dwNumberOfProcessors = systemInfo.dwNumberOfProcessors;
    g_SystemInfo.dwPageSize = systemInfo.dwPageSize;
    g_SystemInfo.dwAllocationGranularity = systemInfo.dwAllocationGranularity;

    return true;
}

extern bool PalQueryProcessorTopology();

#ifndef RUNTIME_SERVICES_ONLY
// This is called when each *fiber* is destroyed. When the home fiber of a thread is destroyed,
// it means that the thread itself is destroyed.
// Since we receive that notification outside of the Loader Lock, it allows us to safely acquire
// the ThreadStore lock in RuntimeThreadShutdown.
void __stdcall FiberDetachCallback(void* lpFlsData)
{
    ASSERT(g_flsIndex != FLS_OUT_OF_INDEXES);
    ASSERT(lpFlsData == FlsGetValue(g_flsIndex));

    if (lpFlsData != NULL)
    {
        // The current fiber is the home fiber of a thread, so the thread is shutting down
        RuntimeThreadShutdown(lpFlsData);
    }
}
#endif

// The Redhawk PAL must be initialized before any of its exports can be called. Returns true for a successful
// initialization and false on failure.
REDHAWK_PALEXPORT bool REDHAWK_PALAPI PalInit()
{
    g_dwPALCapabilities = WriteWatchCapability | GetCurrentProcessorNumberCapability | LowMemoryNotificationCapability;

    if (!PalQueryProcessorTopology())
        return false;

#ifndef RUNTIME_SERVICES_ONLY
    // We use fiber detach callbacks to run our thread shutdown code because the fiber detach
    // callback is made without the OS loader lock held.
    g_flsIndex = FlsAlloc(FiberDetachCallback);
    if (g_flsIndex == FLS_OUT_OF_INDEXES)
    {
        return false;
    }
#endif

    return true;
}

// Given a mask of capabilities, return true if all of them are supported by the current PAL.
REDHAWK_PALEXPORT bool REDHAWK_PALAPI PalHasCapability(PalCapability capability)
{
    return (g_dwPALCapabilities & (DWORD)capability) == (DWORD)capability;
}
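
// A hedged usage sketch (hypothetical caller, not part of this file): because PalHasCapability
// tests that *all* bits of the mask are present, a combined query looks like:
//
//     if (PalHasCapability((PalCapability)(WriteWatchCapability | LowMemoryNotificationCapability)))
//     {
//         // safe to rely on both write-watch and memory resource notifications
//     }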

#ifndef RUNTIME_SERVICES_ONLY
// Attach thread to PAL.
// It can be called multiple times for the same thread.
// It fails fast if a different thread was already registered with the current fiber
// or if the thread was already registered with a different fiber.
// Parameters:
//  thread        - thread to attach
REDHAWK_PALEXPORT void REDHAWK_PALAPI PalAttachThread(void* thread)
{
    void* threadFromCurrentFiber = FlsGetValue(g_flsIndex);

    if (threadFromCurrentFiber != NULL)
    {
        ASSERT_UNCONDITIONALLY("Multiple threads encountered from a single fiber");
        RhFailFast();
    }

    // Associate the current fiber with the current thread.  This makes the current fiber the thread's "home"
    // fiber.  This fiber is the only fiber allowed to execute managed code on this thread.  When this fiber
    // is destroyed, we consider the thread to be destroyed.
    FlsSetValue(g_flsIndex, thread);
}

// Detach thread from PAL.
// It fails fast if some other thread value was attached to PAL.
// Parameters:
//  thread        - thread to detach
// Return:
//  true if the thread was detached, false if there was no attached thread
REDHAWK_PALEXPORT bool REDHAWK_PALAPI PalDetachThread(void* thread)
{
    ASSERT(g_flsIndex != FLS_OUT_OF_INDEXES);
    void* threadFromCurrentFiber = FlsGetValue(g_flsIndex);

    if (threadFromCurrentFiber == NULL)
    {
        // we've seen this thread, but not this fiber.  It must be a "foreign" fiber that was
        // borrowing this thread.
        return false;
    }

    if (threadFromCurrentFiber != thread)
    {
        ASSERT_UNCONDITIONALLY("Detaching a thread from the wrong fiber");
        RhFailFast();
    }

    FlsSetValue(g_flsIndex, NULL);
    return true;
}
#endif // RUNTIME_SERVICES_ONLY
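
// A minimal usage sketch (hypothetical caller): each thread attaches once on its home fiber and is
// detached either explicitly or by FiberDetachCallback when the home fiber is destroyed:
//
//     PalAttachThread(pThread);                  // current fiber becomes pThread's home fiber
//     /* ... run managed code ... */
//     bool detached = PalDetachThread(pThread);  // false if a foreign fiber borrowed this thread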

extern "C" UInt64 PalGetCurrentThreadIdForLogging()
{
    return GetCurrentThreadId();
}

#if !defined(USE_PORTABLE_HELPERS) && !defined(FEATURE_RX_THUNKS)
REDHAWK_PALEXPORT UInt32_BOOL REDHAWK_PALAPI PalAllocateThunksFromTemplate(_In_ HANDLE hTemplateModule, UInt32 templateRva, size_t templateSize, _Outptr_result_bytebuffer_(templateSize) void** newThunksOut)
{
#ifdef XBOX_ONE
    return E_NOTIMPL;
#else
    BOOL success = FALSE;
    HANDLE hMap = NULL, hFile = INVALID_HANDLE_VALUE;

    const WCHAR * wszModuleFileName = NULL;
    if (PalGetModuleFileName(&wszModuleFileName, hTemplateModule) == 0 || wszModuleFileName == NULL)
        return FALSE;

    hFile = CreateFileW(wszModuleFileName, GENERIC_READ | GENERIC_EXECUTE, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (hFile == INVALID_HANDLE_VALUE)
        goto cleanup;

    hMap = CreateFileMapping(hFile, NULL, SEC_IMAGE | PAGE_READONLY, 0, 0, NULL);
    if (hMap == NULL)
        goto cleanup;

    *newThunksOut = MapViewOfFile(hMap, 0, 0, templateRva, templateSize);
    success = ((*newThunksOut) != NULL);

cleanup:
    // Only close handles that were actually opened.
    if (hMap != NULL)
        CloseHandle(hMap);
    if (hFile != INVALID_HANDLE_VALUE)
        CloseHandle(hFile);

    return success;
#endif
}

REDHAWK_PALEXPORT UInt32_BOOL REDHAWK_PALAPI PalFreeThunksFromTemplate(_In_ void *pBaseAddress)
{
#ifdef XBOX_ONE
    return TRUE;
#else
    return UnmapViewOfFile(pBaseAddress);
#endif
}
#endif // !USE_PORTABLE_HELPERS && !FEATURE_RX_THUNKS

REDHAWK_PALEXPORT UInt32_BOOL REDHAWK_PALAPI PalMarkThunksAsValidCallTargets(
    void *virtualAddress,
    int thunkSize,
    int thunksPerBlock,
    int thunkBlockSize,
    int thunkBlocksPerMapping)
{
    // For CoreRT we are using RWX pages, so there is no need for this API for now.
    // Once we have a scenario for non-RWX pages, the implementation can go here.
    return TRUE;
}

REDHAWK_PALEXPORT UInt32 REDHAWK_PALAPI PalCompatibleWaitAny(UInt32_BOOL alertable, UInt32 timeout, UInt32 handleCount, HANDLE* pHandles, UInt32_BOOL allowReentrantWait)
{
    DWORD index;
    SetLastError(ERROR_SUCCESS); // recommended by MSDN.
    HRESULT hr = CoWaitForMultipleHandles(alertable ? COWAIT_ALERTABLE : 0, timeout, handleCount, pHandles, &index);

    switch (hr)
    {
    case S_OK:
        return index;

    case RPC_S_CALLPENDING:
        return WAIT_TIMEOUT;

    default:
        SetLastError(HRESULT_CODE(hr));
        return WAIT_FAILED;
    }
}
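
// Note on the mapping above: RPC_S_CALLPENDING is how CoWaitForMultipleHandles reports a timeout,
// so it is translated back to the WAIT_TIMEOUT convention our callers expect. The allowReentrantWait
// parameter is not consulted in this Windows implementation.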

REDHAWK_PALEXPORT void REDHAWK_PALAPI PalSleep(UInt32 milliseconds)
{
    return Sleep(milliseconds);
}

REDHAWK_PALEXPORT UInt32_BOOL REDHAWK_PALAPI PalSwitchToThread()
{
    return SwitchToThread();
}

REDHAWK_PALEXPORT HANDLE REDHAWK_PALAPI PalCreateEventW(_In_opt_ LPSECURITY_ATTRIBUTES pEventAttributes, UInt32_BOOL manualReset, UInt32_BOOL initialState, _In_opt_z_ LPCWSTR pName)
{
    return CreateEventW(pEventAttributes, manualReset, initialState, pName);
}

REDHAWK_PALEXPORT _Success_(return) bool REDHAWK_PALAPI PalGetThreadContext(HANDLE hThread, _Out_ PAL_LIMITED_CONTEXT * pCtx)
{
    CONTEXT win32ctx;

    win32ctx.ContextFlags = CONTEXT_CONTROL | CONTEXT_INTEGER | CONTEXT_EXCEPTION_REQUEST;

    if (!GetThreadContext(hThread, &win32ctx))
        return false;

    // The CONTEXT_SERVICE_ACTIVE and CONTEXT_EXCEPTION_ACTIVE output flags indicate we suspended the thread
    // at a point where the kernel cannot guarantee a completely accurate context. We'll fail the request in
    // this case (which should force our caller to resume the thread and try again -- since this is a fairly
    // narrow window we're highly likely to succeed next time).
    // Note: in some cases (x86 WOW64, ARM32 on ARM64) the OS will not set the CONTEXT_EXCEPTION_REPORTING flag
    // if the thread is executing in kernel mode (i.e. in the middle of a syscall or exception handling).
    // Therefore, we should treat the absence of the CONTEXT_EXCEPTION_REPORTING flag as an indication that
    // it is not safe to manipulate the current state of the thread context.
    if ((win32ctx.ContextFlags & CONTEXT_EXCEPTION_REPORTING) == 0 ||
        (win32ctx.ContextFlags & (CONTEXT_SERVICE_ACTIVE | CONTEXT_EXCEPTION_ACTIVE)))
        return false;

#ifdef _X86_
    pCtx->IP = win32ctx.Eip;
    pCtx->Rsp = win32ctx.Esp;
    pCtx->Rbp = win32ctx.Ebp;
    pCtx->Rdi = win32ctx.Edi;
    pCtx->Rsi = win32ctx.Esi;
    pCtx->Rax = win32ctx.Eax;
    pCtx->Rbx = win32ctx.Ebx;
#elif defined(_AMD64_)
    pCtx->IP = win32ctx.Rip;
    pCtx->Rsp = win32ctx.Rsp;
    pCtx->Rbp = win32ctx.Rbp;
    pCtx->Rdi = win32ctx.Rdi;
    pCtx->Rsi = win32ctx.Rsi;
    pCtx->Rax = win32ctx.Rax;
    pCtx->Rbx = win32ctx.Rbx;
    pCtx->R12 = win32ctx.R12;
    pCtx->R13 = win32ctx.R13;
    pCtx->R14 = win32ctx.R14;
    pCtx->R15 = win32ctx.R15;
#elif defined(_ARM_)
    pCtx->IP = win32ctx.Pc;
    pCtx->R0 = win32ctx.R0;
    pCtx->R4 = win32ctx.R4;
    pCtx->R5 = win32ctx.R5;
    pCtx->R6 = win32ctx.R6;
    pCtx->R7 = win32ctx.R7;
    pCtx->R8 = win32ctx.R8;
    pCtx->R9 = win32ctx.R9;
    pCtx->R10 = win32ctx.R10;
    pCtx->R11 = win32ctx.R11;
    pCtx->SP = win32ctx.Sp;
    pCtx->LR = win32ctx.Lr;
#elif defined(_ARM64_)
    pCtx->IP = win32ctx.Pc;
    pCtx->X0 = win32ctx.X0;
    pCtx->X1 = win32ctx.X1;
    // TODO: Copy X2-X7 when we start supporting HVAs
    pCtx->X19 = win32ctx.X19;
    pCtx->X20 = win32ctx.X20;
    pCtx->X21 = win32ctx.X21;
    pCtx->X22 = win32ctx.X22;
    pCtx->X23 = win32ctx.X23;
    pCtx->X24 = win32ctx.X24;
    pCtx->X25 = win32ctx.X25;
    pCtx->X26 = win32ctx.X26;
    pCtx->X27 = win32ctx.X27;
    pCtx->X28 = win32ctx.X28;
    pCtx->SP = win32ctx.Sp;
    pCtx->LR = win32ctx.Lr;
    pCtx->FP = win32ctx.Fp;
#else
#error Unsupported platform
#endif
    return true;
}


REDHAWK_PALEXPORT UInt32 REDHAWK_PALAPI PalHijack(HANDLE hThread, _In_ PalHijackCallback callback, _In_opt_ void* pCallbackContext)
{
    if (hThread == INVALID_HANDLE_VALUE)
    {
        return (UInt32)E_INVALIDARG;
    }

    if (SuspendThread(hThread) == (DWORD)-1)
    {
        return HRESULT_FROM_WIN32(GetLastError());
    }

    PAL_LIMITED_CONTEXT ctx;
    HRESULT result;
    if (!PalGetThreadContext(hThread, &ctx))
    {
        result = HRESULT_FROM_WIN32(GetLastError());
    }
    else
    {
        result = callback(hThread, &ctx, pCallbackContext) ? S_OK : E_FAIL;
    }

    ResumeThread(hThread);

    return result;
}
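
// The function above is the classic suspend/inspect/resume hijack pattern: suspend the target,
// capture its context (bailing out if the kernel reports the context may be inaccurate, see
// PalGetThreadContext), let the callback examine or redirect the thread, then always resume it.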

REDHAWK_PALEXPORT HANDLE REDHAWK_PALAPI PalStartBackgroundWork(_In_ BackgroundCallback callback, _In_opt_ void* pCallbackContext, BOOL highPriority)
{
    HANDLE hThread = CreateThread(
        NULL,
        0,
        (LPTHREAD_START_ROUTINE)callback,
        pCallbackContext,
        highPriority ? CREATE_SUSPENDED : 0,
        NULL);

    if (hThread == NULL)
        return NULL;

    if (highPriority)
    {
        SetThreadPriority(hThread, THREAD_PRIORITY_HIGHEST);
        ResumeThread(hThread);
    }

    return hThread;
}
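
// Note: high-priority threads are created suspended so that THREAD_PRIORITY_HIGHEST can be applied
// before the thread executes its first instruction; otherwise it could briefly run at normal priority.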

REDHAWK_PALEXPORT bool REDHAWK_PALAPI PalStartBackgroundGCThread(_In_ BackgroundCallback callback, _In_opt_ void* pCallbackContext)
{
    return PalStartBackgroundWork(callback, pCallbackContext, FALSE) != NULL;
}

REDHAWK_PALEXPORT bool REDHAWK_PALAPI PalStartFinalizerThread(_In_ BackgroundCallback callback, _In_opt_ void* pCallbackContext)
{
    return PalStartBackgroundWork(callback, pCallbackContext, TRUE) != NULL;
}

REDHAWK_PALEXPORT UInt32 REDHAWK_PALAPI PalGetTickCount()
{
#pragma warning(push)
#pragma warning(disable: 28159) // Consider GetTickCount64 instead
    return GetTickCount();
#pragma warning(pop)
}
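
// Note: GetTickCount is a 32-bit millisecond counter that wraps roughly every 49.7 days; callers of
// PalGetTickCount are expected to tolerate that wrap (hence the suppressed "use GetTickCount64" warning).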

REDHAWK_PALEXPORT bool REDHAWK_PALAPI PalEventEnabled(REGHANDLE regHandle, _In_ const EVENT_DESCRIPTOR* eventDescriptor)
{
    return !!EventEnabled(regHandle, eventDescriptor);
}

REDHAWK_PALEXPORT HANDLE REDHAWK_PALAPI PalCreateFileW(
    _In_z_ LPCWSTR pFileName,
    uint32_t desiredAccess,
    uint32_t shareMode,
    _In_opt_ void* pSecurityAttributes,
    uint32_t creationDisposition,
    uint32_t flagsAndAttributes,
    HANDLE hTemplateFile)
{
    return CreateFileW(pFileName, desiredAccess, shareMode, (LPSECURITY_ATTRIBUTES)pSecurityAttributes,
                       creationDisposition, flagsAndAttributes, hTemplateFile);
}

REDHAWK_PALEXPORT HANDLE REDHAWK_PALAPI PalCreateLowMemoryNotification()
{
    return CreateMemoryResourceNotification(LowMemoryResourceNotification);
}

REDHAWK_PALEXPORT HANDLE REDHAWK_PALAPI PalGetModuleHandleFromPointer(_In_ void* pointer)
{
    HMODULE module;
    if (!GetModuleHandleExW(
        GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
        (LPCWSTR)pointer,
        &module))
    {
        return NULL;
    }

    return (HANDLE)module;
}

REDHAWK_PALEXPORT void* REDHAWK_PALAPI PalAddVectoredExceptionHandler(UInt32 firstHandler, _In_ PVECTORED_EXCEPTION_HANDLER vectoredHandler)
{
    return AddVectoredExceptionHandler(firstHandler, vectoredHandler);
}

REDHAWK_PALEXPORT void PalPrintFatalError(const char* message)
{
    // Write the message using the lowest-level OS API available. This is used to print the stack
    // overflow message, so there is not much else that can be done here.
    DWORD dwBytesWritten;
    WriteFile(GetStdHandle(STD_ERROR_HANDLE), message, (DWORD)strlen(message), &dwBytesWritten, NULL);
}

//
// -----------------------------------------------------------------------------------------------------------
//
// Some more globally initialized data (in InitializeSubsystems), this time internal and used to cache
// information returned by various GC support routines.
//
static UInt32 g_cLogicalCpus = 0;
static size_t g_cbLargestOnDieCache = 0;
static size_t g_cbLargestOnDieCacheAdjusted = 0;


#if (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)
EXTERN_C DWORD __fastcall getcpuid(DWORD arg, unsigned char result[16]);
EXTERN_C DWORD __fastcall getextcpuid(DWORD arg1, DWORD arg2, unsigned char result[16]);

void QueryAMDCacheInfo(_Out_ UInt32* pcbCache, _Out_ UInt32* pcbCacheAdjusted)
{
    unsigned char buffer[16];

    *pcbCache = 0;   // ensure the _Out_ params are initialized even if CPUID 0x80000006 is unavailable

    if (getcpuid(0x80000000, buffer) >= 0x80000006)
    {
        UInt32* pdwBuffer = (UInt32*)buffer;

        getcpuid(0x80000006, buffer);

        UInt32 dwL2CacheBits = pdwBuffer[2];
        UInt32 dwL3CacheBits = pdwBuffer[3];

        *pcbCache = (UInt32)((dwL2CacheBits >> 16) * 1024);    // L2 cache size in ECX bits 31-16

        getcpuid(0x1, buffer);
        UInt32 dwBaseFamily = (pdwBuffer[0] & (0xF << 8)) >> 8;
        UInt32 dwExtFamily = (pdwBuffer[0] & (0xFF << 20)) >> 20;
        UInt32 dwFamily = dwBaseFamily >= 0xF ? dwBaseFamily + dwExtFamily : dwBaseFamily;

        if (dwFamily >= 0x10)
        {
            BOOL bSkipAMDL3 = FALSE;

            if (dwFamily == 0x10)   // are we running on a Barcelona (Family 10h) processor?
            {
                // check model
                UInt32 dwBaseModel = (pdwBuffer[0] & (0xF << 4)) >> 4;
                UInt32 dwExtModel = (pdwBuffer[0] & (0xF << 16)) >> 16;
                UInt32 dwModel = dwBaseFamily >= 0xF ? (dwExtModel << 4) | dwBaseModel : dwBaseModel;

                switch (dwModel)
                {
                case 0x2:
                    // 65nm parts do not benefit from larger Gen0
                    bSkipAMDL3 = TRUE;
                    break;

                case 0x4:
                default:
                    bSkipAMDL3 = FALSE;
                }
            }

            if (!bSkipAMDL3)
            {
                // 45nm Greyhound parts (and future parts based on newer northbridge) benefit
                // from increased gen0 size, taking L3 into account
                getcpuid(0x80000008, buffer);
                UInt32 dwNumberOfCores = (pdwBuffer[2] & (0xFF)) + 1;       // NC is in ECX bits 7-0

                UInt32 dwL3CacheSize = (UInt32)((dwL3CacheBits >> 18) * 512 * 1024);  // L3 size in EDX bits 31-18 * 512KB
                                                                                      // L3 is shared between cores
                dwL3CacheSize = dwL3CacheSize / dwNumberOfCores;
                *pcbCache += dwL3CacheSize;       // due to exclusive caches, add L3 size (possibly zero) to L2
                                                  // L1 is too small to worry about, so ignore it
            }
        }
    }
    *pcbCacheAdjusted = *pcbCache;
}
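
// Worked example (hypothetical CPUID values): if leaf 0x80000006 returns ECX = 0x02004140 then
// ECX[31:16] = 0x0200 = 512, i.e. a 512 KB L2. If EDX[31:18] = 8 the L3 is 8 * 512 KB = 4 MB,
// which is divided by the core count from leaf 0x80000008 before being added to the L2 figure.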

#ifdef _DEBUG
#define CACHE_WAY_BITS          0xFFC00000      // number of cache WAYS-Associativity is returned in EBX[31:22] (10 bits) using cpuid function 4
#define CACHE_PARTITION_BITS    0x003FF000      // number of cache Physical Partitions is returned in EBX[21:12] (10 bits) using cpuid function 4
#define CACHE_LINESIZE_BITS     0x00000FFF      // Linesize returned in EBX[11:0] (12 bits) using cpuid function 4
#define LIMITED_METHOD_CONTRACT

size_t CLR_GetIntelDeterministicCacheEnum()
{
    LIMITED_METHOD_CONTRACT;
    size_t retVal = 0;
    unsigned char buffer[16];

    DWORD maxCpuid = getextcpuid(0, 0, buffer);

    DWORD* dwBuffer = (DWORD*)buffer;

    if ((maxCpuid > 3) && (maxCpuid < 0x80000000)) // Deterministic Cache Enum is supported
    {
        DWORD dwCacheWays, dwCachePartitions, dwLineSize, dwSets;
        DWORD retEAX = 0;
        DWORD loopECX = 0;
        size_t maxSize = 0;
        size_t curSize = 0;

        // Make the first call to getextcpuid with loopECX=0. loopECX provides an index indicating which level to return information about.
        // The second parameter is input EAX=4, to specify we want deterministic cache parameter leaf information.
        // getextcpuid with EAX=4 should be executed with loopECX = 0,1, ... until retEAX[4:0] contains 00000b, indicating no more
        // cache levels are supported.

        getextcpuid(loopECX, 4, buffer);
        retEAX = dwBuffer[0];       // get EAX

        int i = 0;
        while (retEAX & 0x1f)       // Crack cache enums and loop while EAX > 0
        {

            dwCacheWays = (dwBuffer[1] & CACHE_WAY_BITS) >> 22;
            dwCachePartitions = (dwBuffer[1] & CACHE_PARTITION_BITS) >> 12;
            dwLineSize = dwBuffer[1] & CACHE_LINESIZE_BITS;
            dwSets = dwBuffer[2];    // ECX

            curSize = (dwCacheWays + 1)*(dwCachePartitions + 1)*(dwLineSize + 1)*(dwSets + 1);

            if (maxSize < curSize)
                maxSize = curSize;

            loopECX++;
            getextcpuid(loopECX, 4, buffer);
            retEAX = dwBuffer[0];      // get EAX[4:0];
            i++;
            if (i > 16)                // prevent infinite looping
                return 0;
        }
        retVal = maxSize;
    }

    return retVal;
}
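
// Worked example (hypothetical leaf-4 values): ways-1 = 7, partitions-1 = 0, line size-1 = 63 and
// sets-1 = 8191 give (7+1) * (0+1) * (63+1) * (8191+1) = 4 MB for that cache level; the loop keeps
// the largest such size across all reported levels.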

// The following function uses CPUID function 2 with descriptor values to determine the cache size.  This requires
// a priori knowledge of the descriptor values. This works on Gallatin and prior processors (already released processors).
// If successful, this function returns the cache size in bytes of the highest level on-die cache. Returns 0 on failure.

size_t CLR_GetIntelDescriptorValuesCache()
{
    LIMITED_METHOD_CONTRACT;
    size_t size = 0;
    size_t maxSize = 0;
    unsigned char buffer[16];

    getextcpuid(0, 2, buffer);         // call CPUID with EAX function 2H to obtain cache descriptor values

    for (int i = buffer[0]; --i >= 0;)
    {
        int j;
        for (j = 3; j < 16; j += 4)
        {
            // if the information in a register is marked invalid, set to null descriptors
            if (buffer[j] & 0x80)
            {
                buffer[j - 3] = 0;
                buffer[j - 2] = 0;
                buffer[j - 1] = 0;
                buffer[j - 0] = 0;
            }
        }

        for (j = 1; j < 16; j++)
        {
            switch (buffer[j])    // need to add descriptor values for 8M and 12M when they become known
            {
            case    0x41:
            case    0x79:
                size = 128 * 1024;
                break;

            case    0x42:
            case    0x7A:
            case    0x82:
                size = 256 * 1024;
                break;

            case    0x22:
            case    0x43:
            case    0x7B:
            case    0x83:
            case    0x86:
                size = 512 * 1024;
                break;

            case    0x23:
            case    0x44:
            case    0x7C:
            case    0x84:
            case    0x87:
                size = 1024 * 1024;
                break;

            case    0x25:
            case    0x45:
            case    0x85:
                size = 2 * 1024 * 1024;
                break;

            case    0x29:
                size = 4 * 1024 * 1024;
                break;
            }
            if (maxSize < size)
                maxSize = size;
        }

        if (i > 0)
            getextcpuid(0, 2, buffer);
    }
    return maxSize;
}

size_t CLR_GetLargestOnDieCacheSizeX86(UInt32_BOOL bTrueSize)
{

    static size_t maxSize;
    static size_t maxTrueSize;

    if (maxSize)
    {
        // maxSize and maxTrueSize cached
        if (bTrueSize)
        {
            return maxTrueSize;
        }
        else
        {
            return maxSize;
        }
    }

    __try
    {
        unsigned char buffer[16];
        DWORD* dwBuffer = (DWORD*)buffer;

        DWORD maxCpuId = getcpuid(0, buffer);

        if (dwBuffer[1] == 'uneG')
        {
            if (dwBuffer[3] == 'Ieni')
            {
                if (dwBuffer[2] == 'letn')
                {
                    size_t tempSize = 0;
                    if (maxCpuId >= 2)         // cpuid support for cache size determination is available
                    {
                        tempSize = CLR_GetIntelDeterministicCacheEnum();          // try to use deterministic cache size enumeration
                        if (!tempSize)
                        {                    // deterministic enumeration failed, fall back to legacy enumeration using descriptor values
                            tempSize = CLR_GetIntelDescriptorValuesCache();
                        }
                    }

                    // update maxSize once with final value
                    maxTrueSize = tempSize;

#ifdef _WIN64
                    if (maxCpuId >= 2)
                    {
                        // If we're running on a Prescott or greater core, EM64T tests
                        // show that starting with a gen0 larger than LLC improves performance.
                        // Thus, start with a gen0 size that is larger than the cache.  The value of
                        // 3 is a reasonable tradeoff between workingset and performance.
                        maxSize = maxTrueSize * 3;
                    }
                    else
#endif
                    {
                        maxSize = maxTrueSize;
                    }
                }
            }
        }

        if (dwBuffer[1] == 'htuA') {
            if (dwBuffer[3] == 'itne') {
                if (dwBuffer[2] == 'DMAc') {

                    if (getcpuid(0x80000000, buffer) >= 0x80000006)
                    {
                        getcpuid(0x80000006, buffer);

                        DWORD dwL2CacheBits = dwBuffer[2];
                        DWORD dwL3CacheBits = dwBuffer[3];

                        maxTrueSize = (size_t)((dwL2CacheBits >> 16) * 1024);    // L2 cache size in ECX bits 31-16

                        getcpuid(0x1, buffer);
                        DWORD dwBaseFamily = (dwBuffer[0] & (0xF << 8)) >> 8;
                        DWORD dwExtFamily = (dwBuffer[0] & (0xFF << 20)) >> 20;
                        DWORD dwFamily = dwBaseFamily >= 0xF ? dwBaseFamily + dwExtFamily : dwBaseFamily;

                        if (dwFamily >= 0x10)
                        {
                            BOOL bSkipAMDL3 = FALSE;

                            if (dwFamily == 0x10)   // are we running on a Barcelona (Family 10h) processor?
                            {
                                // check model
                                DWORD dwBaseModel = (dwBuffer[0] & (0xF << 4)) >> 4;
                                DWORD dwExtModel = (dwBuffer[0] & (0xF << 16)) >> 16;
                                DWORD dwModel = dwBaseFamily >= 0xF ? (dwExtModel << 4) | dwBaseModel : dwBaseModel;

                                switch (dwModel)
                                {
                                case 0x2:
                                    // 65nm parts do not benefit from larger Gen0
                                    bSkipAMDL3 = TRUE;
                                    break;

                                case 0x4:
                                default:
                                    bSkipAMDL3 = FALSE;
                                }
                            }

                            if (!bSkipAMDL3)
                            {
                                // 45nm Greyhound parts (and future parts based on newer northbridge) benefit
                                // from increased gen0 size, taking L3 into account
                                getcpuid(0x80000008, buffer);
                                DWORD dwNumberOfCores = (dwBuffer[2] & (0xFF)) + 1;        // NC is in ECX bits 7-0

                                DWORD dwL3CacheSize = (DWORD)((dwL3CacheBits >> 18) * 512 * 1024);  // L3 size in EDX bits 31-18 * 512KB
                                                                                                    // L3 is shared between cores
                                dwL3CacheSize = dwL3CacheSize / dwNumberOfCores;
                                maxTrueSize += dwL3CacheSize;       // due to exclusive caches, add L3 size (possibly zero) to L2
                                                                    // L1 is too small to worry about, so ignore it
                            }
                        }


                        maxSize = maxTrueSize;
                    }
                }
            }
        }
    }
    __except (1)
    {
    }

    if (bTrueSize)
        return maxTrueSize;
    else
        return maxSize;
}

DWORD CLR_GetLogicalCpuCountFromOS(_In_reads_opt_(nEntries) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pslpi, DWORD nEntries);

// This function returns the number of logical processors on a given physical chip.  If it cannot
// determine the number of logical cpus, or the machine is not populated uniformly with the same
// type of processors, this function returns 1.
DWORD CLR_GetLogicalCpuCountX86(_In_reads_opt_(nEntries) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pslpi, DWORD nEntries)
{
    // No CONTRACT possible because GetLogicalCpuCount uses SEH

    static DWORD val = 0;

    // cache value for later re-use
    if (val)
    {
        return val;
    }

    DWORD retVal = 1;

    __try
    {
        unsigned char buffer[16];

        DWORD maxCpuId = getcpuid(0, buffer);

        if (maxCpuId < 1)
            goto lDone;

        DWORD* dwBuffer = (DWORD*)buffer;

        if (dwBuffer[1] == 'uneG') {
            if (dwBuffer[3] == 'Ieni') {
                if (dwBuffer[2] == 'letn') {  // get SMT/multicore enumeration for Intel EM64T

                    // TODO: Currently GetLogicalCpuCountFromOS() and GetLogicalCpuCountFallback() are broken on
                    // multi-core processors, but we never call into those two functions since we don't halve the
                    // gen0 size when it's a Prescott or later processor. We keep the old version here for earlier
                    // generation (Northwood-based) systems; perf data suggests that on those systems halving the
                    // gen0 size still boosts performance (e.g. BizTalk improves by about 17%), so on those earlier
                    // systems we still go ahead and halve the gen0 size. The logic in GetLogicalCpuCountFromOS()
                    // and GetLogicalCpuCountFallback() works fine for those earlier generation systems.
                    // If it's a Prescott or later processor or multi-core, perf data suggests that not halving the
                    // gen0 size at all gives us overall better performance.
                    // This is going to be fixed with a new version in the Orcas time frame.

                    if ((maxCpuId > 3) && (maxCpuId < 0x80000000))
                        goto lDone;

                    val = CLR_GetLogicalCpuCountFromOS(pslpi, nEntries); // try to obtain HT enumeration from OS API
                    if (val)
                    {
                        retVal = val;     // OS API HT enumeration successful, we are done
                        goto lDone;
                    }

                    // val = GetLogicalCpuCountFallback();    // OS API failed, fall back to HT enumeration using CPUID
                    // if( val )
                    //     retVal = val;
                }
            }
        }
    lDone:;
    }
    __except (1)
    {
    }

    if (val == 0)
    {
        val = retVal;
    }

    return retVal;
}

#endif // _DEBUG
#endif // (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)


#ifdef _DEBUG
DWORD CLR_GetLogicalCpuCountFromOS(_In_reads_opt_(nEntries) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pslpi, DWORD nEntries)
{
    // No CONTRACT possible because GetLogicalCpuCount uses SEH

    static DWORD val = 0;
    DWORD retVal = 0;

    if (pslpi == NULL)
    {
        // GetLogicalProcessorInformation not supported
        goto lDone;
    }

    DWORD prevcount = 0;
    DWORD count = 1;

    for (DWORD j = 0; j < nEntries; j++)
    {
        if (pslpi[j].Relationship == RelationProcessorCore)
        {
            // LTP_PC_SMT indicates HT or SMT
            if (pslpi[j].ProcessorCore.Flags == LTP_PC_SMT)
            {
                SIZE_T pmask = pslpi[j].ProcessorMask;

                // Count the processors in the mask
                //
                // These are not the fastest bit counters. There may be processor intrinsics
                // (which would be best), but there are variants faster than these:
                // See http://en.wikipedia.org/wiki/Hamming_weight.
                // This is the simple parallel (SWAR) implementation.
#if !_WIN64
                count = (pmask & 0x55555555) + ((pmask >> 1) & 0x55555555);
                count = (count & 0x33333333) + ((count >> 2) & 0x33333333);
                count = (count & 0x0F0F0F0F) + ((count >> 4) & 0x0F0F0F0F);
                count = (count & 0x00FF00FF) + ((count >> 8) & 0x00FF00FF);
                count = (count & 0x0000FFFF) + ((count >> 16) & 0x0000FFFF);
#else
                pmask = (pmask & 0x5555555555555555ull) + ((pmask >> 1) & 0x5555555555555555ull);
                pmask = (pmask & 0x3333333333333333ull) + ((pmask >> 2) & 0x3333333333333333ull);
                pmask = (pmask & 0x0f0f0f0f0f0f0f0full) + ((pmask >> 4) & 0x0f0f0f0f0f0f0f0full);
                pmask = (pmask & 0x00ff00ff00ff00ffull) + ((pmask >> 8) & 0x00ff00ff00ff00ffull);
                pmask = (pmask & 0x0000ffff0000ffffull) + ((pmask >> 16) & 0x0000ffff0000ffffull);
                pmask = (pmask & 0x00000000ffffffffull) + ((pmask >> 32) & 0x00000000ffffffffull);
                count = static_cast<DWORD>(pmask);
#endif // !_WIN64 else
                assert(count > 0);

                if (prevcount)
                {
                    if (count != prevcount)
                    {
                        retVal = 1;       // masks are not symmetric
                        goto lDone;
                    }
                }

                prevcount = count;
            }
        }
    }

    retVal = count;

lDone:
    return retVal;
}
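
// Worked example for the counting above (hypothetical masks): a core record with ProcessorMask 0x3
// counts two SMT threads; if a later core record counted four, the masks would be asymmetric and the
// function reports 1, matching the "not populated uniformly" contract described earlier.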

// This function returns the size of the highest level cache on the physical chip.  If it cannot
// determine the cache size this function returns 0.
size_t CLR_GetLogicalProcessorCacheSizeFromOS(_In_reads_opt_(nEntries) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pslpi, DWORD nEntries)
{
    size_t cache_size = 0;

    // The caller tries the GetLogicalProcessorInformation API and passes a valid pointer to the SLPI
    // array if successful, or NULL if the API is not present or failed.

    if (pslpi == NULL)
    {
        // GetLogicalProcessorInformation not supported or failed.
        goto Exit;
    }

    // Crack the information. Iterate through all the SLPI array entries for all processors in the system.
    // Will return the greatest of all the processor cache sizes, or zero.

    size_t last_cache_size = 0;

    for (DWORD i = 0; i < nEntries; i++)
    {
        if (pslpi[i].Relationship == RelationCache)
        {
            last_cache_size = max(last_cache_size, pslpi[i].Cache.Size);
        }
    }
    cache_size = last_cache_size;
Exit:

    return cache_size;
}

DWORD CLR_GetLogicalCpuCount(_In_reads_opt_(nEntries) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pslpi, DWORD nEntries)
{
#if (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)
    return CLR_GetLogicalCpuCountX86(pslpi, nEntries);
#else
    return CLR_GetLogicalCpuCountFromOS(pslpi, nEntries);
#endif
}

size_t CLR_GetLargestOnDieCacheSize(UInt32_BOOL bTrueSize, _In_reads_opt_(nEntries) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pslpi, DWORD nEntries)
{
#if (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)
    return CLR_GetLargestOnDieCacheSizeX86(bTrueSize);
#else
    return CLR_GetLogicalProcessorCacheSizeFromOS(pslpi, nEntries);
#endif
}
#endif // _DEBUG


enum CpuVendor
{
    CpuUnknown,
    CpuIntel,
    CpuAMD,
};

CpuVendor GetCpuVendor(_Out_ UInt32* puMaxCpuId)
{
#if (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)
    unsigned char buffer[16];
    *puMaxCpuId = getcpuid(0, buffer);

    UInt32* pdwBuffer = (UInt32*)buffer;

    if (pdwBuffer[1] == 'uneG'
        && pdwBuffer[3] == 'Ieni'
        && pdwBuffer[2] == 'letn')
    {
        return CpuIntel;
    }
    else if (pdwBuffer[1] == 'htuA'
        && pdwBuffer[3] == 'itne'
        && pdwBuffer[2] == 'DMAc')
    {
        return CpuAMD;
    }
#else
    *puMaxCpuId = 0;
#endif
    return CpuUnknown;
}
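
// Note on the multi-character constants above: CPUID leaf 0 returns the vendor string in
// EBX, EDX, ECX ("GenuineIntel" / "AuthenticAMD"). On little-endian x86 the four bytes "Genu"
// read back as the DWORD 'uneG', hence the reversed spellings in the comparisons.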

// Count set bits in a bitfield.
UInt32 CountBits(size_t bfBitfield)
{
    UInt32 cBits = 0;

    // This is not the fastest algorithm possible but it's simple and the performance is not critical.
    for (UInt32 i = 0; i < (sizeof(size_t) * 8); i++)
    {
        cBits += (bfBitfield & 1) ? 1 : 0;
        bfBitfield >>= 1;
    }

    return cBits;
}
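
// If this ever became hot, a faster alternative (an assumption, not currently needed here) would be
// the parallel SWAR counting used in CLR_GetLogicalCpuCountFromOS above, or a popcount intrinsic.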

//
// Enable TRACE_CACHE_TOPOLOGY to get a dump of the info provided by the OS as well as a comparison of the
// 'answers' between the current implementation and the CLR implementation.
//
//#define TRACE_CACHE_TOPOLOGY
#if defined(_DEBUG) && !defined(_ARM64_)
// ARM64TODO: restore
void DumpCacheTopology(_In_reads_(cRecords) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pProcInfos, UInt32 cRecords)
{
    printf("----------------\n");
    for (UInt32 i = 0; i < cRecords; i++)
    {
        switch (pProcInfos[i].Relationship)
        {
        case RelationProcessorCore:
            printf("    [%2d] Core: %d threads                    0x%04zx mask, flags = %d\n",
                i, CountBits(pProcInfos[i].ProcessorMask), pProcInfos[i].ProcessorMask,
                pProcInfos[i].ProcessorCore.Flags);
            break;

        case RelationCache:
            const char* pszCacheType;
            switch (pProcInfos[i].Cache.Type) {
            case CacheUnified:      pszCacheType = "[Unified]"; break;
            case CacheInstruction:  pszCacheType = "[Instr  ]"; break;
            case CacheData:         pszCacheType = "[Data   ]"; break;
            case CacheTrace:        pszCacheType = "[Trace  ]"; break;
            default:                pszCacheType = "[Unk    ]"; break;
            }
            printf("    [%2d] Cache: %s 0x%08x bytes  0x%04zx mask\n", i, pszCacheType,
                pProcInfos[i].Cache.Size, pProcInfos[i].ProcessorMask);
            break;

        case RelationNumaNode:
            printf("    [%2d] NumaNode: #%02d                      0x%04zx mask\n",
                i, pProcInfos[i].NumaNode.NodeNumber, pProcInfos[i].ProcessorMask);
            break;
        case RelationProcessorPackage:
            printf("    [%2d] Package:                           0x%04zx mask\n",
                i, pProcInfos[i].ProcessorMask);
            break;
        case RelationAll:
        case RelationGroup:
        default:
            printf("    [%2d] unknown: %d\n", i, pProcInfos[i].Relationship);
            break;
        }
    }
    printf("----------------\n");
}

void DumpCacheTopologyResults(UInt32 maxCpuId, CpuVendor cpuVendor, _In_reads_(cRecords) SYSTEM_LOGICAL_PROCESSOR_INFORMATION * pProcInfos, UInt32 cRecords)
{
    DumpCacheTopology(pProcInfos, cRecords);
    printf("maxCpuId: %d, %s\n", maxCpuId, (cpuVendor == CpuIntel) ? "CpuIntel" : ((cpuVendor == CpuAMD) ? "CpuAMD" : "CpuUnknown"));
    printf("               g_cLogicalCpus:          %d %d          :CLR_GetLogicalCpuCount\n", g_cLogicalCpus, CLR_GetLogicalCpuCount(pProcInfos, cRecords));
    printf("        g_cbLargestOnDieCache: 0x%08zx 0x%08zx :CLR_LargestOnDieCache(TRUE)\n", g_cbLargestOnDieCache, CLR_GetLargestOnDieCacheSize(TRUE, pProcInfos, cRecords));
    printf("g_cbLargestOnDieCacheAdjusted: 0x%08zx 0x%08zx :CLR_LargestOnDieCache(FALSE)\n", g_cbLargestOnDieCacheAdjusted, CLR_GetLargestOnDieCacheSize(FALSE, pProcInfos, cRecords));
}
#endif // defined(_DEBUG) && !defined(_ARM64_)

// Method used to initialize the above values.
bool PalQueryProcessorTopology()
{
    SYSTEM_LOGICAL_PROCESSOR_INFORMATION *pProcInfos = NULL;
    DWORD cbBuffer = 0;
    bool fError = false;

    for (;;)
    {
        // Ask for processor information with an insufficient buffer initially. The function will tell us how
        // much memory we need and we'll try again.
        if (!GetLogicalProcessorInformation(pProcInfos, &cbBuffer))
        {
            if (GetLastError() == ERROR_INSUFFICIENT_BUFFER)
            {
                if (pProcInfos)
                    free(pProcInfos);

                pProcInfos = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION*)malloc(cbBuffer);

                if (pProcInfos == NULL)
                {
                    // Ran out of memory.
                    fError = true;
                    break;
                }
            }
            else
            {
                // Unexpected error from GetLogicalProcessorInformation().
                fError = true;
                break;
            }
        }
        else
        {
            // Successfully read processor information, stop looping.
            break;
        }
    }

    // If there was no error retrieving the data, parse the result. GetLogicalProcessorInformation() returns an
    // array of structures each of which describes some attribute of a given group of logical processors.
    // Fields in the structure describe which processors and which attributes are being described and the
    // structures come in no particular order. Therefore we just iterate over all of them accumulating the
    // data we're interested in as we go.
    if (!fError && pProcInfos != NULL)
    {
        // Some explanation of the following logic is required. The GC queries information via two APIs:
        //  1) GetLogicalCpuCount()
        //  2) GetLargestOnDieCacheSize()
        //
        // These were once unambiguous queries; logical CPUs only existed when a physical CPU supported
        // threading (e.g. Intel's HyperThreading technology) and caches were always shared across an entire
        // physical processor.
        //
        // Unfortunately for us actual processor topologies are getting ever more complex (and divergent even
        // between otherwise near-identical architectures such as Intel and AMD). A single physical processor
        // (or package, the thing that fits in a socket on the motherboard) can now have multiple classes of
        // logical processors within it with differing relationships to the other logical processors
        // (e.g. which share functional units or caches). It's technically feasible to build systems with
        // non-symmetric topologies as well (where the number of logical processors or cache differs between
        // physical processors for instance).
        //
        // The GetLogicalProcessorInformation() output reflects this in its potential complexity. For
        // large multi-CPU systems it can generate quite a few output records, effectively drawing a tree of
        // logical processors and their relationships within cores and packages and to various levels of
        // cache.
        //
        // Out of this complexity we have to distill the simple answers required above. It may well prove true
        // in the future that we will have to ask more complex questions, but until then this function will
        // utilize the following semantics for each of the queries:
        //  1) We will report logical processors as the average number of threads per core. (For the likely
        //     case, a symmetric system, this average will be the exact number of threads per core.)
        //  2) We will report the largest cache on-die as the average largest cache per core.
        //
        // We will calculate the first value by counting the number of core records returned and the number of
        // threads running on those cores (each core record supplies a bitmask of processors running on that
        // core and by definition each of those processor sets must be disjoint, so we can simply accumulate a
        // count of processors seen for each core so far). For now we will count all processors on a core as a
        // thread (even if the HT/SMT flag is not set for the core) until we have data that suggests we should
        // treat non-HT processors as cores in their own right. We can then simply divide the thread total by
        // the core total to get a thread-per-core average.
        //
        // The second is harder since we have to discard caches that are superseded by a larger cache
        // servicing the same logical processor. For instance, on a typical Intel system we wish to sum the
        // sizes of all the L2 caches but ignore all the L1 caches. Since performance is not a huge issue here
        // (this is a one-time operation and we cache the results) we'll use a linear algorithm that, when
        // presented with a cache information record, re-scans all the records for another cache entry which
        // is of larger size and has at least one logical processor in common. If found, the current cache
        // record can be ignored.
        //
        // Once we have the total sizes of all the largest level caches on the system we can divide it by the
        // previously computed total cores to get the average largest cache size per core.

        // Count info records returned by GetLogicalProcessorInformation().
        UInt32 cRecords = cbBuffer / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);

        UInt32 maxCpuId;
        CpuVendor cpuVendor = GetCpuVendor(&maxCpuId);

        bool isAsymmetric = false;
        UInt32 cLogicalCpus = 0;
        UInt32 cbCache = 0;
        UInt32 cbCacheAdjusted = 0;

        for (UInt32 i = 0; i < cRecords; i++)
        {
            switch (pProcInfos[i].Relationship)
            {
            case RelationProcessorCore:
                if (pProcInfos[i].ProcessorCore.Flags == LTP_PC_SMT)
                {
                    UInt32 thisCount = CountBits(pProcInfos[i].ProcessorMask);
                    if (!cLogicalCpus)
                        cLogicalCpus = thisCount;
                    else if (thisCount != cLogicalCpus)
                        isAsymmetric = true;
                }
                break;

            case RelationCache:
                cbCache = max(cbCache, pProcInfos[i].Cache.Size);
                break;

            default:
                break;
            }
        }

        cbCacheAdjusted = cbCache;
        if (cLogicalCpus == 0)
            cLogicalCpus = 1;

#if (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)
        // Apply some experimentally-derived policy to the number of logical CPUs in the same way the CLR does.
        if ((maxCpuId < 1)
            || (cpuVendor != CpuIntel)
            || ((maxCpuId > 3) && (maxCpuId < 0x80000000))  // This is a strange one.
            || isAsymmetric)
        {
            cLogicalCpus = 1;
        }

        // Apply some experimentally-derived policy to the cache size in the same way the CLR does.
        if (cpuVendor == CpuIntel)
        {
#ifdef _WIN64
            if (maxCpuId >= 2)
            {
                // If we're running on a Prescott or greater core, EM64T tests
                // show that starting with a gen0 larger than LLC improves performance.
                // Thus, start with a gen0 size that is larger than the cache.  The value of
                // 3 is a reasonable tradeoff between workingset and performance.
                cbCacheAdjusted = cbCache * 3;
            }
#endif // _WIN64
        }
        else if (cpuVendor == CpuAMD)
        {
            QueryAMDCacheInfo(&cbCache, &cbCacheAdjusted);
        }
#else  // (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)
        cpuVendor; // avoid unused variable warnings.
        maxCpuId;
#endif // (defined(_TARGET_AMD64_) || defined (_TARGET_X86_)) && !defined(USE_PORTABLE_HELPERS)

        g_cLogicalCpus = cLogicalCpus;
        g_cbLargestOnDieCache = cbCache;
        g_cbLargestOnDieCacheAdjusted = cbCacheAdjusted;

#if defined(_DEBUG)
#if defined(TRACE_CACHE_TOPOLOGY) && !defined(_ARM64_)
        // ARM64TODO: restore
        DumpCacheTopologyResults(maxCpuId, cpuVendor, pProcInfos, cRecords);
#endif // defined(TRACE_CACHE_TOPOLOGY) && !defined(_ARM64_)
        if ((CLR_GetLargestOnDieCacheSize(TRUE, pProcInfos, cRecords) != g_cbLargestOnDieCache) ||
            (CLR_GetLargestOnDieCacheSize(FALSE, pProcInfos, cRecords) != g_cbLargestOnDieCacheAdjusted) ||
            (CLR_GetLogicalCpuCount(pProcInfos, cRecords) != g_cLogicalCpus))
        {
#if !defined(_ARM64_)
            DumpCacheTopologyResults(maxCpuId, cpuVendor, pProcInfos, cRecords);
#endif
            assert(!"QueryProcessorTopology doesn't match CLR's results.  See stdout for more info.");
        }
#endif
    }

    if (pProcInfos)
        free(pProcInfos);   // allocated with malloc above

    return !fError;
}
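
// Note: the retry loop above follows GetLogicalProcessorInformation's documented protocol: the call
// fails with ERROR_INSUFFICIENT_BUFFER and writes the required byte count into cbBuffer, so the buffer
// is (re)allocated to that size and the call repeated until it succeeds.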
1307 
1308 #ifdef RUNTIME_SERVICES_ONLY
1309 // Functions called by the GC to obtain our cached values for number of logical processors and cache size.
PalGetLogicalCpuCount()1310 REDHAWK_PALEXPORT UInt32 REDHAWK_PALAPI PalGetLogicalCpuCount()
1311 {
1312     return g_cLogicalCpus;
1313 }
1314 
PalGetLargestOnDieCacheSize(UInt32_BOOL bTrueSize)1315 REDHAWK_PALEXPORT size_t REDHAWK_PALAPI PalGetLargestOnDieCacheSize(UInt32_BOOL bTrueSize)
1316 {
1317     return bTrueSize ? g_cbLargestOnDieCache
1318         : g_cbLargestOnDieCacheAdjusted;
1319 }
1320 #endif // RUNTIME_SERVICES_ONLY
1321 
_Post_writable_byte_size_(size)1322 REDHAWK_PALEXPORT _Ret_maybenull_ _Post_writable_byte_size_(size) void* REDHAWK_PALAPI PalVirtualAlloc(_In_opt_ void* pAddress, UIntNative size, UInt32 allocationType, UInt32 protect)
1323 {
1324     return VirtualAlloc(pAddress, size, allocationType, protect);
1325 }
1326 
1327 #pragma warning (push)
1328 #pragma warning (disable:28160) // warnings about invalid potential parameter combinations that would cause VirtualFree to fail - those are asserted for below
PalVirtualFree(_In_ void * pAddress,UIntNative size,UInt32 freeType)1329 REDHAWK_PALEXPORT UInt32_BOOL REDHAWK_PALAPI PalVirtualFree(_In_ void* pAddress, UIntNative size, UInt32 freeType)
1330 {
1331     assert(((freeType & MEM_RELEASE) != MEM_RELEASE) || size == 0);
1332     assert((freeType & (MEM_RELEASE | MEM_DECOMMIT)) != (MEM_RELEASE | MEM_DECOMMIT));
1333     assert(freeType != 0);
1334 
1335     return VirtualFree(pAddress, size, freeType);
1336 }
1337 #pragma warning (pop)
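
// Illustrative PalVirtualFree usage (a sketch, not code the runtime executes):
// per the asserts above, releasing a reservation requires size == 0, while
// decommitting requires an explicit size:
//
//   PalVirtualFree(pRegion, 0, MEM_RELEASE);       // release an entire reservation
//   PalVirtualFree(pPages, cbPages, MEM_DECOMMIT); // decommit pages, keep the reservation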

REDHAWK_PALEXPORT UInt32_BOOL REDHAWK_PALAPI PalVirtualProtect(_In_ void* pAddress, UIntNative size, UInt32 protect)
{
    DWORD oldProtect;
    return VirtualProtect(pAddress, size, protect, &oldProtect);
}

REDHAWK_PALEXPORT _Ret_maybenull_ void* REDHAWK_PALAPI PalSetWerDataBuffer(_In_ void* pNewBuffer)
{
    static void* pBuffer;
    return InterlockedExchangePointer(&pBuffer, pNewBuffer);
}
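
// Note: the interlocked exchange above returns the previous buffer, so a
// caller that owns the old buffer can reclaim it after swapping in a new
// one, e.g. (illustrative): void* pOld = PalSetWerDataBuffer(pNewBuf);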

#ifndef RUNTIME_SERVICES_ONLY

static LARGE_INTEGER g_performanceFrequency;

#ifdef PROJECTN
static bool g_roInitialized;
#endif

// Initialize the interface implementation
// Return:
//  true if it has succeeded, false if it has failed
bool GCToOSInterface::Initialize()
{
    if (!::QueryPerformanceFrequency(&g_performanceFrequency))
    {
        return false;
    }

#ifdef PROJECTN
    // TODO: Remove the RoInitialize call when we implement a non-WinRT framework for classic apps
    HRESULT hr = RoInitialize(RO_INIT_MULTITHREADED);

    // RPC_E_CHANGED_MODE indicates this thread has already been initialized with a different
    // concurrency model. That is fine; we just need to skip the RoUninitialize call on shutdown.
    if (SUCCEEDED(hr))
    {
        g_roInitialized = true;
    }
    else if (hr != RPC_E_CHANGED_MODE)
    {
        return false;
    }
#endif

    return true;
}

// Shut down the interface implementation
// Remarks:
//  Must be called on the same thread as Initialize.
void GCToOSInterface::Shutdown()
{
#ifdef PROJECTN
    if (g_roInitialized)
    {
        RoUninitialize();
        g_roInitialized = false;
    }
#endif
}

// Get a numeric id for the current thread if possible on the
// current platform. It is intended for logging purposes only.
// Return:
//  Numeric id of the current thread, or 0 if the id is not available
uint64_t GCToOSInterface::GetCurrentThreadIdForLogging()
{
    return ::GetCurrentThreadId();
}

// Get id of the process
uint32_t GCToOSInterface::GetCurrentProcessId()
{
    return ::GetCurrentProcessId();
}

// Set ideal affinity for the current thread
// Parameters:
//  affinity - ideal processor affinity for the thread
// Return:
//  true if it has succeeded, false if it has failed
bool GCToOSInterface::SetCurrentThreadIdealAffinity(GCThreadAffinity* affinity)
{
    bool success = true;

    PROCESSOR_NUMBER proc;

    if (affinity->Group != -1)
    {
        proc.Group = (WORD)affinity->Group;
        proc.Number = (BYTE)affinity->Processor;
        proc.Reserved = 0;

        success = !!SetThreadIdealProcessorEx(GetCurrentThread(), &proc, NULL);
    }
    else
    {
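        // No explicit processor group was requested: read the current ideal
        // processor first so the thread's existing group is preserved and
        // only the processor number changes.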
        if (GetThreadIdealProcessorEx(GetCurrentThread(), &proc))
        {
            proc.Number = (BYTE)affinity->Processor;
            success = !!SetThreadIdealProcessorEx(GetCurrentThread(), &proc, NULL);
        }
    }

    return success;
}

// Get the number of the current processor
uint32_t GCToOSInterface::GetCurrentProcessorNumber()
{
    _ASSERTE(GCToOSInterface::CanGetCurrentProcessorNumber());
    return ::GetCurrentProcessorNumber();
}

// Check if the OS supports getting current processor number
bool GCToOSInterface::CanGetCurrentProcessorNumber()
{
    return true;
}

// Flush write buffers of processors that are executing threads of the current process
void GCToOSInterface::FlushProcessWriteBuffers()
{
    ::FlushProcessWriteBuffers();
}

// Break into a debugger
void GCToOSInterface::DebugBreak()
{
    ::DebugBreak();
}

// Get number of logical processors
uint32_t GCToOSInterface::GetLogicalCpuCount()
{
    return g_cLogicalCpus;
}

// Causes the calling thread to sleep for the specified number of milliseconds
// Parameters:
//  sleepMSec   - time to sleep before switching to another thread
void GCToOSInterface::Sleep(uint32_t sleepMSec)
{
    PalSleep(sleepMSec);
}

// Causes the calling thread to yield execution to another thread that is ready to run on the current processor.
// Parameters:
//  switchCount - number of times the YieldThread was called in a loop
void GCToOSInterface::YieldThread(uint32_t /*switchCount*/)
{
    PalSwitchToThread();
}

// Reserve a virtual memory range.
// Parameters:
//  size      - size of the virtual memory range
//  alignment - requested memory alignment (unused here; the OS chooses the
//              starting address at allocation granularity)
//  flags     - flags to control special settings like write watching
// Return:
//  Starting virtual address of the reserved range
void* GCToOSInterface::VirtualReserve(size_t size, size_t alignment, uint32_t flags)
{
    DWORD memFlags = (flags & VirtualReserveFlags::WriteWatch) ? (MEM_RESERVE | MEM_WRITE_WATCH) : MEM_RESERVE;
    return ::VirtualAlloc(0, size, memFlags, PAGE_READWRITE);
}

// Release virtual memory range previously reserved using VirtualReserve
// Parameters:
//  address - starting virtual address
//  size    - size of the virtual memory range
// Return:
//  true if it has succeeded, false if it has failed
bool GCToOSInterface::VirtualRelease(void* address, size_t size)
{
    UNREFERENCED_PARAMETER(size);
    return !!::VirtualFree(address, 0, MEM_RELEASE);
}

// Commit virtual memory range. It must be part of a range reserved using VirtualReserve.
// Parameters:
//  address - starting virtual address
//  size    - size of the virtual memory range
// Return:
//  true if it has succeeded, false if it has failed
bool GCToOSInterface::VirtualCommit(void* address, size_t size)
{
    return ::VirtualAlloc(address, size, MEM_COMMIT, PAGE_READWRITE) != NULL;
}
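
// Illustrative reserve/commit pairing (a sketch; it assumes the VirtualReserveFlags
// values from gcenv, and in the runtime these calls are driven by the GC itself):
//
//   void* block = GCToOSInterface::VirtualReserve(64 * 1024, 0, VirtualReserveFlags::None);
//   if (block != NULL && GCToOSInterface::VirtualCommit(block, 4096))
//   {
//       // the first page of the reservation is now committed and usable
//   }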

// Decommit virtual memory range.
// Parameters:
//  address - starting virtual address
//  size    - size of the virtual memory range
// Return:
//  true if it has succeeded, false if it has failed
bool GCToOSInterface::VirtualDecommit(void* address, size_t size)
{
    return !!::VirtualFree(address, size, MEM_DECOMMIT);
}

// Reset virtual memory range. Indicates that data in the memory range specified by address and size is no
// longer of interest, but it should not be decommitted.
// Parameters:
//  address - starting virtual address
//  size    - size of the virtual memory range
//  unlock  - true if the memory range should also be unlocked
// Return:
//  true if it has succeeded, false if it has failed
bool GCToOSInterface::VirtualReset(void * address, size_t size, bool unlock)
{
    bool success = ::VirtualAlloc(address, size, MEM_RESET, PAGE_READWRITE) != NULL;
    if (success && unlock)
    {
        // Remove the page range from the working set
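        // (VirtualUnlock on a range that was never locked is the documented
        // way to remove pages from the working set.)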
        ::VirtualUnlock(address, size);
    }

    return success;
}

// Check if the OS supports write watching
bool GCToOSInterface::SupportsWriteWatch()
{
    return PalHasCapability(WriteWatchCapability);
}

// Reset the write tracking state for the specified virtual memory range.
// Parameters:
//  address - starting virtual address
//  size    - size of the virtual memory range
void GCToOSInterface::ResetWriteWatch(void* address, size_t size)
{
    ::ResetWriteWatch(address, size);
}

// Retrieve addresses of the pages that are written to in a region of virtual memory
// Parameters:
//  resetState         - true indicates to reset the write tracking state
//  address            - starting virtual address
//  size               - size of the virtual memory range
//  pageAddresses      - buffer that receives an array of page addresses in the memory region
//  pageAddressesCount - on input, size of the pageAddresses array, in array elements;
//                       on output, the number of page addresses that are returned in the array.
// Return:
//  true if it has succeeded, false if it has failed
bool GCToOSInterface::GetWriteWatch(bool resetState, void* address, size_t size, void** pageAddresses, uintptr_t* pageAddressesCount)
{
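    // A flags value of 1 is WRITE_WATCH_FLAG_RESET, which tells ::GetWriteWatch
    // to clear the write-tracking state as it reports the dirty pages.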
    uint32_t flags = resetState ? 1 : 0;
    ULONG granularity;

    bool success = ::GetWriteWatch(flags, address, size, pageAddresses, (ULONG_PTR*)pageAddressesCount, &granularity) == 0;
    _ASSERTE(granularity == OS_PAGE_SIZE);

    return success;
}

// Get size of the largest cache on the processor die
// Parameters:
//  trueSize - true to return true cache size, false to return scaled up size based on
//             the processor architecture
// Return:
//  Size of the cache
size_t GCToOSInterface::GetLargestOnDieCacheSize(bool trueSize)
{
    return trueSize ? g_cbLargestOnDieCache : g_cbLargestOnDieCacheAdjusted;
}

// Get affinity mask of the current process
// Parameters:
//  processMask - affinity mask for the specified process
//  systemMask  - affinity mask for the system
// Return:
//  true if it has succeeded, false if it has failed
// Remarks:
//  A process affinity mask is a bit vector in which each bit represents the processors that
//  a process is allowed to run on. A system affinity mask is a bit vector in which each bit
//  represents the processors that are configured into a system.
//  A process affinity mask is a subset of the system affinity mask. A process is only allowed
//  to run on the processors configured into a system. Therefore, the process affinity mask cannot
//  specify a 1 bit for a processor when the system affinity mask specifies a 0 bit for that processor.
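//  For example (illustrative): with a system mask of 0b1111 and a process mask
//  of 0b0101, the process may be scheduled only on processors 0 and 2.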
bool GCToOSInterface::GetCurrentProcessAffinityMask(uintptr_t* processMask, uintptr_t* systemMask)
{
    return !!::GetProcessAffinityMask(GetCurrentProcess(), (PDWORD_PTR)processMask, (PDWORD_PTR)systemMask);
}

// Get number of processors assigned to the current process
// Return:
//  The number of processors
uint32_t GCToOSInterface::GetCurrentProcessCpuCount()
{
    static int cCPUs = 0;

    if (cCPUs != 0)
        return cCPUs;

    DWORD_PTR pmask, smask;

    if (!GetProcessAffinityMask(GetCurrentProcess(), &pmask, &smask))
        return 1;

    if (pmask == 1)
        return 1;

    pmask &= smask;

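    // Count the set bits in the effective mask; e.g. (illustrative)
    // pmask == 0b1011 yields a count of 3.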
    int count = 0;
    while (pmask)
    {
        if (pmask & 1)
            count++;

        pmask >>= 1;
    }

    // GetProcessAffinityMask can return pmask=0 and smask=0 on systems with more
    // than 64 processors, which would leave us with a count of 0.  Since the GC
    // expects there to be at least one processor to run on (and thus at least one
    // heap), we'll return 64 here if count is 0, since there are likely a ton of
    // processors available in that case.  The GC also cannot (currently) handle
    // the case where there are more than 64 processors, so we will return a
    // maximum of 64 here.
    if (count == 0 || count > 64)
        count = 64;

    cCPUs = count;

    return count;
}

// Return the size of the user-mode portion of the virtual address space of this process.
// Return:
//  non-zero if it has succeeded, 0 if it has failed
size_t GCToOSInterface::GetVirtualMemoryLimit()
{
    MEMORYSTATUSEX memStatus;

    memStatus.dwLength = sizeof(MEMORYSTATUSEX);

    BOOL fRet;
    fRet = GlobalMemoryStatusEx(&memStatus);
    _ASSERTE(fRet);

    return (size_t)memStatus.ullTotalVirtual;
}

// Get the physical memory that this process can use.
// Return:
//  non-zero if it has succeeded, 0 if it has failed
uint64_t GCToOSInterface::GetPhysicalMemoryLimit()
{
    MEMORYSTATUSEX memStatus;

    memStatus.dwLength = sizeof(MEMORYSTATUSEX);

    BOOL fRet;
    fRet = GlobalMemoryStatusEx(&memStatus);
    _ASSERTE(fRet);

    return memStatus.ullTotalPhys;
}

// Get memory status
// Parameters:
//  memory_load - A number between 0 and 100 that specifies the approximate percentage of physical memory
//      that is in use (0 indicates no memory use and 100 indicates full memory use).
//  available_physical - The amount of physical memory currently available, in bytes.
//  available_page_file - The maximum amount of memory the current process can commit, in bytes.
void GCToOSInterface::GetMemoryStatus(uint32_t* memory_load, uint64_t* available_physical, uint64_t* available_page_file)
{
    MEMORYSTATUSEX memStatus;

    memStatus.dwLength = sizeof(MEMORYSTATUSEX);

    BOOL fRet;
    fRet = GlobalMemoryStatusEx(&memStatus);
    _ASSERTE(fRet);

    // If the machine has more RAM than the virtual address limit, cap the reported value;
    // the GC can never use more than the virtual address limit.
    if (memStatus.ullAvailPhys > memStatus.ullTotalVirtual)
    {
        memStatus.ullAvailPhys = memStatus.ullAvailVirtual;
    }

    if (memory_load != NULL)
        *memory_load = memStatus.dwMemoryLoad;
    if (available_physical != NULL)
        *available_physical = memStatus.ullAvailPhys;
    if (available_page_file != NULL)
        *available_page_file = memStatus.ullAvailPageFile;
}

// Get a high-precision performance counter value
// Return:
//  The counter value
int64_t GCToOSInterface::QueryPerformanceCounter()
{
    LARGE_INTEGER ts;
    if (!::QueryPerformanceCounter(&ts))
    {
        ASSERT_UNCONDITIONALLY("Fatal Error - cannot query performance counter.");
        RhFailFast();
    }

    return ts.QuadPart;
}

// Get the frequency of the high-precision performance counter
// Return:
//  The counter frequency
int64_t GCToOSInterface::QueryPerformanceFrequency()
{
    return g_performanceFrequency.QuadPart;
}

// Get a low-precision time stamp
// Return:
//  Time stamp in milliseconds
uint32_t GCToOSInterface::GetLowPrecisionTimeStamp()
{
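    // GetTickCount wraps around roughly every 49.7 days (2^32 milliseconds),
    // so consumers of this low-precision timestamp must tolerate wraparound.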
    return ::GetTickCount();
}

// Parameters of the GC thread stub
struct GCThreadStubParam
{
    GCThreadFunction GCThreadFunction;
    void* GCThreadParam;
};

// GC thread stub to convert the GC thread function to an OS-specific thread entry point
static DWORD WINAPI GCThreadStub(void* param)
{
    GCThreadStubParam *stubParam = (GCThreadStubParam*)param;
    GCThreadFunction function = stubParam->GCThreadFunction;
    void* threadParam = stubParam->GCThreadParam;

    delete stubParam;

    function(threadParam);

    return 0;
}

// Create a new thread for GC use
// Parameters:
//  function - the function to be executed by the thread
//  param    - parameters of the thread
//  affinity - processor affinity of the thread
// Return:
//  true if it has succeeded, false if it has failed
bool GCToOSInterface::CreateThread(GCThreadFunction function, void* param, GCThreadAffinity* affinity)
{
    NewHolder<GCThreadStubParam> stubParam = new (nothrow) GCThreadStubParam();
    if (stubParam == NULL)
    {
        return false;
    }

    stubParam->GCThreadFunction = function;
    stubParam->GCThreadParam = param;

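    // Create the thread suspended so that priority and affinity can be applied
    // below before it starts running.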
    DWORD thread_id;
    HANDLE gc_thread = ::CreateThread(0, 4096, GCThreadStub, stubParam.GetValue(), CREATE_SUSPENDED, &thread_id);

    if (!gc_thread)
    {
        return false;
    }

    stubParam.SuppressRelease();

    SetThreadPriority(gc_thread, THREAD_PRIORITY_HIGHEST);

    if (affinity->Group != GCThreadAffinity::None)
    {
        // @TODO: CPUGroupInfo

        // ASSERT(affinity->Processor != GCThreadAffinity::None);
        // GROUP_AFFINITY ga;
        // ga.Group = (WORD)affinity->Group;
        // ga.Reserved[0] = 0;
        // ga.Reserved[1] = 0;
        // ga.Reserved[2] = 0;
        // ga.Mask = (size_t)1 << affinity->Processor;
        // CPUGroupInfo::SetThreadGroupAffinity(gc_thread, &ga, NULL);
    }
    else if (affinity->Processor != GCThreadAffinity::None)
    {
        SetThreadAffinityMask(gc_thread, (DWORD_PTR)1 << affinity->Processor);
    }

    ResumeThread(gc_thread);
    CloseHandle(gc_thread);

    return true;
}

// Initialize the critical section
void CLRCriticalSection::Initialize()
{
    InitializeCriticalSection(&m_cs);
}

// Destroy the critical section
void CLRCriticalSection::Destroy()
{
    DeleteCriticalSection(&m_cs);
}

// Enter the critical section. Blocks until the section can be entered.
void CLRCriticalSection::Enter()
{
    EnterCriticalSection(&m_cs);
}

// Leave the critical section
void CLRCriticalSection::Leave()
{
    LeaveCriticalSection(&m_cs);
}

#endif // RUNTIME_SERVICES_ONLY