1 // Copyright 2009-2021 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 
4 #include "thread.h"
5 #include "sysinfo.h"
6 #include "string.h"
7 
8 #include <iostream>
9 #if defined(__ARM_NEON)
10 #include "../simd/arm/emulation.h"
11 #else
12 #include <xmmintrin.h>
13 #endif
14 
15 #if defined(PTHREADS_WIN32)
16 #pragma comment (lib, "pthreadVC.lib")
17 #endif
18 
19 ////////////////////////////////////////////////////////////////////////////////
20 /// Windows Platform
21 ////////////////////////////////////////////////////////////////////////////////
22 
23 #if defined(__WIN32__)
24 
25 #define WIN32_LEAN_AND_MEAN
26 #include <windows.h>
27 
28 namespace embree
29 {
30   /*! set the affinity of a given thread */
setAffinity(HANDLE thread,ssize_t affinity)31   void setAffinity(HANDLE thread, ssize_t affinity)
32   {
33     typedef WORD (WINAPI *GetActiveProcessorGroupCountFunc)();
34     typedef DWORD (WINAPI *GetActiveProcessorCountFunc)(WORD);
35     typedef BOOL (WINAPI *SetThreadGroupAffinityFunc)(HANDLE, const GROUP_AFFINITY *, PGROUP_AFFINITY);
36     typedef BOOL (WINAPI *SetThreadIdealProcessorExFunc)(HANDLE, PPROCESSOR_NUMBER, PPROCESSOR_NUMBER);
37     HMODULE hlib = LoadLibrary("Kernel32");
38     GetActiveProcessorGroupCountFunc pGetActiveProcessorGroupCount = (GetActiveProcessorGroupCountFunc)GetProcAddress(hlib, "GetActiveProcessorGroupCount");
39     GetActiveProcessorCountFunc pGetActiveProcessorCount = (GetActiveProcessorCountFunc)GetProcAddress(hlib, "GetActiveProcessorCount");
40     SetThreadGroupAffinityFunc pSetThreadGroupAffinity = (SetThreadGroupAffinityFunc)GetProcAddress(hlib, "SetThreadGroupAffinity");
41     SetThreadIdealProcessorExFunc pSetThreadIdealProcessorEx = (SetThreadIdealProcessorExFunc)GetProcAddress(hlib, "SetThreadIdealProcessorEx");
42     if (pGetActiveProcessorGroupCount && pGetActiveProcessorCount && pSetThreadGroupAffinity && pSetThreadIdealProcessorEx)
43     {
44       int groups = pGetActiveProcessorGroupCount();
45       int totalProcessors = 0, group = 0, number = 0;
46       for (int i = 0; i<groups; i++) {
47         int processors = pGetActiveProcessorCount(i);
48         if (totalProcessors + processors > affinity) {
49           group = i;
50           number = (int)affinity - totalProcessors;
51           break;
52         }
53         totalProcessors += processors;
54       }
55 
56       GROUP_AFFINITY groupAffinity;
57       groupAffinity.Group = (WORD)group;
58       groupAffinity.Mask = (KAFFINITY)(uint64_t(1) << number);
59       groupAffinity.Reserved[0] = 0;
60       groupAffinity.Reserved[1] = 0;
61       groupAffinity.Reserved[2] = 0;
62       if (!pSetThreadGroupAffinity(thread, &groupAffinity, nullptr))
63         WARNING("SetThreadGroupAffinity failed"); // on purpose only a warning
64 
65       PROCESSOR_NUMBER processorNumber;
66       processorNumber.Group = group;
67       processorNumber.Number = number;
68       processorNumber.Reserved = 0;
69       if (!pSetThreadIdealProcessorEx(thread, &processorNumber, nullptr))
70         WARNING("SetThreadIdealProcessorEx failed"); // on purpose only a warning
71     }
72     else
73     {
74       if (!SetThreadAffinityMask(thread, DWORD_PTR(uint64_t(1) << affinity)))
75         WARNING("SetThreadAffinityMask failed"); // on purpose only a warning
76       if (SetThreadIdealProcessor(thread, (DWORD)affinity) == (DWORD)-1)
77         WARNING("SetThreadIdealProcessor failed"); // on purpose only a warning
78       }
79   }
80 
81   /*! set affinity of the calling thread */
setAffinity(ssize_t affinity)82   void setAffinity(ssize_t affinity) {
83     setAffinity(GetCurrentThread(), affinity);
84   }
85 
86   struct ThreadStartupData
87   {
88   public:
ThreadStartupDataembree::ThreadStartupData89     ThreadStartupData (thread_func f, void* arg)
90       : f(f), arg(arg) {}
91   public:
92     thread_func f;
93     void* arg;
94   };
95 
threadStartup(LPVOID ptr)96   DWORD WINAPI threadStartup(LPVOID ptr)
97   {
98     ThreadStartupData* parg = (ThreadStartupData*) ptr;
99     _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6));
100     parg->f(parg->arg);
101     delete parg;
102     return 0;
103   }
104 
105 #if !defined(PTHREADS_WIN32)
106 
107   /*! creates a hardware thread running on specific core */
createThread(thread_func f,void * arg,size_t stack_size,ssize_t threadID)108   thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID)
109   {
110     HANDLE thread = CreateThread(nullptr, stack_size, threadStartup, new ThreadStartupData(f,arg), 0, nullptr);
111     if (thread == nullptr) FATAL("CreateThread failed");
112     if (threadID >= 0) setAffinity(thread, threadID);
113     return thread_t(thread);
114   }
115 
116   /*! the thread calling this function gets yielded */
yield()117   void yield() {
118     SwitchToThread();
119   }
120 
121   /*! waits until the given thread has terminated */
join(thread_t tid)122   void join(thread_t tid) {
123     WaitForSingleObject(HANDLE(tid), INFINITE);
124     CloseHandle(HANDLE(tid));
125   }
126 
127   /*! destroy a hardware thread by its handle */
destroyThread(thread_t tid)128   void destroyThread(thread_t tid) {
129     TerminateThread(HANDLE(tid),0);
130     CloseHandle(HANDLE(tid));
131   }
132 
133   /*! creates thread local storage */
createTls()134   tls_t createTls() {
135     return tls_t(size_t(TlsAlloc()));
136   }
137 
138   /*! set the thread local storage pointer */
setTls(tls_t tls,void * const ptr)139   void setTls(tls_t tls, void* const ptr) {
140     TlsSetValue(DWORD(size_t(tls)), ptr);
141   }
142 
143   /*! return the thread local storage pointer */
getTls(tls_t tls)144   void* getTls(tls_t tls) {
145     return TlsGetValue(DWORD(size_t(tls)));
146   }
147 
148   /*! destroys thread local storage identifier */
destroyTls(tls_t tls)149   void destroyTls(tls_t tls) {
150     TlsFree(DWORD(size_t(tls)));
151   }
152 #endif
153 }
154 
155 #endif
156 
157 ////////////////////////////////////////////////////////////////////////////////
158 /// Linux Platform
159 ////////////////////////////////////////////////////////////////////////////////
160 
161 #if defined(__LINUX__)
162 
163 #include <fstream>
164 #include <sstream>
165 #include <algorithm>
166 
167 namespace embree
168 {
  /* serializes access to threadIDs; locked for the duration of mapThreadID */
  static MutexSys mutex;
  /* table indexed by thread ID, holding the IDs read from the sysfs thread
     sibling lists; filled by mapThreadID while it is still empty */
  static std::vector<size_t> threadIDs;
171 
172   /* changes thread ID mapping such that we first fill up all thread on one core */
mapThreadID(size_t threadID)173   size_t mapThreadID(size_t threadID)
174   {
175     Lock<MutexSys> lock(mutex);
176 
177     if (threadIDs.size() == 0)
178     {
179       /* parse thread/CPU topology */
180       for (size_t cpuID=0;;cpuID++)
181       {
182         std::fstream fs;
183         std::string cpu = std::string("/sys/devices/system/cpu/cpu") + std::to_string((long long)cpuID) + std::string("/topology/thread_siblings_list");
184         fs.open (cpu.c_str(), std::fstream::in);
185         if (fs.fail()) break;
186 
187         int i;
188         while (fs >> i)
189         {
190           if (std::none_of(threadIDs.begin(),threadIDs.end(),[&] (int id) { return id == i; }))
191             threadIDs.push_back(i);
192           if (fs.peek() == ',')
193             fs.ignore();
194         }
195         fs.close();
196       }
197 
198 #if 0
199       for (size_t i=0;i<threadIDs.size();i++)
200         std::cout << i << " -> " << threadIDs[i] << std::endl;
201 #endif
202 
203       /* verify the mapping and do not use it if the mapping has errors */
204       for (size_t i=0;i<threadIDs.size();i++) {
205         for (size_t j=0;j<threadIDs.size();j++) {
206           if (i != j && threadIDs[i] == threadIDs[j]) {
207             threadIDs.clear();
208           }
209         }
210       }
211     }
212 
213     /* re-map threadIDs if mapping is available */
214     size_t ID = threadID;
215     if (threadID < threadIDs.size())
216       ID = threadIDs[threadID];
217 
218     /* find correct thread to affinitize to */
219     cpu_set_t set;
220     CPU_ZERO(&set);
221 
222     if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0)
223     {
224       for (int i=0, j=0; i<CPU_SETSIZE; i++)
225       {
226         if (!CPU_ISSET(i,&set)) continue;
227 
228         if (j == ID) {
229           ID = i;
230           break;
231         }
232         j++;
233       }
234     }
235 
236     return ID;
237   }
238 
239   /*! set affinity of the calling thread */
setAffinity(ssize_t affinity)240   void setAffinity(ssize_t affinity)
241   {
242     cpu_set_t cset;
243     CPU_ZERO(&cset);
244     size_t threadID = mapThreadID(affinity);
245     CPU_SET(threadID, &cset);
246 
247     pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset);
248   }
249 }
250 #endif
251 
252 ////////////////////////////////////////////////////////////////////////////////
253 /// FreeBSD Platform
254 ////////////////////////////////////////////////////////////////////////////////
255 
256 #if defined(__FreeBSD__)
257 
258 #include <pthread_np.h>
259 
260 namespace embree
261 {
262   /*! set affinity of the calling thread */
setAffinity(ssize_t affinity)263   void setAffinity(ssize_t affinity)
264   {
265     cpuset_t cset;
266     CPU_ZERO(&cset);
267     CPU_SET(affinity, &cset);
268 
269     pthread_setaffinity_np(pthread_self(), sizeof(cset), &cset);
270   }
271 }
272 #endif
273 
274 ////////////////////////////////////////////////////////////////////////////////
275 /// DragonFly Platform
276 ////////////////////////////////////////////////////////////////////////////////
277 
278 #if defined(__DragonFly__)
279 
280 namespace embree
281 {
setAffinity(ssize_t affinity)282   void setAffinity(ssize_t affinity)
283   {
284 // none needed ;)
285   }
286 }
287 #endif
288 
289 ////////////////////////////////////////////////////////////////////////////////
290 /// MacOSX Platform
291 ////////////////////////////////////////////////////////////////////////////////
292 
293 #if defined(__MACOSX__)
294 
#include <mach/thread_act.h>
#include <mach/thread_policy.h>
#include <mach/mach_init.h>
#include <mach/mach_port.h>
298 
299 namespace embree
300 {
301   /*! set affinity of the calling thread */
setAffinity(ssize_t affinity)302   void setAffinity(ssize_t affinity)
303   {
304 #if !defined(__ARM_NEON) // affinity seems not supported on M1 chip
305 
306     thread_affinity_policy ap;
307     ap.affinity_tag = affinity;
308     if (thread_policy_set(mach_thread_self(),THREAD_AFFINITY_POLICY,(thread_policy_t)&ap,THREAD_AFFINITY_POLICY_COUNT) != KERN_SUCCESS)
309       WARNING("setting thread affinity failed"); // on purpose only a warning
310 
311 #endif
312   }
313 }
314 #endif
315 
316 ////////////////////////////////////////////////////////////////////////////////
317 /// Unix Platform
318 ////////////////////////////////////////////////////////////////////////////////
319 
320 #if defined(__UNIX__) || defined(PTHREADS_WIN32)
321 
322 #include <pthread.h>
323 #include <sched.h>
324 
325 #if defined(__USE_NUMA__)
326 #include <numa.h>
327 #endif
328 
329 namespace embree
330 {
331   struct ThreadStartupData
332   {
333   public:
ThreadStartupDataembree::ThreadStartupData334     ThreadStartupData (thread_func f, void* arg, int affinity)
335       : f(f), arg(arg), affinity(affinity) {}
336   public:
337     thread_func f;
338     void* arg;
339     ssize_t affinity;
340   };
341 
threadStartup(ThreadStartupData * parg)342   static void* threadStartup(ThreadStartupData* parg)
343   {
344     _mm_setcsr(_mm_getcsr() | /*FTZ:*/ (1<<15) | /*DAZ:*/ (1<<6));
345 
346     /*! Mac OS X does not support setting affinity at thread creation time */
347 #if defined(__MACOSX__)
348     if (parg->affinity >= 0)
349 	setAffinity(parg->affinity);
350 #endif
351 
352     parg->f(parg->arg);
353     delete parg;
354     return nullptr;
355   }
356 
357   /*! creates a hardware thread running on specific core */
createThread(thread_func f,void * arg,size_t stack_size,ssize_t threadID)358   thread_t createThread(thread_func f, void* arg, size_t stack_size, ssize_t threadID)
359   {
360     /* set stack size */
361     pthread_attr_t attr;
362     pthread_attr_init(&attr);
363     if (stack_size > 0) pthread_attr_setstacksize (&attr, stack_size);
364 
365     /* create thread */
366     pthread_t* tid = new pthread_t;
367     if (pthread_create(tid,&attr,(void*(*)(void*))threadStartup,new ThreadStartupData(f,arg,threadID)) != 0) {
368       pthread_attr_destroy(&attr);
369       delete tid;
370       FATAL("pthread_create failed");
371     }
372     pthread_attr_destroy(&attr);
373 
374     /* set affinity */
375 #if defined(__LINUX__)
376     if (threadID >= 0) {
377       cpu_set_t cset;
378       CPU_ZERO(&cset);
379       threadID = mapThreadID(threadID);
380       CPU_SET(threadID, &cset);
381       pthread_setaffinity_np(*tid, sizeof(cset), &cset);
382     }
383 #elif defined(__FreeBSD__)
384     if (threadID >= 0) {
385       cpuset_t cset;
386       CPU_ZERO(&cset);
387       CPU_SET(threadID, &cset);
388       pthread_setaffinity_np(*tid, sizeof(cset), &cset);
389     }
390 #endif
391 
392     return thread_t(tid);
393   }
394 
395   /*! the thread calling this function gets yielded */
yield()396   void yield() {
397     sched_yield();
398   }
399 
400   /*! waits until the given thread has terminated */
join(thread_t tid)401   void join(thread_t tid) {
402     if (pthread_join(*(pthread_t*)tid, nullptr) != 0)
403       FATAL("pthread_join failed");
404     delete (pthread_t*)tid;
405   }
406 
407   /*! destroy a hardware thread by its handle */
destroyThread(thread_t tid)408   void destroyThread(thread_t tid) {
409     pthread_cancel(*(pthread_t*)tid);
410     delete (pthread_t*)tid;
411   }
412 
413   /*! creates thread local storage */
createTls()414   tls_t createTls()
415   {
416     pthread_key_t* key = new pthread_key_t;
417     if (pthread_key_create(key,nullptr) != 0) {
418       delete key;
419       FATAL("pthread_key_create failed");
420     }
421 
422     return tls_t(key);
423   }
424 
425   /*! return the thread local storage pointer */
getTls(tls_t tls)426   void* getTls(tls_t tls)
427   {
428     assert(tls);
429     return pthread_getspecific(*(pthread_key_t*)tls);
430   }
431 
432   /*! set the thread local storage pointer */
setTls(tls_t tls,void * const ptr)433   void setTls(tls_t tls, void* const ptr)
434   {
435     assert(tls);
436     if (pthread_setspecific(*(pthread_key_t*)tls, ptr) != 0)
437       FATAL("pthread_setspecific failed");
438   }
439 
440   /*! destroys thread local storage identifier */
destroyTls(tls_t tls)441   void destroyTls(tls_t tls)
442   {
443     assert(tls);
444     if (pthread_key_delete(*(pthread_key_t*)tls) != 0)
445       FATAL("pthread_key_delete failed");
446     delete (pthread_key_t*)tls;
447   }
448 }
449 
450 #endif
451