1 /*
2 Copyright (c) 2005-2021 Intel Corporation
3
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 */
16
17 // Source file for miscellaneous entities that are infrequently referenced by
18 // an executing program, and implementation of which requires dynamic linking.
19
20 #include "misc.h"
21
22 #if !defined(__TBB_HardwareConcurrency)
23
24 #include "dynamic_link.h"
25 #include <stdio.h>
26 #include <limits.h>
27
28 #if _WIN32||_WIN64
29 #include <windows.h>
30 #if __TBB_WIN8UI_SUPPORT
31 #include <thread>
32 #endif
33 #else
34 #include <unistd.h>
35 #if __unix__
36 #if __linux__
37 #include <sys/sysinfo.h>
38 #endif
39 #include <cstring>
40 #include <sched.h>
41 #include <cerrno>
42 #elif __sun
43 #include <sys/sysinfo.h>
44 #elif __DragonFly__
45 #include <cerrno>
46 #include <cstring>
47 #include <sched.h>
48 #elif __FreeBSD__
49 #include <cerrno>
50 #include <cstring>
51 #include <sys/param.h> // Required by <sys/cpuset.h>
52 #include <sys/cpuset.h>
53 #endif
54 #endif
55
56 namespace tbb {
57 namespace detail {
58 namespace r1 {
59
60 #if __TBB_USE_OS_AFFINITY_SYSCALL
61
62 #if __unix__
63 // Handlers for interoperation with libiomp
// Handlers for interoperation with libiomp.
// Filled in by dynamic_link() when libiomp5.so is found at initialization time;
// stays NULL otherwise.
static int (*libiomp_try_restoring_original_mask)();
// Table for mapping to libiomp entry points.
// NOTE(review): DLD_NOWEAK presumably requires the symbol to be resolved from
// the library with no weak fallback — confirm against dynamic_link.h.
static const dynamic_link_descriptor iompLinkTable[] = {
    DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask )
};
69 #endif
70
set_thread_affinity_mask(std::size_t maskSize,const basic_mask_t * threadMask)71 static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) {
72 #if __FreeBSD__ || __NetBSD__ || __OpenBSD__
73 if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
74 #else /* __unix__ */
75 if( sched_setaffinity( 0, maskSize, threadMask ) )
76 #endif
77 // Here and below the error severity is lowered from critical level
78 // because it may happen during TBB library unload because of not
79 // waiting for workers to complete (current RML policy, to be fixed).
80 // handle_perror( errno, "setaffinity syscall" );
81 runtime_warning( "setaffinity syscall failed" );
82 }
83
get_thread_affinity_mask(std::size_t maskSize,basic_mask_t * threadMask)84 static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) {
85 #if __FreeBSD__ || __NetBSD__ || __OpenBSD__
86 if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
87 #else /* __unix__ */
88 if( sched_getaffinity( 0, maskSize, threadMask ) )
89 #endif
90 runtime_warning( "getaffinity syscall failed" );
91 }
92
// Process affinity mask captured by initialize_hardware_concurrency_info
// (array of num_masks basic_mask_t elements); NULL until that runs.
static basic_mask_t* process_mask;
// Number of basic_mask_t elements in process_mask and in the masks that
// affinity_helper allocates (see the curMaskSize macro below).
static int num_masks;
95
destroy_process_mask()96 void destroy_process_mask() {
97 if( process_mask ) {
98 delete [] process_mask;
99 }
100 }
101
102 #define curMaskSize sizeof(basic_mask_t) * num_masks
~affinity_helper()103 affinity_helper::~affinity_helper() {
104 if( threadMask ) {
105 if( is_changed ) {
106 set_thread_affinity_mask( curMaskSize, threadMask );
107 }
108 delete [] threadMask;
109 }
110 }
protect_affinity_mask(bool restore_process_mask)111 void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
112 if( threadMask == NULL && num_masks ) { // TODO: assert num_masks validity?
113 threadMask = new basic_mask_t [num_masks];
114 std::memset( threadMask, 0, curMaskSize );
115 get_thread_affinity_mask( curMaskSize, threadMask );
116 if( restore_process_mask ) {
117 __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
118 is_changed = memcmp( process_mask, threadMask, curMaskSize );
119 if( is_changed )
120 set_thread_affinity_mask( curMaskSize, process_mask );
121 } else {
122 // Assume that the mask will be changed by the caller.
123 is_changed = 1;
124 }
125 }
126 }
dismiss()127 void affinity_helper::dismiss() {
128 if( threadMask ) {
129 delete [] threadMask;
130 threadMask = NULL;
131 }
132 is_changed = 0;
133 }
134 #undef curMaskSize
135
// One-time initialization guard for initialize_hardware_concurrency_info.
static std::atomic<do_once_state> hardware_concurrency_info;

// Cached result of hardware concurrency detection; set exactly once.
static int theNumProcs;
139
// Determines how many processors the current process may run on and caches
// the answer in theNumProcs. Also captures the process affinity mask into
// the file-scope process_mask / num_masks for later use by affinity_helper.
// Executed exactly once, via atomic_do_once from AvailableHwConcurrency().
static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    basic_mask_t* processMask;
#ifdef __DragonFly__
    // DragonFly uses static array (usually 256 bit long) in cpu_set_t
    processMask = new cpu_set_t;
    int pid = getpid();
    err = sched_getaffinity( pid, sizeof(cpu_set_t), processMask );
#else
    const std::size_t BasicMaskSize = sizeof(basic_mask_t);
    // The mask size the kernel needs is not known in advance: keep doubling
    // the buffer until the getaffinity call succeeds, fails with an error
    // other than "mask too small", or a sanity limit on the size is reached.
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        std::memset( processMask, 0, curMaskSize );
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#else /* __unix__ */
        int pid = getpid();
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
#endif
        delete[] processMask;
        numMasks <<= 1;
    }
#endif
    if ( !err ) {
#ifdef __DragonFly__
        num_masks = 1; // static array
        for ( size_t i = 0; availableProcs < maxProcs && i < sizeof(cpu_set_t); ++i ) {
            if ( CPU_ISSET( i, processMask ) )
                ++availableProcs;
        }
#else
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // do here because it's needed for affinity_helper to work
#if __unix__
        // For better coexistence with libiomp which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                std::memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss();  // thread mask has not changed
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
        // Count the processors enabled in the captured mask, capped by the
        // number of processors reported online (maxProcs).
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
#endif
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail safety strap
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), NULL );
}
218
// Returns the number of processors available to this process (always >= 1).
// Thread-safe: detection runs once via atomic_do_once; subsequent calls
// return the cached value.
int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}
223
224 /* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
225 #elif __ANDROID__
226
227 // Work-around for Android that reads the correct number of available CPUs since system calls are unreliable.
228 // Format of "present" file is: ([<int>-<int>|<int>],)+
// Work-around for Android that reads the correct number of available CPUs since system calls are unreliable.
// Format of "present" file is: ([<int>-<int>|<int>],)+
// Returns at least 1, also when the file is missing or unparsable.
int AvailableHwConcurrency() {
    FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
    if (fp == NULL) return 1;
    int num_args, lower, upper, num_cpus = 0;
    // %d (not %u): the arguments are int*, and CPU indices are non-negative.
    while ((num_args = fscanf(fp, "%d-%d", &lower, &upper)) != EOF) {
        switch (num_args) {
            case 2: num_cpus += upper - lower + 1; break;  // range "a-b"
            case 1: num_cpus += 1; break;                  // single index "a"
            default: break;
        }
        if (num_args == 0)
            break;  // unparsable token: bail out instead of looping forever
        fscanf(fp, ",");  // skip the separator, if any
    }
    fclose(fp);  // was leaked before
    return (num_cpus > 0) ? num_cpus : 1;
}
242
243 #elif defined(_SC_NPROCESSORS_ONLN)
244
// Generic POSIX fallback: ask the OS for the number of online processors.
// Falls back to 1 when sysconf fails or reports a non-positive count.
int AvailableHwConcurrency() {
    const int online = sysconf(_SC_NPROCESSORS_ONLN);
    return online > 0 ? online : 1;
}
249
250 #elif _WIN32||_WIN64
251
// One-time initialization guard for initialize_hardware_concurrency_info.
static std::atomic<do_once_state> hardware_concurrency_info;

// Special group index accepted by GetActiveProcessorCount: all groups combined.
static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;

// Statically allocate an array for processor group information.
// Windows 7 supports maximum 4 groups, but let's look ahead a little.
static const WORD MaxProcessorGroups = 64;
259
//! Per-group processor bookkeeping for Windows processor-groups support.
struct ProcessorGroupInfo {
    DWORD_PTR   mask;                   ///< Affinity mask covering the whole group
    int         numProcs;               ///< Number of processors in the group
    int         numProcsRunningTotal;   ///< Subtotal of processors in this and preceding groups

    //! Total number of processor groups in the system
    static int NumGroups;

    //! Index of the group with a slot reserved for the first external thread
    /** In the context of multiple processor groups support current implementation
        defines "the first external thread" as the first thread to invoke
        AvailableHwConcurrency().

        TODO: Implement a dynamic scheme remapping workers depending on the pending
              external threads affinity. **/
    static int HoleIndex;
};
277
// Out-of-class definitions of ProcessorGroupInfo statics; values are refined
// by initialize_hardware_concurrency_info.
int ProcessorGroupInfo::NumGroups = 1;
int ProcessorGroupInfo::HoleIndex = 0;

// Per-group information; only the first NumGroups entries are meaningful.
ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];
282
// Local mirror of the Win32 GROUP_AFFINITY structure, declared here so the
// build does not depend on an SDK that defines it.
struct TBB_GROUP_AFFINITY {
    DWORD_PTR Mask;
    WORD   Group;
    WORD   Reserved[3];
};

// Processor-groups API entry points resolved from Kernel32.dll at run time;
// they stay NULL when the running OS does not export them.
static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = NULL;
static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = NULL;
static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread,
                        const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff );
static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* );

// Mapping of Kernel32 exports to the function pointers above.
static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
      DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount)
    , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount)
    , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
    , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
};
301
// Detects the number of processors and the processor-group topology on
// Windows, filling theProcessorGroups and ProcessorGroupInfo statics.
// Executed exactly once, via atomic_do_once from AvailableHwConcurrency().
static void initialize_hardware_concurrency_info () {
    suppress_unused_warning(TBB_ALL_PROCESSOR_GROUPS);
#if __TBB_WIN8UI_SUPPORT
    // For these applications processor groups info is unavailable
    // Setting up a number of processors for one processor group
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency();
#else /* __TBB_WIN8UI_SUPPORT */
    dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable,
                  sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
    SYSTEM_INFO si;
    GetNativeSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    int nproc = 0;
    // Count the bits set in the process affinity mask.
    for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, NULL );
    // By default setting up a number of processors for one processor group
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;
    // Setting up processor groups in case the process does not restrict affinity mask and more than one processor group is present
    if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) {
        // The process does not have restricting affinity mask and multiple processor groups are possible
        ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount();
        __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, NULL );
        // Fail safety bootstrap. Release versions will limit available concurrency
        // level, while debug ones would assert.
        if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
            ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
        if ( ProcessorGroupInfo::NumGroups > 1 ) {
            TBB_GROUP_AFFINITY ga;
            // Remember the group of the first external thread; workers are
            // spread so that a slot in this group is kept for it (HoleIndex).
            if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
                ProcessorGroupInfo::HoleIndex = ga.Group;
            int nprocs = 0;
            for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
                ProcessorGroupInfo &pgi = theProcessorGroups[i];
                pgi.numProcs = (int)TBB_GetActiveProcessorCount(i);
                __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, NULL );
                // Full-group mask: all-ones when the group spans the whole word.
                pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
                pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
            }
            __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), NULL );
        }
    }
#endif /* __TBB_WIN8UI_SUPPORT */

    PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
    if (ProcessorGroupInfo::NumGroups>1)
        for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i)
            PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
}
354
// Returns the number of Windows processor groups detected.
// Must only be called after AvailableHwConcurrency() (asserts otherwise).
int NumberOfProcessorGroups() {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
    return ProcessorGroupInfo::NumGroups;
}
359
// Offset for the slot reserved for the first external thread
#define HoleAdjusted(procIdx, grpIdx) (procIdx + (holeIdx <= grpIdx))

// Maps a worker index to the processor group it should run in, skipping the
// slot reserved for the first external thread (the "hole" group).
int FindProcessorGroupIndex ( int procIdx ) {
    // In case of oversubscription spread extra workers in a round robin manner
    int holeIdx;
    const int numProcs = theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
    if ( procIdx >= numProcs - 1 ) {
        // Oversubscribed: no hole to preserve; wrap the index around.
        holeIdx = INT_MAX;
        procIdx = (procIdx - numProcs + 1) % numProcs;
    }
    else
        holeIdx = ProcessorGroupInfo::HoleIndex;
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "FindProcessorGroupIndex is used before AvailableHwConcurrency" );
    // Approximate the likely group index assuming all groups are of the same size
    int i = procIdx / theProcessorGroups[0].numProcs;
    // Make sure the approximation is a valid group index
    if (i >= ProcessorGroupInfo::NumGroups) i = ProcessorGroupInfo::NumGroups-1;
    // Now adjust the approximation up or down
    if ( theProcessorGroups[i].numProcsRunningTotal > HoleAdjusted(procIdx, i) ) {
        // Walk down while the preceding groups already cover procIdx.
        while ( theProcessorGroups[i].numProcsRunningTotal - theProcessorGroups[i].numProcs > HoleAdjusted(procIdx, i) ) {
            __TBB_ASSERT( i > 0, NULL );
            --i;
        }
    }
    else {
        // Walk up until the running total covers procIdx.
        do {
            ++i;
        } while ( theProcessorGroups[i].numProcsRunningTotal <= HoleAdjusted(procIdx, i) );
    }
    __TBB_ASSERT( i < ProcessorGroupInfo::NumGroups, NULL );
    return i;
}
393
394 void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
395 __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
396 if ( !TBB_SetThreadGroupAffinity )
397 return;
398 TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
399 TBB_SetThreadGroupAffinity( hThread, &ga, NULL );
400 }
401
// Returns the total number of processors across all processor groups.
// Thread-safe: detection runs once via atomic_do_once.
int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
}
406
407 /* End of _WIN32||_WIN64 implementation */
408 #else
409 #error AvailableHwConcurrency is not implemented for this OS
410 #endif
411
412 } // namespace r1
413 } // namespace detail
414 } // namespace tbb
415
416 #endif /* !__TBB_HardwareConcurrency */
417