/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

// Source file for miscellaneous entities that are infrequently referenced by
// an executing program, and whose implementation requires dynamic linking.

#include "misc.h"

#if !defined(__TBB_HardwareConcurrency)

#include "dynamic_link.h"
#include <stdio.h>
#include <limits.h>

#if _WIN32||_WIN64
#include <windows.h>
#if __TBB_WIN8UI_SUPPORT
#include <thread>
#endif
#else
#include <unistd.h>
#if __unix__
#if __linux__
#include <sys/sysinfo.h>
#endif
#include <cstring>
#include <sched.h>
#include <cerrno>
#elif __sun
#include <sys/sysinfo.h>
#elif __DragonFly__
#include <cerrno>
#include <cstring>
#include <sched.h>
#elif __FreeBSD__
#include <cerrno>
#include <cstring>
#include <sys/param.h>  // Required by <sys/cpuset.h>
#include <sys/cpuset.h>
#endif
#endif

namespace tbb {
namespace detail {
namespace r1 {

#if __TBB_USE_OS_AFFINITY_SYSCALL

#if __unix__
// Handlers for interoperation with libiomp
static int (*libiomp_try_restoring_original_mask)();
// Table for mapping to libiomp entry points
static const dynamic_link_descriptor iompLinkTable[] = {
    DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask )
};
#endif

static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) {
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
    if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#else /* __unix__ */
    if( sched_setaffinity( 0, maskSize, threadMask ) )
#endif
        // Here and below the error severity is lowered from the critical level
        // because the failure may happen during TBB library unload, when workers
        // are not waited on to complete (current RML policy, to be fixed).
        // handle_perror( errno, "setaffinity syscall" );
        runtime_warning( "setaffinity syscall failed" );
}

static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) {
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
    if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#else /* __unix__ */
    if( sched_getaffinity( 0, maskSize, threadMask ) )
#endif
        runtime_warning( "getaffinity syscall failed" );
}
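// Note: both wrappers above act on the calling thread only: an id of -1 with
// CPU_WHICH_TID selects the current thread for cpuset_*affinity(), and a pid
// of 0 selects the calling thread for sched_*affinity().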

static basic_mask_t* process_mask;
static int num_masks;

void destroy_process_mask() {
    if( process_mask ) {
        delete [] process_mask;
    }
}

#define curMaskSize sizeof(basic_mask_t) * num_masks
affinity_helper::~affinity_helper() {
    if( threadMask ) {
        if( is_changed ) {
            set_thread_affinity_mask( curMaskSize, threadMask );
        }
        delete [] threadMask;
    }
}
void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
    if( threadMask == NULL && num_masks ) { // TODO: assert num_masks validity?
        threadMask = new basic_mask_t [num_masks];
        std::memset( threadMask, 0, curMaskSize );
        get_thread_affinity_mask( curMaskSize, threadMask );
        if( restore_process_mask ) {
            __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
            is_changed = memcmp( process_mask, threadMask, curMaskSize );
            if( is_changed )
                set_thread_affinity_mask( curMaskSize, process_mask );
        } else {
            // Assume that the mask will be changed by the caller.
            is_changed = 1;
        }
    }
}
void affinity_helper::dismiss() {
    if( threadMask ) {
        delete [] threadMask;
        threadMask = NULL;
    }
    is_changed = 0;
}
#undef curMaskSize

static std::atomic<do_once_state> hardware_concurrency_info;

static int theNumProcs;

static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    basic_mask_t* processMask;
#ifdef __DragonFly__
    // DragonFly BSD uses a statically sized array (usually 256 bits) inside cpu_set_t.
    processMask = new cpu_set_t;
    int pid = getpid();
    err = sched_getaffinity( pid, sizeof(cpu_set_t), processMask );
#else
    const std::size_t BasicMaskSize = sizeof(basic_mask_t);
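    // The kernel's affinity mask may be wider than a single basic_mask_t, and
    // the get-affinity syscall fails when the supplied buffer is too small.
    // Retry with a doubled buffer until the call succeeds or a sanity cap on
    // the mask width is hit (16K CPUs on the BSDs, 256K CPUs elsewhere).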
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        std::memset( processMask, 0, curMaskSize );
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#else /* __unix__  */
        int pid = getpid();
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
#endif
        delete[] processMask;
        numMasks <<= 1;
    }
#endif
    if ( !err ) {
#ifdef __DragonFly__
        num_masks = 1; // static array in cpu_set_t
        // Iterate over every bit (CPU slot) of the statically sized mask.
        for ( std::size_t i = 0; availableProcs < maxProcs && i < sizeof(cpu_set_t) * CHAR_BIT; ++i ) {
            if ( CPU_ISSET( i, processMask ) )
                ++availableProcs;
        }
#else
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // do here because it's needed for affinity_helper to work
#if __unix__
        // For better coexistence with libiomp which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                std::memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss();  // thread mask has not changed
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
#endif
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // fail-safe
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}
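
// Illustrative call site (hypothetical, not part of this file):
//     int n = tbb::detail::r1::AvailableHwConcurrency();
//     // e.g. 8 when the process affinity mask admits 8 online processors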

/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
#elif __ANDROID__

// Work-around for Android: parse the number of available CPUs from sysfs,
// since the affinity system calls are unreliable there.
// Format of the "present" file is: ([<int>-<int>|<int>],)+
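// Example: a "present" file containing "0-3,5" describes CPUs 0,1,2,3 and 5,
// so num_cpus evaluates to 5.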
int AvailableHwConcurrency() {
    FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
    if (fp == NULL) return 1;
    int num_args, lower, upper, num_cpus=0;
    while ((num_args = fscanf(fp, "%d-%d", &lower, &upper)) != EOF) {
        switch(num_args) {
            case 2: num_cpus += upper - lower + 1; break;
            case 1: num_cpus += 1; break;
        }
        fscanf(fp, ","); // skip the separator
    }
    fclose(fp);
    return (num_cpus > 0) ? num_cpus : 1;
}

#elif defined(_SC_NPROCESSORS_ONLN)

int AvailableHwConcurrency() {
    int n = sysconf(_SC_NPROCESSORS_ONLN);
    return (n > 0) ? n : 1;
}

#elif _WIN32||_WIN64

static std::atomic<do_once_state> hardware_concurrency_info;

static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;

// Statically allocate an array for processor group information.
// Windows 7 supports at most 4 groups, but let's look ahead a little.
static const WORD MaxProcessorGroups = 64;

struct ProcessorGroupInfo {
    DWORD_PTR   mask;                   ///< Affinity mask covering the whole group
    int         numProcs;               ///< Number of processors in the group
    int         numProcsRunningTotal;   ///< Subtotal of processors in this and preceding groups

    //! Total number of processor groups in the system
    static int NumGroups;

    //! Index of the group with a slot reserved for the first external thread
    /** In the context of multiple processor group support, the current
        implementation defines "the first external thread" as the first thread
        to invoke AvailableHwConcurrency().

        TODO:   Implement a dynamic scheme remapping workers depending on the pending
                external threads affinity. **/
    static int HoleIndex;
};

int ProcessorGroupInfo::NumGroups = 1;
int ProcessorGroupInfo::HoleIndex = 0;

ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];

struct TBB_GROUP_AFFINITY {
    DWORD_PTR Mask;
    WORD   Group;
    WORD   Reserved[3];
};
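// Mirrors the layout of the GROUP_AFFINITY structure introduced with Windows 7,
// declared locally so this file also compiles against pre-Windows 7 SDK headers.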

static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = NULL;
static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = NULL;
static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread,
                        const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff ) = NULL;
static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* ) = NULL;
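
// The processor group APIs exist only on Windows 7 and later, so they are
// resolved at run time; on older systems the pointers remain NULL and the
// code below falls back to a single processor group.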

static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
      DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount)
    , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount)
    , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
    , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
};

static void initialize_hardware_concurrency_info () {
    suppress_unused_warning(TBB_ALL_PROCESSOR_GROUPS);
#if __TBB_WIN8UI_SUPPORT
    // Processor groups info is unavailable to these applications;
    // set the processor count for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency();
#else /* __TBB_WIN8UI_SUPPORT */
    dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable,
                  sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
    SYSTEM_INFO si;
    GetNativeSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    int nproc = 0;
    for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, NULL );
    // By default, set up the processor count for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;
    // Set up processor groups when the process affinity mask is not restricted
    // and more than one processor group may be present.
    if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) {
        // The process does not have a restricting affinity mask and multiple processor groups are possible
        ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount();
        __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, NULL );
        // Fail-safety bootstrap. Release versions will limit the available concurrency
        // level, while debug ones assert.
        if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
            ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
        if ( ProcessorGroupInfo::NumGroups > 1 ) {
            TBB_GROUP_AFFINITY ga;
            if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
                ProcessorGroupInfo::HoleIndex = ga.Group;
            int nprocs = 0;
            for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
                ProcessorGroupInfo  &pgi = theProcessorGroups[i];
                pgi.numProcs = (int)TBB_GetActiveProcessorCount(i);
                __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, NULL );
                pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
                pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
            }
            __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), NULL );
        }
    }
#endif /* __TBB_WIN8UI_SUPPORT */

    PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
    if (ProcessorGroupInfo::NumGroups>1)
        for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i)
            PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
}

int NumberOfProcessorGroups() {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
    return ProcessorGroupInfo::NumGroups;
}

// Offset for the slot reserved for the first external thread
#define HoleAdjusted(procIdx, grpIdx) (procIdx + (holeIdx <= grpIdx))
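// The reserved slot lives in group holeIdx; for groups at or past it, the
// effective processor index is shifted up by one so that slot is skipped.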

int FindProcessorGroupIndex ( int procIdx ) {
    // In case of oversubscription spread extra workers in a round-robin manner
    int holeIdx;
    const int numProcs = theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
    if ( procIdx >= numProcs - 1 ) {
        holeIdx = INT_MAX;
        procIdx = (procIdx - numProcs + 1) % numProcs;
    }
    else
        holeIdx = ProcessorGroupInfo::HoleIndex;
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "FindProcessorGroupIndex is used before AvailableHwConcurrency" );
    // Approximate the likely group index assuming all groups are of the same size
    int i = procIdx / theProcessorGroups[0].numProcs;
    // Make sure the approximation is a valid group index
    if (i >= ProcessorGroupInfo::NumGroups) i = ProcessorGroupInfo::NumGroups-1;
    // Now adjust the approximation up or down
    if ( theProcessorGroups[i].numProcsRunningTotal > HoleAdjusted(procIdx, i) ) {
        while ( theProcessorGroups[i].numProcsRunningTotal - theProcessorGroups[i].numProcs > HoleAdjusted(procIdx, i) ) {
            __TBB_ASSERT( i > 0, NULL );
            --i;
        }
    }
    else {
        do {
            ++i;
        } while ( theProcessorGroups[i].numProcsRunningTotal <= HoleAdjusted(procIdx, i) );
    }
    __TBB_ASSERT( i < ProcessorGroupInfo::NumGroups, NULL );
    return i;
}
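
// Worked example (illustrative): with two groups of 4 processors each and the
// hole in group 0, procIdx 0..2 map to group 0 (one slot there is reserved),
// procIdx 3..6 map to group 1, and procIdx 7 (oversubscription) wraps around
// to group 0 with the hole ignored.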

void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
    if ( !TBB_SetThreadGroupAffinity )
        return;
    TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
    TBB_SetThreadGroupAffinity( hThread, &ga, NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
}

/* End of _WIN32||_WIN64 implementation */
#else
    #error AvailableHwConcurrency is not implemented for this OS
#endif

} // namespace r1
} // namespace detail
} // namespace tbb

#endif /* !__TBB_HardwareConcurrency */