// The libMesh Finite Element Library.
// Copyright (C) 2002-2020 Benjamin S. Kirk, John W. Peterson, Roy H. Stogner

// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.

// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

#ifndef LIBMESH_THREADS_PTHREAD_H
#define LIBMESH_THREADS_PTHREAD_H

// Do not try to #include this header directly, it is designed to be
// #included directly by threads.h
#ifndef LIBMESH_SQUASH_HEADER_WARNING
# warning "This file is designed to be included through libmesh/threads.h"
#else

#ifdef LIBMESH_HAVE_PTHREAD

// C++ includes
#ifdef LIBMESH_HAVE_CXX11_THREAD
# include <thread>
#endif

#include "libmesh/libmesh_logging.h"
#include <pthread.h>
#include <algorithm>
#include <vector>

#ifdef __APPLE__
#  ifdef __MAC_10_12
#    include <os/lock.h>
#  else
#    include <libkern/OSAtomic.h>
#  endif
#endif

// Thread-Local-Storage macros
#ifdef LIBMESH_HAVE_CXX11_THREAD
#  define LIBMESH_TLS_TYPE(type)  thread_local type
#  define LIBMESH_TLS_REF(value)  (value)
#else // Maybe support gcc __thread eventually?
#  define LIBMESH_TLS_TYPE(type)  type
#  define LIBMESH_TLS_REF(value)  (value)
#endif
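
// Usage sketch for the TLS macros (illustrative only; `counter` is a
// hypothetical name).  Note that in the non-C++11 branch the variable
// is *not* actually thread-local, so code using these macros must
// remain correct when all threads share a single instance:
//
//   static LIBMESH_TLS_TYPE(unsigned int) counter = 0;
//   LIBMESH_TLS_REF(counter)++;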

namespace libMesh
{

namespace Threads
{


#ifdef LIBMESH_HAVE_CXX11_THREAD
/**
 * Use std::thread when available.
 */
typedef std::thread Thread;

#else

/**
 * Use the non-concurrent placeholder.
 */
typedef NonConcurrentThread Thread;

#endif // LIBMESH_HAVE_CXX11_THREAD
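
// Usage sketch (illustrative only; `my_task` is a hypothetical callable).
// With C++11 this is just std::thread; the NonConcurrentThread fallback
// instead runs the callable immediately in the calling thread:
//
//   Threads::Thread t(my_task);
//   t.join();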


/**
 * Spin mutex.  Implements mutual exclusion by busy-waiting in user
 * space for the lock to be acquired.  (On macOS 10.12 and later the
 * implementation below is os_unfair_lock, which may block rather
 * than spin but provides the same locking semantics.)
 */
#ifdef __APPLE__
#ifdef __MAC_10_12
class spin_mutex
{
public:
  spin_mutex() { ulock = OS_UNFAIR_LOCK_INIT; }
  ~spin_mutex() {}

  void lock () { os_unfair_lock_lock(&ulock); }
  void unlock () { os_unfair_lock_unlock(&ulock); }

  class scoped_lock
  {
  public:
    scoped_lock () : smutex(nullptr) {}
    explicit scoped_lock ( spin_mutex & in_smutex ) : smutex(&in_smutex) { smutex->lock(); }

    ~scoped_lock () { release(); }

    void acquire ( spin_mutex & in_smutex ) { smutex = &in_smutex; smutex->lock(); }
    void release () { if (smutex) smutex->unlock(); smutex = nullptr; }

  private:
    spin_mutex * smutex;
  };

private:
  os_unfair_lock ulock;
};
#else
class spin_mutex
{
public:
  spin_mutex() : slock(0) {} // The convention is that the lock being zero is _unlocked_
  ~spin_mutex() {}

  void lock () { OSSpinLockLock(&slock); }
  void unlock () { OSSpinLockUnlock(&slock); }

  class scoped_lock
  {
  public:
    scoped_lock () : smutex(nullptr) {}
    explicit scoped_lock ( spin_mutex & in_smutex ) : smutex(&in_smutex) { smutex->lock(); }

    ~scoped_lock () { release(); }

    void acquire ( spin_mutex & in_smutex ) { smutex = &in_smutex; smutex->lock(); }
    void release () { if (smutex) smutex->unlock(); smutex = nullptr; }

  private:
    spin_mutex * smutex;
  };

private:
  OSSpinLock slock;
};
#endif
#else
class spin_mutex
{
public:
  // Might want to use PTHREAD_MUTEX_ADAPTIVE_NP on Linux, but it's not available on OSX.
  spin_mutex() { pthread_spin_init(&slock, PTHREAD_PROCESS_PRIVATE); }
  ~spin_mutex() { pthread_spin_destroy(&slock); }

  void lock () { pthread_spin_lock(&slock); }
  void unlock () { pthread_spin_unlock(&slock); }

  class scoped_lock
  {
  public:
    scoped_lock () : smutex(nullptr) {}
    explicit scoped_lock ( spin_mutex & in_smutex ) : smutex(&in_smutex) { smutex->lock(); }

    ~scoped_lock () { release(); }

    void acquire ( spin_mutex & in_smutex ) { smutex = &in_smutex; smutex->lock(); }
    void release () { if (smutex) smutex->unlock(); smutex = nullptr; }

  private:
    spin_mutex * smutex;
  };

private:
  pthread_spinlock_t slock;
};
#endif // __APPLE__
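
// Usage sketch for spin_mutex::scoped_lock (illustrative only; `my_mutex`
// and the critical section are hypothetical):
//
//   static Threads::spin_mutex my_mutex;
//   {
//     Threads::spin_mutex::scoped_lock lock(my_mutex); // locks here
//     // ... critical section ...
//   }                                                  // unlocks on scope exit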



/**
 * Recursive mutex.  Unlike the spin mutex above, this is a full
 * (potentially blocking) pthread mutex, created with the
 * PTHREAD_MUTEX_RECURSIVE attribute so the same thread can acquire
 * it multiple times without deadlocking.
 */
class recursive_mutex
{
public:
  // Might want to use PTHREAD_MUTEX_ADAPTIVE_NP on Linux, but it's not available on OSX.
  recursive_mutex()
  {
    pthread_mutexattr_init(&attr);
    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
    pthread_mutex_init(&mutex, &attr);
  }
  ~recursive_mutex()
  {
    pthread_mutex_destroy(&mutex);
    pthread_mutexattr_destroy(&attr); // attr was initialized in the constructor, so destroy it too
  }

  void lock () { pthread_mutex_lock(&mutex); }
  void unlock () { pthread_mutex_unlock(&mutex); }

  class scoped_lock
  {
  public:
    scoped_lock () : rmutex(nullptr) {}
    explicit scoped_lock ( recursive_mutex & in_rmutex ) : rmutex(&in_rmutex) { rmutex->lock(); }

    ~scoped_lock () { release(); }

    void acquire ( recursive_mutex & in_rmutex ) { rmutex = &in_rmutex; rmutex->lock(); }
    void release () { if (rmutex) rmutex->unlock(); rmutex = nullptr; }

  private:
    recursive_mutex * rmutex;
  };

private:
  pthread_mutex_t mutex;
  pthread_mutexattr_t attr;
};
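
// Usage sketch (illustrative only; `my_rmutex` is hypothetical): the same
// thread may take the lock again while already holding it, e.g. when a
// locked function calls another function that also locks:
//
//   static Threads::recursive_mutex my_rmutex;
//   Threads::recursive_mutex::scoped_lock outer(my_rmutex);
//   Threads::recursive_mutex::scoped_lock inner(my_rmutex); // OK: recursive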

/**
 * Returns the number of threads to use: no more than the requested
 * libMesh::n_threads(), no more than the number of items in the
 * range, and at least 1.
 */
template <typename Range>
unsigned int num_pthreads(Range & range)
{
  std::size_t min = std::min((std::size_t)libMesh::n_threads(), range.size());
  return min > 0 ? cast_int<unsigned int>(min) : 1;
}

/**
 * Simple aggregate pairing a Range with the Body to apply to it; used
 * to pass both through the single void * argument of pthread_create().
 */
template <typename Range, typename Body>
class RangeBody
{
public:
  Range * range;
  Body * body;
};

/**
 * Thread entry point: unpacks the RangeBody argument and applies the
 * body to the range.
 */
template <typename Range, typename Body>
void * run_body(void * args)
{
  RangeBody<Range, Body> * range_body = (RangeBody<Range, Body> *)args;

  Body & body = *range_body->body;
  Range & range = *range_body->range;

  body(range);

  return nullptr;
}
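
// A minimal sketch of the Body interface that run_body() invokes
// (illustrative only; `PrintBody` and `MyRange` are hypothetical, not
// part of libMesh).  For parallel_for() a Body only needs
// operator()(Range &); parallel_reduce() additionally needs a splitting
// constructor and a join() method (see the sketch after
// parallel_reduce() below):
//
//   struct PrintBody
//   {
//     void operator() (const MyRange & range) const
//     {
//       for (MyRange::const_iterator it = range.begin(); it != range.end(); ++it)
//         libMesh::out << *it << '\n';
//     }
//   };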

/**
 * Scheduler to manage threads.  In the pthreads backend this is only
 * an API-compatibility placeholder (mirroring TBB's
 * task_scheduler_init); all of its operations are no-ops.
 */
class task_scheduler_init
{
public:
  static const int automatic = -1;
  explicit task_scheduler_init (int = automatic) {}
  void initialize (int = automatic) {}
  void terminate () {}
};

//-------------------------------------------------------------------
/**
 * Dummy "splitting object" used to distinguish splitting constructors
 * from copy constructors.
 */
class split {};




//-------------------------------------------------------------------
/**
 * Execute the provided function object in parallel on the specified
 * range.
 */
template <typename Range, typename Body>
inline
void parallel_for (const Range & range, const Body & body)
{
  Threads::BoolAcquire b(Threads::in_threads);

  // If we're running in serial - just run!
  if (libMesh::n_threads() == 1)
  {
    body(range);
    return;
  }

#ifdef LIBMESH_ENABLE_PERFORMANCE_LOGGING
  const bool logging_was_enabled = libMesh::perflog.logging_enabled();

  if (libMesh::n_threads() > 1)
    libMesh::perflog.disable_logging();
#endif
  unsigned int n_threads = num_pthreads(range);

  std::vector<Range *> ranges(n_threads);
  std::vector<RangeBody<const Range, const Body>> range_bodies(n_threads);
  std::vector<pthread_t> threads(n_threads);

  // Create the ranges for each thread
  std::size_t range_size = range.size() / n_threads;

  typename Range::const_iterator current_beginning = range.begin();

  for (unsigned int i=0; i<n_threads; i++)
    {
      std::size_t this_range_size = range_size;

      if (i+1 == n_threads)
        this_range_size += range.size() % n_threads; // Give the last one the remaining work to do

      ranges[i] = new Range(range, current_beginning, current_beginning + this_range_size);

      current_beginning = current_beginning + this_range_size;
    }

  // Create the RangeBody arguments
  for (unsigned int i=0; i<n_threads; i++)
    {
      range_bodies[i].range = ranges[i];
      range_bodies[i].body = &body;
    }
  // Create the threads.  It may seem redundant to wrap a pragma in
  // #ifdefs... but GCC warns about an "unknown pragma" if it
  // encounters this line of code when -fopenmp is not passed to the
  // compiler.
#ifdef LIBMESH_HAVE_OPENMP
#pragma omp parallel for schedule (static)
#endif
  // The use of 'int' instead of unsigned for the iteration variable
  // is deliberate here.  This is an OpenMP loop, and some older
  // compilers warn when you don't use int for the loop index.  The
  // reason has to do with signed vs. unsigned integer overflow
  // behavior and optimization.
  // http://blog.llvm.org/2011/05/what-every-c-programmer-should-know.html
  for (int i=0; i<static_cast<int>(n_threads); i++)
    {
#if !LIBMESH_HAVE_OPENMP
      pthread_create(&threads[i], nullptr, &run_body<Range, Body>, (void *)&range_bodies[i]);
#else
      run_body<Range, Body>((void *)&range_bodies[i]);
#endif
    }

#if !LIBMESH_HAVE_OPENMP
  // Wait for them to finish
  for (unsigned int i=0; i<n_threads; i++)
    pthread_join(threads[i], nullptr);
#endif

  // Clean up
  for (unsigned int i=0; i<n_threads; i++)
    delete ranges[i];

#ifdef LIBMESH_ENABLE_PERFORMANCE_LOGGING
  if (libMesh::n_threads() > 1 && logging_was_enabled)
    libMesh::perflog.enable_logging();
#endif
}

/**
 * Execute the provided function object in parallel on the specified
 * range with the specified partitioner.  The partitioner is ignored
 * in the pthreads implementation.
 */
template <typename Range, typename Body, typename Partitioner>
inline
void parallel_for (const Range & range, const Body & body, const Partitioner &)
{
  parallel_for (range, body);
}
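
// Usage sketch (illustrative only; `MyRange`, `PrintBody` and `my_range`
// are hypothetical).  Note the interface this backend assumes of Range:
// size(), begin(), and a "subrange" constructor Range(other, first, last),
// which is how the range is split across threads above.  libMesh's
// StoredRange provides such an interface:
//
//   MyRange my_range(/* ... */);
//   PrintBody body;
//   Threads::parallel_for (my_range, body);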

/**
 * Execute the provided reduction operation in parallel on the specified
 * range.
 */
template <typename Range, typename Body>
inline
void parallel_reduce (const Range & range, Body & body)
{
  Threads::BoolAcquire b(Threads::in_threads);

  // If we're running in serial - just run!
  if (libMesh::n_threads() == 1)
  {
    body(range);
    return;
  }

#ifdef LIBMESH_ENABLE_PERFORMANCE_LOGGING
  const bool logging_was_enabled = libMesh::perflog.logging_enabled();

  if (libMesh::n_threads() > 1)
    libMesh::perflog.disable_logging();
#endif

  unsigned int n_threads = num_pthreads(range);

  std::vector<Range *> ranges(n_threads);
  std::vector<Body *> bodies(n_threads);
  std::vector<RangeBody<Range, Body>> range_bodies(n_threads);

  // Create copies of the body for each thread
  bodies[0] = &body; // Use the original body for the first one
  for (unsigned int i=1; i<n_threads; i++)
    bodies[i] = new Body(body, Threads::split());

  // Create the ranges for each thread
  std::size_t range_size = range.size() / n_threads;

  typename Range::const_iterator current_beginning = range.begin();

  for (unsigned int i=0; i<n_threads; i++)
    {
      std::size_t this_range_size = range_size;

      if (i+1 == n_threads)
        this_range_size += range.size() % n_threads; // Give the last one the remaining work to do

      ranges[i] = new Range(range, current_beginning, current_beginning + this_range_size);

      current_beginning = current_beginning + this_range_size;
    }

  // Create the RangeBody arguments
  for (unsigned int i=0; i<n_threads; i++)
    {
      range_bodies[i].range = ranges[i];
      range_bodies[i].body = bodies[i];
    }

  // Create the threads
  std::vector<pthread_t> threads(n_threads);

  // It may seem redundant to wrap a pragma in #ifdefs... but GCC
  // warns about an "unknown pragma" if it encounters this line of
  // code when -fopenmp is not passed to the compiler.
#ifdef LIBMESH_HAVE_OPENMP
#pragma omp parallel for schedule (static)
#endif
  // The use of 'int' instead of unsigned for the iteration variable
  // is deliberate here.  This is an OpenMP loop, and some older
  // compilers warn when you don't use int for the loop index.  The
  // reason has to do with signed vs. unsigned integer overflow
  // behavior and optimization.
  // http://blog.llvm.org/2011/05/what-every-c-programmer-should-know.html
  for (int i=0; i<static_cast<int>(n_threads); i++)
    {
#if !LIBMESH_HAVE_OPENMP
      pthread_create(&threads[i], nullptr, &run_body<Range, Body>, (void *)&range_bodies[i]);
#else
      run_body<Range, Body>((void *)&range_bodies[i]);
#endif
    }

#if !LIBMESH_HAVE_OPENMP
  // Wait for them to finish
  for (unsigned int i=0; i<n_threads; i++)
    pthread_join(threads[i], nullptr);
#endif

  // Join them all down to the original Body
  for (unsigned int i=n_threads-1; i != 0; i--)
    bodies[i-1]->join(*bodies[i]);

  // Clean up
  for (unsigned int i=1; i<n_threads; i++)
    delete bodies[i];
  for (unsigned int i=0; i<n_threads; i++)
    delete ranges[i];

#ifdef LIBMESH_ENABLE_PERFORMANCE_LOGGING
  if (libMesh::n_threads() > 1 && logging_was_enabled)
    libMesh::perflog.enable_logging();
#endif
}

/**
 * Execute the provided reduction operation in parallel on the specified
 * range with the specified partitioner.  The partitioner is ignored
 * in the pthreads implementation.
 */
template <typename Range, typename Body, typename Partitioner>
inline
void parallel_reduce (const Range & range, Body & body, const Partitioner &)
{
  parallel_reduce(range, body);
}
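
// A minimal sketch of a reduction Body (illustrative only; `SumBody` and
// `MyRange` are hypothetical).  Beyond operator()(Range &),
// parallel_reduce() requires a splitting constructor
// Body(Body &, Threads::split), used above to make per-thread copies, and
// a join() that merges a sibling's partial result:
//
//   struct SumBody
//   {
//     double sum;
//
//     SumBody () : sum(0.) {}
//     SumBody (SumBody &, Threads::split) : sum(0.) {}  // fresh accumulator
//
//     void operator() (const MyRange & range)
//     {
//       for (MyRange::const_iterator it = range.begin(); it != range.end(); ++it)
//         sum += *it;
//     }
//
//     void join (const SumBody & other) { sum += other.sum; } // merge partials
//   };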


/**
 * Provides a subset of atomic operations on a value of type T.  Each
 * mutating operation takes a spin mutex, so it executes on only one
 * thread at a time.
 */
template <typename T>
class atomic
{
public:
  atomic () : val(0) {}
  operator T () { return val; }

  T operator=( T value )
  {
    spin_mutex::scoped_lock lock(smutex);
    val = value;
    return val;
  }

  atomic<T> & operator=( const atomic<T> & value )
  {
    spin_mutex::scoped_lock lock(smutex);
    val = value.val; // read the member directly; operator T() is non-const
    return *this;
  }


  T operator+=(T value)
  {
    spin_mutex::scoped_lock lock(smutex);
    val += value;
    return val;
  }

  T operator-=(T value)
  {
    spin_mutex::scoped_lock lock(smutex);
    val -= value;
    return val;
  }

  T operator++()
  {
    spin_mutex::scoped_lock lock(smutex);
    val++;
    return val;
  }

  T operator++(int)
  {
    spin_mutex::scoped_lock lock(smutex);
    T old = val; // post-increment returns the value *before* the update
    val++;
    return old;
  }

  T operator--()
  {
    spin_mutex::scoped_lock lock(smutex);
    val--;
    return val;
  }

  T operator--(int)
  {
    spin_mutex::scoped_lock lock(smutex);
    T old = val; // post-decrement returns the value *before* the update
    val--;
    return old;
  }
private:
  T val;
  spin_mutex smutex;
};
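
// Usage sketch (illustrative only; `n_processed` is a hypothetical name).
// Useful for counters updated from several threads.  Note that compound
// logic built from *separate* atomic operations is still racy:
//
//   static Threads::atomic<unsigned int> n_processed;
//   n_processed++;                         // safe: one locked operation
//   n_processed += 2;                      // safe
//   unsigned int snapshot = n_processed;   // single unlocked read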

} // namespace Threads

} // namespace libMesh

#endif // #ifdef LIBMESH_HAVE_PTHREAD

#endif // LIBMESH_SQUASH_HEADER_WARNING

#endif // LIBMESH_THREADS_PTHREAD_H