/*
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * The Original Code is Copyright (C) 2006 Blender Foundation
 * All rights reserved.
 */

/** \file
 * \ingroup bli
 */

#include <errno.h>
#include <stdlib.h>
#include <string.h>

#include "MEM_guardedalloc.h"

#include "BLI_gsqueue.h"
#include "BLI_listbase.h"
#include "BLI_system.h"
#include "BLI_task.h"
#include "BLI_threads.h"

#include "PIL_time.h"

/* for checking system threads - BLI_system_thread_count */
#ifdef WIN32
#  include <sys/timeb.h>
#  include <windows.h>
#elif defined(__APPLE__)
#  include <sys/sysctl.h>
#  include <sys/types.h>
#else
#  include <sys/time.h>
#  include <unistd.h>
#endif

#ifdef WITH_TBB
#  include <tbb/spin_mutex.h>
#endif

#include "atomic_ops.h"
#include "numaapi.h"

#if defined(__APPLE__) && defined(_OPENMP) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 2) && \
    !defined(__clang__)
#  define USE_APPLE_OMP_FIX
#endif

#ifdef USE_APPLE_OMP_FIX
/* ************** libgomp (Apple gcc 4.2.1) TLS bug workaround *************** */
extern pthread_key_t gomp_tls_key;
static void *thread_tls_data;
#endif
/**
 * Basic Thread Control API
 * ========================
 *
 * Many threading cases have X amount of jobs, while only Y amount of
 * threads are useful (typically the number of CPUs).
 *
 * This code can be used to start a maximum amount of 'thread slots', which
 * then can be filled in a loop with an idle timer.
 *
 * A sample loop can look like this (pseudo C):
 *
 * \code{.c}
 *
 *   ListBase lb;
 *   int max_threads = 2;
 *   int cont = 1;
 *
 *   BLI_threadpool_init(&lb, do_something_func, max_threads);
 *
 *   while (cont) {
 *     if (BLI_available_threads(&lb) && !(escape loop event)) {
 *       // get new job (data pointer)
 *       // tag job 'processed'
 *       BLI_threadpool_insert(&lb, job);
 *     }
 *     else PIL_sleep_ms(50);
 *
 *     // Find if a job is ready; do_something_func() should write this into the job somewhere.
 *     cont = 0;
 *     for (go over all jobs) {
 *       if (job is ready) {
 *         if (job was not removed) {
 *           BLI_threadpool_remove(&lb, job);
 *         }
 *       }
 *       else cont = 1;
 *     }
 *     // Conditions to exit the loop.
 *     if (escape loop event) {
 *       if (BLI_available_threads(&lb) == max_threads) {
 *         break;
 *       }
 *     }
 *   }
 *
 *   BLI_threadpool_end(&lb);
 *
 * \endcode
 */
static pthread_mutex_t _image_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t _image_draw_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t _viewer_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t _custom1_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t _nodes_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t _movieclip_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t _colormanage_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t _fftw_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t _view3d_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t mainid;
static bool is_numa_available = false;
static unsigned int thread_levels = 0; /* threads can be invoked inside threads */
static int num_threads_override = 0;

/* just a max for security reasons */
#define RE_MAX_THREAD BLENDER_MAX_THREADS

typedef struct ThreadSlot {
  struct ThreadSlot *next, *prev;
  void *(*do_thread)(void *);
  void *callerdata;
  pthread_t pthread;
  int avail;
} ThreadSlot;

void BLI_threadapi_init(void)
{
  mainid = pthread_self();
  if (numaAPI_Initialize() == NUMAAPI_SUCCESS) {
    is_numa_available = true;
  }
}

void BLI_threadapi_exit(void)
{
}

/* tot = 0 only initializes the malloc mutex in a safe way (see sequence.c);
 * problem otherwise: scene render will kill off the mutex!
 */

void BLI_threadpool_init(ListBase *threadbase, void *(*do_thread)(void *), int tot)
{
  int a;

  if (threadbase != nullptr && tot > 0) {
    BLI_listbase_clear(threadbase);

    if (tot > RE_MAX_THREAD) {
      tot = RE_MAX_THREAD;
    }
    else if (tot < 1) {
      tot = 1;
    }

    for (a = 0; a < tot; a++) {
      ThreadSlot *tslot = static_cast<ThreadSlot *>(MEM_callocN(sizeof(ThreadSlot), "threadslot"));
      BLI_addtail(threadbase, tslot);
      tslot->do_thread = do_thread;
      tslot->avail = 1;
    }
  }

  unsigned int level = atomic_fetch_and_add_u(&thread_levels, 1);
  if (level == 0) {
#ifdef USE_APPLE_OMP_FIX
    /* Workaround for Apple gcc 4.2.1 OMP vs background thread bug:
     * we copy the GOMP thread local storage pointer here so it can be
     * set again inside the threads that we start. */
    thread_tls_data = pthread_getspecific(gomp_tls_key);
#endif
  }
}

/* amount of available threads */
int BLI_available_threads(ListBase *threadbase)
{
  int counter = 0;

  LISTBASE_FOREACH (ThreadSlot *, tslot, threadbase) {
    if (tslot->avail) {
      counter++;
    }
  }

  return counter;
}

/* returns thread number, for sample patterns or threadsafe tables */
int BLI_threadpool_available_thread_index(ListBase *threadbase)
{
  int counter = 0;

  LISTBASE_FOREACH (ThreadSlot *, tslot, threadbase) {
    if (tslot->avail) {
      return counter;
    }
    ++counter;
  }

  return 0;
}

static void *tslot_thread_start(void *tslot_p)
{
  ThreadSlot *tslot = (ThreadSlot *)tslot_p;

#ifdef USE_APPLE_OMP_FIX
  /* Workaround for Apple gcc 4.2.1 OMP vs background thread bug:
   * set the GOMP thread local storage pointer which was copied beforehand. */
  pthread_setspecific(gomp_tls_key, thread_tls_data);
#endif

  return tslot->do_thread(tslot->callerdata);
}

int BLI_thread_is_main(void)
{
  return pthread_equal(pthread_self(), mainid);
}

void BLI_threadpool_insert(ListBase *threadbase, void *callerdata)
{
  LISTBASE_FOREACH (ThreadSlot *, tslot, threadbase) {
    if (tslot->avail) {
      tslot->avail = 0;
      tslot->callerdata = callerdata;
      pthread_create(&tslot->pthread, nullptr, tslot_thread_start, tslot);
      return;
    }
  }
  printf("ERROR: could not insert thread slot\n");
}

void BLI_threadpool_remove(ListBase *threadbase, void *callerdata)
{
  LISTBASE_FOREACH (ThreadSlot *, tslot, threadbase) {
    if (tslot->callerdata == callerdata) {
      pthread_join(tslot->pthread, nullptr);
      tslot->callerdata = nullptr;
      tslot->avail = 1;
    }
  }
}

void BLI_threadpool_remove_index(ListBase *threadbase, int index)
{
  int counter = 0;

  LISTBASE_FOREACH (ThreadSlot *, tslot, threadbase) {
    if (counter == index && tslot->avail == 0) {
      pthread_join(tslot->pthread, nullptr);
      tslot->callerdata = nullptr;
      tslot->avail = 1;
      break;
    }
    ++counter;
  }
}

void BLI_threadpool_clear(ListBase *threadbase)
{
  LISTBASE_FOREACH (ThreadSlot *, tslot, threadbase) {
    if (tslot->avail == 0) {
      pthread_join(tslot->pthread, nullptr);
      tslot->callerdata = nullptr;
      tslot->avail = 1;
    }
  }
}

void BLI_threadpool_end(ListBase *threadbase)
{

  /* Only needed if there's actually some stuff to end;
   * this way we don't end up decrementing thread_levels on an empty threadbase. */
  if (threadbase == nullptr || BLI_listbase_is_empty(threadbase)) {
    return;
  }

  LISTBASE_FOREACH (ThreadSlot *, tslot, threadbase) {
    if (tslot->avail == 0) {
      pthread_join(tslot->pthread, nullptr);
    }
  }
  BLI_freelistN(threadbase);
}

/* System Information */

/* how many threads are native on this system? */
int BLI_system_thread_count(void)
{
  static int t = -1;

  if (num_threads_override != 0) {
    return num_threads_override;
  }
  if (LIKELY(t != -1)) {
    return t;
  }

  {
#ifdef WIN32
    SYSTEM_INFO info;
    GetSystemInfo(&info);
    t = (int)info.dwNumberOfProcessors;
#else
#  ifdef __APPLE__
    int mib[2];
    size_t len;

    mib[0] = CTL_HW;
    mib[1] = HW_NCPU;
    len = sizeof(t);
    sysctl(mib, 2, &t, &len, nullptr, 0);
#  else
    t = (int)sysconf(_SC_NPROCESSORS_ONLN);
#  endif
#endif
  }

  CLAMP(t, 1, RE_MAX_THREAD);

  return t;
}

void BLI_system_num_threads_override_set(int num)
{
  num_threads_override = num;
}

int BLI_system_num_threads_override_get(void)
{
  return num_threads_override;
}

/* Global Mutex Locks */

static ThreadMutex *global_mutex_from_type(const int type)
{
  switch (type) {
    case LOCK_IMAGE:
      return &_image_lock;
    case LOCK_DRAW_IMAGE:
      return &_image_draw_lock;
    case LOCK_VIEWER:
      return &_viewer_lock;
    case LOCK_CUSTOM1:
      return &_custom1_lock;
    case LOCK_NODES:
      return &_nodes_lock;
    case LOCK_MOVIECLIP:
      return &_movieclip_lock;
    case LOCK_COLORMANAGE:
      return &_colormanage_lock;
    case LOCK_FFTW:
      return &_fftw_lock;
    case LOCK_VIEW3D:
      return &_view3d_lock;
    default:
      BLI_assert(0);
      return nullptr;
  }
}

void BLI_thread_lock(int type)
{
  pthread_mutex_lock(global_mutex_from_type(type));
}

void BLI_thread_unlock(int type)
{
  pthread_mutex_unlock(global_mutex_from_type(type));
}
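
/* Illustrative sketch (not taken from the original comments): the global locks
 * above are keyed by a LOCK_* type rather than by a mutex pointer, so callers
 * only need the type constant. A typical guarded section would look like:
 *
 * \code{.c}
 *   BLI_thread_lock(LOCK_IMAGE);
 *   // ... touch shared image data ...
 *   BLI_thread_unlock(LOCK_IMAGE);
 * \endcode
 */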

/* Mutex Locks */

void BLI_mutex_init(ThreadMutex *mutex)
{
  pthread_mutex_init(mutex, nullptr);
}

void BLI_mutex_lock(ThreadMutex *mutex)
{
  pthread_mutex_lock(mutex);
}

void BLI_mutex_unlock(ThreadMutex *mutex)
{
  pthread_mutex_unlock(mutex);
}

bool BLI_mutex_trylock(ThreadMutex *mutex)
{
  return (pthread_mutex_trylock(mutex) == 0);
}

void BLI_mutex_end(ThreadMutex *mutex)
{
  pthread_mutex_destroy(mutex);
}

ThreadMutex *BLI_mutex_alloc(void)
{
  ThreadMutex *mutex = static_cast<ThreadMutex *>(MEM_callocN(sizeof(ThreadMutex), "ThreadMutex"));
  BLI_mutex_init(mutex);
  return mutex;
}

void BLI_mutex_free(ThreadMutex *mutex)
{
  BLI_mutex_end(mutex);
  MEM_freeN(mutex);
}
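
/* Illustrative sketch (hypothetical caller, not part of this file): embedded
 * mutexes pair BLI_mutex_init() with BLI_mutex_end(), heap-allocated ones pair
 * BLI_mutex_alloc() with BLI_mutex_free(). BLI_mutex_trylock() returns true
 * only when the lock was actually acquired:
 *
 * \code{.c}
 *   ThreadMutex *mutex = BLI_mutex_alloc();
 *
 *   if (BLI_mutex_trylock(mutex)) {
 *     // ... fast path, we own the lock ...
 *     BLI_mutex_unlock(mutex);
 *   }
 *
 *   BLI_mutex_free(mutex);
 * \endcode
 */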

/* Spin Locks */

#ifdef WITH_TBB
static tbb::spin_mutex *tbb_spin_mutex_cast(SpinLock *spin)
{
  static_assert(sizeof(SpinLock) >= sizeof(tbb::spin_mutex),
                "SpinLock must match tbb::spin_mutex");
  static_assert(alignof(SpinLock) % alignof(tbb::spin_mutex) == 0,
                "SpinLock must be aligned same as tbb::spin_mutex");
  return reinterpret_cast<tbb::spin_mutex *>(spin);
}
#endif

void BLI_spin_init(SpinLock *spin)
{
#ifdef WITH_TBB
  tbb::spin_mutex *spin_mutex = tbb_spin_mutex_cast(spin);
  new (spin_mutex) tbb::spin_mutex();
#elif defined(__APPLE__)
  BLI_mutex_init(spin);
#elif defined(_MSC_VER)
  *spin = 0;
#else
  pthread_spin_init(spin, 0);
#endif
}

void BLI_spin_lock(SpinLock *spin)
{
#ifdef WITH_TBB
  tbb::spin_mutex *spin_mutex = tbb_spin_mutex_cast(spin);
  spin_mutex->lock();
#elif defined(__APPLE__)
  BLI_mutex_lock(spin);
#elif defined(_MSC_VER)
  while (InterlockedExchangeAcquire(spin, 1)) {
    while (*spin) {
      /* Spin-lock hint for processors with hyperthreading. */
      YieldProcessor();
    }
  }
#else
  pthread_spin_lock(spin);
#endif
}

void BLI_spin_unlock(SpinLock *spin)
{
#ifdef WITH_TBB
  tbb::spin_mutex *spin_mutex = tbb_spin_mutex_cast(spin);
  spin_mutex->unlock();
#elif defined(__APPLE__)
  BLI_mutex_unlock(spin);
#elif defined(_MSC_VER)
  _ReadWriteBarrier();
  *spin = 0;
#else
  pthread_spin_unlock(spin);
#endif
}

void BLI_spin_end(SpinLock *spin)
{
#ifdef WITH_TBB
  tbb::spin_mutex *spin_mutex = tbb_spin_mutex_cast(spin);
  spin_mutex->~spin_mutex();
#elif defined(__APPLE__)
  BLI_mutex_end(spin);
#elif defined(_MSC_VER)
  /* Nothing to do, spin is a simple integer type. */
#else
  pthread_spin_destroy(spin);
#endif
}
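
/* Illustrative sketch (hypothetical caller, not part of this file): spin locks
 * busy-wait instead of sleeping, so they are only a good fit for very short
 * critical sections such as bumping a shared counter:
 *
 * \code{.c}
 *   SpinLock spin;
 *   BLI_spin_init(&spin);
 *
 *   BLI_spin_lock(&spin);
 *   shared_counter++;  // keep the guarded region as small as possible
 *   BLI_spin_unlock(&spin);
 *
 *   BLI_spin_end(&spin);
 * \endcode
 */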

/* Read/Write Mutex Lock */

void BLI_rw_mutex_init(ThreadRWMutex *mutex)
{
  pthread_rwlock_init(mutex, nullptr);
}

void BLI_rw_mutex_lock(ThreadRWMutex *mutex, int mode)
{
  if (mode == THREAD_LOCK_READ) {
    pthread_rwlock_rdlock(mutex);
  }
  else {
    pthread_rwlock_wrlock(mutex);
  }
}

void BLI_rw_mutex_unlock(ThreadRWMutex *mutex)
{
  pthread_rwlock_unlock(mutex);
}

void BLI_rw_mutex_end(ThreadRWMutex *mutex)
{
  pthread_rwlock_destroy(mutex);
}

ThreadRWMutex *BLI_rw_mutex_alloc(void)
{
  ThreadRWMutex *mutex = static_cast<ThreadRWMutex *>(
      MEM_callocN(sizeof(ThreadRWMutex), "ThreadRWMutex"));
  BLI_rw_mutex_init(mutex);
  return mutex;
}

void BLI_rw_mutex_free(ThreadRWMutex *mutex)
{
  BLI_rw_mutex_end(mutex);
  MEM_freeN(mutex);
}
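
/* Illustrative sketch (hypothetical caller, not part of this file; assumes the
 * THREAD_LOCK_READ/THREAD_LOCK_WRITE constants from BLI_threads.h): a
 * read/write mutex lets many readers enter concurrently while writers get
 * exclusive access, with the mode selected when locking:
 *
 * \code{.c}
 *   BLI_rw_mutex_lock(rwlock, THREAD_LOCK_READ);
 *   // ... read shared data, possibly concurrently with other readers ...
 *   BLI_rw_mutex_unlock(rwlock);
 *
 *   BLI_rw_mutex_lock(rwlock, THREAD_LOCK_WRITE);
 *   // ... modify shared data exclusively ...
 *   BLI_rw_mutex_unlock(rwlock);
 * \endcode
 */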

/* Ticket Mutex Lock */

struct TicketMutex {
  pthread_cond_t cond;
  pthread_mutex_t mutex;
  unsigned int queue_head, queue_tail;
};

TicketMutex *BLI_ticket_mutex_alloc(void)
{
  TicketMutex *ticket = static_cast<TicketMutex *>(
      MEM_callocN(sizeof(TicketMutex), "TicketMutex"));

  pthread_cond_init(&ticket->cond, nullptr);
  pthread_mutex_init(&ticket->mutex, nullptr);

  return ticket;
}

void BLI_ticket_mutex_free(TicketMutex *ticket)
{
  pthread_mutex_destroy(&ticket->mutex);
  pthread_cond_destroy(&ticket->cond);
  MEM_freeN(ticket);
}

void BLI_ticket_mutex_lock(TicketMutex *ticket)
{
  unsigned int queue_me;

  pthread_mutex_lock(&ticket->mutex);
  queue_me = ticket->queue_tail++;

  while (queue_me != ticket->queue_head) {
    pthread_cond_wait(&ticket->cond, &ticket->mutex);
  }

  pthread_mutex_unlock(&ticket->mutex);
}

void BLI_ticket_mutex_unlock(TicketMutex *ticket)
{
  pthread_mutex_lock(&ticket->mutex);
  ticket->queue_head++;
  pthread_cond_broadcast(&ticket->cond);
  pthread_mutex_unlock(&ticket->mutex);
}
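
/* Illustrative note (not from the original comments): the ticket mutex above
 * hands out a ticket number on lock (queue_tail) and only lets a thread
 * proceed once queue_head reaches its ticket, so waiters are served strictly
 * in FIFO order rather than in whatever order the scheduler wakes them.
 * A hypothetical caller uses it like a regular mutex:
 *
 * \code{.c}
 *   TicketMutex *ticket = BLI_ticket_mutex_alloc();
 *
 *   BLI_ticket_mutex_lock(ticket);
 *   // ... critical section, entered in the order threads asked for it ...
 *   BLI_ticket_mutex_unlock(ticket);
 *
 *   BLI_ticket_mutex_free(ticket);
 * \endcode
 */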

/* ************************************************ */

/* Condition */

void BLI_condition_init(ThreadCondition *cond)
{
  pthread_cond_init(cond, nullptr);
}

void BLI_condition_wait(ThreadCondition *cond, ThreadMutex *mutex)
{
  pthread_cond_wait(cond, mutex);
}

void BLI_condition_wait_global_mutex(ThreadCondition *cond, const int type)
{
  pthread_cond_wait(cond, global_mutex_from_type(type));
}

void BLI_condition_notify_one(ThreadCondition *cond)
{
  pthread_cond_signal(cond);
}

void BLI_condition_notify_all(ThreadCondition *cond)
{
  pthread_cond_broadcast(cond);
}

void BLI_condition_end(ThreadCondition *cond)
{
  pthread_cond_destroy(cond);
}
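
/* Illustrative sketch (hypothetical caller, not part of this file; `mutex`,
 * `cond` and `work_is_ready` are assumed to be initialized elsewhere): these
 * are thin pthread wrappers, so the usual rules apply: the mutex must be held
 * while waiting, and the wait belongs in a loop that re-checks the predicate
 * to cope with spurious wake-ups:
 *
 * \code{.c}
 *   BLI_mutex_lock(&mutex);
 *   while (!work_is_ready) {
 *     BLI_condition_wait(&cond, &mutex);
 *   }
 *   // ... consume the work while still holding the mutex ...
 *   BLI_mutex_unlock(&mutex);
 * \endcode
 */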

/* ************************************************ */

struct ThreadQueue {
  GSQueue *queue;
  pthread_mutex_t mutex;
  pthread_cond_t push_cond;
  pthread_cond_t finish_cond;
  volatile int nowait;
  volatile int canceled;
};

ThreadQueue *BLI_thread_queue_init(void)
{
  ThreadQueue *queue;

  queue = static_cast<ThreadQueue *>(MEM_callocN(sizeof(ThreadQueue), "ThreadQueue"));
  queue->queue = BLI_gsqueue_new(sizeof(void *));

  pthread_mutex_init(&queue->mutex, nullptr);
  pthread_cond_init(&queue->push_cond, nullptr);
  pthread_cond_init(&queue->finish_cond, nullptr);

  return queue;
}

void BLI_thread_queue_free(ThreadQueue *queue)
{
  /* destroy everything, assumes no one is using queue anymore */
  pthread_cond_destroy(&queue->finish_cond);
  pthread_cond_destroy(&queue->push_cond);
  pthread_mutex_destroy(&queue->mutex);

  BLI_gsqueue_free(queue->queue);

  MEM_freeN(queue);
}

void BLI_thread_queue_push(ThreadQueue *queue, void *work)
{
  pthread_mutex_lock(&queue->mutex);

  BLI_gsqueue_push(queue->queue, &work);

  /* signal threads waiting to pop */
  pthread_cond_signal(&queue->push_cond);
  pthread_mutex_unlock(&queue->mutex);
}

void *BLI_thread_queue_pop(ThreadQueue *queue)
{
  void *work = nullptr;

  /* wait until there is work */
  pthread_mutex_lock(&queue->mutex);
  while (BLI_gsqueue_is_empty(queue->queue) && !queue->nowait) {
    pthread_cond_wait(&queue->push_cond, &queue->mutex);
  }

  /* if we have something, pop it */
  if (!BLI_gsqueue_is_empty(queue->queue)) {
    BLI_gsqueue_pop(queue->queue, &work);

    if (BLI_gsqueue_is_empty(queue->queue)) {
      pthread_cond_broadcast(&queue->finish_cond);
    }
  }

  pthread_mutex_unlock(&queue->mutex);

  return work;
}

static void wait_timeout(struct timespec *timeout, int ms)
{
  ldiv_t div_result;
  long sec, usec, x;

#ifdef WIN32
  {
    struct _timeb now;
    _ftime(&now);
    sec = now.time;
    usec = now.millitm * 1000; /* microsecond precision would be better */
  }
#else
  {
    struct timeval now;
    gettimeofday(&now, nullptr);
    sec = now.tv_sec;
    usec = now.tv_usec;
  }
#endif

  /* add current time + millisecond offset */
  div_result = ldiv(ms, 1000);
  timeout->tv_sec = sec + div_result.quot;

  x = usec + (div_result.rem * 1000);

  if (x >= 1000000) {
    timeout->tv_sec++;
    x -= 1000000;
  }

  timeout->tv_nsec = x * 1000;
}

void *BLI_thread_queue_pop_timeout(ThreadQueue *queue, int ms)
{
  double t;
  void *work = nullptr;
  struct timespec timeout;

  t = PIL_check_seconds_timer();
  wait_timeout(&timeout, ms);

  /* wait until there is work */
  pthread_mutex_lock(&queue->mutex);
  while (BLI_gsqueue_is_empty(queue->queue) && !queue->nowait) {
    if (pthread_cond_timedwait(&queue->push_cond, &queue->mutex, &timeout) == ETIMEDOUT) {
      break;
    }
    if (PIL_check_seconds_timer() - t >= ms * 0.001) {
      break;
    }
  }

  /* if we have something, pop it */
  if (!BLI_gsqueue_is_empty(queue->queue)) {
    BLI_gsqueue_pop(queue->queue, &work);

    if (BLI_gsqueue_is_empty(queue->queue)) {
      pthread_cond_broadcast(&queue->finish_cond);
    }
  }

  pthread_mutex_unlock(&queue->mutex);

  return work;
}

int BLI_thread_queue_len(ThreadQueue *queue)
{
  int size;

  pthread_mutex_lock(&queue->mutex);
  size = BLI_gsqueue_len(queue->queue);
  pthread_mutex_unlock(&queue->mutex);

  return size;
}

bool BLI_thread_queue_is_empty(ThreadQueue *queue)
{
  bool is_empty;

  pthread_mutex_lock(&queue->mutex);
  is_empty = BLI_gsqueue_is_empty(queue->queue);
  pthread_mutex_unlock(&queue->mutex);

  return is_empty;
}

void BLI_thread_queue_nowait(ThreadQueue *queue)
{
  pthread_mutex_lock(&queue->mutex);

  queue->nowait = 1;

  /* signal threads waiting to pop */
  pthread_cond_broadcast(&queue->push_cond);
  pthread_mutex_unlock(&queue->mutex);
}

void BLI_thread_queue_wait_finish(ThreadQueue *queue)
{
  /* wait for finish condition */
  pthread_mutex_lock(&queue->mutex);

  while (!BLI_gsqueue_is_empty(queue->queue)) {
    pthread_cond_wait(&queue->finish_cond, &queue->mutex);
  }

  pthread_mutex_unlock(&queue->mutex);
}
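
/* Illustrative sketch (hypothetical producer/consumer, not part of this file;
 * `queue`, `job`, `job_a` and `job_b` are placeholders): worker threads block
 * in BLI_thread_queue_pop() until work arrives; the producer pushes jobs,
 * waits for the queue to drain, then calls BLI_thread_queue_nowait() so that
 * blocked workers return nullptr instead of waiting forever:
 *
 * \code{.c}
 *   // Worker thread.
 *   void *job;
 *   while ((job = BLI_thread_queue_pop(queue))) {
 *     // ... process job ...
 *   }
 *
 *   // Producer thread.
 *   BLI_thread_queue_push(queue, job_a);
 *   BLI_thread_queue_push(queue, job_b);
 *   BLI_thread_queue_wait_finish(queue);  // wait until the queue is drained
 *   BLI_thread_queue_nowait(queue);       // let workers exit their pop loop
 *   BLI_thread_queue_free(queue);
 * \endcode
 */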

/* **** Special functions to help performance on crazy NUMA setups. **** */

#if 0  /* UNUSED */
static bool check_is_threadripper2_alike_topology(void)
{
  /* NOTE: We hope the operating system does not support hot-swapping a CPU
   * to a different brand, and that mixing CPUs of different types in one SMP
   * system is also not encouraged. */
  static bool is_initialized = false;
  static bool is_threadripper2 = false;
  if (is_initialized) {
    return is_threadripper2;
  }
  is_initialized = true;
  char *cpu_brand = BLI_cpu_brand_string();
  if (cpu_brand == nullptr) {
    return false;
  }
  if (strstr(cpu_brand, "Threadripper")) {
    /* NOTE: We consider all Thread-rippers as having a topology similar to
     * the second generation. This is because we are trying to utilize NUMA
     * node 0 as much as possible. This node does exist on earlier versions of
     * Thread-ripper, and setting affinity to it should not have a negative
     * effect.
     * This allows us to avoid a per-model check, making the code more
     * reliable for CPUs which are not yet released.
     */
    if (strstr(cpu_brand, "2990WX") || strstr(cpu_brand, "2950X")) {
      is_threadripper2 = true;
    }
  }
  /* NOTE: While all dies of EPYC have a memory controller, only two of them
   * have access to the lower-indexed DDR slots. Those dies are the same as on
   * Threadripper2 with the memory controller.
   * Now, it is rather likely that a reasonable amount of users don't max
   * out their DDR slots, leaving only two dies connected to a DDR slot
   * with actual memory in it. */
  if (strstr(cpu_brand, "EPYC")) {
    /* NOTE: Similarly to Thread-ripper we do not do a model check. */
    is_threadripper2 = true;
  }
  MEM_freeN(cpu_brand);
  return is_threadripper2;
}

static void threadripper_put_process_on_fast_node(void)
{
  if (!is_numa_available) {
    return;
  }
  /* NOTE: Technically, we can use NUMA nodes 0 and 2, and using both of
   * them in the affinity mask will allow the OS to schedule threads more
   * flexibly, possibly increasing overall performance when multiple apps
   * are crunching numbers.
   *
   * However, if the scene fits into memory adjacent to a single die we don't
   * want the OS to re-schedule the process to another die, since that will
   * make it further away from the memory allocated for the .blend file. */
  /* NOTE: Even if NUMA is available in the API but is disabled in the BIOS on
   * this workstation, we still go through this code path. If NUMA is disabled
   * there is only a single node, so our action has no visible effect, but it
   * keeps things simple and unified. */
  numaAPI_RunProcessOnNode(0);
}

static void threadripper_put_thread_on_fast_node(void)
{
  if (!is_numa_available) {
    return;
  }
  /* NOTE: This is where things become more interesting. On the one hand
   * we can use nodes 0 and 2 and allow the operating system to do balancing
   * of processes/threads for the maximum performance when multiple apps
   * are running.
   * On the other hand, however, we probably want to use the same node as the
   * main thread, since that's where the memory of the .blend file is likely
   * to be allocated.
   * Since the main thread is currently on node 0, we also put this thread on
   * the same node. */
  /* See the additional note about NUMA disabled in BIOS above. */
  numaAPI_RunThreadOnNode(0);
}
#endif /* UNUSED */

void BLI_thread_put_process_on_fast_node(void)
{
  /* Disabled for now since this causes only 16 threads to be used on a
   * thread-ripper for computations like sculpting and fluid sim. The problem
   * is that all threads created as children from this thread will inherit
   * the NUMA node and so will end up on the same node. This can be fixed
   * case-by-case by assigning the NUMA node for every child thread, however
   * this is difficult for external libraries and OpenMP, and out of our
   * control for plugins like external renderers. */
#if 0
  if (check_is_threadripper2_alike_topology()) {
    threadripper_put_process_on_fast_node();
  }
#endif
}

void BLI_thread_put_thread_on_fast_node(void)
{
  /* Disabled for now, see comment above. */
#if 0
  if (check_is_threadripper2_alike_topology()) {
    threadripper_put_thread_on_fast_node();
  }
#endif
}