#include "HalideRuntime.h"
#include "printer.h"
#include "scoped_spin_lock.h"

/* This provides an implementation of pthreads-like mutex and
 * condition variables with fast default case performance. The code
 * is based on the "parking lot" design, and specifically on Amanieu
 * d'Antras' Rust implementation:
 *    https://github.com/Amanieu/parking_lot
 * and the one in WTF:
 *     https://webkit.org/blog/6161/locking-in-webkit/
 *
 * Neither of the above implementations was used directly, largely to
 * avoid taking on their dependencies. This implementation lacks a few
 * features relative to those two. Specifically, timeouts are not
 * supported, nor are optional fairness or deadlock detection.
 * This implementation should provide a fairly standalone "one file"
 * fast synchronization layer on top of readily available system primitives.
 *
 * TODO: Implement pthread_once equivalent.
 * TODO: Add read/write lock and move SharedExclusiveSpinLock from tracing.cpp
 *       to this mechanism.
 * TODO: Add timeouts and optional fairness if needed.
 * TODO: Relying on condition variables has issues for old versions of Windows
 *       and likely has portability issues to some very bare bones embedded OSes.
 *       Doing an implementation using only semaphores or event counters should
 *       be doable.
 */
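
// A rough sketch of how the pieces in this file fit together (editorial
// summary; the definitions below are authoritative):
//   - word_lock: a minimal one-word lock used only to guard hash table buckets.
//   - A global hash table of buckets, keyed by the address a thread sleeps on.
//   - park() / unpark_one() / unpark_all() / unpark_requeue(): queue a
//     thread_parker in the bucket for an address and wake threads later.
//   - fast_mutex / fast_cond: the primitives built on park/unpark, with an
//     inlined fast path that is a single weak compare-and-swap.
//   - halide_mutex_* / halide_cond_*: the C entry points, which cast the opaque
//     halide_mutex / halide_cond structs onto fast_mutex / fast_cond.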

// Copied from tsan_interface.h
#ifndef TSAN_ANNOTATIONS
#define TSAN_ANNOTATIONS 0
#endif

#if TSAN_ANNOTATIONS
extern "C" {
const unsigned __tsan_mutex_linker_init = 1 << 0;
void __tsan_mutex_pre_lock(void *addr, unsigned flags);
void __tsan_mutex_post_lock(void *addr, unsigned flags, int recursion);
int __tsan_mutex_pre_unlock(void *addr, unsigned flags);
void __tsan_mutex_post_unlock(void *addr, unsigned flags);
void __tsan_mutex_pre_signal(void *addr, unsigned flags);
void __tsan_mutex_post_signal(void *addr, unsigned flags);
}
#endif

namespace Halide {
namespace Runtime {
namespace Internal {

namespace Synchronization {

namespace {

#if TSAN_ANNOTATIONS
ALWAYS_INLINE void if_tsan_pre_lock(void *mutex) {
    __tsan_mutex_pre_lock(mutex, __tsan_mutex_linker_init);
}
// TODO(zalman|dvyukov): Is 1 the right value for a non-recursive lock? pretty sure value is ignored.
ALWAYS_INLINE void if_tsan_post_lock(void *mutex) {
    __tsan_mutex_post_lock(mutex, __tsan_mutex_linker_init, 1);
}
// TODO(zalman|dvyukov): Is it safe to ignore return value here if locks are not recursive?
ALWAYS_INLINE void if_tsan_pre_unlock(void *mutex) {
    (void)__tsan_mutex_pre_unlock(mutex, __tsan_mutex_linker_init);
}
ALWAYS_INLINE void if_tsan_post_unlock(void *mutex) {
    __tsan_mutex_post_unlock(mutex, __tsan_mutex_linker_init);
}
ALWAYS_INLINE void if_tsan_pre_signal(void *cond) {
    __tsan_mutex_pre_signal(cond, 0);
}
ALWAYS_INLINE void if_tsan_post_signal(void *cond) {
    __tsan_mutex_post_signal(cond, 0);
}
#else
ALWAYS_INLINE void if_tsan_pre_lock(void *) {
}
ALWAYS_INLINE void if_tsan_post_lock(void *) {
}
ALWAYS_INLINE void if_tsan_pre_unlock(void *) {
}
ALWAYS_INLINE void if_tsan_post_unlock(void *) {
}
ALWAYS_INLINE void if_tsan_pre_signal(void *) {
}
ALWAYS_INLINE void if_tsan_post_signal(void *) {
}
#endif

#ifdef BITS_32
ALWAYS_INLINE uintptr_t atomic_and_fetch_release(uintptr_t *addr, uintptr_t val) {
    return __sync_and_and_fetch(addr, val);
}

template<typename T>
ALWAYS_INLINE T atomic_fetch_add_acquire_release(T *addr, T val) {
    return __sync_fetch_and_add(addr, val);
}

template<typename T>
ALWAYS_INLINE bool cas_strong_sequentially_consistent_helper(T *addr, T *expected, T *desired) {
    T oldval = *expected;
    T gotval = __sync_val_compare_and_swap(addr, oldval, *desired);
    *expected = gotval;
    return oldval == gotval;
}

ALWAYS_INLINE bool atomic_cas_strong_release_relaxed(uintptr_t *addr, uintptr_t *expected, uintptr_t *desired) {
    return cas_strong_sequentially_consistent_helper(addr, expected, desired);
}

ALWAYS_INLINE bool atomic_cas_weak_release_relaxed(uintptr_t *addr, uintptr_t *expected, uintptr_t *desired) {
    return cas_strong_sequentially_consistent_helper(addr, expected, desired);
}

template<typename T>
ALWAYS_INLINE bool atomic_cas_weak_relacq_relaxed(T *addr, T *expected, T *desired) {
    return cas_strong_sequentially_consistent_helper(addr, expected, desired);
}

ALWAYS_INLINE bool atomic_cas_weak_relaxed_relaxed(uintptr_t *addr, uintptr_t *expected, uintptr_t *desired) {
    return cas_strong_sequentially_consistent_helper(addr, expected, desired);
}

ALWAYS_INLINE bool atomic_cas_weak_acquire_relaxed(uintptr_t *addr, uintptr_t *expected, uintptr_t *desired) {
    return cas_strong_sequentially_consistent_helper(addr, expected, desired);
}

ALWAYS_INLINE uintptr_t atomic_fetch_and_release(uintptr_t *addr, uintptr_t val) {
    return __sync_fetch_and_and(addr, val);
}

template<typename T>
ALWAYS_INLINE void atomic_load_relaxed(T *addr, T *val) {
    *val = *addr;
}

template<typename T>
ALWAYS_INLINE void atomic_load_acquire(T *addr, T *val) {
    // For acquire semantics the fence must follow the load, so that later
    // accesses cannot be reordered ahead of it.
    *val = *addr;
    __sync_synchronize();
}

ALWAYS_INLINE uintptr_t atomic_or_fetch_relaxed(uintptr_t *addr, uintptr_t val) {
    return __sync_or_and_fetch(addr, val);
}

ALWAYS_INLINE void atomic_store_relaxed(uintptr_t *addr, uintptr_t *val) {
    *addr = *val;
}

template<typename T>
ALWAYS_INLINE void atomic_store_release(T *addr, T *val) {
    // For release semantics the fence must precede the store, so that earlier
    // accesses cannot be reordered after it.
    __sync_synchronize();
    *addr = *val;
}

ALWAYS_INLINE void atomic_thread_fence_acquire() {
    __sync_synchronize();
}

#else

ALWAYS_INLINE uintptr_t atomic_and_fetch_release(uintptr_t *addr, uintptr_t val) {
    return __atomic_and_fetch(addr, val, __ATOMIC_RELEASE);
}

template<typename T>
ALWAYS_INLINE T atomic_fetch_add_acquire_release(T *addr, T val) {
    return __atomic_fetch_add(addr, val, __ATOMIC_ACQ_REL);
}

ALWAYS_INLINE bool atomic_cas_strong_release_relaxed(uintptr_t *addr, uintptr_t *expected, uintptr_t *desired) {
    return __atomic_compare_exchange(addr, expected, desired, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
}

template<typename T>
ALWAYS_INLINE bool atomic_cas_weak_relacq_relaxed(T *addr, T *expected, T *desired) {
    return __atomic_compare_exchange(addr, expected, desired, true, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
}

ALWAYS_INLINE bool atomic_cas_weak_release_relaxed(uintptr_t *addr, uintptr_t *expected, uintptr_t *desired) {
    return __atomic_compare_exchange(addr, expected, desired, true, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
}

ALWAYS_INLINE bool atomic_cas_weak_relaxed_relaxed(uintptr_t *addr, uintptr_t *expected, uintptr_t *desired) {
    return __atomic_compare_exchange(addr, expected, desired, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
}

ALWAYS_INLINE bool atomic_cas_weak_acquire_relaxed(uintptr_t *addr, uintptr_t *expected, uintptr_t *desired) {
    return __atomic_compare_exchange(addr, expected, desired, true, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}

ALWAYS_INLINE uintptr_t atomic_fetch_and_release(uintptr_t *addr, uintptr_t val) {
    return __atomic_fetch_and(addr, val, __ATOMIC_RELEASE);
}

template<typename T>
ALWAYS_INLINE void atomic_load_relaxed(T *addr, T *val) {
    __atomic_load(addr, val, __ATOMIC_RELAXED);
}

template<typename T>
ALWAYS_INLINE void atomic_load_acquire(T *addr, T *val) {
    __atomic_load(addr, val, __ATOMIC_ACQUIRE);
}

ALWAYS_INLINE uintptr_t atomic_or_fetch_relaxed(uintptr_t *addr, uintptr_t val) {
    return __atomic_or_fetch(addr, val, __ATOMIC_RELAXED);
}

ALWAYS_INLINE void atomic_store_relaxed(uintptr_t *addr, uintptr_t *val) {
    __atomic_store(addr, val, __ATOMIC_RELAXED);
}

template<typename T>
ALWAYS_INLINE void atomic_store_release(T *addr, T *val) {
    __atomic_store(addr, val, __ATOMIC_RELEASE);
}

ALWAYS_INLINE void atomic_thread_fence_acquire() {
    __atomic_thread_fence(__ATOMIC_ACQUIRE);
}

#endif
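
// Note on naming: the CAS helpers encode two memory orders in their suffix,
// e.g. atomic_cas_weak_release_relaxed uses release ordering on success and
// relaxed ordering on failure, mirroring the two order arguments passed to
// __atomic_compare_exchange above. (The 32-bit __sync fallbacks are simply
// fully sequentially consistent.)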

}  // namespace

class spin_control {
    int spin_count;

public:
    // Everyone says this should be 40. Have not measured it.
    ALWAYS_INLINE spin_control()
        : spin_count(40) {
    }

    ALWAYS_INLINE bool should_spin() {
        if (spin_count > 0) {
            spin_count--;
        }
        return spin_count > 0;
    }

    ALWAYS_INLINE void reset() {
        spin_count = 40;
    }
};

#if __cplusplus >= 201103L
// The low-order two bits of the state word are used for locking state.
static constexpr uint8_t lock_bit = 0x01;
static constexpr uint8_t queue_lock_bit = 0x02;
static constexpr uint8_t parked_bit = 0x02;
#else
#define lock_bit 0x01
#define queue_lock_bit 0x02
#define parked_bit 0x02
#endif

struct word_lock_queue_data {
    thread_parker parker;  // TODO: member or pointer?

    // This design is from the Rust parking lot implementation by Amanieu d'Antras.
    // Comment from original:
    //
    // Linked list of threads in the queue. The queue is split into two parts:
    // the processed part and the unprocessed part. When new nodes are added to
    // the list, they only have the next pointer set, and queue_tail is null.
    //
    // Nodes are processed with the queue lock held, which consists of setting
    // the prev pointer for each node and setting the queue_tail pointer on the
    // first processed node of the list.
    //
    // This setup allows nodes to be added to the queue without a lock, while
    // still allowing O(1) removal of nodes from the processed part of the list.
    // The only cost is the O(n) processing, but this only needs to be done
    // once for each node, and therefore isn't too expensive.

    word_lock_queue_data *next;
    word_lock_queue_data *prev;

    word_lock_queue_data *tail;

    ALWAYS_INLINE word_lock_queue_data()
        : next(NULL), prev(NULL), tail(NULL) {
    }

    // Inlined, empty dtor needed to avoid confusing MachO builds
    ALWAYS_INLINE ~word_lock_queue_data() {
    }
};

class word_lock {
    uintptr_t state;

    void lock_full();
    void unlock_full();

public:
    ALWAYS_INLINE word_lock()
        : state(0) {
    }
    ALWAYS_INLINE void lock() {
        if_tsan_pre_lock(this);

        uintptr_t expected = 0;
        uintptr_t desired = lock_bit;
        // Try for a fast grab of the lock bit. If this does not work, call the full adaptive looping code.
        if (!atomic_cas_weak_acquire_relaxed(&state, &expected, &desired)) {
            lock_full();
        }

        if_tsan_post_lock(this);
    }

    ALWAYS_INLINE void unlock() {
        if_tsan_pre_unlock(this);

        uintptr_t val = atomic_fetch_and_release(&state, ~(uintptr_t)lock_bit);
        // If another thread is currently queueing, that thread will ensure
        // it acquires the lock or wakes a waiting thread.
        bool no_thread_queuing = (val & queue_lock_bit) == 0;
        // Only need to do a wakeup if there are threads waiting.
        bool some_queued = (val & ~(uintptr_t)(queue_lock_bit | lock_bit)) != 0;
        if (no_thread_queuing && some_queued) {
            unlock_full();
        }

        if_tsan_post_unlock(this);
    }
};
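
// Layout of word_lock::state, as used above and in lock_full/unlock_full below
// (editorial summary of the code, not an independent spec):
//   bit 0 (lock_bit):       the lock is held.
//   bit 1 (queue_lock_bit): some thread currently owns the wait queue and is
//                           processing it.
//   remaining bits:         pointer to the head word_lock_queue_data of the
//                           wait queue (queue nodes are sufficiently aligned
//                           that their low two bits are zero).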

WEAK void word_lock::lock_full() {
    spin_control spinner;
    uintptr_t expected;
    atomic_load_relaxed(&state, &expected);

    while (true) {
        if (!(expected & lock_bit)) {
            uintptr_t desired = expected | lock_bit;

            if (atomic_cas_weak_acquire_relaxed(&state, &expected, &desired)) {
                return;
            }
            continue;
        }

        if (((expected & ~(uintptr_t)(queue_lock_bit | lock_bit)) != 0) && spinner.should_spin()) {
            halide_thread_yield();
            atomic_load_relaxed(&state, &expected);
            continue;
        }

        word_lock_queue_data node;

        node.parker.prepare_park();
        // TODO set up prelinkage parking state

        word_lock_queue_data *head = (word_lock_queue_data *)(expected & ~(uintptr_t)(queue_lock_bit | lock_bit));
        if (head == NULL) {
            node.tail = &node;
            // constructor set node.prev = NULL;
        } else {
            // Leave the tail as NULL. The unlock routine will walk the list and wake up
            // the thread at the end.
            // constructor set node.tail = NULL;
            // constructor set node.prev = NULL;
            node.next = head;
        }

        uintptr_t desired = ((uintptr_t)&node) | (expected & (queue_lock_bit | lock_bit));
        if (atomic_cas_weak_release_relaxed(&state, &expected, &desired)) {
            node.parker.park();
            spinner.reset();
            atomic_load_relaxed(&state, &expected);
        }
    }
}

WEAK void word_lock::unlock_full() {
    uintptr_t expected;
    atomic_load_relaxed(&state, &expected);

    while (true) {
        // If another thread is currently queueing, that thread will ensure
        // it acquires the lock or wakes a waiting thread.
        bool thread_queuing = (expected & queue_lock_bit);
        // Only need to do a wakeup if there are threads waiting.
        bool none_queued = (expected & ~(uintptr_t)(queue_lock_bit | lock_bit)) == 0;
        if (thread_queuing || none_queued) {
            return;
        }

        uintptr_t desired = expected | queue_lock_bit;
        if (atomic_cas_weak_acquire_relaxed(&state, &expected, &desired)) {
            break;
        }
    }

    while (true) {
        word_lock_queue_data *head = (word_lock_queue_data *)(expected & ~(uintptr_t)(queue_lock_bit | lock_bit));
        word_lock_queue_data *current = head;
        word_lock_queue_data *tail = current->tail;
        int times_through = 0;
        while (tail == NULL) {
            word_lock_queue_data *next = current->next;
            halide_assert(NULL, next != NULL);
            next->prev = current;
            current = next;
            tail = current->tail;
            times_through++;
        }
        head->tail = tail;

        // If the lock is now locked, unlock the queue and have the thread
        // that currently holds the lock do the wakeup.
        if (expected & lock_bit) {
            uintptr_t desired = expected & ~(uintptr_t)queue_lock_bit;
            if (atomic_cas_weak_relacq_relaxed(&state, &expected, &desired)) {
                return;
            }
            atomic_thread_fence_acquire();
            continue;
        }

        word_lock_queue_data *new_tail = tail->prev;
        if (new_tail == NULL) {
            bool continue_outer = false;
            while (!continue_outer) {
                uintptr_t desired = expected & lock_bit;
                if (atomic_cas_weak_relacq_relaxed(&state, &expected, &desired)) {
                    break;
                }
                if ((expected & ~(uintptr_t)(queue_lock_bit | lock_bit)) == 0) {
                    continue;
                } else {
                    atomic_thread_fence_acquire();
                    continue_outer = true;
                }
            }

            if (continue_outer) {
                continue;
            }
        } else {
            head->tail = new_tail;
            atomic_and_fetch_release(&state, ~(uintptr_t)queue_lock_bit);
        }

        // TODO: The reason there are three calls here is that other things can happen between them.
        // Also it is not clear whether unpark_start has to return the mutex/flag used by unpark
        // and unpark_finish due to memory lifetime reasons.
        tail->parker.unpark_start();
        tail->parker.unpark();
        tail->parker.unpark_finish();
        break;
    }
}

struct queue_data {
    thread_parker parker;  // TODO: member or pointer?

    uintptr_t sleep_address;

    queue_data *next;

    uintptr_t unpark_info;

    ALWAYS_INLINE queue_data()
        : sleep_address(0), next(NULL), unpark_info(0) {
    }
    // Inlined, empty dtor needed to avoid confusing MachO builds
    ALWAYS_INLINE ~queue_data() {
    }
};

// Must be a power of two.
#define LOAD_FACTOR 4

struct hash_bucket {
    word_lock mutex;

    queue_data *head;  // Is this queue_data or thread_data?
    queue_data *tail;  // Is this queue_data or thread_data?
};

// The use of a #define here and table_storage is because if
// a class with a constructor is used, clang generates a COMDAT
// which cannot be lowered for Mac OS due to MachO. A better
// solution is desired of course.
#define HASH_TABLE_BITS 10
struct hash_table {
    hash_bucket buckets[MAX_THREADS * LOAD_FACTOR];
};
WEAK char table_storage[sizeof(hash_table)];
#define table (*(hash_table *)table_storage)

ALWAYS_INLINE void check_hash(uintptr_t hash) {
    halide_assert(NULL, hash < sizeof(table.buckets) / sizeof(table.buckets[0]));
}

#if 0
WEAK void dump_hash() {
    int i = 0;
    for (auto &bucket : table.buckets) {
        queue_data *head = bucket.head;
        while (head != NULL) {
            print(NULL) << "Bucket index " << i << " addr " << (void *)head->sleep_address << "\n";
            head = head->next;
        }
        i++;
    }
}
#endif

static ALWAYS_INLINE uintptr_t addr_hash(uintptr_t addr, uint32_t bits) {
    // Fibonacci hashing. The golden ratio is 1.9E3779B97F4A7C15F39...
    // in hexadecimal.
    if (sizeof(uintptr_t) >= 8) {
        return (addr * (uintptr_t)0x9E3779B97F4A7C15) >> (64 - bits);
    } else {
        return (addr * (uintptr_t)0x9E3779B9) >> (32 - bits);
    }
}
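
// Editorial note: with bits == HASH_TABLE_BITS == 10, addr_hash multiplies the
// address by roughly 2^64 / phi (or 2^32 / phi on 32-bit targets) and keeps the
// top 10 bits, so the result is always in [0, 2^10) and nearby addresses are
// scattered across buckets. check_hash() above asserts that this range fits
// within the bucket array.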

WEAK hash_bucket &lock_bucket(uintptr_t addr) {
    uintptr_t hash = addr_hash(addr, HASH_TABLE_BITS);

    check_hash(hash);

    // TODO: if resizing is implemented, loop, etc.
    hash_bucket &bucket = table.buckets[hash];

    bucket.mutex.lock();

    return bucket;
}

struct bucket_pair {
    hash_bucket &from;
    hash_bucket &to;

    ALWAYS_INLINE bucket_pair(hash_bucket &from, hash_bucket &to)
        : from(from), to(to) {
    }
};

WEAK bucket_pair lock_bucket_pair(uintptr_t addr_from, uintptr_t addr_to) {
    // TODO: if resizing is implemented, loop, etc.
    uintptr_t hash_from = addr_hash(addr_from, HASH_TABLE_BITS);
    uintptr_t hash_to = addr_hash(addr_to, HASH_TABLE_BITS);

    check_hash(hash_from);
    check_hash(hash_to);

    // Lock the bucket with the smaller hash first in order
    // to prevent deadlock.
    if (hash_from == hash_to) {
        hash_bucket &first = table.buckets[hash_from];
        first.mutex.lock();
        return bucket_pair(first, first);
    } else if (hash_from < hash_to) {
        hash_bucket &first = table.buckets[hash_from];
        hash_bucket &second = table.buckets[hash_to];
        first.mutex.lock();
        second.mutex.lock();
        return bucket_pair(first, second);
    } else {
        hash_bucket &first = table.buckets[hash_to];
        hash_bucket &second = table.buckets[hash_from];
        first.mutex.lock();
        second.mutex.lock();
        return bucket_pair(second, first);
    }
}

WEAK void unlock_bucket_pair(bucket_pair &buckets) {
    // In the lock routine, the buckets are locked smaller hash index first.
    // Here we reverse this ordering by comparing the pointers. This works
    // since the pointers are obtained by indexing an array with the hash
    // values.
    if (&buckets.from == &buckets.to) {
        buckets.from.mutex.unlock();
    } else if (&buckets.from > &buckets.to) {
        buckets.from.mutex.unlock();
        buckets.to.mutex.unlock();
    } else {
        buckets.to.mutex.unlock();
        buckets.from.mutex.unlock();
    }
}

struct validate_action {
    bool unpark_one;
    uintptr_t invalid_unpark_info;

    ALWAYS_INLINE validate_action()
        : unpark_one(false), invalid_unpark_info(0) {
    }
};

WEAK bool parking_control_validate(void *control, validate_action &action) {
    return true;
}
WEAK void parking_control_before_sleep(void *control) {
}
WEAK uintptr_t parking_control_unpark(void *control, int unparked, bool more_waiters) {
    return 0;
}
WEAK void parking_control_requeue_callback(void *control, const validate_action &action, bool one_to_wake, bool some_requeued) {
}

struct parking_control {
    bool (*validate)(void *control, validate_action &action);
    void (*before_sleep)(void *control);
    uintptr_t (*unpark)(void *control, int unparked, bool more_waiters);
    void (*requeue_callback)(void *control, const validate_action &action, bool one_to_wake, bool some_requeued);

    ALWAYS_INLINE parking_control()
        : validate(parking_control_validate), before_sleep(parking_control_before_sleep),
          unpark(parking_control_unpark), requeue_callback(parking_control_requeue_callback) {
    }
};
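
// parking_control is a hand-rolled vtable: a set of default no-op callbacks
// that the concrete controls below (mutex_parking_control, signal_parking_control,
// broadcast_parking_control, wait_parking_control) override by assigning their
// own functions in their constructors. The void *control argument is always the
// address of the control object itself. (Presumably this avoids C++ virtual
// dispatch in the runtime; that rationale is an editorial guess.)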

// TODO: Do we need a park_result thing here?
WEAK uintptr_t park(uintptr_t addr, parking_control &control) {
    queue_data queue_data;

    hash_bucket &bucket = lock_bucket(addr);

    validate_action action;
    if (!control.validate(&control, action)) {
        bucket.mutex.unlock();
        return action.invalid_unpark_info;
    }

    queue_data.next = NULL;
    queue_data.sleep_address = addr;
    queue_data.parker.prepare_park();
    if (bucket.head != NULL) {
        bucket.tail->next = &queue_data;
    } else {
        bucket.head = &queue_data;
    }
    bucket.tail = &queue_data;
    bucket.mutex.unlock();

    control.before_sleep(&control);

    queue_data.parker.park();

    return queue_data.unpark_info;

    // TODO: handling timeout.
}

WEAK uintptr_t unpark_one(uintptr_t addr, parking_control &control) {
    hash_bucket &bucket = lock_bucket(addr);

    queue_data **data_location = &bucket.head;
    queue_data *prev = NULL;
    queue_data *data = *data_location;
    while (data != NULL) {
        uintptr_t cur_addr;
        atomic_load_relaxed(&data->sleep_address, &cur_addr);
        if (cur_addr == addr) {
            queue_data *next = data->next;
            *data_location = next;

            bool more_waiters = false;

            if (bucket.tail == data) {
                bucket.tail = prev;
            } else {
                queue_data *data2 = next;
                while (data2 != NULL && !more_waiters) {
                    uintptr_t cur_addr2;
                    atomic_load_relaxed(&data2->sleep_address, &cur_addr2);
                    more_waiters = (cur_addr2 == addr);
                    data2 = data2->next;
                }
            }

            data->unpark_info = control.unpark(&control, 1, more_waiters);

            data->parker.unpark_start();
            bucket.mutex.unlock();
            data->parker.unpark();
            data->parker.unpark_finish();

            // TODO: Figure out ret type.
            return more_waiters ? 1 : 0;
        } else {
            data_location = &data->next;
            prev = data;
            data = data->next;
        }
    }

    control.unpark(&control, 0, false);

    bucket.mutex.unlock();

    // TODO: decide if this is the right return value.
    return 0;
}

WEAK uintptr_t unpark_all(uintptr_t addr, uintptr_t unpark_info) {
    hash_bucket &bucket = lock_bucket(addr);

    queue_data **data_location = &bucket.head;
    queue_data *prev = NULL;
    queue_data *data = *data_location;
    size_t waiters = 0;
    queue_data *temp_list_storage[16];
    queue_data **temp_list = &temp_list_storage[0];
    size_t max_waiters = sizeof(temp_list_storage) / sizeof(temp_list_storage[0]);

    while (data != NULL) {
        uintptr_t cur_addr;
        atomic_load_relaxed(&data->sleep_address, &cur_addr);

        queue_data *next = data->next;
        if (cur_addr == addr) {
            *data_location = next;

            if (bucket.tail == data) {
                bucket.tail = prev;
            }

            if (waiters == max_waiters) {
                queue_data **temp = temp_list;
                temp_list = (queue_data **)malloc(sizeof(queue_data *) * max_waiters * 2);
                for (size_t i = 0; i < max_waiters; i++) {
                    temp_list[i] = temp[i];
                }
                max_waiters *= 2;
                if (temp != &temp_list_storage[0]) {
                    free(temp);
                }
            }

            data->unpark_info = unpark_info;

            temp_list[waiters++] = data;
            data->parker.unpark_start();

            data = next;
        } else {
            // This waiter is sleeping on a different address that hashed to
            // the same bucket; leave it linked and just advance.
            data_location = &data->next;
            prev = data;
            data = next;
        }
    }

    bucket.mutex.unlock();

    for (size_t i = 0; i < waiters; i++) {
        temp_list[i]->parker.unpark();
    }

    // TODO: decide if this really needs to be two loops.
    for (size_t i = 0; i < waiters; i++) {
        temp_list[i]->parker.unpark_finish();
    }

    if (temp_list != &temp_list_storage[0]) {
        free(temp_list);
    }

    return waiters;
}

WEAK int unpark_requeue(uintptr_t addr_from, uintptr_t addr_to, parking_control &control, uintptr_t unpark_info) {
    bucket_pair buckets = lock_bucket_pair(addr_from, addr_to);

    validate_action action;
    if (!control.validate(&control, action)) {
        unlock_bucket_pair(buckets);
        return 0;
    }

    queue_data **data_location = &buckets.from.head;
    queue_data *prev = NULL;
    queue_data *data = *data_location;
    queue_data *requeue = NULL;
    queue_data *requeue_tail = NULL;
    queue_data *wakeup = NULL;

    while (data != NULL) {
        uintptr_t cur_addr;
        atomic_load_relaxed(&data->sleep_address, &cur_addr);

        queue_data *next = data->next;
        if (cur_addr == addr_from) {
            *data_location = next;

            if (buckets.from.tail == data) {
                buckets.from.tail = prev;
            }

            if (action.unpark_one && wakeup == NULL) {
                wakeup = data;
            } else {
                if (requeue == NULL) {
                    requeue = data;
                } else {
                    requeue_tail->next = data;
                }

                requeue_tail = data;
                atomic_store_relaxed(&data->sleep_address, &addr_to);
            }
            data = next;
            // TODO: prev ptr?
        } else {
            data_location = &data->next;
            prev = data;
            data = next;
        }
    }

    if (requeue != NULL) {
        requeue_tail->next = NULL;
        if (buckets.to.head == NULL) {
            buckets.to.head = requeue;
        } else {
            buckets.to.tail->next = requeue;
        }
        buckets.to.tail = requeue_tail;
    }

    control.requeue_callback(&control, action, wakeup != NULL, requeue != NULL);

    if (wakeup != NULL) {
        wakeup->unpark_info = unpark_info;
        wakeup->parker.unpark_start();
        unlock_bucket_pair(buckets);
        wakeup->parker.unpark();
        wakeup->parker.unpark_finish();
    } else {
        unlock_bucket_pair(buckets);
    }

    return wakeup != NULL && action.unpark_one;
}

WEAK bool mutex_parking_control_validate(void *control, validate_action &action);
WEAK uintptr_t mutex_parking_control_unpark(void *control, int unparked, bool more_waiters);
struct mutex_parking_control : parking_control {
    uintptr_t *lock_state;

    ALWAYS_INLINE mutex_parking_control(uintptr_t *lock_state)
        : lock_state(lock_state) {
        validate = mutex_parking_control_validate;
        unpark = mutex_parking_control_unpark;
    }
};

// Only used in parking -- lock_full.
WEAK bool mutex_parking_control_validate(void *control, validate_action &action) {
    mutex_parking_control *mutex_control = (mutex_parking_control *)control;

    uintptr_t result;
    atomic_load_relaxed(mutex_control->lock_state, &result);
    return result == (lock_bit | parked_bit);
}

// Only used in unparking -- unlock_full.
WEAK uintptr_t mutex_parking_control_unpark(void *control, int unparked, bool more_waiters) {
    mutex_parking_control *mutex_control = (mutex_parking_control *)control;

    // TODO: consider handling fairness.
    uintptr_t return_state = more_waiters ? parked_bit : 0;
    atomic_store_release(mutex_control->lock_state, &return_state);

    return 0;
}

class fast_mutex {
    uintptr_t state;

    ALWAYS_INLINE void lock_full() {
        // Everyone says this should be 40. Have not measured it.
        spin_control spinner;
        uintptr_t expected;
        atomic_load_relaxed(&state, &expected);

        while (true) {
            if (!(expected & lock_bit)) {
                uintptr_t desired = expected | lock_bit;
                if (atomic_cas_weak_acquire_relaxed(&state, &expected, &desired)) {
                    return;
                }
                continue;
            }

            // If no one is parked, spin with spin count.
            if ((expected & parked_bit) == 0 && spinner.should_spin()) {
                halide_thread_yield();
                atomic_load_relaxed(&state, &expected);
                continue;
            }

            // Mark mutex as having parked threads if not already done.
            if ((expected & parked_bit) == 0) {
                uintptr_t desired = expected | parked_bit;
                if (!atomic_cas_weak_relaxed_relaxed(&state, &expected, &desired)) {
                    continue;
                }
            }

            // TODO: consider handling fairness, timeout
            mutex_parking_control control(&state);
            uintptr_t result = park((uintptr_t)this, control);
            if (result == (uintptr_t)this) {
                return;
            }

            spinner.reset();
            atomic_load_relaxed(&state, &expected);
        }
    }

    ALWAYS_INLINE void unlock_full() {
        uintptr_t expected = lock_bit;
        uintptr_t desired = 0;
        // Try for a fast release of the lock. Redundant with code in unlock, but done
        // to make unlock_full a standalone unlock that can be called directly.
        if (atomic_cas_strong_release_relaxed(&state, &expected, &desired)) {
            return;
        }

        mutex_parking_control control(&state);
        unpark_one((uintptr_t)this, control);
    }

public:
    ALWAYS_INLINE void lock() {
        uintptr_t expected = 0;
        uintptr_t desired = lock_bit;
        // Try for a fast grab of the lock bit. If this does not work, call the full adaptive looping code.
        if (!atomic_cas_weak_acquire_relaxed(&state, &expected, &desired)) {
            lock_full();
        }
    }

    ALWAYS_INLINE void unlock() {
        uintptr_t expected = lock_bit;
        uintptr_t desired = 0;
        // Try for a fast release of the lock bit. If this does not work, call the full adaptive looping code.
        if (!atomic_cas_weak_release_relaxed(&state, &expected, &desired)) {
            unlock_full();
        }
    }

    ALWAYS_INLINE bool make_parked_if_locked() {
        uintptr_t val;
        atomic_load_relaxed(&state, &val);
        while (true) {
            if (!(val & lock_bit)) {
                return false;
            }

            uintptr_t desired = val | parked_bit;
            if (atomic_cas_weak_relaxed_relaxed(&state, &val, &desired)) {
                return true;
            }
        }
    }

    ALWAYS_INLINE void make_parked() {
        atomic_or_fetch_relaxed(&state, parked_bit);
    }
};
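
// fast_mutex::state transitions, as implemented above (editorial summary):
//   0                       unlocked, nobody waiting
//   lock_bit                locked, nobody waiting (the inlined fast path)
//   lock_bit | parked_bit   locked, at least one thread parked on this mutex
// unlock() clears lock_bit; if parked_bit was set, the fast-path CAS fails and
// unlock_full() wakes one waiter via unpark_one(), storing parked_bit back only
// if more waiters remain (see mutex_parking_control_unpark above).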

WEAK uintptr_t signal_parking_control_unpark(void *control, int unparked, bool more_waiters);
struct signal_parking_control : parking_control {
    uintptr_t *cond_state;
    fast_mutex *mutex;

    ALWAYS_INLINE signal_parking_control(uintptr_t *cond_state, fast_mutex *mutex)
        : cond_state(cond_state), mutex(mutex) {
        unpark = signal_parking_control_unpark;
    }
};

WEAK uintptr_t signal_parking_control_unpark(void *control, int unparked, bool more_waiters) {
    signal_parking_control *signal_control = (signal_parking_control *)control;

    if (!more_waiters) {
        uintptr_t val = 0;
        atomic_store_relaxed(signal_control->cond_state, &val);
    }

#if 0  // TODO: figure out why this was here.
    return (uintptr_t)signal_control->mutex;
#else
    return 0;
#endif
}

WEAK bool broadcast_parking_control_validate(void *control, validate_action &action);
WEAK void broadcast_parking_control_requeue_callback(void *control, const validate_action &action,
                                                     bool one_to_wake, bool some_requeued);
struct broadcast_parking_control : parking_control {
    uintptr_t *cond_state;
    fast_mutex *mutex;

    ALWAYS_INLINE broadcast_parking_control(uintptr_t *cond_state, fast_mutex *mutex)
        : cond_state(cond_state), mutex(mutex) {
        validate = broadcast_parking_control_validate;
        requeue_callback = broadcast_parking_control_requeue_callback;
    }
};

WEAK bool broadcast_parking_control_validate(void *control, validate_action &action) {
    broadcast_parking_control *broadcast_control = (broadcast_parking_control *)control;

    uintptr_t val;
    atomic_load_relaxed(broadcast_control->cond_state, &val);
    // If, by the time this broadcast has locked everything and is being processed,
    // the cond has moved on to a different mutex, do nothing: any threads now
    // waiting are waiting on what is effectively a different condition.
    if (val != (uintptr_t)broadcast_control->mutex) {
        return false;
    }
    // Clear the cond's connection to the mutex, as all waiting threads are going to requeue onto the mutex.
    val = 0;
    atomic_store_relaxed(broadcast_control->cond_state, &val);

    action.unpark_one = !broadcast_control->mutex->make_parked_if_locked();

    return true;
}

WEAK void broadcast_parking_control_requeue_callback(void *control, const validate_action &action, bool one_to_wake, bool some_requeued) {
    broadcast_parking_control *broadcast_control = (broadcast_parking_control *)control;

    if (action.unpark_one && some_requeued) {
        broadcast_control->mutex->make_parked();
    }
}

WEAK bool wait_parking_control_validate(void *control, validate_action &action);
WEAK void wait_parking_control_before_sleep(void *control);
WEAK uintptr_t wait_parking_control_unpark(void *control, int unparked, bool more_waiters);
struct wait_parking_control : parking_control {
    uintptr_t *cond_state;
    fast_mutex *mutex;

    ALWAYS_INLINE wait_parking_control(uintptr_t *cond_state, fast_mutex *mutex)
        : cond_state(cond_state), mutex(mutex) {
        validate = wait_parking_control_validate;
        before_sleep = wait_parking_control_before_sleep;
        unpark = wait_parking_control_unpark;
    }
};

WEAK bool wait_parking_control_validate(void *control, validate_action &action) {
    wait_parking_control *wait_control = (wait_parking_control *)control;

    uintptr_t val;
    atomic_load_relaxed(wait_control->cond_state, &val);

    if (val == 0) {
        val = (uintptr_t)wait_control->mutex;
        atomic_store_relaxed(wait_control->cond_state, &val);
    } else if (val != (uintptr_t)wait_control->mutex) {
        // TODO: signal error.
        action.invalid_unpark_info = (uintptr_t)wait_control->mutex;
        return false;
    }

    return true;
}

WEAK void wait_parking_control_before_sleep(void *control) {
    wait_parking_control *wait_control = (wait_parking_control *)control;

    wait_control->mutex->unlock();
}

WEAK uintptr_t wait_parking_control_unpark(void *control, int unparked, bool more_waiters) {
    wait_parking_control *wait_control = (wait_parking_control *)control;

    if (!more_waiters) {
        uintptr_t val = 0;
        atomic_store_relaxed(wait_control->cond_state, &val);
    }
    return 0;
}

class fast_cond {
    uintptr_t state;

public:
    ALWAYS_INLINE fast_cond()
        : state(0) {
    }

    ALWAYS_INLINE void signal() {
        if_tsan_pre_signal(this);

        uintptr_t val;
        atomic_load_relaxed(&state, &val);
        if (val == 0) {
            if_tsan_post_signal(this);
            return;
        }
        signal_parking_control control(&state, (fast_mutex *)val);
        unpark_one((uintptr_t)this, control);
        if_tsan_post_signal(this);
    }

    ALWAYS_INLINE void broadcast() {
        if_tsan_pre_signal(this);
        uintptr_t val;
        atomic_load_relaxed(&state, &val);
        if (val == 0) {
            if_tsan_post_signal(this);
            return;
        }
        broadcast_parking_control control(&state, (fast_mutex *)val);
        unpark_requeue((uintptr_t)this, val, control, 0);
        if_tsan_post_signal(this);
    }

    ALWAYS_INLINE void wait(fast_mutex *mutex) {
        wait_parking_control control(&state, mutex);
        uintptr_t result = park((uintptr_t)this, control);
        if (result != (uintptr_t)mutex) {
            mutex->lock();
        } else {
            if_tsan_pre_lock(mutex);

            // TODO: this is debug only.
            uintptr_t val;
            atomic_load_relaxed((uintptr_t *)mutex, &val);
            halide_assert(NULL, val & 0x1);

            if_tsan_post_lock(mutex);
        }
    }
};

}  // namespace Synchronization

}  // namespace Internal
}  // namespace Runtime
}  // namespace Halide

extern "C" {

WEAK void halide_mutex_lock(halide_mutex *mutex) {
    Halide::Runtime::Internal::Synchronization::fast_mutex *fast_mutex =
        (Halide::Runtime::Internal::Synchronization::fast_mutex *)mutex;
    fast_mutex->lock();
}

WEAK void halide_mutex_unlock(halide_mutex *mutex) {
    Halide::Runtime::Internal::Synchronization::fast_mutex *fast_mutex =
        (Halide::Runtime::Internal::Synchronization::fast_mutex *)mutex;
    fast_mutex->unlock();
}

WEAK void halide_cond_broadcast(struct halide_cond *cond) {
    Halide::Runtime::Internal::Synchronization::fast_cond *fast_cond =
        (Halide::Runtime::Internal::Synchronization::fast_cond *)cond;
    fast_cond->broadcast();
}

WEAK void halide_cond_signal(struct halide_cond *cond) {
    Halide::Runtime::Internal::Synchronization::fast_cond *fast_cond =
        (Halide::Runtime::Internal::Synchronization::fast_cond *)cond;
    fast_cond->signal();
}

WEAK void halide_cond_wait(struct halide_cond *cond, struct halide_mutex *mutex) {
    Halide::Runtime::Internal::Synchronization::fast_cond *fast_cond =
        (Halide::Runtime::Internal::Synchronization::fast_cond *)cond;
    Halide::Runtime::Internal::Synchronization::fast_mutex *fast_mutex =
        (Halide::Runtime::Internal::Synchronization::fast_mutex *)mutex;
    fast_cond->wait(fast_mutex);
}
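
// Typical client usage of the entry points above, for reference (an
// illustrative sketch, not code compiled here; `have_work`, `work_available`,
// and `work_queue_mutex` are hypothetical names). Both structs rely on
// zero-initialization, which is all the setup these primitives need.
//
//   struct halide_mutex work_queue_mutex = {0};
//   struct halide_cond work_available = {0};
//
//   // Consumer:
//   halide_mutex_lock(&work_queue_mutex);
//   while (!have_work()) {  // re-check the predicate after every wakeup
//       halide_cond_wait(&work_available, &work_queue_mutex);
//   }
//   /* consume work with the mutex held */
//   halide_mutex_unlock(&work_queue_mutex);
//
//   // Producer:
//   halide_mutex_lock(&work_queue_mutex);
//   /* enqueue work */
//   halide_cond_signal(&work_available);  // or halide_cond_broadcast
//   halide_mutex_unlock(&work_queue_mutex);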

// Actual definition of the mutex array.
struct halide_mutex_array {
    struct halide_mutex *array;
};

WEAK halide_mutex_array *halide_mutex_array_create(int sz) {
    // TODO: If sz is huge, we should probably hash it down to something smaller
    // in the accessors below. Check for deadlocks before doing so.
    halide_mutex_array *array = (halide_mutex_array *)halide_malloc(
        NULL, sizeof(halide_mutex_array));
    if (array == NULL) {
        // Will result in a failed assertion and a call to halide_error.
        return NULL;
    }
    array->array = (halide_mutex *)halide_malloc(
        NULL, sz * sizeof(halide_mutex));
    if (array->array == NULL) {
        halide_free(NULL, array);
        // Will result in a failed assertion and a call to halide_error.
        return NULL;
    }
    memset(array->array, 0, sz * sizeof(halide_mutex));
    return array;
}

WEAK void halide_mutex_array_destroy(void *user_context, void *array) {
    struct halide_mutex_array *arr_ptr = (struct halide_mutex_array *)array;
    halide_free(user_context, arr_ptr->array);
    halide_free(user_context, arr_ptr);
}

WEAK int halide_mutex_array_lock(struct halide_mutex_array *array, int entry) {
    halide_mutex_lock(&array->array[entry]);
    return 0;
}

WEAK int halide_mutex_array_unlock(struct halide_mutex_array *array, int entry) {
    halide_mutex_unlock(&array->array[entry]);
    return 0;
}
}