1 /*
2  * kmp_barrier.h
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_BARRIER_H
14 #define KMP_BARRIER_H
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 
// Pick an aligned allocate/free pair based on the KMP_HAVE_* feature macros.
// Every branch of this conditional provides KMP_ALIGNED_ALLOCATE(size,
// alignment) and KMP_ALIGNED_FREE(ptr); memory obtained from the former must
// be released with the latter from the same branch.
#if KMP_HAVE_XMMINTRIN_H && KMP_HAVE__MM_MALLOC
// Branch 1: _mm_malloc()/_mm_free() from <xmmintrin.h>.
#include <xmmintrin.h>
#define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _mm_free(ptr)
#elif KMP_HAVE_ALIGNED_ALLOC
// Branch 2: aligned_alloc()/free().
// KMP_ALGIN_UP rounds val up to the next multiple of alignment using integer
// division. (The name is spelled "ALGIN" in this file; it is kept unchanged.)
#define KMP_ALGIN_UP(val, alignment)                                           \
  (((val) + (alignment)-1) / (alignment) * (alignment))
// The requested size is rounded up to a multiple of alignment before it is
// passed to aligned_alloc(). Note the argument order is swapped relative to
// the macro: aligned_alloc() takes the alignment first.
#define KMP_ALIGNED_ALLOCATE(size, alignment)                                  \
  aligned_alloc(alignment, KMP_ALGIN_UP(size, alignment))
#define KMP_ALIGNED_FREE(ptr) free(ptr)
// Branch 3: posix_memalign(), wrapped in an inline function.
#elif KMP_HAVE_POSIX_MEMALIGN
// Allocate `size` bytes aligned to `alignment` using posix_memalign().
//
// Returns the allocated block on success. Returns NULL when posix_memalign()
// reports an error (non-zero return value); nothing is allocated in that case.
// The result is released with free() (see KMP_ALIGNED_FREE in this branch).
static inline void *KMP_ALIGNED_ALLOCATE(size_t size, size_t alignment) {
  // Start from a known value: posix_memalign() is not required to store
  // anything through its first argument when it fails, so the pointer must
  // never be read while still uninitialized.
  void *ptr = NULL;
  if (posix_memalign(&ptr, alignment, size) != 0) {
    // A failed call allocated nothing, so there is nothing to free here.
    return NULL;
  }
  return ptr;
}
// Blocks returned by the posix_memalign() wrapper are released with free().
#define KMP_ALIGNED_FREE(ptr) free(ptr)
#elif KMP_HAVE__ALIGNED_MALLOC
// Branch 4: _aligned_malloc()/_aligned_free() from <malloc.h>.
#include <malloc.h>
#define KMP_ALIGNED_ALLOCATE(size, alignment) _aligned_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else
// Fallback when no aligned allocator was detected: the alignment argument is
// dropped, so the result is only as aligned as KMP_INTERNAL_MALLOC makes it.
#define KMP_ALIGNED_ALLOCATE(size, alignment) KMP_INTERNAL_MALLOC(size)
#define KMP_ALIGNED_FREE(ptr) KMP_INTERNAL_FREE(ptr)
#endif
49 
// Use four cache lines: MLC tends to prefetch the next or previous cache line
// creating a possible fake conflict between cores, so this is the only way to
// guarantee that no such prefetch can happen.
// The definition is guarded, so a value supplied before this point is kept.
#ifndef KMP_FOURLINE_ALIGN_CACHE
#define KMP_FOURLINE_ALIGN_CACHE KMP_ALIGN(4 * CACHE_LINE)
#endif

// Compile-time switch, fixed to 0 in this header. The code it controls is not
// part of this file.
#define KMP_OPTIMIZE_FOR_REDUCTIONS 0
58 
59 class distributedBarrier {
60   struct flags_s {
61     kmp_uint32 volatile KMP_FOURLINE_ALIGN_CACHE stillNeed;
62   };
63 
64   struct go_s {
65     std::atomic<kmp_uint64> KMP_FOURLINE_ALIGN_CACHE go;
66   };
67 
68   struct iter_s {
69     kmp_uint64 volatile KMP_FOURLINE_ALIGN_CACHE iter;
70   };
71 
72   struct sleep_s {
73     std::atomic<bool> KMP_FOURLINE_ALIGN_CACHE sleep;
74   };
75 
76   void init(size_t nthr);
77   void resize(size_t nthr);
78   void computeGo(size_t n);
79   void computeVarsForN(size_t n);
80 
81 public:
82   enum {
83     MAX_ITERS = 3,
84     MAX_GOS = 8,
85     IDEAL_GOS = 4,
86     IDEAL_CONTENTION = 16,
87   };
88 
89   flags_s *flags[MAX_ITERS];
90   go_s *go;
91   iter_s *iter;
92   sleep_s *sleep;
93 
94   size_t KMP_ALIGN_CACHE num_threads; // number of threads in barrier
95   size_t KMP_ALIGN_CACHE max_threads; // size of arrays in data structure
96   // number of go signals each requiring one write per iteration
97   size_t KMP_ALIGN_CACHE num_gos;
98   // number of groups of gos
99   size_t KMP_ALIGN_CACHE num_groups;
100   // threads per go signal
101   size_t KMP_ALIGN_CACHE threads_per_go;
102   bool KMP_ALIGN_CACHE fix_threads_per_go;
103   // threads per group
104   size_t KMP_ALIGN_CACHE threads_per_group;
105   // number of go signals in a group
106   size_t KMP_ALIGN_CACHE gos_per_group;
107   void *team_icvs;
108 
109   distributedBarrier() = delete;
110   ~distributedBarrier() = delete;
111 
112   // Used instead of constructor to create aligned data
allocate(int nThreads)113   static distributedBarrier *allocate(int nThreads) {
114     distributedBarrier *d = (distributedBarrier *)KMP_ALIGNED_ALLOCATE(
115         sizeof(distributedBarrier), 4 * CACHE_LINE);
116     if (!d) {
117       KMP_FATAL(MemoryAllocFailed);
118     }
119     d->num_threads = 0;
120     d->max_threads = 0;
121     for (int i = 0; i < MAX_ITERS; ++i)
122       d->flags[i] = NULL;
123     d->go = NULL;
124     d->iter = NULL;
125     d->sleep = NULL;
126     d->team_icvs = NULL;
127     d->fix_threads_per_go = false;
128     // calculate gos and groups ONCE on base size
129     d->computeGo(nThreads);
130     d->init(nThreads);
131     return d;
132   }
133 
deallocate(distributedBarrier * db)134   static void deallocate(distributedBarrier *db) { KMP_ALIGNED_FREE(db); }
135 
update_num_threads(size_t nthr)136   void update_num_threads(size_t nthr) { init(nthr); }
137 
need_resize(size_t new_nthr)138   bool need_resize(size_t new_nthr) { return (new_nthr > max_threads); }
get_num_threads()139   size_t get_num_threads() { return num_threads; }
140   kmp_uint64 go_release();
141   void go_reset();
142 };
143 
144 #endif // KMP_BARRIER_H
145