/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

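/* As a rough sketch of where these routines fit (the actual entry points
   live in loop.c and sections.c), a loop such as

     #pragma omp parallel for schedule(dynamic, 4)
     for (i = 0; i < n; i++)
       body (i);

   leads each thread to claim successive iteration ranges through
   gomp_iter_dynamic_next, or gomp_iter_dynamic_next_locked when
   HAVE_SYNC_BUILTINS is unavailable.  */
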
#include "libgomp.h"
#include <stdlib.h>


/* This function implements the STATIC scheduling method.  The caller
   should iterate *pstart <= x < *pend.  Return zero if there are more
   iterations to perform; nonzero if not.  Return less than zero if this
   thread has already received the very last iteration.  */
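/* A hedged sketch of the caller pattern (the real callers are the
   gomp_loop_*_next routines in loop.c); incr stands for the loop
   increment recorded in the work share:

     long s, e;
     while (gomp_iter_static_next (&s, &e) == 0)
       for (long x = s; x != e; x += incr)
	 body (x);

   Each successful call yields one contiguous block of iterations.  */
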
int
gomp_iter_static_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next;
      *pend = ws->end;
      thr->ts.static_trip = -1;
      return ws->next == ws->end;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size == 0)
    {
      unsigned long n, q, i, t;
      unsigned long s0, e0;
      long s, e;

      if (thr->ts.static_trip > 0)
	return 1;

      /* Compute the total number of iterations.  */
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
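      /* Worked example (illustrative): with ws->next == 0, ws->end == 10
	 and ws->incr == 3, s == 2 and n == (10 - 0 + 2) / 3 == 4, which
	 counts the iterations 0, 3, 6 and 9.  */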
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
	{
	  t = 0;
	  q++;
	}
      s0 = q * i + t;
      e0 = s0 + q;
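      /* Worked example (illustrative): with n == 10 and nthreads == 4,
	 q == 2 and t == 2, so threads 0 and 1 take the three-iteration
	 blocks [0,3) and [3,6), while threads 2 and 3 take the
	 two-iteration blocks [6,8) and [8,10).  */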

      /* Notice when no iterations are allocated for this thread.  */
      if (s0 >= e0)
	{
	  thr->ts.static_trip = 1;
	  return 1;
	}

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      unsigned long n, s0, e0, i, c;
      long s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
	 (if available) each time through the loop.  */

      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;
      c = ws->chunk_size;

      /* Each pass hands this thread the C-sized chunk whose index is
	 static_trip * nthreads plus our thread number; that is, the
	 threads cycle through the chunks round-robin.  */
      s0 = (thr->ts.static_trip * nthreads + i) * c;
      e0 = s0 + c;

      /* Detect when this thread's chunk falls entirely past the end, and
	 clamp a final partial chunk to the end of the iteration space.  */
      if (s0 >= n)
	return 1;
      if (e0 > n)
	e0 = n;
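
      /* Worked example (illustrative): with n == 10, nthreads == 2 and
	 c == 3, thread 0 takes [0,3) and then [6,9), while thread 1
	 takes [3,6) and then the clamped final chunk [9,10).  */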

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;

      if (e0 == n)
	thr->ts.static_trip = -1;
      else
	thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_static_next, except that the return value is true when
   an iteration range was claimed and false when the work share is
   exhausted.  This function must be called with ws->lock held.  */

bool
gomp_iter_dynamic_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, chunk, left;

  start = ws->next;
  if (start == ws->end)
    return false;

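  /* For dynamic scheduling the initialization code in loop.c appears to
     store chunk_size pre-multiplied by incr, so chunk is negative for a
     downward loop and the sign-dependent tests below merely clamp it to
     the remaining distance.  */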
  chunk = ws->chunk_size;
  left = ws->end - start;
  if (ws->incr < 0)
    {
      if (chunk < left)
	chunk = left;
    }
  else
    {
      if (chunk > left)
	chunk = left;
    }
  end = start + chunk;

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}


#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_dynamic_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, nend, chunk, incr;

  end = ws->end;
  incr = ws->incr;
  chunk = ws->chunk_size;

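  /* ws->mode is set up at initialization time (see loop.c): it is nonzero
     when the bounds guarantee that advancing ws->next past ws->end cannot
     overflow, in which case an unconditional fetch-and-add suffices and
     only the final partial chunk needs trimming.  */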
  if (__builtin_expect (ws->mode, 1))
    {
      long tmp = __sync_fetch_and_add (&ws->next, chunk);
      if (incr > 0)
	{
	  if (tmp >= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend > end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
      else
	{
	  if (tmp <= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend < end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
    }

  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  while (1)
    {
      long left = end - start;
      long tmp;

      if (start == end)
	return false;

      if (incr < 0)
	{
	  if (chunk < left)
	    chunk = left;
	}
      else
	{
	  if (chunk > left)
	    chunk = left;
	}
      nend = start + chunk;

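      /* __sync_val_compare_and_swap returns the value actually found in
	 ws->next: if it equals START our claim succeeded, otherwise some
	 other thread raced ahead and we retry from the value it left.  */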
      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_static_next, with the same boolean return convention
   as the dynamic functions above.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_guided_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  unsigned long n, q;
  long start, end;

  if (ws->next == ws->end)
    return false;

  start = ws->next;
  n = (ws->end - start) / ws->incr;
  q = (n + nthreads - 1) / nthreads;
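  /* Worked example (illustrative): with 100 iterations remaining, four
     threads and a chunk_size of 1, successive calls hand out chunks of
     25, 19, 14, 11, ... iterations, each the ceiling of the remaining
     count divided by nthreads.  */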

  if (q < ws->chunk_size)
    q = ws->chunk_size;
  if (q <= n)
    end = start + q * ws->incr;
  else
    end = ws->end;

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}

#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_guided_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  long start, end, nend, incr;
  unsigned long chunk_size;

  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  end = ws->end;
  incr = ws->incr;
  chunk_size = ws->chunk_size;

  while (1)
    {
      unsigned long n, q;
      long tmp;

      if (start == end)
	return false;

      n = (end - start) / incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
	q = chunk_size;
      if (__builtin_expect (q <= n, 1))
	nend = start + q * incr;
      else
	nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */
339