/* Copyright (C) 2005, 2008, 2009, 2011 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU OpenMP Library (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */
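
/* These iterators are not called by user code directly.  Roughly speaking
   (a sketch, not the exact libgomp call graph), the GOMP_loop_ull_* entry
   points in loop_ull.c hand out chunks through these functions and the
   compiler-generated code then runs each chunk; for an ascending loop the
   pattern is approximately

	unsigned long long s, e;
	while (gomp_iter_ull_dynamic_next (&s, &e))
	  for (unsigned long long x = s; x < e; x += incr)
	    body (x);

   where `incr' and `body' stand in for the loop's increment and body.  */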

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than zero if this thread received
   the very last iteration.  */
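
/* thr->ts.static_trip tracks a thread's progress through a STATIC work
   share: it counts the trips this thread has already made through the
   scheduling loop and is set to -1 once the thread has been handed the
   chunk containing the final iteration, so later calls return -1 at once.  */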

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
	return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
	 if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
	{
	  t = 0;
	  q++;
	}
      s0 = q * i + t;
      e0 = s0 + q;
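      /* For example (illustrative numbers only): with n = 10 iterations
	 and nthreads = 4, q = 2 and t = 2, so threads 0 and 1 each get
	 3 iterations ([0,3) and [3,6)) while threads 2 and 3 each get
	 2 ([6,8) and [8,10)).  */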

      /* Notice when no iterations allocated for this thread.  */
      if (s0 >= e0)
	{
	  thr->ts.static_trip = 1;
	  return 1;
	}

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
	 (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess is a C sized chunk positioned nthreads iterations
	 in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;
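      /* Illustrative example: with c = 4, nthreads = 3 and team_id i = 1,
	 successive trips yield s0 = 4, 16, 28, ...; chunks are handed out
	 round-robin in team_id order.  */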

      /* Detect overflow.  */
      if (s0 >= n)
	return 1;
      if (e0 > n)
	e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
	thr->ts.static_trip = -1;
      else
	thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
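  /* For a descending loop (ws->mode & 2) the increment is negative:
     chunk (already scaled by the increment when the loop was set up) and
     left are then negative quantities stored in unsigned variables, so
     the sense of the clamping comparison below is reversed.  */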
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
	chunk = left;
    }
  else
    {
      if (chunk > left)
	chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

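  /* Bit 0 of ws->mode is set when the loop is initialized (dynamic
     scheduling only) and the bounds guarantee that ws->next_ull cannot
     wrap around even if every thread runs one chunk past the end; a
     plain atomic fetch-and-add is then sufficient.  */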
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
	{
	  if (tmp >= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend > end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
      else
	{
	  if (tmp <= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend < end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
    }

  start = ws->next_ull;
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
	return false;

      if (__builtin_expect (ws->mode & 2, 0))
	{
	  if (chunk < left)
	    chunk = left;
	}
      else
	{
	  if (chunk > left)
	    chunk = left;
	}
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

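      /* Another thread advanced ws->next_ull first; retry from the
	 value it installed.  */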
      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */
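
/* Under GUIDED scheduling each successful call hands out roughly
   ceil (remaining / nthreads) iterations, but never fewer than chunk_size,
   so chunks shrink as the loop drains: e.g. with 100 iterations left and
   4 threads, successive grabs take 25, 19, 14, ... iterations until only
   chunk_size or fewer remain.  */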

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = ws->next_ull;
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
	return false;

      if (__builtin_expect (ws->mode, 0) == 0)
	n = (end - start) / incr;
      else
	n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
	q = chunk_size;
      if (__builtin_expect (q <= n, 1))
	nend = start + q * incr;
      else
	nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */