/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

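/* Throughout this file, ws->mode carries per-loop flags prepared by
   gomp_loop_ull_init in loop_ull.c.  Bit 1 (value 2) is set when the loop
   counts downwards, in which case increments, chunk sizes and "remaining
   iteration" counts are negative quantities stored in unsigned form.  Bit 0
   (value 1) is set for dynamic scheduling when the initializer could prove
   that bumping next_ull with an unconditional fetch-and-add cannot wrap
   around; see loop_ull.c for the exact conditions.  */
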
/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return a value less than zero if this thread
   received the very last iteration of the loop.  */

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
	return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
	 if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
	{
	  t = 0;
	  q++;
	}
      s0 = q * i + t;
      e0 = s0 + q;
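      /* For example, with n = 10 iterations and nthreads = 4 the threads
	 receive the zero-based ranges [0,3), [3,6), [6,8) and [8,10): the
	 n % nthreads leftover iterations go to the lowest-numbered threads,
	 one extra iteration each.  */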

      /* Notice when no iterations are allocated for this thread.  */
      if (s0 >= e0)
	{
	  thr->ts.static_trip = 1;
	  return 1;
	}

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
	 (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
	n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
	n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Chunks are handed out round-robin: on trip number static_trip,
	 this thread's chunk starts static_trip * nthreads + i chunks into
	 the (zero-based) iteration space.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;
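      /* E.g. with nthreads = 4 and chunk size C, thread 2 covers the
	 zero-based iterations [2*C, 3*C) on trip 0, [6*C, 7*C) on trip 1,
	 and so on.  */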

      /* Detect when the chunk runs off the end of the iteration space:
	 bail out if nothing is left for this thread, clamp otherwise.  */
      if (s0 >= n)
	return 1;
      if (e0 > n)
	e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
	thr->ts.static_trip = -1;
      else
	thr->ts.static_trip++;
      return 0;
    }
}
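
/* A minimal sketch of how a caller might drive this routine, where incr is
   the loop increment and body () stands for the loop body; both names are
   illustrative only, the real callers being the GOMP_loop_ull_* entry
   points in loop_ull.c:

     gomp_ull istart, iend;
     while (gomp_iter_ull_static_next (&istart, &iend) == 0)
       for (gomp_ull i = istart; i != iend; i += incr)
	 body (i);
*/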


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
	chunk = left;
    }
  else
    {
      if (chunk > left)
	chunk = left;
    }
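  /* For a downward loop (ws->mode & 2), chunk and left are negative values
     stored in unsigned form, so the magnitude clamp above has the
     comparison reversed.  */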
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock to be held, and uses
   compare-and-swap instead.  Note that the only memory value that changes
   is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

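  /* Bit 0 of ws->mode is set only when gomp_loop_ull_init determined that
     adding chunks to next_ull with a plain fetch-and-add can never wrap
     around, so overshooting end_ull below is harmless.  */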
  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
	{
	  if (tmp >= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend > end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
      else
	{
	  if (tmp <= end)
	    return false;
	  nend = tmp + chunk;
	  if (nend < end)
	    nend = end;
	  *pstart = tmp;
	  *pend = nend;
	  return true;
	}
    }

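  /* Fall back to a compare-and-swap loop: clamp the chunk to the iterations
     that remain so next_ull never moves past end_ull, and retry from the
     value another thread published whenever the CAS fails.  */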
  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
	return false;

      if (__builtin_expect (ws->mode & 2, 0))
	{
	  if (chunk < left)
	    chunk = left;
	}
      else
	{
	  if (chunk > left)
	    chunk = left;
	}
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
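
  /* Guided scheduling hands out roughly 1/nthreads of the iterations that
     remain, never less than the requested chunk size, so the pieces shrink
     as the loop nears completion.  */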
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock to be held, and uses
   compare-and-swap instead.  Note that the only memory value that changes
   is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

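  /* Recompute the grab from scratch on every attempt: when the
     compare-and-swap below loses a race, another thread has advanced
     next_ull and the remaining iteration count has changed.  */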
  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
	return false;

      if (__builtin_expect (ws->mode, 0) == 0)
	n = (end - start) / incr;
      else
	n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
	q = chunk_size;
      if (__builtin_expect (q <= n, 1))
	nend = start + q * incr;
      else
	nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
	break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */