1 /* Copyright (C) 2005-2018 Free Software Foundation, Inc. 2 Contributed by Richard Henderson <rth@redhat.com>. 3 4 This file is part of the GNU Offloading and Multi Processing Library 5 (libgomp). 6 7 Libgomp is free software; you can redistribute it and/or modify it 8 under the terms of the GNU General Public License as published by 9 the Free Software Foundation; either version 3, or (at your option) 10 any later version. 11 12 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY 13 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 14 FOR A PARTICULAR PURPOSE. See the GNU General Public License for 15 more details. 16 17 Under Section 7 of GPL version 3, you are granted additional 18 permissions described in the GCC Runtime Library Exception, version 19 3.1, as published by the Free Software Foundation. 20 21 You should have received a copy of the GNU General Public License and 22 a copy of the GCC Runtime Library Exception along with this program; 23 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 24 <http://www.gnu.org/licenses/>. */ 25 26 /* This file contains routines for managing work-share iteration, both 27 for loops and sections. */ 28 29 #include "libgomp.h" 30 #include <stdlib.h> 31 32 33 /* This function implements the STATIC scheduling method. The caller should 34 iterate *pstart <= x < *pend. Return zero if there are more iterations 35 to perform; nonzero if not. Return less than 0 if this thread had 36 received the absolutely last iteration. */ 37 38 int 39 gomp_iter_static_next (long *pstart, long *pend) 40 { 41 struct gomp_thread *thr = gomp_thread (); 42 struct gomp_team *team = thr->ts.team; 43 struct gomp_work_share *ws = thr->ts.work_share; 44 unsigned long nthreads = team ? team->nthreads : 1; 45 46 if (thr->ts.static_trip == -1) 47 return -1; 48 49 /* Quick test for degenerate teams and orphaned constructs. */ 50 if (nthreads == 1) 51 { 52 *pstart = ws->next; 53 *pend = ws->end; 54 thr->ts.static_trip = -1; 55 return ws->next == ws->end; 56 } 57 58 /* We interpret chunk_size zero as "unspecified", which means that we 59 should break up the iterations such that each thread makes only one 60 trip through the outer loop. */ 61 if (ws->chunk_size == 0) 62 { 63 unsigned long n, q, i, t; 64 unsigned long s0, e0; 65 long s, e; 66 67 if (thr->ts.static_trip > 0) 68 return 1; 69 70 /* Compute the total number of iterations. */ 71 s = ws->incr + (ws->incr > 0 ? -1 : 1); 72 n = (ws->end - ws->next + s) / ws->incr; 73 i = thr->ts.team_id; 74 75 /* Compute the "zero-based" start and end points. That is, as 76 if the loop began at zero and incremented by one. */ 77 q = n / nthreads; 78 t = n % nthreads; 79 if (i < t) 80 { 81 t = 0; 82 q++; 83 } 84 s0 = q * i + t; 85 e0 = s0 + q; 86 87 /* Notice when no iterations allocated for this thread. */ 88 if (s0 >= e0) 89 { 90 thr->ts.static_trip = 1; 91 return 1; 92 } 93 94 /* Transform these to the actual start and end numbers. */ 95 s = (long)s0 * ws->incr + ws->next; 96 e = (long)e0 * ws->incr + ws->next; 97 98 *pstart = s; 99 *pend = e; 100 thr->ts.static_trip = (e0 == n ? -1 : 1); 101 return 0; 102 } 103 else 104 { 105 unsigned long n, s0, e0, i, c; 106 long s, e; 107 108 /* Otherwise, each thread gets exactly chunk_size iterations 109 (if available) each time through the loop. */ 110 111 s = ws->incr + (ws->incr > 0 ? -1 : 1); 112 n = (ws->end - ws->next + s) / ws->incr; 113 i = thr->ts.team_id; 114 c = ws->chunk_size; 115 116 /* Initial guess is a C sized chunk positioned nthreads iterations 117 in, offset by our thread number. */ 118 s0 = (thr->ts.static_trip * nthreads + i) * c; 119 e0 = s0 + c; 120 121 /* Detect overflow. */ 122 if (s0 >= n) 123 return 1; 124 if (e0 > n) 125 e0 = n; 126 127 /* Transform these to the actual start and end numbers. */ 128 s = (long)s0 * ws->incr + ws->next; 129 e = (long)e0 * ws->incr + ws->next; 130 131 *pstart = s; 132 *pend = e; 133 134 if (e0 == n) 135 thr->ts.static_trip = -1; 136 else 137 thr->ts.static_trip++; 138 return 0; 139 } 140 } 141 142 143 /* This function implements the DYNAMIC scheduling method. Arguments are 144 as for gomp_iter_static_next. This function must be called with ws->lock 145 held. */ 146 147 bool 148 gomp_iter_dynamic_next_locked (long *pstart, long *pend) 149 { 150 struct gomp_thread *thr = gomp_thread (); 151 struct gomp_work_share *ws = thr->ts.work_share; 152 long start, end, chunk, left; 153 154 start = ws->next; 155 if (start == ws->end) 156 return false; 157 158 chunk = ws->chunk_size; 159 left = ws->end - start; 160 if (ws->incr < 0) 161 { 162 if (chunk < left) 163 chunk = left; 164 } 165 else 166 { 167 if (chunk > left) 168 chunk = left; 169 } 170 end = start + chunk; 171 172 ws->next = end; 173 *pstart = start; 174 *pend = end; 175 return true; 176 } 177 178 179 #ifdef HAVE_SYNC_BUILTINS 180 /* Similar, but doesn't require the lock held, and uses compare-and-swap 181 instead. Note that the only memory value that changes is ws->next. */ 182 183 bool 184 gomp_iter_dynamic_next (long *pstart, long *pend) 185 { 186 struct gomp_thread *thr = gomp_thread (); 187 struct gomp_work_share *ws = thr->ts.work_share; 188 long start, end, nend, chunk, incr; 189 190 end = ws->end; 191 incr = ws->incr; 192 chunk = ws->chunk_size; 193 194 if (__builtin_expect (ws->mode, 1)) 195 { 196 long tmp = __sync_fetch_and_add (&ws->next, chunk); 197 if (incr > 0) 198 { 199 if (tmp >= end) 200 return false; 201 nend = tmp + chunk; 202 if (nend > end) 203 nend = end; 204 *pstart = tmp; 205 *pend = nend; 206 return true; 207 } 208 else 209 { 210 if (tmp <= end) 211 return false; 212 nend = tmp + chunk; 213 if (nend < end) 214 nend = end; 215 *pstart = tmp; 216 *pend = nend; 217 return true; 218 } 219 } 220 221 start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED); 222 while (1) 223 { 224 long left = end - start; 225 long tmp; 226 227 if (start == end) 228 return false; 229 230 if (incr < 0) 231 { 232 if (chunk < left) 233 chunk = left; 234 } 235 else 236 { 237 if (chunk > left) 238 chunk = left; 239 } 240 nend = start + chunk; 241 242 tmp = __sync_val_compare_and_swap (&ws->next, start, nend); 243 if (__builtin_expect (tmp == start, 1)) 244 break; 245 246 start = tmp; 247 } 248 249 *pstart = start; 250 *pend = nend; 251 return true; 252 } 253 #endif /* HAVE_SYNC_BUILTINS */ 254 255 256 /* This function implements the GUIDED scheduling method. Arguments are 257 as for gomp_iter_static_next. This function must be called with the 258 work share lock held. */ 259 260 bool 261 gomp_iter_guided_next_locked (long *pstart, long *pend) 262 { 263 struct gomp_thread *thr = gomp_thread (); 264 struct gomp_work_share *ws = thr->ts.work_share; 265 struct gomp_team *team = thr->ts.team; 266 unsigned long nthreads = team ? team->nthreads : 1; 267 unsigned long n, q; 268 long start, end; 269 270 if (ws->next == ws->end) 271 return false; 272 273 start = ws->next; 274 n = (ws->end - start) / ws->incr; 275 q = (n + nthreads - 1) / nthreads; 276 277 if (q < ws->chunk_size) 278 q = ws->chunk_size; 279 if (q <= n) 280 end = start + q * ws->incr; 281 else 282 end = ws->end; 283 284 ws->next = end; 285 *pstart = start; 286 *pend = end; 287 return true; 288 } 289 290 #ifdef HAVE_SYNC_BUILTINS 291 /* Similar, but doesn't require the lock held, and uses compare-and-swap 292 instead. Note that the only memory value that changes is ws->next. */ 293 294 bool 295 gomp_iter_guided_next (long *pstart, long *pend) 296 { 297 struct gomp_thread *thr = gomp_thread (); 298 struct gomp_work_share *ws = thr->ts.work_share; 299 struct gomp_team *team = thr->ts.team; 300 unsigned long nthreads = team ? team->nthreads : 1; 301 long start, end, nend, incr; 302 unsigned long chunk_size; 303 304 start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED); 305 end = ws->end; 306 incr = ws->incr; 307 chunk_size = ws->chunk_size; 308 309 while (1) 310 { 311 unsigned long n, q; 312 long tmp; 313 314 if (start == end) 315 return false; 316 317 n = (end - start) / incr; 318 q = (n + nthreads - 1) / nthreads; 319 320 if (q < chunk_size) 321 q = chunk_size; 322 if (__builtin_expect (q <= n, 1)) 323 nend = start + q * incr; 324 else 325 nend = end; 326 327 tmp = __sync_val_compare_and_swap (&ws->next, start, nend); 328 if (__builtin_expect (tmp == start, 1)) 329 break; 330 331 start = tmp; 332 } 333 334 *pstart = start; 335 *pend = nend; 336 return true; 337 } 338 #endif /* HAVE_SYNC_BUILTINS */ 339