/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread had
   received the absolutely last iteration.  */

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;

      /* Notice when no iterations allocated for this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Initial guess is a C sized chunk positioned nthreads iterations
         in, offset by our thread number.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;

      /* Detect overflow.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}
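
/* Worked example (illustrative only, not part of the upstream sources;
   assumes incr_ull == 1 and a loop starting at 0): with the zero-chunk
   ("unspecified") case above, 4 threads and n = 103 iterations, q = 25
   and t = 3, so the split is

       thread 0: [ 0, 26)    thread 1: [26, 52)
       thread 2: [52, 78)    thread 3: [78, 103)

   i.e. the first n % nthreads threads each receive one extra iteration.
   With chunk_size_ull = 10 instead, thread 1 is handed [10, 20) on its
   first call, [50, 60) on its second and [90, 100) on its third; on the
   fourth call s0 = 130 >= n, so the function returns nonzero and the
   thread is done.  */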

/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */
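
/* Worked example (illustrative only, not part of the upstream sources;
   assumes incr_ull == 1, next_ull starting at 0, end_ull == 25 and
   chunk_size_ull == 10): dynamic scheduling hands the next chunk to
   whichever thread asks first, so three successive calls claim
   [0, 10), [10, 20) and [20, 25), and any later call returns false.
   Bit 0 of ws->mode selects the fetch-and-add fast path, which lets
   next_ull overshoot end_ull; presumably the loop-initialization code
   only sets it when that overshoot cannot wrap around.  Bit 1 marks a
   loop that runs downward, which is why the chunk clamping above is
   reversed in that branch.  */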

/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */
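
/* Worked example (illustrative only, not part of the upstream sources;
   assumes incr_ull == 1): guided scheduling shrinks the chunk as the
   remaining work shrinks.  Each grab takes q = ceil(remaining / nthreads)
   iterations, but never fewer than chunk_size_ull.  With 1000 iterations,
   4 threads and chunk_size_ull == 1, successive grabs (by whichever
   threads arrive first) are 250, 188, 141, 106, ... tapering down toward
   the chunk_size_ull floor near the end of the loop.  */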