/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file handles the ORDERED construct.  */

#include "libgomp.h"
#include <stdarg.h>
#include <string.h>
#include "doacross.h"


/* This function is called when first allocating an iteration block.  That
   is, the thread is not currently on the queue.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_first (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* Append our team id at the tail of the circular queue held in
     ordered_team_ids: ordered_cur is the head index and
     ordered_num_used the current queue length, so head + length
     (mod nthreads) is the free tail slot.  */
  index = ws->ordered_cur + ws->ordered_num_used;
  if (index >= team->nthreads)
    index -= team->nthreads;
  ws->ordered_team_ids[index] = thr->ts.team_id;

  /* If this is the first and only thread in the queue, then there is
     no one to release us when we get to our ordered section.  Post to
     our own release queue now so that we won't block later.  */
  if (ws->ordered_num_used++ == 0)
    gomp_sem_post (team->ordered_release[thr->ts.team_id]);
}

/* This function is called when completing the last iteration block.  That
   is, there are no more iterations to perform and so the thread should be
   removed from the queue entirely.  Because of the way ORDERED blocks are
   managed, it follows that we currently own access to the ORDERED block,
   and should now pass it on to the next thread.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_last (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If we're not the last thread in the queue, then wake the next.  */
  if (--ws->ordered_num_used > 0)
    {
      /* Advance the circular queue head, then release the semaphore of
	 the thread now at the front so it may enter its ordered
	 section.  */
      unsigned next = ws->ordered_cur + 1;
      if (next == team->nthreads)
	next = 0;
      ws->ordered_cur = next;

      next_id = ws->ordered_team_ids[next];
      gomp_sem_post (team->ordered_release[next_id]);
    }
}


/* This function is called when allocating a subsequent allocation block.
   That is, we're done with the current iteration block and we're allocating
   another.  This is the logical combination of a call to gomp_ordered_last
   followed by a call to gomp_ordered_first.  The work-share lock must be
   held on entry.
*/

void
gomp_ordered_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index, next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If there's only one thread in the queue, that must be us.  */
  if (ws->ordered_num_used == 1)
    {
      /* We have a similar situation as in gomp_ordered_first
	 where we need to post to our own release semaphore.  */
      gomp_sem_post (team->ordered_release[thr->ts.team_id]);
      return;
    }

  /* If the queue is entirely full, then we move ourself to the end of
     the queue merely by incrementing ordered_cur.  Only if it's not
     full do we have to write our id.  */
  if (ws->ordered_num_used < team->nthreads)
    {
      index = ws->ordered_cur + ws->ordered_num_used;
      if (index >= team->nthreads)
	index -= team->nthreads;
      ws->ordered_team_ids[index] = thr->ts.team_id;
    }

  /* Advance the circular queue head and wake the thread now at the
     front.  When the queue was full, our own id is already stored at
     what becomes the new tail slot, which is why no write was needed
     above.  */
  index = ws->ordered_cur + 1;
  if (index == team->nthreads)
    index = 0;
  ws->ordered_cur = index;

  next_id = ws->ordered_team_ids[index];
  gomp_sem_post (team->ordered_release[next_id]);
}


/* This function is called when a statically scheduled loop is first
   being created.  */

void
gomp_ordered_static_init (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  if (team == NULL || team->nthreads == 1)
    return;

  /* With a static schedule thread 0 owns the first chunk, so let it
     straight into its ordered section.  */
  gomp_sem_post (team->ordered_release[0]);
}

/* This function is called when a statically scheduled loop is moving to
   the next allocation block.  Static schedules are not first come first
   served like the others, so we're to move to the numerically next thread,
   not the next thread on a list.
The work-share lock should *not* be held
   on entry.  */

void
gomp_ordered_static_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned id = thr->ts.team_id;

  if (team == NULL || team->nthreads == 1)
    return;

  ws->ordered_owner = -1;

  /* This thread currently owns the lock.  Increment the owner.  */
  if (++id == team->nthreads)
    id = 0;
  /* For static schedules only slot 0 of the queue is used; it records
     which team id owns the ordered section next.  */
  ws->ordered_team_ids[0] = id;
  gomp_sem_post (team->ordered_release[id]);
}

/* This function is called when we need to assert that the thread owns the
   ordered section.  Due to the problem of posted-but-not-waited semaphores,
   this needs to happen before completing a loop iteration.  */

void
gomp_ordered_sync (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;

  /* Work share constructs can be orphaned.  But this clearly means that
     we are the only thread, and so we automatically own the section.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* ??? I believe it to be safe to access this data without taking the
     ws->lock.  The only presumed race condition is with the previous
     thread on the queue incrementing ordered_cur such that it points
     to us, concurrently with our check below.  But our team_id is
     already present in the queue, and the other thread will always
     post to our release semaphore.  So the two cases are that we will
     either win the race and momentarily block on the semaphore, or lose
     the race and find the semaphore already unlocked and so not block.
     Either way we get correct results.
     However, there is an implicit flush on entry to an ordered region,
     so we do need to have a barrier here.  If we were taking a lock
     this could be MEMMODEL_RELEASE since the acquire would be covered
     by the lock.  */

  __atomic_thread_fence (MEMMODEL_ACQ_REL);
  /* ordered_owner caches the fact that we already waited on our
     semaphore for this ownership span, avoiding a second wait.  */
  if (ws->ordered_owner != thr->ts.team_id)
    {
      gomp_sem_wait (team->ordered_release[thr->ts.team_id]);
      ws->ordered_owner = thr->ts.team_id;
    }
}

/* This function is called by user code when encountering the start of an
   ORDERED block.  We must check to see if the current thread is at the
   head of the queue, and if not, block.  */

#ifdef HAVE_ATTRIBUTE_ALIAS
extern void GOMP_ordered_start (void)
	__attribute__((alias ("gomp_ordered_sync")));
#else
void
GOMP_ordered_start (void)
{
  gomp_ordered_sync ();
}
#endif

/* This function is called by user code when encountering the end of an
   ORDERED block.  With the current ORDERED implementation there's nothing
   for us to do.

   However, the current implementation has a flaw in that it does not allow
   the next thread into the ORDERED section immediately after the current
   thread exits the ORDERED section in its last iteration.  The existence
   of this function allows the implementation to change.  */

void
GOMP_ordered_end (void)
{
}

/* DOACROSS initialization.  */

/* Maximum number of bits usable when flattening all collapsed loop
   counts into a single unsigned long.  */
#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)

void
gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    return;

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
	return;

      /* Compute how many bits each dimension's count needs; if the sum
	 fits in one unsigned long we can use the flattened (single
	 atomic word) representation.  */
      if (num_bits <= MAX_COLLAPSED_BITS)
	{
	  unsigned int this_bits;
	  if (counts[i] == 1)
	    this_bits = 1;
	  else
	    this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
			- __builtin_clzl (counts[i] - 1);
	  if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
	    {
	      bits[i] = this_bits;
	      num_bits += this_bits;
	    }
	  else
	    /* Overflow: mark flattening impossible.  */
	    num_bits = MAX_COLLAPSED_BITS + 1;
	}
    }

  /* One array entry per scheduling "chunk owner": per thread for
     static, per iteration for guided, per chunk otherwise.  */
  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      elt_sz = sizeof (unsigned long) * ncounts;
      shift_sz = 0;
    }
  /* Round each entry up to 64 bytes, presumably to keep entries on
     separate cache lines and avoid false sharing.  */
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
			  + shift_sz);
  doacross->chunk_size = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  /* Place the array after the struct (and the shift_counts tail),
     aligned to 64 bytes; the extra 63 bytes in the allocation above
     cover the alignment slack.  */
  doacross->array = (unsigned char *)
		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
		     & ~(uintptr_t) 63);
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      /* Innermost dimension gets shift 0; outer dimensions are shifted
	 left past all inner dimensions' bits.  */
      for (i = ncounts; i > 0; i--)
	{
	  doacross->shift_counts[i - 1] = shift_count;
	  shift_count += bits[i - 1];
	}
      for (ent = 0; ent < num_ents; ent++)
	*(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
	      sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      /* Precompute the static no-chunk partition: the first t threads
	 get q + 1 iterations each, the rest get q; boundary is the
	 first iteration owned by the second group.  */
      unsigned long q = counts[0] / num_ents;
      unsigned long t = counts[0] % num_ents;
      doacross->boundary = t * (q + 1);
      doacross->q = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}

/* DOACROSS
POST operation.  */

void
GOMP_doacross_post (long *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  /* doacross is NULL e.g. when gomp_doacross_init bailed out early
     (single thread or a zero count); a plain barrier suffices.  */
  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  /* Select the array entry owned by the current chunk.  */
  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
					    + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      /* Pack all loop counts into one word; +1 so that a published
	 value is always nonzero (0 means "nothing posted yet").  */
      unsigned long flattened
	= (unsigned long) counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
	flattened |= (unsigned long) counts[i]
		     << doacross->shift_counts[i];
      flattened++;
      /* Skip the redundant store if the value is already published,
	 but still provide release ordering for the flush semantics.  */
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
	__atomic_thread_fence (MEMMODEL_RELEASE);
      else
	__atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  /* Non-flattened representation: one word per dimension, updated from
     innermost to outermost.  */
  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  for (i = doacross->ncounts; i-- > 0; )
    {
      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
	__atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
    }
}

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_wait (long first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  /* See GOMP_doacross_post: nothing to wait for, just a barrier.  */
  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  /* Find the entry that the awaited iteration (first, ...) was or will
     be posted into.  */
  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size == 0)
	{
	  /* Static no-chunk partition precomputed in
	     gomp_doacross_init: q+1-sized blocks below boundary,
	     q-sized blocks above it.  */
	  if (first < doacross->boundary)
	    ent = first / (doacross->q + 1);
	  else
	    ent = (first - doacross->boundary) / doacross->q
		  + doacross->t;
	}
      else
	ent = first / ws->chunk_size % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
					    + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
	= (unsigned long) first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
	flattened |= (unsigned long) va_arg (ap, long)
		     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      /* Note flattened is compared without the +1 used on the post
	 side, so flattened < cur means the awaited iteration has
	 already been posted.  */
      if (flattened < cur)
	{
	  __atomic_thread_fence (MEMMODEL_RELEASE);
	  va_end (ap);
	  return;
	}
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  /* Non-flattened path: spin until the posted vector is
     lexicographically greater than the awaited one.  */
  do
    {
      va_start (ap, first);
      for (i = 0; i < doacross->ncounts; i++)
	{
	  unsigned long thisv
	    = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
	  unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
	  if (thisv < cur)
	    {
	      /* Strictly greater in an outer dimension: satisfied
		 regardless of inner dimensions.  */
	      i = doacross->ncounts;
	      break;
	    }
	  if (thisv > cur)
	    break;
	}
      va_end (ap);
      if (i == doacross->ncounts)
	break;
      cpu_relax ();
    }
  while (1);
  __sync_synchronize ();
}

/* Shorthand for the unsigned long long variants below.  */
typedef unsigned long long gomp_ull;

/* DOACROSS initialization for unsigned long long loop counters.
   Mirrors gomp_doacross_init; see the comments there.  */

void
gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    return;

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
	return;

      if (num_bits <= MAX_COLLAPSED_BITS)
	{
	  unsigned int this_bits;
	  if (counts[i] == 1)
	    this_bits = 1;
	  else
	    this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
			- __builtin_clzll (counts[i] - 1);
	  /* Note flattening still targets an unsigned long word, so
	     the budget is MAX_COLLAPSED_BITS, not the ull width.  */
	  if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
	    {
	      bits[i] = this_bits;
	      num_bits += this_bits;
	    }
	  else
	    num_bits = MAX_COLLAPSED_BITS + 1;
	}
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      /* When gomp_ull is wider than unsigned long, each count is
	 stored as two unsigned long halves (see the post/wait
	 functions below).  */
      if (sizeof (gomp_ull) == sizeof (unsigned long))
	elt_sz = sizeof (gomp_ull) * ncounts;
      else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
	elt_sz = sizeof (unsigned long) * 2 * ncounts;
      else
	abort ();
      shift_sz = 0;
    }
  /* 64-byte entries, presumably to avoid false sharing.  */
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
			  + shift_sz);
  doacross->chunk_size_ull = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->boundary = 0;
  doacross->array = (unsigned char *)
		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
		     & ~(uintptr_t) 63);
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
	{
	  doacross->shift_counts[i - 1] = shift_count;
	  shift_count += bits[i - 1];
	}
      for (ent = 0; ent < num_ents; ent++)
	*(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
	      sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      gomp_ull q = counts[0] / num_ents;
      gomp_ull t = counts[0] % num_ents;
      doacross->boundary_ull = t * (q + 1);
      doacross->q_ull = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}

/* DOACROSS POST operation.
*/

void
GOMP_doacross_ull_post (gomp_ull *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  /* doacross is NULL when gomp_doacross_ull_init bailed out early;
     a plain barrier suffices.  */
  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  /* Select the array entry owned by the current chunk.  */
  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
						+ ent * doacross->elt_sz);
      /* Pack all counts into one word; +1 so a published value is
	 always nonzero.  */
      gomp_ull flattened
	= counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
	flattened |= counts[i] << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
	__atomic_thread_fence (MEMMODEL_RELEASE);
      else
	__atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      /* gomp_ull fits in an unsigned long: one atomic word per
	 dimension, updated from innermost to outermost.  */
      gomp_ull *array = (gomp_ull *) (doacross->array
				      + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
	{
	  if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
	    __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
	}
    }
  else
    {
      /* gomp_ull is two unsigned longs wide: publish each count as a
	 low half at [2*i+1] then a high half at [2*i].  The high half
	 is stored last with release semantics, matching the wait side
	 which reads the high half first.  */
      unsigned long *array = (unsigned long *) (doacross->array
						+ ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
	{
	  gomp_ull cull = counts[i] + 1UL;
	  unsigned long c = (unsigned long) cull;
	  if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
	    __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
	  c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
	  if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
	    __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
	}
    }
}

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_ull_wait (gomp_ull first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  /* See GOMP_doacross_ull_post: nothing to wait for.  */
  if (__builtin_expect (doacross == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  /* Find the entry the awaited iteration was or will be posted into.  */
  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size_ull == 0)
	{
	  /* Static no-chunk partition precomputed in
	     gomp_doacross_ull_init.  */
	  if (first < doacross->boundary_ull)
	    ent = first / (doacross->q_ull + 1);
	  else
	    ent = (first - doacross->boundary_ull) / doacross->q_ull
		  + doacross->t;
	}
      else
	ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
						+ ent * doacross->elt_sz);
      gomp_ull flattened = first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
	flattened |= va_arg (ap, gomp_ull)
		     << doacross->shift_counts[i];
      /* flattened lacks the post side's +1, so flattened < cur means
	 the awaited iteration is already posted.  */
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
	{
	  __atomic_thread_fence (MEMMODEL_RELEASE);
	  va_end (ap);
	  return;
	}
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      /* One atomic word per dimension: spin until the posted vector
	 is lexicographically greater than the awaited one.  */
      gomp_ull *array = (gomp_ull *) (doacross->array
				      + ent * doacross->elt_sz);
      do
	{
	  va_start (ap, first);
	  for (i = 0; i < doacross->ncounts; i++)
	    {
	      gomp_ull thisv
		= (i ? va_arg (ap, gomp_ull) : first) + 1;
	      gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
	      if (thisv < cur)
		{
		  /* Strictly greater in an outer dimension: done.  */
		  i = doacross->ncounts;
		  break;
		}
	      if (thisv > cur)
		break;
	    }
	  va_end (ap);
	  if (i == doacross->ncounts)
	    break;
	  cpu_relax ();
	}
      while (1);
    }
  else
    {
      /* Split representation: compare the high half at [2*i] first,
	 then the low half at [2*i+1], mirroring the store order used
	 in GOMP_doacross_ull_post.  */
      unsigned long *array = (unsigned long *) (doacross->array
						+ ent * doacross->elt_sz);
      do
	{
	  va_start (ap, first);
	  for (i = 0; i < doacross->ncounts; i++)
	    {
	      gomp_ull thisv
		= (i ? va_arg (ap, gomp_ull) : first) + 1;
	      unsigned long t
		= thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
	      unsigned long cur
		= __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
	      if (t < cur)
		{
		  i = doacross->ncounts;
		  break;
		}
	      if (t > cur)
		break;
	      t = thisv;
	      cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
	      if (t < cur)
		{
		  i = doacross->ncounts;
		  break;
		}
	      if (t > cur)
		break;
	    }
	  va_end (ap);
	  if (i == doacross->ncounts)
	    break;
	  cpu_relax ();
	}
      while (1);
    }
  __sync_synchronize ();
}