/* Copyright (C) 2005-2019 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file handles the ORDERED construct.  */

#include "libgomp.h"
#include <stdarg.h>
#include <string.h>
#include "doacross.h"

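/* The ORDERED functions below manage a queue of threads waiting to enter
   their ordered sections.  ws->ordered_team_ids is a circular buffer of
   team ids: ws->ordered_cur indexes the thread currently allowed in, and
   ws->ordered_num_used is the number of queued entries, including the
   current one.  Each thread blocks on its own semaphore,
   team->ordered_release[team_id], and is released by its predecessor in
   the queue.  ws->ordered_owner is the team id of the thread that has
   passed gomp_ordered_sync for the current iteration block, or -1 when
   no thread owns the section.  GOMP_ordered_start and GOMP_ordered_end
   are the entry points called around the body of a #pragma omp ordered
   region.  */
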
/* This function is called when first allocating an iteration block.  That
   is, the thread is not currently on the queue.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_first (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  index = ws->ordered_cur + ws->ordered_num_used;
  if (index >= team->nthreads)
    index -= team->nthreads;
  ws->ordered_team_ids[index] = thr->ts.team_id;

  /* If this is the first and only thread in the queue, then there is
     no one to release us when we get to our ordered section.  Post to
     our own release queue now so that we won't block later.  */
  if (ws->ordered_num_used++ == 0)
    gomp_sem_post (team->ordered_release[thr->ts.team_id]);
}

/* This function is called when completing the last iteration block.  That
   is, there are no more iterations to perform and so the thread should be
   removed from the queue entirely.  Because of the way ORDERED blocks are
   managed, it follows that we currently own access to the ORDERED block,
   and should now pass it on to the next thread.  The work-share lock must
   be held on entry.  */

void
gomp_ordered_last (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If we're not the last thread in the queue, then wake the next.  */
  if (--ws->ordered_num_used > 0)
    {
      unsigned next = ws->ordered_cur + 1;
      if (next == team->nthreads)
        next = 0;
      ws->ordered_cur = next;

      next_id = ws->ordered_team_ids[next];
      gomp_sem_post (team->ordered_release[next_id]);
    }
}


/* This function is called when allocating a subsequent allocation block.
   That is, we're done with the current iteration block and we're allocating
   another.  This is the logical combination of a call to gomp_ordered_last
   followed by a call to gomp_ordered_first.  The work-share lock must be
   held on entry.  */

void
gomp_ordered_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned index, next_id;

  /* Work share constructs can be orphaned.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* We're no longer the owner.  */
  ws->ordered_owner = -1;

  /* If there's only one thread in the queue, that must be us.  */
  if (ws->ordered_num_used == 1)
    {
      /* As in gomp_ordered_first, we need to post to our own release
         semaphore.  */
      gomp_sem_post (team->ordered_release[thr->ts.team_id]);
      return;
    }

  /* If the queue is entirely full, then we move ourselves to the end of
     the queue merely by incrementing ordered_cur.  Only if it's not
     full do we have to write our id.  */
  if (ws->ordered_num_used < team->nthreads)
    {
      index = ws->ordered_cur + ws->ordered_num_used;
      if (index >= team->nthreads)
        index -= team->nthreads;
      ws->ordered_team_ids[index] = thr->ts.team_id;
    }

  index = ws->ordered_cur + 1;
  if (index == team->nthreads)
    index = 0;
  ws->ordered_cur = index;

  next_id = ws->ordered_team_ids[index];
  gomp_sem_post (team->ordered_release[next_id]);
}
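
/* Example of the wrap-around arithmetic above: with team->nthreads == 4,
   ordered_cur == 3 and ordered_num_used == 2, the queue occupies slots 3
   and 0 of ordered_team_ids; a thread joining the queue writes its id at
   index (3 + 2) - 4 == 1, and advancing the queue wraps ordered_cur from
   3 back to 0.  */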


/* This function is called when a statically scheduled loop is first
   being created.  */

void
gomp_ordered_static_init (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  if (team == NULL || team->nthreads == 1)
    return;

  gomp_sem_post (team->ordered_release[0]);
}

/* This function is called when a statically scheduled loop is moving to
   the next allocation block.  Static schedules are not first come first
   served like the others, so we're to move to the numerically next thread,
   not the next thread on a list.  The work-share lock should *not* be held
   on entry.  */

void
gomp_ordered_static_next (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned id = thr->ts.team_id;

  if (team == NULL || team->nthreads == 1)
    return;

  ws->ordered_owner = -1;

  /* This thread currently owns the lock.  Increment the owner.  */
  if (++id == team->nthreads)
    id = 0;
  ws->ordered_team_ids[0] = id;
  gomp_sem_post (team->ordered_release[id]);
}

/* This function is called when we need to assert that the thread owns the
   ordered section.  Due to the problem of posted-but-not-waited semaphores,
   this needs to happen before completing a loop iteration.  */

void
gomp_ordered_sync (void)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;

  /* Work share constructs can be orphaned.  But this clearly means that
     we are the only thread, and so we automatically own the section.  */
  if (team == NULL || team->nthreads == 1)
    return;

  /* ??? I believe it to be safe to access this data without taking the
     ws->lock.  The only presumed race condition is with the previous
     thread on the queue incrementing ordered_cur such that it points
     to us, concurrently with our check below.  But our team_id is
     already present in the queue, and the other thread will always
     post to our release semaphore.  So the two cases are that we will
     either win the race and momentarily block on the semaphore, or lose
     the race and find the semaphore already unlocked and so not block.
     Either way we get correct results.
     However, there is an implicit flush on entry to an ordered region,
     so we do need to have a barrier here.  If we were taking a lock
     this could be MEMMODEL_RELEASE since the acquire would be covered
     by the lock.  */

  __atomic_thread_fence (MEMMODEL_ACQ_REL);
  if (ws->ordered_owner != thr->ts.team_id)
    {
      gomp_sem_wait (team->ordered_release[thr->ts.team_id]);
      ws->ordered_owner = thr->ts.team_id;
    }
}

/* This function is called by user code when encountering the start of an
   ORDERED block.  We must check to see if the current thread is at the
   head of the queue, and if not, block.  */

#ifdef HAVE_ATTRIBUTE_ALIAS
extern void GOMP_ordered_start (void)
        __attribute__((alias ("gomp_ordered_sync")));
#else
void
GOMP_ordered_start (void)
{
  gomp_ordered_sync ();
}
#endif

/* This function is called by user code when encountering the end of an
   ORDERED block.  With the current ORDERED implementation there's nothing
   for us to do.

   However, the current implementation has a flaw in that it does not allow
   the next thread into the ORDERED section immediately after the current
   thread exits the ORDERED section in its last iteration.  The existence
   of this function allows the implementation to change.  */

void
GOMP_ordered_end (void)
{
}

/* DOACROSS initialization.  */

#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)

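/* The doacross data below supports cross-iteration dependences, i.e. loops
   with an ordered(N) clause whose body contains #pragma omp ordered
   depend(sink: ...) (lowered by the compiler to GOMP_doacross_wait) and
   #pragma omp ordered depend(source) (lowered to GOMP_doacross_post).

   Each slot of doacross->array records the progress of one thread or one
   chunk, depending on the schedule (see the num_ents computation below).
   When the collapsed iteration counts together fit into a single unsigned
   long, the per-dimension counters are packed ("flattened") into one word:
   dimension I is shifted left by shift_counts[I], with the last dimension
   in the least significant bits, and the stored value is the flattened
   index of the last posted iteration plus one (so 0 means nothing has
   been posted yet).

   A worked example, with values chosen purely for illustration: for
   ordered(2) with counts = { 1000, 8 }, dimension 0 needs 10 bits and
   dimension 1 needs 3 bits, so shift_counts = { 3, 0 } and iteration
   (i0, i1) is encoded as (i0 << 3) | i1, incremented by one before being
   stored.  */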
void
gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size,
                    size_t extra)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    {
    empty:
      if (!extra)
        ws->doacross = NULL;
      else
        {
          doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
          doacross->extra = (void *) (doacross + 1);
          ws->doacross = doacross;
        }
      return;
    }

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        goto empty;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
                        - __builtin_clzl (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

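  /* One synchronization slot per thread for static schedules, per
     iteration of the outermost loop for guided, and per chunk otherwise;
     this matches how GOMP_doacross_post picks its slot.  */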
  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      elt_sz = sizeof (unsigned long) * ncounts;
      shift_sz = 0;
    }
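  /* Round each slot up to a multiple of 64 bytes, presumably so that
     concurrently updated slots do not share a cache line.  */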
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz + extra);
  doacross->chunk_size = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (extra)
    {
      doacross->extra = doacross->array + num_ents * elt_sz;
      memset (doacross->extra, '\0', extra);
    }
  else
    doacross->extra = NULL;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
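  /* For a static schedule without a chunk size, iterations of the
     outermost loop are split as evenly as possible: the first T threads
     get Q + 1 iterations each and the remaining threads get Q, so
     BOUNDARY = T * (Q + 1) is the first iteration owned by a Q-sized
     thread.  GOMP_doacross_wait uses these values to map an iteration
     number back to the thread that posts it.  */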
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      unsigned long q = counts[0] / num_ents;
      unsigned long t = counts[0] % num_ents;
      doacross->boundary = t * (q + 1);
      doacross->q = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}

/* DOACROSS POST operation.  */

void
GOMP_doacross_post (long *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

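  /* In the flattened case the slot holds the flattened index of the most
     recently posted iteration plus one; see the packing description above
     gomp_doacross_init.  */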
  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) counts[i]
                     << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  for (i = doacross->ncounts; i-- > 0; )
    {
      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
        __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
    }
}

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_wait (long first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

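  /* Map FIRST, the outermost iteration number of the waited-for iteration,
     to the array slot that the posting thread will use, mirroring the slot
     selection in GOMP_doacross_post.  */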
  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size == 0)
        {
          if (first < doacross->boundary)
            ent = first / (doacross->q + 1);
          else
            ent = (first - doacross->boundary) / doacross->q
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) va_arg (ap, long)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

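  /* Multi-word case: compare the waited-for counts against the posted
     counts dimension by dimension, most significant (outermost) first.
     Setting I to NCOUNTS before breaking marks the dependence as
     satisfied; any other early exit means we must spin and re-check.  */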
  do
    {
      va_start (ap, first);
      for (i = 0; i < doacross->ncounts; i++)
        {
          unsigned long thisv
            = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
          unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
          if (thisv < cur)
            {
              i = doacross->ncounts;
              break;
            }
          if (thisv > cur)
            break;
        }
      va_end (ap);
      if (i == doacross->ncounts)
        break;
      cpu_relax ();
    }
  while (1);
  __sync_synchronize ();
}

typedef unsigned long long gomp_ull;

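/* The _ull variants below mirror the functions above for loops whose
   iteration variables are unsigned long long.  When unsigned long is
   narrower than unsigned long long, each per-dimension count occupies
   two unsigned long slots in the array, high word at the even index and
   low word at the odd index.  */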
void
gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts,
                        gomp_ull chunk_size, size_t extra)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    {
    empty:
      if (!extra)
        ws->doacross = NULL;
      else
        {
          doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
          doacross->extra = (void *) (doacross + 1);
          ws->doacross = doacross;
        }
      return;
    }

  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        goto empty;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
                        - __builtin_clzll (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      if (sizeof (gomp_ull) == sizeof (unsigned long))
        elt_sz = sizeof (gomp_ull) * ncounts;
      else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
        elt_sz = sizeof (unsigned long) * 2 * ncounts;
      else
        abort ();
      shift_sz = 0;
    }
  elt_sz = (elt_sz + 63) & ~63UL;

  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz + extra);
  doacross->chunk_size_ull = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->boundary = 0;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (extra)
    {
      doacross->extra = doacross->array + num_ents * elt_sz;
      memset (doacross->extra, '\0', extra);
    }
  else
    doacross->extra = NULL;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      gomp_ull q = counts[0] / num_ents;
      gomp_ull t = counts[0] % num_ents;
      doacross->boundary_ull = t * (q + 1);
      doacross->q_ull = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}

/* DOACROSS POST operation.  */

void
GOMP_doacross_ull_post (gomp_ull *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                              + ent * doacross->elt_sz);
      gomp_ull flattened
        = counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= counts[i] << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
        }
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          gomp_ull cull = counts[i] + 1UL;
          unsigned long c = (unsigned long) cull;
          if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
          c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
          if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
        }
    }
}

/* DOACROSS WAIT operation.  */

void
GOMP_doacross_ull_wait (gomp_ull first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size_ull == 0)
        {
          if (first < doacross->boundary_ull)
            ent = first / (doacross->q_ull + 1);
          else
            ent = (first - doacross->boundary_ull) / doacross->q_ull
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened = first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= va_arg (ap, gomp_ull)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
              if (thisv < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (thisv > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  else
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              unsigned long t
                = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
              unsigned long cur
                = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
              t = thisv;
              cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  __sync_synchronize ();
}