1 /*
2 * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #ifndef SHARE_GC_SHARED_TASKQUEUE_HPP
26 #define SHARE_GC_SHARED_TASKQUEUE_HPP
27
28 #include "memory/allocation.hpp"
29 #include "memory/padded.hpp"
30 #include "oops/oopsHierarchy.hpp"
31 #include "utilities/globalDefinitions.hpp"
32 #include "utilities/ostream.hpp"
33 #include "utilities/stack.hpp"
34
35 // Simple TaskQueue stats that are collected by default in debug builds.
36
37 #if !defined(TASKQUEUE_STATS) && defined(ASSERT)
38 #define TASKQUEUE_STATS 1
39 #elif !defined(TASKQUEUE_STATS)
40 #define TASKQUEUE_STATS 0
41 #endif
42
43 #if TASKQUEUE_STATS
44 #define TASKQUEUE_STATS_ONLY(code) code
45 #else
46 #define TASKQUEUE_STATS_ONLY(code)
47 #endif // TASKQUEUE_STATS
48
49 #if TASKQUEUE_STATS
50 class TaskQueueStats {
51 public:
52 enum StatId {
53 push, // number of taskqueue pushes
54 pop, // number of taskqueue pops
55 pop_slow, // subset of taskqueue pops that were done slow-path
56 steal_attempt, // number of taskqueue steal attempts
57 steal, // number of taskqueue steals
58 overflow, // number of overflow pushes
59 overflow_max_len, // max length of overflow stack
60 last_stat_id
61 };
62
63 public:
TaskQueueStats()64 inline TaskQueueStats() { reset(); }
65
record_push()66 inline void record_push() { ++_stats[push]; }
record_pop()67 inline void record_pop() { ++_stats[pop]; }
record_pop_slow()68 inline void record_pop_slow() { record_pop(); ++_stats[pop_slow]; }
record_steal_attempt()69 inline void record_steal_attempt() { ++_stats[steal_attempt]; }
record_steal()70 inline void record_steal() { ++_stats[steal]; }
71 inline void record_overflow(size_t new_length);
72
73 TaskQueueStats & operator +=(const TaskQueueStats & addend);
74
get(StatId id) const75 inline size_t get(StatId id) const { return _stats[id]; }
get() const76 inline const size_t* get() const { return _stats; }
77
78 inline void reset();
79
80 // Print the specified line of the header (does not include a line separator).
81 static void print_header(unsigned int line, outputStream* const stream = tty,
82 unsigned int width = 10);
83 // Print the statistics (does not include a line separator).
84 void print(outputStream* const stream = tty, unsigned int width = 10) const;
85
86 DEBUG_ONLY(void verify() const;)
87
88 private:
89 size_t _stats[last_stat_id];
90 static const char * const _names[last_stat_id];
91 };
92
record_overflow(size_t new_len)93 void TaskQueueStats::record_overflow(size_t new_len) {
94 ++_stats[overflow];
95 if (new_len > _stats[overflow_max_len]) _stats[overflow_max_len] = new_len;
96 }
97
reset()98 void TaskQueueStats::reset() {
99 memset(_stats, 0, sizeof(_stats));
100 }
101 #endif // TASKQUEUE_STATS
102
103 // TaskQueueSuper collects functionality common to all GenericTaskQueue instances.
104
105 template <unsigned int N, MEMFLAGS F>
106 class TaskQueueSuper: public CHeapObj<F> {
107 protected:
108 // Internal type for indexing the queue; also used for the tag.
109 typedef NOT_LP64(uint16_t) LP64_ONLY(uint32_t) idx_t;
110
111 // The first free element after the last one pushed (mod N).
112 volatile uint _bottom;
113
114 enum { MOD_N_MASK = N - 1 };
115
116 class Age {
117 public:
Age(size_t data=0)118 Age(size_t data = 0) { _data = data; }
Age(const Age & age)119 Age(const Age& age) { _data = age._data; }
Age(idx_t top,idx_t tag)120 Age(idx_t top, idx_t tag) { _fields._top = top; _fields._tag = tag; }
121
get() const122 Age get() const volatile { return _data; }
set(Age age)123 void set(Age age) volatile { _data = age._data; }
124
top() const125 idx_t top() const volatile { return _fields._top; }
tag() const126 idx_t tag() const volatile { return _fields._tag; }
127
128 // Increment top; if it wraps, increment tag also.
increment()129 void increment() {
130 _fields._top = increment_index(_fields._top);
131 if (_fields._top == 0) ++_fields._tag;
132 }
133
134 Age cmpxchg(const Age new_age, const Age old_age) volatile;
135
operator ==(const Age & other) const136 bool operator ==(const Age& other) const { return _data == other._data; }
137
138 private:
139 struct fields {
140 idx_t _top;
141 idx_t _tag;
142 };
143 union {
144 size_t _data;
145 fields _fields;
146 };
147 };
148
149 volatile Age _age;
150
151 // These both operate mod N.
increment_index(uint ind)152 static uint increment_index(uint ind) {
153 return (ind + 1) & MOD_N_MASK;
154 }
decrement_index(uint ind)155 static uint decrement_index(uint ind) {
156 return (ind - 1) & MOD_N_MASK;
157 }
158
159 // Returns a number in the range [0..N). If the result is "N-1", it should be
160 // interpreted as 0.
dirty_size(uint bot,uint top) const161 uint dirty_size(uint bot, uint top) const {
162 return (bot - top) & MOD_N_MASK;
163 }
164
165 // Returns the size corresponding to the given "bot" and "top".
size(uint bot,uint top) const166 uint size(uint bot, uint top) const {
167 uint sz = dirty_size(bot, top);
168 // Has the queue "wrapped", so that bottom is less than top? There's a
169 // complicated special case here. A pair of threads could perform pop_local
170 // and pop_global operations concurrently, starting from a state in which
171 // _bottom == _top+1. The pop_local could succeed in decrementing _bottom,
172 // and the pop_global in incrementing _top (in which case the pop_global
173 // will be awarded the contested queue element.) The resulting state must
174 // be interpreted as an empty queue. (We only need to worry about one such
175 // event: only the queue owner performs pop_local's, and several concurrent
176 // threads attempting to perform the pop_global will all perform the same
177 // CAS, and only one can succeed.) Any stealing thread that reads after
178 // either the increment or decrement will see an empty queue, and will not
179 // join the competitors. The "sz == -1 || sz == N-1" state will not be
180 // modified by concurrent queues, so the owner thread can reset the state to
181 // _bottom == top so subsequent pushes will be performed normally.
182 return (sz == N - 1) ? 0 : sz;
183 }
184
185 public:
TaskQueueSuper()186 TaskQueueSuper() : _bottom(0), _age() {}
187
188 // Return true if the TaskQueue contains/does not contain any tasks.
peek() const189 bool peek() const { return _bottom != _age.top(); }
is_empty() const190 bool is_empty() const { return size() == 0; }
191
192 // Return an estimate of the number of elements in the queue.
193 // The "careful" version admits the possibility of pop_local/pop_global
194 // races.
size() const195 uint size() const {
196 return size(_bottom, _age.top());
197 }
198
dirty_size() const199 uint dirty_size() const {
200 return dirty_size(_bottom, _age.top());
201 }
202
set_empty()203 void set_empty() {
204 _bottom = 0;
205 _age.set(0);
206 }
207
208 // Maximum number of elements allowed in the queue. This is two less
209 // than the actual queue size, for somewhat complicated reasons.
max_elems() const210 uint max_elems() const { return N - 2; }
211
212 // Total size of queue.
total_size()213 static const uint total_size() { return N; }
214
215 TASKQUEUE_STATS_ONLY(TaskQueueStats stats;)
216 };
217
218 //
219 // GenericTaskQueue implements an ABP, Aurora-Blumofe-Plaxton, double-
220 // ended-queue (deque), intended for use in work stealing. Queue operations
221 // are non-blocking.
222 //
223 // A queue owner thread performs push() and pop_local() operations on one end
224 // of the queue, while other threads may steal work using the pop_global()
225 // method.
226 //
227 // The main difference to the original algorithm is that this
228 // implementation allows wrap-around at the end of its allocated
229 // storage, which is an array.
230 //
231 // The original paper is:
232 //
233 // Arora, N. S., Blumofe, R. D., and Plaxton, C. G.
234 // Thread scheduling for multiprogrammed multiprocessors.
235 // Theory of Computing Systems 34, 2 (2001), 115-144.
236 //
237 // The following paper provides an correctness proof and an
238 // implementation for weakly ordered memory models including (pseudo-)
239 // code containing memory barriers for a Chase-Lev deque. Chase-Lev is
240 // similar to ABP, with the main difference that it allows resizing of the
241 // underlying storage:
242 //
243 // Le, N. M., Pop, A., Cohen A., and Nardell, F. Z.
244 // Correct and efficient work-stealing for weak memory models
245 // Proceedings of the 18th ACM SIGPLAN symposium on Principles and
246 // practice of parallel programming (PPoPP 2013), 69-80
247 //
248
249 template <class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
250 class GenericTaskQueue: public TaskQueueSuper<N, F> {
251 protected:
252 typedef typename TaskQueueSuper<N, F>::Age Age;
253 typedef typename TaskQueueSuper<N, F>::idx_t idx_t;
254
255 using TaskQueueSuper<N, F>::_bottom;
256 using TaskQueueSuper<N, F>::_age;
257 using TaskQueueSuper<N, F>::increment_index;
258 using TaskQueueSuper<N, F>::decrement_index;
259 using TaskQueueSuper<N, F>::dirty_size;
260
261 public:
262 using TaskQueueSuper<N, F>::max_elems;
263 using TaskQueueSuper<N, F>::size;
264
265 #if TASKQUEUE_STATS
266 using TaskQueueSuper<N, F>::stats;
267 #endif
268
269 private:
270 // Slow paths for push, pop_local. (pop_global has no fast path.)
271 bool push_slow(E t, uint dirty_n_elems);
272 bool pop_local_slow(uint localBot, Age oldAge);
273
274 public:
275 typedef E element_type;
276
277 // Initializes the queue to empty.
278 GenericTaskQueue();
279
280 void initialize();
281
282 // Push the task "t" on the queue. Returns "false" iff the queue is full.
283 inline bool push(E t);
284
285 // Attempts to claim a task from the "local" end of the queue (the most
286 // recently pushed) as long as the number of entries exceeds the threshold.
287 // If successful, returns true and sets t to the task; otherwise, returns false
288 // (the queue is empty or the number of elements below the threshold).
289 inline bool pop_local(volatile E& t, uint threshold = 0);
290
291 // Like pop_local(), but uses the "global" end of the queue (the least
292 // recently pushed).
293 bool pop_global(volatile E& t);
294
295 // Delete any resource associated with the queue.
296 ~GenericTaskQueue();
297
298 // Apply fn to each element in the task queue. The queue must not
299 // be modified while iterating.
300 template<typename Fn> void iterate(Fn fn);
301
302 private:
303 DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, 0);
304 // Element array.
305 volatile E* _elems;
306
307 DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(E*));
308 // Queue owner local variables. Not to be accessed by other threads.
309
310 static const uint InvalidQueueId = uint(-1);
311 uint _last_stolen_queue_id; // The id of the queue we last stole from
312
313 int _seed; // Current random seed used for selecting a random queue during stealing.
314
315 DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(uint) + sizeof(int));
316 public:
317 int next_random_queue_id();
318
set_last_stolen_queue_id(uint id)319 void set_last_stolen_queue_id(uint id) { _last_stolen_queue_id = id; }
last_stolen_queue_id() const320 uint last_stolen_queue_id() const { return _last_stolen_queue_id; }
is_last_stolen_queue_id_valid() const321 bool is_last_stolen_queue_id_valid() const { return _last_stolen_queue_id != InvalidQueueId; }
invalidate_last_stolen_queue_id()322 void invalidate_last_stolen_queue_id() { _last_stolen_queue_id = InvalidQueueId; }
323 };
324
325 template<class E, MEMFLAGS F, unsigned int N>
GenericTaskQueue()326 GenericTaskQueue<E, F, N>::GenericTaskQueue() : _last_stolen_queue_id(InvalidQueueId), _seed(17 /* random number */) {
327 assert(sizeof(Age) == sizeof(size_t), "Depends on this.");
328 }
329
330 // OverflowTaskQueue is a TaskQueue that also includes an overflow stack for
331 // elements that do not fit in the TaskQueue.
332 //
333 // This class hides two methods from super classes:
334 //
335 // push() - push onto the task queue or, if that fails, onto the overflow stack
336 // is_empty() - return true if both the TaskQueue and overflow stack are empty
337 //
338 // Note that size() is not hidden--it returns the number of elements in the
339 // TaskQueue, and does not include the size of the overflow stack. This
340 // simplifies replacement of GenericTaskQueues with OverflowTaskQueues.
341 template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
342 class OverflowTaskQueue: public GenericTaskQueue<E, F, N>
343 {
344 public:
345 typedef Stack<E, F> overflow_t;
346 typedef GenericTaskQueue<E, F, N> taskqueue_t;
347
348 TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)
349
350 // Push task t onto the queue or onto the overflow stack. Return true.
351 inline bool push(E t);
352 // Try to push task t onto the queue only. Returns true if successful, false otherwise.
353 inline bool try_push_to_taskqueue(E t);
354
355 // Attempt to pop from the overflow stack; return true if anything was popped.
356 inline bool pop_overflow(E& t);
357
overflow_stack()358 inline overflow_t* overflow_stack() { return &_overflow_stack; }
359
taskqueue_empty() const360 inline bool taskqueue_empty() const { return taskqueue_t::is_empty(); }
overflow_empty() const361 inline bool overflow_empty() const { return _overflow_stack.is_empty(); }
is_empty() const362 inline bool is_empty() const {
363 return taskqueue_empty() && overflow_empty();
364 }
365
366 private:
367 overflow_t _overflow_stack;
368 };
369
370 class TaskQueueSetSuper {
371 public:
372 // Returns "true" if some TaskQueue in the set contains a task.
373 virtual bool peek() = 0;
374 // Tasks in queue
375 virtual uint tasks() const = 0;
376 };
377
378 template <MEMFLAGS F> class TaskQueueSetSuperImpl: public CHeapObj<F>, public TaskQueueSetSuper {
379 };
380
381 template<class T, MEMFLAGS F>
382 class GenericTaskQueueSet: public TaskQueueSetSuperImpl<F> {
383 public:
384 typedef typename T::element_type E;
385
386 private:
387 uint _n;
388 T** _queues;
389
390 bool steal_best_of_2(uint queue_num, E& t);
391
392 public:
393 GenericTaskQueueSet(uint n);
394 ~GenericTaskQueueSet();
395
396 void register_queue(uint i, T* q);
397
398 T* queue(uint n);
399
400 // Try to steal a task from some other queue than queue_num. It may perform several attempts at doing so.
401 // Returns if stealing succeeds, and sets "t" to the stolen task.
402 bool steal(uint queue_num, E& t);
403
404 bool peek();
405 uint tasks() const;
406
size() const407 uint size() const { return _n; }
408 };
409
410 template<class T, MEMFLAGS F> void
register_queue(uint i,T * q)411 GenericTaskQueueSet<T, F>::register_queue(uint i, T* q) {
412 assert(i < _n, "index out of range.");
413 _queues[i] = q;
414 }
415
416 template<class T, MEMFLAGS F> T*
417 GenericTaskQueueSet<T, F>::queue(uint i) {
418 return _queues[i];
419 }
420
421 template<class T, MEMFLAGS F>
peek()422 bool GenericTaskQueueSet<T, F>::peek() {
423 // Try all the queues.
424 for (uint j = 0; j < _n; j++) {
425 if (_queues[j]->peek())
426 return true;
427 }
428 return false;
429 }
430
431 template<class T, MEMFLAGS F>
tasks() const432 uint GenericTaskQueueSet<T, F>::tasks() const {
433 uint n = 0;
434 for (uint j = 0; j < _n; j++) {
435 n += _queues[j]->size();
436 }
437 return n;
438 }
439
440 // When to terminate from the termination protocol.
441 class TerminatorTerminator: public CHeapObj<mtInternal> {
442 public:
443 virtual bool should_exit_termination() = 0;
444 };
445
446 // A class to aid in the termination of a set of parallel tasks using
447 // TaskQueueSet's for work stealing.
448
449 #undef TRACESPINNING
450
451 class ParallelTaskTerminator: public CHeapObj<mtGC> {
452 protected:
453 uint _n_threads;
454 TaskQueueSetSuper* _queue_set;
455
456 DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, 0);
457 volatile uint _offered_termination;
458 DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile uint));
459
460 #ifdef TRACESPINNING
461 static uint _total_yields;
462 static uint _total_spins;
463 static uint _total_peeks;
464 #endif
465
466 bool peek_in_queue_set();
467 protected:
468 virtual void yield();
469 void sleep(uint millis);
470
471 // Called when exiting termination is requested.
472 // When the request is made, terminator may have already terminated
473 // (e.g. all threads are arrived and offered termination). In this case,
474 // it should ignore the request and complete the termination.
475 // Return true if termination is completed. Otherwise, return false.
476 bool complete_or_exit_termination();
477 public:
478
479 // "n_threads" is the number of threads to be terminated. "queue_set" is a
480 // queue sets of work queues of other threads.
481 ParallelTaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set);
482 virtual ~ParallelTaskTerminator();
483
484 // The current thread has no work, and is ready to terminate if everyone
485 // else is. If returns "true", all threads are terminated. If returns
486 // "false", available work has been observed in one of the task queues,
487 // so the global task is not complete.
offer_termination()488 bool offer_termination() {
489 return offer_termination(NULL);
490 }
491
492 // As above, but it also terminates if the should_exit_termination()
493 // method of the terminator parameter returns true. If terminator is
494 // NULL, then it is ignored.
495 virtual bool offer_termination(TerminatorTerminator* terminator);
496
497 // Reset the terminator, so that it may be reused again.
498 // The caller is responsible for ensuring that this is done
499 // in an MT-safe manner, once the previous round of use of
500 // the terminator is finished.
501 void reset_for_reuse();
502 // Same as above but the number of parallel threads is set to the
503 // given number.
504 void reset_for_reuse(uint n_threads);
505
506 #ifdef TRACESPINNING
total_yields()507 static uint total_yields() { return _total_yields; }
total_spins()508 static uint total_spins() { return _total_spins; }
total_peeks()509 static uint total_peeks() { return _total_peeks; }
510 static void print_termination_counts();
511 #endif
512 };
513
514 class TaskTerminator : public StackObj {
515 private:
516 ParallelTaskTerminator* _terminator;
517
518 NONCOPYABLE(TaskTerminator);
519
520 public:
521 TaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set);
522 ~TaskTerminator();
523
terminator() const524 ParallelTaskTerminator* terminator() const {
525 return _terminator;
526 }
527 };
528
529 typedef GenericTaskQueue<oop, mtGC> OopTaskQueue;
530 typedef GenericTaskQueueSet<OopTaskQueue, mtGC> OopTaskQueueSet;
531
532 #ifdef _MSC_VER
533 #pragma warning(push)
534 // warning C4522: multiple assignment operators specified
535 #pragma warning(disable:4522)
536 #endif
537
538 // This is a container class for either an oop* or a narrowOop*.
539 // Both are pushed onto a task queue and the consumer will test is_narrow()
540 // to determine which should be processed.
541 class StarTask {
542 void* _holder; // either union oop* or narrowOop*
543
544 enum { COMPRESSED_OOP_MASK = 1 };
545
546 public:
StarTask(narrowOop * p)547 StarTask(narrowOop* p) {
548 assert(((uintptr_t)p & COMPRESSED_OOP_MASK) == 0, "Information loss!");
549 _holder = (void *)((uintptr_t)p | COMPRESSED_OOP_MASK);
550 }
StarTask(oop * p)551 StarTask(oop* p) {
552 assert(((uintptr_t)p & COMPRESSED_OOP_MASK) == 0, "Information loss!");
553 _holder = (void*)p;
554 }
StarTask()555 StarTask() { _holder = NULL; }
operator oop*()556 operator oop*() { return (oop*)_holder; }
operator narrowOop*()557 operator narrowOop*() {
558 return (narrowOop*)((uintptr_t)_holder & ~COMPRESSED_OOP_MASK);
559 }
560
operator =(const StarTask & t)561 StarTask& operator=(const StarTask& t) {
562 _holder = t._holder;
563 return *this;
564 }
operator =(const volatile StarTask & t)565 volatile StarTask& operator=(const volatile StarTask& t) volatile {
566 _holder = t._holder;
567 return *this;
568 }
569
is_narrow() const570 bool is_narrow() const {
571 return (((uintptr_t)_holder & COMPRESSED_OOP_MASK) != 0);
572 }
573 };
574
575 class ObjArrayTask
576 {
577 public:
ObjArrayTask(oop o=NULL,int idx=0)578 ObjArrayTask(oop o = NULL, int idx = 0): _obj(o), _index(idx) { }
ObjArrayTask(oop o,size_t idx)579 ObjArrayTask(oop o, size_t idx): _obj(o), _index(int(idx)) {
580 assert(idx <= size_t(max_jint), "too big");
581 }
ObjArrayTask(const ObjArrayTask & t)582 ObjArrayTask(const ObjArrayTask& t): _obj(t._obj), _index(t._index) { }
583
operator =(const ObjArrayTask & t)584 ObjArrayTask& operator =(const ObjArrayTask& t) {
585 _obj = t._obj;
586 _index = t._index;
587 return *this;
588 }
589 volatile ObjArrayTask&
operator =(const volatile ObjArrayTask & t)590 operator =(const volatile ObjArrayTask& t) volatile {
591 (void)const_cast<oop&>(_obj = t._obj);
592 _index = t._index;
593 return *this;
594 }
595
obj() const596 inline oop obj() const { return _obj; }
index() const597 inline int index() const { return _index; }
598
599 DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.
600
601 private:
602 oop _obj;
603 int _index;
604 };
605
606 #ifdef _MSC_VER
607 #pragma warning(pop)
608 #endif
609
610 typedef OverflowTaskQueue<StarTask, mtGC> OopStarTaskQueue;
611 typedef GenericTaskQueueSet<OopStarTaskQueue, mtGC> OopStarTaskQueueSet;
612
613 typedef OverflowTaskQueue<size_t, mtGC> RegionTaskQueue;
614 typedef GenericTaskQueueSet<RegionTaskQueue, mtGC> RegionTaskQueueSet;
615
616 #endif // SHARE_GC_SHARED_TASKQUEUE_HPP
617