xref: /freebsd/sys/contrib/ck/src/ck_epoch.c (revision b00ab754)
/*
 * Copyright 2011-2015 Samy Al Bahra.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * The implementation here is inspired from the work described in:
 *   Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University
 *   of Cambridge Computing Laboratory.
 */

#include <ck_backoff.h>
#include <ck_cc.h>
#include <ck_epoch.h>
#include <ck_pr.h>
#include <ck_stack.h>
#include <ck_stdbool.h>
#include <ck_string.h>

/*
 * Only three distinct values are used for reclamation, but reclamation occurs
 * at e+2 rather than e+1. Any thread in a "critical section" would have
 * acquired some snapshot (e) of the global epoch value (e_g) and set an active
 * flag. Any hazardous references will only occur after a full memory barrier.
 * For example, assume an initial e_g value of 1, an e value of 0 and an
 * active value of 0.
 *
 * ck_epoch_begin(...)
 *   e = e_g
 *   active = 1
 *   memory_barrier();
 *
 * Any serialized reads may observe e = 0 or e = 1 with active = 0, or e = 0 or
 * e = 1 with active = 1. The e_g value can only go from 1 to 2 if every thread
 * has already observed the value of "1" (or the value we are incrementing
 * from). This guarantees us that for any given value e_g, any threads within
 * critical sections (referred to as "active" threads from here on) would have
 * an e value of e_g-1 or e_g. This also means that hazardous references may be
 * shared in both e_g-1 and e_g even if they are logically deleted in e_g.
 *
 * For example, assume all threads have an e value of e_g. Another thread may
 * increment e_g to e_g+1. Older threads may have a reference to an object
 * which is only deleted in e_g+1. It could be that reader threads are
 * executing some hash table look-ups, while some other writer thread (which
 * causes an epoch counter tick) actually deletes the same items that reader
 * threads are looking up (this writer thread having an e value of e_g+1).
 * This is possible if the writer thread re-observes the epoch after the
 * counter tick.
 *
 * Pseudo-code for writer:
 *   ck_epoch_begin()
 *   ht_delete(x)
 *   ck_epoch_end()
 *   ck_epoch_begin()
 *   ht_delete(x)
 *   ck_epoch_end()
 *
 * Pseudo-code for reader:
 *   for (;;) {
 *      x = ht_lookup(x)
 *      ck_pr_inc(&x->value);
 *   }
 *
 * Of course, it is also possible for references logically deleted at e_g-1 to
 * still be accessed at e_g as threads are "active" at the same time
 * (real-world time) mutating shared objects.
 *
 * Now, if the epoch counter is ticked to e_g+1, then no new hazardous
 * references could exist to objects logically deleted at e_g-1. The reason for
 * this is that at e_g+1, all epoch read-side critical sections started at
 * e_g-1 must have been completed. If any epoch read-side critical sections at
 * e_g-1 were still active, then we would never increment to e_g+1 (active != 0
 * ^ e != e_g).  Additionally, e_g may still have hazardous references to
 * objects logically deleted at e_g-1 which means objects logically deleted at
 * e_g-1 cannot be deleted at e_g+1 unless all threads have observed e_g+1
 * (since it is valid for active threads to be at e_g and threads at e_g still
 * require safe memory accesses).
 *
 * However, at e_g+2, all active threads must be either at e_g+1 or e_g+2.
 * Though e_g+2 may share hazardous references with e_g+1, and e_g+1 shares
 * hazardous references with e_g, no active threads are at e_g or e_g-1. This
 * means no hazardous references could exist to objects deleted at e_g-1 (at
 * e_g+2).
 *
 * To summarize these important points,
 *   1) Active threads will always have a value of e_g or e_g-1.
 *   2) Items that are logically deleted at e_g or e_g-1 cannot be physically
 *      deleted.
 *   3) Objects logically deleted at e_g-1 can be physically destroyed at e_g+2
 *      or at e_g+1 if no threads are at e_g.
 *
 * Last but not least, if we are at e_g+2, then no active thread is at e_g
 * which means it is safe to apply modulo-3 arithmetic to the e_g value in
 * order to re-use e_g to represent the e_g+3 state. This means it is
 * sufficient to represent e_g using only the values 0, 1 or 2. Every time a
 * thread re-visits an e_g (which can be determined with a non-empty deferral
 * list) it can assume objects in the e_g deferral list have survived at least
 * three e_g transitions and are thus safe for physical deletion.
 *
 * Blocking semantics for epoch reclamation have additional restrictions.
 * Though we only require three deferral lists, reasonable blocking semantics
 * must be able to more gracefully handle bursty write work-loads which could
 * easily cause e_g wrap-around if modulo-3 arithmetic is used, leading to
 * easy-to-trigger live-lock situations. The work-around to this is to not
 * apply modulo arithmetic to e_g but only to deferral list indexing.
 */
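/*
 * Purely illustrative sketch (not compiled as part of this file) of how the
 * description above maps onto the public interface; ht_lookup(), consume()
 * and the per-thread "record" are hypothetical names. ck_epoch_begin() and
 * ck_epoch_end() are what publish the (e, active) pair discussed above, and
 * a NULL section is permitted when bucket tracking is not needed:
 *
 *   static void
 *   reader(ck_epoch_record_t *record, const void *key)
 *   {
 *
 *           ck_epoch_begin(record, NULL);
 *           consume(ht_lookup(key));
 *           ck_epoch_end(record, NULL);
 *           return;
 *   }
 *
 * Writers logically delete objects inside their own sections and rely on the
 * grace-period machinery below (ck_epoch_synchronize, ck_epoch_barrier or
 * ck_epoch_poll) before physically freeing them.
 */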
#define CK_EPOCH_GRACE 3U

enum {
	CK_EPOCH_STATE_USED = 0,
	CK_EPOCH_STATE_FREE = 1
};

CK_STACK_CONTAINER(struct ck_epoch_record, record_next,
    ck_epoch_record_container)
CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
    ck_epoch_entry_container)

#define CK_EPOCH_SENSE_MASK	(CK_EPOCH_SENSE - 1)

bool
_ck_epoch_delref(struct ck_epoch_record *record,
    struct ck_epoch_section *section)
{
	struct ck_epoch_ref *current, *other;
	unsigned int i = section->bucket;

	current = &record->local.bucket[i];
	current->count--;

	if (current->count > 0)
		return false;

	/*
	 * If the current bucket no longer has any references, then
	 * determine whether we have already transitioned into a newer
	 * epoch. If so, then make sure to update our shared snapshot
	 * to allow for forward progress.
	 *
	 * If no other active bucket exists, then the record will go
	 * inactive in order to allow for forward progress.
	 */
	other = &record->local.bucket[(i + 1) & CK_EPOCH_SENSE_MASK];
	if (other->count > 0 &&
	    ((int)(current->epoch - other->epoch) < 0)) {
		/*
		 * The other epoch value is actually the newest,
		 * transition to it.
		 */
		ck_pr_store_uint(&record->epoch, other->epoch);
	}

	return true;
}

void
_ck_epoch_addref(struct ck_epoch_record *record,
    struct ck_epoch_section *section)
{
	struct ck_epoch *global = record->global;
	struct ck_epoch_ref *ref;
	unsigned int epoch, i;

	epoch = ck_pr_load_uint(&global->epoch);
	i = epoch & CK_EPOCH_SENSE_MASK;
	ref = &record->local.bucket[i];

	if (ref->count++ == 0) {
#ifndef CK_MD_TSO
		struct ck_epoch_ref *previous;

		/*
		 * The system has already ticked. If another non-zero bucket
		 * exists, make sure to order our observations with respect
		 * to it. Otherwise, it is possible to acquire a reference
		 * from the previous epoch generation.
		 *
		 * On TSO architectures, the monotonicity of the global
		 * counter and load-{store, load} ordering are sufficient to
		 * guarantee this ordering.
		 */
		previous = &record->local.bucket[(i + 1) &
		    CK_EPOCH_SENSE_MASK];
		if (previous->count > 0)
			ck_pr_fence_acqrel();
#endif /* !CK_MD_TSO */

		/*
		 * If this is a new reference into the current
		 * bucket then cache the associated epoch value.
		 */
		ref->epoch = epoch;
	}

	section->bucket = i;
	return;
}
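
/*
 * Illustrative sketch of what the sense buckets above support (not compiled
 * as part of this file; "record", s1 and s2 are hypothetical). Overlapping
 * sections on one record may straddle a global epoch tick; each
 * ck_epoch_section_t remembers which bucket it took a reference on, so
 * _ck_epoch_delref() releases the matching bucket and, once the older bucket
 * drains while a newer one is still referenced, publishes the newer epoch
 * snapshot in record->epoch:
 *
 *   ck_epoch_section_t s1, s2;
 *
 *   ck_epoch_begin(record, &s1);    (references the bucket for epoch e)
 *   ...                             (the global epoch ticks to e + 1)
 *   ck_epoch_begin(record, &s2);    (references the bucket for epoch e + 1)
 *   ck_epoch_end(record, &s1);      (the bucket for e drains; record->epoch
 *                                    advances to e + 1)
 *   ck_epoch_end(record, &s2);
 */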

void
ck_epoch_init(struct ck_epoch *global)
{

	ck_stack_init(&global->records);
	global->epoch = 1;
	global->n_free = 0;
	ck_pr_fence_store();
	return;
}

struct ck_epoch_record *
ck_epoch_recycle(struct ck_epoch *global, void *ct)
{
	struct ck_epoch_record *record;
	ck_stack_entry_t *cursor;
	unsigned int state;

	if (ck_pr_load_uint(&global->n_free) == 0)
		return NULL;

	CK_STACK_FOREACH(&global->records, cursor) {
		record = ck_epoch_record_container(cursor);

		if (ck_pr_load_uint(&record->state) == CK_EPOCH_STATE_FREE) {
			/* Serialize with respect to deferral list clean-up. */
			ck_pr_fence_load();
			state = ck_pr_fas_uint(&record->state,
			    CK_EPOCH_STATE_USED);
			if (state == CK_EPOCH_STATE_FREE) {
				ck_pr_dec_uint(&global->n_free);
				ck_pr_store_ptr(&record->ct, ct);

				/*
				 * The context pointer is ordered by a
				 * subsequent protected section.
				 */
				return record;
			}
		}
	}

	return NULL;
}
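
/*
 * Typical usage sketch (illustrative only; "global_epoch" and the allocation
 * strategy are the caller's choice): attempt to recycle a previously
 * unregistered record before allocating and registering a fresh one.
 *
 *   ck_epoch_record_t *record;
 *
 *   record = ck_epoch_recycle(&global_epoch, NULL);
 *   if (record == NULL) {
 *           record = malloc(sizeof *record);
 *           if (record == NULL)
 *                   abort();
 *
 *           ck_epoch_register(&global_epoch, record, NULL);
 *   }
 */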

void
ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record,
    void *ct)
{
	size_t i;

	record->global = global;
	record->state = CK_EPOCH_STATE_USED;
	record->active = 0;
	record->epoch = 0;
	record->n_dispatch = 0;
	record->n_peak = 0;
	record->n_pending = 0;
	record->ct = ct;
	memset(&record->local, 0, sizeof record->local);

	for (i = 0; i < CK_EPOCH_LENGTH; i++)
		ck_stack_init(&record->pending[i]);

	ck_pr_fence_store();
	ck_stack_push_upmc(&global->records, &record->record_next);
	return;
}

void
ck_epoch_unregister(struct ck_epoch_record *record)
{
	struct ck_epoch *global = record->global;
	size_t i;

	record->active = 0;
	record->epoch = 0;
	record->n_dispatch = 0;
	record->n_peak = 0;
	record->n_pending = 0;
	memset(&record->local, 0, sizeof record->local);

	for (i = 0; i < CK_EPOCH_LENGTH; i++)
		ck_stack_init(&record->pending[i]);

	ck_pr_store_ptr(&record->ct, NULL);
	ck_pr_fence_store();
	ck_pr_store_uint(&record->state, CK_EPOCH_STATE_FREE);
	ck_pr_inc_uint(&global->n_free);
	return;
}

static struct ck_epoch_record *
ck_epoch_scan(struct ck_epoch *global,
    struct ck_epoch_record *cr,
    unsigned int epoch,
    bool *af)
{
	ck_stack_entry_t *cursor;

	if (cr == NULL) {
		cursor = CK_STACK_FIRST(&global->records);
		*af = false;
	} else {
		cursor = &cr->record_next;
		*af = true;
	}

	while (cursor != NULL) {
		unsigned int state, active;

		cr = ck_epoch_record_container(cursor);

		state = ck_pr_load_uint(&cr->state);
		if (state & CK_EPOCH_STATE_FREE) {
			cursor = CK_STACK_NEXT(cursor);
			continue;
		}

		active = ck_pr_load_uint(&cr->active);
		*af |= active;

		if (active != 0 && ck_pr_load_uint(&cr->epoch) != epoch)
			return cr;

		cursor = CK_STACK_NEXT(cursor);
	}

	return NULL;
}

static void
ck_epoch_dispatch(struct ck_epoch_record *record, unsigned int e, ck_stack_t *deferred)
{
	unsigned int epoch = e & (CK_EPOCH_LENGTH - 1);
	ck_stack_entry_t *head, *next, *cursor;
	unsigned int n_pending, n_peak;
	unsigned int i = 0;

	head = ck_stack_batch_pop_upmc(&record->pending[epoch]);
	for (cursor = head; cursor != NULL; cursor = next) {
		struct ck_epoch_entry *entry =
		    ck_epoch_entry_container(cursor);

		next = CK_STACK_NEXT(cursor);
		if (deferred != NULL)
			ck_stack_push_spnc(deferred, &entry->stack_entry);
		else
			entry->function(entry);
		i++;
	}

	n_peak = ck_pr_load_uint(&record->n_peak);
	n_pending = ck_pr_load_uint(&record->n_pending);

	/* We don't require accuracy around peak calculation. */
	if (n_pending > n_peak)
		ck_pr_store_uint(&record->n_peak, n_pending);

	if (i > 0) {
		ck_pr_add_uint(&record->n_dispatch, i);
		ck_pr_sub_uint(&record->n_pending, i);
	}

	return;
}

/*
 * Reclaim all objects associated with a record.
 */
void
ck_epoch_reclaim(struct ck_epoch_record *record)
{
	unsigned int epoch;

	for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++)
		ck_epoch_dispatch(record, epoch, NULL);

	return;
}

CK_CC_FORCE_INLINE static void
epoch_block(struct ck_epoch *global, struct ck_epoch_record *cr,
    ck_epoch_wait_cb_t *cb, void *ct)
{

	if (cb != NULL)
		cb(global, cr, ct);

	return;
}

/*
 * This function must not be called within a read section.
 */
void
ck_epoch_synchronize_wait(struct ck_epoch *global,
    ck_epoch_wait_cb_t *cb, void *ct)
{
	struct ck_epoch_record *cr;
	unsigned int delta, epoch, goal, i;
	bool active;

	ck_pr_fence_memory();

	/*
	 * The observation of the global epoch must be ordered with respect to
	 * all prior operations. The re-ordering of loads is permitted given
	 * monotonicity of the global epoch counter.
	 *
	 * If UINT_MAX concurrent mutations were to occur then it is possible
	 * to encounter an ABA-issue. If this is a concern, consider tuning
	 * write-side concurrency.
	 */
	delta = epoch = ck_pr_load_uint(&global->epoch);
	goal = epoch + CK_EPOCH_GRACE;

	for (i = 0, cr = NULL; i < CK_EPOCH_GRACE - 1; cr = NULL, i++) {
		bool r;

		/*
		 * Determine whether all threads have observed the current
		 * epoch with respect to the updates on invocation.
		 */
		while (cr = ck_epoch_scan(global, cr, delta, &active),
		    cr != NULL) {
			unsigned int e_d;

			ck_pr_stall();

			/*
			 * Another writer may have already observed a grace
			 * period.
			 */
			e_d = ck_pr_load_uint(&global->epoch);
			if (e_d == delta) {
				epoch_block(global, cr, cb, ct);
				continue;
			}

			/*
			 * If the epoch has been updated, we may have already
			 * met our goal.
			 */
			delta = e_d;
			if ((goal > epoch) & (delta >= goal))
				goto leave;

			epoch_block(global, cr, cb, ct);

			/*
			 * If the epoch has been updated, then a grace period
			 * requires that all threads are observed idle at the
			 * same epoch.
			 */
			cr = NULL;
		}

		/*
		 * If we have observed all threads as inactive, then we assume
		 * we are at a grace period.
		 */
		if (active == false)
			break;

		/*
		 * Increment current epoch. CAS semantics are used to eliminate
		 * increment operations for synchronization that occurs for the
		 * same global epoch value snapshot.
		 *
		 * If we can guarantee there will only be one active barrier or
		 * epoch tick at a given time, then it is sufficient to use an
		 * increment operation. In a multi-barrier workload, however,
		 * it is possible to overflow the epoch value if we apply
		 * modulo-3 arithmetic.
		 */
		r = ck_pr_cas_uint_value(&global->epoch, delta, delta + 1,
		    &delta);

		/* Order subsequent thread active checks. */
		ck_pr_fence_atomic_load();

		/*
		 * If CAS has succeeded, then set delta to latest snapshot.
		 * Otherwise, we have just acquired latest snapshot.
		 */
		delta = delta + r;
	}

	/*
	 * A majority of use-cases will not require full barrier semantics.
	 * However, if non-temporal instructions are used, full barrier
	 * semantics are necessary.
	 */
leave:
	ck_pr_fence_memory();
	return;
}

void
ck_epoch_synchronize(struct ck_epoch_record *record)
{

	ck_epoch_synchronize_wait(record->global, NULL, NULL);
	return;
}

void
ck_epoch_barrier(struct ck_epoch_record *record)
{

	ck_epoch_synchronize(record);
	ck_epoch_reclaim(record);
	return;
}
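
/*
 * Write-side usage sketch (illustrative; container_remove(), "container",
 * "key" and "object" are hypothetical). ck_epoch_synchronize() only waits for
 * a grace period, so the caller performs the physical deletion itself;
 * ck_epoch_barrier() additionally dispatches any callbacks pending on the
 * record.
 *
 *   object = container_remove(container, key);
 *   if (object != NULL) {
 *           ck_epoch_synchronize(record);
 *           free(object);
 *   }
 */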

void
ck_epoch_barrier_wait(struct ck_epoch_record *record, ck_epoch_wait_cb_t *cb,
    void *ct)
{

	ck_epoch_synchronize_wait(record->global, cb, ct);
	ck_epoch_reclaim(record);
	return;
}
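
/*
 * The wait callback is invoked every time the grace-period loop stalls on a
 * record that has not yet observed the target epoch. A sketch of a callback
 * that simply yields the processor (illustrative; sched_yield() from
 * <sched.h> is an assumption about the host environment):
 *
 *   static void
 *   wait_cb(ck_epoch_t *global, ck_epoch_record_t *blocked, void *arg)
 *   {
 *
 *           (void)global;
 *           (void)blocked;
 *           (void)arg;
 *           sched_yield();
 *           return;
 *   }
 *
 *   ck_epoch_barrier_wait(record, wait_cb, NULL);
 */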

/*
 * It may be worth applying these deferral semantics to the epoch that was
 * observed at ck_epoch_call time. The problem is that doing so would require
 * a full fence.
 *
 * ck_epoch_call will dispatch to the latest epoch snapshot that was observed.
 * There are cases where it will fail to reclaim as early as it could. If this
 * becomes a problem, we could actually use a heap for epoch buckets, but that
 * is far from ideal too.
 */
bool
ck_epoch_poll_deferred(struct ck_epoch_record *record, ck_stack_t *deferred)
{
	bool active;
	unsigned int epoch;
	struct ck_epoch_record *cr = NULL;
	struct ck_epoch *global = record->global;

	epoch = ck_pr_load_uint(&global->epoch);

	/* Serialize epoch snapshots with respect to global epoch. */
	ck_pr_fence_memory();
	cr = ck_epoch_scan(global, cr, epoch, &active);
	if (cr != NULL) {
		record->epoch = epoch;
		return false;
	}

	/* We are at a grace period if all threads are inactive. */
	if (active == false) {
		record->epoch = epoch;
		for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++)
			ck_epoch_dispatch(record, epoch, deferred);

		return true;
	}

	/* If an active thread exists, rely on epoch observation. */
	(void)ck_pr_cas_uint(&global->epoch, epoch, epoch + 1);

	ck_epoch_dispatch(record, epoch + 1, deferred);
	return true;
}

bool
ck_epoch_poll(struct ck_epoch_record *record)
{

	return ck_epoch_poll_deferred(record, NULL);
}
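
/*
 * Non-blocking reclamation sketch (illustrative; "struct node", its embedded
 * epoch_entry member, node_destructor() and the pointer n are hypothetical).
 * ck_epoch_call() defers destruction onto the pending list of the record's
 * most recently observed epoch, and ck_epoch_poll() dispatches whatever has
 * since become safe without ever blocking the caller:
 *
 *   struct node {
 *           void *value;
 *           ck_epoch_entry_t epoch_entry;
 *   };
 *
 *   CK_EPOCH_CONTAINER(struct node, epoch_entry, node_container)
 *
 *   static void
 *   node_destructor(ck_epoch_entry_t *e)
 *   {
 *
 *           free(node_container(e));
 *           return;
 *   }
 *
 *   ...
 *   ck_epoch_call(record, &n->epoch_entry, node_destructor);
 *   (void)ck_epoch_poll(record);
 */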