/*
   Unix SMB/CIFS implementation.

   trivial database library

   Copyright (C) Volker Lendecke 2012,2013
   Copyright (C) Stefan Metzmacher 2013,2014
   Copyright (C) Michael Adam 2014

     ** NOTE! The following LGPL license applies to the tdb
     ** library. This does NOT imply that all of Samba is released
     ** under the LGPL

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 3 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "tdb_private.h"
#include "system/threads.h"

#ifdef USE_TDB_MUTEX_LOCKING

/*
 * If we run with mutexes, we store the "struct tdb_mutexes" at the
 * beginning of the file. We store an additional tdb_header right
 * beyond the mutex area, page aligned. All the offsets within the tdb
 * are relative to the area behind the mutex area. tdb->map_ptr points
 * behind the mutex area as well, so the read and write paths in the
 * mutex case can remain unchanged.
 *
 * Early in the mutex development the mutexes were placed between the hash
 * chain pointers and the real tdb data. This had two drawbacks: First, it
 * made pointer calculations more complex. Second, we had to mmap the mutex
 * area twice. One mapping was the normal map_ptr in the tdb. This frequently
 * changed from within tdb_oob. At least the Linux glibc robust mutex code
 * assumes constant pointers in memory, so a constantly changing mmap area
 * destroys the mutex list. So we had to mmap the first bytes of the file
 * with a second mmap call. With that scheme, very weird errors happened
 * that could be easily fixed by doing the mutex mmap in a second file. It
 * seemed that mapping the same memory area twice does not end up accessing
 * the same physical page; looking at the mutexes in gdb, old data showed
 * up after some re-mapping. To avoid a separate mutex file, the code now
 * puts the real content of the tdb file after the mutex area. This way we
 * do not have overlapping mmap areas; the mutex area is mmapped once and
 * not changed, and the tdb data area's mmap is constantly changed but does
 * not overlap.
 */
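
/*
 * A minimal sketch of the resulting file layout (sizes are
 * illustrative; the real values depend on hash_size, page_size and
 * sizeof(pthread_mutex_t)):
 *
 *   file offset 0                : struct tdb_mutexes (header copy,
 *                                  allrecord_mutex, one mutex for the
 *                                  freelist plus one per hash chain)
 *   ...padding up to a page boundary...
 *   file offset tdb_mutex_size() : struct tdb_header -- the "real" tdb
 *                                  starts here; tdb->map_ptr points here
 *
 * The hypothetical helper below illustrates the offset translation this
 * layout implies; it is a sketch for documentation only and not part of
 * the build:
 */
#if 0
static off_t tdb_off_to_file_off(struct tdb_context *tdb, tdb_off_t off)
{
	/* tdb offsets are relative to the end of the mutex area */
	return (off_t)off + (off_t)tdb_mutex_size(tdb);
}
#endif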

struct tdb_mutexes {
	struct tdb_header hdr;

	/* protect allrecord_lock */
	pthread_mutex_t allrecord_mutex;

	/*
	 * F_UNLCK: free,
	 * F_RDLCK: shared,
	 * F_WRLCK: exclusive
	 */
	short int allrecord_lock;

	/*
	 * Index 0 is the freelist mutex, followed by
	 * one mutex per hashchain.
	 */
	pthread_mutex_t hashchains[1];
};

bool tdb_have_mutexes(struct tdb_context *tdb)
{
	return ((tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) != 0);
}

size_t tdb_mutex_size(struct tdb_context *tdb)
{
	size_t mutex_size;

	if (!tdb_have_mutexes(tdb)) {
		return 0;
	}

	mutex_size = sizeof(struct tdb_mutexes);
	mutex_size += tdb->hash_size * sizeof(pthread_mutex_t);

	return TDB_ALIGN(mutex_size, tdb->page_size);
}
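
/*
 * Worked example (illustrative numbers only): with hash_size == 10000,
 * sizeof(pthread_mutex_t) == 40 and page_size == 4096, this yields
 * sizeof(struct tdb_mutexes) + 10000 * 40 bytes, roughly 400 kB, which
 * TDB_ALIGN() then rounds up to the next 4096-byte boundary.
 */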

/*
 * Get the index for a chain mutex
 */
static bool tdb_mutex_index(struct tdb_context *tdb, off_t off, off_t len,
			    unsigned *idx)
{
	/*
	 * Weird but true: We fcntl lock 1 byte at an offset 4 bytes before
	 * the 4 bytes of the freelist start and the hash chain that is about
	 * to be locked. See lock_offset() where the freelist is -1 vs the
	 * "+1" in TDB_HASH_TOP(). Because the mutex array is represented in
	 * the tdb file itself as data, we need to adjust the offset here.
	 */
	const off_t freelist_lock_ofs = FREELIST_TOP - sizeof(tdb_off_t);

	if (!tdb_have_mutexes(tdb)) {
		return false;
	}
	if (len != 1) {
		/* Possibly the allrecord lock */
		return false;
	}
	if (off < freelist_lock_ofs) {
		/* One of the special locks */
		return false;
	}
	if (tdb->hash_size == 0) {
		/* tdb not initialized yet, called from tdb_open_ex() */
		return false;
	}
	if (off >= TDB_DATA_START(tdb->hash_size)) {
		/* Single record lock from traverses */
		return false;
	}

	/*
	 * Now we know it's a freelist or hash chain lock. Those are always
	 * 4-byte aligned. Paranoia check.
	 */
	if ((off % sizeof(tdb_off_t)) != 0) {
		abort();
	}

	/*
	 * Re-index the fcntl offset into an offset into the mutex array
	 */
	off -= freelist_lock_ofs; /* rebase to index 0 */
	off /= sizeof(tdb_off_t); /* 0 for freelist, 1..n for hash chains */

	*idx = off;
	return true;
}
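
/*
 * Illustrative mapping (tdb_off_t is 4 bytes wide): an fcntl lock of
 * length 1 at offset FREELIST_TOP - 4 yields idx == 0 (the freelist
 * mutex), and a lock at FREELIST_TOP + 4 * list, i.e. the lock_offset()
 * of hash chain "list", yields idx == list + 1.
 */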

static bool tdb_have_mutex_chainlocks(struct tdb_context *tdb)
{
	size_t i;

	for (i=0; i < tdb->num_lockrecs; i++) {
		bool ret;
		unsigned idx;

		ret = tdb_mutex_index(tdb,
				      tdb->lockrecs[i].off,
				      tdb->lockrecs[i].count,
				      &idx);
		if (!ret) {
			continue;
		}

		if (idx == 0) {
			/* this is the freelist mutex */
			continue;
		}

		return true;
	}

	return false;
}

static int chain_mutex_lock(pthread_mutex_t *m, bool waitflag)
{
	int ret;

	if (waitflag) {
		ret = pthread_mutex_lock(m);
	} else {
		ret = pthread_mutex_trylock(m);
	}
	if (ret != EOWNERDEAD) {
		return ret;
	}

	/*
	 * For chainlocks, we don't do any cleanup (yet?)
	 */
	return pthread_mutex_consistent(m);
}

static int allrecord_mutex_lock(struct tdb_mutexes *m, bool waitflag)
{
	int ret;

	if (waitflag) {
		ret = pthread_mutex_lock(&m->allrecord_mutex);
	} else {
		ret = pthread_mutex_trylock(&m->allrecord_mutex);
	}
	if (ret != EOWNERDEAD) {
		return ret;
	}

	/*
	 * The allrecord lock holder died. We need to reset the allrecord_lock
	 * to F_UNLCK. This should also be the indication for
	 * tdb_needs_recovery.
	 */
	m->allrecord_lock = F_UNLCK;

	return pthread_mutex_consistent(&m->allrecord_mutex);
}
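
/*
 * Both helpers above implement the standard robust-mutex recovery
 * protocol: when the previous owner died while holding the mutex,
 * pthread_mutex_lock() returns EOWNERDEAD with the mutex acquired, and
 * pthread_mutex_consistent() must be called before the mutex can be
 * used normally again. A minimal standalone sketch of that protocol
 * (documentation only, not part of the build):
 */
#if 0
static int robust_lock_sketch(pthread_mutex_t *m)
{
	int ret = pthread_mutex_lock(m);

	if (ret == EOWNERDEAD) {
		/* We hold the lock, but the protected state may be stale. */
		/* ... repair the protected state here ... */
		ret = pthread_mutex_consistent(m);
	}
	return ret;
}
#endif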

bool tdb_mutex_lock(struct tdb_context *tdb, int rw, off_t off, off_t len,
		    bool waitflag, int *pret)
{
	struct tdb_mutexes *m = tdb->mutexes;
	pthread_mutex_t *chain;
	int ret;
	unsigned idx;
	bool allrecord_ok;

	if (!tdb_mutex_index(tdb, off, len, &idx)) {
		return false;
	}
	chain = &m->hashchains[idx];

again:
	ret = chain_mutex_lock(chain, waitflag);
	if (ret == EBUSY) {
		ret = EAGAIN;
	}
	if (ret != 0) {
		errno = ret;
		goto fail;
	}

	if (idx == 0) {
		/*
		 * This is a freelist lock, which is independent of
		 * the allrecord lock. So we're done once we've got the
		 * freelist mutex.
		 */
		*pret = 0;
		return true;
	}

	if (tdb_have_mutex_chainlocks(tdb)) {
		/*
		 * We can only check the allrecord lock once. If we do it with
		 * one chain mutex locked, we will deadlock with the allrecord
		 * locker process in the following way: We lock the first hash
		 * chain, we check for the allrecord lock. We keep the hash
		 * chain locked. Then the allrecord locker locks the
		 * allrecord_mutex. It walks the list of chain mutexes,
		 * locking them all in sequence. Meanwhile, we have the chain
		 * mutex locked, so the allrecord locker blocks trying to lock
		 * our chain mutex. Then we come in and try to lock the second
		 * chain lock, which in most cases will be the freelist. We
		 * see that the allrecord lock is locked and put ourselves on
		 * the allrecord_mutex. This will never be signalled though
		 * because the allrecord locker waits for us to give up the
		 * chain lock.
		 */

		*pret = 0;
		return true;
	}

	/*
	 * Check if someone has the allrecord lock: queue if so.
	 */

	allrecord_ok = false;

	if (m->allrecord_lock == F_UNLCK) {
		/*
		 * allrecord lock not taken
		 */
		allrecord_ok = true;
	}

	if ((m->allrecord_lock == F_RDLCK) && (rw == F_RDLCK)) {
		/*
		 * allrecord shared lock taken, but we only want to read
		 */
		allrecord_ok = true;
	}

	if (allrecord_ok) {
		*pret = 0;
		return true;
	}

	ret = pthread_mutex_unlock(chain);
	if (ret != 0) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
			 "(chain_mutex) failed: %s\n", strerror(ret)));
		errno = ret;
		goto fail;
	}
	ret = allrecord_mutex_lock(m, waitflag);
	if (ret == EBUSY) {
		ret = EAGAIN;
	}
	if (ret != 0) {
		if (waitflag || (ret != EAGAIN)) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_%slock"
				 "(allrecord_mutex) failed: %s\n",
				 waitflag ? "" : "try_",  strerror(ret)));
		}
		errno = ret;
		goto fail;
	}
	ret = pthread_mutex_unlock(&m->allrecord_mutex);
	if (ret != 0) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
			 "(allrecord_mutex) failed: %s\n", strerror(ret)));
		errno = ret;
		goto fail;
	}
	goto again;

fail:
	*pret = -1;
	return true;
}

bool tdb_mutex_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len,
		      int *pret)
{
	struct tdb_mutexes *m = tdb->mutexes;
	pthread_mutex_t *chain;
	int ret;
	unsigned idx;

	if (!tdb_mutex_index(tdb, off, len, &idx)) {
		return false;
	}
	chain = &m->hashchains[idx];

	ret = pthread_mutex_unlock(chain);
	if (ret == 0) {
		*pret = 0;
		return true;
	}
	errno = ret;
	*pret = -1;
	return true;
}
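
/*
 * A sketch of how the byte-range locking code is expected to consult
 * the two wrappers above before falling back to fcntl. The names
 * brlock_sketch() and fcntl_lock_sketch() are hypothetical; the real
 * caller lives in the fcntl locking code and may differ in detail:
 */
#if 0
static int brlock_sketch(struct tdb_context *tdb, int rw_type,
			 off_t offset, off_t len, bool waitflag)
{
	int ret;

	if (tdb_mutex_lock(tdb, rw_type, offset, len, waitflag, &ret)) {
		/* The range maps to a mutex; ret carries the result. */
		return ret;
	}
	/* Not covered by a mutex: take a plain fcntl lock instead. */
	return fcntl_lock_sketch(tdb, rw_type, offset, len, waitflag);
}
#endif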

int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype,
			     enum tdb_lock_flags flags)
{
	struct tdb_mutexes *m = tdb->mutexes;
	int ret;
	uint32_t i;
	bool waitflag = (flags & TDB_LOCK_WAIT);
	int saved_errno;

	if (tdb->flags & TDB_NOLOCK) {
		return 0;
	}

	if (flags & TDB_LOCK_MARK_ONLY) {
		return 0;
	}

	ret = allrecord_mutex_lock(m, waitflag);
	if (!waitflag && (ret == EBUSY)) {
		errno = EAGAIN;
		tdb->ecode = TDB_ERR_LOCK;
		return -1;
	}
	if (ret != 0) {
		if (!(flags & TDB_LOCK_PROBE)) {
			TDB_LOG((tdb, TDB_DEBUG_TRACE,
				 "allrecord_mutex_lock() failed: %s\n",
				 strerror(ret)));
		}
		tdb->ecode = TDB_ERR_LOCK;
		return -1;
	}

	if (m->allrecord_lock != F_UNLCK) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
			 (int)m->allrecord_lock));
		goto fail_unlock_allrecord_mutex;
	}
	m->allrecord_lock = (ltype == F_RDLCK) ? F_RDLCK : F_WRLCK;

	/*
	 * With allrecord_lock set, new lockers queue on allrecord_mutex
	 * (see tdb_mutex_lock()). Sweeping every chain with a lock/unlock
	 * pair waits out the holders that got in before we set it.
	 */
	for (i=0; i<tdb->hash_size; i++) {

		/* ignore hashchains[0], the freelist */
		pthread_mutex_t *chain = &m->hashchains[i+1];

		ret = chain_mutex_lock(chain, waitflag);
		if (!waitflag && (ret == EBUSY)) {
			errno = EAGAIN;
			goto fail_unroll_allrecord_lock;
		}
		if (ret != 0) {
			if (!(flags & TDB_LOCK_PROBE)) {
				TDB_LOG((tdb, TDB_DEBUG_TRACE,
					 "chain_mutex_lock() failed: %s\n",
					 strerror(ret)));
			}
			errno = ret;
			goto fail_unroll_allrecord_lock;
		}

		ret = pthread_mutex_unlock(chain);
		if (ret != 0) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
				 "(chainlock) failed: %s\n", strerror(ret)));
			errno = ret;
			goto fail_unroll_allrecord_lock;
		}
	}
	/*
	 * We leave this routine with m->allrecord_mutex locked
	 */
	return 0;

fail_unroll_allrecord_lock:
	m->allrecord_lock = F_UNLCK;

fail_unlock_allrecord_mutex:
	saved_errno = errno;
	ret = pthread_mutex_unlock(&m->allrecord_mutex);
	if (ret != 0) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
			 "(allrecord_mutex) failed: %s\n", strerror(ret)));
	}
	errno = saved_errno;
	tdb->ecode = TDB_ERR_LOCK;
	return -1;
}

int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb)
{
	struct tdb_mutexes *m = tdb->mutexes;
	int ret;
	uint32_t i;

	if (tdb->flags & TDB_NOLOCK) {
		return 0;
	}

	/*
	 * Our only caller, tdb_allrecord_upgrade(),
	 * guarantees that we already own the allrecord lock.
	 *
	 * Which means m->allrecord_mutex is still locked by us.
	 */

	if (m->allrecord_lock != F_RDLCK) {
		tdb->ecode = TDB_ERR_LOCK;
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
			 (int)m->allrecord_lock));
		return -1;
	}

	m->allrecord_lock = F_WRLCK;

	for (i=0; i<tdb->hash_size; i++) {

		/* ignore hashchains[0], the freelist */
		pthread_mutex_t *chain = &m->hashchains[i+1];

		ret = chain_mutex_lock(chain, true);
		if (ret != 0) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_lock"
				 "(chainlock) failed: %s\n", strerror(ret)));
			goto fail_unroll_allrecord_lock;
		}

		ret = pthread_mutex_unlock(chain);
		if (ret != 0) {
			TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
				 "(chainlock) failed: %s\n", strerror(ret)));
			goto fail_unroll_allrecord_lock;
		}
	}

	return 0;

fail_unroll_allrecord_lock:
	m->allrecord_lock = F_RDLCK;
	tdb->ecode = TDB_ERR_LOCK;
	return -1;
}

void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb)
{
	struct tdb_mutexes *m = tdb->mutexes;

	/*
	 * Our only caller, tdb_allrecord_upgrade() (in its error path),
	 * guarantees that we already own the allrecord lock.
	 *
	 * Which means m->allrecord_mutex is still locked by us.
	 */

	if (m->allrecord_lock != F_WRLCK) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
			 (int)m->allrecord_lock));
		return;
	}

	m->allrecord_lock = F_RDLCK;
	return;
}


int tdb_mutex_allrecord_unlock(struct tdb_context *tdb)
{
	struct tdb_mutexes *m = tdb->mutexes;
	short old;
	int ret;

	if (tdb->flags & TDB_NOLOCK) {
		return 0;
	}

	/*
	 * Our only callers, tdb_allrecord_unlock() and
	 * tdb_allrecord_lock() (in its error path),
	 * guarantee that we already own the allrecord lock.
	 *
	 * Which means m->allrecord_mutex is still locked by us.
	 */

	if ((m->allrecord_lock != F_RDLCK) && (m->allrecord_lock != F_WRLCK)) {
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
			 (int)m->allrecord_lock));
		return -1;
	}

	old = m->allrecord_lock;
	m->allrecord_lock = F_UNLCK;

	ret = pthread_mutex_unlock(&m->allrecord_mutex);
	if (ret != 0) {
		m->allrecord_lock = old;
		TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
			 "(allrecord_mutex) failed: %s\n", strerror(ret)));
		return -1;
	}
	return 0;
}

int tdb_mutex_init(struct tdb_context *tdb)
{
	struct tdb_mutexes *m;
	pthread_mutexattr_t ma;
	int i, ret;

	ret = tdb_mutex_mmap(tdb);
	if (ret == -1) {
		return -1;
	}
	m = tdb->mutexes;

	ret = pthread_mutexattr_init(&ma);
	if (ret != 0) {
		goto fail_munmap;
	}
	ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
	if (ret != 0) {
		goto fail;
	}
	ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
	if (ret != 0) {
		goto fail;
	}
	ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
	if (ret != 0) {
		goto fail;
	}

	for (i=0; i<tdb->hash_size+1; i++) {
		pthread_mutex_t *chain = &m->hashchains[i];

		ret = pthread_mutex_init(chain, &ma);
		if (ret != 0) {
			goto fail;
		}
	}

	m->allrecord_lock = F_UNLCK;

	ret = pthread_mutex_init(&m->allrecord_mutex, &ma);
	if (ret != 0) {
		goto fail;
	}
	ret = 0;
fail:
	pthread_mutexattr_destroy(&ma);
fail_munmap:

	if (ret == 0) {
		return 0;
	}

	tdb_mutex_munmap(tdb);

	errno = ret;
	return -1;
}

int tdb_mutex_mmap(struct tdb_context *tdb)
{
	size_t len;
	void *ptr;

	len = tdb_mutex_size(tdb);
	if (len == 0) {
		return 0;
	}

	if (tdb->mutexes != NULL) {
		return 0;
	}

	ptr = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FILE,
		   tdb->fd, 0);
	if (ptr == MAP_FAILED) {
		return -1;
	}
	tdb->mutexes = (struct tdb_mutexes *)ptr;

	return 0;
}

int tdb_mutex_munmap(struct tdb_context *tdb)
{
	size_t len;
	int ret;

	len = tdb_mutex_size(tdb);
	if (len == 0) {
		return 0;
	}

	ret = munmap(tdb->mutexes, len);
	if (ret == -1) {
		return -1;
	}
	tdb->mutexes = NULL;

	return 0;
}

static bool tdb_mutex_locking_cached;

static bool tdb_mutex_locking_supported(void)
{
	pthread_mutexattr_t ma;
	pthread_mutex_t m;
	int ret;
	static bool initialized;

	if (initialized) {
		return tdb_mutex_locking_cached;
	}

	initialized = true;

	ret = pthread_mutexattr_init(&ma);
	if (ret != 0) {
		return false;
	}
	ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
	if (ret != 0) {
		goto cleanup_ma;
	}
	ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
	if (ret != 0) {
		goto cleanup_ma;
	}
	ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
	if (ret != 0) {
		goto cleanup_ma;
	}
	ret = pthread_mutex_init(&m, &ma);
	if (ret != 0) {
		goto cleanup_ma;
	}
	ret = pthread_mutex_lock(&m);
	if (ret != 0) {
		goto cleanup_m;
	}
	/*
	 * This makes sure we have real mutexes from a threading library
	 * instead of just stubs from libc: with PTHREAD_MUTEX_ERRORCHECK,
	 * a second lock attempt by the same thread must fail with EDEADLK.
	 */
	ret = pthread_mutex_lock(&m);
	if (ret != EDEADLK) {
		goto cleanup_lock;
	}
	ret = pthread_mutex_unlock(&m);
	if (ret != 0) {
		goto cleanup_m;
	}

	tdb_mutex_locking_cached = true;
	goto cleanup_m;

cleanup_lock:
	pthread_mutex_unlock(&m);
cleanup_m:
	pthread_mutex_destroy(&m);
cleanup_ma:
	pthread_mutexattr_destroy(&ma);
	return tdb_mutex_locking_cached;
}

static void (*tdb_robust_mutext_old_handler)(int) = SIG_ERR;
static pid_t tdb_robust_mutex_pid = -1;

static bool tdb_robust_mutex_setup_sigchild(void (*handler)(int),
			void (**p_old_handler)(int))
{
#ifdef HAVE_SIGACTION
	struct sigaction act;
	struct sigaction oldact;

	memset(&act, '\0', sizeof(act));

	act.sa_handler = handler;
#ifdef SA_RESTART
	act.sa_flags = SA_RESTART;
#endif
	sigemptyset(&act.sa_mask);
	sigaddset(&act.sa_mask, SIGCHLD);
	sigaction(SIGCHLD, &act, &oldact);
	if (p_old_handler) {
		*p_old_handler = oldact.sa_handler;
	}
	return true;
#else /* !HAVE_SIGACTION */
	return false;
#endif
}

static void tdb_robust_mutex_handler(int sig)
{
	pid_t child_pid = tdb_robust_mutex_pid;

	if (child_pid != -1) {
		pid_t pid;

		pid = waitpid(child_pid, NULL, WNOHANG);
		if (pid == -1) {
			switch (errno) {
			case ECHILD:
				tdb_robust_mutex_pid = -1;
				return;

			default:
				return;
			}
		}
		if (pid == child_pid) {
			tdb_robust_mutex_pid = -1;
			return;
		}
	}

	if (tdb_robust_mutext_old_handler == SIG_DFL) {
		return;
	}
	if (tdb_robust_mutext_old_handler == SIG_IGN) {
		return;
	}
	if (tdb_robust_mutext_old_handler == SIG_ERR) {
		return;
	}

	tdb_robust_mutext_old_handler(sig);
}

static void tdb_robust_mutex_wait_for_child(pid_t *child_pid)
{
	int options = WNOHANG;

	if (*child_pid == -1) {
		return;
	}

	while (tdb_robust_mutex_pid > 0) {
		pid_t pid;

		/*
		 * First we try with WNOHANG, as the process might not exist
		 * anymore. Once we've sent SIGKILL we block waiting for the
		 * exit.
		 */
		pid = waitpid(*child_pid, NULL, options);
		if (pid == -1) {
			if (errno == EINTR) {
				continue;
			} else if (errno == ECHILD) {
				break;
			} else {
				abort();
			}
		}
		if (pid == *child_pid) {
			break;
		}

		kill(*child_pid, SIGKILL);
		options = 0;
	}

	tdb_robust_mutex_pid = -1;
	*child_pid = -1;
}

_PUBLIC_ bool tdb_runtime_check_for_robust_mutexes(void)
{
	void *ptr = NULL;
	pthread_mutex_t *m = NULL;
	pthread_mutexattr_t ma;
	int ret = 1;
	int pipe_down[2] = { -1, -1 };
	int pipe_up[2] = { -1, -1 };
	ssize_t nread;
	char c = 0;
	bool ok;
	static bool initialized;
	pid_t saved_child_pid = -1;
	bool cleanup_ma = false;

	if (initialized) {
		return tdb_mutex_locking_cached;
	}

	initialized = true;

	ok = tdb_mutex_locking_supported();
	if (!ok) {
		return false;
	}

	tdb_mutex_locking_cached = false;

	ptr = mmap(NULL, sizeof(pthread_mutex_t), PROT_READ|PROT_WRITE,
		   MAP_SHARED|MAP_ANON, -1 /* fd */, 0);
	if (ptr == MAP_FAILED) {
		return false;
	}

	ret = pipe(pipe_down);
	if (ret != 0) {
		goto cleanup;
	}
	ret = pipe(pipe_up);
	if (ret != 0) {
		goto cleanup;
	}

	ret = pthread_mutexattr_init(&ma);
	if (ret != 0) {
		goto cleanup;
	}
	cleanup_ma = true;
	ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
	if (ret != 0) {
		goto cleanup;
	}
	ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
	if (ret != 0) {
		goto cleanup;
	}
	ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
	if (ret != 0) {
		goto cleanup;
	}
	ret = pthread_mutex_init(ptr, &ma);
	if (ret != 0) {
		goto cleanup;
	}
	m = (pthread_mutex_t *)ptr;

	if (tdb_robust_mutex_setup_sigchild(tdb_robust_mutex_handler,
			&tdb_robust_mutext_old_handler) == false) {
		goto cleanup;
	}

	tdb_robust_mutex_pid = fork();
	saved_child_pid = tdb_robust_mutex_pid;
	if (tdb_robust_mutex_pid == 0) {
		ssize_t nwritten;
		close(pipe_down[1]);
		close(pipe_up[0]);
		ret = pthread_mutex_lock(m);
		nwritten = write(pipe_up[1], &ret, sizeof(ret));
		if (nwritten != sizeof(ret)) {
			_exit(1);
		}
		if (ret != 0) {
			_exit(1);
		}
		nread = read(pipe_down[0], &c, 1);
		if (nread != 1) {
			_exit(1);
		}
		/* leave locked */
		_exit(0);
	}
	if (tdb_robust_mutex_pid == -1) {
		goto cleanup;
	}
	close(pipe_down[0]);
	pipe_down[0] = -1;
	close(pipe_up[1]);
	pipe_up[1] = -1;

	nread = read(pipe_up[0], &ret, sizeof(ret));
	if (nread != sizeof(ret)) {
		goto cleanup;
	}

	ret = pthread_mutex_trylock(m);
	if (ret != EBUSY) {
		if (ret == 0) {
			pthread_mutex_unlock(m);
		}
		goto cleanup;
	}

	if (write(pipe_down[1], &c, 1) != 1) {
		goto cleanup;
	}

	nread = read(pipe_up[0], &c, 1);
	if (nread != 0) {
		goto cleanup;
	}

	tdb_robust_mutex_wait_for_child(&saved_child_pid);

	ret = pthread_mutex_trylock(m);
	if (ret != EOWNERDEAD) {
		if (ret == 0) {
			pthread_mutex_unlock(m);
		}
		goto cleanup;
	}

	ret = pthread_mutex_consistent(m);
	if (ret != 0) {
		goto cleanup;
	}

	ret = pthread_mutex_trylock(m);
	if (ret != EDEADLK && ret != EBUSY) {
		pthread_mutex_unlock(m);
		goto cleanup;
	}

	ret = pthread_mutex_unlock(m);
	if (ret != 0) {
		goto cleanup;
	}

	tdb_mutex_locking_cached = true;

cleanup:
	/*
	 * Note that we don't reset the signal handler; we just reset
	 * tdb_robust_mutex_pid to -1. This is ok as this code path is only
	 * called once per process.
	 *
	 * Leaving our signal handler installed avoids races with other
	 * threads potentially setting up their own SIGCHLD handlers.
	 *
	 * The worst thing that can happen is that the other newer signal
	 * handler will get the SIGCHLD signal for our child and/or reap the
	 * child with a wait() function. tdb_robust_mutex_wait_for_child()
	 * handles the case where waitpid returns ECHILD.
	 */
	tdb_robust_mutex_wait_for_child(&saved_child_pid);

	if (m != NULL) {
		pthread_mutex_destroy(m);
	}
	if (cleanup_ma) {
		pthread_mutexattr_destroy(&ma);
	}
	if (pipe_down[0] != -1) {
		close(pipe_down[0]);
	}
	if (pipe_down[1] != -1) {
		close(pipe_down[1]);
	}
	if (pipe_up[0] != -1) {
		close(pipe_up[0]);
	}
	if (pipe_up[1] != -1) {
		close(pipe_up[1]);
	}
	if (ptr != NULL) {
		munmap(ptr, sizeof(pthread_mutex_t));
	}

	return tdb_mutex_locking_cached;
}
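
/*
 * Typical use, as a sketch: check once at startup whether robust
 * mutexes actually work and only then request TDB_MUTEX_LOCKING (which
 * requires TDB_CLEAR_IF_FIRST) when opening a tdb. The helper name and
 * open parameters are illustrative, not part of this file:
 */
#if 0
static struct tdb_context *open_with_mutexes_sketch(const char *name)
{
	int tdb_flags = TDB_CLEAR_IF_FIRST;

	if (tdb_runtime_check_for_robust_mutexes()) {
		tdb_flags |= TDB_MUTEX_LOCKING;
	}
	return tdb_open(name, 0, tdb_flags, O_RDWR | O_CREAT, 0644);
}
#endif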

#else

size_t tdb_mutex_size(struct tdb_context *tdb)
{
	return 0;
}

bool tdb_have_mutexes(struct tdb_context *tdb)
{
	return false;
}

int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype,
			     enum tdb_lock_flags flags)
{
	tdb->ecode = TDB_ERR_LOCK;
	return -1;
}

int tdb_mutex_allrecord_unlock(struct tdb_context *tdb)
{
	return -1;
}

int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb)
{
	tdb->ecode = TDB_ERR_LOCK;
	return -1;
}

void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb)
{
	return;
}

int tdb_mutex_mmap(struct tdb_context *tdb)
{
	errno = ENOSYS;
	return -1;
}

int tdb_mutex_munmap(struct tdb_context *tdb)
{
	errno = ENOSYS;
	return -1;
}

int tdb_mutex_init(struct tdb_context *tdb)
{
	errno = ENOSYS;
	return -1;
}

_PUBLIC_ bool tdb_runtime_check_for_robust_mutexes(void)
{
	return false;
}

#endif