/*	$NetBSD: vfs_lockf.c,v 1.73 2011/01/31 08:25:32 dholland Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_lockf.c	8.4 (Berkeley) 10/26/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_lockf.c,v 1.73 2011/01/31 08:25:32 dholland Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/pool.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/atomic.h>
#include <sys/kauth.h>
#include <sys/uidinfo.h>

/*
 * The lockf structure is a kernel structure which contains the information
 * associated with a byte range lock.  The lockf structures are linked into
 * the vnode structure.  Locks are sorted by the starting byte of the lock for
 * efficiency.
 *
 * lf_next is used for two purposes, depending on whether the lock is
 * being held, or is in conflict with an existing lock.  If this lock
 * is held, it indicates the next lock on the same vnode.
 * For pending locks, if lock->lf_next is non-NULL, then lock->lf_block
 * must be queued on the lf_blkhd TAILQ of lock->lf_next.
 */

TAILQ_HEAD(locklist, lockf);

struct lockf {
	kcondvar_t lf_cv;	 /* Signalling */
	short	lf_flags;	 /* Lock semantics: F_POSIX, F_FLOCK, F_WAIT */
	short	lf_type;	 /* Lock type: F_RDLCK, F_WRLCK */
	off_t	lf_start;	 /* The byte # of the start of the lock */
	off_t	lf_end;		 /* The byte # of the end of the lock (-1=EOF)*/
	void	*lf_id;		 /* process or file description holding lock */
	struct	lockf **lf_head; /* Back pointer to the head of lockf list */
	struct	lockf *lf_next;	 /* Next lock on this vnode, or blocking lock */
	struct  locklist lf_blkhd; /* List of requests blocked on this lock */
	TAILQ_ENTRY(lockf) lf_block;/* A request waiting for a lock */
	uid_t	lf_uid;		 /* User ID responsible */
};

/* Maximum length of sleep chains to traverse to try and detect deadlock. */
#define MAXDEPTH 50

static pool_cache_t lockf_cache;
static kmutex_t *lockf_lock;
static char lockstr[] = "lockf";

/*
 * This variable controls the maximum number of processes that will
 * be checked in doing deadlock detection.
 */
int maxlockdepth = MAXDEPTH;

#ifdef LOCKF_DEBUG
int	lockf_debug = 0;
#endif

#define SELF	0x1
#define OTHERS	0x2

/*
 * XXX TODO
 * Misc cleanups: "void *id" should be visible in the API as a
 * "struct proc *".
 * (This requires rototilling all VFS's which support advisory locking).
 */

/*
 * If there's a lot of lock contention on a single vnode, locking
 * schemes which allow for more parallelism would be needed.  Given how
 * infrequently byte-range locks are actually used in typical BSD
 * code, a more complex approach probably isn't worth it.
 */

/*
 * We enforce a limit on locks by uid, so that a single user cannot
 * run the kernel out of memory.  For now, the limit is pretty coarse.
 * There is no limit on root.
 *
 * Splitting a lock will always succeed, regardless of current allocations.
 * If you're slightly above the limit, we still have to permit an allocation
 * so that the unlock can succeed.  If the unlocking causes too many splits,
 * however, you're totally cut off.
 */
int maxlocksperuid = 1024;

#ifdef LOCKF_DEBUG
/*
 * Print out a lock.
 */
static void
lf_print(const char *tag, struct lockf *lock)
{

	printf("%s: lock %p for ", tag, lock);
	if (lock->lf_flags & F_POSIX)
		printf("proc %d", ((struct proc *)lock->lf_id)->p_pid);
	else
		printf("file %p", (struct file *)lock->lf_id);
	printf(" %s, start %jd, end %jd",
		lock->lf_type == F_RDLCK ? "shared" :
		lock->lf_type == F_WRLCK ? "exclusive" :
		lock->lf_type == F_UNLCK ? "unlock" :
		"unknown", (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
	if (TAILQ_FIRST(&lock->lf_blkhd))
		printf(" block %p\n", TAILQ_FIRST(&lock->lf_blkhd));
	else
		printf("\n");
}

static void
lf_printlist(const char *tag, struct lockf *lock)
{
	struct lockf *lf, *blk;

	printf("%s: Lock list:\n", tag);
	for (lf = *lock->lf_head; lf; lf = lf->lf_next) {
		printf("\tlock %p for ", lf);
		if (lf->lf_flags & F_POSIX)
			printf("proc %d", ((struct proc *)lf->lf_id)->p_pid);
		else
			printf("file %p", (struct file *)lf->lf_id);
		printf(", %s, start %jd, end %jd",
			lf->lf_type == F_RDLCK ? "shared" :
			lf->lf_type == F_WRLCK ? "exclusive" :
			lf->lf_type == F_UNLCK ? "unlock" :
			"unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
		TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
			if (blk->lf_flags & F_POSIX)
				printf("; proc %d",
				    ((struct proc *)blk->lf_id)->p_pid);
			else
				printf("; file %p", (struct file *)blk->lf_id);
			printf(", %s, start %jd, end %jd",
				blk->lf_type == F_RDLCK ? "shared" :
				blk->lf_type == F_WRLCK ? "exclusive" :
				blk->lf_type == F_UNLCK ? "unlock" :
				"unknown", (intmax_t)blk->lf_start, (intmax_t)blk->lf_end);
			if (TAILQ_FIRST(&blk->lf_blkhd))
				 panic("lf_printlist: bad list");
		}
		printf("\n");
	}
}
#endif /* LOCKF_DEBUG */

/*
 * 3 options for allowfail.
 * 0 - always allocate.  1 - cutoff at limit.  2 - cutoff at double limit.
 */
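/*
 * A worked example of the cutoff, assuming the default maxlocksperuid
 * of 1024: for a non-root uid, allowfail == 1 fails the allocation once
 * the per-uid count would exceed 1024, allowfail == 2 fails once it
 * would exceed 2048, and allowfail == 0 never fails.  Root (uid 0) is
 * never limited.
 */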
static struct lockf *
lf_alloc(int allowfail)
{
	struct uidinfo *uip;
	struct lockf *lock;
	u_long lcnt;
	const uid_t uid = kauth_cred_geteuid(kauth_cred_get());

	uip = uid_find(uid);
	lcnt = atomic_inc_ulong_nv(&uip->ui_lockcnt);
	if (uid && allowfail && lcnt >
	    (allowfail == 1 ? maxlocksperuid : (maxlocksperuid * 2))) {
		atomic_dec_ulong(&uip->ui_lockcnt);
		return NULL;
	}

	lock = pool_cache_get(lockf_cache, PR_WAITOK);
	lock->lf_uid = uid;
	return lock;
}

static void
lf_free(struct lockf *lock)
{
	struct uidinfo *uip;

	uip = uid_find(lock->lf_uid);
	atomic_dec_ulong(&uip->ui_lockcnt);
	pool_cache_put(lockf_cache, lock);
}

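/*
 * Pool cache constructor: initialize the per-lock condition variable
 * once, when the object is first created by the cache.
 */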
static int
lf_ctor(void *arg, void *obj, int flag)
{
	struct lockf *lock;

	lock = obj;
	cv_init(&lock->lf_cv, lockstr);

	return 0;
}

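/*
 * Pool cache destructor: tear down the condition variable before the
 * object is released back to the allocator.
 */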
static void
lf_dtor(void *arg, void *obj)
{
	struct lockf *lock;

	lock = obj;
	cv_destroy(&lock->lf_cv);
}

/*
 * Walk the list of locks for an inode to
 * find an overlapping lock (if any).
 *
 * NOTE: this returns only the FIRST overlapping lock.  There
 *	 may be more than one.
 */
static int
lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
    struct lockf ***prev, struct lockf **overlap)
{
	off_t start, end;

	*overlap = lf;
	if (lf == NULL)
		return 0;
#ifdef LOCKF_DEBUG
	if (lockf_debug & 2)
		lf_print("lf_findoverlap: looking for overlap in", lock);
#endif /* LOCKF_DEBUG */
	start = lock->lf_start;
	end = lock->lf_end;
	while (lf != NULL) {
		if (((type == SELF) && lf->lf_id != lock->lf_id) ||
		    ((type == OTHERS) && lf->lf_id == lock->lf_id)) {
			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}
#ifdef LOCKF_DEBUG
		if (lockf_debug & 2)
			lf_print("\tchecking", lf);
#endif /* LOCKF_DEBUG */
		/*
		 * OK, check for overlap
		 *
		 * Six cases:
		 *	0) no overlap
		 *	1) overlap == lock
		 *	2) overlap contains lock
		 *	3) lock contains overlap
		 *	4) overlap starts before lock
		 *	5) overlap ends after lock
		 */
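		/*
		 * Pictorially (a rough sketch, not to scale), with "lf"
		 * the existing lock being examined and "lock" the request:
		 *
		 *	case 1:  lf    |=======|
		 *	         lock  |-------|
		 *
		 *	case 2:  lf    |===========|
		 *	         lock     |-----|
		 *
		 *	case 3:  lf       |=====|
		 *	         lock  |-----------|
		 *
		 *	case 4:  lf    |=======|
		 *	         lock      |-------|
		 *
		 *	case 5:  lf        |=======|
		 *	         lock  |-------|
		 *
		 * Case 0 is no overlap at all.
		 */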
		if ((lf->lf_end != -1 && start > lf->lf_end) ||
		    (end != -1 && lf->lf_start > end)) {
			/* Case 0 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("no overlap\n");
#endif /* LOCKF_DEBUG */
			if ((type & SELF) && end != -1 && lf->lf_start > end)
				return 0;
			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}
		if ((lf->lf_start == start) && (lf->lf_end == end)) {
			/* Case 1 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap == lock\n");
#endif /* LOCKF_DEBUG */
			return 1;
		}
		if ((lf->lf_start <= start) &&
		    (end != -1) &&
		    ((lf->lf_end >= end) || (lf->lf_end == -1))) {
			/* Case 2 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap contains lock\n");
#endif /* LOCKF_DEBUG */
			return 2;
		}
		if (start <= lf->lf_start &&
		           (end == -1 ||
			   (lf->lf_end != -1 && end >= lf->lf_end))) {
			/* Case 3 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("lock contains overlap\n");
#endif /* LOCKF_DEBUG */
			return 3;
		}
		if ((lf->lf_start < start) &&
			((lf->lf_end >= start) || (lf->lf_end == -1))) {
			/* Case 4 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap starts before lock\n");
#endif /* LOCKF_DEBUG */
			return 4;
		}
		if ((lf->lf_start > start) &&
			(end != -1) &&
			((lf->lf_end > end) || (lf->lf_end == -1))) {
			/* Case 5 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap ends after lock\n");
#endif /* LOCKF_DEBUG */
			return 5;
		}
		panic("lf_findoverlap: default");
	}
	return 0;
}

/*
 * Split a lock and a contained region into
 * two or three locks as necessary.
 */
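/*
 * For example (a sketch): clearing bytes 40..49 out of a lock covering
 * bytes 0..99 shrinks the original lock to 0..39 and creates a new
 * lock (taken from *sparelock) covering 50..99.
 */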
static void
lf_split(struct lockf *lock1, struct lockf *lock2, struct lockf **sparelock)
{
	struct lockf *splitlock;

#ifdef LOCKF_DEBUG
	if (lockf_debug & 2) {
		lf_print("lf_split", lock1);
		lf_print("splitting from", lock2);
	}
#endif /* LOCKF_DEBUG */
	/*
	 * Check to see if splitting into only two pieces.
	 */
	if (lock1->lf_start == lock2->lf_start) {
		lock1->lf_start = lock2->lf_end + 1;
		lock2->lf_next = lock1;
		return;
	}
	if (lock1->lf_end == lock2->lf_end) {
		lock1->lf_end = lock2->lf_start - 1;
		lock2->lf_next = lock1->lf_next;
		lock1->lf_next = lock2;
		return;
	}
	/*
	 * Make a new lock consisting of the last part of
	 * the encompassing lock
	 */
	splitlock = *sparelock;
	*sparelock = NULL;
	cv_destroy(&splitlock->lf_cv);
	memcpy(splitlock, lock1, sizeof(*splitlock));
	cv_init(&splitlock->lf_cv, lockstr);

	splitlock->lf_start = lock2->lf_end + 1;
	TAILQ_INIT(&splitlock->lf_blkhd);
	lock1->lf_end = lock2->lf_start - 1;
	/*
	 * OK, now link it in
	 */
	splitlock->lf_next = lock1->lf_next;
	lock2->lf_next = splitlock;
	lock1->lf_next = lock2;
}

/*
 * Wake up the list of requests blocked on a lock.
 */
static void
lf_wakelock(struct lockf *listhead)
{
	struct lockf *wakelock;

	while ((wakelock = TAILQ_FIRST(&listhead->lf_blkhd))) {
		KASSERT(wakelock->lf_next == listhead);
		TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
		wakelock->lf_next = NULL;
#ifdef LOCKF_DEBUG
		if (lockf_debug & 2)
			lf_print("lf_wakelock: awakening", wakelock);
#endif
		cv_broadcast(&wakelock->lf_cv);
	}
}

/*
 * Remove a byte-range lock on an inode.
 *
 * Generally, find the lock (or an overlap to that lock)
 * and remove it (or shrink it), then wakeup anyone we can.
 */
static int
lf_clearlock(struct lockf *unlock, struct lockf **sparelock)
{
	struct lockf **head = unlock->lf_head;
	struct lockf *lf = *head;
	struct lockf *overlap, **prev;
	int ovcase;

	if (lf == NULL)
		return 0;
#ifdef LOCKF_DEBUG
	if (unlock->lf_type != F_UNLCK)
		panic("lf_clearlock: bad type");
	if (lockf_debug & 1)
		lf_print("lf_clearlock", unlock);
#endif /* LOCKF_DEBUG */
	prev = head;
	while ((ovcase = lf_findoverlap(lf, unlock, SELF,
	    &prev, &overlap)) != 0) {
		/*
		 * Wakeup the list of locks to be retried.
		 */
		lf_wakelock(overlap);

		switch (ovcase) {

		case 1: /* overlap == lock */
			*prev = overlap->lf_next;
			lf_free(overlap);
			break;

		case 2: /* overlap contains lock: split it */
			if (overlap->lf_start == unlock->lf_start) {
				overlap->lf_start = unlock->lf_end + 1;
				break;
			}
			lf_split(overlap, unlock, sparelock);
			overlap->lf_next = unlock->lf_next;
			break;

		case 3: /* lock contains overlap */
			*prev = overlap->lf_next;
			lf = overlap->lf_next;
			lf_free(overlap);
			continue;

		case 4: /* overlap starts before lock */
			overlap->lf_end = unlock->lf_start - 1;
			prev = &overlap->lf_next;
			lf = overlap->lf_next;
			continue;

		case 5: /* overlap ends after lock */
			overlap->lf_start = unlock->lf_end + 1;
			break;
		}
		break;
	}
#ifdef LOCKF_DEBUG
	if (lockf_debug & 1)
		lf_printlist("lf_clearlock", unlock);
#endif /* LOCKF_DEBUG */
	return 0;
}

/*
 * Walk the list of locks for an inode and
 * return the first blocking lock.
 */
static struct lockf *
lf_getblock(struct lockf *lock)
{
	struct lockf **prev, *overlap, *lf = *(lock->lf_head);

	prev = lock->lf_head;
	while (lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != 0) {
		/*
		 * We've found an overlap, see if it blocks us
		 */
		if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
			return overlap;
		/*
		 * Nope, point to the next one on the list and
		 * see if it blocks us
		 */
		lf = overlap->lf_next;
	}
	return NULL;
}

/*
 * Set a byte-range lock.
 */
static int
lf_setlock(struct lockf *lock, struct lockf **sparelock,
    kmutex_t *interlock)
{
	struct lockf *block;
	struct lockf **head = lock->lf_head;
	struct lockf **prev, *overlap, *ltmp;
	int ovcase, needtolink, error;

#ifdef LOCKF_DEBUG
	if (lockf_debug & 1)
		lf_print("lf_setlock", lock);
#endif /* LOCKF_DEBUG */

	/*
	 * Scan lock list for this file looking for locks that would block us.
	 */
	while ((block = lf_getblock(lock)) != NULL) {
		/*
		 * Free the structure and return if nonblocking.
		 */
		if ((lock->lf_flags & F_WAIT) == 0) {
			lf_free(lock);
			return EAGAIN;
		}
		/*
		 * We are blocked. Since flock style locks cover
		 * the whole file, there is no chance for deadlock.
		 * For byte-range locks we must check for deadlock.
		 *
		 * Deadlock detection is done by looking through the
		 * wait channels to see if there are any cycles that
		 * involve us. MAXDEPTH is set just to make sure we
		 * do not go off into neverneverland.
		 */
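		/*
		 * A minimal example of such a cycle (a sketch): process A
		 * holds a lock on bytes 0..9 and is asleep waiting for
		 * bytes 10..19, which we (curproc) hold.  If we now request
		 * bytes 0..9, following the owner of the blocking lock (A)
		 * through its wait channel leads straight back to curproc,
		 * so the request fails with EDEADLK.
		 */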
		if ((lock->lf_flags & F_POSIX) &&
		    (block->lf_flags & F_POSIX)) {
			struct lwp *wlwp;
			volatile const struct lockf *waitblock;
			int i = 0;
			struct proc *p;

			p = (struct proc *)block->lf_id;
			KASSERT(p != NULL);
			while (i++ < maxlockdepth) {
				mutex_enter(p->p_lock);
				if (p->p_nlwps > 1) {
					mutex_exit(p->p_lock);
					break;
				}
				wlwp = LIST_FIRST(&p->p_lwps);
				lwp_lock(wlwp);
				if (wlwp->l_wchan == NULL ||
				    wlwp->l_wmesg != lockstr) {
					lwp_unlock(wlwp);
					mutex_exit(p->p_lock);
					break;
				}
				waitblock = wlwp->l_wchan;
				lwp_unlock(wlwp);
				mutex_exit(p->p_lock);
				/* Get the owner of the blocking lock */
				waitblock = waitblock->lf_next;
				if ((waitblock->lf_flags & F_POSIX) == 0)
					break;
				p = (struct proc *)waitblock->lf_id;
				if (p == curproc) {
					lf_free(lock);
					return EDEADLK;
				}
			}
			/*
			 * If we're still following a dependency chain
			 * after maxlockdepth iterations, assume we're in
			 * a cycle to be safe.
			 */
			if (i >= maxlockdepth) {
				lf_free(lock);
				return EDEADLK;
			}
		}
		/*
		 * For flock type locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((lock->lf_flags & F_FLOCK) &&
		    lock->lf_type == F_WRLCK) {
			lock->lf_type = F_UNLCK;
			(void) lf_clearlock(lock, NULL);
			lock->lf_type = F_WRLCK;
		}
		/*
		 * Add our lock to the blocked list and sleep until we're free.
		 * Remember who blocked us (for deadlock detection).
		 */
		lock->lf_next = block;
		TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
#ifdef LOCKF_DEBUG
		if (lockf_debug & 1) {
			lf_print("lf_setlock: blocking on", block);
			lf_printlist("lf_setlock", block);
		}
#endif /* LOCKF_DEBUG */
		error = cv_wait_sig(&lock->lf_cv, interlock);

		/*
		 * We may have been awoken by a signal (in
		 * which case we must remove ourselves from the
		 * blocked list) and/or by another process
		 * releasing a lock (in which case we have already
		 * been removed from the blocked list and our
		 * lf_next field set to NULL).
		 */
		if (lock->lf_next != NULL) {
			TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
			lock->lf_next = NULL;
		}
		if (error) {
			lf_free(lock);
			return error;
		}
	}
	/*
	 * No blocks!!  Add the lock.  Note that we will
	 * downgrade or upgrade any overlapping locks this
	 * process already owns.
	 *
	 * Skip over locks owned by other processes.
	 * Handle any locks that overlap and are owned by ourselves.
	 */
	prev = head;
	block = *head;
	needtolink = 1;
	for (;;) {
		ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
		if (ovcase)
			block = overlap->lf_next;
		/*
		 * Six cases:
		 *	0) no overlap
		 *	1) overlap == lock
		 *	2) overlap contains lock
		 *	3) lock contains overlap
		 *	4) overlap starts before lock
		 *	5) overlap ends after lock
		 */
		switch (ovcase) {
		case 0: /* no overlap */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			break;

		case 1: /* overlap == lock */
			/*
			 * If downgrading lock, others may be
			 * able to acquire it.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK)
				lf_wakelock(overlap);
			overlap->lf_type = lock->lf_type;
			lf_free(lock);
			lock = overlap; /* for debug output below */
			break;

		case 2: /* overlap contains lock */
			/*
			 * Check for common starting point and different types.
			 */
			if (overlap->lf_type == lock->lf_type) {
				lf_free(lock);
				lock = overlap; /* for debug output below */
				break;
			}
			if (overlap->lf_start == lock->lf_start) {
				*prev = lock;
				lock->lf_next = overlap;
				overlap->lf_start = lock->lf_end + 1;
			} else
				lf_split(overlap, lock, sparelock);
			lf_wakelock(overlap);
			break;

		case 3: /* lock contains overlap */
			/*
			 * If downgrading lock, others may be able to
			 * acquire it, otherwise take the list.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK) {
				lf_wakelock(overlap);
			} else {
				while ((ltmp = TAILQ_FIRST(&overlap->lf_blkhd))) {
					KASSERT(ltmp->lf_next == overlap);
					TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
					    lf_block);
					ltmp->lf_next = lock;
					TAILQ_INSERT_TAIL(&lock->lf_blkhd,
					    ltmp, lf_block);
				}
			}
			/*
			 * Add the new lock if necessary and delete the overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap->lf_next;
				prev = &lock->lf_next;
				needtolink = 0;
			} else
				*prev = overlap->lf_next;
			lf_free(overlap);
			continue;

		case 4: /* overlap starts before lock */
			/*
			 * Add lock after overlap on the list.
			 */
			lock->lf_next = overlap->lf_next;
			overlap->lf_next = lock;
			overlap->lf_end = lock->lf_start - 1;
			prev = &lock->lf_next;
			lf_wakelock(overlap);
			needtolink = 0;
			continue;

		case 5: /* overlap ends after lock */
			/*
			 * Add the new lock before overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			overlap->lf_start = lock->lf_end + 1;
			lf_wakelock(overlap);
			break;
		}
		break;
	}
#ifdef LOCKF_DEBUG
	if (lockf_debug & 1) {
		lf_print("lf_setlock: got the lock", lock);
		lf_printlist("lf_setlock", lock);
	}
#endif /* LOCKF_DEBUG */
	return 0;
}

/*
 * Check whether there is a blocking lock,
 * and if so return its process identifier.
 */
static int
lf_getlock(struct lockf *lock, struct flock *fl)
{
	struct lockf *block;

#ifdef LOCKF_DEBUG
	if (lockf_debug & 1)
		lf_print("lf_getlock", lock);
#endif /* LOCKF_DEBUG */

	if ((block = lf_getblock(lock)) != NULL) {
		fl->l_type = block->lf_type;
		fl->l_whence = SEEK_SET;
		fl->l_start = block->lf_start;
		if (block->lf_end == -1)
			fl->l_len = 0;
		else
			fl->l_len = block->lf_end - block->lf_start + 1;
		if (block->lf_flags & F_POSIX)
			fl->l_pid = ((struct proc *)block->lf_id)->p_pid;
		else
			fl->l_pid = -1;
	} else {
		fl->l_type = F_UNLCK;
	}
	return 0;
}

/*
 * Do an advisory lock operation.
 */
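/*
 * A typical caller is a file system's VOP_ADVLOCK implementation, which
 * hands in the head of its per-inode lock list and the current file
 * size, roughly like this (a sketch; the node and field names are
 * illustrative and depend on the file system):
 *
 *	int
 *	foofs_advlock(void *v)
 *	{
 *		struct vop_advlock_args *ap = v;
 *		struct foonode *ip = VTOI(ap->a_vp);
 *
 *		return lf_advlock(ap, &ip->i_lockf, ip->i_size);
 *	}
 */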
int
lf_advlock(struct vop_advlock_args *ap, struct lockf **head, off_t size)
{
	struct flock *fl = ap->a_fl;
	struct lockf *lock = NULL;
	struct lockf *sparelock;
	kmutex_t *interlock = lockf_lock;
	off_t start, end;
	int error = 0;

	/*
	 * Convert the flock structure into a start and end.
	 */
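	/*
	 * For example, with l_whence == SEEK_SET and l_start == 100:
	 * l_len == 10 gives the range [100, 109], l_len == 0 gives
	 * [100, EOF] (end == -1), and l_len == -10 gives [90, 99].
	 */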
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:
		start = size + fl->l_start;
		break;

	default:
		return EINVAL;
	}

	if (fl->l_len == 0)
		end = -1;
	else {
		if (fl->l_len > 0)
			end = start + fl->l_len - 1;
		else {
			/* lockf() allows negative lengths */
			end = start - 1;
			start += fl->l_len;
		}
	}
	if (start < 0)
		return EINVAL;

	/*
	 * Allocate locks before acquiring the interlock.  We need two
	 * locks in the worst case.
	 */
	switch (ap->a_op) {
	case F_SETLK:
	case F_UNLCK:
		/*
		 * XXX For F_UNLCK case, we can re-use the lock.
		 */
		if ((ap->a_flags & F_FLOCK) == 0) {
			/*
			 * Byte-range lock might need one more lock.
			 */
			sparelock = lf_alloc(0);
			if (sparelock == NULL) {
				error = ENOMEM;
				goto quit;
			}
			break;
		}
		/* FALLTHROUGH */

	case F_GETLK:
		sparelock = NULL;
		break;

	default:
		return EINVAL;
	}

	switch (ap->a_op) {
	case F_SETLK:
		lock = lf_alloc(1);
		break;
	case F_UNLCK:
		if (start == 0 || end == -1) {
			/* never split */
			lock = lf_alloc(0);
		} else {
			/* might split */
			lock = lf_alloc(2);
		}
		break;
	case F_GETLK:
		lock = lf_alloc(0);
		break;
	}
	if (lock == NULL) {
		error = ENOMEM;
		goto quit;
	}

	mutex_enter(interlock);

	/*
	 * Avoid the common case of unlocking when inode has no locks.
	 */
	if (*head == (struct lockf *)0) {
		if (ap->a_op != F_SETLK) {
			fl->l_type = F_UNLCK;
			error = 0;
			goto quit_unlock;
		}
	}

	/*
	 * Create the lockf structure.
	 */
	lock->lf_start = start;
	lock->lf_end = end;
	lock->lf_head = head;
	lock->lf_type = fl->l_type;
	lock->lf_next = (struct lockf *)0;
	TAILQ_INIT(&lock->lf_blkhd);
	lock->lf_flags = ap->a_flags;
	if (lock->lf_flags & F_POSIX) {
		KASSERT(curproc == (struct proc *)ap->a_id);
	}
	lock->lf_id = ap->a_id;

	/*
	 * Do the requested operation.
	 */
	switch (ap->a_op) {

	case F_SETLK:
		error = lf_setlock(lock, &sparelock, interlock);
		lock = NULL; /* lf_setlock freed it */
		break;

	case F_UNLCK:
		error = lf_clearlock(lock, &sparelock);
		break;

	case F_GETLK:
		error = lf_getlock(lock, fl);
		break;

	default:
		break;
		/* NOTREACHED */
	}

quit_unlock:
	mutex_exit(interlock);
quit:
	if (lock)
		lf_free(lock);
	if (sparelock)
		lf_free(sparelock);

	return error;
}

/*
 * Initialize subsystem.   XXX We use a global lock.  This could be the
 * vnode interlock, but the deadlock detection code may need to inspect
 * locks belonging to other files.
 */
void
lf_init(void)
{

	lockf_cache = pool_cache_init(sizeof(struct lockf), 0, 0, 0, "lockf",
	    NULL, IPL_NONE, lf_ctor, lf_dtor, NULL);
	lockf_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
}