xref: /dragonfly/sys/kern/kern_lockf.c (revision 3d33658b)
/*
 * Copyright (c) 2004 Joerg Sonnenberger <joerg@bec.de>.  All rights reserved.
 * Copyright (c) 2006-2018 Matthew Dillon <dillon@backplane.com>.  All rights reserved.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_lockf.c	8.3 (Berkeley) 1/6/94
 * $FreeBSD: src/sys/kern/kern_lockf.c,v 1.25 1999/11/16 16:28:56 phk Exp $
 */

#include "opt_debug_lockf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/resourcevar.h>

#include <sys/lockf.h>
#include <machine/limits.h>	/* for LLONG_MAX */
#include <machine/stdarg.h>

#include <sys/spinlock2.h>

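/*
 * Small per-cpu cache of free lockf_range structures.  Two entries per
 * cpu cover the common case of transient allocations made by each lock
 * operation (see lf_alloc_range() and lf_destroy_range()).
 */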
struct lf_pcpu {
	struct lockf_range *free1;
	struct lockf_range *free2;
} __cachealign;

static struct lf_pcpu	*lf_pcpu_array;

#ifdef LOCKF_DEBUG
int lf_print_ranges = 0;

static void _lf_print_lock(const struct lockf *);
static void _lf_printf(const char *, ...) __printflike(1, 2);

#define lf_print_lock(lock) if (lf_print_ranges) _lf_print_lock(lock)
#define lf_printf(ctl, args...)	if (lf_print_ranges) _lf_printf(ctl, args)
#else
#define lf_print_lock(lock)
#define lf_printf(ctl, args...)
#endif

static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");

static void	lf_wakeup(struct lockf *, off_t, off_t);
static struct lockf_range *lf_alloc_range(void);
static void	lf_create_range(struct lockf_range *, struct proc *, int, int,
				off_t, off_t);
static void	lf_insert(struct lockf_range_list *list,
				struct lockf_range *elm,
				struct lockf_range *insert_point);
static void	lf_destroy_range(struct lockf_range *);

static int	lf_setlock(struct lockf *, struct proc *, int, int,
			   off_t, off_t);
static int	lf_getlock(struct flock *, struct lockf *, struct proc *,
			   int, int, off_t, off_t);

static int	lf_count_change(struct proc *, int);

/*
 * Return TRUE (non-zero) if the type and posix flags match.
 */
static __inline
int
lf_match(struct lockf_range *range, int type, int flags)
{
	if (range->lf_type != type)
		return(0);
	if ((range->lf_flags ^ flags) & F_POSIX)
		return(0);
	return(1);
}

/*
 * Check whether range and [start, end] overlap.
 */
static __inline
int
lf_overlap(const struct lockf_range *range, off_t start, off_t end)
{
	if (range->lf_start >= start && range->lf_start <= end)
		return(1);
	else if (start >= range->lf_start && start <= range->lf_end)
		return(1);
	else
		return(0);
}


/*
 * Change the POSIX lock accounting for the given process.
 */
void
lf_count_adjust(struct proc *p, int increase)
{
	struct uidinfo *uip;
	struct uidcount *pup;
	int n;

	KKASSERT(p != NULL);

	uip = p->p_ucred->cr_uidinfo;
	pup = &uip->ui_pcpu[mycpuid];

	if (increase) {
		for (n = 0; n < ncpus; ++n)
			pup->pu_posixlocks += p->p_uidpcpu[n].pu_posixlocks;
	} else {
		for (n = 0; n < ncpus; ++n)
			pup->pu_posixlocks -= p->p_uidpcpu[n].pu_posixlocks;
	}

	if (pup->pu_posixlocks < -PUP_LIMIT ||
	    pup->pu_posixlocks > PUP_LIMIT) {
		atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks);
		pup->pu_posixlocks = 0;
	}
}

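/*
 * Account for a change of 'diff' POSIX locks held by 'owner'.  Returns
 * non-zero, without making the change, if an increase would exceed the
 * per-uid POSIX lock limit (root is exempt), zero on success.
 */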
static int
lf_count_change(struct proc *owner, int diff)
{
	struct uidinfo *uip;
	int max, ret;

	/* we might actually not have a process context */
	if (owner == NULL)
		return(0);

	uip = owner->p_ucred->cr_uidinfo;

	max = MIN(owner->p_rlimit[RLIMIT_POSIXLOCKS].rlim_cur,
		  maxposixlocksperuid);

	if (diff > 0 && owner->p_ucred->cr_uid != 0 && max != -1 &&
	    uip->ui_posixlocks >= max) {
		ret = 1;
	} else {
		struct uidcount *pup;
		int cpu = mycpuid;

		pup = &uip->ui_pcpu[cpu];
		pup->pu_posixlocks += diff;
		if (pup->pu_posixlocks < -PUP_LIMIT ||
		    pup->pu_posixlocks > PUP_LIMIT) {
			atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks);
			pup->pu_posixlocks = 0;
		}
		owner->p_uidpcpu[cpu].pu_posixlocks += diff;
		ret = 0;
	}
	return ret;
}

/*
 * Advisory record locking support
 */
int
lf_advlock(struct vop_advlock_args *ap, struct lockf *lock, u_quad_t size)
{
	struct flock *fl = ap->a_fl;
	struct proc *owner;
	off_t start, end;
	int type, flags, error;
	lwkt_token_t token;

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:
		start = size + fl->l_start;
		break;

	default:
		return(EINVAL);
	}

	flags = ap->a_flags;
	if (start < 0)
		return(EINVAL);
	if (fl->l_len == 0) {
		flags |= F_NOEND;
		end = LLONG_MAX;
	} else if (fl->l_len < 0) {
		return(EINVAL);
	} else {
		end = start + fl->l_len - 1;
		if (end < start)
			return(EINVAL);
	}
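	/*
	 * For example, l_whence = SEEK_SET, l_start = 100, l_len = 10
	 * covers bytes [100, 109].  l_len = 0 means "to EOF and beyond"
	 * and is represented as [start, LLONG_MAX] with F_NOEND set.
	 */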

	type = fl->l_type;
	/*
	 * This isn't really correct for flock-style locks,
	 * but the current handling is somewhat broken anyway.
	 */
	owner = (struct proc *)ap->a_id;

	/*
	 * Do the requested operation.
	 */
	token = lwkt_getpooltoken(lock);

	if (lock->init_done == 0) {
		TAILQ_INIT(&lock->lf_range);
		TAILQ_INIT(&lock->lf_blocked);
		lock->init_done = 1;
	}

	switch(ap->a_op) {
	case F_SETLK:
		/*
		 * NOTE: It is possible for both lf_range and lf_blocked to
		 * be empty if we block and get woken up, but another process
		 * then gets in and issues an unlock.  So VMAYHAVELOCKS must
		 * be set after the lf_setlock() operation completes rather
		 * than before.
		 */
		error = lf_setlock(lock, owner, type, flags, start, end);
		if ((ap->a_vp->v_flag & VMAYHAVELOCKS) == 0)
			vsetflags(ap->a_vp, VMAYHAVELOCKS);
		break;

	case F_UNLCK:
		error = lf_setlock(lock, owner, type, flags, start, end);
#if 0
		/*
		 * XXX REMOVED. don't bother doing this in the critical path.
		 * close() overhead is minimal.
		 */
		if (TAILQ_EMPTY(&lock->lf_range) &&
		    TAILQ_EMPTY(&lock->lf_blocked)) {
			vclrflags(ap->a_vp, VMAYHAVELOCKS);
		}
#endif
		break;

	case F_GETLK:
		error = lf_getlock(fl, lock, owner, type, flags, start, end);
		break;

	default:
		error = EINVAL;
		break;
	}
	lwkt_reltoken(token);
	return(error);
}

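/*
 * Set (F_RDLCK/F_WRLCK) or release (F_UNLCK) a byte-range lock for 'owner'
 * over [start, end], clipping or merging the owner's existing ranges as
 * needed.  Blocks on a conflicting lock when F_WAIT is set.  Returns 0 on
 * success or an errno (EAGAIN, EDEADLK, ENOLCK, or a tsleep error).
 */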
static int
lf_setlock(struct lockf *lock, struct proc *owner, int type, int flags,
	   off_t start, off_t end)
{
	struct lockf_range *range;
	struct lockf_range *brange;
	struct lockf_range *next;
	struct lockf_range *first_match;
	struct lockf_range *last_match;
	struct lockf_range *insert_point;
	struct lockf_range *new_range1;
	struct lockf_range *new_range2;
	int wakeup_needed;
	int double_clip;
	int unlock_override;
	int error = 0;
	int count;
	struct lockf_range_list deadlist;

	new_range1 = NULL;
	new_range2 = NULL;
	count = 0;

restart:
	/*
	 * Preallocate two ranges so we don't have to worry about blocking
	 * in the middle of the lock code.
	 */
	if (new_range1 == NULL)
		new_range1 = lf_alloc_range();
	if (new_range2 == NULL)
		new_range2 = lf_alloc_range();
	first_match = NULL;
	last_match = NULL;
	insert_point = NULL;
	wakeup_needed = 0;

	lf_print_lock(lock);

	/*
	 * Locate the insertion point for the new lock (the first range
	 * with an lf_start >= start).
	 *
	 * Locate the first and last ranges owned by us that overlap
	 * the requested range.
	 */
	TAILQ_FOREACH(range, &lock->lf_range, lf_link) {
		if (insert_point == NULL && range->lf_start >= start)
			insert_point = range;

		/*
		 * Skip non-overlapping locks.  Locks are sorted by lf_start,
		 * so we can terminate the search when lf_start exceeds the
		 * requested range (insert_point is still guaranteed to be
		 * set properly).
		 */
		if (range->lf_end < start)
			continue;
		if (range->lf_start > end) {
			range = NULL;
			break;
		}

		/*
		 * Overlapping lock.  Set first_match and last_match if we
		 * are the owner.
		 */
		if (range->lf_owner == owner) {
			if (first_match == NULL)
				first_match = range;
			last_match = range;
			continue;
		}

		/*
		 * If we aren't the owner, check for a conflicting lock.
		 * Only do this when not unlocking.
		 */
		if (type != F_UNLCK) {
			if (type == F_WRLCK || range->lf_type == F_WRLCK)
				break;
		}
	}

	/*
	 * If a conflicting lock was observed, block or fail as appropriate.
	 * (this code is skipped when unlocking)
	 */
	if (range != NULL) {
		if ((flags & F_WAIT) == 0) {
			error = EAGAIN;
			goto do_cleanup;
		}

		/*
		 * We are blocked. For POSIX locks we have to check
		 * for deadlocks and return with EDEADLK. This is done
		 * by checking whether range->lf_owner is already
		 * blocked.
		 *
		 * Since flock-style locks cover the whole file, a
		 * deadlock between those is nearly impossible.
		 * This can only occur if a process tries to lock the
		 * same inode exclusively while holding a shared lock
		 * with another descriptor.
		 * XXX How can we cleanly detect this?
		 * XXX The current mixing of flock & fcntl/lockf is evil.
		 *
		 * Handle existing locks of flock-style like POSIX locks.
		 */
		if (flags & F_POSIX) {
			TAILQ_FOREACH(brange, &lock->lf_blocked, lf_link) {
				if (brange->lf_owner == range->lf_owner) {
					error = EDEADLK;
					goto do_cleanup;
				}
			}
		}

		/*
		 * For flock-style locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((flags & F_POSIX) == 0 && type == F_WRLCK)
			lf_setlock(lock, owner, F_UNLCK, 0, start, end);

		brange = new_range1;
		new_range1 = NULL;
		lf_create_range(brange, owner, type, 0, start, end);
		TAILQ_INSERT_TAIL(&lock->lf_blocked, brange, lf_link);
		error = tsleep(brange, PCATCH, "lockf", 0);

		/*
		 * We may have been awakened by a signal and/or by a
		 * debugger continuing us (in which case we must remove
		 * ourselves from the blocked list) and/or by another
		 * process releasing/downgrading a lock (in which case
		 * we have already been removed from the blocked list
		 * and our lf_flags field is 1).
		 *
		 * Sleep if it looks like we might be livelocking.
		 */
		if (brange->lf_flags == 0)
			TAILQ_REMOVE(&lock->lf_blocked, brange, lf_link);
		if (count == 2)
			tsleep(brange, 0, "lockfz", 2);
		else
			++count;
		lf_destroy_range(brange);

		if (error)
			goto do_cleanup;
		goto restart;
	}

	/*
	 * If there are no overlapping locks owned by us then creating
	 * the new lock is easy.  This is the most common case.
	 */
	if (first_match == NULL) {
		if (type == F_UNLCK)
			goto do_wakeup;
		if (flags & F_POSIX) {
			if (lf_count_change(owner, 1)) {
				error = ENOLCK;
				goto do_cleanup;
			}
		}
		range = new_range1;
		new_range1 = NULL;
		lf_create_range(range, owner, type, flags, start, end);
		lf_insert(&lock->lf_range, range, insert_point);
		goto do_wakeup;
	}

	/*
	 * double_clip - Calculate a special case where TWO locks may have
	 *		 to be added due to the new lock breaking up an
	 *		 existing incompatible lock in the middle.
	 *
	 * unlock_override - Calculate a special case where NO locks
	 *		 need to be created.  This occurs when an unlock
	 *		 does not clip any locks at the front and rear.
	 *
	 * WARNING!  closef() and fdrop() assume that an F_UNLCK of the
	 *	     entire range will always succeed so the unlock_override
	 *	     case is mandatory.
	 */
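	/*
	 * Example of double_clip: we hold [0, 99] and request a different
	 * lock type over [10, 19].  The middle of the existing range is
	 * broken out, leaving [0, 9] and [20, 99] plus the new [10, 19].
	 */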
	double_clip = 0;
	unlock_override = 0;
	if (first_match->lf_start < start) {
		if (first_match == last_match && last_match->lf_end > end)
			double_clip = 1;
	} else if (type == F_UNLCK && last_match->lf_end <= end) {
		unlock_override = 1;
	}

	/*
	 * Figure out the worst case net increase in POSIX locks and account
	 * for it now before we start modifying things.  If neither the
	 * first nor the last lock matches we have an issue.  If there is
	 * only one overlapping range which needs to be clipped on both ends
	 * we wind up having to create up to two new locks, else only one.
	 *
	 * When unlocking the worst case is always 1 new lock if our
	 * unlock request cuts the middle out of an existing lock range.
	 *
	 * count represents the 'cleanup' adjustment needed.  It starts
	 * negative, is incremented whenever we create a new POSIX lock,
	 * and decremented whenever we delete an existing one.  At the
	 * end of the day it had better be <= 0 or we didn't calculate the
	 * worst case properly here.
	 */
	count = 0;
	if ((flags & F_POSIX) && !unlock_override) {
		if (!lf_match(first_match, type, flags) &&
		    !lf_match(last_match, type, flags)
		) {
			if (double_clip && type != F_UNLCK)
				count = -2;
			else
				count = -1;
		}
		if (count && lf_count_change(owner, -count)) {
			error = ENOLCK;
			goto do_cleanup;
		}
	}
	/* else flock style lock which encompasses entire range */

	/*
	 * Create and insert the lock representing the requested range.
	 * Adjust the net POSIX lock count.  We have to move our insertion
	 * point since brange now represents the first record >= start.
	 *
	 * When unlocking, no new lock is inserted but we still clip.
	 */
	if (type != F_UNLCK) {
		brange = new_range1;
		new_range1 = NULL;
		lf_create_range(brange, owner, type, flags, start, end);
		lf_insert(&lock->lf_range, brange, insert_point);
		insert_point = brange;
		if (flags & F_POSIX)
			++count;
	} else {
		brange = NULL;
	}

	/*
	 * Handle the double_clip case.  This is the only case where
	 * we wind up having to add TWO locks.
	 */
	if (double_clip) {
		KKASSERT(first_match == last_match);
		last_match = new_range2;
		new_range2 = NULL;
		lf_create_range(last_match, first_match->lf_owner,
				first_match->lf_type, first_match->lf_flags,
				end + 1, first_match->lf_end);
		first_match->lf_end = start - 1;
		first_match->lf_flags &= ~F_NOEND;

		/*
		 * Figure out where to insert the right side clip.
		 */
		lf_insert(&lock->lf_range, last_match, first_match);
		if (last_match->lf_flags & F_POSIX)
			++count;
	}

	/*
	 * Clip or destroy the locks between first_match and last_match,
	 * inclusive.  Ignore the primary lock we created (brange).  Note
	 * that if double-clipped, first_match and last_match will be
	 * outside our clipping range.  Otherwise first_match and last_match
	 * will be deleted.
	 *
	 * We have already taken care of any double clipping.
	 *
	 * The insert_point may become invalid as we delete records, do not
	 * use that pointer any more.  Also, when removing something other
	 * than 'range' we have to check to see if the item we are removing
	 * is 'next' and adjust 'next' properly.
	 *
	 * NOTE: brange will be NULL if F_UNLCKing.
	 */
	TAILQ_INIT(&deadlist);
	next = first_match;

	while ((range = next) != NULL) {
		next = TAILQ_NEXT(range, lf_link);

		/*
		 * Ignore elements that we do not own and ignore the
		 * primary request range which we just created.
		 */
		if (range->lf_owner != owner || range == brange)
			continue;

		/*
		 * We may have to wakeup a waiter when downgrading a lock.
		 */
		if (type == F_UNLCK)
			wakeup_needed = 1;
		if (type == F_RDLCK && range->lf_type == F_WRLCK)
			wakeup_needed = 1;

		/*
		 * Clip left.  This can only occur on first_match.
		 *
		 * Merge the left clip with brange if possible.  This must
		 * be done specifically, not in the optimized merge heuristic
		 * below, since we may have counted on it in our 'count'
		 * calculation above.
		 */
		if (range->lf_start < start) {
			KKASSERT(range == first_match);
			if (brange &&
			    range->lf_end >= start - 1 &&
			    lf_match(range, type, flags)) {
				range->lf_end = brange->lf_end;
				range->lf_flags |= brange->lf_flags & F_NOEND;
				/*
				 * Removing something other than 'range',
				 * adjust 'next' if necessary.
				 */
				if (next == brange)
					next = TAILQ_NEXT(next, lf_link);
				TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
				if (brange->lf_flags & F_POSIX)
					--count;
				TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
				brange = range;
			} else if (range->lf_end >= start) {
				range->lf_end = start - 1;
				if (type != F_UNLCK)
					range->lf_flags &= ~F_NOEND;
			}
			if (range == last_match)
				break;
			continue;
		}

		/*
		 * Clip right.  This can only occur on last_match.
		 *
		 * Merge the right clip if possible.  This must be done
		 * specifically, not in the optimized merge heuristic
		 * below, since we may have counted on it in our 'count'
		 * calculation.
		 *
		 * Since we are adjusting lf_start, we have to move the
		 * record to maintain the sorted list.  Since lf_start is
		 * only getting larger we can use the next element as the
		 * insert point (we don't have to backtrack).
		 */
		if (range->lf_end > end) {
			KKASSERT(range == last_match);
			if (brange &&
			    range->lf_start <= end + 1 &&
			    lf_match(range, type, flags)) {
				brange->lf_end = range->lf_end;
				brange->lf_flags |= range->lf_flags & F_NOEND;
				TAILQ_REMOVE(&lock->lf_range, range, lf_link);
				if (range->lf_flags & F_POSIX)
					--count;
				TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
			} else if (range->lf_start <= end) {
				range->lf_start = end + 1;
				TAILQ_REMOVE(&lock->lf_range, range, lf_link);
				lf_insert(&lock->lf_range, range, next);
			}
			/* range == last_match, we are done */
			break;
		}

		/*
		 * The record must be entirely enclosed.  Note that the
		 * record could be first_match or last_match, and will be
		 * deleted.
		 */
		KKASSERT(range->lf_start >= start && range->lf_end <= end);
		TAILQ_REMOVE(&lock->lf_range, range, lf_link);
		if (range->lf_flags & F_POSIX)
			--count;
		TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
		if (range == last_match)
			break;
	}

	/*
	 * Attempt to merge locks adjacent to brange.  For example, we may
	 * have had to clip first_match and/or last_match, and they might
	 * be adjacent.  Or there might simply have been an adjacent lock
	 * already there.
	 *
	 * Don't get fancy, just check adjacent elements in the list if they
	 * happen to be owned by us.
	 *
	 * This case only gets hit if we have a situation where a shared
	 * and exclusive lock are adjacent, and the exclusive lock is
	 * downgraded to shared or the shared lock is upgraded to exclusive.
	 */
	if (brange) {
		range = TAILQ_PREV(brange, lockf_range_list, lf_link);
		if (range &&
		    range->lf_owner == owner &&
		    range->lf_end == brange->lf_start - 1 &&
		    lf_match(range, type, flags)
		) {
			/*
			 * Extend range to cover brange and scrap brange.
			 */
			range->lf_end = brange->lf_end;
			range->lf_flags |= brange->lf_flags & F_NOEND;
			TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
			if (brange->lf_flags & F_POSIX)
				--count;
			TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
			brange = range;
		}
		range = TAILQ_NEXT(brange, lf_link);
		if (range &&
		    range->lf_owner == owner &&
		    range->lf_start == brange->lf_end + 1 &&
		    lf_match(range, type, flags)
		) {
			/*
			 * Extend brange to cover range and scrap range.
			 */
			brange->lf_end = range->lf_end;
			brange->lf_flags |= range->lf_flags & F_NOEND;
			TAILQ_REMOVE(&lock->lf_range, range, lf_link);
			if (range->lf_flags & F_POSIX)
				--count;
			TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
		}
	}

	/*
	 * Destroy deleted elements.  We didn't want to do it in the loop
	 * because the free() might have blocked.
	 *
	 * Adjust the count for any posix locks we thought we might create
	 * but didn't.
	 */
	while ((range = TAILQ_FIRST(&deadlist)) != NULL) {
		TAILQ_REMOVE(&deadlist, range, lf_link);
		lf_destroy_range(range);
	}

	KKASSERT(count <= 0);
	if (count < 0)
		lf_count_change(owner, count);
do_wakeup:
	lf_print_lock(lock);
	if (wakeup_needed)
		lf_wakeup(lock, start, end);
	error = 0;
do_cleanup:
	if (new_range1 != NULL)
		lf_destroy_range(new_range1);
	if (new_range2 != NULL)
		lf_destroy_range(new_range2);
	return(error);
}

/*
 * Check whether there is a blocking lock,
 * and if so return its process identifier.
 */
static int
lf_getlock(struct flock *fl, struct lockf *lock, struct proc *owner,
	   int type, int flags, off_t start, off_t end)
{
	struct lockf_range *range;

	TAILQ_FOREACH(range, &lock->lf_range, lf_link)
		if (range->lf_owner != owner &&
		    lf_overlap(range, start, end) &&
		    (type == F_WRLCK || range->lf_type == F_WRLCK))
			break;
	if (range == NULL) {
		fl->l_type = F_UNLCK;
		return(0);
	}
	fl->l_type = range->lf_type;
	fl->l_whence = SEEK_SET;
	fl->l_start = range->lf_start;
	if (range->lf_flags & F_NOEND)
		fl->l_len = 0;
	else
		fl->l_len = range->lf_end - range->lf_start + 1;
	if (range->lf_owner != NULL && (range->lf_flags & F_POSIX))
		fl->l_pid = range->lf_owner->p_pid;
	else
		fl->l_pid = -1;
	return(0);
}

/*
 * Wakeup pending lock attempts.  Theoretically we can stop as soon as
 * we encounter an exclusive request that covers the whole range (at least
 * insofar as the sleep code above calls lf_wakeup() if it would otherwise
 * exit instead of loop), but for now just wakeup all overlapping
 * requests.  XXX
 */
static void
lf_wakeup(struct lockf *lock, off_t start, off_t end)
{
	struct lockf_range *range, *nrange;

	TAILQ_FOREACH_MUTABLE(range, &lock->lf_blocked, lf_link, nrange) {
		if (lf_overlap(range, start, end) == 0)
			continue;
		TAILQ_REMOVE(&lock->lf_blocked, range, lf_link);
		range->lf_flags = 1;
		wakeup(range);
	}
}

820  * Allocate a range structure and initialize it sufficiently such that
821  * lf_destroy_range() does not barf.
822  *
823  * Most use cases are temporary, implement a small 2-entry-per-cpu
824  * cache.
825  */
826 static struct lockf_range *
827 lf_alloc_range(void)
828 {
829 	struct lockf_range *range;
830 	struct lf_pcpu *lfpc;
831 
832 	lfpc = &lf_pcpu_array[mycpuid];
833 	if ((range = lfpc->free1) != NULL) {
834 		lfpc->free1 = NULL;
835 		return range;
836 	}
837 	if ((range = lfpc->free2) != NULL) {
838 		lfpc->free2 = NULL;
839 		return range;
840 	}
841 	range = kmalloc(sizeof(struct lockf_range), M_LOCKF, M_WAITOK);
842 	range->lf_owner = NULL;
843 
844 	return(range);
845 }
846 
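/*
 * Insert 'elm' into the range list, keeping the list sorted by lf_start.
 * 'insert_point' is a hint at or before the correct position; a NULL
 * hint appends to the tail.
 */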
static void
lf_insert(struct lockf_range_list *list, struct lockf_range *elm,
	  struct lockf_range *insert_point)
{
	while (insert_point && insert_point->lf_start < elm->lf_start)
		insert_point = TAILQ_NEXT(insert_point, lf_link);
	if (insert_point != NULL)
		TAILQ_INSERT_BEFORE(insert_point, elm, lf_link);
	else
		TAILQ_INSERT_TAIL(list, elm, lf_link);
}

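/*
 * Initialize a preallocated range structure with the given owner, type,
 * flags, and byte range.
 */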
static void
lf_create_range(struct lockf_range *range, struct proc *owner, int type,
		int flags, off_t start, off_t end)
{
	KKASSERT(start <= end);
	range->lf_type = type;
	range->lf_flags = flags;
	range->lf_start = start;
	range->lf_end = end;
	range->lf_owner = owner;

	lf_printf("lf_create_range: %ju..%ju\n",
	    (uintmax_t)range->lf_start, (uintmax_t)range->lf_end);
}

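/*
 * Dispose of a range structure, parking it in the per-cpu cache if a
 * slot is free, otherwise freeing it.
 */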
static void
lf_destroy_range(struct lockf_range *range)
{
	struct lf_pcpu *lfpc;

	lf_printf("lf_destroy_range: %ju..%ju\n",
		  (uintmax_t)range->lf_start, (uintmax_t)range->lf_end);

	lfpc = &lf_pcpu_array[mycpuid];
	if (lfpc->free1 == NULL) {
		range->lf_owner = NULL;
		lfpc->free1 = range;
		return;
	}
	if (lfpc->free2 == NULL) {
		range->lf_owner = NULL;
		lfpc->free2 = range;
		return;
	}
	kfree(range, M_LOCKF);
}

#ifdef LOCKF_DEBUG

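/*
 * Debug output helpers, active when lf_print_ranges is set, which dump
 * the active and blocked range lists.
 */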
static void
_lf_printf(const char *ctl, ...)
{
	struct proc *p;
	__va_list va;

	if (lf_print_ranges) {
	    if ((p = curproc) != NULL)
		kprintf("pid %d (%s): ", p->p_pid, p->p_comm);
	}
	__va_start(va, ctl);
	kvprintf(ctl, va);
	__va_end(va);
}

static void
_lf_print_lock(const struct lockf *lock)
{
	struct lockf_range *range;

	if (lf_print_ranges == 0)
		return;

	if (TAILQ_EMPTY(&lock->lf_range)) {
		lf_printf("lockf %p: no ranges locked\n", lock);
	} else {
		lf_printf("lockf %p:\n", lock);
	}
	TAILQ_FOREACH(range, &lock->lf_range, lf_link)
		kprintf("\t%jd..%jd type %s owned by %d\n",
		       (uintmax_t)range->lf_start, (uintmax_t)range->lf_end,
		       range->lf_type == F_RDLCK ? "shared" : "exclusive",
		       range->lf_flags & F_POSIX ? range->lf_owner->p_pid : -1);
	if (TAILQ_EMPTY(&lock->lf_blocked))
		kprintf("no process waiting for range\n");
	else
		kprintf("blocked locks:");
	TAILQ_FOREACH(range, &lock->lf_blocked, lf_link)
		kprintf("\t%jd..%jd type %s waiting on %p\n",
		       (uintmax_t)range->lf_start, (uintmax_t)range->lf_end,
		       range->lf_type == F_RDLCK ? "shared" : "exclusive",
		       range);
}
#endif /* LOCKF_DEBUG */

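/*
 * System initialization: allocate the per-cpu lockf_range cache array.
 */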
static void
lf_init(void *dummy __unused)
{
	lf_pcpu_array = kmalloc(sizeof(*lf_pcpu_array) * ncpus,
				M_LOCKF, M_WAITOK | M_ZERO);
}

SYSINIT(lockf, SI_BOOT2_MACHDEP, SI_ORDER_ANY, lf_init, NULL);