/*
 * Copyright (c) 2004 Joerg Sonnenberger <joerg@bec.de>. All rights reserved.
 * Copyright (c) 2006-2018 Matthew Dillon <dillon@backplane.com>. All rights reserved.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_lockf.c	8.3 (Berkeley) 1/6/94
 * $FreeBSD: src/sys/kern/kern_lockf.c,v 1.25 1999/11/16 16:28:56 phk Exp $
 */

#include "opt_debug_lockf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/resourcevar.h>

#include <sys/lockf.h>
#include <machine/limits.h>	/* for LLONG_MAX */
#include <machine/stdarg.h>

#include <sys/spinlock2.h>

struct lf_pcpu {
	struct lockf_range *free1;
	struct lockf_range *free2;
} __cachealign;

static struct lf_pcpu *lf_pcpu_array;

#ifdef LOCKF_DEBUG
int lf_print_ranges = 0;

static void	_lf_print_lock(const struct lockf *);
static void	_lf_printf(const char *, ...) __printflike(1, 2);

#define	lf_print_lock(lock) if (lf_print_ranges) _lf_print_lock(lock)
#define	lf_printf(ctl, args...) if (lf_print_ranges) _lf_printf(ctl, args)
#else
#define	lf_print_lock(lock)
#define	lf_printf(ctl, args...)
#endif

static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");

static void	lf_wakeup(struct lockf *, off_t, off_t);
static struct lockf_range *lf_alloc_range(void);
static void	lf_create_range(struct lockf_range *, struct proc *, int, int,
			off_t, off_t);
static void	lf_insert(struct lockf_range_list *list,
			struct lockf_range *elm,
			struct lockf_range *insert_point);
static void	lf_destroy_range(struct lockf_range *);

static int	lf_setlock(struct lockf *, struct proc *, int, int,
			off_t, off_t);
static int	lf_getlock(struct flock *, struct lockf *, struct proc *,
			int, int, off_t, off_t);

static int	lf_count_change(struct proc *, int);

/*
 * Return TRUE (non-zero) if the type and posix flags match.
 */
static __inline
int
lf_match(struct lockf_range *range, int type, int flags)
{
	if (range->lf_type != type)
		return(0);
	if ((range->lf_flags ^ flags) & F_POSIX)
		return(0);
	return(1);
}

/*
 * Check whether range and [start, end] overlap.
 */
static __inline
int
lf_overlap(const struct lockf_range *range, off_t start, off_t end)
{
	if (range->lf_start >= start && range->lf_start <= end)
		return(1);
	else if (start >= range->lf_start && start <= range->lf_end)
		return(1);
	else
		return(0);
}


/*
 * Change the POSIX lock accounting for the given process.
 */
void
lf_count_adjust(struct proc *p, int increase)
{
	struct uidinfo *uip;
	struct uidcount *pup;
	int n;

	KKASSERT(p != NULL);

	uip = p->p_ucred->cr_uidinfo;
	pup = &uip->ui_pcpu[mycpuid];

	if (increase) {
		for (n = 0; n < ncpus; ++n)
			pup->pu_posixlocks += p->p_uidpcpu[n].pu_posixlocks;
	} else {
		for (n = 0; n < ncpus; ++n)
			pup->pu_posixlocks -= p->p_uidpcpu[n].pu_posixlocks;
	}

	if (pup->pu_posixlocks < -PUP_LIMIT ||
	    pup->pu_posixlocks > PUP_LIMIT) {
		atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks);
		pup->pu_posixlocks = 0;
	}
}

static int
lf_count_change(struct proc *owner, int diff)
{
	struct uidinfo *uip;
	int max, ret;

	/* we might actually not have a process context */
	if (owner == NULL)
		return(0);

	uip = owner->p_ucred->cr_uidinfo;

	max = MIN(owner->p_rlimit[RLIMIT_POSIXLOCKS].rlim_cur,
		  maxposixlocksperuid);

	if (diff > 0 && owner->p_ucred->cr_uid != 0 && max != -1 &&
	    uip->ui_posixlocks >= max) {
		ret = 1;
	} else {
		struct uidcount *pup;
		int cpu = mycpuid;

		pup = &uip->ui_pcpu[cpu];
		pup->pu_posixlocks += diff;
		if (pup->pu_posixlocks < -PUP_LIMIT ||
		    pup->pu_posixlocks > PUP_LIMIT) {
			atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks);
			pup->pu_posixlocks = 0;
		}
		owner->p_uidpcpu[cpu].pu_posixlocks += diff;
		ret = 0;
	}
	return ret;
}
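
/*
 * Note on the accounting above: lf_count_change() batches the per-uid lock
 * count per cpu.  Each cpu accumulates a small signed delta in
 * uip->ui_pcpu[cpu].pu_posixlocks and only folds it into the shared
 * ui_posixlocks counter (with atomic_add_int()) once the local delta drifts
 * outside [-PUP_LIMIT, PUP_LIMIT].  For example, a process repeatedly taking
 * and releasing a single POSIX lock on one cpu just toggles the per-cpu
 * delta between 0 and 1 and never touches the shared counter, while a long
 * run of acquisitions is flushed only when the delta exceeds PUP_LIMIT.
 * The RLIMIT_POSIXLOCKS check therefore runs against a slightly stale
 * global count.
 */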

/*
 * Advisory record locking support
 */
int
lf_advlock(struct vop_advlock_args *ap, struct lockf *lock, u_quad_t size)
{
	struct flock *fl = ap->a_fl;
	struct proc *owner;
	off_t start, end;
	int type, flags, error;
	lwkt_token_t token;

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:
		start = size + fl->l_start;
		break;

	default:
		return(EINVAL);
	}

	flags = ap->a_flags;
	if (start < 0)
		return(EINVAL);
	if (fl->l_len == 0) {
		flags |= F_NOEND;
		end = LLONG_MAX;
	} else if (fl->l_len < 0) {
		return(EINVAL);
	} else {
		end = start + fl->l_len - 1;
		if (end < start)
			return(EINVAL);
	}

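	/*
	 * Worked example of the conversion above: l_whence = SEEK_SET,
	 * l_start = 100, l_len = 50 locks the byte range [100, 149].
	 * l_len = 0 means "to end of file" and is represented internally
	 * as [l_start, LLONG_MAX] with F_NOEND set, so the lock also
	 * covers any future growth of the file.  Negative lengths are
	 * rejected here, as is any range whose end would wrap below its
	 * start, so the code below never sees end < start.
	 */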
	type = fl->l_type;
	/*
	 * This isn't really correct for flock-style locks,
	 * but the current handling is somewhat broken anyway.
	 */
	owner = (struct proc *)ap->a_id;

	/*
	 * Do the requested operation.
	 */
	token = lwkt_getpooltoken(lock);

	if (lock->init_done == 0) {
		TAILQ_INIT(&lock->lf_range);
		TAILQ_INIT(&lock->lf_blocked);
		lock->init_done = 1;
	}

	switch(ap->a_op) {
	case F_SETLK:
		/*
		 * NOTE: It is possible for both lf_range and lf_blocked to
		 * be empty if we block and get woken up, but another process
		 * then gets in and issues an unlock.  So VMAYHAVELOCKS must
		 * be set after the lf_setlock() operation completes rather
		 * than before.
		 */
		error = lf_setlock(lock, owner, type, flags, start, end);
		if ((ap->a_vp->v_flag & VMAYHAVELOCKS) == 0)
			vsetflags(ap->a_vp, VMAYHAVELOCKS);
		break;

	case F_UNLCK:
		error = lf_setlock(lock, owner, type, flags, start, end);
#if 0
		/*
		 * XXX REMOVED.  Don't bother doing this in the critical
		 * path.  close() overhead is minimal.
		 */
		if (TAILQ_EMPTY(&lock->lf_range) &&
		    TAILQ_EMPTY(&lock->lf_blocked)) {
			vclrflags(ap->a_vp, VMAYHAVELOCKS);
		}
#endif
		break;

	case F_GETLK:
		error = lf_getlock(fl, lock, owner, type, flags, start, end);
		break;

	default:
		error = EINVAL;
		break;
	}
	lwkt_reltoken(token);
	return(error);
}
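
/*
 * Illustrative only, not compiled: a hypothetical userland sketch of the
 * kind of request that ends up here.  fcntl(2) byte-range lock requests
 * normally reach the filesystem through VOP_ADVLOCK(), which hands the
 * struct flock to lf_advlock() above (F_SETLKW causing F_WAIT to be set
 * in a_flags).  The helper name below is made up for the example.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int
example_lock_first_100_bytes(int fd)
{
	struct flock fl;

	fl.l_type = F_WRLCK;		/* exclusive (write) lock */
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 100;			/* bytes [0, 99] */
	return (fcntl(fd, F_SETLKW, &fl));	/* blocks until granted */
}
#endif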

static int
lf_setlock(struct lockf *lock, struct proc *owner, int type, int flags,
	   off_t start, off_t end)
{
	struct lockf_range *range;
	struct lockf_range *brange;
	struct lockf_range *next;
	struct lockf_range *first_match;
	struct lockf_range *last_match;
	struct lockf_range *insert_point;
	struct lockf_range *new_range1;
	struct lockf_range *new_range2;
	int wakeup_needed;
	int double_clip;
	int unlock_override;
	int error = 0;
	int count;
	struct lockf_range_list deadlist;

	new_range1 = NULL;
	new_range2 = NULL;
	count = 0;

restart:
	/*
	 * Preallocate two ranges so we don't have to worry about blocking
	 * in the middle of the lock code.
	 */
	if (new_range1 == NULL)
		new_range1 = lf_alloc_range();
	if (new_range2 == NULL)
		new_range2 = lf_alloc_range();
	first_match = NULL;
	last_match = NULL;
	insert_point = NULL;
	wakeup_needed = 0;

	lf_print_lock(lock);

	/*
	 * Locate the insertion point for the new lock (the first range
	 * with an lf_start >= start).
	 *
	 * Locate the first and last ranges owned by us that overlap
	 * the requested range.
	 */
	TAILQ_FOREACH(range, &lock->lf_range, lf_link) {
		if (insert_point == NULL && range->lf_start >= start)
			insert_point = range;

		/*
		 * Skip non-overlapping locks.  Locks are sorted by lf_start
		 * so we can terminate the search when lf_start exceeds the
		 * requested range (insert_point is still guaranteed to be
		 * set properly).
		 */
		if (range->lf_end < start)
			continue;
		if (range->lf_start > end) {
			range = NULL;
			break;
		}

		/*
		 * Overlapping lock.  Set first_match and last_match if we
		 * are the owner.
		 */
		if (range->lf_owner == owner) {
			if (first_match == NULL)
				first_match = range;
			last_match = range;
			continue;
		}

		/*
		 * If we aren't the owner check for a conflicting lock.
		 * Only if not unlocking.
		 */
		if (type != F_UNLCK) {
			if (type == F_WRLCK || range->lf_type == F_WRLCK)
				break;
		}
	}

	/*
	 * If a conflicting lock was observed, block or fail as appropriate.
	 * (this code is skipped when unlocking)
	 */
	if (range != NULL) {
		if ((flags & F_WAIT) == 0) {
			error = EAGAIN;
			goto do_cleanup;
		}

		/*
		 * We are blocked.  For POSIX locks we have to check
		 * for deadlocks and return with EDEADLK.  This is done
		 * by checking whether range->lf_owner is already
		 * blocked.
		 *
		 * Since flock-style locks cover the whole file, a
		 * deadlock between those is nearly impossible.
		 * This can only occur if a process tries to lock the
		 * same inode exclusively while holding a shared lock
		 * with another descriptor.
		 * XXX How can we cleanly detect this?
		 * XXX The current mixing of flock & fcntl/lockf is evil.
		 *
		 * Handle existing locks of flock-style like POSIX locks.
		 */
		if (flags & F_POSIX) {
			TAILQ_FOREACH(brange, &lock->lf_blocked, lf_link) {
				if (brange->lf_owner == range->lf_owner) {
					error = EDEADLK;
					goto do_cleanup;
				}
			}
		}
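
		/*
		 * Example of the situation caught above: process A holds a
		 * POSIX lock on [0,9] of this file and is sleeping on
		 * lf_blocked waiting for B's lock on [20,29].  If B now
		 * requests a range conflicting with A's [0,9] with F_WAIT
		 * set, B would sleep waiting for A while A sleeps waiting
		 * for B.  Because A already has an entry on lf_blocked,
		 * B gets EDEADLK instead of sleeping.
		 */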

		/*
		 * For flock-style locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((flags & F_POSIX) == 0 && type == F_WRLCK)
			lf_setlock(lock, owner, F_UNLCK, 0, start, end);

		brange = new_range1;
		new_range1 = NULL;
		lf_create_range(brange, owner, type, 0, start, end);
		TAILQ_INSERT_TAIL(&lock->lf_blocked, brange, lf_link);
		error = tsleep(brange, PCATCH, "lockf", 0);

		/*
		 * We may have been awakened by a signal and/or by a
		 * debugger continuing us (in which case we must remove
		 * ourselves from the blocked list) and/or by another
		 * process releasing/downgrading a lock (in which case
		 * we have already been removed from the blocked list
		 * and our lf_flags field is 1).
		 *
		 * Sleep if it looks like we might be livelocking.
		 */
		if (brange->lf_flags == 0)
			TAILQ_REMOVE(&lock->lf_blocked, brange, lf_link);
		if (error == 0 && count == 2)
			tsleep(brange, 0, "lockfz", 2);
		else
			++count;
		lf_destroy_range(brange);

		if (error)
			goto do_cleanup;
		goto restart;
	}

	/*
	 * If there are no overlapping locks owned by us then creating
	 * the new lock is easy.  This is the most common case.
	 */
	if (first_match == NULL) {
		if (type == F_UNLCK)
			goto do_wakeup;
		if (flags & F_POSIX) {
			if (lf_count_change(owner, 1)) {
				error = ENOLCK;
				goto do_cleanup;
			}
		}
		range = new_range1;
		new_range1 = NULL;
		lf_create_range(range, owner, type, flags, start, end);
		lf_insert(&lock->lf_range, range, insert_point);
		goto do_wakeup;
	}

	/*
	 * double_clip - Calculate a special case where TWO locks may have
	 *		 to be added due to the new lock breaking up an
	 *		 existing incompatible lock in the middle.
	 *
	 * unlock_override - Calculate a special case where NO locks
	 *		 need to be created.  This occurs when an unlock
	 *		 does not clip any locks at the front and rear.
	 *
	 * WARNING!  closef() and fdrop() assume that an F_UNLCK of the
	 *	     entire range will always succeed so the unlock_override
	 *	     case is mandatory.
	 */
	double_clip = 0;
	unlock_override = 0;
	if (first_match->lf_start < start) {
		if (first_match == last_match && last_match->lf_end > end)
			double_clip = 1;
	} else if (type == F_UNLCK && last_match->lf_end <= end) {
		unlock_override = 1;
	}
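
	/*
	 * Example: we already hold a lock on [0,99] and now request an
	 * incompatible type on [40,59].  first_match == last_match is the
	 * [0,99] range and it sticks out on both sides, so double_clip is
	 * set: the old range is trimmed to [0,39], the new lock covers
	 * [40,59], and a second range must be created for [60,99], hence
	 * the two preallocated ranges.  unlock_override covers the other
	 * extreme: an F_UNLCK that fully swallows every overlapping range
	 * we own clips nothing and needs no new range at all.
	 */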

	/*
	 * Figure out the worst case net increase in POSIX locks and account
	 * for it now before we start modifying things.  If neither the
	 * first nor the last lock matches we have an issue.  If there is
	 * only one overlapping range which needs to be clipped on both ends
	 * we wind up having to create up to two new locks, else only one.
	 *
	 * When unlocking the worst case is always 1 new lock if our
	 * unlock request cuts the middle out of an existing lock range.
	 *
	 * count represents the 'cleanup' adjustment needed.  It starts
	 * negative, is incremented whenever we create a new POSIX lock,
	 * and decremented whenever we delete an existing one.  At the
	 * end of the day it had better be <= 0 or we didn't calculate the
	 * worst case properly here.
	 */
	count = 0;
	if ((flags & F_POSIX) && !unlock_override) {
		if (!lf_match(first_match, type, flags) &&
		    !lf_match(last_match, type, flags)
		) {
			if (double_clip && type != F_UNLCK)
				count = -2;
			else
				count = -1;
		}
		if (count && lf_count_change(owner, -count)) {
			error = ENOLCK;
			goto do_cleanup;
		}
	}
	/* else flock style lock which encompasses entire range */

	/*
	 * Create and insert the lock representing the requested range.
	 * Adjust the net POSIX lock count.  We have to move our insertion
	 * point since brange now represents the first record >= start.
	 *
	 * When unlocking, no new lock is inserted but we still clip.
	 */
	if (type != F_UNLCK) {
		brange = new_range1;
		new_range1 = NULL;
		lf_create_range(brange, owner, type, flags, start, end);
		lf_insert(&lock->lf_range, brange, insert_point);
		insert_point = brange;
		if (flags & F_POSIX)
			++count;
	} else {
		brange = NULL;
	}

	/*
	 * Handle the double_clip case.  This is the only case where
	 * we wind up having to add TWO locks.
	 */
	if (double_clip) {
		KKASSERT(first_match == last_match);
		last_match = new_range2;
		new_range2 = NULL;
		lf_create_range(last_match, first_match->lf_owner,
				first_match->lf_type, first_match->lf_flags,
				end + 1, first_match->lf_end);
		first_match->lf_end = start - 1;
		first_match->lf_flags &= ~F_NOEND;

		/*
		 * Figure out where to insert the right side clip.
		 */
		lf_insert(&lock->lf_range, last_match, first_match);
		if (last_match->lf_flags & F_POSIX)
			++count;
	}

	/*
	 * Clip or destroy the locks between first_match and last_match,
	 * inclusive.  Ignore the primary lock we created (brange).  Note
	 * that if double-clipped, first_match and last_match will be
	 * outside our clipping range.  Otherwise first_match and last_match
	 * will be deleted.
	 *
	 * We have already taken care of any double clipping.
	 *
	 * The insert_point may become invalid as we delete records, do not
	 * use that pointer any more.  Also, when removing something other
	 * than 'range' we have to check to see if the item we are removing
	 * is 'next' and adjust 'next' properly.
	 *
	 * NOTE: brange will be NULL if F_UNLCKing.
	 */
	TAILQ_INIT(&deadlist);
	next = first_match;

	while ((range = next) != NULL) {
		next = TAILQ_NEXT(range, lf_link);

		/*
		 * Ignore elements that we do not own and ignore the
		 * primary request range which we just created.
		 */
		if (range->lf_owner != owner || range == brange)
			continue;

		/*
		 * We may have to wakeup a waiter when downgrading a lock.
		 */
		if (type == F_UNLCK)
			wakeup_needed = 1;
		if (type == F_RDLCK && range->lf_type == F_WRLCK)
			wakeup_needed = 1;

		/*
		 * Clip left.  This can only occur on first_match.
		 *
		 * Merge the left clip with brange if possible.  This must
		 * be done specifically, not in the optimized merge heuristic
		 * below, since we may have counted on it in our 'count'
		 * calculation above.
		 */
		if (range->lf_start < start) {
			KKASSERT(range == first_match);
			if (brange &&
			    range->lf_end >= start - 1 &&
			    lf_match(range, type, flags)) {
				range->lf_end = brange->lf_end;
				range->lf_flags |= brange->lf_flags & F_NOEND;
				/*
				 * Removing something other than 'range',
				 * adjust 'next' if necessary.
				 */
				if (next == brange)
					next = TAILQ_NEXT(next, lf_link);
				TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
				if (brange->lf_flags & F_POSIX)
					--count;
				TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
				brange = range;
			} else if (range->lf_end >= start) {
				range->lf_end = start - 1;
				if (type != F_UNLCK)
					range->lf_flags &= ~F_NOEND;
			}
			if (range == last_match)
				break;
			continue;
		}

		/*
		 * Clip right.  This can only occur on last_match.
		 *
		 * Merge the right clip if possible.  This must be done
		 * specifically, not in the optimized merge heuristic
		 * below, since we may have counted on it in our 'count'
		 * calculation.
		 *
		 * Since we are adjusting lf_start, we have to move the
		 * record to maintain the sorted list.  Since lf_start is
		 * only getting larger we can use the next element as the
		 * insert point (we don't have to backtrack).
		 */
		if (range->lf_end > end) {
			KKASSERT(range == last_match);
			if (brange &&
			    range->lf_start <= end + 1 &&
			    lf_match(range, type, flags)) {
				brange->lf_end = range->lf_end;
				brange->lf_flags |= range->lf_flags & F_NOEND;
				TAILQ_REMOVE(&lock->lf_range, range, lf_link);
				if (range->lf_flags & F_POSIX)
					--count;
				TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
			} else if (range->lf_start <= end) {
				range->lf_start = end + 1;
				TAILQ_REMOVE(&lock->lf_range, range, lf_link);
				lf_insert(&lock->lf_range, range, next);
			}
			/* range == last_match, we are done */
			break;
		}

		/*
		 * The record must be entirely enclosed.  Note that the
		 * record could be first_match or last_match, and will be
		 * deleted.
		 */
		KKASSERT(range->lf_start >= start && range->lf_end <= end);
		TAILQ_REMOVE(&lock->lf_range, range, lf_link);
		if (range->lf_flags & F_POSIX)
			--count;
		TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
		if (range == last_match)
			break;
	}

	/*
	 * Attempt to merge locks adjacent to brange.  For example, we may
	 * have had to clip first_match and/or last_match, and they might
	 * be adjacent.  Or there might simply have been an adjacent lock
	 * already there.
	 *
	 * Don't get fancy, just check adjacent elements in the list if they
	 * happen to be owned by us.
	 *
	 * This case only gets hit if we have a situation where a shared
	 * and exclusive lock are adjacent, and the exclusive lock is
	 * downgraded to shared or the shared lock is upgraded to exclusive.
	 */
	if (brange) {
		range = TAILQ_PREV(brange, lockf_range_list, lf_link);
		if (range &&
		    range->lf_owner == owner &&
		    range->lf_end == brange->lf_start - 1 &&
		    lf_match(range, type, flags)
		) {
			/*
			 * Extend range to cover brange and scrap brange.
			 */
			range->lf_end = brange->lf_end;
			range->lf_flags |= brange->lf_flags & F_NOEND;
			TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
			if (brange->lf_flags & F_POSIX)
				--count;
			TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
			brange = range;
		}
		range = TAILQ_NEXT(brange, lf_link);
		if (range &&
		    range->lf_owner == owner &&
		    range->lf_start == brange->lf_end + 1 &&
		    lf_match(range, type, flags)
		) {
			/*
			 * Extend brange to cover range and scrap range.
			 */
			brange->lf_end = range->lf_end;
			brange->lf_flags |= range->lf_flags & F_NOEND;
			TAILQ_REMOVE(&lock->lf_range, range, lf_link);
			if (range->lf_flags & F_POSIX)
				--count;
			TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
		}
	}
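
	/*
	 * Example of the adjacency merge above: we hold a shared lock on
	 * [0,49] and an exclusive lock on [50,99], then upgrade [0,49] to
	 * exclusive.  After the new exclusive range is inserted it sits
	 * next to the same-type [50,99] range, so the two are fused into a
	 * single [0,99] range and the leftover record is queued on deadlist
	 * for destruction below.
	 */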

	/*
	 * Destroy deleted elements.  We didn't want to do it in the loop
	 * because the free() might have blocked.
	 *
	 * Adjust the count for any posix locks we thought we might create
	 * but didn't.
	 */
	while ((range = TAILQ_FIRST(&deadlist)) != NULL) {
		TAILQ_REMOVE(&deadlist, range, lf_link);
		lf_destroy_range(range);
	}

	KKASSERT(count <= 0);
	if (count < 0)
		lf_count_change(owner, count);
do_wakeup:
	lf_print_lock(lock);
	if (wakeup_needed)
		lf_wakeup(lock, start, end);
	error = 0;
do_cleanup:
	if (new_range1 != NULL)
		lf_destroy_range(new_range1);
	if (new_range2 != NULL)
		lf_destroy_range(new_range2);
	return(error);
}

/*
 * Check whether there is a blocking lock,
 * and if so return its process identifier.
 */
static int
lf_getlock(struct flock *fl, struct lockf *lock, struct proc *owner,
	   int type, int flags, off_t start, off_t end)
{
	struct lockf_range *range;

	TAILQ_FOREACH(range, &lock->lf_range, lf_link)
		if (range->lf_owner != owner &&
		    lf_overlap(range, start, end) &&
		    (type == F_WRLCK || range->lf_type == F_WRLCK))
			break;
	if (range == NULL) {
		fl->l_type = F_UNLCK;
		return(0);
	}
	fl->l_type = range->lf_type;
	fl->l_whence = SEEK_SET;
	fl->l_start = range->lf_start;
	if (range->lf_flags & F_NOEND)
		fl->l_len = 0;
	else
		fl->l_len = range->lf_end - range->lf_start + 1;
	if (range->lf_owner != NULL && (range->lf_flags & F_POSIX))
		fl->l_pid = range->lf_owner->p_pid;
	else
		fl->l_pid = -1;
	return(0);
}
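
/*
 * Illustration: an F_GETLK request rewrites the caller's struct flock in
 * place.  If the blocking lock is, say, a to-end-of-file lock starting at
 * offset 200 (F_NOEND set), the caller sees the blocker's l_type,
 * l_whence = SEEK_SET, l_start = 200 and l_len = 0 ("to end of file"),
 * plus the blocker's pid for POSIX locks or -1 for flock-style locks.
 */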

/*
 * Wakeup pending lock attempts.  Theoretically we can stop as soon as
 * we encounter an exclusive request that covers the whole range (at least
 * insofar as the sleep code above calls lf_wakeup() if it would otherwise
 * exit instead of loop), but for now just wakeup all overlapping
 * requests.  XXX
 */
static void
lf_wakeup(struct lockf *lock, off_t start, off_t end)
{
	struct lockf_range *range, *nrange;

	TAILQ_FOREACH_MUTABLE(range, &lock->lf_blocked, lf_link, nrange) {
		if (lf_overlap(range, start, end) == 0)
			continue;
		TAILQ_REMOVE(&lock->lf_blocked, range, lf_link);
		range->lf_flags = 1;
		wakeup(range);
	}
}

/*
 * Allocate a range structure and initialize it sufficiently such that
 * lf_destroy_range() does not barf.
 *
 * Most use cases are temporary; implement a small 2-entry-per-cpu
 * cache.
 */
static struct lockf_range *
lf_alloc_range(void)
{
	struct lockf_range *range;
	struct lf_pcpu *lfpc;

	lfpc = &lf_pcpu_array[mycpuid];
	if ((range = lfpc->free1) != NULL) {
		lfpc->free1 = NULL;
		return range;
	}
	if ((range = lfpc->free2) != NULL) {
		lfpc->free2 = NULL;
		return range;
	}
	range = kmalloc(sizeof(struct lockf_range), M_LOCKF, M_WAITOK);
	range->lf_owner = NULL;

	return(range);
}

static void
lf_insert(struct lockf_range_list *list, struct lockf_range *elm,
	  struct lockf_range *insert_point)
{
	while (insert_point && insert_point->lf_start < elm->lf_start)
		insert_point = TAILQ_NEXT(insert_point, lf_link);
	if (insert_point != NULL)
		TAILQ_INSERT_BEFORE(insert_point, elm, lf_link);
	else
		TAILQ_INSERT_TAIL(list, elm, lf_link);
}

static void
lf_create_range(struct lockf_range *range, struct proc *owner, int type,
		int flags, off_t start, off_t end)
{
	KKASSERT(start <= end);
	range->lf_type = type;
	range->lf_flags = flags;
	range->lf_start = start;
	range->lf_end = end;
	range->lf_owner = owner;

	lf_printf("lf_create_range: %ju..%ju\n",
		  (uintmax_t)range->lf_start, (uintmax_t)range->lf_end);
}

static void
lf_destroy_range(struct lockf_range *range)
{
	struct lf_pcpu *lfpc;

	lf_printf("lf_destroy_range: %ju..%ju\n",
		  (uintmax_t)range->lf_start, (uintmax_t)range->lf_end);

	lfpc = &lf_pcpu_array[mycpuid];
	if (lfpc->free1 == NULL) {
		range->lf_owner = NULL;
		lfpc->free1 = range;
		return;
	}
	if (lfpc->free2 == NULL) {
		range->lf_owner = NULL;
		lfpc->free2 = range;
		return;
	}
	kfree(range, M_LOCKF);
}

#ifdef LOCKF_DEBUG

static void
_lf_printf(const char *ctl, ...)
{
	struct proc *p;
	__va_list va;

	if (lf_print_ranges) {
		if ((p = curproc) != NULL)
			kprintf("pid %d (%s): ", p->p_pid, p->p_comm);
	}
	__va_start(va, ctl);
	kvprintf(ctl, va);
	__va_end(va);
}

static void
_lf_print_lock(const struct lockf *lock)
{
	struct lockf_range *range;

	if (lf_print_ranges == 0)
		return;

	if (TAILQ_EMPTY(&lock->lf_range)) {
		lf_printf("lockf %p: no ranges locked\n", lock);
	} else {
		lf_printf("lockf %p:\n", lock);
	}
	TAILQ_FOREACH(range, &lock->lf_range, lf_link)
		kprintf("\t%jd..%jd type %s owned by %d\n",
			(uintmax_t)range->lf_start, (uintmax_t)range->lf_end,
			range->lf_type == F_RDLCK ? "shared" : "exclusive",
			range->lf_flags & F_POSIX ? range->lf_owner->p_pid : -1);
	if (TAILQ_EMPTY(&lock->lf_blocked))
		kprintf("no process waiting for range\n");
	else
		kprintf("blocked locks:\n");
	TAILQ_FOREACH(range, &lock->lf_blocked, lf_link)
		kprintf("\t%jd..%jd type %s waiting on %p\n",
			(uintmax_t)range->lf_start, (uintmax_t)range->lf_end,
			range->lf_type == F_RDLCK ? "shared" : "exclusive",
			range);
}
#endif /* LOCKF_DEBUG */

static void
lf_init(void *dummy __unused)
{
	lf_pcpu_array = kmalloc(sizeof(*lf_pcpu_array) * ncpus,
				M_LOCKF, M_WAITOK | M_ZERO);
}

SYSINIT(lockf, SI_BOOT2_MACHDEP, SI_ORDER_ANY, lf_init, NULL);