xref: /illumos-gate/usr/src/uts/common/os/flock.c (revision 494f7e12)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /*	All Rights Reserved */
29 
30 #include <sys/flock_impl.h>
31 #include <sys/vfs.h>
32 #include <sys/t_lock.h>		/* for <sys/callb.h> */
33 #include <sys/callb.h>
34 #include <sys/clconf.h>
35 #include <sys/cladm.h>
36 #include <sys/nbmlock.h>
37 #include <sys/cred.h>
38 #include <sys/policy.h>
39 
40 /*
41  * The following four variables are for statistics purposes and they are
42  * not protected by locks. They may not be accurate but will at least be
43  * close to the actual value.
44  */
45 
46 int	flk_lock_allocs;
47 int	flk_lock_frees;
48 int 	edge_allocs;
49 int	edge_frees;
50 int 	flk_proc_vertex_allocs;
51 int 	flk_proc_edge_allocs;
52 int	flk_proc_vertex_frees;
53 int	flk_proc_edge_frees;
54 
55 static kmutex_t flock_lock;
56 
57 #ifdef DEBUG
58 int check_debug = 0;
59 #define	CHECK_ACTIVE_LOCKS(gp)	if (check_debug) \
60 					check_active_locks(gp);
61 #define	CHECK_SLEEPING_LOCKS(gp)	if (check_debug) \
62 						check_sleeping_locks(gp);
63 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp) 	\
64 		if (check_debug)	\
65 			check_owner_locks(gp, pid, sysid, vp);
66 #define	CHECK_LOCK_TRANSITION(old_state, new_state) \
67 	{ \
68 		if (check_lock_transition(old_state, new_state)) { \
69 			cmn_err(CE_PANIC, "Illegal lock transition \
70 			    from %d to %d", old_state, new_state); \
71 		} \
72 	}
73 #else
74 
75 #define	CHECK_ACTIVE_LOCKS(gp)
76 #define	CHECK_SLEEPING_LOCKS(gp)
77 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
78 #define	CHECK_LOCK_TRANSITION(old_state, new_state)
79 
80 #endif /* DEBUG */
81 
82 struct kmem_cache	*flk_edge_cache;
83 
84 graph_t		*lock_graph[HASH_SIZE];
85 proc_graph_t	pgraph;
86 
87 /*
88  * Clustering.
89  *
90  * NLM REGISTRY TYPE IMPLEMENTATION
91  *
92  * Assumptions:
93  *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
94  *	integers; maximum node id is returned by clconf_maximum_nodeid().
95  *  2.  We use this node id to identify the node an NLM server runs on.
96  */
97 
98 /*
99  * NLM registry object keeps track of NLM servers via their
100  * nlmids (which are the node ids of the node in the cluster they run on)
101  * that have requested locks at this LLM with which this registry is
102  * associated.
103  *
104  * Representation of abstraction:
105  *    rep = record[	states: array[nlm_state],
106  *			lock: mutex]
107  *
108  *    Representation invariants:
109  *	1. index i of rep.states is between 0 and n - 1 where n is number
110  *	   of elements in the array, which happen to be the maximum number
111  *	   of nodes in the cluster configuration + 1.
112  *	2. map nlmid to index i of rep.states
113  *		0   -> 0
114  *		1   -> 1
115  *		2   -> 2
116  *		n-1 -> clconf_maximum_nodeid()+1
117  *	3.  This 1-1 mapping is quite convenient and it avoids errors resulting
118  *	    from forgetting to subtract 1 from the index.
119  *	4.  The reason we keep the 0th index is the following.  A legitimate
120  *	    cluster configuration includes making a UFS file system NFS
121  *	    exportable.  The code is structured so that if you're in a cluster
122  *	    you do one thing; otherwise, you do something else.  The problem
123  *	    is what to do if you think you're in a cluster with PXFS loaded,
124  *	    but you're using UFS not PXFS?  The upper two bytes of the sysid
125  *	    encode the node id of the node where NLM server runs; these bytes
126  *	    are zero for UFS.  Since the nodeid is used to index into the
127  *	    registry, we can record the NLM server state information at index
128  *	    0 using the same mechanism used for PXFS file locks!
129  */
130 static flk_nlm_status_t *nlm_reg_status = NULL;	/* state array 0..N-1 */
131 static kmutex_t nlm_reg_lock;			/* lock to protect arrary */
132 static uint_t nlm_status_size;			/* size of state array */
133 
134 /*
135  * Although we need a global lock dependency graph (and associated data
136  * structures), we also need a per-zone notion of whether the lock manager is
137  * running, and so whether to allow lock manager requests or not.
138  *
139  * Thus, on a per-zone basis we maintain a ``global'' variable
140  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
141  * manager is determined to be changing state (starting or stopping).
142  *
143  * Each graph/zone pair also has a copy of this variable, which is protected by
144  * the graph's mutex.
145  *
146  * The per-graph copies are used to synchronize lock requests with shutdown
147  * requests.  The global copy is used to initialize the per-graph field when a
148  * new graph is created.
149  */
150 struct flock_globals {
151 	flk_lockmgr_status_t flk_lockmgr_status;
152 	flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
153 };
154 
155 zone_key_t flock_zone_key;
156 
157 static void create_flock(lock_descriptor_t *, flock64_t *);
158 static lock_descriptor_t	*flk_get_lock(void);
159 static void	flk_free_lock(lock_descriptor_t	*lock);
160 static void	flk_get_first_blocking_lock(lock_descriptor_t *request);
161 static int flk_process_request(lock_descriptor_t *);
162 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
163 static edge_t *flk_get_edge(void);
164 static int flk_wait_execute_request(lock_descriptor_t *);
165 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
166 static void flk_insert_active_lock(lock_descriptor_t *);
167 static void flk_delete_active_lock(lock_descriptor_t *, int);
168 static void flk_insert_sleeping_lock(lock_descriptor_t *);
169 static void flk_graph_uncolor(graph_t *);
170 static void flk_wakeup(lock_descriptor_t *, int);
171 static void flk_free_edge(edge_t *);
172 static void flk_recompute_dependencies(lock_descriptor_t *,
173 			lock_descriptor_t **,  int, int);
174 static int flk_find_barriers(lock_descriptor_t *);
175 static void flk_update_barriers(lock_descriptor_t *);
176 static int flk_color_reachables(lock_descriptor_t *);
177 static int flk_canceled(lock_descriptor_t *);
178 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
179 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
180 static void wait_for_lock(lock_descriptor_t *);
181 static void unlock_lockmgr_granted(struct flock_globals *);
182 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
183 
184 /* Clustering hooks */
185 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
186 static void cl_flk_wakeup_sleeping_nlm_locks(int);
187 static void cl_flk_unlock_nlm_granted(int);
188 
189 #ifdef DEBUG
190 static int check_lock_transition(int, int);
191 static void check_sleeping_locks(graph_t *);
192 static void check_active_locks(graph_t *);
193 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
194 static void path(lock_descriptor_t *, lock_descriptor_t *);
195 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
196 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
197 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
198 #endif
199 
200 /*	proc_graph function definitions */
201 static int flk_check_deadlock(lock_descriptor_t *);
202 static void flk_proc_graph_uncolor(void);
203 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
204 static proc_edge_t *flk_get_proc_edge(void);
205 static void flk_proc_release(proc_vertex_t *);
206 static void flk_free_proc_edge(proc_edge_t *);
207 static void flk_update_proc_graph(edge_t *, int);
208 
209 /* Non-blocking mandatory locking */
210 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
211 			u_offset_t);
212 
213 static struct flock_globals *
214 flk_get_globals(void)
215 {
216 	/*
217 	 * The KLM module had better be loaded if we're attempting to handle
218 	 * lockmgr requests.
219 	 */
220 	ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
221 	return (zone_getspecific(flock_zone_key, curproc->p_zone));
222 }
223 
224 static flk_lockmgr_status_t
225 flk_get_lockmgr_status(void)
226 {
227 	struct flock_globals *fg;
228 
229 	ASSERT(MUTEX_HELD(&flock_lock));
230 
231 	if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
232 		/*
233 		 * KLM module not loaded; lock manager definitely not running.
234 		 */
235 		return (FLK_LOCKMGR_DOWN);
236 	}
237 	fg = flk_get_globals();
238 	return (fg->flk_lockmgr_status);
239 }
240 
241 /*
242  * Routine called from fs_frlock in fs/fs_subr.c
243  */
244 
245 int
246 reclock(vnode_t		*vp,
247 	flock64_t	*lckdat,
248 	int		cmd,
249 	int		flag,
250 	u_offset_t	offset,
251 	flk_callback_t	*flk_cbp)
252 {
253 	lock_descriptor_t	stack_lock_request;
254 	lock_descriptor_t	*lock_request;
255 	int error = 0;
256 	graph_t	*gp;
257 	int			nlmid;
258 
259 	/*
260 	 * Check access permissions
261 	 */
262 	if ((cmd & SETFLCK) &&
263 		((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
264 		(lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
265 			return (EBADF);
266 
267 	/*
268 	 * for query and unlock we use the stack_lock_request
269 	 */
270 
271 	if ((lckdat->l_type == F_UNLCK) ||
272 			!((cmd & INOFLCK) || (cmd & SETFLCK))) {
273 		lock_request = &stack_lock_request;
274 		(void) bzero((caddr_t)lock_request,
275 				sizeof (lock_descriptor_t));
276 
277 		/*
278 		 * following is added to make the assertions in
279 		 * flk_execute_request() to pass through
280 		 */
281 
282 		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
283 		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
284 		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
285 		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
286 		lock_request->l_status = FLK_INITIAL_STATE;
287 	} else {
288 		lock_request = flk_get_lock();
289 	}
290 	lock_request->l_state = 0;
291 	lock_request->l_vnode = vp;
292 	lock_request->l_zoneid = getzoneid();
293 
294 	/*
295 	 * Convert the request range into the canonical start and end
296 	 * values.  The NLM protocol supports locking over the entire
297 	 * 32-bit range, so there's no range checking for remote requests,
298 	 * but we still need to verify that local requests obey the rules.
299 	 */
300 	/* Clustering */
301 	if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
302 		ASSERT(lckdat->l_whence == 0);
303 		lock_request->l_start = lckdat->l_start;
304 		lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
305 			lckdat->l_start + (lckdat->l_len - 1);
306 	} else {
307 		/* check the validity of the lock range */
308 		error = flk_convert_lock_data(vp, lckdat,
309 			&lock_request->l_start, &lock_request->l_end,
310 			offset);
311 		if (error) {
312 			goto done;
313 		}
314 		error = flk_check_lock_data(lock_request->l_start,
315 					    lock_request->l_end, MAXEND);
316 		if (error) {
317 			goto done;
318 		}
319 	}
320 
321 	ASSERT(lock_request->l_end >= lock_request->l_start);
322 
323 	lock_request->l_type = lckdat->l_type;
324 	if (cmd & INOFLCK)
325 		lock_request->l_state |= IO_LOCK;
326 	if (cmd & SLPFLCK)
327 		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
328 	if (cmd & RCMDLCK)
329 		lock_request->l_state |= LOCKMGR_LOCK;
330 	if (cmd & NBMLCK)
331 		lock_request->l_state |= NBMAND_LOCK;
332 	/*
333 	 * Clustering: set flag for PXFS locks
334 	 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
335 	 * also be of type 'RCMDLCK'.
336 	 * We do not _only_ check the GETPXFSID() macro because local PXFS
337 	 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
338 	 */
339 
340 	if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
341 		lock_request->l_state |= PXFS_LOCK;
342 	}
343 	if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
344 		if (lock_request->l_type == F_RDLCK ||
345 			lock_request->l_type == F_WRLCK)
346 			lock_request->l_state |= QUERY_LOCK;
347 	}
348 	lock_request->l_flock = (*lckdat);
349 	lock_request->l_callbacks = flk_cbp;
350 
351 	/*
352 	 * We are ready for processing the request
353 	 */
354 	if (IS_LOCKMGR(lock_request)) {
355 		/*
356 		 * If the lock request is an NLM server request ....
357 		 */
358 		if (nlm_status_size == 0) { /* not booted as cluster */
359 			mutex_enter(&flock_lock);
360 			/*
361 			 * Bail out if this is a lock manager request and the
362 			 * lock manager is not supposed to be running.
363 			 */
364 			if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
365 				mutex_exit(&flock_lock);
366 				error = ENOLCK;
367 				goto done;
368 			}
369 			mutex_exit(&flock_lock);
370 		} else {			/* booted as a cluster */
371 			nlmid = GETNLMID(lock_request->l_flock.l_sysid);
372 			ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
373 
374 			mutex_enter(&nlm_reg_lock);
375 			/*
376 			 * If the NLM registry does not know about this
377 			 * NLM server making the request, add its nlmid
378 			 * to the registry.
379 			 */
380 			if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
381 				nlmid)) {
382 				FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
383 			} else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
384 				nlmid)) {
385 				/*
386 				 * If the NLM server is already known (has made
387 				 * previous lock requests) and its state is
388 				 * not NLM_UP (means that NLM server is
389 				 * shutting down), then bail out with an
390 				 * error to deny the lock request.
391 				 */
392 				mutex_exit(&nlm_reg_lock);
393 				error = ENOLCK;
394 				goto done;
395 			}
396 			mutex_exit(&nlm_reg_lock);
397 		}
398 	}
399 
400 	/* Now get the lock graph for a particular vnode */
401 	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
402 
403 	/*
404 	 * We drop rwlock here otherwise this might end up causing a
405 	 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
406 	 */
407 
408 	if (IS_IO_LOCK(lock_request)) {
409 		VOP_RWUNLOCK(vp,
410 			(lock_request->l_type == F_RDLCK) ?
411 				V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
412 	}
413 	mutex_enter(&gp->gp_mutex);
414 
415 	lock_request->l_state |= REFERENCED_LOCK;
416 	lock_request->l_graph = gp;
417 
418 	switch (lock_request->l_type) {
419 	case F_RDLCK:
420 	case F_WRLCK:
421 		if (IS_QUERY_LOCK(lock_request)) {
422 			flk_get_first_blocking_lock(lock_request);
423 			(*lckdat) = lock_request->l_flock;
424 			break;
425 		}
426 
427 		/* process the request now */
428 
429 		error = flk_process_request(lock_request);
430 		break;
431 
432 	case F_UNLCK:
433 		/* unlock request will not block so execute it immediately */
434 
435 		if (IS_LOCKMGR(lock_request) &&
436 		    flk_canceled(lock_request)) {
437 			error = 0;
438 		} else {
439 			error = flk_execute_request(lock_request);
440 		}
441 		break;
442 
443 	case F_UNLKSYS:
444 		/*
445 		 * Recovery mechanism to release lock manager locks when
446 		 * NFS client crashes and restart. NFS server will clear
447 		 * old locks and grant new locks.
448 		 */
449 
450 		if (lock_request->l_flock.l_sysid == 0) {
451 			mutex_exit(&gp->gp_mutex);
452 			return (EINVAL);
453 		}
454 		if (secpolicy_nfs(CRED()) != 0) {
455 			mutex_exit(&gp->gp_mutex);
456 			return (EPERM);
457 		}
458 		flk_delete_locks_by_sysid(lock_request);
459 		lock_request->l_state &= ~REFERENCED_LOCK;
460 		flk_set_state(lock_request, FLK_DEAD_STATE);
461 		flk_free_lock(lock_request);
462 		mutex_exit(&gp->gp_mutex);
463 		return (0);
464 
465 	default:
466 		error = EINVAL;
467 		break;
468 	}
469 
470 	/* Clustering: For blocked PXFS locks, return */
471 	if (error == PXFS_LOCK_BLOCKED) {
472 		lock_request->l_state &= ~REFERENCED_LOCK;
473 		mutex_exit(&gp->gp_mutex);
474 		return (error);
475 	}
476 
477 	/*
478 	 * Now that we have seen the status of locks in the system for
479 	 * this vnode we acquire the rwlock if it is an IO_LOCK.
480 	 */
481 
482 	if (IS_IO_LOCK(lock_request)) {
483 		(void) VOP_RWLOCK(vp,
484 			(lock_request->l_type == F_RDLCK) ?
485 				V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
486 		if (!error) {
487 			lckdat->l_type = F_UNLCK;
488 
489 			/*
490 			 * This wake up is needed otherwise
491 			 * if IO_LOCK has slept the dependents on this
492 			 * will not be woken up at all. (bugid # 1185482).
493 			 */
494 
495 			flk_wakeup(lock_request, 1);
496 			flk_set_state(lock_request, FLK_DEAD_STATE);
497 			flk_free_lock(lock_request);
498 		}
499 		/*
500 		 * else if error had occurred either flk_process_request()
501 		 * has returned EDEADLK in which case there will be no
502 		 * dependents for this lock or EINTR from flk_wait_execute_
503 		 * request() in which case flk_cancel_sleeping_lock()
504 		 * would have been done. same is true with EBADF.
505 		 */
506 	}
507 
508 	if (lock_request == &stack_lock_request) {
509 		flk_set_state(lock_request, FLK_DEAD_STATE);
510 	} else {
511 		lock_request->l_state &= ~REFERENCED_LOCK;
512 		if ((error != 0) || IS_DELETED(lock_request)) {
513 			flk_set_state(lock_request, FLK_DEAD_STATE);
514 			flk_free_lock(lock_request);
515 		}
516 	}
517 
518 	mutex_exit(&gp->gp_mutex);
519 	return (error);
520 
521 done:
522 	flk_set_state(lock_request, FLK_DEAD_STATE);
523 	if (lock_request != &stack_lock_request)
524 		flk_free_lock(lock_request);
525 	return (error);
526 }
527 
528 /*
529  * Invoke the callbacks in the given list.  If before sleeping, invoke in
530  * list order.  If after sleeping, invoke in reverse order.
531  *
532  * CPR (suspend/resume) support: if one of the callbacks returns a
533  * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
534  * while it is sleeping.  There should be at most one callb_cpr_t for the
535  * thread.
536  * XXX This is unnecessarily complicated.  The CPR information should just
537  * get passed in directly through VOP_FRLOCK and reclock, rather than
538  * sneaking it in via a callback.
539  */
540 
541 callb_cpr_t *
542 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
543 {
544 	callb_cpr_t *cpr_callbackp = NULL;
545 	callb_cpr_t *one_result;
546 	flk_callback_t *cb;
547 
548 	if (cblist == NULL)
549 		return (NULL);
550 
551 	if (when == FLK_BEFORE_SLEEP) {
552 		cb = cblist;
553 		do {
554 			one_result = (*cb->cb_callback)(when, cb->cb_data);
555 			if (one_result != NULL) {
556 				ASSERT(cpr_callbackp == NULL);
557 				cpr_callbackp = one_result;
558 			}
559 			cb = cb->cb_next;
560 		} while (cb != cblist);
561 	} else {
562 		cb = cblist->cb_prev;
563 		do {
564 			one_result = (*cb->cb_callback)(when, cb->cb_data);
565 			if (one_result != NULL) {
566 				cpr_callbackp = one_result;
567 			}
568 			cb = cb->cb_prev;
569 		} while (cb != cblist->cb_prev);
570 	}
571 
572 	return (cpr_callbackp);
573 }
574 
575 /*
576  * Initialize a flk_callback_t to hold the given callback.
577  */
578 
579 void
580 flk_init_callback(flk_callback_t *flk_cb,
581 	callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
582 {
583 	flk_cb->cb_next = flk_cb;
584 	flk_cb->cb_prev = flk_cb;
585 	flk_cb->cb_callback = cb_fcn;
586 	flk_cb->cb_data = cbdata;
587 }
588 
589 /*
590  * Initialize an flk_callback_t and then link it into the head of an
591  * existing list (which may be NULL).
592  */
593 
594 void
595 flk_add_callback(flk_callback_t *newcb,
596 		callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
597 		void *cbdata, flk_callback_t *cblist)
598 {
599 	flk_init_callback(newcb, cb_fcn, cbdata);
600 
601 	if (cblist == NULL)
602 		return;
603 
604 	newcb->cb_prev = cblist->cb_prev;
605 	newcb->cb_next = cblist;
606 	cblist->cb_prev->cb_next = newcb;
607 	cblist->cb_prev = newcb;
608 }
609 
610 /*
611  * Initialize the flk_edge_cache data structure and create the
612  * nlm_reg_status array.
613  */
614 
615 void
616 flk_init(void)
617 {
618 	uint_t	i;
619 
620 	flk_edge_cache = kmem_cache_create("flk_edges",
621 		sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
622 	if (flk_edge_cache == NULL) {
623 		cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
624 	}
625 	/*
626 	 * Create the NLM registry object.
627 	 */
628 
629 	if (cluster_bootflags & CLUSTER_BOOTED) {
630 		/*
631 		 * This routine tells you the maximum node id that will be used
632 		 * in the cluster.  This number will be the size of the nlm
633 		 * registry status array.  We add 1 because we will be using
634 		 * all entries indexed from 0 to maxnodeid; e.g., from 0
635 		 * to 64, for a total of 65 entries.
636 		 */
637 		nlm_status_size = clconf_maximum_nodeid() + 1;
638 	} else {
639 		nlm_status_size = 0;
640 	}
641 
642 	if (nlm_status_size != 0) {	/* booted as a cluster */
643 		nlm_reg_status = (flk_nlm_status_t *)
644 			kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
645 				KM_SLEEP);
646 
647 		/* initialize all NLM states in array to NLM_UNKNOWN */
648 		for (i = 0; i < nlm_status_size; i++) {
649 			nlm_reg_status[i] = FLK_NLM_UNKNOWN;
650 		}
651 	}
652 }
653 
654 /*
655  * Zone constructor/destructor callbacks to be executed when a zone is
656  * created/destroyed.
657  */
658 /* ARGSUSED */
659 void *
660 flk_zone_init(zoneid_t zoneid)
661 {
662 	struct flock_globals *fg;
663 	uint_t i;
664 
665 	fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
666 	fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
667 	for (i = 0; i < HASH_SIZE; i++)
668 		fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
669 	return (fg);
670 }
671 
672 /* ARGSUSED */
673 void
674 flk_zone_fini(zoneid_t zoneid, void *data)
675 {
676 	struct flock_globals *fg = data;
677 
678 	kmem_free(fg, sizeof (*fg));
679 }
680 
681 /*
682  * Get a lock_descriptor structure with initialization of edge lists.
683  */
684 
685 static lock_descriptor_t *
686 flk_get_lock(void)
687 {
688 	lock_descriptor_t	*l;
689 
690 	l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
691 
692 	cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
693 	l->l_edge.edge_in_next = &l->l_edge;
694 	l->l_edge.edge_in_prev = &l->l_edge;
695 	l->l_edge.edge_adj_next = &l->l_edge;
696 	l->l_edge.edge_adj_prev = &l->l_edge;
697 	l->pvertex = -1;
698 	l->l_status = FLK_INITIAL_STATE;
699 	flk_lock_allocs++;
700 	return (l);
701 }
702 
703 /*
704  * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
705  * when some thread has a reference to it as in reclock().
706  */
707 
708 void
709 flk_free_lock(lock_descriptor_t	*lock)
710 {
711 	ASSERT(IS_DEAD(lock));
712 	if (IS_REFERENCED(lock)) {
713 		lock->l_state |= DELETED_LOCK;
714 		return;
715 	}
716 	flk_lock_frees++;
717 	kmem_free((void *)lock, sizeof (lock_descriptor_t));
718 }
719 
720 void
721 flk_set_state(lock_descriptor_t *lock, int new_state)
722 {
723 	/*
724 	 * Locks in the sleeping list may be woken up in a number of ways,
725 	 * and more than once.  If a sleeping lock is signaled awake more
726 	 * than once, then it may or may not change state depending on its
727 	 * current state.
728 	 * Also note that NLM locks that are sleeping could be moved to an
729 	 * interrupted state more than once if the unlock request is
730 	 * retransmitted by the NLM client - the second time around, this is
731 	 * just a nop.
732 	 * The ordering of being signaled awake is:
733 	 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
734 	 * The checks below implement this ordering.
735 	 */
736 	if (IS_INTERRUPTED(lock)) {
737 		if ((new_state == FLK_CANCELLED_STATE) ||
738 		    (new_state == FLK_GRANTED_STATE) ||
739 		    (new_state == FLK_INTERRUPTED_STATE)) {
740 			return;
741 		}
742 	}
743 	if (IS_CANCELLED(lock)) {
744 		if ((new_state == FLK_GRANTED_STATE) ||
745 		    (new_state == FLK_CANCELLED_STATE)) {
746 			return;
747 		}
748 	}
749 	CHECK_LOCK_TRANSITION(lock->l_status, new_state);
750 	if (IS_PXFS(lock)) {
751 		cl_flk_state_transition_notify(lock, lock->l_status, new_state);
752 	}
753 	lock->l_status = new_state;
754 }
755 
756 /*
757  * Routine that checks whether there are any blocking locks in the system.
758  *
759  * The policy followed is if a write lock is sleeping we don't allow read
760  * locks before this write lock even though there may not be any active
761  * locks corresponding to the read locks' region.
762  *
763  * flk_add_edge() function adds an edge between l1 and l2 iff there
764  * is no path between l1 and l2. This is done to have a "minimum
765  * storage representation" of the dependency graph.
766  *
767  * Another property of the graph is since only the new request throws
768  * edges to the existing locks in the graph, the graph is always topologically
769  * ordered.
770  */
771 
772 static int
773 flk_process_request(lock_descriptor_t *request)
774 {
775 	graph_t	*gp = request->l_graph;
776 	lock_descriptor_t *lock;
777 	int request_blocked_by_active = 0;
778 	int request_blocked_by_granted = 0;
779 	int request_blocked_by_sleeping = 0;
780 	vnode_t	*vp = request->l_vnode;
781 	int	error = 0;
782 	int request_will_wait = 0;
783 	int found_covering_lock = 0;
784 	lock_descriptor_t *covered_by = NULL;
785 
786 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
787 	request_will_wait = IS_WILLING_TO_SLEEP(request);
788 
789 	/*
790 	 * check active locks
791 	 */
792 
793 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
794 
795 
796 	if (lock) {
797 		do {
798 			if (BLOCKS(lock, request)) {
799 				if (!request_will_wait)
800 					return (EAGAIN);
801 				request_blocked_by_active = 1;
802 				break;
803 			}
804 			/*
805 			 * Grant lock if it is for the same owner holding active
806 			 * lock that covers the request.
807 			 */
808 
809 			if (SAME_OWNER(lock, request) &&
810 					COVERS(lock, request) &&
811 						(request->l_type == F_RDLCK))
812 				return (flk_execute_request(request));
813 			lock = lock->l_next;
814 		} while (lock->l_vnode == vp);
815 	}
816 
817 	if (!request_blocked_by_active) {
818 			lock_descriptor_t *lk[1];
819 			lock_descriptor_t *first_glock = NULL;
820 		/*
821 		 * Shall we grant this?! NO!!
822 		 * What about those locks that were just granted and still
823 		 * in sleep queue. Those threads are woken up and so locks
824 		 * are almost active.
825 		 */
826 		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
827 		if (lock) {
828 			do {
829 				if (BLOCKS(lock, request)) {
830 					if (IS_GRANTED(lock)) {
831 						request_blocked_by_granted = 1;
832 					} else {
833 						request_blocked_by_sleeping = 1;
834 					}
835 				}
836 
837 				lock = lock->l_next;
838 			} while ((lock->l_vnode == vp));
839 			first_glock = lock->l_prev;
840 			ASSERT(first_glock->l_vnode == vp);
841 		}
842 
843 		if (request_blocked_by_granted)
844 			goto block;
845 
846 		if (!request_blocked_by_sleeping) {
847 			/*
848 			 * If the request isn't going to be blocked by a
849 			 * sleeping request, we know that it isn't going to
850 			 * be blocked; we can just execute the request --
851 			 * without performing costly deadlock detection.
852 			 */
853 			ASSERT(!request_blocked_by_active);
854 			return (flk_execute_request(request));
855 		} else if (request->l_type == F_RDLCK) {
856 			/*
857 			 * If we have a sleeping writer in the requested
858 			 * lock's range, block.
859 			 */
860 			goto block;
861 		}
862 
863 		lk[0] = request;
864 		request->l_state |= RECOMPUTE_LOCK;
865 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
866 		if (lock) {
867 			do {
868 				flk_recompute_dependencies(lock, lk, 1, 0);
869 				lock = lock->l_next;
870 			} while (lock->l_vnode == vp);
871 		}
872 		lock = first_glock;
873 		if (lock) {
874 			do {
875 				if (IS_GRANTED(lock)) {
876 				flk_recompute_dependencies(lock, lk, 1, 0);
877 				}
878 				lock = lock->l_prev;
879 			} while ((lock->l_vnode == vp));
880 		}
881 		request->l_state &= ~RECOMPUTE_LOCK;
882 		if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
883 			return (EDEADLK);
884 		return (flk_execute_request(request));
885 	}
886 
887 block:
888 	if (request_will_wait)
889 		flk_graph_uncolor(gp);
890 
891 	/* check sleeping locks */
892 
893 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
894 
895 	/*
896 	 * If we find a sleeping write lock that is a superset of the
897 	 * region wanted by request we can be assured that by adding an
898 	 * edge to this write lock we have paths to all locks in the
899 	 * graph that blocks the request except in one case and that is why
900 	 * another check for SAME_OWNER in the loop below. The exception
901 	 * case is when this process that owns the sleeping write lock 'l1'
902 	 * has other locks l2, l3, l4 that are in the system and arrived
903 	 * before l1. l1 does not have path to these locks as they are from
904 	 * same process. We break when we find a second covering sleeping
905 	 * lock l5 owned by a process different from that owning l1, because
906 	 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if
907 	 * it has l1 would have produced a deadlock already.
908 	 */
909 
910 	if (lock) {
911 		do {
912 			if (BLOCKS(lock, request)) {
913 				if (!request_will_wait)
914 					return (EAGAIN);
915 				if (COVERS(lock, request) &&
916 						lock->l_type == F_WRLCK) {
917 					if (found_covering_lock &&
918 					    !SAME_OWNER(lock, covered_by)) {
919 						found_covering_lock++;
920 						break;
921 					}
922 					found_covering_lock = 1;
923 					covered_by = lock;
924 				}
925 				if (found_covering_lock &&
926 					!SAME_OWNER(lock, covered_by)) {
927 					lock = lock->l_next;
928 					continue;
929 				}
930 				if ((error = flk_add_edge(request, lock,
931 						!found_covering_lock, 0)))
932 					return (error);
933 			}
934 			lock = lock->l_next;
935 		} while (lock->l_vnode == vp);
936 	}
937 
938 /*
939  * found_covering_lock == 2 iff at this point 'request' has paths
940  * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
941  * point 'request' has paths to all locks that blocks 'request' whose owners
942  * are not same as the one that covers 'request' (covered_by above) and
943  * we can have locks whose owner is same as covered_by in the active list.
944  */
945 
946 	if (request_blocked_by_active && found_covering_lock != 2) {
947 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
948 		ASSERT(lock != NULL);
949 		do {
950 			if (BLOCKS(lock, request)) {
951 				if (found_covering_lock &&
952 					!SAME_OWNER(lock, covered_by)) {
953 					lock = lock->l_next;
954 					continue;
955 				}
956 				if ((error = flk_add_edge(request, lock,
957 							CHECK_CYCLE, 0)))
958 					return (error);
959 			}
960 			lock = lock->l_next;
961 		} while (lock->l_vnode == vp);
962 	}
963 
964 	if (NOT_BLOCKED(request)) {
965 		/*
966 		 * request not dependent on any other locks
967 		 * so execute this request
968 		 */
969 		return (flk_execute_request(request));
970 	} else {
971 		/*
972 		 * check for deadlock
973 		 */
974 		if (flk_check_deadlock(request))
975 			return (EDEADLK);
976 		/*
977 		 * this thread has to sleep
978 		 */
979 		return (flk_wait_execute_request(request));
980 	}
981 }
982 
983 /*
984  * The actual execution of the request in the simple case is only to
985  * insert the 'request' in the list of active locks if it is not an
986  * UNLOCK.
987  * We have to consider the existing active locks' relation to
988  * this 'request' if they are owned by same process. flk_relation() does
989  * this job and sees to that the dependency graph information is maintained
990  * properly.
991  */
992 
993 int
994 flk_execute_request(lock_descriptor_t *request)
995 {
996 	graph_t	*gp = request->l_graph;
997 	vnode_t	*vp = request->l_vnode;
998 	lock_descriptor_t	*lock, *lock1;
999 	int done_searching = 0;
1000 
1001 	CHECK_SLEEPING_LOCKS(gp);
1002 	CHECK_ACTIVE_LOCKS(gp);
1003 
1004 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1005 
1006 	flk_set_state(request, FLK_START_STATE);
1007 
1008 	ASSERT(NOT_BLOCKED(request));
1009 
1010 	/* IO_LOCK requests are only to check status */
1011 
1012 	if (IS_IO_LOCK(request))
1013 		return (0);
1014 
1015 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1016 
1017 	if (lock == NULL && request->l_type == F_UNLCK)
1018 		return (0);
1019 	if (lock == NULL) {
1020 		flk_insert_active_lock(request);
1021 		return (0);
1022 	}
1023 
1024 	do {
1025 		lock1 = lock->l_next;
1026 		if (SAME_OWNER(request, lock)) {
1027 			done_searching = flk_relation(lock, request);
1028 		}
1029 		lock = lock1;
1030 	} while (lock->l_vnode == vp && !done_searching);
1031 
1032 	/*
1033 	 * insert in active queue
1034 	 */
1035 
1036 	if (request->l_type != F_UNLCK)
1037 		flk_insert_active_lock(request);
1038 
1039 	return (0);
1040 }
1041 
1042 /*
1043  * 'request' is blocked by some one therefore we put it into sleep queue.
1044  */
1045 static int
1046 flk_wait_execute_request(lock_descriptor_t *request)
1047 {
1048 	graph_t	*gp = request->l_graph;
1049 	callb_cpr_t 	*cprp;		/* CPR info from callback */
1050 	struct flock_globals *fg;
1051 	int index;
1052 
1053 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1054 	ASSERT(IS_WILLING_TO_SLEEP(request));
1055 
1056 	flk_insert_sleeping_lock(request);
1057 
1058 	if (IS_LOCKMGR(request)) {
1059 		index = HASH_INDEX(request->l_vnode);
1060 		fg = flk_get_globals();
1061 
1062 		if (nlm_status_size == 0) {	/* not booted as a cluster */
1063 			if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1064 				flk_cancel_sleeping_lock(request, 1);
1065 				return (ENOLCK);
1066 			}
1067 		} else {			/* booted as a cluster */
1068 			/*
1069 			 * If the request is an NLM server lock request,
1070 			 * and the NLM state of the lock request is not
1071 			 * NLM_UP (because the NLM server is shutting
1072 			 * down), then cancel the sleeping lock and
1073 			 * return error ENOLCK that will encourage the
1074 			 * client to retransmit.
1075 			 */
1076 			if (!IS_NLM_UP(request)) {
1077 				flk_cancel_sleeping_lock(request, 1);
1078 				return (ENOLCK);
1079 			}
1080 		}
1081 	}
1082 
1083 	/* Clustering: For blocking PXFS locks, return */
1084 	if (IS_PXFS(request)) {
1085 		/*
1086 		 * PXFS locks sleep on the client side.
1087 		 * The callback argument is used to wake up the sleeper
1088 		 * when the lock is granted.
1089 		 * We return -1 (rather than an errno value) to indicate
1090 		 * the client side should sleep
1091 		 */
1092 		return (PXFS_LOCK_BLOCKED);
1093 	}
1094 
1095 	if (request->l_callbacks != NULL) {
1096 		/*
1097 		 * To make sure the shutdown code works correctly, either
1098 		 * the callback must happen after putting the lock on the
1099 		 * sleep list, or we must check the shutdown status after
1100 		 * returning from the callback (and before sleeping).  At
1101 		 * least for now, we'll use the first option.  If a
1102 		 * shutdown or signal or whatever happened while the graph
1103 		 * mutex was dropped, that will be detected by
1104 		 * wait_for_lock().
1105 		 */
1106 		mutex_exit(&gp->gp_mutex);
1107 
1108 		cprp = flk_invoke_callbacks(request->l_callbacks,
1109 					    FLK_BEFORE_SLEEP);
1110 
1111 		mutex_enter(&gp->gp_mutex);
1112 
1113 		if (cprp == NULL) {
1114 			wait_for_lock(request);
1115 		} else {
1116 			mutex_enter(cprp->cc_lockp);
1117 			CALLB_CPR_SAFE_BEGIN(cprp);
1118 			mutex_exit(cprp->cc_lockp);
1119 			wait_for_lock(request);
1120 			mutex_enter(cprp->cc_lockp);
1121 			CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1122 			mutex_exit(cprp->cc_lockp);
1123 		}
1124 
1125 		mutex_exit(&gp->gp_mutex);
1126 		(void) flk_invoke_callbacks(request->l_callbacks,
1127 					    FLK_AFTER_SLEEP);
1128 		mutex_enter(&gp->gp_mutex);
1129 	} else {
1130 		wait_for_lock(request);
1131 	}
1132 
1133 	if (IS_LOCKMGR(request)) {
1134 		/*
1135 		 * If the lock manager is shutting down, return an
1136 		 * error that will encourage the client to retransmit.
1137 		 */
1138 		if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1139 			!IS_GRANTED(request)) {
1140 			flk_cancel_sleeping_lock(request, 1);
1141 			return (ENOLCK);
1142 		}
1143 	}
1144 
1145 	if (IS_INTERRUPTED(request)) {
1146 		/* we got a signal, or act like we did */
1147 		flk_cancel_sleeping_lock(request, 1);
1148 		return (EINTR);
1149 	}
1150 
1151 	/* Cancelled if some other thread has closed the file */
1152 
1153 	if (IS_CANCELLED(request)) {
1154 		flk_cancel_sleeping_lock(request, 1);
1155 		return (EBADF);
1156 	}
1157 
1158 	request->l_state &= ~GRANTED_LOCK;
1159 	REMOVE_SLEEP_QUEUE(request);
1160 	return (flk_execute_request(request));
1161 }
1162 
1163 /*
1164  * This routine adds an edge between from and to because from depends
1165  * to. If asked to check for deadlock it checks whether there are any
1166  * reachable locks from "from_lock" that is owned by the same process
1167  * as "from_lock".
1168  * NOTE: It is the caller's responsibility to make sure that the color
1169  * of the graph is consistent between the calls to flk_add_edge as done
1170  * in flk_process_request. This routine does not color and check for
1171  * deadlock explicitly.
1172  */
1173 
1174 static int
1175 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1176 			int check_cycle, int update_graph)
1177 {
1178 	edge_t	*edge;
1179 	edge_t	*ep;
1180 	lock_descriptor_t	*vertex;
1181 	lock_descriptor_t *vertex_stack;
1182 
1183 	STACK_INIT(vertex_stack);
1184 
1185 	/*
1186 	 * if to vertex already has mark_color just return
1187 	 * don't add an edge as it is reachable from from vertex
1188 	 * before itself.
1189 	 */
1190 
1191 	if (COLORED(to_lock))
1192 		return (0);
1193 
1194 	edge = flk_get_edge();
1195 
1196 	/*
1197 	 * set the from and to vertex
1198 	 */
1199 
1200 	edge->from_vertex = from_lock;
1201 	edge->to_vertex = to_lock;
1202 
1203 	/*
1204 	 * put in adjacency list of from vertex
1205 	 */
1206 
1207 	from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1208 	edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1209 	edge->edge_adj_prev = &from_lock->l_edge;
1210 	from_lock->l_edge.edge_adj_next = edge;
1211 
1212 	/*
1213 	 * put in in list of to vertex
1214 	 */
1215 
1216 	to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1217 	edge->edge_in_next = to_lock->l_edge.edge_in_next;
1218 	to_lock->l_edge.edge_in_next = edge;
1219 	edge->edge_in_prev = &to_lock->l_edge;
1220 
1221 
1222 	if (update_graph) {
1223 		flk_update_proc_graph(edge, 0);
1224 		return (0);
1225 	}
1226 	if (!check_cycle) {
1227 		return (0);
1228 	}
1229 
1230 	STACK_PUSH(vertex_stack, from_lock, l_stack);
1231 
1232 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1233 
1234 		STACK_POP(vertex_stack, l_stack);
1235 
1236 		for (ep = FIRST_ADJ(vertex);
1237 			ep != HEAD(vertex);
1238 				ep = NEXT_ADJ(ep)) {
1239 			if (COLORED(ep->to_vertex))
1240 				continue;
1241 			COLOR(ep->to_vertex);
1242 			if (SAME_OWNER(ep->to_vertex, from_lock))
1243 				goto dead_lock;
1244 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1245 		}
1246 	}
1247 	return (0);
1248 
1249 dead_lock:
1250 
1251 	/*
1252 	 * remove all edges
1253 	 */
1254 
1255 	ep = FIRST_ADJ(from_lock);
1256 
1257 	while (ep != HEAD(from_lock)) {
1258 		IN_LIST_REMOVE(ep);
1259 		from_lock->l_sedge = NEXT_ADJ(ep);
1260 		ADJ_LIST_REMOVE(ep);
1261 		flk_free_edge(ep);
1262 		ep = from_lock->l_sedge;
1263 	}
1264 	return (EDEADLK);
1265 }
1266 
1267 /*
1268  * Get an edge structure for representing the dependency between two locks.
1269  */
1270 
1271 static edge_t *
1272 flk_get_edge()
1273 {
1274 	edge_t	*ep;
1275 
1276 	ASSERT(flk_edge_cache != NULL);
1277 
1278 	ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1279 	edge_allocs++;
1280 	return (ep);
1281 }
1282 
1283 /*
1284  * Free the edge structure.
1285  */
1286 
1287 static void
1288 flk_free_edge(edge_t *ep)
1289 {
1290 	edge_frees++;
1291 	kmem_cache_free(flk_edge_cache, (void *)ep);
1292 }
1293 
1294 /*
1295  * Check the relationship of request with lock and perform the
1296  * recomputation of dependencies, break lock if required, and return
1297  * 1 if request cannot have any more relationship with the next
1298  * active locks.
1299  * The 'lock' and 'request' are compared and in case of overlap we
1300  * delete the 'lock' and form new locks to represent the non-overlapped
1301  * portion of original 'lock'. This function has side effects such as
1302  * 'lock' will be freed, new locks will be added to the active list.
1303  */
1304 
1305 static int
1306 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1307 {
1308 	int lock_effect;
1309 	lock_descriptor_t *lock1, *lock2;
1310 	lock_descriptor_t *topology[3];
1311 	int nvertex = 0;
1312 	int i;
1313 	edge_t	*ep;
1314 	graph_t	*gp = (lock->l_graph);
1315 
1316 
1317 	CHECK_SLEEPING_LOCKS(gp);
1318 	CHECK_ACTIVE_LOCKS(gp);
1319 
1320 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1321 
1322 	topology[0] = topology[1] = topology[2] = NULL;
1323 
1324 	if (request->l_type == F_UNLCK)
1325 		lock_effect = FLK_UNLOCK;
1326 	else if (request->l_type == F_RDLCK &&
1327 			lock->l_type == F_WRLCK)
1328 		lock_effect = FLK_DOWNGRADE;
1329 	else if (request->l_type == F_WRLCK &&
1330 			lock->l_type == F_RDLCK)
1331 		lock_effect = FLK_UPGRADE;
1332 	else
1333 		lock_effect = FLK_STAY_SAME;
1334 
1335 	if (lock->l_end < request->l_start) {
1336 		if (lock->l_end == request->l_start - 1 &&
1337 				lock_effect == FLK_STAY_SAME) {
1338 			topology[0] = request;
1339 			request->l_start = lock->l_start;
1340 			nvertex = 1;
1341 			goto recompute;
1342 		} else {
1343 			return (0);
1344 		}
1345 	}
1346 
1347 	if (lock->l_start > request->l_end) {
1348 		if (request->l_end == lock->l_start - 1 &&
1349 					lock_effect == FLK_STAY_SAME) {
1350 			topology[0] = request;
1351 			request->l_end = lock->l_end;
1352 			nvertex = 1;
1353 			goto recompute;
1354 		} else {
1355 			return (1);
1356 		}
1357 	}
1358 
1359 	if (request->l_end < lock->l_end) {
1360 		if (request->l_start > lock->l_start) {
1361 			if (lock_effect == FLK_STAY_SAME) {
1362 				request->l_start = lock->l_start;
1363 				request->l_end = lock->l_end;
1364 				topology[0] = request;
1365 				nvertex = 1;
1366 			} else {
1367 				lock1 = flk_get_lock();
1368 				lock2 = flk_get_lock();
1369 				COPY(lock1, lock);
1370 				COPY(lock2, lock);
1371 				lock1->l_start = lock->l_start;
1372 				lock1->l_end = request->l_start - 1;
1373 				lock2->l_start = request->l_end + 1;
1374 				lock2->l_end = lock->l_end;
1375 				topology[0] = lock1;
1376 				topology[1] = lock2;
1377 				topology[2] = request;
1378 				nvertex = 3;
1379 			}
1380 		} else if (request->l_start < lock->l_start) {
1381 			if (lock_effect == FLK_STAY_SAME) {
1382 				request->l_end = lock->l_end;
1383 				topology[0] = request;
1384 				nvertex = 1;
1385 			} else {
1386 				lock1 = flk_get_lock();
1387 				COPY(lock1, lock);
1388 				lock1->l_start = request->l_end + 1;
1389 				topology[0] = lock1;
1390 				topology[1] = request;
1391 				nvertex = 2;
1392 			}
1393 		} else  {
1394 			if (lock_effect == FLK_STAY_SAME) {
1395 				request->l_start = lock->l_start;
1396 				request->l_end = lock->l_end;
1397 				topology[0] = request;
1398 				nvertex = 1;
1399 			} else {
1400 				lock1 = flk_get_lock();
1401 				COPY(lock1, lock);
1402 				lock1->l_start = request->l_end + 1;
1403 				topology[0] = lock1;
1404 				topology[1] = request;
1405 				nvertex = 2;
1406 			}
1407 		}
1408 	} else if (request->l_end > lock->l_end) {
1409 		if (request->l_start > lock->l_start)  {
1410 			if (lock_effect == FLK_STAY_SAME) {
1411 				request->l_start = lock->l_start;
1412 				topology[0] = request;
1413 				nvertex = 1;
1414 			} else {
1415 				lock1 = flk_get_lock();
1416 				COPY(lock1, lock);
1417 				lock1->l_end = request->l_start - 1;
1418 				topology[0] = lock1;
1419 				topology[1] = request;
1420 				nvertex = 2;
1421 			}
1422 		} else if (request->l_start < lock->l_start)  {
1423 			topology[0] = request;
1424 			nvertex = 1;
1425 		} else {
1426 			topology[0] = request;
1427 			nvertex = 1;
1428 		}
1429 	} else {
1430 		if (request->l_start > lock->l_start) {
1431 			if (lock_effect == FLK_STAY_SAME) {
1432 				request->l_start = lock->l_start;
1433 				topology[0] = request;
1434 				nvertex = 1;
1435 			} else {
1436 				lock1 = flk_get_lock();
1437 				COPY(lock1, lock);
1438 				lock1->l_end = request->l_start - 1;
1439 				topology[0] = lock1;
1440 				topology[1] = request;
1441 				nvertex = 2;
1442 			}
1443 		} else if (request->l_start < lock->l_start) {
1444 			topology[0] = request;
1445 			nvertex = 1;
1446 		} else {
1447 			if (lock_effect !=  FLK_UNLOCK) {
1448 				topology[0] = request;
1449 				nvertex = 1;
1450 			} else {
1451 				flk_delete_active_lock(lock, 0);
1452 				flk_wakeup(lock, 1);
1453 				flk_free_lock(lock);
1454 				CHECK_SLEEPING_LOCKS(gp);
1455 				CHECK_ACTIVE_LOCKS(gp);
1456 				return (1);
1457 			}
1458 		}
1459 	}
1460 
1461 recompute:
1462 
1463 	/*
1464 	 * For unlock we don't send the 'request' to for recomputing
1465 	 * dependencies because no lock will add an edge to this.
1466 	 */
1467 
1468 	if (lock_effect == FLK_UNLOCK) {
1469 		topology[nvertex-1] = NULL;
1470 		nvertex--;
1471 	}
1472 	for (i = 0; i < nvertex; i++) {
1473 		topology[i]->l_state |= RECOMPUTE_LOCK;
1474 		topology[i]->l_color = NO_COLOR;
1475 	}
1476 
1477 	ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1478 
1479 	/*
1480 	 * we remove the adjacent edges for all vertices' to this vertex
1481 	 * 'lock'.
1482 	 */
1483 
1484 	ep = FIRST_IN(lock);
1485 	while (ep != HEAD(lock)) {
1486 		ADJ_LIST_REMOVE(ep);
1487 		ep = NEXT_IN(ep);
1488 	}
1489 
1490 	flk_delete_active_lock(lock, 0);
1491 
1492 	/* We are ready for recomputing the dependencies now */
1493 
1494 	flk_recompute_dependencies(lock, topology, nvertex, 1);
1495 
1496 	for (i = 0; i < nvertex; i++) {
1497 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
1498 		topology[i]->l_color = NO_COLOR;
1499 	}
1500 
1501 
1502 	if (lock_effect == FLK_UNLOCK) {
1503 		nvertex++;
1504 	}
1505 	for (i = 0; i < nvertex - 1; i++) {
1506 		flk_insert_active_lock(topology[i]);
1507 	}
1508 
1509 
1510 	if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1511 		flk_wakeup(lock, 0);
1512 	} else {
1513 		ep = FIRST_IN(lock);
1514 		while (ep != HEAD(lock)) {
1515 			lock->l_sedge = NEXT_IN(ep);
1516 			IN_LIST_REMOVE(ep);
1517 			flk_update_proc_graph(ep, 1);
1518 			flk_free_edge(ep);
1519 			ep = lock->l_sedge;
1520 		}
1521 	}
1522 	flk_free_lock(lock);
1523 
1524 	CHECK_SLEEPING_LOCKS(gp);
1525 	CHECK_ACTIVE_LOCKS(gp);
1526 	return (0);
1527 }
1528 
1529 /*
1530  * Insert a lock into the active queue.
1531  */
1532 
1533 static void
1534 flk_insert_active_lock(lock_descriptor_t *new_lock)
1535 {
1536 	graph_t	*gp = new_lock->l_graph;
1537 	vnode_t	*vp = new_lock->l_vnode;
1538 	lock_descriptor_t *first_lock, *lock;
1539 
1540 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1541 
1542 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1543 	first_lock = lock;
1544 
1545 	if (first_lock != NULL) {
1546 		for (; (lock->l_vnode == vp &&
1547 			lock->l_start < new_lock->l_start); lock = lock->l_next)
1548 			;
1549 	} else {
1550 		lock = ACTIVE_HEAD(gp);
1551 	}
1552 
1553 	lock->l_prev->l_next = new_lock;
1554 	new_lock->l_next = lock;
1555 	new_lock->l_prev = lock->l_prev;
1556 	lock->l_prev = new_lock;
1557 
1558 	if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1559 		vp->v_filocks = (struct filock *)new_lock;
1560 	}
1561 	flk_set_state(new_lock, FLK_ACTIVE_STATE);
1562 	new_lock->l_state |= ACTIVE_LOCK;
1563 
1564 	CHECK_ACTIVE_LOCKS(gp);
1565 	CHECK_SLEEPING_LOCKS(gp);
1566 }
1567 
1568 /*
1569  * Delete the active lock : Performs two functions depending on the
1570  * value of second parameter. One is to remove from the active lists
1571  * only and other is to both remove and free the lock.
1572  */
1573 
1574 static void
1575 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1576 {
1577 	vnode_t *vp = lock->l_vnode;
1578 	graph_t	*gp = lock->l_graph;
1579 
1580 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1581 	if (free_lock)
1582 		ASSERT(NO_DEPENDENTS(lock));
1583 	ASSERT(NOT_BLOCKED(lock));
1584 	ASSERT(IS_ACTIVE(lock));
1585 
1586 	ASSERT((vp->v_filocks != NULL));
1587 
1588 	if (vp->v_filocks == (struct filock *)lock) {
1589 		vp->v_filocks = (struct filock *)
1590 				((lock->l_next->l_vnode == vp) ? lock->l_next :
1591 								NULL);
1592 	}
1593 	lock->l_next->l_prev = lock->l_prev;
1594 	lock->l_prev->l_next = lock->l_next;
1595 	lock->l_next = lock->l_prev = NULL;
1596 	flk_set_state(lock, FLK_DEAD_STATE);
1597 	lock->l_state &= ~ACTIVE_LOCK;
1598 
1599 	if (free_lock)
1600 		flk_free_lock(lock);
1601 	CHECK_ACTIVE_LOCKS(gp);
1602 	CHECK_SLEEPING_LOCKS(gp);
1603 }
1604 
1605 /*
1606  * Insert into the sleep queue.
1607  */
1608 
1609 static void
1610 flk_insert_sleeping_lock(lock_descriptor_t *request)
1611 {
1612 	graph_t *gp = request->l_graph;
1613 	vnode_t	*vp = request->l_vnode;
1614 	lock_descriptor_t	*lock;
1615 
1616 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1617 	ASSERT(IS_INITIAL(request));
1618 
1619 	for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1620 		lock->l_vnode < vp); lock = lock->l_next)
1621 		;
1622 
1623 	lock->l_prev->l_next = request;
1624 	request->l_prev = lock->l_prev;
1625 	lock->l_prev = request;
1626 	request->l_next = lock;
1627 	flk_set_state(request, FLK_SLEEPING_STATE);
1628 	request->l_state |= SLEEPING_LOCK;
1629 }
1630 
1631 /*
1632  * Cancelling a sleeping lock implies removing a vertex from the
1633  * dependency graph and therefore we should recompute the dependencies
1634  * of all vertices that have a path  to this vertex, w.r.t. all
1635  * vertices reachable from this vertex.
1636  */
1637 
1638 void
1639 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1640 {
1641 	graph_t	*gp = request->l_graph;
1642 	vnode_t *vp = request->l_vnode;
1643 	lock_descriptor_t **topology = NULL;
1644 	edge_t	*ep;
1645 	lock_descriptor_t *vertex, *lock;
1646 	int nvertex = 0;
1647 	int i;
1648 	lock_descriptor_t *vertex_stack;
1649 
1650 	STACK_INIT(vertex_stack);
1651 
1652 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1653 	/*
1654 	 * count number of vertex pointers that has to be allocated
1655 	 * All vertices that are reachable from request.
1656 	 */
1657 
1658 	STACK_PUSH(vertex_stack, request, l_stack);
1659 
1660 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1661 		STACK_POP(vertex_stack, l_stack);
1662 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1663 					ep = NEXT_ADJ(ep)) {
1664 			if (IS_RECOMPUTE(ep->to_vertex))
1665 				continue;
1666 			ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1667 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1668 			nvertex++;
1669 		}
1670 	}
1671 
1672 	/*
1673 	 * allocate memory for holding the vertex pointers
1674 	 */
1675 
1676 	if (nvertex) {
1677 		topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1678 						KM_SLEEP);
1679 	}
1680 
1681 	/*
1682 	 * one more pass to actually store the vertices in the
1683 	 * allocated array.
1684 	 * We first check sleeping locks and then active locks
1685 	 * so that topology array will be in a topological
1686 	 * order.
1687 	 */
1688 
1689 	nvertex = 0;
1690 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1691 
1692 	if (lock) {
1693 		do {
1694 			if (IS_RECOMPUTE(lock)) {
1695 				lock->l_index = nvertex;
1696 				topology[nvertex++] = lock;
1697 			}
1698 			lock->l_color = NO_COLOR;
1699 			lock = lock->l_next;
1700 		} while (lock->l_vnode == vp);
1701 	}
1702 
1703 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1704 
1705 	if (lock) {
1706 		do {
1707 			if (IS_RECOMPUTE(lock)) {
1708 				lock->l_index = nvertex;
1709 				topology[nvertex++] = lock;
1710 			}
1711 			lock->l_color = NO_COLOR;
1712 			lock = lock->l_next;
1713 		} while (lock->l_vnode == vp);
1714 	}
1715 
1716 	/*
1717 	 * remove in and out edges of request
1718 	 * They are freed after updating proc_graph below.
1719 	 */
1720 
1721 	for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
1722 		ADJ_LIST_REMOVE(ep);
1723 	}
1724 
1725 
1726 	if (remove_from_queue)
1727 		REMOVE_SLEEP_QUEUE(request);
1728 
1729 	/* we are ready to recompute */
1730 
1731 	flk_recompute_dependencies(request, topology, nvertex, 1);
1732 
1733 	ep = FIRST_ADJ(request);
1734 	while (ep != HEAD(request)) {
1735 		IN_LIST_REMOVE(ep);
1736 		request->l_sedge = NEXT_ADJ(ep);
1737 		ADJ_LIST_REMOVE(ep);
1738 		flk_update_proc_graph(ep, 1);
1739 		flk_free_edge(ep);
1740 		ep = request->l_sedge;
1741 	}
1742 
1743 
1744 	/*
1745 	 * unset the RECOMPUTE flag in those vertices
1746 	 */
1747 
1748 	for (i = 0; i < nvertex; i++) {
1749 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
1750 	}
1751 
1752 	/*
1753 	 * free the topology
1754 	 */
1755 	if (nvertex)
1756 		kmem_free((void *)topology,
1757 			(nvertex * sizeof (lock_descriptor_t *)));
1758 	/*
1759 	 * Possibility of some locks unblocked now
1760 	 */
1761 
1762 	flk_wakeup(request, 0);
1763 
1764 	/*
1765 	 * we expect to have a correctly recomputed graph  now.
1766 	 */
1767 	flk_set_state(request, FLK_DEAD_STATE);
1768 	flk_free_lock(request);
1769 	CHECK_SLEEPING_LOCKS(gp);
1770 	CHECK_ACTIVE_LOCKS(gp);
1771 
1772 }
1773 
1774 /*
1775  * Uncoloring the graph is simply to increment the mark value of the graph
1776  * And only when wrap round takes place will we color all vertices in
1777  * the graph explicitly.
1778  */
1779 
1780 static void
1781 flk_graph_uncolor(graph_t *gp)
1782 {
1783 	lock_descriptor_t *lock;
1784 
1785 	if (gp->mark == UINT_MAX) {
1786 		gp->mark = 1;
1787 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
1788 					lock = lock->l_next)
1789 			lock->l_color  = 0;
1790 
1791 	for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
1792 					lock = lock->l_next)
1793 			lock->l_color  = 0;
1794 	} else {
1795 		gp->mark++;
1796 	}
1797 }
1798 
1799 /*
1800  * Wake up locks that are blocked on the given lock.
1801  */
1802 
1803 static void
1804 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
1805 {
1806 	edge_t	*ep;
1807 	graph_t	*gp = lock->l_graph;
1808 	lock_descriptor_t	*lck;
1809 
1810 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1811 	if (NO_DEPENDENTS(lock))
1812 		return;
1813 	ep = FIRST_IN(lock);
1814 	do {
1815 		/*
1816 		 * delete the edge from the adjacency list
1817 		 * of from vertex. if no more adjacent edges
1818 		 * for this vertex wake this process.
1819 		 */
1820 		lck = ep->from_vertex;
1821 		if (adj_list_remove)
1822 			ADJ_LIST_REMOVE(ep);
1823 		flk_update_proc_graph(ep, 1);
1824 		if (NOT_BLOCKED(lck)) {
1825 			GRANT_WAKEUP(lck);
1826 		}
1827 		lock->l_sedge = NEXT_IN(ep);
1828 		IN_LIST_REMOVE(ep);
1829 		flk_free_edge(ep);
1830 		ep = lock->l_sedge;
1831 	} while (ep != HEAD(lock));
1832 	ASSERT(NO_DEPENDENTS(lock));
1833 }
1834 
1835 /*
1836  * The dependents of request, is checked for its dependency against the
1837  * locks in topology (called topology because the array is and should be in
1838  * topological order for this algorithm, if not in topological order the
1839  * inner loop below might add more edges than necessary. Topological ordering
1840  * of vertices satisfies the property that all edges will be from left to
1841  * right i.e., topology[i] can have an edge to  topology[j], iff i<j)
1842  * If lock l1 in the dependent set of request is dependent (blocked by)
1843  * on lock l2 in topology but does not have a path to it, we add an edge
1844  * in the inner loop below.
1845  *
1846  * We don't want to add an edge between l1 and l2 if there exists
1847  * already a path from l1 to l2, so care has to be taken for those vertices
1848  * that  have two paths to 'request'. These vertices are referred to here
1849  * as barrier locks.
1850  *
1851  * The barriers has to be found (those vertex that originally had two paths
1852  * to request) because otherwise we may end up adding edges unnecessarily
1853  * to vertices in topology, and thus barrier vertices can have an edge
1854  * to a vertex in topology as well a path to it.
1855  */
1856 
1857 static void
1858 flk_recompute_dependencies(lock_descriptor_t *request,
1859 		lock_descriptor_t **topology,
1860 			int nvertex, int update_graph)
1861 {
1862 	lock_descriptor_t *vertex, *lock;
1863 	graph_t	*gp = request->l_graph;
1864 	int i, count;
1865 	int barrier_found = 0;
1866 	edge_t	*ep;
1867 	lock_descriptor_t *vertex_stack;
1868 
1869 	STACK_INIT(vertex_stack);
1870 
1871 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1872 	if (nvertex == 0)
1873 		return;
1874 	flk_graph_uncolor(request->l_graph);
1875 	barrier_found = flk_find_barriers(request);
1876 	request->l_state |= RECOMPUTE_DONE;
1877 
1878 	STACK_PUSH(vertex_stack, request, l_stack);
1879 	request->l_sedge = FIRST_IN(request);
1880 
1881 
1882 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1883 		if (vertex->l_state & RECOMPUTE_DONE) {
1884 			count = 0;
1885 			goto next_in_edge;
1886 		}
1887 		if (IS_BARRIER(vertex)) {
1888 			/* decrement the barrier count */
1889 			if (vertex->l_index) {
1890 				vertex->l_index--;
1891 				/* this guy will be pushed again anyway ? */
1892 				STACK_POP(vertex_stack, l_stack);
1893 				if (vertex->l_index == 0)  {
1894 				/*
1895 				 * barrier is over we can recompute
1896 				 * dependencies for this lock in the
1897 				 * next stack pop
1898 				 */
1899 					vertex->l_state &= ~BARRIER_LOCK;
1900 				}
1901 				continue;
1902 			}
1903 		}
1904 		vertex->l_state |= RECOMPUTE_DONE;
1905 		flk_graph_uncolor(gp);
1906 		count = flk_color_reachables(vertex);
1907 		for (i = 0; i < nvertex; i++) {
1908 			lock = topology[i];
1909 			if (COLORED(lock))
1910 				continue;
1911 			if (BLOCKS(lock, vertex)) {
1912 				(void) flk_add_edge(vertex, lock,
1913 				    NO_CHECK_CYCLE, update_graph);
1914 				COLOR(lock);
1915 				count++;
1916 				count += flk_color_reachables(lock);
1917 			}
1918 
1919 		}
1920 
1921 next_in_edge:
1922 		if (count == nvertex ||
1923 				vertex->l_sedge == HEAD(vertex)) {
1924 			/* prune the tree below this */
1925 			STACK_POP(vertex_stack, l_stack);
1926 			vertex->l_state &= ~RECOMPUTE_DONE;
1927 			/* update the barrier locks below this! */
1928 			if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
1929 				flk_graph_uncolor(gp);
1930 				flk_update_barriers(vertex);
1931 			}
1932 			continue;
1933 		}
1934 
1935 		ep = vertex->l_sedge;
1936 		lock = ep->from_vertex;
1937 		STACK_PUSH(vertex_stack, lock, l_stack);
1938 		lock->l_sedge = FIRST_IN(lock);
1939 		vertex->l_sedge = NEXT_IN(ep);
1940 	}
1941 
1942 }
1943 
1944 /*
1945  * Color all reachable vertices from vertex that belongs to topology (here
1946  * those that have RECOMPUTE_LOCK set in their state) and yet uncolored.
1947  *
1948  * Note: we need to use a different stack_link l_stack1 because this is
1949  * called from flk_recompute_dependencies() that already uses a stack with
1950  * l_stack as stack_link.
1951  */
1952 
1953 static int
1954 flk_color_reachables(lock_descriptor_t *vertex)
1955 {
1956 	lock_descriptor_t *ver, *lock;
1957 	int count;
1958 	edge_t	*ep;
1959 	lock_descriptor_t *vertex_stack;
1960 
1961 	STACK_INIT(vertex_stack);
1962 
1963 	STACK_PUSH(vertex_stack, vertex, l_stack1);
1964 	count = 0;
1965 	while ((ver = STACK_TOP(vertex_stack)) != NULL) {
1966 
1967 		STACK_POP(vertex_stack, l_stack1);
1968 		for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
1969 					ep = NEXT_ADJ(ep)) {
1970 			lock = ep->to_vertex;
1971 			if (COLORED(lock))
1972 				continue;
1973 			COLOR(lock);
1974 			if (IS_RECOMPUTE(lock))
1975 				count++;
1976 			STACK_PUSH(vertex_stack, lock, l_stack1);
1977 		}
1978 
1979 	}
1980 	return (count);
1981 }
1982 
1983 /*
1984  * Called from flk_recompute_dependencies() this routine decrements
1985  * the barrier count of barrier vertices that are reachable from lock.
1986  */
1987 
1988 static void
1989 flk_update_barriers(lock_descriptor_t *lock)
1990 {
1991 	lock_descriptor_t *vertex, *lck;
1992 	edge_t	*ep;
1993 	lock_descriptor_t *vertex_stack;
1994 
1995 	STACK_INIT(vertex_stack);
1996 
1997 	STACK_PUSH(vertex_stack, lock, l_stack1);
1998 
1999 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2000 		STACK_POP(vertex_stack, l_stack1);
2001 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2002 						ep = NEXT_IN(ep)) {
2003 			lck = ep->from_vertex;
2004 			if (COLORED(lck)) {
2005 				if (IS_BARRIER(lck)) {
2006 					ASSERT(lck->l_index > 0);
2007 					lck->l_index--;
2008 					if (lck->l_index == 0)
2009 						lck->l_state &= ~BARRIER_LOCK;
2010 				}
2011 				continue;
2012 			}
2013 			COLOR(lck);
2014 			if (IS_BARRIER(lck)) {
2015 				ASSERT(lck->l_index > 0);
2016 				lck->l_index--;
2017 				if (lck->l_index == 0)
2018 					lck->l_state &= ~BARRIER_LOCK;
2019 			}
2020 			STACK_PUSH(vertex_stack, lck, l_stack1);
2021 		}
2022 	}
2023 }
2024 
2025 /*
2026  * Finds all vertices that are reachable from 'lock' more than once and
2027  * mark them as barrier vertices and increment their barrier count.
2028  * The barrier count is one minus the total number of paths from lock
2029  * to that vertex.
2030  */
2031 
2032 static int
2033 flk_find_barriers(lock_descriptor_t *lock)
2034 {
2035 	lock_descriptor_t *vertex, *lck;
2036 	int found = 0;
2037 	edge_t	*ep;
2038 	lock_descriptor_t *vertex_stack;
2039 
2040 	STACK_INIT(vertex_stack);
2041 
2042 	STACK_PUSH(vertex_stack, lock, l_stack1);
2043 
2044 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2045 		STACK_POP(vertex_stack, l_stack1);
2046 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2047 						ep = NEXT_IN(ep)) {
2048 			lck = ep->from_vertex;
2049 			if (COLORED(lck)) {
2050 				/* this is a barrier */
2051 				lck->l_state |= BARRIER_LOCK;
2052 				/* index will have barrier count */
2053 				lck->l_index++;
2054 				if (!found)
2055 					found = 1;
2056 				continue;
2057 			}
2058 			COLOR(lck);
2059 			lck->l_index = 0;
2060 			STACK_PUSH(vertex_stack, lck, l_stack1);
2061 		}
2062 	}
2063 	return (found);
2064 }
2065 
2066 /*
2067  * Finds the first lock that is mainly responsible for blocking this
2068  * request.  If there is no such lock, request->l_flock.l_type is set to
2069  * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
2070  * of the blocking lock.
2071  *
2072  * Note: It is possible a request is blocked by a sleeping lock because
2073  * of the fairness policy used in flk_process_request() to construct the
2074  * dependencies. (see comments before flk_process_request()).
2075  */
2076 
2077 static void
2078 flk_get_first_blocking_lock(lock_descriptor_t *request)
2079 {
2080 	graph_t	*gp = request->l_graph;
2081 	vnode_t *vp = request->l_vnode;
2082 	lock_descriptor_t *lock, *blocker;
2083 
2084 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2085 	blocker = NULL;
2086 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2087 
2088 	if (lock) {
2089 		do {
2090 			if (BLOCKS(lock, request)) {
2091 				blocker = lock;
2092 				break;
2093 			}
2094 			lock = lock->l_next;
2095 		} while (lock->l_vnode == vp);
2096 	}
2097 
2098 	if (blocker) {
2099 		report_blocker(blocker, request);
2100 	} else
2101 		request->l_flock.l_type = F_UNLCK;
2102 }
2103 
2104 /*
2105  * Get the graph_t structure associated with a vnode.
2106  * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2107  * not yet been initialized, then a new element is allocated and returned.
2108  */
2109 graph_t *
2110 flk_get_lock_graph(vnode_t *vp, int initialize)
2111 {
2112 	graph_t *gp;
2113 	graph_t *gp_alloc = NULL;
2114 	int index = HASH_INDEX(vp);
2115 
2116 	if (initialize == FLK_USE_GRAPH) {
2117 		mutex_enter(&flock_lock);
2118 		gp = lock_graph[index];
2119 		mutex_exit(&flock_lock);
2120 		return (gp);
2121 	}
2122 
2123 	ASSERT(initialize == FLK_INIT_GRAPH);
2124 
2125 	if (lock_graph[index] == NULL) {
2126 
2127 		gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2128 
2129 		/* Initialize the graph */
2130 
2131 		gp_alloc->active_locks.l_next =
2132 		    gp_alloc->active_locks.l_prev =
2133 		    (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2134 		gp_alloc->sleeping_locks.l_next =
2135 		    gp_alloc->sleeping_locks.l_prev =
2136 		    (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2137 		gp_alloc->index = index;
2138 		mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2139 	}
2140 
2141 	mutex_enter(&flock_lock);
2142 
2143 	gp = lock_graph[index];
2144 
2145 	/* Recheck the value within flock_lock */
2146 	if (gp == NULL) {
2147 		struct flock_globals *fg;
2148 
2149 		/* We must have previously allocated the graph_t structure */
2150 		ASSERT(gp_alloc != NULL);
2151 		lock_graph[index] = gp = gp_alloc;
2152 		/*
2153 		 * The lockmgr status is only needed if KLM is loaded.
2154 		 */
2155 		if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2156 			fg = flk_get_globals();
2157 			fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2158 		}
2159 	}
2160 
2161 	mutex_exit(&flock_lock);
2162 
2163 	if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2164 		/* There was a race to allocate the graph_t and we lost */
2165 		mutex_destroy(&gp_alloc->gp_mutex);
2166 		kmem_free(gp_alloc, sizeof (graph_t));
2167 	}
2168 
2169 	return (gp);
2170 }
2171 
2172 /*
2173  * PSARC case 1997/292
2174  */
2175 int
2176 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2177 {
2178 	lock_descriptor_t *lock;
2179 	int result = 0;
2180 	graph_t *gp;
2181 	int			lock_nlmid;
2182 
2183 	/*
2184 	 * Check to see if node is booted as a cluster. If not, return.
2185 	 */
2186 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2187 		return (0);
2188 	}
2189 
2190 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2191 	if (gp == NULL) {
2192 		return (0);
2193 	}
2194 
2195 	mutex_enter(&gp->gp_mutex);
2196 
2197 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2198 
2199 	if (lock) {
2200 		while (lock->l_vnode == vp) {
2201 			/* get NLM id from sysid */
2202 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2203 
2204 			/*
2205 			 * If NLM server request _and_ nlmid of lock matches
2206 			 * nlmid of argument, then we've found a remote lock.
2207 			 */
2208 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2209 				result = 1;
2210 				goto done;
2211 			}
2212 			lock = lock->l_next;
2213 		}
2214 	}
2215 
2216 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2217 
2218 	if (lock) {
2219 		while (lock->l_vnode == vp) {
2220 			/* get NLM id from sysid */
2221 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2222 
2223 			/*
2224 			 * If NLM server request _and_ nlmid of lock matches
2225 			 * nlmid of argument, then we've found a remote lock.
2226 			 */
2227 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2228 				result = 1;
2229 				goto done;
2230 			}
2231 			lock = lock->l_next;
2232 		}
2233 	}
2234 
2235 done:
2236 	mutex_exit(&gp->gp_mutex);
2237 	return (result);
2238 }
2239 
2240 /*
2241  * Determine whether there are any locks for the given vnode with a remote
2242  * sysid.  Returns zero if not, non-zero if there are.
2243  *
2244  * Note that the return value from this function is potentially invalid
2245  * once it has been returned.  The caller is responsible for providing its
2246  * own synchronization mechanism to ensure that the return value is useful
2247  * (e.g., see nfs_lockcompletion()).
2248  */
2249 int
2250 flk_has_remote_locks(vnode_t *vp)
2251 {
2252 	lock_descriptor_t *lock;
2253 	int result = 0;
2254 	graph_t *gp;
2255 
2256 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2257 	if (gp == NULL) {
2258 		return (0);
2259 	}
2260 
2261 	mutex_enter(&gp->gp_mutex);
2262 
2263 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2264 
2265 	if (lock) {
2266 		while (lock->l_vnode == vp) {
2267 			if (IS_REMOTE(lock)) {
2268 				result = 1;
2269 				goto done;
2270 			}
2271 			lock = lock->l_next;
2272 		}
2273 	}
2274 
2275 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2276 
2277 	if (lock) {
2278 		while (lock->l_vnode == vp) {
2279 			if (IS_REMOTE(lock)) {
2280 				result = 1;
2281 				goto done;
2282 			}
2283 			lock = lock->l_next;
2284 		}
2285 	}
2286 
2287 done:
2288 	mutex_exit(&gp->gp_mutex);
2289 	return (result);
2290 }
2291 
2292 /*
2293  * Determine if there are any locks owned by the given sysid.
2294  * Returns zero if not, non-zero if there are.  Note that this return code
2295  * could be derived from flk_get_{sleeping,active}_locks, but this routine
2296  * avoids all the memory allocations of those routines.
2297  *
2298  * This routine has the same synchronization issues as
2299  * flk_has_remote_locks.
2300  */
2301 
2302 int
2303 flk_sysid_has_locks(int sysid, int lck_type)
2304 {
2305 	int		has_locks = 0;
2306 	lock_descriptor_t	*lock;
2307 	graph_t 	*gp;
2308 	int		i;
2309 
2310 	for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2311 		mutex_enter(&flock_lock);
2312 		gp = lock_graph[i];
2313 		mutex_exit(&flock_lock);
2314 		if (gp == NULL) {
2315 			continue;
2316 		}
2317 
2318 		mutex_enter(&gp->gp_mutex);
2319 
2320 		if (lck_type & FLK_QUERY_ACTIVE) {
2321 			for (lock = ACTIVE_HEAD(gp)->l_next;
2322 			    lock != ACTIVE_HEAD(gp) && !has_locks;
2323 			    lock = lock->l_next) {
2324 				if (lock->l_flock.l_sysid == sysid)
2325 					has_locks = 1;
2326 			}
2327 		}
2328 
2329 		if (lck_type & FLK_QUERY_SLEEPING) {
2330 			for (lock = SLEEPING_HEAD(gp)->l_next;
2331 				lock != SLEEPING_HEAD(gp) && !has_locks;
2332 				lock = lock->l_next) {
2333 				if (lock->l_flock.l_sysid == sysid)
2334 					has_locks = 1;
2335 			}
2336 		}
2337 		mutex_exit(&gp->gp_mutex);
2338 	}
2339 
2340 	return (has_locks);
2341 }
2342 
2343 
2344 /*
2345  * PSARC case 1997/292
2346  *
2347  * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is 16-bit
2348  *  quantity, the real sysid generated by the NLM server; the upper half
2349  *  identifies the node of the cluster where the NLM server ran.
2350  *  This routine is only called by an NLM server running in a cluster.
2351  * Effects: Remove all locks held on behalf of the client identified
2352  *  by "sysid."
2353  */
2354 void
2355 cl_flk_remove_locks_by_sysid(int sysid)
2356 {
2357 	graph_t	*gp;
2358 	int i;
2359 	lock_descriptor_t *lock, *nlock;
2360 
2361 	/*
2362 	 * Check to see if node is booted as a cluster. If not, return.
2363 	 */
2364 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2365 		return;
2366 	}
2367 
2368 	ASSERT(sysid != 0);
2369 	for (i = 0; i < HASH_SIZE; i++) {
2370 		mutex_enter(&flock_lock);
2371 		gp = lock_graph[i];
2372 		mutex_exit(&flock_lock);
2373 
2374 		if (gp == NULL)
2375 			continue;
2376 
2377 		mutex_enter(&gp->gp_mutex);	/*  get mutex on lock graph */
2378 
2379 		/* signal sleeping requests so that they bail out */
2380 		lock = SLEEPING_HEAD(gp)->l_next;
2381 		while (lock != SLEEPING_HEAD(gp)) {
2382 			nlock = lock->l_next;
2383 			if (lock->l_flock.l_sysid == sysid) {
2384 				INTERRUPT_WAKEUP(lock);
2385 			}
2386 			lock = nlock;
2387 		}
2388 
2389 		/* delete active locks */
2390 		lock = ACTIVE_HEAD(gp)->l_next;
2391 		while (lock != ACTIVE_HEAD(gp)) {
2392 			nlock = lock->l_next;
2393 			if (lock->l_flock.l_sysid == sysid) {
2394 				flk_delete_active_lock(lock, 0);
2395 				flk_wakeup(lock, 1);
2396 				flk_free_lock(lock);
2397 			}
2398 			lock = nlock;
2399 		}
2400 		mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
2401 	}
2402 }
2403 
2404 /*
2405  * Delete all locks in the system that belongs to the sysid of the request.
2406  */
2407 
2408 static void
2409 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2410 {
2411 	int	sysid  = request->l_flock.l_sysid;
2412 	lock_descriptor_t *lock, *nlock;
2413 	graph_t	*gp;
2414 	int i;
2415 
2416 	ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2417 	ASSERT(sysid != 0);
2418 
2419 	mutex_exit(&request->l_graph->gp_mutex);
2420 
2421 	for (i = 0; i < HASH_SIZE; i++) {
2422 		mutex_enter(&flock_lock);
2423 		gp = lock_graph[i];
2424 		mutex_exit(&flock_lock);
2425 
2426 		if (gp == NULL)
2427 			continue;
2428 
2429 		mutex_enter(&gp->gp_mutex);
2430 
2431 		/* signal sleeping requests so that they bail out */
2432 		lock = SLEEPING_HEAD(gp)->l_next;
2433 		while (lock != SLEEPING_HEAD(gp)) {
2434 			nlock = lock->l_next;
2435 			if (lock->l_flock.l_sysid == sysid) {
2436 				INTERRUPT_WAKEUP(lock);
2437 			}
2438 			lock = nlock;
2439 		}
2440 
2441 		/* delete active locks */
2442 		lock = ACTIVE_HEAD(gp)->l_next;
2443 		while (lock != ACTIVE_HEAD(gp)) {
2444 			nlock = lock->l_next;
2445 			if (lock->l_flock.l_sysid == sysid) {
2446 				flk_delete_active_lock(lock, 0);
2447 				flk_wakeup(lock, 1);
2448 				flk_free_lock(lock);
2449 			}
2450 			lock = nlock;
2451 		}
2452 		mutex_exit(&gp->gp_mutex);
2453 	}
2454 
2455 	mutex_enter(&request->l_graph->gp_mutex);
2456 }
2457 
2458 /*
2459  * Clustering: Deletes PXFS locks
2460  * Effects: Delete all locks on files in the given file system and with the
2461  *  given PXFS id.
2462  */
2463 void
2464 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2465 {
2466 	lock_descriptor_t *lock, *nlock;
2467 	graph_t	*gp;
2468 	int i;
2469 
2470 	for (i = 0; i < HASH_SIZE; i++) {
2471 		mutex_enter(&flock_lock);
2472 		gp = lock_graph[i];
2473 		mutex_exit(&flock_lock);
2474 
2475 		if (gp == NULL)
2476 			continue;
2477 
2478 		mutex_enter(&gp->gp_mutex);
2479 
2480 		/* signal sleeping requests so that they bail out */
2481 		lock = SLEEPING_HEAD(gp)->l_next;
2482 		while (lock != SLEEPING_HEAD(gp)) {
2483 			nlock = lock->l_next;
2484 			if (lock->l_vnode->v_vfsp == vfsp) {
2485 				ASSERT(IS_PXFS(lock));
2486 				if (GETPXFSID(lock->l_flock.l_sysid) ==
2487 				    pxfsid) {
2488 					flk_set_state(lock,
2489 					    FLK_CANCELLED_STATE);
2490 					flk_cancel_sleeping_lock(lock, 1);
2491 				}
2492 			}
2493 			lock = nlock;
2494 		}
2495 
2496 		/* delete active locks */
2497 		lock = ACTIVE_HEAD(gp)->l_next;
2498 		while (lock != ACTIVE_HEAD(gp)) {
2499 			nlock = lock->l_next;
2500 			if (lock->l_vnode->v_vfsp == vfsp) {
2501 				ASSERT(IS_PXFS(lock));
2502 				if (GETPXFSID(lock->l_flock.l_sysid) ==
2503 				    pxfsid) {
2504 					flk_delete_active_lock(lock, 0);
2505 					flk_wakeup(lock, 1);
2506 					flk_free_lock(lock);
2507 				}
2508 			}
2509 			lock = nlock;
2510 		}
2511 		mutex_exit(&gp->gp_mutex);
2512 	}
2513 }
2514 
2515 /*
2516  * Search for a sleeping lock manager lock which matches exactly this lock
2517  * request; if one is found, fake a signal to cancel it.
2518  *
2519  * Return 1 if a matching lock was found, 0 otherwise.
2520  */
2521 
2522 static int
2523 flk_canceled(lock_descriptor_t *request)
2524 {
2525 	lock_descriptor_t *lock, *nlock;
2526 	graph_t *gp = request->l_graph;
2527 	vnode_t *vp = request->l_vnode;
2528 
2529 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2530 	ASSERT(IS_LOCKMGR(request));
2531 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2532 
2533 	if (lock) {
2534 		while (lock->l_vnode == vp) {
2535 			nlock = lock->l_next;
2536 			if (SAME_OWNER(lock, request) &&
2537 				lock->l_start == request->l_start &&
2538 					lock->l_end == request->l_end) {
2539 				INTERRUPT_WAKEUP(lock);
2540 				return (1);
2541 			}
2542 			lock = nlock;
2543 		}
2544 	}
2545 	return (0);
2546 }
2547 
2548 /*
2549  * Remove all the locks for the vnode belonging to the given pid and sysid.
2550  */
2551 
2552 void
2553 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2554 {
2555 	graph_t	*gp;
2556 	lock_descriptor_t *lock, *nlock;
2557 	lock_descriptor_t *link_stack;
2558 
2559 	STACK_INIT(link_stack);
2560 
2561 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2562 
2563 	if (gp == NULL)
2564 		return;
2565 	mutex_enter(&gp->gp_mutex);
2566 
2567 	CHECK_SLEEPING_LOCKS(gp);
2568 	CHECK_ACTIVE_LOCKS(gp);
2569 
2570 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2571 
2572 	if (lock) {
2573 		do {
2574 			nlock = lock->l_next;
2575 			if ((lock->l_flock.l_pid == pid ||
2576 					pid == IGN_PID) &&
2577 				lock->l_flock.l_sysid == sysid) {
2578 				CANCEL_WAKEUP(lock);
2579 			}
2580 			lock = nlock;
2581 		} while (lock->l_vnode == vp);
2582 	}
2583 
2584 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2585 
2586 	if (lock) {
2587 		do {
2588 			nlock = lock->l_next;
2589 			if ((lock->l_flock.l_pid == pid ||
2590 					pid == IGN_PID) &&
2591 				lock->l_flock.l_sysid == sysid) {
2592 				flk_delete_active_lock(lock, 0);
2593 				STACK_PUSH(link_stack, lock, l_stack);
2594 			}
2595 			lock = nlock;
2596 		} while (lock->l_vnode == vp);
2597 	}
2598 
2599 	while ((lock = STACK_TOP(link_stack)) != NULL) {
2600 		STACK_POP(link_stack, l_stack);
2601 		flk_wakeup(lock, 1);
2602 		flk_free_lock(lock);
2603 	}
2604 
2605 	CHECK_SLEEPING_LOCKS(gp);
2606 	CHECK_ACTIVE_LOCKS(gp);
2607 	CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2608 	mutex_exit(&gp->gp_mutex);
2609 }
2610 
2611 
2612 /*
2613  * Called from 'fs' read and write routines for files that have mandatory
2614  * locking enabled.
2615  */
2616 
2617 int
2618 chklock(
2619 	struct vnode	*vp,
2620 	int 		iomode,
2621 	u_offset_t	offset,
2622 	ssize_t		len,
2623 	int 		fmode,
2624 	caller_context_t *ct)
2625 {
2626 	register int	i;
2627 	struct flock64 	bf;
2628 	int 		error = 0;
2629 
2630 	bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
2631 	bf.l_whence = 0;
2632 	bf.l_start = offset;
2633 	bf.l_len = len;
2634 	if (ct == NULL) {
2635 		bf.l_pid = curproc->p_pid;
2636 		bf.l_sysid = 0;
2637 	} else {
2638 		bf.l_pid = ct->cc_pid;
2639 		bf.l_sysid = ct->cc_sysid;
2640 	}
2641 	i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
2642 	if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
2643 	    bf.l_type != F_UNLCK)
2644 		error = i ? i : EAGAIN;
2645 	return (error);
2646 }
2647 
2648 /*
2649  * convoff - converts the given data (start, whence) to the
2650  * given whence.
2651  */
2652 int
2653 convoff(vp, lckdat, whence, offset)
2654 	struct vnode 	*vp;
2655 	struct flock64 	*lckdat;
2656 	int 		whence;
2657 	offset_t	offset;
2658 {
2659 	int 		error;
2660 	struct vattr 	vattr;
2661 
2662 	if ((lckdat->l_whence == 2) || (whence == 2)) {
2663 		vattr.va_mask = AT_SIZE;
2664 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
2665 			return (error);
2666 	}
2667 
2668 	switch (lckdat->l_whence) {
2669 	case 1:
2670 		lckdat->l_start += offset;
2671 		break;
2672 	case 2:
2673 		lckdat->l_start += vattr.va_size;
2674 		/* FALLTHRU */
2675 	case 0:
2676 		break;
2677 	default:
2678 		return (EINVAL);
2679 	}
2680 
2681 	if (lckdat->l_start < 0)
2682 		return (EINVAL);
2683 
2684 	switch (whence) {
2685 	case 1:
2686 		lckdat->l_start -= offset;
2687 		break;
2688 	case 2:
2689 		lckdat->l_start -= vattr.va_size;
2690 		/* FALLTHRU */
2691 	case 0:
2692 		break;
2693 	default:
2694 		return (EINVAL);
2695 	}
2696 
2697 	lckdat->l_whence = (short)whence;
2698 	return (0);
2699 }
2700 
2701 
2702 /* 	proc_graph function definitions */
2703 
2704 /*
2705  * Function checks for deadlock due to the new 'lock'. If deadlock found
2706  * edges of this lock are freed and returned.
2707  */
2708 
2709 static int
2710 flk_check_deadlock(lock_descriptor_t *lock)
2711 {
2712 	proc_vertex_t	*start_vertex, *pvertex;
2713 	proc_vertex_t *dvertex;
2714 	proc_edge_t *pep, *ppep;
2715 	edge_t	*ep, *nep;
2716 	proc_vertex_t *process_stack;
2717 
2718 	STACK_INIT(process_stack);
2719 
2720 	mutex_enter(&flock_lock);
2721 	start_vertex = flk_get_proc_vertex(lock);
2722 	ASSERT(start_vertex != NULL);
2723 
2724 	/* construct the edges from this process to other processes */
2725 
2726 	ep = FIRST_ADJ(lock);
2727 	while (ep != HEAD(lock)) {
2728 		proc_vertex_t *adj_proc;
2729 
2730 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
2731 		for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
2732 			if (pep->to_proc == adj_proc) {
2733 				ASSERT(pep->refcount);
2734 				pep->refcount++;
2735 				break;
2736 			}
2737 		}
2738 		if (pep == NULL) {
2739 			pep = flk_get_proc_edge();
2740 			pep->to_proc = adj_proc;
2741 			pep->refcount = 1;
2742 			adj_proc->incount++;
2743 			pep->next = start_vertex->edge;
2744 			start_vertex->edge = pep;
2745 		}
2746 		ep = NEXT_ADJ(ep);
2747 	}
2748 
2749 	ep = FIRST_IN(lock);
2750 
2751 	while (ep != HEAD(lock)) {
2752 		proc_vertex_t *in_proc;
2753 
2754 		in_proc = flk_get_proc_vertex(ep->from_vertex);
2755 
2756 		for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
2757 			if (pep->to_proc == start_vertex) {
2758 				ASSERT(pep->refcount);
2759 				pep->refcount++;
2760 				break;
2761 			}
2762 		}
2763 		if (pep == NULL) {
2764 			pep = flk_get_proc_edge();
2765 			pep->to_proc = start_vertex;
2766 			pep->refcount = 1;
2767 			start_vertex->incount++;
2768 			pep->next = in_proc->edge;
2769 			in_proc->edge = pep;
2770 		}
2771 		ep = NEXT_IN(ep);
2772 	}
2773 
2774 	if (start_vertex->incount == 0) {
2775 		mutex_exit(&flock_lock);
2776 		return (0);
2777 	}
2778 
2779 	flk_proc_graph_uncolor();
2780 
2781 	start_vertex->p_sedge = start_vertex->edge;
2782 
2783 	STACK_PUSH(process_stack, start_vertex, p_stack);
2784 
2785 	while ((pvertex = STACK_TOP(process_stack)) != NULL) {
2786 		for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
2787 			dvertex = pep->to_proc;
2788 			if (!PROC_ARRIVED(dvertex)) {
2789 				STACK_PUSH(process_stack, dvertex, p_stack);
2790 				dvertex->p_sedge = dvertex->edge;
2791 				PROC_ARRIVE(pvertex);
2792 				pvertex->p_sedge = pep->next;
2793 				break;
2794 			}
2795 			if (!PROC_DEPARTED(dvertex))
2796 				goto deadlock;
2797 		}
2798 		if (pep == NULL) {
2799 			PROC_DEPART(pvertex);
2800 			STACK_POP(process_stack, p_stack);
2801 		}
2802 	}
2803 	mutex_exit(&flock_lock);
2804 	return (0);
2805 
2806 deadlock:
2807 
2808 	/* we remove all lock edges and proc edges */
2809 
2810 	ep = FIRST_ADJ(lock);
2811 	while (ep != HEAD(lock)) {
2812 		proc_vertex_t *adj_proc;
2813 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
2814 		nep = NEXT_ADJ(ep);
2815 		IN_LIST_REMOVE(ep);
2816 		ADJ_LIST_REMOVE(ep);
2817 		flk_free_edge(ep);
2818 		ppep = start_vertex->edge;
2819 		for (pep = start_vertex->edge; pep != NULL; ppep = pep,
2820 						pep = ppep->next) {
2821 			if (pep->to_proc == adj_proc) {
2822 				pep->refcount--;
2823 				if (pep->refcount == 0) {
2824 					if (pep == ppep) {
2825 						start_vertex->edge = pep->next;
2826 					} else {
2827 						ppep->next = pep->next;
2828 					}
2829 					adj_proc->incount--;
2830 					flk_proc_release(adj_proc);
2831 					flk_free_proc_edge(pep);
2832 				}
2833 				break;
2834 			}
2835 		}
2836 		ep = nep;
2837 	}
2838 	ep = FIRST_IN(lock);
2839 	while (ep != HEAD(lock)) {
2840 		proc_vertex_t *in_proc;
2841 		in_proc = flk_get_proc_vertex(ep->from_vertex);
2842 		nep = NEXT_IN(ep);
2843 		IN_LIST_REMOVE(ep);
2844 		ADJ_LIST_REMOVE(ep);
2845 		flk_free_edge(ep);
2846 		ppep = in_proc->edge;
2847 		for (pep = in_proc->edge; pep != NULL; ppep = pep,
2848 						pep = ppep->next) {
2849 			if (pep->to_proc == start_vertex) {
2850 				pep->refcount--;
2851 				if (pep->refcount == 0) {
2852 					if (pep == ppep) {
2853 						in_proc->edge = pep->next;
2854 					} else {
2855 						ppep->next = pep->next;
2856 					}
2857 					start_vertex->incount--;
2858 					flk_proc_release(in_proc);
2859 					flk_free_proc_edge(pep);
2860 				}
2861 				break;
2862 			}
2863 		}
2864 		ep = nep;
2865 	}
2866 	flk_proc_release(start_vertex);
2867 	mutex_exit(&flock_lock);
2868 	return (1);
2869 }
2870 
2871 /*
2872  * Get a proc vertex. If lock's pvertex value gets a correct proc vertex
2873  * from the list we return that, otherwise we allocate one. If necessary,
2874  * we grow the list of vertices also.
2875  */
2876 
2877 static proc_vertex_t *
2878 flk_get_proc_vertex(lock_descriptor_t *lock)
2879 {
2880 	int i;
2881 	proc_vertex_t	*pv;
2882 	proc_vertex_t	**palloc;
2883 
2884 	ASSERT(MUTEX_HELD(&flock_lock));
2885 	if (lock->pvertex != -1) {
2886 		ASSERT(lock->pvertex >= 0);
2887 		pv = pgraph.proc[lock->pvertex];
2888 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2889 			return (pv);
2890 		}
2891 	}
2892 	for (i = 0; i < pgraph.gcount; i++) {
2893 		pv = pgraph.proc[i];
2894 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2895 			lock->pvertex = pv->index = i;
2896 			return (pv);
2897 		}
2898 	}
2899 	pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
2900 	pv->pid = lock->l_flock.l_pid;
2901 	pv->sysid = lock->l_flock.l_sysid;
2902 	flk_proc_vertex_allocs++;
2903 	if (pgraph.free != 0) {
2904 		for (i = 0; i < pgraph.gcount; i++) {
2905 			if (pgraph.proc[i] == NULL) {
2906 				pgraph.proc[i] = pv;
2907 				lock->pvertex = pv->index = i;
2908 				pgraph.free--;
2909 				return (pv);
2910 			}
2911 		}
2912 	}
2913 	palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
2914 				sizeof (proc_vertex_t *), KM_SLEEP);
2915 
2916 	if (pgraph.proc) {
2917 		bcopy(pgraph.proc, palloc,
2918 			pgraph.gcount * sizeof (proc_vertex_t *));
2919 
2920 		kmem_free(pgraph.proc,
2921 			pgraph.gcount * sizeof (proc_vertex_t *));
2922 	}
2923 	pgraph.proc = palloc;
2924 	pgraph.free += (PROC_CHUNK - 1);
2925 	pv->index = lock->pvertex = pgraph.gcount;
2926 	pgraph.gcount += PROC_CHUNK;
2927 	pgraph.proc[pv->index] = pv;
2928 	return (pv);
2929 }
2930 
2931 /*
2932  * Allocate a proc edge.
2933  */
2934 
2935 static proc_edge_t *
2936 flk_get_proc_edge()
2937 {
2938 	proc_edge_t *pep;
2939 
2940 	pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
2941 	flk_proc_edge_allocs++;
2942 	return (pep);
2943 }
2944 
2945 /*
2946  * Free the proc edge. Called whenever its reference count goes to zero.
2947  */
2948 
2949 static void
2950 flk_free_proc_edge(proc_edge_t *pep)
2951 {
2952 	ASSERT(pep->refcount == 0);
2953 	kmem_free((void *)pep, sizeof (proc_edge_t));
2954 	flk_proc_edge_frees++;
2955 }
2956 
2957 /*
2958  * Color the graph explicitly done only when the mark value hits max value.
2959  */
2960 
2961 static void
2962 flk_proc_graph_uncolor()
2963 {
2964 	int i;
2965 
2966 	if (pgraph.mark == UINT_MAX) {
2967 		for (i = 0; i < pgraph.gcount; i++)
2968 			if (pgraph.proc[i] != NULL) {
2969 				pgraph.proc[i]->atime = 0;
2970 				pgraph.proc[i]->dtime = 0;
2971 			}
2972 		pgraph.mark = 1;
2973 	} else {
2974 		pgraph.mark++;
2975 	}
2976 }
2977 
2978 /*
2979  * Release the proc vertex iff both there are no in edges and out edges
2980  */
2981 
2982 static void
2983 flk_proc_release(proc_vertex_t *proc)
2984 {
2985 	ASSERT(MUTEX_HELD(&flock_lock));
2986 	if (proc->edge == NULL && proc->incount == 0) {
2987 		pgraph.proc[proc->index] = NULL;
2988 		pgraph.free++;
2989 		kmem_free(proc, sizeof (proc_vertex_t));
2990 		flk_proc_vertex_frees++;
2991 	}
2992 }
2993 
2994 /*
2995  * Updates process graph to reflect change in a lock_graph.
2996  * Note: We should call this function only after we have a correctly
2997  * recomputed lock graph. Otherwise we might miss a deadlock detection.
2998  * eg: in function flk_relation() we call this function after flk_recompute_
2999  * dependencies() otherwise if a process tries to lock a vnode hashed
3000  * into another graph it might sleep for ever.
3001  */
3002 
3003 static void
3004 flk_update_proc_graph(edge_t *ep, int delete)
3005 {
3006 	proc_vertex_t *toproc, *fromproc;
3007 	proc_edge_t *pep, *prevpep;
3008 
3009 	mutex_enter(&flock_lock);
3010 	toproc = flk_get_proc_vertex(ep->to_vertex);
3011 	fromproc = flk_get_proc_vertex(ep->from_vertex);
3012 
3013 	if (!delete)
3014 		goto add;
3015 	pep = prevpep = fromproc->edge;
3016 
3017 	ASSERT(pep != NULL);
3018 	while (pep != NULL) {
3019 		if (pep->to_proc == toproc) {
3020 			ASSERT(pep->refcount > 0);
3021 			pep->refcount--;
3022 			if (pep->refcount == 0) {
3023 				if (pep == prevpep) {
3024 					fromproc->edge = pep->next;
3025 				} else {
3026 					prevpep->next = pep->next;
3027 				}
3028 				toproc->incount--;
3029 				flk_proc_release(toproc);
3030 				flk_free_proc_edge(pep);
3031 			}
3032 			break;
3033 		}
3034 		prevpep = pep;
3035 		pep = pep->next;
3036 	}
3037 	flk_proc_release(fromproc);
3038 	mutex_exit(&flock_lock);
3039 	return;
3040 add:
3041 
3042 	pep = fromproc->edge;
3043 
3044 	while (pep != NULL) {
3045 		if (pep->to_proc == toproc) {
3046 			ASSERT(pep->refcount > 0);
3047 			pep->refcount++;
3048 			break;
3049 		}
3050 		pep = pep->next;
3051 	}
3052 	if (pep == NULL) {
3053 		pep = flk_get_proc_edge();
3054 		pep->to_proc = toproc;
3055 		pep->refcount = 1;
3056 		toproc->incount++;
3057 		pep->next = fromproc->edge;
3058 		fromproc->edge = pep;
3059 	}
3060 	mutex_exit(&flock_lock);
3061 }
3062 
3063 /*
3064  * Set the control status for lock manager requests.
3065  *
3066  */
3067 
3068 /*
3069  * PSARC case 1997/292
3070  *
3071  * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3072  * Effects: Set the state of the NLM server identified by "nlmid"
3073  *   in the NLM registry to state "nlm_state."
3074  *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
3075  *   NLM server to this LLM.
3076  *   Note that when this routine is called with NLM_SHUTTING_DOWN there
3077  *   may be locks requests that have gotten started but not finished.  In
3078  *   particular, there may be blocking requests that are in the callback code
3079  *   before sleeping (so they're not holding the lock for the graph).  If
3080  *   such a thread reacquires the graph's lock (to go to sleep) after
3081  *   NLM state in the NLM registry  is set to a non-up value,
3082  *   it will notice the status and bail out.  If the request gets
3083  *   granted before the thread can check the NLM registry, let it
3084  *   continue normally.  It will get flushed when we are called with NLM_DOWN.
3085  *
3086  * Modifies: nlm_reg_obj (global)
3087  * Arguments:
3088  *    nlmid	(IN):    id uniquely identifying an NLM server
3089  *    nlm_state (IN):    NLM server state to change "nlmid" to
3090  */
3091 void
3092 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3093 {
3094 	/*
3095 	 * Check to see if node is booted as a cluster. If not, return.
3096 	 */
3097 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3098 		return;
3099 	}
3100 
3101 	/*
3102 	 * Check for development/debugging.  It is possible to boot a node
3103 	 * in non-cluster mode, and then run a special script, currently
3104 	 * available only to developers, to bring up the node as part of a
3105 	 * cluster.  The problem is that running such a script does not
3106 	 * result in the routine flk_init() being called and hence global array
3107 	 * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
3108 	 * but the LLM needs to do an additional check to see if the global
3109 	 * array has been created or not. If nlm_reg_status is NULL, then
3110 	 * return, else continue.
3111 	 */
3112 	if (nlm_reg_status == NULL) {
3113 		return;
3114 	}
3115 
3116 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3117 	mutex_enter(&nlm_reg_lock);
3118 
3119 	if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3120 		/*
3121 		 * If the NLM server "nlmid" is unknown in the NLM registry,
3122 		 * add it to the registry in the nlm shutting down state.
3123 		 */
3124 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3125 			FLK_NLM_SHUTTING_DOWN);
3126 	} else {
3127 		/*
3128 		 * Change the state of the NLM server identified by "nlmid"
3129 		 * in the NLM registry to the argument "nlm_state."
3130 		 */
3131 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3132 			nlm_state);
3133 	}
3134 
3135 	/*
3136 	 *  The reason we must register the NLM server that is shutting down
3137 	 *  with an LLM that doesn't already know about it (never sent a lock
3138 	 *  request) is to handle correctly a race between shutdown and a new
3139 	 *  lock request.  Suppose that a shutdown request from the NLM server
3140 	 *  invokes this routine at the LLM, and a thread is spawned to
3141 	 *  service the request. Now suppose a new lock request is in
3142 	 *  progress and has already passed the first line of defense in
3143 	 *  reclock(), which denies new locks requests from NLM servers
3144 	 *  that are not in the NLM_UP state.  After the current routine
3145 	 *  is invoked for both phases of shutdown, the routine will return,
3146 	 *  having done nothing, and the lock request will proceed and
3147 	 *  probably be granted.  The problem is that the shutdown was ignored
3148 	 *  by the lock request because there was no record of that NLM server
3149 	 *  shutting down.   We will be in the peculiar position of thinking
3150 	 *  that we've shutdown the NLM server and all locks at all LLMs have
3151 	 *  been discarded, but in fact there's still one lock held.
3152 	 *  The solution is to record the existence of NLM server and change
3153 	 *  its state immediately to NLM_SHUTTING_DOWN.  The lock request in
3154 	 *  progress may proceed because the next phase NLM_DOWN will catch
3155 	 *  this lock and discard it.
3156 	 */
3157 	mutex_exit(&nlm_reg_lock);
3158 
3159 	switch (nlm_state) {
3160 	case FLK_NLM_UP:
3161 		/*
3162 		 * Change the NLM state of all locks still held on behalf of
3163 		 * the NLM server identified by "nlmid" to NLM_UP.
3164 		 */
3165 		cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3166 		break;
3167 
3168 	case FLK_NLM_SHUTTING_DOWN:
3169 		/*
3170 		 * Wake up all sleeping locks for the NLM server identified
3171 		 * by "nlmid." Note that eventually all woken threads will
3172 		 * have their lock requests cancelled and descriptors
3173 		 * removed from the sleeping lock list.  Note that the NLM
3174 		 * server state associated with each lock descriptor is
3175 		 * changed to FLK_NLM_SHUTTING_DOWN.
3176 		 */
3177 		cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3178 		break;
3179 
3180 	case FLK_NLM_DOWN:
3181 		/*
3182 		 * Discard all active, granted locks for this NLM server
3183 		 * identified by "nlmid."
3184 		 */
3185 		cl_flk_unlock_nlm_granted(nlmid);
3186 		break;
3187 
3188 	default:
3189 		panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3190 	}
3191 }
3192 
3193 /*
3194  * Set the control status for lock manager requests.
3195  *
3196  * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3197  * may be locks requests that have gotten started but not finished.  In
3198  * particular, there may be blocking requests that are in the callback code
3199  * before sleeping (so they're not holding the lock for the graph).  If
3200  * such a thread reacquires the graph's lock (to go to sleep) after
3201  * flk_lockmgr_status is set to a non-up value, it will notice the status
3202  * and bail out.  If the request gets granted before the thread can check
3203  * flk_lockmgr_status, let it continue normally.  It will get flushed when
3204  * we are called with FLK_LOCKMGR_DOWN.
3205  */
3206 
3207 void
3208 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3209 {
3210 	int i;
3211 	graph_t *gp;
3212 	struct flock_globals *fg;
3213 
3214 	fg = flk_get_globals();
3215 	ASSERT(fg != NULL);
3216 
3217 	mutex_enter(&flock_lock);
3218 	fg->flk_lockmgr_status = status;
3219 	mutex_exit(&flock_lock);
3220 
3221 	/*
3222 	 * If the lock manager is coming back up, all that's needed is to
3223 	 * propagate this information to the graphs.  If the lock manager
3224 	 * is going down, additional action is required, and each graph's
3225 	 * copy of the state is updated atomically with this other action.
3226 	 */
3227 	switch (status) {
3228 	case FLK_LOCKMGR_UP:
3229 		for (i = 0; i < HASH_SIZE; i++) {
3230 			mutex_enter(&flock_lock);
3231 			gp = lock_graph[i];
3232 			mutex_exit(&flock_lock);
3233 			if (gp == NULL)
3234 				continue;
3235 			mutex_enter(&gp->gp_mutex);
3236 			fg->lockmgr_status[i] = status;
3237 			mutex_exit(&gp->gp_mutex);
3238 		}
3239 		break;
3240 	case FLK_WAKEUP_SLEEPERS:
3241 		wakeup_sleeping_lockmgr_locks(fg);
3242 		break;
3243 	case FLK_LOCKMGR_DOWN:
3244 		unlock_lockmgr_granted(fg);
3245 		break;
3246 	default:
3247 		panic("flk_set_lockmgr_status: bad status (%d)", status);
3248 		break;
3249 	}
3250 }
3251 
3252 /*
3253  * This routine returns all the locks that are active or sleeping and are
3254  * associated with a particular set of identifiers.  If lock_state != 0, then
3255  * only locks that match the lock_state are returned. If lock_state == 0, then
3256  * all locks are returned. If pid == NOPID, the pid is ignored.  If
3257  * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
3258  * vnode pointer is ignored.
3259  *
3260  * A list containing the vnode pointer and an flock structure
3261  * describing the lock is returned.  Each element in the list is
3262  * dynamically allocated and must be freed by the caller.  The
3263  * last item in the list is denoted by a NULL value in the ll_next
3264  * field.
3265  *
3266  * The vnode pointers returned are held.  The caller is responsible
3267  * for releasing these.  Note that the returned list is only a snapshot of
3268  * the current lock information, and that it is a snapshot of a moving
3269  * target (only one graph is locked at a time).
3270  */
3271 
3272 locklist_t *
3273 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3274 		pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3275 {
3276 	lock_descriptor_t	*lock;
3277 	lock_descriptor_t	*graph_head;
3278 	locklist_t		listhead;
3279 	locklist_t		*llheadp;
3280 	locklist_t		*llp;
3281 	locklist_t		*lltp;
3282 	graph_t			*gp;
3283 	int			i;
3284 	int			first_index; /* graph index */
3285 	int			num_indexes; /* graph index */
3286 
3287 	ASSERT((list_type == FLK_ACTIVE_STATE) ||
3288 	    (list_type == FLK_SLEEPING_STATE));
3289 
3290 	/*
3291 	 * Get a pointer to something to use as a list head while building
3292 	 * the rest of the list.
3293 	 */
3294 	llheadp = &listhead;
3295 	lltp = llheadp;
3296 	llheadp->ll_next = (locklist_t *)NULL;
3297 
3298 	/* Figure out which graphs we want to look at. */
3299 	if (vp == NULL) {
3300 		first_index = 0;
3301 		num_indexes = HASH_SIZE;
3302 	} else {
3303 		first_index = HASH_INDEX(vp);
3304 		num_indexes = 1;
3305 	}
3306 
3307 	for (i = first_index; i < first_index + num_indexes; i++) {
3308 		mutex_enter(&flock_lock);
3309 		gp = lock_graph[i];
3310 		mutex_exit(&flock_lock);
3311 		if (gp == NULL) {
3312 			continue;
3313 		}
3314 
3315 		mutex_enter(&gp->gp_mutex);
3316 		graph_head = (list_type == FLK_ACTIVE_STATE) ?
3317 			ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3318 		for (lock = graph_head->l_next;
3319 		    lock != graph_head;
3320 		    lock = lock->l_next) {
3321 			if (use_sysid && lock->l_flock.l_sysid != sysid)
3322 				continue;
3323 			if (pid != NOPID && lock->l_flock.l_pid != pid)
3324 				continue;
3325 			if (vp != NULL && lock->l_vnode != vp)
3326 				continue;
3327 			if (lock_state && !(lock_state & lock->l_state))
3328 				continue;
3329 			if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3330 				continue;
3331 			/*
3332 			 * A matching lock was found.  Allocate
3333 			 * space for a new locklist entry and fill
3334 			 * it in.
3335 			 */
3336 			llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3337 			lltp->ll_next = llp;
3338 			VN_HOLD(lock->l_vnode);
3339 			llp->ll_vp = lock->l_vnode;
3340 			create_flock(lock, &(llp->ll_flock));
3341 			llp->ll_next = (locklist_t *)NULL;
3342 			lltp = llp;
3343 		}
3344 		mutex_exit(&gp->gp_mutex);
3345 	}
3346 
3347 	llp = llheadp->ll_next;
3348 	return (llp);
3349 }
3350 
3351 /*
3352  * These two functions are simply interfaces to get_lock_list.  They return
3353  * a list of sleeping or active locks for the given sysid and pid.  See
3354  * get_lock_list for details.
3355  *
3356  * In either case we don't particularly care to specify the zone of interest;
3357  * the sysid-space is global across zones, so the sysid will map to exactly one
3358  * zone, and we'll return information for that zone.
3359  */
3360 
3361 locklist_t *
3362 flk_get_sleeping_locks(int sysid, pid_t pid)
3363 {
3364 	return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3365 		    ALL_ZONES));
3366 }
3367 
3368 locklist_t *
3369 flk_get_active_locks(int sysid, pid_t pid)
3370 {
3371 	return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3372 		    ALL_ZONES));
3373 }
3374 
3375 /*
3376  * Another interface to get_lock_list.  This one returns all the active
3377  * locks for a given vnode.  Again, see get_lock_list for details.
3378  *
3379  * We don't need to specify which zone's locks we're interested in.  The matter
3380  * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3381  * be used by multiple zones, so the list of locks will all be from the right
3382  * zone.
3383  */
3384 
3385 locklist_t *
3386 flk_active_locks_for_vp(const vnode_t *vp)
3387 {
3388 	return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3389 		    ALL_ZONES));
3390 }
3391 
3392 /*
3393  * Another interface to get_lock_list.  This one returns all the active
3394  * nbmand locks for a given vnode.  Again, see get_lock_list for details.
3395  *
3396  * See the comment for flk_active_locks_for_vp() for why we don't care to
3397  * specify the particular zone of interest.
3398  */
3399 locklist_t *
3400 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3401 {
3402 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3403 				NOPID, vp, ALL_ZONES));
3404 }
3405 
3406 /*
3407  * Another interface to get_lock_list.  This one returns all the active
3408  * nbmand locks for a given pid.  Again, see get_lock_list for details.
3409  *
3410  * The zone doesn't need to be specified here; the locks held by a
3411  * particular process will either be local (ie, non-NFS) or from the zone
3412  * the process is executing in.  This is because other parts of the system
3413  * ensure that an NFS vnode can't be used in a zone other than that in
3414  * which it was opened.
3415  */
3416 locklist_t *
3417 flk_active_nbmand_locks(pid_t pid)
3418 {
3419 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3420 				pid, NULL, ALL_ZONES));
3421 }
3422 
3423 /*
3424  * Free up all entries in the locklist.
3425  */
3426 void
3427 flk_free_locklist(locklist_t *llp)
3428 {
3429 	locklist_t *next_llp;
3430 
3431 	while (llp) {
3432 		next_llp = llp->ll_next;
3433 		VN_RELE(llp->ll_vp);
3434 		kmem_free(llp, sizeof (*llp));
3435 		llp = next_llp;
3436 	}
3437 }
3438 
3439 static void
3440 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3441 {
3442 	/*
3443 	 * For each graph "lg" in the hash table lock_graph do
3444 	 * a.  Get the list of sleeping locks
3445 	 * b.  For each lock descriptor in the list do
3446 	 *	i.   If the requested lock is an NLM server request AND
3447 	 *		the nlmid is the same as the routine argument then
3448 	 *		change the lock descriptor's state field to
3449 	 *		"nlm_state."
3450 	 * c.  Get the list of active locks
3451 	 * d.  For each lock descriptor in the list do
3452 	 *	i.   If the requested lock is an NLM server request AND
3453 	 *		the nlmid is the same as the routine argument then
3454 	 *		change the lock descriptor's state field to
3455 	 *		"nlm_state."
3456 	 */
3457 
3458 	int			i;
3459 	graph_t			*gp;			/* lock graph */
3460 	lock_descriptor_t	*lock;			/* lock */
3461 	lock_descriptor_t	*nlock = NULL;		/* next lock */
3462 	int			lock_nlmid;
3463 
3464 	for (i = 0; i < HASH_SIZE; i++) {
3465 		mutex_enter(&flock_lock);
3466 		gp = lock_graph[i];
3467 		mutex_exit(&flock_lock);
3468 		if (gp == NULL) {
3469 			continue;
3470 		}
3471 
3472 		/* Get list of sleeping locks in current lock graph. */
3473 		mutex_enter(&gp->gp_mutex);
3474 		for (lock = SLEEPING_HEAD(gp)->l_next;
3475 		    lock != SLEEPING_HEAD(gp);
3476 		    lock = nlock) {
3477 			nlock = lock->l_next;
3478 			/* get NLM id */
3479 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3480 
3481 			/*
3482 			 * If NLM server request AND nlmid of lock matches
3483 			 * nlmid of argument, then set the NLM state of the
3484 			 * lock to "nlm_state."
3485 			 */
3486 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3487 				SET_NLM_STATE(lock, nlm_state);
3488 			}
3489 		}
3490 
3491 		/* Get list of active locks in current lock graph. */
3492 		for (lock = ACTIVE_HEAD(gp)->l_next;
3493 		    lock != ACTIVE_HEAD(gp);
3494 		    lock = nlock) {
3495 			nlock = lock->l_next;
3496 			/* get NLM id */
3497 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3498 
3499 			/*
3500 			 * If NLM server request AND nlmid of lock matches
3501 			 * nlmid of argument, then set the NLM state of the
3502 			 * lock to "nlm_state."
3503 			 */
3504 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3505 				ASSERT(IS_ACTIVE(lock));
3506 				SET_NLM_STATE(lock, nlm_state);
3507 			}
3508 		}
3509 		mutex_exit(&gp->gp_mutex);
3510 	}
3511 }
3512 
3513 /*
3514  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3515  * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3516  *   identified by "nlmid." Poke those lock requests.
3517  */
3518 static void
3519 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3520 {
3521 	lock_descriptor_t *lock;
3522 	lock_descriptor_t *nlock = NULL; /* next lock */
3523 	int i;
3524 	graph_t *gp;
3525 	int	lock_nlmid;
3526 
3527 	for (i = 0; i < HASH_SIZE; i++) {
3528 		mutex_enter(&flock_lock);
3529 		gp = lock_graph[i];
3530 		mutex_exit(&flock_lock);
3531 		if (gp == NULL) {
3532 			continue;
3533 		}
3534 
3535 		mutex_enter(&gp->gp_mutex);
3536 		for (lock = SLEEPING_HEAD(gp)->l_next;
3537 		    lock != SLEEPING_HEAD(gp);
3538 		    lock = nlock) {
3539 			nlock = lock->l_next;
3540 			/*
3541 			 * If NLM server request _and_ nlmid of lock matches
3542 			 * nlmid of argument, then set the NLM state of the
3543 			 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3544 			 * request.
3545 			 */
3546 			if (IS_LOCKMGR(lock)) {
3547 				/* get NLM id */
3548 				lock_nlmid =
3549 					GETNLMID(lock->l_flock.l_sysid);
3550 				if (nlmid == lock_nlmid) {
3551 					SET_NLM_STATE(lock,
3552 						FLK_NLM_SHUTTING_DOWN);
3553 					INTERRUPT_WAKEUP(lock);
3554 				}
3555 			}
3556 		}
3557 		mutex_exit(&gp->gp_mutex);
3558 	}
3559 }
3560 
3561 /*
3562  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3563  * Effects:  Find all active (granted) lock manager locks _only_ for the
3564  *   NLM server identified by "nlmid" and release them.
3565  */
3566 static void
3567 cl_flk_unlock_nlm_granted(int nlmid)
3568 {
3569 	lock_descriptor_t *lock;
3570 	lock_descriptor_t *nlock = NULL; /* next lock */
3571 	int i;
3572 	graph_t *gp;
3573 	int	lock_nlmid;
3574 
3575 	for (i = 0; i < HASH_SIZE; i++) {
3576 		mutex_enter(&flock_lock);
3577 		gp = lock_graph[i];
3578 		mutex_exit(&flock_lock);
3579 		if (gp == NULL) {
3580 			continue;
3581 		}
3582 
3583 		mutex_enter(&gp->gp_mutex);
3584 		for (lock = ACTIVE_HEAD(gp)->l_next;
3585 		    lock != ACTIVE_HEAD(gp);
3586 		    lock = nlock) {
3587 			nlock = lock->l_next;
3588 			ASSERT(IS_ACTIVE(lock));
3589 
3590 			/*
3591 			 * If it's an  NLM server request _and_ nlmid of
3592 			 * the lock matches nlmid of argument, then
3593 			 * remove the active lock the list, wakup blocked
3594 			 * threads, and free the storage for the lock.
3595 			 * Note that there's no need to mark the NLM state
3596 			 * of this lock to NLM_DOWN because the lock will
3597 			 * be deleted anyway and its storage freed.
3598 			 */
3599 			if (IS_LOCKMGR(lock)) {
3600 				/* get NLM id */
3601 				lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3602 				if (nlmid == lock_nlmid) {
3603 					flk_delete_active_lock(lock, 0);
3604 					flk_wakeup(lock, 1);
3605 					flk_free_lock(lock);
3606 				}
3607 			}
3608 		}
3609 		mutex_exit(&gp->gp_mutex);
3610 	}
3611 }
3612 
3613 /*
3614  * Find all sleeping lock manager requests and poke them.
3615  */
3616 static void
3617 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
3618 {
3619 	lock_descriptor_t *lock;
3620 	lock_descriptor_t *nlock = NULL; /* next lock */
3621 	int i;
3622 	graph_t *gp;
3623 	zoneid_t zoneid = getzoneid();
3624 
3625 	for (i = 0; i < HASH_SIZE; i++) {
3626 		mutex_enter(&flock_lock);
3627 		gp = lock_graph[i];
3628 		mutex_exit(&flock_lock);
3629 		if (gp == NULL) {
3630 			continue;
3631 		}
3632 
3633 		mutex_enter(&gp->gp_mutex);
3634 		fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
3635 		for (lock = SLEEPING_HEAD(gp)->l_next;
3636 		    lock != SLEEPING_HEAD(gp);
3637 		    lock = nlock) {
3638 			nlock = lock->l_next;
3639 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3640 				INTERRUPT_WAKEUP(lock);
3641 			}
3642 		}
3643 		mutex_exit(&gp->gp_mutex);
3644 	}
3645 }
3646 
3647 
3648 /*
3649  * Find all active (granted) lock manager locks and release them.
3650  */
3651 static void
3652 unlock_lockmgr_granted(struct flock_globals *fg)
3653 {
3654 	lock_descriptor_t *lock;
3655 	lock_descriptor_t *nlock = NULL; /* next lock */
3656 	int i;
3657 	graph_t *gp;
3658 	zoneid_t zoneid = getzoneid();
3659 
3660 	for (i = 0; i < HASH_SIZE; i++) {
3661 		mutex_enter(&flock_lock);
3662 		gp = lock_graph[i];
3663 		mutex_exit(&flock_lock);
3664 		if (gp == NULL) {
3665 			continue;
3666 		}
3667 
3668 		mutex_enter(&gp->gp_mutex);
3669 		fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
3670 		for (lock = ACTIVE_HEAD(gp)->l_next;
3671 		    lock != ACTIVE_HEAD(gp);
3672 		    lock = nlock) {
3673 			nlock = lock->l_next;
3674 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3675 				ASSERT(IS_ACTIVE(lock));
3676 				flk_delete_active_lock(lock, 0);
3677 				flk_wakeup(lock, 1);
3678 				flk_free_lock(lock);
3679 			}
3680 		}
3681 		mutex_exit(&gp->gp_mutex);
3682 	}
3683 }
3684 
3685 
3686 /*
3687  * Wait until a lock is granted, cancelled, or interrupted.
3688  */
3689 
3690 static void
3691 wait_for_lock(lock_descriptor_t *request)
3692 {
3693 	graph_t *gp = request->l_graph;
3694 
3695 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
3696 
3697 	while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
3698 	    !(IS_INTERRUPTED(request))) {
3699 		if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
3700 			flk_set_state(request, FLK_INTERRUPTED_STATE);
3701 			request->l_state |= INTERRUPTED_LOCK;
3702 		}
3703 	}
3704 }
3705 
3706 /*
3707  * Create an flock structure from the existing lock information
3708  *
3709  * This routine is used to create flock structures for the lock manager
3710  * to use in a reclaim request.  Since the lock was originated on this
3711  * host, it must be conforming to UNIX semantics, so no checking is
3712  * done to make sure it falls within the lower half of the 32-bit range.
3713  */
3714 
3715 static void
3716 create_flock(lock_descriptor_t *lp, flock64_t *flp)
3717 {
3718 	ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
3719 	ASSERT(lp->l_end >= lp->l_start);
3720 
3721 	flp->l_type = lp->l_type;
3722 	flp->l_whence = 0;
3723 	flp->l_start = lp->l_start;
3724 	flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
3725 		(lp->l_end - lp->l_start + 1);
3726 	flp->l_sysid = lp->l_flock.l_sysid;
3727 	flp->l_pid = lp->l_flock.l_pid;
3728 }
3729 
3730 /*
3731  * Convert flock_t data describing a lock range into unsigned long starting
3732  * and ending points, which are put into lock_request.  Returns 0 or an
3733  * errno value.
3734  * Large Files: max is passed by the caller and we return EOVERFLOW
3735  * as defined by LFS API.
3736  */
3737 
3738 int
3739 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
3740     u_offset_t *start, u_offset_t *end, offset_t offset)
3741 {
3742 	struct vattr	vattr;
3743 	int	error;
3744 
3745 	/*
3746 	 * Determine the starting point of the request
3747 	 */
3748 	switch (flp->l_whence) {
3749 	case 0:		/* SEEK_SET */
3750 		*start = (u_offset_t)flp->l_start;
3751 		break;
3752 	case 1:		/* SEEK_CUR */
3753 		*start = (u_offset_t)(flp->l_start + offset);
3754 		break;
3755 	case 2:		/* SEEK_END */
3756 		vattr.va_mask = AT_SIZE;
3757 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3758 			return (error);
3759 		*start = (u_offset_t)(flp->l_start + vattr.va_size);
3760 		break;
3761 	default:
3762 		return (EINVAL);
3763 	}
3764 
3765 	/*
3766 	 * Determine the range covered by the request.
3767 	 */
3768 	if (flp->l_len == 0)
3769 		*end = MAX_U_OFFSET_T;
3770 	else if ((offset_t)flp->l_len > 0) {
3771 		*end = (u_offset_t)(*start + (flp->l_len - 1));
3772 	} else {
3773 		/*
3774 		 * Negative length; why do we even allow this ?
3775 		 * Because this allows easy specification of
3776 		 * the last n bytes of the file.
3777 		 */
3778 		*end = *start;
3779 		*start += (u_offset_t)flp->l_len;
3780 		(*start)++;
3781 	}
3782 	return (0);
3783 }
3784 
3785 /*
3786  * Check the validity of lock data.  This can used by the NFS
3787  * frlock routines to check data before contacting the server.  The
3788  * server must support semantics that aren't as restrictive as
3789  * the UNIX API, so the NFS client is required to check.
3790  * The maximum is now passed in by the caller.
3791  */
3792 
3793 int
3794 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
3795 {
3796 	/*
3797 	 * The end (length) for local locking should never be greater
3798 	 * than MAXEND. However, the representation for
3799 	 * the entire file is MAX_U_OFFSET_T.
3800 	 */
3801 	if ((start > max) ||
3802 	    ((end > max) && (end != MAX_U_OFFSET_T))) {
3803 		return (EINVAL);
3804 	}
3805 	if (start > end) {
3806 	    return (EINVAL);
3807 	}
3808 	return (0);
3809 }
3810 
3811 /*
3812  * Fill in request->l_flock with information about the lock blocking the
3813  * request.  The complexity here is that lock manager requests are allowed
3814  * to see into the upper part of the 32-bit address range, whereas local
3815  * requests are only allowed to see signed values.
3816  *
3817  * What should be done when "blocker" is a lock manager lock that uses the
3818  * upper portion of the 32-bit range, but "request" is local?  Since the
3819  * request has already been determined to have been blocked by the blocker,
3820  * at least some portion of "blocker" must be in the range of the request,
3821  * or the request extends to the end of file.  For the first case, the
3822  * portion in the lower range is returned with the indication that it goes
3823  * "to EOF."  For the second case, the last byte of the lower range is
3824  * returned with the indication that it goes "to EOF."
3825  */
3826 
3827 static void
3828 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
3829 {
3830 	flock64_t *flrp;			/* l_flock portion of request */
3831 
3832 	ASSERT(blocker != NULL);
3833 
3834 	flrp = &request->l_flock;
3835 	flrp->l_whence = 0;
3836 	flrp->l_type = blocker->l_type;
3837 	flrp->l_pid = blocker->l_flock.l_pid;
3838 	flrp->l_sysid = blocker->l_flock.l_sysid;
3839 
3840 	if (IS_LOCKMGR(request)) {
3841 		flrp->l_start = blocker->l_start;
3842 		if (blocker->l_end == MAX_U_OFFSET_T)
3843 			flrp->l_len = 0;
3844 		else
3845 			flrp->l_len = blocker->l_end - blocker->l_start + 1;
3846 	} else {
3847 		if (blocker->l_start > MAXEND) {
3848 			flrp->l_start = MAXEND;
3849 			flrp->l_len = 0;
3850 		} else {
3851 			flrp->l_start = blocker->l_start;
3852 			if (blocker->l_end == MAX_U_OFFSET_T)
3853 				flrp->l_len = 0;
3854 			else
3855 				flrp->l_len = blocker->l_end -
3856 					blocker->l_start + 1;
3857 		}
3858 	}
3859 }
3860 
3861 /*
3862  * PSARC case 1997/292
3863  */
3864 /*
3865  * This is the public routine exported by flock.h.
3866  */
3867 void
3868 cl_flk_change_nlm_state_to_unknown(int nlmid)
3869 {
3870 	/*
3871 	 * Check to see if node is booted as a cluster. If not, return.
3872 	 */
3873 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3874 		return;
3875 	}
3876 
3877 	/*
3878 	 * See comment in cl_flk_set_nlm_status().
3879 	 */
3880 	if (nlm_reg_status == NULL) {
3881 		return;
3882 	}
3883 
3884 	/*
3885 	 * protect NLM registry state with a mutex.
3886 	 */
3887 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3888 	mutex_enter(&nlm_reg_lock);
3889 	FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
3890 	mutex_exit(&nlm_reg_lock);
3891 }
3892 
3893 /*
3894  * Return non-zero if the given I/O request conflicts with an active NBMAND
3895  * lock.
3896  * If svmand is non-zero, it means look at all active locks, not just NBMAND
3897  * locks.
3898  */
3899 
3900 int
3901 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
3902 		ssize_t length, int svmand, caller_context_t *ct)
3903 {
3904 	int conflict = 0;
3905 	graph_t			*gp;
3906 	lock_descriptor_t	*lock;
3907 	pid_t pid;
3908 	int sysid;
3909 
3910 	if (ct == NULL) {
3911 		pid = curproc->p_pid;
3912 		sysid = 0;
3913 	} else {
3914 		pid = ct->cc_pid;
3915 		sysid = ct->cc_sysid;
3916 	}
3917 
3918 	mutex_enter(&flock_lock);
3919 	gp = lock_graph[HASH_INDEX(vp)];
3920 	mutex_exit(&flock_lock);
3921 	if (gp == NULL)
3922 		return (0);
3923 
3924 	mutex_enter(&gp->gp_mutex);
3925 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
3926 
3927 	for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
3928 		if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
3929 		    (lock->l_flock.l_sysid != sysid ||
3930 		    lock->l_flock.l_pid != pid) &&
3931 		    lock_blocks_io(op, offset, length,
3932 				lock->l_type, lock->l_start, lock->l_end)) {
3933 			conflict = 1;
3934 			break;
3935 		}
3936 	}
3937 	mutex_exit(&gp->gp_mutex);
3938 
3939 	return (conflict);
3940 }
3941 
3942 /*
3943  * Return non-zero if the given I/O request conflicts with the given lock.
3944  */
3945 
3946 static int
3947 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
3948 	    int lock_type, u_offset_t lock_start, u_offset_t lock_end)
3949 {
3950 	ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
3951 	ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
3952 
3953 	if (op == NBL_READ && lock_type == F_RDLCK)
3954 		return (0);
3955 
3956 	if (offset <= lock_start && lock_start < offset + length)
3957 		return (1);
3958 	if (lock_start <= offset && offset <= lock_end)
3959 		return (1);
3960 
3961 	return (0);
3962 }
3963 
3964 #ifdef DEBUG
3965 static void
3966 check_active_locks(graph_t *gp)
3967 {
3968 	lock_descriptor_t *lock, *lock1;
3969 	edge_t	*ep;
3970 
3971 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
3972 						lock = lock->l_next) {
3973 		ASSERT(IS_ACTIVE(lock));
3974 		ASSERT(NOT_BLOCKED(lock));
3975 		ASSERT(!IS_BARRIER(lock));
3976 
3977 		ep = FIRST_IN(lock);
3978 
3979 		while (ep != HEAD(lock)) {
3980 			ASSERT(IS_SLEEPING(ep->from_vertex));
3981 			ASSERT(!NOT_BLOCKED(ep->from_vertex));
3982 			ep = NEXT_IN(ep);
3983 		}
3984 
3985 		for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
3986 					lock1 = lock1->l_next) {
3987 			if (lock1->l_vnode == lock->l_vnode) {
3988 			if (BLOCKS(lock1, lock)) {
3989 				cmn_err(CE_PANIC,
3990 				    "active lock %p blocks %p",
3991 				    (void *)lock1, (void *)lock);
3992 			} else if (BLOCKS(lock, lock1)) {
3993 				cmn_err(CE_PANIC,
3994 				    "active lock %p blocks %p",
3995 				    (void *)lock, (void *)lock1);
3996 			}
3997 			}
3998 		}
3999 	}
4000 }
4001 
4002 /*
4003  * Effect: This functions checks to see if the transition from 'old_state' to
4004  *	'new_state' is a valid one.  It returns 0 if the transition is valid
4005  *	and 1 if it is not.
4006  *	For a map of valid transitions, see sys/flock_impl.h
4007  */
4008 static int
4009 check_lock_transition(int old_state, int new_state)
4010 {
4011 	switch (old_state) {
4012 	case FLK_INITIAL_STATE:
4013 		if ((new_state == FLK_START_STATE) ||
4014 		    (new_state == FLK_SLEEPING_STATE) ||
4015 		    (new_state == FLK_ACTIVE_STATE) ||
4016 		    (new_state == FLK_DEAD_STATE)) {
4017 			return (0);
4018 		} else {
4019 			return (1);
4020 		}
4021 	case FLK_START_STATE:
4022 		if ((new_state == FLK_ACTIVE_STATE) ||
4023 		    (new_state == FLK_DEAD_STATE)) {
4024 			return (0);
4025 		} else {
4026 			return (1);
4027 		}
4028 	case FLK_ACTIVE_STATE:
4029 		if (new_state == FLK_DEAD_STATE) {
4030 			return (0);
4031 		} else {
4032 			return (1);
4033 		}
4034 	case FLK_SLEEPING_STATE:
4035 		if ((new_state == FLK_GRANTED_STATE) ||
4036 		    (new_state == FLK_INTERRUPTED_STATE) ||
4037 		    (new_state == FLK_CANCELLED_STATE)) {
4038 			return (0);
4039 		} else {
4040 			return (1);
4041 		}
4042 	case FLK_GRANTED_STATE:
4043 		if ((new_state == FLK_START_STATE) ||
4044 		    (new_state == FLK_INTERRUPTED_STATE) ||
4045 		    (new_state == FLK_CANCELLED_STATE)) {
4046 			return (0);
4047 		} else {
4048 			return (1);
4049 		}
4050 	case FLK_CANCELLED_STATE:
4051 		if ((new_state == FLK_INTERRUPTED_STATE) ||
4052 		    (new_state == FLK_DEAD_STATE)) {
4053 			return (0);
4054 		} else {
4055 			return (1);
4056 		}
4057 	case FLK_INTERRUPTED_STATE:
4058 		if (new_state == FLK_DEAD_STATE) {
4059 			return (0);
4060 		} else {
4061 			return (1);
4062 		}
4063 	case FLK_DEAD_STATE:
4064 		/* May be set more than once */
4065 		if (new_state == FLK_DEAD_STATE) {
4066 			return (0);
4067 		} else {
4068 			return (1);
4069 		}
4070 	default:
4071 		return (1);
4072 	}
4073 }
4074 
4075 static void
4076 check_sleeping_locks(graph_t *gp)
4077 {
4078 	lock_descriptor_t *lock1, *lock2;
4079 	edge_t *ep;
4080 	for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4081 				lock1 = lock1->l_next) {
4082 				ASSERT(!IS_BARRIER(lock1));
4083 	for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4084 				lock2 = lock2->l_next) {
4085 		if (lock1->l_vnode == lock2->l_vnode) {
4086 			if (BLOCKS(lock2, lock1)) {
4087 				ASSERT(!IS_GRANTED(lock1));
4088 				ASSERT(!NOT_BLOCKED(lock1));
4089 				path(lock1, lock2);
4090 			}
4091 		}
4092 	}
4093 
4094 	for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4095 					lock2 = lock2->l_next) {
4096 				ASSERT(!IS_BARRIER(lock1));
4097 		if (lock1->l_vnode == lock2->l_vnode) {
4098 			if (BLOCKS(lock2, lock1)) {
4099 				ASSERT(!IS_GRANTED(lock1));
4100 				ASSERT(!NOT_BLOCKED(lock1));
4101 				path(lock1, lock2);
4102 			}
4103 		}
4104 	}
4105 	ep = FIRST_ADJ(lock1);
4106 	while (ep != HEAD(lock1)) {
4107 		ASSERT(BLOCKS(ep->to_vertex, lock1));
4108 		ep = NEXT_ADJ(ep);
4109 	}
4110 	}
4111 }
4112 
4113 static int
4114 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4115 {
4116 	edge_t	*ep;
4117 	lock_descriptor_t	*vertex;
4118 	lock_descriptor_t *vertex_stack;
4119 
4120 	STACK_INIT(vertex_stack);
4121 
4122 	flk_graph_uncolor(lock1->l_graph);
4123 	ep = FIRST_ADJ(lock1);
4124 	ASSERT(ep != HEAD(lock1));
4125 	while (ep != HEAD(lock1)) {
4126 		if (no_path)
4127 			ASSERT(ep->to_vertex != lock2);
4128 		STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4129 		COLOR(ep->to_vertex);
4130 		ep = NEXT_ADJ(ep);
4131 	}
4132 
4133 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4134 		STACK_POP(vertex_stack, l_dstack);
4135 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4136 						ep = NEXT_ADJ(ep)) {
4137 			if (COLORED(ep->to_vertex))
4138 				continue;
4139 			COLOR(ep->to_vertex);
4140 			if (ep->to_vertex == lock2)
4141 				return (1);
4142 
4143 			STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4144 		}
4145 	}
4146 	return (0);
4147 }
4148 
4149 static void
4150 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4151 {
4152 	lock_descriptor_t *lock;
4153 
4154 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4155 
4156 	if (lock) {
4157 		while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4158 			if (lock->l_flock.l_pid == pid &&
4159 			    lock->l_flock.l_sysid == sysid)
4160 				cmn_err(CE_PANIC,
4161 				    "owner pid %d's lock %p in active queue",
4162 				    pid, (void *)lock);
4163 			lock = lock->l_next;
4164 		}
4165 	}
4166 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4167 
4168 	if (lock) {
4169 		while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4170 			if (lock->l_flock.l_pid == pid &&
4171 			    lock->l_flock.l_sysid == sysid)
4172 				cmn_err(CE_PANIC,
4173 				    "owner pid %d's lock %p in sleep queue",
4174 				    pid, (void *)lock);
4175 			lock = lock->l_next;
4176 		}
4177 	}
4178 }
4179 
4180 static int
4181 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4182 {
4183 	edge_t *ep = FIRST_ADJ(lock1);
4184 
4185 	while (ep != HEAD(lock1)) {
4186 		if (ep->to_vertex == lock2)
4187 			return (1);
4188 		else
4189 			ep = NEXT_ADJ(ep);
4190 	}
4191 	return (0);
4192 }
4193 
4194 static int
4195 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4196 {
4197 	return (!level_two_path(lock1, lock2, 1));
4198 }
4199 
4200 static void
4201 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4202 {
4203 	if (level_one_path(lock1, lock2)) {
4204 		if (level_two_path(lock1, lock2, 0) != 0) {
4205 			cmn_err(CE_WARN,
4206 			    "one edge one path from lock1 %p lock2 %p",
4207 			    (void *)lock1, (void *)lock2);
4208 		}
4209 	} else if (no_path(lock1, lock2)) {
4210 		cmn_err(CE_PANIC,
4211 		    "No path from  lock1 %p to lock2 %p",
4212 		    (void *)lock1, (void *)lock2);
4213 	}
4214 }
4215 #endif /* DEBUG */
4216