1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2024 Oxide Computer Company
14 */
15
16 /*
17 * This implements the general locking routines. See the big theory section
18 * 'ioctls, Errors, and Exclusive Access' for more information.
19 */
20
21 #include <sys/stddef.h>
22 #include <sys/nvme.h>
23
24 #include "nvme_reg.h"
25 #include "nvme_var.h"
26
27 /*
28 * Do we have a writer or someone pending. Note, some cases require checking
29 * both of these and others do not. Please see each individual check for the
30 * nuance here. As a general rule of thumb, when locking, the pending writers
31 * are important. However, when passing the lock on to the next owner (the
32 * handoff functions below), one doesn't check it.
33 */
34 static boolean_t
nvme_rwlock_wr_or_pend(nvme_lock_t * lock)35 nvme_rwlock_wr_or_pend(nvme_lock_t *lock)
36 {
37 return (lock->nl_writer != NULL ||
38 list_is_empty(&lock->nl_pend_writers) == 0);
39 }
40
41 /*
42 * Taking a namespace read lock requires that there is no writer (or pending) on
43 * the controller and the namespace.
44 */
45 static boolean_t
nvme_rwlock_block_ns_rdlock(nvme_t * nvme,nvme_namespace_t * ns)46 nvme_rwlock_block_ns_rdlock(nvme_t *nvme, nvme_namespace_t *ns)
47 {
48 return (nvme_rwlock_wr_or_pend(&nvme->n_lock) ||
49 nvme_rwlock_wr_or_pend(&ns->ns_lock));
50 }
51
52 /*
53 * The following entities all block a namespace write lock from being taken:
54 *
55 * 1) Any active or pending writer on the controller lock. They block and starve
56 * namespace writers respectively.
57 * 2) Any active or pending writers on the namespace lock. We must wait in line.
58 * 3) Any active readers on the namespace lock. We ignore pending namespace
59 * readers as by definition that implies some other situation will cause
60 * this.
61 */
62 static boolean_t
nvme_rwlock_block_ns_wrlock(nvme_t * nvme,nvme_namespace_t * ns)63 nvme_rwlock_block_ns_wrlock(nvme_t *nvme, nvme_namespace_t *ns)
64 {
65 return (nvme_rwlock_wr_or_pend(&nvme->n_lock) ||
66 nvme_rwlock_wr_or_pend(&ns->ns_lock) ||
67 list_is_empty(&ns->ns_lock.nl_readers) == 0);
68 }
69
70
71 /*
72 * The only thing that blocks acquisition of a controller read lock is if
73 * there are outstanding or pending writers on the controller lock. We can
74 * ignore the state of all namespaces here.
75 */
76 static boolean_t
nvme_rwlock_block_ctrl_rdlock(nvme_t * nvme)77 nvme_rwlock_block_ctrl_rdlock(nvme_t *nvme)
78 {
79 return (nvme_rwlock_wr_or_pend(&nvme->n_lock));
80 }
81
82 /*
83 * Taking the controller write lock is the most challenging of all, but also
84 * takes priority. The following all block a controller write lock from being
85 * taken:
86 *
87 * 1) Any controller write lock or pending write
88 * 2) Any controller read lock. We skip pending reads because if they exist,
89 * some other situation causes that that will trip us.
90 * 3) Any namespace having a write lock. We ignore pending writes because by
91 * definition there is some condition that causes that to be the case.
92 * 4) Any read lock on a namespace. We ignore pending reads like in the
93 * controller case.
94 */
95 static boolean_t
nvme_rwlock_block_ctrl_wrlock(nvme_t * nvme)96 nvme_rwlock_block_ctrl_wrlock(nvme_t *nvme)
97 {
98 if (nvme_rwlock_wr_or_pend(&nvme->n_lock) ||
99 list_is_empty(&nvme->n_lock.nl_readers) == 0) {
100 return (B_TRUE);
101 }
102
103 for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
104 nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
105 if (ns->ns_lock.nl_writer != NULL ||
106 list_is_empty(&ns->ns_lock.nl_readers) == 0) {
107 return (B_TRUE);
108 }
109 }
110
111 return (B_FALSE);
112 }
113
114 /*
115 * Answer can we hand off the world to a pending controller write lock. This has
116 * similar rules to the above; however, we critically _ignore_ pending
117 * controller write lock holds, as the assumption is that they are here, so the
118 * only consideration from above are controller reader locks and namespace
119 * locks.
120 */
121 static boolean_t
nvme_rwlock_handoff_ctrl_wrlock(nvme_t * nvme)122 nvme_rwlock_handoff_ctrl_wrlock(nvme_t *nvme)
123 {
124 /* See nvme_rwlock_wakeup() for on why this can be done. */
125 ASSERT3P(nvme->n_lock.nl_writer, ==, NULL);
126
127 if (list_is_empty(&nvme->n_lock.nl_readers) == 0) {
128 return (B_FALSE);
129 }
130
131 for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
132 nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
133 if (ns->ns_lock.nl_writer != NULL ||
134 list_is_empty(&ns->ns_lock.nl_readers) == 0) {
135 return (B_FALSE);
136 }
137 }
138
139 return (B_TRUE);
140 }
141
142 /*
143 * Namespace handoff variant. It skips pending writers on the namespace lock,
144 * but fully considers them on the controller due to their priority. Otherwise
145 * this follows the same rules as the normal blocking check.
146 */
147 static boolean_t
nvme_rwlock_handoff_ns_wrlock(nvme_t * nvme,nvme_namespace_t * ns)148 nvme_rwlock_handoff_ns_wrlock(nvme_t *nvme, nvme_namespace_t *ns)
149 {
150 if (nvme_rwlock_wr_or_pend(&nvme->n_lock) ||
151 list_is_empty(&nvme->n_lock.nl_readers) == 0) {
152 return (B_FALSE);
153 }
154
155 if (ns->ns_lock.nl_writer != NULL ||
156 list_is_empty(&ns->ns_lock.nl_readers) == 0) {
157 return (B_FALSE);
158 }
159
160 return (B_TRUE);
161 }
162
163 static void
nvme_rwlock_rdlock(nvme_minor_lock_info_t * info,nvme_lock_t * lock)164 nvme_rwlock_rdlock(nvme_minor_lock_info_t *info, nvme_lock_t *lock)
165 {
166 ASSERT3U(list_is_empty(&lock->nl_pend_writers), !=, 0);
167 ASSERT3P(lock->nl_writer, ==, NULL);
168 ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_UNLOCKED);
169 ASSERT3U(list_link_active(&info->nli_node), ==, 0);
170 ASSERT3P(info->nli_minor, !=, NULL);
171 ASSERT3P(info->nli_nvme, !=, NULL);
172 ASSERT3U(info->nli_curlevel, ==, NVME_LOCK_L_READ);
173
174 info->nli_state = NVME_LOCK_STATE_ACQUIRED;
175 info->nli_last_change = gethrtime();
176 info->nli_acq_kthread = (uintptr_t)curthread;
177 info->nli_acq_pid = (uint32_t)curproc->p_pid;
178
179 list_insert_tail(&lock->nl_readers, info);
180 lock->nl_nread_locks++;
181 }
182
183 static void
nvme_rwlock_wrlock(nvme_minor_lock_info_t * info,nvme_lock_t * lock)184 nvme_rwlock_wrlock(nvme_minor_lock_info_t *info, nvme_lock_t *lock)
185 {
186 ASSERT3P(lock->nl_writer, ==, NULL);
187 ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_UNLOCKED);
188 ASSERT3U(list_link_active(&info->nli_node), ==, 0);
189 ASSERT3P(info->nli_minor, !=, NULL);
190 ASSERT3P(info->nli_nvme, !=, NULL);
191
192 info->nli_state = NVME_LOCK_STATE_ACQUIRED;
193 info->nli_curlevel = NVME_LOCK_L_WRITE;
194 info->nli_last_change = gethrtime();
195 info->nli_acq_kthread = (uintptr_t)curthread;
196 info->nli_acq_pid = (uint32_t)curproc->p_pid;
197
198 lock->nl_writer = info;
199 lock->nl_nwrite_locks++;
200 }
201
#ifdef DEBUG
/*
 * Sanity check for our lock logic: walk the lock's reader list and report
 * whether 'info' is on it. Only built on DEBUG kernels, where it backs an
 * assertion in nvme_rwunlock().
 *
 * Returns B_TRUE if 'info' is one of the lock's readers, B_FALSE otherwise.
 */
static boolean_t
nvme_rwlock_is_reader(nvme_lock_t *lock, const nvme_minor_lock_info_t *info)
{
	nvme_minor_lock_info_t *cur = list_head(&lock->nl_readers);

	while (cur != NULL && cur != info) {
		cur = list_next(&lock->nl_readers, cur);
	}

	return (cur != NULL ? B_TRUE : B_FALSE);
}
#endif
218
219 static void
nvme_rwlock_signal_one(nvme_minor_lock_info_t * info,nvme_ioctl_errno_t err)220 nvme_rwlock_signal_one(nvme_minor_lock_info_t *info, nvme_ioctl_errno_t err)
221 {
222 ASSERT3P(info->nli_ioc, !=, NULL);
223 ASSERT3P(info->nli_minor, !=, NULL);
224 ASSERT3P(info->nli_state, !=, NVME_LOCK_STATE_BLOCKED);
225
226 if (err == NVME_IOCTL_E_OK) {
227 nvme_ioctl_success(info->nli_ioc);
228 } else {
229 (void) nvme_ioctl_error(info->nli_ioc, err, 0, 0);
230 }
231
232 cv_signal(&info->nli_minor->nm_cv);
233 }
234
235 static void
nvme_rwlock_wakeup_readers(nvme_lock_t * lock)236 nvme_rwlock_wakeup_readers(nvme_lock_t *lock)
237 {
238 nvme_minor_lock_info_t *info;
239
240 if (list_is_empty(&lock->nl_pend_readers) != 0) {
241 return;
242 }
243
244 ASSERT3U(list_is_empty(&lock->nl_readers), !=, 0);
245 ASSERT3P(lock->nl_writer, ==, NULL);
246 ASSERT3U(list_is_empty(&lock->nl_pend_writers), !=, 0);
247 while ((info = list_remove_head(&lock->nl_pend_readers)) != NULL) {
248 info->nli_state = NVME_LOCK_STATE_UNLOCKED;
249 nvme_rwlock_rdlock(info, lock);
250 nvme_rwlock_signal_one(info, NVME_IOCTL_E_OK);
251 }
252 }
253
254 /*
255 * An unlock occurred somewhere. We need to evaluate the total state of the
256 * world. An unlock of a namespace can allow a controller lock to proceed. On
257 * the other hand, dropping the controller write lock allows every namespace to
258 * proceed. While we know the context of where the unlock occurred, it's simpler
259 * right now to just allow everything to continue. This is somewhat expensive,
260 * but this can be sped up with more cached information when it's justified. We
261 * process things in the following order:
262 *
263 * 1) Evaluate if someone can now take a controller write lock. If so, wake up
264 * the head of the list and then all subsequent processing is done.
265 * 2) Evaluate if there are pending readers for the controller. If so, wake up
266 * each and every waiter. Always continue to namespaces in this case.
267 *
268 * For each namespace:
269 *
270 * 1) Evaluate if there are pending writers and they can take the write lock. If
271 * so, wake up the head of the list. If so, continue to the next namespace.
272 * 2) Otherwise, if there are pending readers. If so, wake up each and every
273 * reader. Continue onto the next namespace.
274 */
275 static void
nvme_rwlock_wakeup(nvme_t * nvme)276 nvme_rwlock_wakeup(nvme_t *nvme)
277 {
278 nvme_lock_t *ctrl_lock = &nvme->n_lock;
279
280 /*
281 * This assertion may seem weird, but it's actually a bit of an
282 * invariant. When the controller's write lock is taken, by definition
283 * there are no other locks that can be taken. Therefore if we were
284 * somehow unable to unlock a lock on this controller, then we'd be
285 * violating our rules.
286 */
287 VERIFY3P(ctrl_lock->nl_writer, ==, NULL);
288
289 /*
290 * If there are pending writers, either one of them will be woken up or
291 * no one will. Writers trump readers, but it's possible that we may not
292 * be able to wake up a waiting writer yet. If we take this arm, we
293 * should not process anything else. The same logic applies in the
294 * namespace case as well.
295 */
296 if (list_is_empty(&ctrl_lock->nl_pend_writers) == 0) {
297 nvme_minor_lock_info_t *info;
298
299 if (!nvme_rwlock_handoff_ctrl_wrlock(nvme))
300 return;
301
302 /*
303 * We opt to indicate that this is unlocked ahead of
304 * taking the lock for state tracking purposes.
305 */
306 info = list_remove_head(&ctrl_lock->nl_pend_writers);
307 info->nli_state = NVME_LOCK_STATE_UNLOCKED;
308 nvme_rwlock_wrlock(info, ctrl_lock);
309 nvme_rwlock_signal_one(info, NVME_IOCTL_E_OK);
310 return;
311 }
312
313 nvme_rwlock_wakeup_readers(ctrl_lock);
314 for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
315 nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
316 nvme_lock_t *ns_lock = &ns->ns_lock;
317
318 if (list_is_empty(&ns_lock->nl_pend_writers) == 0) {
319 nvme_minor_lock_info_t *info;
320
321 if (!nvme_rwlock_handoff_ns_wrlock(nvme, ns))
322 continue;
323
324 info = list_remove_head(&ns_lock->nl_pend_writers);
325 info->nli_state = NVME_LOCK_STATE_UNLOCKED;
326 nvme_rwlock_wrlock(info, ns_lock);
327 nvme_rwlock_signal_one(info, NVME_IOCTL_E_OK);
328 } else {
329 nvme_rwlock_wakeup_readers(ns_lock);
330 }
331 }
332 }
333
334 /*
335 * This cleans up all the state in the minor for returning without a lock held.
336 */
337 static void
nvme_rwunlock_cleanup_minor(nvme_minor_lock_info_t * info)338 nvme_rwunlock_cleanup_minor(nvme_minor_lock_info_t *info)
339 {
340 info->nli_lock = NULL;
341 info->nli_state = NVME_LOCK_STATE_UNLOCKED;
342 info->nli_curlevel = 0;
343 info->nli_ns = NULL;
344 }
345
346 /*
347 * We've been asked to unlock a lock. Not only must we remove our hold from this
348 * lock, we must go through and wake up the next waiter. The waiters that we
349 * have to wake up vary depending on our lock. See section 'ioctls, Errors, and
350 * Exclusive Access' in the theory statement for more information.
351 */
352
353 void
nvme_rwunlock(nvme_minor_lock_info_t * info,nvme_lock_t * lock)354 nvme_rwunlock(nvme_minor_lock_info_t *info, nvme_lock_t *lock)
355 {
356 nvme_t *const nvme = info->nli_nvme;
357 boolean_t is_read;
358
359 VERIFY(MUTEX_HELD(&nvme->n_minor_mutex));
360 VERIFY3P(info->nli_lock, ==, lock);
361 VERIFY(info->nli_curlevel == NVME_LOCK_L_READ ||
362 info->nli_curlevel == NVME_LOCK_L_WRITE);
363 is_read = info->nli_curlevel == NVME_LOCK_L_READ;
364
365 /*
366 * First we need to remove this minor from the lock and clean up all of
367 * the state this lock in the info structure.
368 */
369 info->nli_last_change = gethrtime();
370 if (is_read) {
371 VERIFY3U(list_link_active(&info->nli_node), !=, 0);
372 ASSERT3U(nvme_rwlock_is_reader(lock, info), ==, B_TRUE);
373 list_remove(&lock->nl_readers, info);
374 } else {
375 VERIFY3U(list_link_active(&info->nli_node), ==, 0);
376 VERIFY3P(lock->nl_writer, ==, info);
377 lock->nl_writer = NULL;
378 }
379
380 nvme_rwunlock_cleanup_minor(info);
381 nvme_rwlock_wakeup(nvme);
382 }
383
384 /*
385 * We were just interrupted due to a signal. However, just because our block was
386 * interrupted due to a signal doesn't mean that other activity didn't occur. In
387 * particular, the signal wake up could race with a subsequent wake up that was
388 * due to the device being removed or actually acquiring the lock. Depending on
389 * which state we were in, we need to perform the appropriate clean up. In all
390 * cases, the signal trumps all, which may mean actually unlocking!
391 */
392 static void
nvme_rwlock_signal(nvme_minor_lock_info_t * info,nvme_lock_t * lock,boolean_t is_read)393 nvme_rwlock_signal(nvme_minor_lock_info_t *info, nvme_lock_t *lock,
394 boolean_t is_read)
395 {
396 ASSERT3P(info->nli_ioc, !=, NULL);
397
398 /*
399 * We're changing the state here, so update the minor's last change
400 * time.
401 */
402 info->nli_last_change = gethrtime();
403 lock->nl_nsignals++;
404
405 /*
406 * This is the simplest case. We've already been removed from the lock
407 * that we're on. All we need to do is change the error to indicate that
408 * we received a signal.
409 */
410 if (info->nli_state == NVME_LOCK_STATE_UNLOCKED) {
411 ASSERT3P(info->nli_lock, ==, NULL);
412 (void) nvme_ioctl_error(info->nli_ioc,
413 NVME_IOCTL_E_LOCK_WAIT_SIGNAL, 0, 0);
414 lock->nl_nsig_unlock++;
415 return;
416 }
417
418 /*
419 * For all others, the lock should be set here.
420 */
421 ASSERT3P(info->nli_lock, ==, lock);
422
423 /*
424 * For someone that was blocked, we need to remove them from the pending
425 * lists.
426 */
427 if (info->nli_state == NVME_LOCK_STATE_BLOCKED) {
428 ASSERT3S(list_link_active(&info->nli_node), !=, 0);
429 if (is_read) {
430 list_remove(&lock->nl_pend_readers, info);
431 } else {
432 list_remove(&lock->nl_pend_writers, info);
433 }
434
435 nvme_rwunlock_cleanup_minor(info);
436 (void) nvme_ioctl_error(info->nli_ioc,
437 NVME_IOCTL_E_LOCK_WAIT_SIGNAL, 0, 0);
438 lock->nl_nsig_blocks++;
439 return;
440 }
441
442 /*
443 * Now, the most nuanced thing that we need to do. We need to unlock
444 * this node. We synthesize an unlock request and submit that.
445 */
446 lock->nl_nsig_acq++;
447 nvme_rwunlock(info, lock);
448 }
449
450 /*
451 * Here we need to implement our read-write lock policy. Refer to the big theory
452 * statement for more information. Here's a summary of the priority that's
453 * relevant here:
454 *
455 * 1) Waiting writers starve waiting readers
456 * 2) Waiting writers for the controller starve all namespace writers and
457 * readers
458 * 3) A read lock can be taken if there are no pending or active writers on the
459 * lock (and the controller lock for a namespace).
460 */
461 void
nvme_rwlock(nvme_minor_t * minor,nvme_ioctl_lock_t * req)462 nvme_rwlock(nvme_minor_t *minor, nvme_ioctl_lock_t *req)
463 {
464 nvme_t *const nvme = minor->nm_ctrl;
465 const boolean_t is_nonblock = (req->nil_flags &
466 NVME_LOCK_F_DONT_BLOCK) != 0;
467 const boolean_t is_read = req->nil_level == NVME_LOCK_L_READ;
468 const boolean_t is_ctrl = req->nil_ent == NVME_LOCK_E_CTRL;
469 nvme_minor_lock_info_t *info;
470 nvme_lock_t *lock;
471 boolean_t waiters;
472 hrtime_t sleep_time;
473
474 VERIFY(MUTEX_HELD(&nvme->n_minor_mutex));
475
476 if (is_ctrl) {
477 info = &minor->nm_ctrl_lock;
478 lock = &nvme->n_lock;
479
480 if (is_read) {
481 waiters = nvme_rwlock_block_ctrl_rdlock(nvme);
482 } else {
483 waiters = nvme_rwlock_block_ctrl_wrlock(nvme);
484 }
485 } else {
486 nvme_namespace_t *ns;
487 const uint32_t nsid = req->nil_common.nioc_nsid;
488 info = &minor->nm_ns_lock;
489
490 VERIFY3U(req->nil_ent, ==, NVME_LOCK_E_NS);
491 ns = nvme_nsid2ns(nvme, nsid);
492 minor->nm_ns_lock.nli_ns = ns;
493 lock = &ns->ns_lock;
494
495 if (is_read) {
496 waiters = nvme_rwlock_block_ns_rdlock(nvme, ns);
497 } else {
498 waiters = nvme_rwlock_block_ns_wrlock(nvme, ns);
499 }
500 }
501
502 /*
503 * Set the information that indicates what kind of lock we're attempting
504 * to acquire and that we're operating on.
505 */
506 info->nli_curlevel = is_read ? NVME_LOCK_L_READ : NVME_LOCK_L_WRITE;
507 info->nli_lock = lock;
508
509
510 /*
511 * We think we can get the lock, hurrah.
512 */
513 if (!waiters) {
514 if (is_read) {
515 nvme_rwlock_rdlock(info, lock);
516 } else {
517 nvme_rwlock_wrlock(info, lock);
518 }
519 (void) nvme_ioctl_success(&req->nil_common);
520 return;
521 }
522
523 /*
524 * We failed to get the lock. At this point we will set ourselves up to
525 * block. Once we go to sleep on the CV, our assumption is that anyone
526 * who has woken us up will have filled in the information the status of
527 * this operation and therefore after this point, all we have to do is
528 * return.
529 */
530 if (is_nonblock) {
531 nvme_rwunlock_cleanup_minor(info);
532 lock->nl_nnonblock++;
533 (void) nvme_ioctl_error(&req->nil_common,
534 NVME_IOCTL_E_LOCK_WOULD_BLOCK, 0, 0);
535 return;
536 }
537
538 ASSERT3P(info->nli_ioc, ==, NULL);
539 info->nli_ioc = &req->nil_common;
540 if (is_read) {
541 list_insert_tail(&lock->nl_pend_readers, info);
542 lock->nl_npend_reads++;
543 } else {
544 list_insert_tail(&lock->nl_pend_writers, info);
545 lock->nl_npend_writes++;
546 }
547
548 ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_UNLOCKED);
549 info->nli_state = NVME_LOCK_STATE_BLOCKED;
550 sleep_time = gethrtime();
551 info->nli_last_change = sleep_time;
552 while (info->nli_state == NVME_LOCK_STATE_BLOCKED) {
553 /*
554 * Block until we receive a signal. Note, a signal trumps all
555 * other processing. We may be woken up here because we acquired
556 * a lock, we may also end up woken up here if the controller is
557 * marked as dead.
558 */
559 if (cv_wait_sig(&minor->nm_cv, &nvme->n_minor_mutex) == 0) {
560 nvme_rwlock_signal(info, lock, is_read);
561 break;
562 }
563 }
564
565 /*
566 * Before we return, clean up and sanity check our state.
567 */
568 info->nli_ioc = NULL;
569 #ifdef DEBUG
570 ASSERT3S(info->nli_last_change, !=, sleep_time);
571 if (info->nli_state == NVME_LOCK_STATE_UNLOCKED) {
572 ASSERT3S(list_link_active(&info->nli_node), ==, 0);
573 ASSERT3P(info->nli_ns, ==, NULL);
574 ASSERT3U(req->nil_common.nioc_drv_err, !=, NVME_IOCTL_E_OK);
575 } else {
576 ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_ACQUIRED);
577 ASSERT3U(req->nil_common.nioc_drv_err, ==, NVME_IOCTL_E_OK);
578 if (is_read) {
579 ASSERT3S(list_link_active(&info->nli_node), !=, 0);
580 } else {
581 ASSERT3P(lock->nl_writer, ==, info);
582 }
583 }
584 ASSERT3P(info->nli_minor, ==, minor);
585 ASSERT3P(info->nli_nvme, ==, minor->nm_ctrl);
586 #endif
587 }
588
589 /*
590 * This is used to clean up a single minor that was blocking trying to get a
591 * lock prior to a controller going dead. In particular, the key here is we need
592 * to change its state to unlocked by cleaning it up and then signal it to wake
593 * up and process things. The clean up also helps deal with the case of a racing
594 * signal, though it does leave the state a little awkward in this intermediate
595 * moment; however, since it's been removed from a list that's really the proper
596 * action and no one can issue new lock ioctls at this point.
597 */
598 static void
nvme_rwlock_ctrl_dead_cleanup_one(nvme_t * nvme,nvme_minor_lock_info_t * info)599 nvme_rwlock_ctrl_dead_cleanup_one(nvme_t *nvme, nvme_minor_lock_info_t *info)
600 {
601 ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_BLOCKED);
602 ASSERT3P(info->nli_ioc, !=, NULL);
603
604 /*
605 * Update the last time this has changed for our snaity checks.
606 */
607 info->nli_last_change = gethrtime();
608 nvme_rwunlock_cleanup_minor(info);
609 nvme_rwlock_signal_one(info, nvme->n_dead_status);
610 }
611
612 /*
613 * We've just been informed that this controller has set n_dead. This is most
614 * unfortunate for anyone trying to actively use it right now and we must notify
615 * them. Anyone who has successfully obtained a lock gets to keep it until they
616 * drop it (hopefully soon). Anyone who is asleep should be kicked out being
617 * told they are not getting it.
618 *
619 * The moment we grab n_minor_mutex, no other state here can change. So we can
620 * go ahead and wake up all waiters with impunity. This is being called from the
621 * nvme_dead_taskq.
622 */
623 void
nvme_rwlock_ctrl_dead(void * arg)624 nvme_rwlock_ctrl_dead(void *arg)
625 {
626 nvme_t *nvme = arg;
627 nvme_lock_t *ctrl_lock = &nvme->n_lock;
628 nvme_minor_lock_info_t *info;
629
630 mutex_enter(&nvme->n_minor_mutex);
631 for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
632 nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
633 nvme_lock_t *ns_lock = &ns->ns_lock;
634
635 while ((info = list_remove_head(&ns_lock->nl_pend_readers)) !=
636 NULL) {
637 nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
638 }
639
640 while ((info = list_remove_head(&ns_lock->nl_pend_writers)) !=
641 NULL) {
642 nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
643 }
644 }
645
646 while ((info = list_remove_head(&ctrl_lock->nl_pend_readers)) != NULL) {
647 nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
648 }
649
650 while ((info = list_remove_head(&ctrl_lock->nl_pend_writers)) != NULL) {
651
652 nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
653 }
654 mutex_exit(&nvme->n_minor_mutex);
655 }
656
657 void
nvme_lock_fini(nvme_lock_t * lock)658 nvme_lock_fini(nvme_lock_t *lock)
659 {
660 VERIFY3P(lock->nl_writer, ==, NULL);
661 list_destroy(&lock->nl_pend_writers);
662 list_destroy(&lock->nl_pend_readers);
663 list_destroy(&lock->nl_readers);
664 }
665
666 void
nvme_lock_init(nvme_lock_t * lock)667 nvme_lock_init(nvme_lock_t *lock)
668 {
669 list_create(&lock->nl_readers, sizeof (nvme_minor_lock_info_t),
670 offsetof(nvme_minor_lock_info_t, nli_node));
671 list_create(&lock->nl_pend_readers, sizeof (nvme_minor_lock_info_t),
672 offsetof(nvme_minor_lock_info_t, nli_node));
673 list_create(&lock->nl_pend_writers, sizeof (nvme_minor_lock_info_t),
674 offsetof(nvme_minor_lock_info_t, nli_node));
675 }
676