xref: /freebsd/cddl/usr.sbin/zfsd/case_file.cc (revision 2b833162)
1 /*-
2  * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions, and the following disclaimer,
10  *    without modification.
11  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
12  *    substantially similar to the "NO WARRANTY" disclaimer below
13  *    ("Disclaimer") and any redistribution must be conditioned upon
14  *    including a substantially similar Disclaimer requirement for further
15  *    binary redistribution.
16  *
17  * NO WARRANTY
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGES.
29  *
30  * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
31  */
32 
33 /**
34  * \file case_file.cc
35  *
36  * We keep case files for any leaf vdev that is not in the optimal state.
37  * However, we only serialize to disk those events that need to be preserved
38  * across reboots.  For now, this is just a log of soft errors which we
39  * accumulate in order to mark a device as degraded.
40  */
41 #include <sys/cdefs.h>
42 #include <sys/byteorder.h>
43 #include <sys/time.h>
44 
45 #include <sys/fs/zfs.h>
46 
47 #include <dirent.h>
48 #include <fcntl.h>
49 #include <iomanip>
50 #include <fstream>
51 #include <functional>
52 #include <sstream>
53 #include <syslog.h>
54 #include <unistd.h>
55 
56 #include <libzfs.h>
57 
58 #include <list>
59 #include <map>
60 #include <string>
61 
62 #include <devdctl/guid.h>
63 #include <devdctl/event.h>
64 #include <devdctl/event_factory.h>
65 #include <devdctl/exception.h>
66 #include <devdctl/consumer.h>
67 
68 #include "callout.h"
69 #include "vdev_iterator.h"
70 #include "zfsd_event.h"
71 #include "case_file.h"
72 #include "vdev.h"
73 #include "zfsd.h"
74 #include "zfsd_exception.h"
75 #include "zpool_list.h"
76 
77 __FBSDID("$FreeBSD$");
78 
79 /*============================ Namespace Control =============================*/
80 using std::hex;
81 using std::ifstream;
82 using std::stringstream;
83 using std::setfill;
84 using std::setw;
85 
86 using DevdCtl::Event;
87 using DevdCtl::EventFactory;
88 using DevdCtl::EventList;
89 using DevdCtl::Guid;
90 using DevdCtl::ParseException;
91 
92 /*--------------------------------- CaseFile ---------------------------------*/
93 //- CaseFile Static Data -------------------------------------------------------
94 
95 CaseFileList  CaseFile::s_activeCases;
96 const string  CaseFile::s_caseFilePath = "/var/db/zfsd/cases";
97 const timeval CaseFile::s_removeGracePeriod = { 60 /*sec*/, 0 /*usec*/};
98 
99 //- CaseFile Static Public Methods ---------------------------------------------
100 CaseFile *
101 CaseFile::Find(Guid poolGUID, Guid vdevGUID)
102 {
103 	for (CaseFileList::iterator curCase = s_activeCases.begin();
104 	     curCase != s_activeCases.end(); curCase++) {
105 
106 		if (((*curCase)->PoolGUID() != poolGUID
107 		  && Guid::InvalidGuid() != poolGUID)
108 		 || (*curCase)->VdevGUID() != vdevGUID)
109 			continue;
110 
111 		/*
112 		 * We only carry one active case per-vdev.
113 		 */
114 		return (*curCase);
115 	}
116 	return (NULL);
117 }
118 
119 void
120 CaseFile::Find(Guid poolGUID, Guid vdevGUID, CaseFileList &cases)
121 {
122 	for (CaseFileList::iterator curCase = s_activeCases.begin();
123 	    curCase != s_activeCases.end(); curCase++) {
124 		if (((*curCase)->PoolGUID() != poolGUID &&
125 		    Guid::InvalidGuid() != poolGUID) ||
126 		    (*curCase)->VdevGUID() != vdevGUID)
127 			continue;
128 
129 		/*
130 		 * We can have multiple cases for spare vdevs
131 		 */
132 		cases.push_back(*curCase);
133 		if (!(*curCase)->IsSpare()) {
134 			return;
135 		}
136 	}
137 }
138 
139 CaseFile *
140 CaseFile::Find(const string &physPath)
141 {
142 	CaseFile *result = NULL;
143 
144 	for (CaseFileList::iterator curCase = s_activeCases.begin();
145 	     curCase != s_activeCases.end(); curCase++) {
146 
147 		if ((*curCase)->PhysicalPath() != physPath)
148 			continue;
149 
150 		if (result != NULL) {
151 			syslog(LOG_WARNING, "Multiple casefiles found for "
152 			    "physical path %s.  "
153 			    "This is most likely a bug in zfsd",
154 			    physPath.c_str());
155 		}
156 		result = *curCase;
157 	}
158 	return (result);
159 }
160 
161 
162 void
163 CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event)
164 {
165 	CaseFileList::iterator casefile;
166 	for (casefile = s_activeCases.begin(); casefile != s_activeCases.end();){
167 		CaseFileList::iterator next = casefile;
168 		next++;
169 		if (poolGUID == (*casefile)->PoolGUID())
170 			(*casefile)->ReEvaluate(event);
171 		casefile = next;
172 	}
173 }
174 
175 CaseFile &
176 CaseFile::Create(Vdev &vdev)
177 {
178 	CaseFile *activeCase;
179 
180 	activeCase = Find(vdev.PoolGUID(), vdev.GUID());
181 	if (activeCase == NULL)
182 		activeCase = new CaseFile(vdev);
183 
184 	return (*activeCase);
185 }
186 
187 void
188 CaseFile::DeSerialize()
189 {
190 	struct dirent **caseFiles;
191 
192 	int numCaseFiles(scandir(s_caseFilePath.c_str(), &caseFiles,
193 			 DeSerializeSelector, /*compar*/NULL));
194 
195 	if (numCaseFiles == -1)
196 		return;
197 	if (numCaseFiles == 0) {
198 		free(caseFiles);
199 		return;
200 	}
201 
202 	for (int i = 0; i < numCaseFiles; i++) {
203 
204 		DeSerializeFile(caseFiles[i]->d_name);
205 		free(caseFiles[i]);
206 	}
207 	free(caseFiles);
208 }
209 
210 bool
211 CaseFile::Empty()
212 {
213 	return (s_activeCases.empty());
214 }
215 
216 void
217 CaseFile::LogAll()
218 {
219 	for (CaseFileList::iterator curCase = s_activeCases.begin();
220 	     curCase != s_activeCases.end(); curCase++)
221 		(*curCase)->Log();
222 }
223 
224 void
225 CaseFile::PurgeAll()
226 {
227 	/*
228 	 * Serialize casefiles before deleting them so that they can be reread
229 	 * and revalidated during BuildCaseFiles.
230 	 * CaseFiles remove themselves from this list on destruction.
231 	 */
232 	while (s_activeCases.size() != 0) {
233 		CaseFile *casefile = s_activeCases.front();
234 		casefile->Serialize();
235 		delete casefile;
236 	}
237 
238 }
239 
240 int
241 CaseFile::IsSpare()
242 {
243 	return (m_is_spare);
244 }
245 
246 //- CaseFile Public Methods ----------------------------------------------------
247 bool
248 CaseFile::RefreshVdevState()
249 {
250 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
251 	zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front());
252 	if (casePool == NULL)
253 		return (false);
254 
255 	Vdev vd(casePool, CaseVdev(casePool));
256 	if (vd.DoesNotExist())
257 		return (false);
258 
259 	m_vdevState    = vd.State();
260 	m_vdevPhysPath = vd.PhysicalPath();
261 	return (true);
262 }
263 
264 bool
265 CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev)
266 {
267 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
268 	zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front());
269 	int flags = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE;
270 
271 	if (pool == NULL || !RefreshVdevState()) {
272 		/*
273 		 * The pool or vdev for this case file is no longer
274 		 * part of the configuration.  This can happen
275 		 * if we process a device arrival notification
276 		 * before seeing the ZFS configuration change
277 		 * event.
278 		 */
279 		syslog(LOG_INFO,
280 		       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured.  "
281 		       "Closing\n",
282 		       PoolGUIDString().c_str(),
283 		       VdevGUIDString().c_str());
284 		Close();
285 
286 		/*
287 		 * Since this event was not used to close this
288 		 * case, do not report it as consumed.
289 		 */
290 		return (/*consumed*/false);
291 	}
292 
293 	if (VdevState() > VDEV_STATE_CANT_OPEN) {
294 		/*
295 		 * For now, newly discovered devices only help for
296 		 * devices that are missing.  In the future, we might
297 		 * use a newly inserted spare to replace a degraded
298 		 * or faulted device.
299 		 */
300 		syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
301 		    PoolGUIDString().c_str(), VdevGUIDString().c_str());
302 		return (/*consumed*/false);
303 	}
304 
305 	if (vdev != NULL
306 	 && ( vdev->PoolGUID() == m_poolGUID
307 	   || vdev->PoolGUID() == Guid::InvalidGuid())
308 	 && vdev->GUID() == m_vdevGUID) {
309 
310 		if (IsSpare())
311 			flags |= ZFS_ONLINE_SPARE;
312 		if (zpool_vdev_online(pool, vdev->GUIDString().c_str(),
313 		    flags, &m_vdevState) != 0) {
314 			syslog(LOG_ERR,
315 			    "Failed to online vdev(%s/%s:%s): %s: %s\n",
316 			    zpool_get_name(pool), vdev->GUIDString().c_str(),
317 			    devPath.c_str(), libzfs_error_action(g_zfsHandle),
318 			    libzfs_error_description(g_zfsHandle));
319 			return (/*consumed*/false);
320 		}
321 
322 		syslog(LOG_INFO, "Onlined vdev(%s/%s:%s).  State now %s.\n",
323 		       zpool_get_name(pool), vdev->GUIDString().c_str(),
324 		       devPath.c_str(),
325 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
326 
327 		/*
328 		 * Check the vdev state post the online action to see
329 		 * if we can retire this case.
330 		 */
331 		CloseIfSolved();
332 
333 		return (/*consumed*/true);
334 	}
335 
336 	/*
337 	 * If the auto-replace policy is enabled, and we have physical
338 	 * path information, try a physical path replacement.
339 	 */
340 	if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) {
341 		syslog(LOG_INFO,
342 		       "CaseFile(%s:%s:%s): AutoReplace not set.  "
343 		       "Ignoring device insertion.\n",
344 		       PoolGUIDString().c_str(),
345 		       VdevGUIDString().c_str(),
346 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
347 		return (/*consumed*/false);
348 	}
349 
350 	if (PhysicalPath().empty()) {
351 		syslog(LOG_INFO,
352 		       "CaseFile(%s:%s:%s): No physical path information.  "
353 		       "Ignoring device insertion.\n",
354 		       PoolGUIDString().c_str(),
355 		       VdevGUIDString().c_str(),
356 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
357 		return (/*consumed*/false);
358 	}
359 
360 	if (physPath != PhysicalPath()) {
361 		syslog(LOG_INFO,
362 		       "CaseFile(%s:%s:%s): Physical path mismatch.  "
363 		       "Ignoring device insertion.\n",
364 		       PoolGUIDString().c_str(),
365 		       VdevGUIDString().c_str(),
366 		       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
367 		return (/*consumed*/false);
368 	}
369 
370 	/* Write a label on the newly inserted disk. */
371 	if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) {
372 		syslog(LOG_ERR,
373 		       "Replace vdev(%s/%s) by physical path (label): %s: %s\n",
374 		       zpool_get_name(pool), VdevGUIDString().c_str(),
375 		       libzfs_error_action(g_zfsHandle),
376 		       libzfs_error_description(g_zfsHandle));
377 		return (/*consumed*/false);
378 	}
379 
380 	syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s",
381 	    PoolGUIDString().c_str(), VdevGUIDString().c_str(),
382 	    devPath.c_str());
383 	return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false));
384 }
385 
386 bool
387 CaseFile::ReEvaluate(const ZfsEvent &event)
388 {
389 	bool consumed(false);
390 
391 	if (event.Value("type") == "misc.fs.zfs.vdev_remove") {
392 		/*
393 		 * The Vdev we represent has been removed from the
394 		 * configuration.  This case is no longer of value.
395 		 */
396 		Close();
397 
398 		return (/*consumed*/true);
399 	} else if (event.Value("type") == "misc.fs.zfs.pool_destroy") {
400 		/* This Pool has been destroyed.  Discard the case */
401 		Close();
402 
403 		return (/*consumed*/true);
404 	} else if (event.Value("type") == "misc.fs.zfs.config_sync") {
405 		RefreshVdevState();
406 		if (VdevState() < VDEV_STATE_HEALTHY)
407 			consumed = ActivateSpare();
408 	}
409 
410 
411 	if (event.Value("class") == "resource.fs.zfs.removed") {
412 		bool spare_activated;
413 
414 		if (!RefreshVdevState()) {
415 			/*
416 			 * The pool or vdev for this case file is no longer
417 			 * part of the configuration.  This can happen
418 			 * if we process a device arrival notification
419 			 * before seeing the ZFS configuration change
420 			 * event.
421 			 */
422 			syslog(LOG_INFO,
423 			       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev "
424 			       "unconfigured.  Closing\n",
425 			       PoolGUIDString().c_str(),
426 			       VdevGUIDString().c_str());
427 			/*
428 			 * Close the case now so we won't waste cycles in the
429 			 * system rescan
430 			 */
431 			Close();
432 
433 			/*
434 			 * Since this event was not used to close this
435 			 * case, do not report it as consumed.
436 			 */
437 			return (/*consumed*/false);
438 		}
439 
440 		/*
441 		 * Discard any tentative I/O error events for
442 		 * this case.  They were most likely caused by the
443 		 * hot-unplug of this device.
444 		 */
445 		PurgeTentativeEvents();
446 
447 		/* Try to activate spares if they are available */
448 		spare_activated = ActivateSpare();
449 
450 		/*
451 		 * Rescan the drives in the system to see if a recent
452 		 * drive arrival can be used to solve this case.
453 		 */
454 		ZfsDaemon::RequestSystemRescan();
455 
456 		/*
457 		 * Consume the event if we successfully activated a spare.
458 		 * Otherwise, leave it in the unconsumed events list so that the
459 		 * future addition of a spare to this pool might be able to
460 		 * close the case
461 		 */
462 		consumed = spare_activated;
463 	} else if (event.Value("class") == "resource.fs.zfs.statechange") {
464 		RefreshVdevState();
465 		/*
466 		 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to
467 		 * activate a hotspare.  Otherwise, ignore the event
468 		 */
469 		if (VdevState() == VDEV_STATE_FAULTED ||
470 		    VdevState() == VDEV_STATE_DEGRADED ||
471 		    VdevState() == VDEV_STATE_CANT_OPEN)
472 			(void) ActivateSpare();
473 		consumed = true;
474 	}
475 	else if (event.Value("class") == "ereport.fs.zfs.io" ||
476 	         event.Value("class") == "ereport.fs.zfs.checksum") {
477 
478 		m_tentativeEvents.push_front(event.DeepCopy());
479 		RegisterCallout(event);
480 		consumed = true;
481 	}
482 
483 	bool closed(CloseIfSolved());
484 
485 	return (consumed || closed);
486 }
487 
488 /* Find a Vdev containing the vdev with the given GUID */
489 static nvlist_t*
490 find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid)
491 {
492 	nvlist_t **vdevChildren;
493 	int        error;
494 	unsigned   ch, numChildren;
495 
496 	error = nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
497 					   &vdevChildren, &numChildren);
498 
499 	if (error != 0 || numChildren == 0)
500 		return (NULL);
501 
502 	for (ch = 0; ch < numChildren; ch++) {
503 		nvlist *result;
504 		Vdev vdev(pool_config, vdevChildren[ch]);
505 
506 		if (vdev.GUID() == child_guid)
507 			return (config);
508 
509 		result = find_parent(pool_config, vdevChildren[ch], child_guid);
510 		if (result != NULL)
511 			return (result);
512 	}
513 
514 	return (NULL);
515 }
516 
517 bool
518 CaseFile::ActivateSpare() {
519 	nvlist_t	*config, *nvroot, *parent_config;
520 	nvlist_t       **spares;
521 	const char	*devPath, *poolname, *vdev_type;
522 	u_int		 nspares, i;
523 	int		 error;
524 
525 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
526 	zpool_handle_t	*zhp(zpl.empty() ? NULL : zpl.front());
527 	if (zhp == NULL) {
528 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
529 		       "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID);
530 		return (false);
531 	}
532 	poolname = zpool_get_name(zhp);
533 	config = zpool_get_config(zhp, NULL);
534 	if (config == NULL) {
535 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
536 		       "config for pool %s", poolname);
537 		return (false);
538 	}
539 	error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot);
540 	if (error != 0){
541 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev "
542 		       "tree for pool %s", poolname);
543 		return (false);
544 	}
545 
546 	parent_config = find_parent(config, nvroot, m_vdevGUID);
547 	if (parent_config != NULL) {
548 		const char *parent_type;
549 
550 		/*
551 		 * Don't activate spares for members of a "replacing" vdev.
552 		 * They're already dealt with.  Sparing them will just drag out
553 		 * the resilver process.
554 		 */
555 		error = nvlist_lookup_string(parent_config,
556 		    ZPOOL_CONFIG_TYPE, &parent_type);
557 		if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0)
558 			return (false);
559 	}
560 
561 	nspares = 0;
562 	nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
563 				   &nspares);
564 	if (nspares == 0) {
565 		/* The pool has no spares configured */
566 		syslog(LOG_INFO, "CaseFile::ActivateSpare: "
567 		       "No spares available for pool %s", poolname);
568 		return (false);
569 	}
570 	for (i = 0; i < nspares; i++) {
571 		uint64_t    *nvlist_array;
572 		vdev_stat_t *vs;
573 		uint_t	     nstats;
574 
575 		if (nvlist_lookup_uint64_array(spares[i],
576 		    ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) {
577 			syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not "
578 			       "find vdev stats for pool %s, spare %d",
579 			       poolname, i);
580 			return (false);
581 		}
582 		vs = reinterpret_cast<vdev_stat_t *>(nvlist_array);
583 
584 		if ((vs->vs_aux != VDEV_AUX_SPARED)
585 		 && (vs->vs_state == VDEV_STATE_HEALTHY)) {
586 			/* We found a usable spare */
587 			break;
588 		}
589 	}
590 
591 	if (i == nspares) {
592 		/* No available spares were found */
593 		return (false);
594 	}
595 
596 	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath);
597 	if (error != 0) {
598 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
599 		       "the path of pool %s, spare %d. Error %d",
600 		       poolname, i, error);
601 		return (false);
602 	}
603 
604 	error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type);
605 	if (error != 0) {
606 		syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
607 		       "the vdev type of pool %s, spare %d. Error %d",
608 		       poolname, i, error);
609 		return (false);
610 	}
611 
612 	return (Replace(vdev_type, devPath, /*isspare*/true));
613 }
614 
615 void
616 CaseFile::RegisterCallout(const Event &event)
617 {
618 	timeval now, countdown, elapsed, timestamp, zero, remaining;
619 
620 	gettimeofday(&now, 0);
621 	timestamp = event.GetTimestamp();
622 	timersub(&now, &timestamp, &elapsed);
623 	timersub(&s_removeGracePeriod, &elapsed, &countdown);
624 	/*
625 	 * If countdown is <= zero, Reset the timer to the
626 	 * smallest positive time value instead
627 	 */
628 	timerclear(&zero);
629 	if (timercmp(&countdown, &zero, <=)) {
630 		timerclear(&countdown);
631 		countdown.tv_usec = 1;
632 	}
633 
634 	remaining = m_tentativeTimer.TimeRemaining();
635 
636 	if (!m_tentativeTimer.IsPending()
637 	 || timercmp(&countdown, &remaining, <))
638 		m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this);
639 }
640 
641 
642 bool
643 CaseFile::CloseIfSolved()
644 {
645 	if (m_events.empty()
646 	 && m_tentativeEvents.empty()) {
647 
648 		/*
649 		 * We currently do not track or take actions on
650 		 * devices in the degraded or faulted state.
651 		 * Once we have support for spare pools, we'll
652 		 * retain these cases so that any spares added in
653 		 * the future can be applied to them.
654 		 */
655 		switch (VdevState()) {
656 		case VDEV_STATE_HEALTHY:
657 			/* No need to keep cases for healthy vdevs */
658 			Close();
659 			return (true);
660 		case VDEV_STATE_REMOVED:
661 		case VDEV_STATE_CANT_OPEN:
662 			/*
663 			 * Keep open.  We may solve it with a newly inserted
664 			 * device.
665 			 */
666 		case VDEV_STATE_FAULTED:
667 		case VDEV_STATE_DEGRADED:
668 			/*
669 			 * Keep open.  We may solve it with the future
670 			 * addition of a spare to the pool
671 			 */
672 		case VDEV_STATE_UNKNOWN:
673 		case VDEV_STATE_CLOSED:
674 		case VDEV_STATE_OFFLINE:
675 			/*
676 			 * Keep open?  This may not be the correct behavior,
677 			 * but it's what we've always done
678 			 */
679 			;
680 		}
681 
682 		/*
683 		 * Re-serialize the case in order to remove any
684 		 * previous event data.
685 		 */
686 		Serialize();
687 	}
688 
689 	return (false);
690 }
691 
692 void
693 CaseFile::Log()
694 {
695 	syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(),
696 	       VdevGUIDString().c_str(), PhysicalPath().c_str());
697 	syslog(LOG_INFO, "\tVdev State = %s\n",
698 	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
699 	if (m_tentativeEvents.size() != 0) {
700 		syslog(LOG_INFO, "\t=== Tentative Events ===\n");
701 		for (EventList::iterator event(m_tentativeEvents.begin());
702 		     event != m_tentativeEvents.end(); event++)
703 			(*event)->Log(LOG_INFO);
704 	}
705 	if (m_events.size() != 0) {
706 		syslog(LOG_INFO, "\t=== Events ===\n");
707 		for (EventList::iterator event(m_events.begin());
708 		     event != m_events.end(); event++)
709 			(*event)->Log(LOG_INFO);
710 	}
711 }
712 
713 //- CaseFile Static Protected Methods ------------------------------------------
714 void
715 CaseFile::OnGracePeriodEnded(void *arg)
716 {
717 	CaseFile &casefile(*static_cast<CaseFile *>(arg));
718 
719 	casefile.OnGracePeriodEnded();
720 }
721 
722 int
723 CaseFile::DeSerializeSelector(const struct dirent *dirEntry)
724 {
725 	uint64_t poolGUID;
726 	uint64_t vdevGUID;
727 
728 	if (dirEntry->d_type == DT_REG
729 	 && sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
730 		   &poolGUID, &vdevGUID) == 2)
731 		return (1);
732 	return (0);
733 }
734 
735 void
736 CaseFile::DeSerializeFile(const char *fileName)
737 {
738 	string	  fullName(s_caseFilePath + '/' + fileName);
739 	CaseFile *existingCaseFile(NULL);
740 	CaseFile *caseFile(NULL);
741 
742 	try {
743 		uint64_t poolGUID;
744 		uint64_t vdevGUID;
745 		nvlist_t *vdevConf;
746 
747 		if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
748 		       &poolGUID, &vdevGUID) != 2) {
749 			throw ZfsdException("CaseFile::DeSerialize: "
750 			    "Unintelligible CaseFile filename %s.\n", fileName);
751 		}
752 		existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID));
753 		if (existingCaseFile != NULL) {
754 			/*
755 			 * If the vdev is already degraded or faulted,
756 			 * there's no point in keeping the state around
757 			 * that we use to put a drive into the degraded
758 			 * state.  However, if the vdev is simply missing,
759 			 * preserve the case data in the hopes that it will
760 			 * return.
761 			 */
762 			caseFile = existingCaseFile;
763 			vdev_state curState(caseFile->VdevState());
764 			if (curState > VDEV_STATE_CANT_OPEN
765 			 && curState < VDEV_STATE_HEALTHY) {
766 				unlink(fileName);
767 				return;
768 			}
769 		} else {
770 			ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID);
771 			if (zpl.empty()
772 			 || (vdevConf = VdevIterator(zpl.front())
773 						    .Find(vdevGUID)) == NULL) {
774 				/*
775 				 * Either the pool no longer exists
776 				 * or this vdev is no longer a member of
777 				 * the pool.
778 				 */
779 				unlink(fullName.c_str());
780 				return;
781 			}
782 
783 			/*
784 			 * Any vdev we find that does not have a case file
785 			 * must be in the healthy state and thus worthy of
786 			 * continued SERD data tracking.
787 			 */
788 			caseFile = new CaseFile(Vdev(zpl.front(), vdevConf));
789 		}
790 
791 		ifstream caseStream(fullName.c_str());
792 		if (!caseStream)
793 			throw ZfsdException("CaseFile::DeSerialize: Unable to "
794 					    "read %s.\n", fileName);
795 
796 		caseFile->DeSerialize(caseStream);
797 	} catch (const ParseException &exp) {
798 
799 		exp.Log();
800 		if (caseFile != existingCaseFile)
801 			delete caseFile;
802 
803 		/*
804 		 * Since we can't parse the file, unlink it so we don't
805 		 * trip over it again.
806 		 */
807 		unlink(fileName);
808 	} catch (const ZfsdException &zfsException) {
809 
810 		zfsException.Log();
811 		if (caseFile != existingCaseFile)
812 			delete caseFile;
813 	}
814 }
815 
816 //- CaseFile Protected Methods -------------------------------------------------
817 CaseFile::CaseFile(const Vdev &vdev)
818  : m_poolGUID(vdev.PoolGUID()),
819    m_vdevGUID(vdev.GUID()),
820    m_vdevState(vdev.State()),
821    m_vdevPhysPath(vdev.PhysicalPath()),
822    m_is_spare(vdev.IsSpare())
823 {
824 	stringstream guidString;
825 
826 	guidString << m_vdevGUID;
827 	m_vdevGUIDString = guidString.str();
828 	guidString.str("");
829 	guidString << m_poolGUID;
830 	m_poolGUIDString = guidString.str();
831 
832 	s_activeCases.push_back(this);
833 
834 	syslog(LOG_INFO, "Creating new CaseFile:\n");
835 	Log();
836 }
837 
838 CaseFile::~CaseFile()
839 {
840 	PurgeEvents();
841 	PurgeTentativeEvents();
842 	m_tentativeTimer.Stop();
843 	s_activeCases.remove(this);
844 }
845 
846 void
847 CaseFile::PurgeEvents()
848 {
849 	for (EventList::iterator event(m_events.begin());
850 	     event != m_events.end(); event++)
851 		delete *event;
852 
853 	m_events.clear();
854 }
855 
856 void
857 CaseFile::PurgeTentativeEvents()
858 {
859 	for (EventList::iterator event(m_tentativeEvents.begin());
860 	     event != m_tentativeEvents.end(); event++)
861 		delete *event;
862 
863 	m_tentativeEvents.clear();
864 }
865 
866 void
867 CaseFile::SerializeEvList(const EventList events, int fd,
868 		const char* prefix) const
869 {
870 	if (events.empty())
871 		return;
872 	for (EventList::const_iterator curEvent = events.begin();
873 	     curEvent != events.end(); curEvent++) {
874 		const string &eventString((*curEvent)->GetEventString());
875 
876 		// TODO: replace many write(2) calls with a single writev(2)
877 		if (prefix)
878 			write(fd, prefix, strlen(prefix));
879 		write(fd, eventString.c_str(), eventString.length());
880 	}
881 }
882 
883 void
884 CaseFile::Serialize()
885 {
886 	stringstream saveFile;
887 
888 	saveFile << setfill('0')
889 		 << s_caseFilePath << "/"
890 		 << "pool_" << PoolGUIDString()
891 		 << "_vdev_" << VdevGUIDString()
892 		 << ".case";
893 
894 	if (m_events.empty() && m_tentativeEvents.empty()) {
895 		unlink(saveFile.str().c_str());
896 		return;
897 	}
898 
899 	int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644));
900 	if (fd == -1) {
901 		syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n",
902 		       saveFile.str().c_str());
903 		return;
904 	}
905 	SerializeEvList(m_events, fd);
906 	SerializeEvList(m_tentativeEvents, fd, "tentative ");
907 	close(fd);
908 }
909 
910 /*
911  * XXX: This method assumes that events may not contain embedded newlines.  If
912  * ever events can contain embedded newlines, then CaseFile must switch
913  * serialization formats
914  */
915 void
916 CaseFile::DeSerialize(ifstream &caseStream)
917 {
918 	string	      evString;
919 	const EventFactory &factory(ZfsDaemon::Get().GetFactory());
920 
921 	caseStream >> std::noskipws >> std::ws;
922 	while (caseStream.good()) {
923 		/*
924 		 * Outline:
925 		 * read the beginning of a line and check it for
926 		 * "tentative".  If found, discard "tentative".
927 		 * Create a new event
928 		 * continue
929 		 */
930 		EventList* destEvents;
931 		const string tentFlag("tentative ");
932 		string line;
933 		std::stringbuf lineBuf;
934 
935 		caseStream.get(lineBuf);
936 		caseStream.ignore();  /*discard the newline character*/
937 		line = lineBuf.str();
938 		if (line.compare(0, tentFlag.size(), tentFlag) == 0) {
939 			/* Discard "tentative" */
940 			line.erase(0, tentFlag.size());
941 			destEvents = &m_tentativeEvents;
942 		} else {
943 			destEvents = &m_events;
944 		}
945 		Event *event(Event::CreateEvent(factory, line));
946 		if (event != NULL) {
947 			destEvents->push_back(event);
948 			RegisterCallout(*event);
949 		}
950 	}
951 }
952 
953 void
954 CaseFile::Close()
955 {
956 	/*
957 	 * This case is no longer relevant.  Clean up our
958 	 * serialization file, and delete the case.
959 	 */
960 	syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n",
961 	       PoolGUIDString().c_str(), VdevGUIDString().c_str(),
962 	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
963 
964 	/*
965 	 * Serialization of a Case with no event data, clears the
966 	 * Serialization data for that event.
967 	 */
968 	PurgeEvents();
969 	Serialize();
970 
971 	delete this;
972 }
973 
974 void
975 CaseFile::OnGracePeriodEnded()
976 {
977 	bool should_fault, should_degrade;
978 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
979 	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
980 
981 	m_events.splice(m_events.begin(), m_tentativeEvents);
982 	should_fault = ShouldFault();
983 	should_degrade = ShouldDegrade();
984 
985 	if (should_fault || should_degrade) {
986 		if (zhp == NULL
987 		 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) {
988 			/*
989 			 * Either the pool no longer exists
990 			 * or this vdev is no longer a member of
991 			 * the pool.
992 			 */
993 			Close();
994 			return;
995 		}
996 
997 	}
998 
999 	/* A fault condition has priority over a degrade condition */
1000 	if (ShouldFault()) {
1001 		/* Fault the vdev and close the case. */
1002 		if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID,
1003 				       VDEV_AUX_ERR_EXCEEDED) == 0) {
1004 			syslog(LOG_INFO, "Faulting vdev(%s/%s)",
1005 			       PoolGUIDString().c_str(),
1006 			       VdevGUIDString().c_str());
1007 			Close();
1008 			return;
1009 		}
1010 		else {
1011 			syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n",
1012 			       PoolGUIDString().c_str(),
1013 			       VdevGUIDString().c_str(),
1014 			       libzfs_error_action(g_zfsHandle),
1015 			       libzfs_error_description(g_zfsHandle));
1016 		}
1017 	}
1018 	else if (ShouldDegrade()) {
1019 		/* Degrade the vdev and close the case. */
1020 		if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID,
1021 				       VDEV_AUX_ERR_EXCEEDED) == 0) {
1022 			syslog(LOG_INFO, "Degrading vdev(%s/%s)",
1023 			       PoolGUIDString().c_str(),
1024 			       VdevGUIDString().c_str());
1025 			Close();
1026 			return;
1027 		}
1028 		else {
1029 			syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n",
1030 			       PoolGUIDString().c_str(),
1031 			       VdevGUIDString().c_str(),
1032 			       libzfs_error_action(g_zfsHandle),
1033 			       libzfs_error_description(g_zfsHandle));
1034 		}
1035 	}
1036 	Serialize();
1037 }
1038 
1039 Vdev
1040 CaseFile::BeingReplacedBy(zpool_handle_t *zhp) {
1041 	Vdev vd(zhp, CaseVdev(zhp));
1042 	std::list<Vdev> children;
1043 	std::list<Vdev>::iterator children_it;
1044 
1045 	Vdev parent(vd.Parent());
1046 	Vdev replacing(NonexistentVdev);
1047 
1048 	/*
1049 	 * To determine whether we are being replaced by another spare that
1050 	 * is still working, then make sure that it is currently spared and
1051 	 * that the spare is either resilvering or healthy.  If any of these
1052 	 * conditions fail, then we are not being replaced by a spare.
1053 	 *
1054 	 * If the spare is healthy, then the case file should be closed very
1055 	 * soon after this check.
1056 	 */
1057 	if (parent.DoesNotExist()
1058 	 || parent.Name(zhp, /*verbose*/false) != "spare")
1059 		return (NonexistentVdev);
1060 
1061 	children = parent.Children();
1062 	children_it = children.begin();
1063 	for (;children_it != children.end(); children_it++) {
1064 		Vdev child = *children_it;
1065 
1066 		/* Skip our vdev. */
1067 		if (child.GUID() == VdevGUID())
1068 			continue;
1069 		/*
1070 		 * Accept the first child that doesn't match our GUID, or
1071 		 * any resilvering/healthy device if one exists.
1072 		 */
1073 		if (replacing.DoesNotExist() || child.IsResilvering()
1074 		 || child.State() == VDEV_STATE_HEALTHY)
1075 			replacing = child;
1076 	}
1077 
1078 	return (replacing);
1079 }
1080 
1081 bool
1082 CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) {
1083 	nvlist_t *nvroot, *newvd;
1084 	const char *poolname;
1085 	string oldstr(VdevGUIDString());
1086 	bool retval = true;
1087 
1088 	/* Figure out what pool we're working on */
1089 	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
1090 	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
1091 	if (zhp == NULL) {
1092 		syslog(LOG_ERR, "CaseFile::Replace: could not find pool for "
1093 		       "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID);
1094 		return (false);
1095 	}
1096 	poolname = zpool_get_name(zhp);
1097 	Vdev vd(zhp, CaseVdev(zhp));
1098 	Vdev replaced(BeingReplacedBy(zhp));
1099 
1100 	if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) {
1101 		/* If we are already being replaced by a working spare, pass. */
1102 		if (replaced.IsResilvering()
1103 		 || replaced.State() == VDEV_STATE_HEALTHY) {
1104 			syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already "
1105 			    "replaced", VdevGUIDString().c_str(), path);
1106 			return (/*consumed*/false);
1107 		}
1108 		/*
1109 		 * If we have already been replaced by a spare, but that spare
1110 		 * is broken, we must spare the spare, not the original device.
1111 		 */
1112 		oldstr = replaced.GUIDString();
1113 		syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing "
1114 		    "broken spare %s instead", VdevGUIDString().c_str(),
1115 		    path, oldstr.c_str());
1116 	}
1117 
1118 	/*
1119 	 * Build a root vdev/leaf vdev configuration suitable for
1120 	 * zpool_vdev_attach. Only enough data for the kernel to find
1121 	 * the device (i.e. type and disk device node path) are needed.
1122 	 */
1123 	nvroot = NULL;
1124 	newvd = NULL;
1125 
1126 	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0
1127 	 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
1128 		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate "
1129 		    "configuration data.", poolname, oldstr.c_str());
1130 		if (nvroot != NULL)
1131 			nvlist_free(nvroot);
1132 		return (false);
1133 	}
1134 	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0
1135 	 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0
1136 	 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0
1137 	 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1138 				    &newvd, 1) != 0) {
1139 		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize "
1140 		    "configuration data.", poolname, oldstr.c_str());
1141 		nvlist_free(newvd);
1142 		nvlist_free(nvroot);
1143 		return (true);
1144 	}
1145 
1146 	/* Data was copied when added to the root vdev. */
1147 	nvlist_free(newvd);
1148 
1149 	retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot,
1150        /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0);
1151 	if (retval)
1152 		syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n",
1153 		    poolname, oldstr.c_str(), path);
1154 	else
1155 		syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n",
1156 		    poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle),
1157 		    libzfs_error_description(g_zfsHandle));
1158 	nvlist_free(nvroot);
1159 
1160 	return (retval);
1161 }
1162 
1163 /* Does the argument event refer to a checksum error? */
1164 static bool
1165 IsChecksumEvent(const Event* const event)
1166 {
1167 	return ("ereport.fs.zfs.checksum" == event->Value("type"));
1168 }
1169 
1170 /* Does the argument event refer to an IO error? */
1171 static bool
1172 IsIOEvent(const Event* const event)
1173 {
1174 	return ("ereport.fs.zfs.io" == event->Value("type"));
1175 }
1176 
1177 bool
1178 CaseFile::ShouldDegrade() const
1179 {
1180 	return (std::count_if(m_events.begin(), m_events.end(),
1181 			      IsChecksumEvent) > ZFS_DEGRADE_IO_COUNT);
1182 }
1183 
1184 bool
1185 CaseFile::ShouldFault() const
1186 {
1187 	return (std::count_if(m_events.begin(), m_events.end(),
1188 			      IsIOEvent) > ZFS_DEGRADE_IO_COUNT);
1189 }
1190 
1191 nvlist_t *
1192 CaseFile::CaseVdev(zpool_handle_t *zhp) const
1193 {
1194 	return (VdevIterator(zhp).Find(VdevGUID()));
1195 }
1196