1#!/bin/ksh -p
2
3#
4# CDDL HEADER START
5#
6# This file and its contents are supplied under the terms of the
7# Common Development and Distribution License ("CDDL"), version 1.0.
8# You may only use this file in accordance with the terms of version
9# 1.0 of the CDDL.
10#
11# A full copy of the text of the CDDL should have accompanied this
12# source.  A copy of the CDDL is also available via the Internet at
13# http://www.illumos.org/license/CDDL.
14#
15# CDDL HEADER END
16#
17
18#
19# Copyright (c) 2017 by Intel Corporation. All rights reserved.
20# Copyright 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
21#
22
23. $STF_SUITE/include/libtest.shlib
24. $STF_SUITE/tests/functional/fault/fault.cfg
25
26#
27# DESCRIPTION:
28# Testing Fault Management Agent ZED Logic - Automated Auto-Spare Test when
29# multiple drives are faulted.
30#
31# STRATEGY:
32# 1. Create a pool with two hot spares
33# 2. Inject IO ERRORS with a zinject error handler on the first device
34# 3. Start a scrub
35# 4. Verify the ZED kicks in a hot spare and expected pool/device status
36# 5. Inject IO ERRORS on a second device
37# 6. Start a scrub
38# 7. Verify the ZED kicks in a second hot spare
39# 8. Clear the fault on both devices
40# 9. Verify the hot spares are available and expected pool/device status
41# 10. Rinse and repeat, this time faulting both devices at the same time
42#
43
# NOTE(review): helper is sourced from elsewhere; presumably "both" declares
# that the test may run in either supported zone context -- confirm.
verify_runnable 'both'
45
#
# Tear down the state built by one test iteration: clear all zinject
# handlers, destroy the test pool, then delete the files backing the
# data and spare vdevs.  Registered via log_onexit and also invoked
# directly at the end of every loop iteration.
#
function cleanup
{
	typeset backing_files="$DATA_DEVS $SPARE_DEVS"

	log_must zinject -c all
	destroy_pool $TESTPOOL
	# Intentionally unquoted: the variable holds a space-separated list
	rm -f $backing_files
}
52
log_assert "ZED should be able to handle multiple faulted devices"
log_onexit cleanup

# Drain events left over from previous runs; skipped on FreeBSD, where
# events are not supported.
is_freebsd || zed_events_drain

# Data vdevs: two that get errors injected and two that are left alone
FAULT_DEV1="${TEST_BASE_DIR}/fault-dev1"
FAULT_DEV2="${TEST_BASE_DIR}/fault-dev2"
SAFE_DEV1="${TEST_BASE_DIR}/safe-dev1"
SAFE_DEV2="${TEST_BASE_DIR}/safe-dev2"
DATA_DEVS="${FAULT_DEV1} ${FAULT_DEV2} ${SAFE_DEV1} ${SAFE_DEV2}"

# Hot spares expected to take over for the faulted data vdevs
SPARE_DEV1="${TEST_BASE_DIR}/spare-dev1"
SPARE_DEV2="${TEST_BASE_DIR}/spare-dev2"
SPARE_DEVS="${SPARE_DEV1} ${SPARE_DEV2}"
70
# First pass: fault the two devices one after the other, for every
# redundancy type under test.
for type in "mirror" "raidz" "raidz2" "raidz3"; do
	# 1. Create a pool with two hot spares
	# Run truncate under log_must so a failure to create the backing
	# files is reported here instead of surfacing later in zpool create.
	log_must truncate -s $SPA_MINDEVSIZE $DATA_DEVS $SPARE_DEVS
	log_must zpool create -f $TESTPOOL $type $DATA_DEVS spare $SPARE_DEVS

	# 2. Inject IO ERRORS with a zinject error handler on the first device
	log_must zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL

	# 3. Start a scrub
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in a hot spare and expected pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE_DEV1 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE_DEV1 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 5. Inject IO ERRORS on a second device
	log_must zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL

	# 6. Start a scrub
	# Poll once per second until no scrub or resilver is in progress,
	# then start the second scrub.
	while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do
		sleep 1
	done
	log_must zpool scrub $TESTPOOL

	# 7. Verify the ZED kicks in a second hot spare
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE_DEV2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE_DEV2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 8. Clear the fault on both devices
	# Remove the injection handlers first, then clear each device.
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2

	# 9. Verify the hot spares are available and expected pool/device status
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE_DEV1 "AVAIL"
	log_must wait_hotspare_state $TESTPOOL $SPARE_DEV2 "AVAIL"
	log_must check_state $TESTPOOL "" "ONLINE"

	# Cleanup: destroy the pool and backing files before the next type
	cleanup
done
120
# Rinse and repeat, this time faulting both devices at the same time
# NOTE: "raidz" is excluded since it cannot survive 2 faulted devices
# NOTE: "mirror" is a 4-way mirror here and should survive this test
for type in "mirror" "raidz2" "raidz3"; do
	# 1. Create a pool with two hot spares
	# Run truncate under log_must so a failure to create the backing
	# files is reported here instead of surfacing later in zpool create.
	log_must truncate -s $SPA_MINDEVSIZE $DATA_DEVS $SPARE_DEVS
	log_must zpool create -f $TESTPOOL $type $DATA_DEVS spare $SPARE_DEVS

	# 2. Inject IO ERRORS with a zinject error handler on two devices
	# Both zinject commands are started in the background.
	# NOTE(review): because of the trailing "&", log_must only sees the
	# status of launching the job, not the exit status of zinject itself.
	log_must eval "zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL &"
	log_must eval "zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL &"

	# 3. Start a scrub
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in two hot spares and expected pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE_DEV1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $SPARE_DEV2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE_DEV1 "INUSE"
	log_must wait_hotspare_state $TESTPOOL $SPARE_DEV2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 5. Clear the fault on both devices
	# Remove the injection handlers first, then clear each device.
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2

	# Cleanup: destroy the pool and backing files before the next type
	cleanup
done

log_pass "ZED successfully handles multiple faulted devices"
156