1#!/bin/ksh -p
2
3#
4# CDDL HEADER START
5#
6# This file and its contents are supplied under the terms of the
7# Common Development and Distribution License ("CDDL"), version 1.0.
8# You may only use this file in accordance with the terms of version
9# 1.0 of the CDDL.
10#
11# A full copy of the text of the CDDL should have accompanied this
12# source.  A copy of the CDDL is also available via the Internet at
13# http://www.illumos.org/license/CDDL.
14#
15# CDDL HEADER END
16#
17
18#
19# Copyright (c) 2017 by Intel Corporation. All rights reserved.
20# Copyright 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
21#
22
23. $STF_SUITE/include/libtest.shlib
24. $STF_SUITE/tests/functional/fault/fault.cfg
25
26#
27# DESCRIPTION:
28# Testing Fault Management Agent ZED Logic - Automated Auto-Spare Test when
29# multiple drives are faulted.
30#
31# STRATEGY:
32# 1. Create a pool with two hot spares
33# 2. Inject IO ERRORS with a zinject error handler on the first device
34# 3. Start a scrub
35# 4. Verify the ZED kicks in a hot spare and expected pool/device status
36# 5. Inject IO ERRORS on a second device
37# 6. Start a scrub
38# 7. Verify the ZED kicks in a second hot spare
39# 8. Clear the fault on both devices
40# 9. Verify the hot spares are available and expected pool/device status
41# 10. Rinse and repeat, this time faulting both devices at the same time
42#
43
44verify_runnable "both"
45
46function cleanup
47{
48	log_must zinject -c all
49	destroy_pool $TESTPOOL
50	rm -f $DATA_DEVS $SPARE_DEVS
51}
52
log_assert "ZED should be able to handle multiple faulted devices"
log_onexit cleanup

# Events not supported on FreeBSD
if ! is_freebsd; then
	# Clear events from previous runs
	zed_events_drain
fi

# File-backed vdev paths: two devices that will receive injected IO errors...
FAULT_DEV1="$TEST_BASE_DIR/fault-dev1"
FAULT_DEV2="$TEST_BASE_DIR/fault-dev2"
# ...four devices that stay healthy throughout the test...
SAFE_DEV1="$TEST_BASE_DIR/safe-dev1"
SAFE_DEV2="$TEST_BASE_DIR/safe-dev2"
SAFE_DEV3="$TEST_BASE_DIR/safe-dev3"
SAFE_DEV4="$TEST_BASE_DIR/safe-dev4"
DATA_DEVS="$FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 $SAFE_DEV2 $SAFE_DEV3 $SAFE_DEV4"
# ...and two hot spares for the ZED to kick in when faults occur.
SPARE_DEV1="$TEST_BASE_DIR/spare-dev1"
SPARE_DEV2="$TEST_BASE_DIR/spare-dev2"
SPARE_DEVS="$SPARE_DEV1 $SPARE_DEV2"
72
for vdev_type in "mirror" "raidz" "raidz2" "raidz3" "draid2:1s"; do
	case "$vdev_type" in
	"draid2:1s")
		# 1. Create a dRAID pool with both a distributed spare and a
		# traditional hot spare, covering both spare configurations.
		#
		# Errors are injected into the third and fourth children
		# because the dRAID permutation at those offsets maps to
		# distributed spare space rather than to data devices.
		#
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1
		log_must zpool create -f $TESTPOOL $vdev_type $SAFE_DEV1 \
		    $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \
		    spare $SPARE_DEV1
		SPARE1=$SPARE_DEV1
		SPARE2="draid2-0-0"
		;;
	"mirror")
		# 1. Create a 3-way mirror pool with two hot spares
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $vdev_type \
		    $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
		;;
	*)
		# 1. Create a raidz/raidz2/raidz3 pool with two hot spares
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $vdev_type $DATA_DEVS \
		    spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
		;;
	esac

	# 2. Inject IO ERRORS with a zinject error handler on the first device
	log_must zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL

	# 3. Start a scrub
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in a hot spare and the pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 5. Inject IO ERRORS on a second device
	log_must zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL

	# 6. Wait for any scrub/resilver still in progress to settle, then
	# start a second scrub
	while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do
		sleep 1
	done
	log_must zpool scrub $TESTPOOL

	# 7. Verify the ZED kicks in a second hot spare
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# Let the in-flight scrub/resilver finish before clearing the faults
	while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do
		sleep 1
	done

	# 8. Clear the fault on both devices
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2

	# 9. Verify the hot spares are available and expected pool/device status
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "AVAIL"
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "AVAIL"
	log_must check_state $TESTPOOL "" "ONLINE"

	# Tear down the pool and vdev files before the next iteration
	cleanup
done
152
# Rinse and repeat, this time faulting both devices at the same time
# NOTE: "raidz" is excluded since it cannot survive 2 faulted devices
# NOTE: "mirror" is a 3-way mirror here and should survive this test
for vdev_type in "mirror" "raidz2" "raidz3" "draid2:1s"; do
	case "$vdev_type" in
	"draid2:1s")
		# 1. Create a dRAID pool with both a distributed spare and a
		# traditional hot spare, covering both spare configurations.
		#
		# Errors are injected into the third and fourth children
		# because the dRAID permutation at those offsets maps to
		# distributed spare space rather than to data devices.
		#
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1
		log_must zpool create -f $TESTPOOL $vdev_type $SAFE_DEV1 \
		    $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \
		    spare $SPARE_DEV1
		SPARE1=$SPARE_DEV1
		SPARE2="draid2-0-0"
		;;
	"mirror")
		# 1. Create a 3-way mirror pool with two hot spares
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $vdev_type \
		    $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
		;;
	*)
		# 1. Create a raidz2/raidz3 pool with two hot spares
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $vdev_type $DATA_DEVS \
		    spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
		;;
	esac

	# 2. Inject IO ERRORS with a zinject error handler on two devices
	# (backgrounded so both handlers are armed concurrently)
	log_must eval "zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL &"
	log_must eval "zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL &"

	# 3. Start a scrub
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in two hot spares and the pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE"
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 5. Clear the fault on both devices
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2

	# Tear down the pool and vdev files before the next iteration
	cleanup
done

log_pass "ZED successfully handles multiple faulted devices"
214