1#!/bin/ksh -p
2#
3# CDDL HEADER START
4#
5# This file and its contents are supplied under the terms of the
6# Common Development and Distribution License ("CDDL"), version 1.0.
7# You may only use this file in accordance with the terms of version
8# 1.0 of the CDDL.
9#
10# A full copy of the text of the CDDL should have accompanied this
11# source.  A copy of the CDDL is also available via the Internet at
12# http://www.illumos.org/license/CDDL.
13#
14# CDDL HEADER END
15#
16
17#
18# Copyright 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
19#
20
21. $STF_SUITE/include/libtest.shlib
22. $STF_SUITE/tests/functional/events/events_common.kshlib
23. $STF_SUITE/tests/functional/fault/fault.cfg
24
25#
26# DESCRIPTION:
27# Testing Fault Management Agent ZED Logic - Physically removed device is
28# made unavail and onlined when reattached
29#
30# STRATEGY:
31# 1. Create a pool
32# 2. Simulate physical removal of one device
33# 3. Verify the device is unavailable
34# 4. Reattach the device
35# 5. Verify the device is onlined
36# 6. Repeat the same tests with a spare device:
37#    zed will use the spare to handle the removed data device
38# 7. Repeat the same tests again with a faulted spare device:
39#    the removed data device should be unavailable
40#
41# NOTE: the use of 'block_device_wait' throughout the test helps avoid race
42# conditions caused by mixing creation/removal events from partitioning the
43# disk (zpool create) and events from physically removing it (remove_disk).
44#
45# NOTE: the test relies on 'zpool sync' to prompt the kmods to transition a
46# vdev to the unavailable state.  The ZED does receive a removal notification
47# but only relies on it to activate a hot spare.  Additional work is planned
# to extend an existing ioctl interface to allow the ZED to transition the
# vdev into a removed state.
50#
verify_runnable "both"

# The disk that gets "physically" removed is a scsi_debug device, which this
# test only sets up on Linux; everywhere else it is reported as unsupported.
if ! is_linux; then
	log_unsupported "scsi debug module unsupported"
else
	# Add one 512b scsi_debug device (4Kn would generate IO errors).
	# NOTE: it must be larger than the other "file" vdevs and the minimum
	# SPA devsize, hence 32m of fudge on top of MINVDEVSIZE (presumably
	# given in bytes, scaled down to megabytes here).
	scsi_debug_mb=$(($MINVDEVSIZE/1024/1024+32))
	load_scsi_debug $scsi_debug_mb 1 1 1 '512b'
fi
61
#
# Exit handler (registered via log_onexit): tears down what the test created.
#
# Globals read: TESTPOOL, filedev1, filedev2, filedev3, sparedev
#
function cleanup
{
	# Destroy the pool before deleting the files that back its vdevs.
	destroy_pool "$TESTPOOL"

	# Quote every path and end option parsing with '--' so a base directory
	# containing whitespace or glob characters cannot be split or expanded
	# into unrelated operands (which would leave the real files behind).
	rm -f -- "$filedev1" "$filedev2" "$filedev3" "$sparedev"

	# Counterpart of the load_scsi_debug call made at the start of the test.
	unload_scsi_debug
}
71
log_assert "ZED detects physically removed devices"

# From here on any exit path runs cleanup().
log_onexit cleanup

# File-backed vdevs: three data devices plus one hot spare.
filedev1="$TEST_BASE_DIR/file-vdev-1"
filedev2="$TEST_BASE_DIR/file-vdev-2"
filedev3="$TEST_BASE_DIR/file-vdev-3"
sparedev="$TEST_BASE_DIR/file-vdev-spare"

# The device that will be removed and re-inserted during the test.
removedev=$(get_debug_device)

# Pool layouts under test; each one contains $removedev exactly once. Every
# entry is a single string that is word-split when handed to 'zpool create'.
typeset poolconfs=(
	"mirror $filedev1 $removedev"
	"raidz3 $filedev1 $filedev2 $filedev3 $removedev"
	"mirror $filedev1 $filedev2 special mirror $filedev3 $removedev"
)

# Create the sparse backing files, MINVDEVSIZE each.
for vdevfile in $filedev1 $filedev2 $filedev3 $sparedev; do
	log_must truncate -s $MINVDEVSIZE $vdevfile
done
92
# Device node of the disk that is pulled and re-inserted below.
removed_node="${DEV_DSKDIR}/${removedev}"

for conf in "${poolconfs[@]}"; do
	# 1. Create a pool
	#    ($conf is intentionally unquoted: the layout string has to split
	#    into individual vdev arguments.)
	log_must zpool create -f $TESTPOOL $conf
	# Keep partitioning events from 'zpool create' apart from the removal
	# events generated below.
	block_device_wait "$removed_node"

	if ! mntpnt=$(get_prop mountpoint /$TESTPOOL); then
		log_fail "get_prop mountpoint /$TESTPOOL"
	fi

	# 2. Simulate physical removal of one device, then write some data and
	#    sync the pool so the missing vdev is noticed.
	remove_disk $removedev
	log_must mkfile 1m $mntpnt/file
	log_must zpool sync $TESTPOOL

	# 3. Verify the device is unavailable.
	log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"

	# 4. Reattach the device
	insert_disk $removedev

	# 5. Verify the device is onlined
	log_must wait_vdev_state $TESTPOOL $removedev "ONLINE"

	# Per-layout cleanup: drop the pool and put a fresh msdos label on the
	# removable disk before the next layout is created.
	destroy_pool $TESTPOOL
	log_must parted "$removed_node" -s -- mklabel msdos
	block_device_wait "$removed_node"
done
121
122# 6. Repeat the same tests with a spare device: zed will use the spare to handle
123#    the removed data device
# Device node of the disk that is pulled and re-inserted below.
removed_node="${DEV_DSKDIR}/${removedev}"

for conf in "${poolconfs[@]}"; do
	# 1. Create a pool with a spare
	#    ($conf is intentionally unquoted: the layout string has to split
	#    into individual vdev arguments.)
	log_must zpool create -f $TESTPOOL $conf
	# Keep partitioning events from 'zpool create' apart from the removal
	# events generated below.
	block_device_wait "$removed_node"
	log_must zpool add $TESTPOOL spare $sparedev

	if ! mntpnt=$(get_prop mountpoint /$TESTPOOL); then
		log_fail "get_prop mountpoint /$TESTPOOL"
	fi

	# 2. Simulate physical removal of one device, then write some data and
	#    sync the pool so the missing vdev is noticed.
	remove_disk $removedev
	log_must mkfile 1m $mntpnt/file
	log_must zpool sync $TESTPOOL

	# 3. Verify the device is handled by the spare: the spare must be in
	#    use and the removed device itself unavailable.
	log_must wait_hotspare_state $TESTPOOL $sparedev "INUSE"
	log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"

	# 4. Reattach the device
	insert_disk $removedev

	# 5. Verify the device is onlined
	log_must wait_vdev_state $TESTPOOL $removedev "ONLINE"

	# Per-layout cleanup: drop the pool and put a fresh msdos label on the
	# removable disk before the next layout is created.
	destroy_pool $TESTPOOL
	log_must parted "$removed_node" -s -- mklabel msdos
	block_device_wait "$removed_node"
done
154
# 7. Repeat the same tests again with a faulted spare device: with no usable
#    spare the removed data device should become unavailable
# Device node of the disk that is pulled and re-inserted below.
removed_node="${DEV_DSKDIR}/${removedev}"

for conf in "${poolconfs[@]}"; do
	# 1. Create a pool with a spare
	#    ($conf is intentionally unquoted: the layout string has to split
	#    into individual vdev arguments.)
	log_must zpool create -f $TESTPOOL $conf
	# Keep partitioning events from 'zpool create' apart from the removal
	# events generated below.
	block_device_wait "$removed_node"
	log_must zpool add $TESTPOOL spare $sparedev

	if ! mntpnt=$(get_prop mountpoint /$TESTPOOL); then
		log_fail "get_prop mountpoint /$TESTPOOL"
	fi

	# 2. Fault the spare device making it unavailable
	log_must zpool offline -f $TESTPOOL $sparedev
	log_must wait_hotspare_state $TESTPOOL $sparedev "FAULTED"

	# 3. Simulate physical removal of one device, then write some data and
	#    sync the pool so the missing vdev is noticed.
	remove_disk $removedev
	log_must mkfile 1m $mntpnt/file
	log_must zpool sync $TESTPOOL

	# 4. Verify the device is unavailable
	log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL"

	# 5. Reattach the device
	insert_disk $removedev

	# 6. Verify the device is onlined
	log_must wait_vdev_state $TESTPOOL $removedev "ONLINE"

	# Per-layout cleanup: drop the pool and put a fresh msdos label on the
	# removable disk before the next layout is created.
	destroy_pool $TESTPOOL
	log_must parted "$removed_node" -s -- mklabel msdos
	block_device_wait "$removed_node"
done

# Every layout passed in all three scenarios.
log_pass "ZED detects physically removed devices"
192