#!/bin/ksh -p

#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source.  A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#

#
# Copyright (c) 2017 by Intel Corporation. All rights reserved.
# Copyright 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/fault/fault.cfg

#
# DESCRIPTION:
# Testing Fault Management Agent ZED Logic - Automated Auto-Spare Test when
# multiple drives are faulted.
#
# STRATEGY:
# 1. Create a pool with two hot spares
# 2. Inject IO ERRORS with a zinject error handler on the first device
# 3. Start a scrub
# 4. Verify the ZED kicks in a hot spare and expected pool/device status
# 5. Inject IO ERRORS on a second device
# 6. Start a scrub
# 7. Verify the ZED kicks in a second hot spare
# 8. Clear the fault on both devices
# 9. Verify the hot spares are available and expected pool/device status
# 10. Rinse and repeat, this time faulting both devices at the same time
#

verify_runnable "both"

#
# Tear down everything a single iteration creates: remove all zinject
# handlers, destroy the test pool and delete the file-backed vdevs.
# Registered with log_onexit and also called at the end of every loop
# iteration so that each vdev layout starts from a clean state.
#
function cleanup
{
	log_must zinject -c all
	destroy_pool $TESTPOOL
	rm -f $DATA_DEVS $SPARE_DEVS
}

log_assert "ZED should be able to handle multiple faulted devices"
log_onexit cleanup

# Events not supported on FreeBSD
if ! is_freebsd; then
	# Clear events from previous runs
	zed_events_drain
fi

# File-backed vdevs: two that will be faulted, two that stay healthy,
# and two hot spares that the ZED is expected to attach.
FAULT_DEV1="$TEST_BASE_DIR/fault-dev1"
FAULT_DEV2="$TEST_BASE_DIR/fault-dev2"
SAFE_DEV1="$TEST_BASE_DIR/safe-dev1"
SAFE_DEV2="$TEST_BASE_DIR/safe-dev2"
DATA_DEVS="$FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 $SAFE_DEV2"
SPARE_DEV1="$TEST_BASE_DIR/spare-dev1"
SPARE_DEV2="$TEST_BASE_DIR/spare-dev2"
SPARE_DEVS="$SPARE_DEV1 $SPARE_DEV2"

# First pass: fault the two devices one after the other.
for type in "mirror" "raidz" "raidz2" "raidz3"; do
	# 1. Create a pool with two hot spares
	# log_must: a failure to create the backing files must stop the test
	# here instead of surfacing later as a confusing zpool create error.
	log_must truncate -s $SPA_MINDEVSIZE $DATA_DEVS $SPARE_DEVS
	log_must zpool create -f $TESTPOOL $type $DATA_DEVS spare $SPARE_DEVS

	# 2. Inject IO ERRORS with a zinject error handler on the first device
	log_must zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL

	# 3. Start a scrub
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in a hot spare and expected pool/device status
	# NOTE(review): the trailing 60 is presumably a timeout in seconds;
	# confirm against wait_vdev_state in libtest.shlib.
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE_DEV1 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE_DEV1 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 5. Inject IO ERRORS on a second device
	log_must zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL

	# 6. Start a scrub
	# Let the scrub from step 3 and any resilver onto the first spare
	# finish before requesting a new scrub.
	while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do
		sleep 1
	done
	log_must zpool scrub $TESTPOOL

	# 7. Verify the ZED kicks in a second hot spare
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE_DEV2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE_DEV2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 8. Clear the fault on both devices
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2

	# 9. Verify the hot spares are available and expected pool/device status
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE_DEV1 "AVAIL"
	log_must wait_hotspare_state $TESTPOOL $SPARE_DEV2 "AVAIL"
	log_must check_state $TESTPOOL "" "ONLINE"

	# Cleanup
	cleanup
done

# Rinse and repeat, this time faulting both devices at the same time
# NOTE: "raidz" is excluded since it cannot survive 2 faulted devices
# NOTE: "mirror" is a 4-way mirror here and should survive this test
for type in "mirror" "raidz2" "raidz3"; do
	# 1. Create a pool with two hot spares
	# log_must: abort immediately if the backing files cannot be created.
	log_must truncate -s $SPA_MINDEVSIZE $DATA_DEVS $SPARE_DEVS
	log_must zpool create -f $TESTPOOL $type $DATA_DEVS spare $SPARE_DEVS

	# 2. Inject IO ERRORS with a zinject error handler on two devices
	# Both injections are issued as background commands (via eval so the
	# trailing '&' is interpreted by the shell rather than passed as an
	# argument) before the single scrub below is started.
	log_must eval "zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL &"
	log_must eval "zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL &"

	# 3. Start a scrub
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in two hot spares and expected pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE_DEV1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $SPARE_DEV2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE_DEV1 "INUSE"
	log_must wait_hotspare_state $TESTPOOL $SPARE_DEV2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 5. Clear the fault on both devices
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2

	# Cleanup
	cleanup
done

log_pass "ZED successfully handles multiple faulted devices"