1#
2# This file and its contents are supplied under the terms of the
3# Common Development and Distribution License ("CDDL"), version 1.0.
4# You may only use this file in accordance with the terms of version
5# 1.0 of the CDDL.
6#
7# A full copy of the text of the CDDL should have accompanied this
8# source.  A copy of the CDDL is also available via the Internet at
9# http://www.illumos.org/license/CDDL.
10#
11
12#
13# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
14# Use is subject to license terms.
15# Copyright (c) 2012, 2019 by Delphix. All rights reserved.
16# Copyright 2016 Nexenta Systems, Inc.
17# Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved.
18# Copyright (c) 2017 Lawrence Livermore National Security, LLC.
19# Copyright (c) 2017 Datto Inc.
20# Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
21# Copyright 2019 Richard Elling
22#
23
#
# Print the SCSI host number of the given disk.
#
# $1 disk name as it appears under /sys/block (for example "sdb")
#
# The entries of the disk's device/scsi_device directory are listed and
# the first colon-separated field of each entry is written to stdout.
#
function get_scsi_host #disk
{
	typeset blkdev=$1
	typeset scsi_dir="/sys/block/${blkdev}/device/scsi_device"

	ls $scsi_dir | cut -d : -f 1
}
32
#
# Cause a scan of SCSI host adapters (Linux only; a no-op elsewhere).
#
# $1 optional host number.  When it is omitted, every host entry found
#    under /sys/class/scsi_host is scanned; otherwise only the named host.
#
# A scan is requested by writing '- - -' to the host's "scan" file.
#
function scan_scsi_hosts
{
	typeset hostnum=${1}
	# Keep the loop variable local so a caller's "host" is not clobbered.
	typeset host

	if is_linux; then
		if [[ -z $hostnum ]]; then
			for host in /sys/class/scsi_host/host*; do
				log_must eval "echo '- - -' > $host/scan"
			done
		else
			log_note "/sys/class/scsi_host/host$hostnum/scan"
			log_must eval \
			    "echo '- - -' > /sys/class/scsi_host/host$hostnum/scan"
		fi
	fi
}
54
#
# Wait for newly created block devices to have their minors created.
#
# $@ optional block device pathnames.  On Linux they are handed to
#    "udevadm trigger", which is useful when waiting on a specific device
#    to settle rather than triggering all devices and waiting for them all
#    to settle.  On every platform the given paths are then polled for
#    until they all exist (at most five attempts).
#
# The udevadm settle timeout can be 120 or 180 seconds by default for
# some distros. If a long delay is experienced, it could be due to some
# strangeness in a malfunctioning device that isn't related to the devices
# under test. To help debug this condition, a notice is given if settle
# takes more than 60 seconds.
#
# Note: there is no meaningful return code if udevadm fails. Consumers
# should not expect a return code (do not call as argument to log_must)
#
function block_device_wait
{
	if is_linux; then
		udevadm trigger "$@" 2>/dev/null
		typeset start=$SECONDS
		udevadm settle
		typeset elapsed=$((SECONDS - start))
		# Compare numerically: inside [[ ]] the > operator compares
		# strings, which ranks "7" above "60" and "100" below it.
		if (( elapsed > 60 )); then
			log_note "udevadm settle time too long: $elapsed"
		fi
	elif is_freebsd; then
		if [[ ${#@} -eq 0 ]]; then
			# Do something that has to go through the geom event
			# queue to complete.
			sysctl kern.geom.conftxt >/dev/null
			return
		fi
	fi
	# Poll for the given paths to appear, but give up eventually.
	typeset -i i
	for (( i = 0; i < 5; ++i )); do
		typeset missing=false
		typeset dev
		for dev in "${@}"; do
			if ! [[ -e $dev ]]; then
				missing=true
				break
			fi
		done
		if ! $missing; then
			break
		fi
		# Back off for as many seconds as there are paths to wait for.
		sleep ${#@}
	done
}
105
#
# Check if the given device is a physical device.
#
# $1 device name; a leading $DEV_DSKDIR/ or $DEV_RDSKDIR/ is stripped
#
# Linux:   $DEV_DSKDIR/<name> must pass is_disk_device and the file
#          /sys/module/loop/parameters/max_part must exist.
# FreeBSD: $DEV_DSKDIR/<name> must pass is_disk_device and the name must
#          match one of the driver name patterns listed below.
# Others:  the name must match the c#t#d# / c#d# style pattern.
#
function is_physical_device #device
{
	typeset name=${1#$DEV_DSKDIR/}
	name=${name#$DEV_RDSKDIR/}

	if is_linux; then
		# A failed node check decides the result on its own.
		is_disk_device "$DEV_DSKDIR/$name" || return
		[ -f /sys/module/loop/parameters/max_part ]
		return
	fi

	if is_freebsd; then
		is_disk_device "$DEV_DSKDIR/$name" || return
		echo $name | grep -qE \
		    -e '^a?da[0-9]+$' \
		    -e '^md[0-9]+$' \
		    -e '^mfid[0-9]+$' \
		    -e '^nda[0-9]+$' \
		    -e '^nvd[0-9]+$' \
		    -e '^vtbd[0-9]+$'
		return
	fi

	echo $name | grep -qE "^c[0-F]+([td][0-F]+)+$"
}
130
#
# Check if the given device is a real device (ie SCSI device).
#
# $1 disk name, looked up as $DEV_RDSKDIR/<disk>
#
# On Linux the result is whether the TYPE column printed by lsblk for the
# device contains "disk".  On other platforms no check is made and the
# function returns success.  log_fail is called when no disk is given.
#
function is_real_device #disk
{
	typeset name=$1

	if [[ -z $name ]]; then
		log_fail "No argument for disk given."
	fi

	is_linux || return 0

	lsblk $DEV_RDSKDIR/$name -o TYPE 2>/dev/null | grep -q disk
}
144
#
# Check if the given device is a loop device.
#
# $1 disk name, looked up as $DEV_RDSKDIR/<disk>
#
# On Linux the result is whether the TYPE column printed by lsblk for the
# device contains "loop".  On other platforms no check is made and the
# function returns success.  log_fail is called when no disk is given.
#
function is_loop_device #disk
{
	typeset name=$1

	if [[ -z $name ]]; then
		log_fail "No argument for disk given."
	fi

	is_linux || return 0

	lsblk $DEV_RDSKDIR/$name -o TYPE 2>/dev/null | grep -q loop
}
158
#
# Check if the given device is a multipath device.
#
# $1 disk name, looked up as $DEV_MPATHDIR/<disk>
#
# Linux:
# lsblk must report a TYPE containing "mpath" for the device and readlink
# must succeed on it, i.e. it has to be a symbolic link.
# Currently no support for dm devices alone without multipath.
#
# FreeBSD:
# The device node must pass is_disk_device (gmultipath device).
#
# Others:
# No multipath detection; always fails.
#
function is_mpath_device #disk
{
	typeset name=$1

	if [[ -z $name ]]; then
		log_fail "No argument for disk given."
	fi

	if is_linux; then
		lsblk $DEV_MPATHDIR/$name -o TYPE 2>/dev/null | \
		    grep -q mpath || return 1
		readlink $DEV_MPATHDIR/$name > /dev/null 2>&1
		return
	fi

	if is_freebsd; then
		is_disk_device $DEV_MPATHDIR/$name
		return
	fi

	return 1
}
189
#
# Check if the given path is the appropriate sort of device special node.
#
# $1 path to test
#
# Succeeds when the path is a character device on FreeBSD, or a block
# device on any other platform.  An empty path never qualifies.
#
function is_disk_device #path
{
	typeset path=$1

	# The operand must be quoted: with an empty, unquoted path the call
	# degenerates to the one-argument form of test, which is always true.
	if is_freebsd; then
		# FreeBSD doesn't have block devices, only character devices.
		test -c "$path"
	else
		test -b "$path"
	fi
}
204
#
# Export SLICE_PREFIX, the separator placed between a disk name and its
# partition number (Linux only; nothing is done elsewhere).
#
# Currently all disks have to be of the same type, so only the first
# entry of $DISKS is examined:
#   - a multipath device whose name has a digit as its 18th character,
#     or a real device                          -> SLICE_PREFIX=""
#   - any other multipath device, or a loop device -> SLICE_PREFIX="p"
#   - anything else                             -> log_fail
#
# Reads $DISKS and $DISK_ARRAY_NUM.
#
function set_slice_prefix
{
	typeset first
	typeset -i idx=0

	is_linux || return 0

	while (( idx < $DISK_ARRAY_NUM )); do
		# The "i" inside the single-quoted awk program is awk's own
		# (unset, hence 0) variable, so field 1 is always selected.
		first="$(echo $DISKS | awk '{print $(i + 1)}')"

		if is_mpath_device $first && \
		    ! echo $first | awk 'substr($1,18,1) ~ /^[[:digit:]]+$/ {exit 1}' || \
		    is_real_device $first; then
			export SLICE_PREFIX=""
			return 0
		fi

		if is_mpath_device $first || is_loop_device $first; then
			export SLICE_PREFIX="p"
			return 0
		fi

		log_fail "$first not supported for partitioning."
		(( idx = idx + 1 ))
	done
}
231
#
# Export DEV_DSKDIR, the directory holding the device nodes of the disks
# under test.
#
# Currently all disks have to be of the same type, so only the first
# entry of $DISKS is examined:
#   multipath device (linux)  -> $DEV_MPATHDIR
#   anything else (linux)     -> $DEV_RDSKDIR
#   other platforms           -> $DEV_RDSKDIR
#
# On Linux nothing is exported when $DISK_ARRAY_NUM is 0.
#
function set_device_dir
{
	typeset first
	typeset -i idx=0

	if ! is_linux; then
		export DEV_DSKDIR=$DEV_RDSKDIR
		return
	fi

	while (( idx < $DISK_ARRAY_NUM )); do
		# The "i" inside the single-quoted awk program is awk's own
		# (unset, hence 0) variable, so field 1 is always selected.
		first="$(echo $DISKS | awk '{print $(i + 1)}')"
		if is_mpath_device $first; then
			export DEV_DSKDIR=$DEV_MPATHDIR
		else
			export DEV_DSKDIR=$DEV_RDSKDIR
		fi
		# The first disk decides; no further entries are looked at.
		return 0
	done
}
261
#
# Print the directory path of the given device.
#
# $1 device name or path
#
# On FreeBSD, or when the argument is a physical device, $DEV_DSKDIR is
# printed.  Otherwise the last path component is stripped from the
# argument (unless the argument is "/"); if the remainder names a disk
# device below $DEV_DSKDIR, $DEV_DSKDIR is printed, else the remainder.
#
function get_device_dir #device
{
	typeset dir=$1

	if is_freebsd || is_physical_device $dir; then
		echo "$DEV_DSKDIR"
		return
	fi

	[[ $dir == "/" ]] || dir=${dir%/*}
	is_disk_device "$DEV_DSKDIR/$dir" && dir="$DEV_DSKDIR"
	echo $dir
}
281
#
# Print a persistent name for the given disk.
#
# $1 device name, relative to $DEV_DSKDIR
#
# On Linux the name is taken from the "udevadm info" output for the
# device: for a real device, the first disk/by-id entry with its leading
# "disk/by-id/" removed; for a multipath device, the third path component
# of the first disk/by-id/dm-uuid entry.  For any other device, and on
# other platforms, the argument is printed unchanged.
#
function get_persistent_disk_name #device
{
	typeset name=$1

	if ! is_linux; then
		echo $name
		return
	fi

	if is_real_device $name; then
		udevadm info -q all -n $DEV_DSKDIR/$name \
		    | awk '/disk\/by-id/ {print $2; exit}' | cut -d/ -f3-
	elif is_mpath_device $name; then
		udevadm info -q all -n $DEV_DSKDIR/$name \
		    | awk '/disk\/by-id\/dm-uuid/ {print $2; exit}' \
		    | cut -d/ -f3
	else
		echo $name
	fi
}
304
#
# Online or offline a disk on the system (Linux only; a no-op elsewhere).
#
# $1 disk name
# $2 requested state, "online" or "offline"
# $3 optional SCSI host number, handed to scan_scsi_hosts when onlining
#
# First checks state of disk. Test will fail if disk is not properly onlined
# or offlined. Online is a rescan of SCSI disks by echoing to the host
# entry (every host entry when no host number is given).
#
# offline, multipath device: each member path listed under the dm device's
#     "slaves" directory is set offline and deleted in turn.
# offline, real device: the disk is set offline and deleted if lsscsi
#     still lists it.
# online: after the rescan the disk (or, for a multipath device, its first
#     member path) must be listed by lsscsi.
#
function on_off_disk # disk state{online,offline} host
{
	typeset disk=$1
	typeset state=$2
	typeset host=$3
	# Scratch variables, declared so they stay out of the caller's scope.
	typeset dm_name dep dep_dir ss sd dev_state dev_delete

	if [[ -z $disk || -z $state ]]; then
		log_fail "Arguments invalid or missing"
	fi

	if is_linux; then
		if [[ $state == "offline" ]] && ( is_mpath_device $disk ); then
			dm_name="$(readlink $DEV_DSKDIR/$disk | cut -d/ -f2)"
			# Only the first member path is taken; the loop below
			# re-reads the directory until no member is left.
			dep="$(ls /sys/block/${dm_name}/slaves | awk '{print $1; exit}')"
			while [[ -n $dep ]]; do
				#check if disk is online
				if lsscsi | grep -qF "$dep"; then
					dep_dir="/sys/block/${dm_name}"
					dep_dir+="/slaves/${dep}/device"
					ss="${dep_dir}/state"
					sd="${dep_dir}/delete"
					log_must eval "echo 'offline' > ${ss}"
					log_must eval "echo '1' > ${sd}"
					if lsscsi | grep -qF "$dep"; then
						log_fail "Offlining $disk failed"
					fi
				fi
				dep="$(ls /sys/block/$dm_name/slaves 2>/dev/null | awk '{print $1; exit}')"
			done
		elif [[ $state == "offline" ]] && ( is_real_device $disk ); then
			#check if disk is online
			if lsscsi | grep -qF "$disk"; then
				dev_state="/sys/block/$disk/device/state"
				dev_delete="/sys/block/$disk/device/delete"
				log_must eval "echo 'offline' > ${dev_state}"
				log_must eval "echo '1' > ${dev_delete}"
				if lsscsi | grep -qF "$disk"; then
					log_fail "Offlining $disk failed"
				fi
			else
				log_note "$disk is already offline"
			fi
		elif [[ $state == "online" ]]; then
			#force a full rescan
			scan_scsi_hosts $host
			block_device_wait
			if is_mpath_device $disk; then
				dm_name="$(readlink $DEV_DSKDIR/$disk | cut -d/ -f2)"
				dep="$(ls /sys/block/$dm_name/slaves | awk '{print $1; exit}')"
				# Onlining worked only if a member path exists
				# and lsscsi lists it again.
				if [[ -z $dep ]] || ! lsscsi | grep -qF "$dep"; then
					log_fail "Onlining $disk failed"
				fi
			elif is_real_device $disk; then
				block_device_wait
				typeset -i retries=0
				# lsscsi may lag behind the rescan; retry briefly.
				while ! lsscsi | grep -qF "$disk"; do
					if (( $retries > 2 )); then
						log_fail "Onlining $disk failed"
						break
					fi
					(( ++retries ))
					sleep 1
				done
			else
				log_fail "$disk is not a real dev"
			fi
		else
			log_fail "$disk failed to $state"
		fi
	fi
}
382
#
# Simulate disk removal.
#
# $1 disk to take offline
#
# The disk is offlined through on_off_disk, then block_device_wait is
# called.
#
function remove_disk #disk
{
	on_off_disk $1 "offline"
	block_device_wait
}
392
#
# Simulate disk insertion for the given SCSI host.
#
# $1 disk to bring online
# $2 SCSI host number, passed through to on_off_disk (may be omitted)
#
# The disk is onlined through on_off_disk, then block_device_wait is
# called.
#
function insert_disk #disk scsi_host
{
	on_off_disk $1 "online" $2
	block_device_wait
}
403
#
# Load scsi_debug module with specified parameters
#
# $1 dev_size_mb  value for the module's dev_size_mb parameter
# $2 add_host     value for add_host
# $3 num_tgts     value for num_tgts
# $4 max_luns     value for max_luns
# $5 blksz        one of: < 512b | 512e | 4Kn >, translated into the
#                 sector_size / physblk_exp module parameters
#
# All five arguments are required.  The module is only loaded on Linux:
# the test is reported as unsupported when the module is not available,
# and fails when the module is already loaded or no scsi_debug device
# shows up in lsscsi afterwards.
#
function load_scsi_debug # dev_size_mb add_host num_tgts max_luns blksz
{
	typeset devsize=$1
	typeset hosts=$2
	typeset tgts=$3
	typeset luns=$4
	typeset blksz=$5

	if [[ -z $devsize || -z $hosts || -z $tgts || -z $luns || \
	    -z $blksz ]]; then
		log_fail "Arguments invalid or missing"
	fi

	# Map the block size keyword onto sector size and physical block
	# exponent.
	case "$5" in
	'512b')
		typeset sector=512 blkexp=0
		;;
	'512e')
		typeset sector=512 blkexp=3
		;;
	'4Kn')
		typeset sector=4096 blkexp=0
		;;
	*)
		log_fail "Unsupported blksz value: $5"
		;;
	esac

	is_linux || return 0

	# Dry run first to find out whether the module exists at all.
	modprobe -n scsi_debug ||
		log_unsupported "Platform does not have scsi_debug module"

	if lsmod | grep -q scsi_debug; then
		log_fail "scsi_debug module already installed"
	else
		log_must modprobe scsi_debug dev_size_mb=$devsize \
		    add_host=$hosts num_tgts=$tgts max_luns=$luns \
		    sector_size=$sector physblk_exp=$blkexp
		block_device_wait
		lsscsi | grep -q scsi_debug ||
			log_fail "scsi_debug module install failed"
	fi
}
452
#
# Unload scsi_debug module, if needed.
#
# Runs "modprobe -r scsi_debug" through log_must_retry with the arguments
# "in use" and 5 (presumably the message to retry on and the number of
# attempts; confirm against log_must_retry's definition).
#
function unload_scsi_debug
{
	log_must_retry "in use" 5 \
	    modprobe -r scsi_debug
}
460
#
# Get scsi_debug device name.
# Prints the basename of the scsi_debug device (for example "sdb"), taken
# from the sixth column of the first lsscsi line mentioning scsi_debug.
#
# While lsscsi still shows "-" in that column the lookup is retried once
# per second, up to ten times; whatever was read last is printed.
#
function get_debug_device
{
	# Declared local so the retry loop cannot clobber a caller's
	# "i" or "val".
	typeset i val

	for i in {1..10} ; do
		val=$(lsscsi | awk '/scsi_debug/ {print $6; exit}' | cut -d/ -f3)

		# lsscsi can take time to settle
		if [ "$val" != "-" ] ; then
			break
		fi
		sleep 1
	done
	echo "$val"
}
478
#
# Get actual devices used by the pool (i.e. linux sdb1 not sdb).
#
# $1 pool name
# $2 device directory; entries of "zpool status -P" whose first column
#    matches it are printed with the "<devdir>/" part removed
#
# The names are written on one line, each followed by a space.  Nothing
# is printed unless $UNAME is Linux or FreeBSD.
#
function get_pool_devices #testpool #devdir
{
	typeset pool=$1
	typeset dir=$2

	if [[ $UNAME == Linux || $UNAME == FreeBSD ]]; then
		zpool status -P $pool | awk -v d="$dir" '$1 ~ d {sub(d "/", ""); printf("%s ", $1)}'
	fi
}
494
#
# Write to standard out giving the level, device name, offset and length
# of all blocks in an input file. The offset and length are in units of
# 512 byte blocks. In the case of mirrored vdevs, only the first
# device is listed, as the levels, blocks and offsets will be the same
# on other devices. Note that this function only works with mirrored
# or non-redundant pools, not raidz.
#
# $1 path of the file to inspect; it must be a regular file
#
# Each output line has the form "<level> <path> <offset> <length>".
#
# The output of this function can be used to introduce corruption at
# varying levels of indirection.
#
function list_file_blocks # input_file
{
	typeset input_file=$1

	[[ -f $input_file ]] || log_fail "Couldn't find $input_file"

	typeset ds="$(zfs list -H -o name "$input_file")"
	typeset pool="${ds%%/*}"
	typeset objnum="$(get_objnum "$input_file")"

	#
	# Establish a mapping between vdev ids as shown in a DVA and the
	# pathnames they correspond to in ${VDEV_MAP[][]}.
	#
	# The vdev bits in a DVA refer to the top level vdev id.
	# ${VDEV_MAP[$id]} is an array of the vdev paths within that vdev.
	#
	eval $(zdb -C "$pool" | awk '
	    BEGIN { printf "typeset -a VDEV_MAP;" }
	    function subscript(s) {
	        # "[#]" is more convenient than the bare "#"
	        match(s, /\[[0-9]*\]/)
		return substr(s, RSTART, RLENGTH)
	    }
	    id && !/^                / {
	        # left a top level vdev
	        id = 0
	    }
	    id && $1 ~ /^path:$/ {
	        # found a vdev path; save it in the map
	        printf "VDEV_MAP%s%s=%s;", id, child, $2
	    }
	    /^            children/ {
	        # entering a top level vdev
	        id = subscript($0)
		child = "[0]" # default in case there is no nested vdev
		printf "typeset -a VDEV_MAP%s;", id
	    }
	    /^                children/ {
	        # entering a nested vdev (e.g. child of a top level mirror)
	        child = subscript($0)
	    }
	')

	#
	# The awk below parses the output of zdb, printing out the level
	# of each block along with vdev id, offset and length. The last
	# two are converted to decimal in the while loop. 4M is added to
	# the offset to compensate for the first two labels and boot
	# block. Lastly, the offset and length are printed in units of
	# 512B blocks for ease of use with dd.
	#
	typeset level vdev path offset length
	# Kept local so the helper name does not leak into (or overwrite a
	# variable of) the caller. It is expanded unquoted on purpose so
	# that "awk -n" splits into command and option.
	typeset AWK
	if awk -n '' 2>/dev/null; then
		# gawk needs -n to decode hex
		AWK='awk -n'
	else
		AWK='awk'
	fi
	sync_all_pools true
	zdb -dddddd "$ds" "$objnum" | $AWK -v pad=$((4<<20)) -v bs=512 '
	    /^$/ { looking = 0 }
	    looking {
	        level = $2
	        field = 3
	        while (split($field, dva, ":") == 3) {
	            # top level vdev id
	            vdev = int(dva[1])
	            # offset + 4M label/boot pad in 512B blocks
	            offset = (int("0x"dva[2]) + pad) / bs
		    # length in 512B blocks
		    len = int("0x"dva[3]) / bs

	            print level, vdev, offset, len

	            ++field
	        }
	    }
	    /^Indirect blocks:/ { looking = 1 }
	' | \
	while read level vdev offset length; do
		for path in ${VDEV_MAP[$vdev][@]}; do
			echo "$level $path $offset $length"
		done
	done 2>/dev/null
}
592
#
# Overwrite every block of a file at one level of indirection with
# random data, writing directly to the underlying vdev paths.
#
# $1 input file; it must be a regular file
# $2 level of indirection to corrupt (default 0, i.e. "L0")
#
# The blocks are located with list_file_blocks, so the same pool layout
# restrictions apply.
#
function corrupt_blocks_at_level # input_file corrupt_level
{
	typeset input_file=$1
	typeset corrupt_level="L${2:-0}"
	typeset level path offset length
	# Saved sysctl value; local so it does not leak into the caller.
	typeset debugflags

	[[ -f $input_file ]] || log_fail "Couldn't find $input_file"

	if is_freebsd; then
		# Temporarily allow corrupting an inuse device.
		debugflags=$(sysctl -n kern.geom.debugflags)
		sysctl kern.geom.debugflags=16
	fi

	list_file_blocks "$input_file" | \
	while read level path offset length; do
		if [[ $level = $corrupt_level ]]; then
			log_must dd if=/dev/urandom of=$path bs=512 \
			    count=$length seek=$offset conv=notrunc
		fi
	done

	if is_freebsd; then
		# Put the previous setting back.
		sysctl kern.geom.debugflags=$debugflags
	fi

	# This is necessary for pools made of loop devices.
	sync
}
622
#
# Overwrite 32 bytes of one vdev label with random data.  Going by the
# function name these bytes hold the label's checksum.
#
# $1 label number, used as index (0-3) into the offset table below
# $2 path of the vdev to write to
#
# Labels are taken to be 256K each, two at the start of the vdev and two
# at its end; the vdev size is obtained from stat_size.
#
function corrupt_label_checksum # label_number vdev_path
{
	typeset label_size=$((256*1024))
	typeset vdev_size=$(stat_size ${2})
	# Distance of the target bytes from the start of a leading label,
	# and from the end of a trailing label.
	typeset from_start=$((128*1024 - 32))
	typeset from_end=$((128*1024 + 32))
	typeset -a offsets

	offsets[0]="$from_start"
	offsets[1]="$(($label_size + from_start))"
	offsets[2]="$(($vdev_size - $label_size - from_end))"
	offsets[3]="$(($vdev_size - from_end))"

	dd if=/dev/urandom of=${2} seek=${offsets[$1]} bs=1 count=32 \
	    conv=notrunc
}
635