1#
2# This file and its contents are supplied under the terms of the
3# Common Development and Distribution License ("CDDL"), version 1.0.
4# You may only use this file in accordance with the terms of version
5# 1.0 of the CDDL.
6#
7# A full copy of the text of the CDDL should have accompanied this
8# source.  A copy of the CDDL is also available via the Internet at
9# http://www.illumos.org/license/CDDL.
10#
11
12#
13# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
14# Use is subject to license terms.
15# Copyright (c) 2012, 2019 by Delphix. All rights reserved.
16# Copyright 2016 Nexenta Systems, Inc.
17# Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved.
18# Copyright (c) 2017 Lawrence Livermore National Security, LLC.
19# Copyright (c) 2017 Datto Inc.
20# Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
21# Copyright 2019 Richard Elling
22#
23
24#
25# Returns SCSI host number for the given disk
26#
function get_scsi_host #disk
{
	typeset blkdev=$1
	typeset sysfs_dir="/sys/block/${blkdev}/device/scsi_device"

	# The listing is colon-separated; print only the first field,
	# which this library treats as the SCSI host number.
	ls $sysfs_dir | awk -F : '{print $1}'
}
32
33#
34# Cause a scan of all scsi host adapters by default
35#
36# $1 optional host number
37#
function scan_scsi_hosts
{
	typeset hostnum=${1}
	# Loop variable is local so that a caller's own "host" variable
	# is never overwritten by the scan below.
	typeset host

	if is_linux; then
		if [[ -z $hostnum ]]; then
			# No host given: request a rescan on every SCSI host.
			for host in /sys/class/scsi_host/host*; do
				log_must eval "echo '- - -' > $host/scan"
			done
		else
			# This first call only echoes the path (to /dev/null);
			# it is kept so the logged command sequence is
			# unchanged.
			log_must eval \
			    "echo /sys/class/scsi_host/host$hostnum/scan" \
			    > /dev/null
			# Writing '- - -' to the scan file triggers the rescan
			# of the requested host only.
			log_must eval \
			    "echo '- - -' > /sys/class/scsi_host/host$hostnum/scan"
		fi
	fi
}
56
57#
58# Wait for newly created block devices to have their minors created.
59# Additional arguments can be passed to udevadm trigger, with the expected
60# arguments to typically be a block device pathname. This is useful when
# waiting on a specific device to settle rather than triggering
62# all devices and waiting for them all to settle.
63#
64# The udevadm settle timeout can be 120 or 180 seconds by default for
65# some distros. If a long delay is experienced, it could be due to some
66# strangeness in a malfunctioning device that isn't related to the devices
67# under test. To help debug this condition, a notice is given if settle takes
68# too long.
69#
70# Note: there is no meaningful return code if udevadm fails. Consumers
71# should not expect a return code (do not call as argument to log_must)
72#
function block_device_wait
{
	if is_linux; then
		# Arguments are passed through to udevadm trigger as-is.
		udevadm trigger $* 2>/dev/null
		typeset start=$SECONDS
		udevadm settle
		typeset elapsed=$((SECONDS - start))
		# Compare numerically. Inside [[ ]], ">" is a string
		# comparison, which would report "7" as larger than "60"
		# and "100" as smaller.
		(( elapsed > 60 )) && \
		    log_note "udevadm settle time too long: $elapsed"
	elif is_freebsd; then
		if [[ ${#@} -eq 0 ]]; then
			# Do something that has to go through the geom event
			# queue to complete.
			sysctl kern.geom.conftxt >/dev/null
			return
		fi
	fi
	# Poll for the given paths to appear, but give up eventually.
	# With no arguments the first pass finds nothing missing and
	# the loop exits immediately.
	typeset -i i
	for (( i = 0; i < 5; ++i )); do
		typeset missing=false
		typeset dev
		for dev in "${@}"; do
			if ! [[ -e $dev ]]; then
				missing=true
				break
			fi
		done
		if ! $missing; then
			break
		fi
		# The pause between polls is the number of arguments,
		# in seconds.
		sleep ${#@}
	done
}
107
108#
# Check if the given device is a physical device
110#
function is_physical_device #device
{
	# Reduce the argument to a bare name by dropping a leading
	# $DEV_DSKDIR/ or $DEV_RDSKDIR/ prefix.
	typeset name=${1#$DEV_DSKDIR/}
	name=${name#$DEV_RDSKDIR/}

	if is_linux; then
		# Must be a disk node, and the loop module's max_part
		# parameter file must exist.
		is_disk_device "$DEV_DSKDIR/$name" || return
		[[ -f /sys/module/loop/parameters/max_part ]]
		return
	fi

	if is_freebsd; then
		# Must be a disk node whose name matches one of the
		# listed driver name patterns.
		is_disk_device "$DEV_DSKDIR/$name" || return
		echo $name | grep -E -q \
		    -e '^a?da[0-9]+$' \
		    -e '^md[0-9]+$' \
		    -e '^mfid[0-9]+$' \
		    -e '^nda[0-9]+$' \
		    -e '^nvd[0-9]+$' \
		    -e '^vtbd[0-9]+$'
		return
	fi

	# Any other platform: decide purely on a cXtYdZ-style name.
	echo $name | grep -E "^c[0-F]+([td][0-F]+)+$" > /dev/null 2>&1
}
135
136#
137# Check if the given device is a real device (ie SCSI device)
138#
function is_real_device #disk
{
	typeset name=$1

	if [[ -z $name ]]; then
		log_fail "No argument for disk given."
	fi

	# Only Linux is probed; elsewhere the function reports success.
	is_linux || return 0

	# Succeeds when the lsblk TYPE column contains "disk".
	lsblk $DEV_RDSKDIR/$name -o TYPE 2>/dev/null | \
	    grep -E disk >/dev/null
}
150
151#
152# Check if the given device is a loop device
153#
function is_loop_device #disk
{
	typeset name=$1

	if [[ -z $name ]]; then
		log_fail "No argument for disk given."
	fi

	# Only Linux is probed; elsewhere the function reports success.
	is_linux || return 0

	# Succeeds when the lsblk TYPE column contains "loop".
	lsblk $DEV_RDSKDIR/$name -o TYPE 2>/dev/null | \
	    grep -E loop >/dev/null
}
165
166#
167# Linux:
168# Check if the given device is a multipath device and if there is a symbolic
169# link to a device mapper and to a disk
170# Currently no support for dm devices alone without multipath
171#
172# FreeBSD:
173# Check if the given device is a gmultipath device.
174#
175# Others:
176# No multipath detection.
177#
function is_mpath_device #disk
{
	typeset name=$1

	if [[ -z $name ]]; then
		log_fail "No argument for disk given."
	fi

	if is_linux; then
		# lsblk must report an "mpath" type for the node ...
		if lsblk $DEV_MPATHDIR/$name -o TYPE 2>/dev/null | \
		    grep -E mpath >/dev/null; then
			# ... and the node must resolve as a symbolic link.
			readlink $DEV_MPATHDIR/$name > /dev/null 2>&1
			return
		fi
		return 1
	fi

	if is_freebsd; then
		is_disk_device $DEV_MPATHDIR/$name
		return
	fi

	# No multipath detection on other platforms.
	return 1
}
198
199#
200# Check if the given path is the appropriate sort of device special node.
201#
function is_disk_device #path
{
	typeset path=$1

	# The path is quoted so that an empty argument is passed to test
	# as an empty operand. Unquoted, "test -b" with no operand would
	# see a single non-empty string and succeed.
	if is_freebsd; then
		# FreeBSD doesn't have block devices, only character devices.
		test -c "$path"
	else
		test -b "$path"
	fi
}
213
214# Set the slice prefix for disk partitioning depending
215# on whether the device is a real, multipath, or loop device.
216# Currently all disks have to be of the same type, so only
217# checks first disk to determine slice prefix.
218#
function set_slice_prefix
{
	typeset disk
	typeset -i n=0

	# Nothing to decide on non-Linux platforms.
	is_linux || return 0

	while (( n < $DISK_ARRAY_NUM )); do
		# "i" below is an (unset) awk variable, not the shell
		# counter, so this always selects the first word of $DISKS.
		disk="$(echo $DISKS | nawk '{print $(i + 1)}')"

		# Multipath device whose 18th character is not a digit:
		# no prefix.
		if ( is_mpath_device $disk ) && [[ -z $(echo $disk | \
		    awk 'substr($1,18,1) ~ /^[[:digit:]]+$/') ]]; then
			export SLICE_PREFIX=""
			return 0
		fi

		# Real disk: no prefix.
		if ( is_real_device $disk ); then
			export SLICE_PREFIX=""
			return 0
		fi

		# Remaining multipath devices and loop devices use "p".
		if ( is_mpath_device $disk || is_loop_device $disk ); then
			export SLICE_PREFIX="p"
			return 0
		fi

		log_fail "$disk not supported for partitioning."
		(( n = n + 1 ))
	done
}
242
243#
244# Set the directory path of the listed devices in $DISK_ARRAY_NUM
245# Currently all disks have to be of the same type, so only
246# checks first disk to determine device directory
247# default = /dev (linux)
248# real disk = /dev (linux)
249# multipath device = /dev/mapper (linux)
250#
function set_device_dir
{
	typeset disk
	typeset -i n=0

	if ! is_linux; then
		export DEV_DSKDIR=$DEV_RDSKDIR
		return 0
	fi

	# The body returns on its first pass, so only the first disk is
	# examined; with $DISK_ARRAY_NUM at zero nothing is changed.
	while (( n < $DISK_ARRAY_NUM )); do
		# "i" below is an (unset) awk variable, so this always
		# selects the first word of $DISKS.
		disk="$(echo $DISKS | nawk '{print $(i + 1)}')"

		typeset chosen=$DEV_RDSKDIR
		if is_mpath_device $disk; then
			chosen=$DEV_MPATHDIR
		fi
		export DEV_DSKDIR=$chosen
		return 0
	done
}
272
273#
274# Get the directory path of given device
275#
function get_device_dir #device
{
	typeset dev=$1

	# On FreeBSD, and for physical devices elsewhere, the answer is
	# always the configured disk directory.
	if is_freebsd || is_physical_device $dev; then
		echo "$DEV_DSKDIR"
		return
	fi

	# Otherwise drop the last path component (except for "/") ...
	[[ $dev != "/" ]] && dev=${dev%/*}

	# ... unless the result names a disk node under $DEV_DSKDIR.
	if is_disk_device "$DEV_DSKDIR/$dev"; then
		dev="$DEV_DSKDIR"
	fi
	echo $dev
}
292
293#
294# Get persistent name for given disk
295#
function get_persistent_disk_name #device
{
	typeset device=$1
	typeset dev_id
	typeset link_pattern

	# Persistent names are only looked up on Linux.
	if ! is_linux; then
		echo $device
		return
	fi

	# Pick which udev link to look for; anything that is neither a
	# real nor a multipath device is echoed back unchanged.
	if is_real_device $device; then
		link_pattern="disk/by-id"
	elif is_mpath_device $device; then
		link_pattern="disk/by-id/dm-uuid"
	else
		echo $device
		return
	fi

	# Take the second field of the first matching udevadm line and
	# print the third "/"-separated component of it.
	dev_id="$(udevadm info -q all -n $DEV_DSKDIR/$device \
	    | grep -E $link_pattern \
	    | nawk '{print $2; exit}' \
	    | nawk -F / '{print $3}')"
	echo $dev_id
}
320
321#
322# Online or offline a disk on the system
323#
324# First checks state of disk. Test will fail if disk is not properly onlined
325# or offlined. Online is a full rescan of SCSI disks by echoing to every
326# host entry.
327#
function on_off_disk # disk state{online,offline} host
{
	typeset disk=$1
	typeset state=$2
	typeset host=$3
	# Scratch variables are declared local so that nothing leaks
	# into (or overwrites) the caller's namespace.
	typeset dm_name dep dep_dir ss sd dev_state dev_delete

	[[ -z $disk ]] || [[ -z $state ]] &&  \
	    log_fail "Arguments invalid or missing"

	# Only Linux is handled; other platforms fall through silently.
	if is_linux; then
		if [[ $state == "offline" ]] && ( is_mpath_device $disk ); then
			# Resolve the dm node behind the multipath link, then
			# offline its slave devices one at a time. Only the
			# first listed slave is taken on each pass; a
			# multi-line value would otherwise be word-split into
			# a pattern plus file names for grep.
			dm_name="$(readlink $DEV_DSKDIR/$disk \
			    | nawk -F / '{print $2}')"
			dep="$(ls /sys/block/${dm_name}/slaves \
			    | nawk '{print $1; exit}')"
			while [[ -n $dep ]]; do
				#check if disk is online
				if lsscsi | grep -E "$dep" > /dev/null; then
					dep_dir="/sys/block/${dm_name}"
					dep_dir+="/slaves/${dep}/device"
					ss="${dep_dir}/state"
					sd="${dep_dir}/delete"
					log_must eval "echo 'offline' > ${ss}"
					log_must eval "echo '1' > ${sd}"
					# The path must be gone from lsscsi now.
					if lsscsi | grep -E "$dep" \
					    > /dev/null; then
						log_fail "Offlining" \
						    "$disk failed"
					fi
				fi
				dep="$(ls /sys/block/$dm_name/slaves \
				    2>/dev/null | nawk '{print $1; exit}')"
			done
		elif [[ $state == "offline" ]] && ( is_real_device $disk ); then
			#check if disk is online
			if lsscsi | grep -E "$disk" > /dev/null; then
				dev_state="/sys/block/$disk/device/state"
				dev_delete="/sys/block/$disk/device/delete"
				log_must eval "echo 'offline' > ${dev_state}"
				log_must eval "echo '1' > ${dev_delete}"
				# The disk must be gone from lsscsi now.
				if lsscsi | grep -E "$disk" > /dev/null; then
					log_fail "Offlining $disk" \
					    "failed"
				fi
			else
				log_note "$disk is already offline"
			fi
		elif [[ $state == "online" ]]; then
			#force a full rescan
			scan_scsi_hosts $host
			block_device_wait
			if is_mpath_device $disk; then
				dm_name="$(readlink $DEV_DSKDIR/$disk \
				    | nawk -F / '{print $2}')"
				dep="$(ls /sys/block/$dm_name/slaves \
				    | nawk '{print $1; exit}')"
				# An empty slave list counts as a failure; an
				# empty pattern would match every lsscsi line.
				if [[ -z $dep ]] || \
				    ! lsscsi | grep -E "$dep" > /dev/null; then
					log_fail "Onlining $disk failed"
				fi
			elif is_real_device $disk; then
				block_device_wait
				# Poll lsscsi, sleeping one second between
				# attempts, before declaring failure.
				typeset -i retries=0
				while ! lsscsi | grep -E -q "$disk"; do
					if (( retries > 2 )); then
						log_fail "Onlining $disk failed"
						break
					fi
					(( ++retries ))
					sleep 1
				done
			else
				log_fail "$disk is not a real dev"
			fi
		else
			log_fail "$disk failed to $state"
		fi
	fi
}
410
411#
412# Simulate disk removal
413#
function remove_disk #disk
{
	typeset target=$1

	# Offline the disk, then wait for block device events to settle.
	on_off_disk $target "offline"
	block_device_wait
}
420
421#
422# Simulate disk insertion for the given SCSI host
423#
function insert_disk #disk scsi_host
{
	typeset target=$1
	typeset hostnum=$2

	# Online the disk (the host number is passed along only when
	# given), then wait for block device events to settle.
	on_off_disk $target "online" $hostnum
	block_device_wait
}
431
432#
433# Load scsi_debug module with specified parameters
434# $blksz can be either one of: < 512b | 512e | 4Kn >
435#
function load_scsi_debug # dev_size_mb add_host num_tgts max_luns blksz
{
	typeset devsize=$1
	typeset hosts=$2
	typeset tgts=$3
	typeset luns=$4
	typeset blksz=$5
	typeset sector blkexp

	# All five arguments are mandatory.
	[[ -z $devsize ]] || [[ -z $hosts ]] || [[ -z $tgts ]] || \
	    [[ -z $luns ]] || [[ -z $blksz ]] && \
	    log_fail "Arguments invalid or missing"

	# Translate the block size keyword into the sector_size and
	# physblk_exp module parameters.
	case "$blksz" in
		'512b')
			sector=512
			blkexp=0
		;;
		'512e')
			sector=512
			blkexp=3
		;;
		'4Kn')
			sector=4096
			blkexp=0
		;;
		*) log_fail "Unsupported blksz value: $blksz" ;;
	esac

	if is_linux; then
		# Dry run: is the module available at all?
		if ! modprobe -n scsi_debug; then
			# The continuation keeps "module" part of the message
			# instead of a separate (bogus) command.
			log_unsupported "Platform does not have scsi_debug" \
			    "module"
		fi
		if lsmod | grep -E scsi_debug > /dev/null; then
			log_fail "scsi_debug module already installed"
		else
			log_must modprobe scsi_debug dev_size_mb=$devsize \
			    add_host=$hosts num_tgts=$tgts max_luns=$luns \
			    sector_size=$sector physblk_exp=$blkexp
			block_device_wait
			# Only a clean "no match" from grep counts as failure.
			lsscsi | grep -E scsi_debug > /dev/null
			if (($? == 1)); then
				log_fail "scsi_debug module install failed"
			fi
		fi
	fi
}
485
486#
487# Unload scsi_debug module, if needed.
488#
function unload_scsi_debug
{
	# Arguments handed to log_must_retry ahead of the command itself.
	typeset busy_msg="in use"
	typeset -i attempts=5

	log_must_retry "$busy_msg" $attempts modprobe -r scsi_debug
}
493
494#
495# Get scsi_debug device name.
496# Returns basename of scsi_debug device (for example "sdb").
497#
function get_debug_device
{
	# Both variables are local so a caller's "i" or "val" is left
	# untouched.
	typeset -i tries
	typeset val

	for (( tries = 0; tries < 10; ++tries )); do
		# Sixth column of the first scsi_debug line, reduced to the
		# third "/"-separated field (e.g. /dev/sdb -> sdb).
		val=$(lsscsi | nawk '/scsi_debug/ {print $6; exit}' | cut -d / -f3)

		# lsscsi can take time to settle; a bare "-" means the
		# device name is not filled in yet.
		if [ "$val" != "-" ] ; then
			break
		fi
		sleep 1
	done
	echo "$val"
}
511
512#
513# Get actual devices used by the pool (i.e. linux sdb1 not sdb).
514#
function get_pool_devices #testpool #devdir
{
	typeset pool=$1
	typeset dir=$2
	typeset devs=""

	# Only Linux and FreeBSD are queried; elsewhere an empty line
	# is printed.
	if ! is_linux && ! is_freebsd; then
		echo $devs
		return
	fi

	# First column of every status line that mentions the directory.
	devs=$(zpool status -P $pool | grep ${dir} | awk '{print $1}')

	# Strip the directory prefix and flatten onto a single line.
	devs=$(echo $devs | sed -e "s|${dir}/||g" | tr '\n' ' ')
	echo $devs
}
527
528#
529# Write to standard out giving the level, device name, offset and length
530# of all blocks in an input file. The offset and length are in units of
531# 512 byte blocks. In the case of mirrored vdevs, only the first
532# device is listed, as the levels, blocks and offsets will be the same
533# on other devices. Note that this function only works with mirrored
534# or non-redundant pools, not raidz.
535#
536# The output of this function can be used to introduce corruption at
537# varying levels of indirection.
538#
function list_file_blocks # input_file
{
	typeset input_file=$1

	[[ -f $input_file ]] || log_fail "Couldn't find $input_file"

	# Dataset holding the file, the pool it belongs to (everything
	# before the first "/"), and the file's object number.
	typeset ds="$(zfs list -H -o name $input_file)"
	typeset pool="${ds%%/*}"
	typeset objnum="$(get_objnum $input_file)"

	#
	# Establish a mapping between vdev ids as shown in a DVA and the
	# pathnames they correspond to in ${VDEV_MAP[][]}.
	#
	# The vdev bits in a DVA refer to the top level vdev id.
	# ${VDEV_MAP[$id]} is an array of the vdev paths within that vdev.
	#
	# The awk program emits shell statements, which are then eval'd
	# in this function's scope.
	#
	eval $(zdb -C $pool | awk '
	    BEGIN { printf "typeset -a VDEV_MAP;" }
	    function subscript(s) {
	        # "[#]" is more convenient than the bare "#"
	        match(s, /\[[0-9]*\]/)
		return substr(s, RSTART, RLENGTH)
	    }
	    id && !/^                / {
	        # left a top level vdev
	        id = 0
	    }
	    id && $1 ~ /^path:$/ {
	        # found a vdev path; save it in the map
	        printf "VDEV_MAP%s%s=%s;", id, child, $2
	    }
	    /^            children/ {
	        # entering a top level vdev
	        id = subscript($0)
		child = "[0]" # default in case there is no nested vdev
		printf "typeset -a VDEV_MAP%s;", id
	    }
	    /^                children/ {
	        # entering a nested vdev (e.g. child of a top level mirror)
	        child = subscript($0)
	    }
	')

	#
	# The awk below parses the output of zdb, printing out the level
	# of each block along with vdev id, offset and length. The last
	# two are converted to decimal in the while loop. 4M is added to
	# the offset to compensate for the first two labels and boot
	# block. Lastly, the offset and length are printed in units of
	# 512B blocks for ease of use with dd.
	#
	typeset level vdev path offset length
	# The awk command line is kept in a local variable so that any
	# AWK setting of the caller or environment is not overwritten.
	typeset awk_cmd
	if awk -n '' 2>/dev/null; then
		# gawk needs -n to decode hex
		awk_cmd='awk -n'
	else
		awk_cmd='awk'
	fi
	sync_all_pools true
	# $awk_cmd is deliberately unquoted: it may hold "awk -n".
	zdb -dddddd $ds $objnum | $awk_cmd -v pad=$((4<<20)) -v bs=512 '
	    /^$/ { looking = 0 }
	    looking {
	        level = $2
	        field = 3
	        while (split($field, dva, ":") == 3) {
	            # top level vdev id
	            vdev = int(dva[1])
	            # offset + 4M label/boot pad in 512B blocks
	            offset = (int("0x"dva[2]) + pad) / bs
		    # length in 512B blocks
		    len = int("0x"dva[3]) / bs

	            print level, vdev, offset, len

	            ++field
	        }
	    }
	    /^Indirect blocks:/ { looking = 1 }
	' | \
	while read level vdev offset length; do
		# One output line per path recorded for this top level vdev.
		for path in ${VDEV_MAP[$vdev][@]}; do
			echo "$level $path $offset $length"
		done
	done 2>/dev/null
}
625
#
# Overwrite every block of the given file that sits at the requested
# indirection level with random data.
#
# $1 input file (must exist)
# $2 level number; defaults to 0 when omitted
#
function corrupt_blocks_at_level # input_file corrupt_level
{
	typeset input_file=$1
	typeset corrupt_level="L${2:-0}"
	typeset level path offset length
	# Saved sysctl value; local so that it does not leak to callers.
	typeset debugflags

	[[ -f $input_file ]] || log_fail "Couldn't find $input_file"

	if is_freebsd; then
		# Temporarily allow corrupting an inuse device.
		debugflags=$(sysctl -n kern.geom.debugflags)
		sysctl kern.geom.debugflags=16
	fi

	# list_file_blocks prints "level path offset length" lines with
	# offset and length in 512-byte units, matching bs=512 below.
	list_file_blocks $input_file | \
	while read level path offset length; do
		if [[ $level = $corrupt_level ]]; then
			log_must dd if=/dev/urandom of=$path bs=512 \
			    count=$length seek=$offset conv=notrunc
		fi
	done

	if is_freebsd; then
		# Put back the value saved above.
		sysctl kern.geom.debugflags=$debugflags
	fi

	# This is necessary for pools made of loop devices.
	sync
}
655
#
# Overwrite 32 bytes of the given vdev with random data, at a byte
# offset selected by the label number.
#
# $1 label number, used as an index into the four offsets below
# $2 vdev path
#
function corrupt_label_checksum # label_number vdev_path
{
	typeset label_size=$((256*1024))
	typeset vdev_size=$(stat_size ${2})
	typeset -a offsets

	# Two offsets are measured from the start of the vdev and two
	# from its end.
	offsets[0]="$((128*1024 - 32))"
	offsets[1]="$(($label_size + (128*1024 - 32)))"
	offsets[2]="$(($vdev_size - $label_size - (128*1024 + 32)))"
	offsets[3]="$(($vdev_size - (128*1024 + 32)))"

	# bs=1 makes seek a byte offset; notrunc leaves the rest intact.
	dd if=/dev/urandom of=${2} seek=${offsets[$1]} bs=1 count=32 \
	    conv=notrunc
}
668