1#!/usr/local/bin/bash
2#
3#       Shared Disk File EXclusiveness (SF-EX) OCF RA.
#       Prevents destruction of data on a shared disk file system
#       due to split-brain.
6#
7# This program is free software; you can redistribute it and/or
8# modify it under the terms of the GNU General Public License
9# as published by the Free Software Foundation; either version 2
10# of the License, or (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
20# 02110-1301, USA.
21#
22# Copyright (c) 2007 NIPPON TELEGRAPH AND TELEPHONE CORPORATION
23#
24# NOTE:
25#	As a prerequisite for running SF-EX, one device should be
26#	initialized as below.
27#
28#		sfex_init [-n <numlocks>] <device>
29#
30#	Example:
31#
32#		/usr/sbin/sfex_init -n 10 /dev/sdb1
33#
#	If further information is necessary, see README.
35#
36#######################################################################
37# Initialization:
38
# Locate and load the OCF shell function library.
# OCF_FUNCTIONS_DIR is honoured when the caller already set it; otherwise it
# falls back to ${OCF_ROOT}/lib/heartbeat.  Helpers used below but not
# defined in this file (e.g. ocf_log) are presumably provided by this library.
# Both expansions are quoted so a directory containing whitespace or glob
# characters is still treated as a single path.
: "${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}"
. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"
42
# Instance parameter defaults.
#
# Each parameter gets its built-in default only when it is completely unset
# in the environment; a value that is set (even to the empty string) is left
# untouched.  The *_default variables are also printed by meta_data.

OCF_RESKEY_device_default=""
[ -n "${OCF_RESKEY_device+set}" ] || OCF_RESKEY_device=$OCF_RESKEY_device_default

OCF_RESKEY_index_default="1"
[ -n "${OCF_RESKEY_index+set}" ] || OCF_RESKEY_index=$OCF_RESKEY_index_default

OCF_RESKEY_collision_timeout_default="1"
[ -n "${OCF_RESKEY_collision_timeout+set}" ] || OCF_RESKEY_collision_timeout=$OCF_RESKEY_collision_timeout_default

OCF_RESKEY_monitor_interval_default="10"
[ -n "${OCF_RESKEY_monitor_interval+set}" ] || OCF_RESKEY_monitor_interval=$OCF_RESKEY_monitor_interval_default

OCF_RESKEY_lock_timeout_default="100"
[ -n "${OCF_RESKEY_lock_timeout+set}" ] || OCF_RESKEY_lock_timeout=$OCF_RESKEY_lock_timeout_default
56
57#######################################################################
58
# Path of the sfex_daemon executable.  HA_BIN is not defined in this file;
# presumably it is set by the sourced ocf-shellfuncs library.
SFEX_DAEMON="${HA_BIN}/sfex_daemon"
60
# Print a one-line usage summary on stdout.
# The list names every action the dispatcher at the bottom of this script
# accepts, including validate-all and usage.
usage() {
    cat <<END
    usage: $0 {start|stop|monitor|validate-all|meta-data|usage}
END
}
66
# Write the OCF meta-data document (XML) describing this resource agent to
# stdout.  The here-document is unquoted, so the OCF_RESKEY_*_default
# variables are expanded into the default="" attributes and descriptions.
meta_data() {
	cat <<SFEX_META_XML
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="sfex">
<version>1.3</version>

<longdesc lang="en">
Resource script for SF-EX. It manages a shared storage medium exclusively .
</longdesc>
<shortdesc lang="en">Manages exclusive access to shared storage using Shared Disk File EXclusiveness (SF-EX)</shortdesc>
<parameters>
<parameter name="device" unique="0" required="1">
<longdesc lang="en">
Block device path that stores exclusive control data.
</longdesc>
<shortdesc lang="en">block device</shortdesc>
<content type="string" default="${OCF_RESKEY_device_default}" />
</parameter>
<parameter name="index" unique="0" required="0">
<longdesc lang="en">
Location in block device where exclusive control data is stored. 1 or more is specified. Default is 1.
</longdesc>
<shortdesc lang="en">index</shortdesc>
<content type="integer" default="${OCF_RESKEY_index_default}" />
</parameter>
<parameter name="collision_timeout" unique="0" required="0">
<longdesc lang="en">
Waiting time when a collision of lock acquisition is detected. Default is 1 second.
</longdesc>
<shortdesc lang="en">waiting time for lock acquisition</shortdesc>
<content type="integer" default="${OCF_RESKEY_collision_timeout_default}" />
</parameter>
<parameter name="monitor_interval" unique="0" required="0">
<longdesc lang="en">
Monitor interval(sec). Default is ${OCF_RESKEY_monitor_interval_default} seconds
</longdesc>
<shortdesc lang="en">monitor interval</shortdesc>
<content type="integer" default="${OCF_RESKEY_monitor_interval_default}" />
</parameter>
<parameter name="lock_timeout" unique="0" required="0">
<longdesc lang="en">
Valid term of lock(sec). Default is ${OCF_RESKEY_lock_timeout_default} seconds.
The lock_timeout is calculated by the following formula.

  lock_timeout = monitor_interval + "The expiration time of the lock"

We suggest 90 seconds as a default value of the "The expiration time of the lock", but you should change it in consideration of access delay to the shared disk and the switch time of the multipath driver.

The lock timeout have an impact on start action timeout because start action timeout value is calculated by the following formula.

  start timeout = collision_timeout + lock_timeout + "safety margin"

The "safety margin" is decided within the range of about 10-20 seconds(It depends on your system requirement).
</longdesc>
<shortdesc lang="en">Valid term of lock</shortdesc>
<content type="integer" default="${OCF_RESKEY_lock_timeout_default}" />
</parameter>
</parameters>

<actions>
<action name="start" timeout="120s" />
<action name="stop" timeout="20s" />
<action name="monitor" depth="0" timeout="10s" interval="10s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="5s" />
</actions>
</resource-agent>
SFEX_META_XML
}
137
#
# START: Exclusive control starts.
#
# Launches sfex_daemon for this resource instance and then polls
# sfex_monitor once per second until the daemon process is visible.
#
# The polling loop has no upper bound of its own.  According to the original
# authors' note, the wait lasts until the lock can be acquired when it is
# held by the other node; in that case the only way out is the action
# timeout configured in the CIB, which makes the cluster manager stop this
# script.
#
# Globals (read): SFEX_DAEMON, INDEX, COLLISION_TIMEOUT, LOCK_TIMEOUT,
#                 MONITOR_INTERVAL, OCF_RESOURCE_INSTANCE, DEVICE
# Returns: $OCF_SUCCESS when the daemon is (or already was) running,
#          $OCF_ERR_GENERIC when the daemon command exits non-zero.
#
sfex_start() {
	local rc

	ocf_log info "sfex_daemon: starting..."

	sfex_monitor
	if [ $? -eq "$OCF_SUCCESS" ]; then
		ocf_log info "sfex_daemon already started."
		return "$OCF_SUCCESS"
	fi

	# Every expansion is quoted so that a value containing whitespace
	# (e.g. a device path) reaches the daemon as exactly one argument.
	"$SFEX_DAEMON" -i "$INDEX" -c "$COLLISION_TIMEOUT" -t "$LOCK_TIMEOUT" \
		-m "$MONITOR_INTERVAL" -r "${OCF_RESOURCE_INSTANCE}" "$DEVICE"
	rc=$?
	if [ "$rc" -ne 0 ]; then
		ocf_log err "sfex_daemon failed to start."
		return "$OCF_ERR_GENERIC"
	fi

	# Intentionally unbounded; see the function header.  Nothing can follow
	# this loop because its only exit is the return below.
	while :; do
		sfex_monitor
		if [ $? -eq "$OCF_SUCCESS" ]; then
			ocf_log info "sfex_daemon: started."
			return "$OCF_SUCCESS"
		fi
		ocf_log debug "Waiting for the start-up of the sfex_daemon..."
		sleep 1
	done
}
175
#
# STOP: stop exclusive control
#
# Sends SIGTERM to the sfex_daemon process(es) of this resource instance,
# waits for them to disappear and escalates to SIGKILL when the grace
# period is over.
#
# Globals (read): SFEX_DAEMON, OCF_RESOURCE_INSTANCE,
#                 OCF_RESKEY_CRM_meta_timeout (stop timeout in milliseconds)
# Returns: $OCF_SUCCESS once no matching daemon process is left,
#          $OCF_ERR_GENERIC when the daemon is still running but could not
#          be signalled.
#
sfex_stop() {
	local pids stop_timeout_ms shutdown_timeout count

	ocf_log info "sfex_daemon: stopping..."

	# Check the sfex daemon has already stopped.
	sfex_monitor
	if [ $? -eq "$OCF_NOT_RUNNING" ]; then
		ocf_log info "sfex_daemon already stopped."
		return "$OCF_SUCCESS"
	fi

	# Stop sfex daemon by sending SIGTERM signal.
	# $pids is expanded unquoted on purpose: pgrep prints one PID per line
	# and each of them has to become a separate argument of kill.
	pids=$(/usr/bin/pgrep -f "$SFEX_DAEMON .* ${OCF_RESOURCE_INSTANCE} ")
	if [ -z "$pids" ] || ! /bin/kill $pids; then
		# The daemon may have exited by itself between the check above and
		# the signal.  That is a successful stop, not a failure.
		sfex_monitor
		if [ $? -eq "$OCF_NOT_RUNNING" ]; then
			ocf_log info "sfex_daemon already stopped."
			return "$OCF_SUCCESS"
		fi
		ocf_log err "sfex_daemon failed to stop"
		return "$OCF_ERR_GENERIC"
	fi

	# sfex could be in state D if the device is gone, and then not terminate.
	# Wait and check again if the daemon is already properly shutdown.
	#
	# Grace period = stop timeout minus 5 seconds.  When the cluster manager
	# did not export a timeout (e.g. the script is run by hand), assume the
	# 20s stop timeout advertised in the meta-data instead of skipping the
	# grace period altogether.
	stop_timeout_ms=${OCF_RESKEY_CRM_meta_timeout:-20000}
	shutdown_timeout=$(( stop_timeout_ms / 1000 - 5 ))
	count=0
	while [ "$count" -lt "$shutdown_timeout" ]; do
		sfex_monitor
		if [ $? -eq "$OCF_NOT_RUNNING" ]; then
			ocf_log info "sfex_daemon: stopped."
			return "$OCF_SUCCESS"
		fi
		count=$((count + 1))
		ocf_log debug "waiting for sfex_daemon to exit ($count/$shutdown_timeout)"
		sleep 1
	done

	sfex_monitor
	if [ $? -ne "$OCF_NOT_RUNNING" ]; then
		ocf_log warn "regular shutdown of sfex_daemon timed out, using SIGKILL"
		/bin/kill -s KILL $pids
	fi

	# No upper bound here: keep polling until the process is really gone.
	# If it never goes away, the action timeout enforced by the cluster
	# manager is what ends this script.
	while :; do
		sfex_monitor
		if [ $? -eq "$OCF_NOT_RUNNING" ]; then
			break
		fi
		ocf_log debug "waiting for sfex_daemon to exit after SIGKILL"
		sleep 1
	done

	ocf_log info "sfex_daemon: stopped."
	return "$OCF_SUCCESS"
}
234
# MONITOR: report whether a sfex_daemon process for this resource exists.
#
# Globals (read): SFEX_DAEMON, OCF_RESOURCE_INSTANCE
# Returns: $OCF_SUCCESS when pgrep finds a matching process,
#          $OCF_NOT_RUNNING otherwise.
sfex_monitor() {
	local daemon_pattern

	ocf_log debug "sfex_monitor: started..."

	# The full command line must contain the daemon path followed, further
	# along, by the resource instance name surrounded by blanks.
	daemon_pattern="$SFEX_DAEMON .* ${OCF_RESOURCE_INSTANCE} "
	/usr/bin/pgrep -f "$daemon_pattern" > /dev/null 2>&1
	if [ $? -ne 0 ]; then
		ocf_log debug "sfex_monitor: complete. sfex_daemon is not running."
		return $OCF_NOT_RUNNING
	fi

	ocf_log debug "sfex_monitor: complete. sfex_daemon is running."
	return $OCF_SUCCESS
}
247
248#
249# main process
250#
251
# Exactly one positional argument, the action name, is accepted; anything
# else prints the usage text and exits with $OCF_ERR_ARGS.
[ $# -eq 1 ] || {
	usage
	exit $OCF_ERR_ARGS
}
OP="$1"
258
# meta-data and usage are answered right away, before any instance
# parameter is looked at.
if [ "$OP" = "meta-data" ]; then
	meta_data
	exit $OCF_SUCCESS
elif [ "$OP" = "usage" ]; then
	usage
	exit $OCF_SUCCESS
fi
270
# Short names for the instance parameters used by the action functions.
DEVICE="${OCF_RESKEY_device}"
INDEX="${OCF_RESKEY_index}"
COLLISION_TIMEOUT="${OCF_RESKEY_collision_timeout}"
LOCK_TIMEOUT="${OCF_RESKEY_lock_timeout}"
MONITOR_INTERVAL="${OCF_RESKEY_monitor_interval}"
277
# VALIDATE-ALL: check that the configured lock device can be used.
#
# Globals (read): DEVICE
# Exits the script with $OCF_ERR_ARGS when DEVICE is empty, does not exist
# or is not writable; every failure is logged at "err" severity because it
# aborts the action.  Returns $OCF_SUCCESS when all checks pass.
sfex_validate () {
	if [ -z "$DEVICE" ]; then
		ocf_log err "Please set OCF_RESKEY_device to device for sfex meta-data"
		exit $OCF_ERR_ARGS
	fi
	if [ ! -e "$DEVICE" ]; then
		ocf_log err "Couldn't find device [$DEVICE]. Expected /dev/??? to exist"
		exit $OCF_ERR_ARGS
	fi
	# An existing device without write permission is a different problem
	# from a missing one, so it gets its own message.
	if [ ! -w "$DEVICE" ]; then
		ocf_log err "Device [$DEVICE] is not writable"
		exit $OCF_ERR_ARGS
	fi
	return $OCF_SUCCESS
}
288
# Refuse to run when OCF_RESKEY_CRM_meta_clone is non-empty, i.e. when the
# resource has been configured as a clone.
[ -z "$OCF_RESKEY_CRM_meta_clone" ] || {
	ocf_log err "THIS RA DO NOT SUPPORT CLONE MODE!"
	exit $OCF_ERR_CONFIGURED
}
293
# Run the requested action.  The script exits with the status of the action
# function; unknown actions exit with $OCF_ERR_UNIMPLEMENTED.
case "$OP" in
	start)        sfex_start ;;
	stop)         sfex_stop ;;
	monitor)      sfex_monitor ;;
	validate-all) sfex_validate ;;
	*)            exit $OCF_ERR_UNIMPLEMENTED ;;
esac
exit $?
312