#!/usr/local/bin/bash
#
# Shared Disk File EXclusiveness (SF-EX) OCF RA.
# Prevents destruction of data on a shared disk file system
# due to Split-Brain.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
#
# Copyright (c) 2007 NIPPON TELEGRAPH AND TELEPHONE CORPORATION
#
# NOTE:
#   As a prerequisite for running SF-EX, one device should be
#   initialized as below.
#
#       sfex_init [-n <numlocks>] <device>
#
#   Example:
#
#       /usr/sbin/sfex_init -n 10 /dev/sdb1
#
#   If further information is necessary, see README.
#
#######################################################################
# Initialization:

# switching ocf-shellfuncs path
: "${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}"
. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"

# Parameter defaults

OCF_RESKEY_device_default=""
OCF_RESKEY_index_default="1"
OCF_RESKEY_collision_timeout_default="1"
OCF_RESKEY_monitor_interval_default="10"
OCF_RESKEY_lock_timeout_default="100"

: "${OCF_RESKEY_device=${OCF_RESKEY_device_default}}"
: "${OCF_RESKEY_index=${OCF_RESKEY_index_default}}"
: "${OCF_RESKEY_collision_timeout=${OCF_RESKEY_collision_timeout_default}}"
: "${OCF_RESKEY_monitor_interval=${OCF_RESKEY_monitor_interval_default}}"
: "${OCF_RESKEY_lock_timeout=${OCF_RESKEY_lock_timeout_default}}"

#######################################################################

SFEX_DAEMON=${HA_BIN}/sfex_daemon

#
# usage: print the list of actions accepted on the command line.
#
usage() {
    cat <<END
    usage: $0 {start|stop|monitor|validate-all|meta-data}
END
}

#
# meta_data: print the OCF resource agent meta-data (XML) to stdout.
# The parameter defaults are expanded from the *_default variables above.
#
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="sfex">
<version>1.3</version>

<longdesc lang="en">
Resource script for SF-EX. It manages a shared storage medium exclusively .
</longdesc>
<shortdesc lang="en">Manages exclusive access to shared storage using Shared Disk File EXclusiveness (SF-EX)</shortdesc>
<parameters>
<parameter name="device" unique="0" required="1">
<longdesc lang="en">
Block device path that stores exclusive control data.
</longdesc>
<shortdesc lang="en">block device</shortdesc>
<content type="string" default="${OCF_RESKEY_device_default}" />
</parameter>
<parameter name="index" unique="0" required="0">
<longdesc lang="en">
Location in block device where exclusive control data is stored. 1 or more is specified. Default is 1.
</longdesc>
<shortdesc lang="en">index</shortdesc>
<content type="integer" default="${OCF_RESKEY_index_default}" />
</parameter>
<parameter name="collision_timeout" unique="0" required="0">
<longdesc lang="en">
Waiting time when a collision of lock acquisition is detected. Default is 1 second.
</longdesc>
<shortdesc lang="en">waiting time for lock acquisition</shortdesc>
<content type="integer" default="${OCF_RESKEY_collision_timeout_default}" />
</parameter>
<parameter name="monitor_interval" unique="0" required="0">
<longdesc lang="en">
Monitor interval(sec). Default is ${OCF_RESKEY_monitor_interval_default} seconds
</longdesc>
<shortdesc lang="en">monitor interval</shortdesc>
<content type="integer" default="${OCF_RESKEY_monitor_interval_default}" />
</parameter>
<parameter name="lock_timeout" unique="0" required="0">
<longdesc lang="en">
Valid term of lock(sec). Default is ${OCF_RESKEY_lock_timeout_default} seconds.
The lock_timeout is calculated by the following formula.

  lock_timeout = monitor_interval + "The expiration time of the lock"

We suggest 90 seconds as a default value of the "The expiration time of the lock", but you should change it in consideration of access delay to the shared disk and the switch time of the multipath driver.

The lock timeout have an impact on start action timeout because start action timeout value is calculated by the following formula.

  start timeout = collision_timeout + lock_timeout + "safety margin"

The "safety margin" is decided within the range of about 10-20 seconds(It depends on your system requirement).
</longdesc>
<shortdesc lang="en">Valid term of lock</shortdesc>
<content type="integer" default="${OCF_RESKEY_lock_timeout_default}" />
</parameter>
</parameters>

<actions>
<action name="start" timeout="120s" />
<action name="stop" timeout="20s" />
<action name="monitor" depth="0" timeout="10s" interval="10s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="5s" />
</actions>
</resource-agent>
END
}

#
# START: Exclusive control starts.
#
# Launches sfex_daemon and then polls until sfex_monitor reports it running.
# The polling loop has no exit of its own: while the lock is held by the
# other node it keeps waiting, and the only way out is the stop signal sent
# when the start timeout configured in the CIB expires.
#
# Returns: OCF_SUCCESS once the daemon is running (or already was),
#          OCF_ERR_GENERIC if the sfex_daemon command itself fails.
#
sfex_start() {
    local rc

    ocf_log info "sfex_daemon: starting..."

    sfex_monitor
    if [ $? -eq "$OCF_SUCCESS" ]; then
        ocf_log info "sfex_daemon already started."
        return "$OCF_SUCCESS"
    fi

    "$SFEX_DAEMON" -i "$INDEX" -c "$COLLISION_TIMEOUT" -t "$LOCK_TIMEOUT" \
        -m "$MONITOR_INTERVAL" -r "${OCF_RESOURCE_INSTANCE}" "$DEVICE"
    rc=$?
    if [ "$rc" -ne 0 ]; then
        ocf_log err "sfex_daemon failed to start."
        return "$OCF_ERR_GENERIC"
    fi

    # Intentionally unbounded: see the header comment of this function.
    while :; do
        sfex_monitor
        if [ $? -eq "$OCF_SUCCESS" ]; then
            ocf_log info "sfex_daemon: started."
            return "$OCF_SUCCESS"
        fi
        ocf_log debug "Waiting for the start-up of the sfex_daemon..."
        sleep 1
    done
}

#
# STOP: stop exclusive control
#
# Sends SIGTERM to the sfex_daemon of this resource instance, waits up to
# (action timeout - 5) seconds for it to exit, then falls back to SIGKILL
# and waits until the process is gone.
#
# Returns: OCF_SUCCESS when the daemon is not running (any more),
#          OCF_ERR_GENERIC if the daemon could not be signalled and is
#          still running.
#
sfex_stop() {
    local pid shutdown_timeout count

    ocf_log info "sfex_daemon: stopping..."

    # Check the sfex daemon has already stopped.
    sfex_monitor
    if [ $? -eq "$OCF_NOT_RUNNING" ]; then
        ocf_log info "sfex_daemon already stopped."
        return "$OCF_SUCCESS"
    fi

    # Stop sfex daemon by sending SIGTERM signal.
    pid=$(/usr/bin/pgrep -f "$SFEX_DAEMON .* ${OCF_RESOURCE_INSTANCE} ")
    # $pid is deliberately unquoted: pgrep may report several PIDs, one per
    # line, and each must become a separate argument to kill.
    if [ -z "$pid" ] || ! /bin/kill $pid; then
        # The daemon may have exited between the check above and the
        # signal; that is a successful stop, not a failure.
        sfex_monitor
        if [ $? -eq "$OCF_NOT_RUNNING" ]; then
            ocf_log info "sfex_daemon: stopped."
            return "$OCF_SUCCESS"
        fi
        ocf_log err "sfex_daemon failed to stop"
        return "$OCF_ERR_GENERIC"
    fi

    # sfex could be in state D if the device is gone, and then not terminate.
    # Wait and check again if the daemon is already properly shutdown.
    #
    # OCF_RESKEY_CRM_meta_timeout is in milliseconds. When it is not set
    # (e.g. the agent is run by hand) fall back to the 20s stop timeout
    # advertised in the meta-data, so the arithmetic stays valid.
    shutdown_timeout=$(( ${OCF_RESKEY_CRM_meta_timeout:-20000} / 1000 - 5 ))
    count=0
    while [ "$count" -lt "$shutdown_timeout" ]; do
        sfex_monitor
        if [ $? -eq "$OCF_NOT_RUNNING" ]; then
            ocf_log info "sfex_daemon: stopped."
            return "$OCF_SUCCESS"
        fi
        count=$((count + 1))
        ocf_log debug "waiting for sfex_daemon to exit ($count/$shutdown_timeout)"
        sleep 1
    done

    sfex_monitor
    if [ $? -ne "$OCF_NOT_RUNNING" ]; then
        ocf_log warn "regular shutdown of sfex_daemon timed out, using SIGKILL"
        /bin/kill -s KILL $pid
    fi

    # Unbounded wait: only the action timeout ends it if the process
    # never disappears.
    while :; do
        sfex_monitor
        if [ $? -eq "$OCF_NOT_RUNNING" ]; then
            break
        fi
        ocf_log debug "waiting for sfex_daemon to exit after SIGKILL"
        sleep 1
    done

    ocf_log info "sfex_daemon: stopped."
    return "$OCF_SUCCESS"
}

#
# MONITOR: report whether the sfex_daemon of this resource is running.
#
# The process is looked up by matching the daemon path and the resource
# instance name against the full command line.
#
# Returns: OCF_SUCCESS if a matching process exists, else OCF_NOT_RUNNING.
#
sfex_monitor() {
    ocf_log debug "sfex_monitor: started..."

    # Find a sfex_daemon process using daemon name and resource name.
    if /usr/bin/pgrep -f "$SFEX_DAEMON .* ${OCF_RESOURCE_INSTANCE} " > /dev/null 2>&1; then
        ocf_log debug "sfex_monitor: complete. sfex_daemon is running."
        return "$OCF_SUCCESS"
    fi

    ocf_log debug "sfex_monitor: complete. sfex_daemon is not running."
    return "$OCF_NOT_RUNNING"
}

#
# main process
#

# check arguments
if [ $# -ne 1 ]; then
    usage
    exit "$OCF_ERR_ARGS"
fi
OP=$1

# These operations do not require instance parameters
case "$OP" in
    meta-data)
        meta_data
        exit "$OCF_SUCCESS"
        ;;
    usage)
        usage
        exit "$OCF_SUCCESS"
        ;;
esac

# check parameters
DEVICE=$OCF_RESKEY_device
INDEX=${OCF_RESKEY_index}
COLLISION_TIMEOUT=${OCF_RESKEY_collision_timeout}
LOCK_TIMEOUT=${OCF_RESKEY_lock_timeout}
MONITOR_INTERVAL=${OCF_RESKEY_monitor_interval}

#
# VALIDATE-ALL: check that a device is configured and writable.
# Exits the script with OCF_ERR_ARGS when either check fails.
#
sfex_validate() {
    if [ -z "$DEVICE" ]; then
        ocf_log err "Please set OCF_RESKEY_device to device for sfex meta-data"
        exit "$OCF_ERR_ARGS"
    fi
    if [ ! -w "$DEVICE" ]; then
        ocf_log warn "Couldn't find device [$DEVICE]. Expected /dev/??? to exist"
        exit "$OCF_ERR_ARGS"
    fi
}

if [ -n "$OCF_RESKEY_CRM_meta_clone" ]; then
    ocf_log err "THIS RA DO NOT SUPPORT CLONE MODE!"
    exit "$OCF_ERR_CONFIGURED"
fi

case "$OP" in
    start)
        sfex_start
        ;;
    stop)
        sfex_stop
        ;;
    monitor)
        sfex_monitor
        ;;
    validate-all)
        sfex_validate
        ;;
    *)
        exit "$OCF_ERR_UNIMPLEMENTED"
        ;;
esac
exit $?