#!/usr/bin/sh
# CFEngine watchdog (AIX): looks for pathological daemon states, collects
# diagnostic artifacts when a threshold is breached, then remediates.
# Watchdog events are logged here.
PIDFILE="/var/cfengine/watchdog.pid"

# Remove the temporary collection directory so we don't leave behind junk
# if the script is killed or finishes normally.
cleanup()
{
    if [ -d "${COLLECTION_DIR}" ]; then
        rm -rf "${COLLECTION_DIR}"
    fi
}
# POSIX signal names take no SIG prefix.  The explicit 'exit 1' is required:
# without it the shell resumes the script after the handler returns, running
# on with its collection directory already deleted.
trap 'cleanup; exit 1' HUP INT QUIT ABRT TERM

# Restart the cfengine3 service and verify that cf-execd, cf-serverd and
# cf-monitord are all running afterwards.
# Returns: 0 if all daemons are up, 1 otherwise (service is stopped first).
# Writes per-daemon findings to COLLECTION_REPORT and LOGFILE.
all_cfengine_daemons_running()
{
    /etc/rc.d/init.d/cfengine3 restart > "${COLLECTION_DIR}/etc_rc_d_init_d_cfengine3_restart.$(date +%s).log" 2>&1

    sleep 3

    # Log every expected daemon that is not running.  The [c]f- bracket trick
    # keeps the grep from matching its own process entry.
    for _daemon in execd serverd monitord; do
        if [ "$(ps -ef -o args | grep -c "^/var/cfengine/bin/[c]f-${_daemon}")" -lt 1 ]; then
            echo "- *cf-${_daemon} is not running after service restart*" >> "${COLLECTION_REPORT}"
            echo "$(date) cf-${_daemon} is not running after service restart" | tee -a "${LOGFILE}"
        fi
    done

    if [ "$(ps -ef -o args | grep -c '^/var/cfengine/bin/[c]f-')" -lt 1 ]; then
        echo "- *No cf- processes running after service restart*" >> "${COLLECTION_REPORT}"
        echo "$(date) No cf- processes running after service restart" | tee -a "${LOGFILE}"
    fi

    # If any daemon is still missing, stop the service so the next retry
    # starts from a known state, and report failure.
    for _daemon in execd serverd monitord; do
        if [ "$(ps -ef -o args | grep -c "^/var/cfengine/bin/[c]f-${_daemon}")" -lt 1 ]; then
            /etc/rc.d/init.d/cfengine3 stop > "${COLLECTION_DIR}/etc_rc_d_init_d_cfengine3_stop.$(date +%s).log" 2>&1
            return 1
        fi
    done

    return 0
}

LOGFILE="/var/cfengine/watchdog.log"
echo "$(date) Initiating watchdog $$" >> "${LOGFILE}"

# Single-instance guard: abort if the pid recorded in PIDFILE is still alive,
# otherwise claim the pidfile for this run.
if [ -s "$PIDFILE" ]; then
    if ps -p "$(cat "$PIDFILE")" > /dev/null 2>&1; then
        echo "$(date) Aborting execution of watchdog $$, existing watchdog process $(cat "$PIDFILE") running" >> "${LOGFILE}"
        exit 1
    else
        # No current process matching pid in file
        echo $$ > "$PIDFILE"
    fi
else
    echo $$ > "$PIDFILE"
fi

TMPDIR="/tmp"
mkdir -p "$TMPDIR"
CFENGINE_WORKDIR="/var/cfengine"
CFENGINE_WORKDIR_COLLECTION=""
OUTPUTS_DIR="${CFENGINE_WORKDIR}/outputs"
ARCHIVE_DIR="/var/cfengine/watchdog-archives"
mkdir -p "${ARCHIVE_DIR}"
PATHOLOGY_COUNT=0
PATHOLOGY_THRESHOLD=0

# Collection Dir
# - We create a directory in order to collect artifacts about our observations
# - If there are enough pathology indicators found to warrant a report, the
#   directory will be packaged for sending
# Portable mktemp: https://stackoverflow.com/questions/10224921/how-to-create-a-temporary-file-with-portable-shell-in-a-secure-way#comment86787877_10235393
# Adjusted, known to work on aix 7.1.0.0
if [ -r "/dev/urandom" ]; then
    RNDM="/dev/urandom"
else
    RNDM="/dev/random"
fi
length=7
safetemp="$(od -An -N${length} -tx1 ${RNDM} | tr -d ' \t').cfengine-watchdog"
COLLECTION_DIR="${TMPDIR}/${safetemp}"
mkdir -p "${COLLECTION_DIR}"
COLLECTION_REPORT="${COLLECTION_DIR}/README.org"
echo "#+Title: CFEngine Watchdog Engineering Summary [$(date '+%Y-%m-%d %a')]" >> "${COLLECTION_REPORT}"
echo "* Pathologies" >> "${COLLECTION_REPORT}"

# NOTE(review): column 3 of 'df -k' is "Free" on AIX but "Used" on Linux —
# confirm before porting this script off AIX.
ARCHIVE_DIR_FREEk=$(df -k "${ARCHIVE_DIR}" | tail -n 1 | awk '{print $3}')
if [ "${ARCHIVE_DIR_FREEk}" -lt 500000 ]; then
    echo "$(date) ${ARCHIVE_DIR} has less than 500MB free space, initiating watchdog archive cleanup" | tee -a "${LOGFILE}"
    # Archive dir has less than 500MB: delete everything except the oldest and
    # newest archives.  Archive names are timestamped, so lexical 'ls' order
    # is chronological.
    files=$(ls "${ARCHIVE_DIR}")
    count=$(echo $files | wc -w)
    _counter=0
    for i in $files; do
        if [ "${_counter}" -eq 0 ]; then
            echo "Not deleting $i it's the oldest and may contain valuable information about the first event"
        elif [ "${_counter}" -eq "$((count - 1))" ]; then
            echo "Not deleting $i it's the most recent"
        else
            echo "Delete $i"
            rm "${ARCHIVE_DIR}/$i"
        fi
        _counter=$((_counter + 1))
    done
    ARCHIVE_DIR_FREEk=$(df -k "${ARCHIVE_DIR}" | tail -n 1 | awk '{print $3}')
    if [ "${ARCHIVE_DIR_FREEk}" -lt 500000 ]; then
        echo "$(date) ${ARCHIVE_DIR} still has less than 500MB free space after cleaning up archives." | tee -a "${LOGFILE}"
        echo "$(date) Aborting watchdog $$" | tee -a "${LOGFILE}"
        cleanup
        exit 1
    fi
fi

# We check free space in tmp second (in case tmp is on same filesystem as
# archives, and archives get cleaned up)
TMPDIR_FREEk=$(df -k "${TMPDIR}" | tail -n 1 | awk '{print $3}')
if [ "${TMPDIR_FREEk}" -lt 500000 ]; then
    echo "$(date) ${TMPDIR} has less than 500MB free space" | tee -a "${LOGFILE}"
    echo "$(date) Aborting watchdog $$" | tee -a "${LOGFILE}"
    cleanup
    exit 1
fi

# Pathology #1: cf-execd is not running.
# While not strictly a pathology, it is non-standard to run cf-agent without
# cf-execd.
ps -ef -o args | grep '^/var/cfengine/bin/[c]f-execd' > "${COLLECTION_DIR}/ps_grep_cf-execd.txt"
_COUNT_CF_EXECD_PROCS=$(wc -l < "${COLLECTION_DIR}/ps_grep_cf-execd.txt")
if [ "${_COUNT_CF_EXECD_PROCS}" -lt 1 ]; then
    echo "$(date) Found cf-execd not running" >> "${LOGFILE}"
    echo "- cf-execd not running" >> "${COLLECTION_REPORT}"
    PATHOLOGY_COUNT=$((PATHOLOGY_COUNT + 1))
fi

# Pathology #1.5: More than one cf-execd is running.
if [ "${_COUNT_CF_EXECD_PROCS}" -gt 1 ]; then
    echo "$(date) Found ${_COUNT_CF_EXECD_PROCS} cf-execd processes running" >> "${LOGFILE}"
    echo "- Found ${_COUNT_CF_EXECD_PROCS} cf-execd running" >> "${COLLECTION_REPORT}"
    PATHOLOGY_COUNT=$((PATHOLOGY_COUNT + 1))
fi

# Pathology #2: cf-agent execution times exceed body executor control agent_expireafter
# cf-agent processes launched by cf-execd have not communicated back to cf-execd
# within $(agent_expireafter). This may be OK, but may also indicate that
# cf-agent is hung.
# Outputs could be empty because:
# - cf-execd hasn't executed cf-agent since purging outputs
# - cf-agent is not producing output (the expected normal state)
if [ "$(ls -A "${OUTPUTS_DIR}")" ]; then
    observation=$(find "${OUTPUTS_DIR}" ! -name previous | xargs grep "cf-execd: timeout waiting for output from agent")
    if [ -n "$observation" ]; then
        # One grep match per line.  printf guarantees a trailing newline so
        # 'wc -l' counts every match; the previous 'expr 0 + $(echo -n ... |
        # wc -l)' under-counted by one (and 'echo -n' is unportable under sh).
        # Arithmetic expansion strips the whitespace padding AIX wc emits.
        count=$(( $(printf '%s\n' "${observation}" | wc -l) ))
        echo "$(date) Found ${count} occurrences of cf-execd terminating unresponsive cf-agent" >> "${LOGFILE}"
        echo "- ${count} cf-agent terminations" >> "${COLLECTION_REPORT}"
        PATHOLOGY_COUNT=$((PATHOLOGY_COUNT + 1))
    fi
fi

# Pathology #3: cf-agent process(s) running longer than expected
# While not strictly a pathology, this may indicate a hung agent
# On AIX, etime (and etimes) is displayed in human readable form
# e.g:
#     USER      PID    ELAPSED COMMAND
#     root 10551366 5-00:35:58 /var/cfengine/bin/cf-agent
# We have to convert that into seconds so that we can determine if it's been
# running longer than we expect, that's what the second awk command is for.
# Convert AIX etime ([[DD-]HH:]MM:SS) to seconds and select cf-agent
# processes that have been running longer than 300s.
observation=$(ps -e -o user,pid,etime,args | awk 'FNR == 1 {next} /\/var\/cfengine\/bin\/cf-agent/ {print $3}' | awk 'BEGIN { FS = ":" } \
{
    if (NF == 2) {
        # MM:SS
        etimes = $1*60 + $2
    } else if (NF == 3) {
        split($1, a, "-");
        if (a[2] != "" ) {
            # DD-HH:MM:SS
            etimes = ((a[1]*24+a[2])*60 + $2) * 60 + $3;
        } else {
            # HH:MM:SS
            etimes = ($1*60 + $2) * 60 + $3;
        }
    }
    if (etimes > 300) {
        print
    }
}')
if [ -n "${observation}" ]; then
    # One matching process per line; see Pathology #2 for the counting idiom.
    count=$(( $(printf '%s\n' "${observation}" | wc -l) ))
    echo "$(date) Found ${count} cf-agent processes running longer than 300s" >> "${LOGFILE}"
    echo "- ${count} cf-agent processes running longer than 300s" >> "${COLLECTION_REPORT}"
    PATHOLOGY_COUNT=$((PATHOLOGY_COUNT + 1))
fi

# Pathology #4: High number of concurrent cf-agent processes
# While cf-agent is designed to be able to run concurrently, having many
# concurrent cf-agent processes may indicate an issue like an agent pile up
observation=$(ps -e -o pid,etime,args | awk '/\/var\/cfengine\/bin\/cf-agent/ {print}')
if [ -n "${observation}" ]; then
    count=$(( $(printf '%s\n' "${observation}" | wc -l) ))
    if [ "${count}" -gt 3 ]; then
        echo "$(date) Found ${count} concurrently running agents" >> "${LOGFILE}"
        echo "- ${count} concurrently running cf-agent processes" >> "${COLLECTION_REPORT}"
        PATHOLOGY_COUNT=$((PATHOLOGY_COUNT + 1))
    fi
fi

# Pathology #5: cf-check has encountered a critical issue
# This indicates that there are one or more integrity issues
if [ -x /var/cfengine/bin/cf-check ]; then
    observation=$(/var/cfengine/bin/cf-check diagnose /var/cfengine/state/*.lmdb)
    if [ $? -ne 0 ]; then
        echo "$(date) cf-check observed critical integrity issues" >> "${LOGFILE}"
        echo "- cf-check observed critical integrity issues" >> "${COLLECTION_REPORT}"
        # printf: SysV vs BSD echo disagree on backslash escapes; we need
        # real newlines for the org-mode example block.
        printf ' #+begin_example\n%s\n#+end_example\n' "${observation}" >> "${COLLECTION_REPORT}"
        # Count the pathology: previously this was observed and reported but
        # never counted, so integrity issues alone could not trigger the
        # LMDB-purge remediation below that exists for exactly this case.
        PATHOLOGY_COUNT=$((PATHOLOGY_COUNT + 1))
    fi
fi

if [ "${PATHOLOGY_COUNT}" -gt "${PATHOLOGY_THRESHOLD}" ]; then

    echo "$(date) Found ${PATHOLOGY_COUNT} symptoms, threshold (${PATHOLOGY_THRESHOLD}) breached." | tee -a "${LOGFILE}"
    echo "* Observations" >> "${COLLECTION_REPORT}"

    ps auxwww > "${COLLECTION_DIR}/ps_auxwww.txt"
    echo "- [[./ps_auxwww.txt][~ps auxwww~]]" >> "${COLLECTION_REPORT}"

    ps -elf > "${COLLECTION_DIR}/ps_-elf.txt"
    echo "- [[./ps_-elf.txt][~ps -elf~]]" >> "${COLLECTION_REPORT}"

    find "${CFENGINE_WORKDIR}" >> "${COLLECTION_DIR}/find__var_cfengine.txt"
    echo "- [[./find__var_cfengine.txt][=/var/cfengine= file list]] - Before remediation" >> "${COLLECTION_REPORT}"

    tar -c -f "${COLLECTION_DIR}/sys.workdir-before-remediation.tar" -C "${CFENGINE_WORKDIR}" state outputs
    echo "- [[./sys.workdir-before-remediation.tar][=sys.workdir-before-remediation.tar=]] - CFEngine WORKDIR artifacts before remediation" >> "${COLLECTION_REPORT}"

    # [c]f- keeps the grep from matching its own process entry.
    ps -efl | grep '[c]f-' > "${COLLECTION_DIR}/cf-procs.txt"
    echo "- [[./cf-procs.txt][~ps -efl | grep cf-~]]" >> "${COLLECTION_REPORT}"

    # Apoptosis: take a core dump and a dbx backtrace of every cf- process,
    # then kill it, preserving state for post-mortem analysis.
    echo "$(date) Initiating apoptosis" | tee -a "${LOGFILE}"
    while IFS= read -r proc; do
        # NOTE(review): assumes AIX 'ps -efl' layout — field 4 is PID,
        # field 15 is the command; verify on other ps implementations.
        _PID=$(echo $proc | awk '{print $4}')
        _PROC=$(echo $proc | awk '{print $15}')
        # Flatten the command path into a filename ('/' -> '_').
        _PROCFILE=$(echo "$_PROC" | sed 's./._.g')
        _COREFILE=$(printf "%s_%s.core" "$_PID" "$_PROCFILE")
        _COREPATH=$(printf "%s/%s" "${COLLECTION_DIR}" "$_COREFILE")
        _DBX_ERR_LOG=$(printf "%s/%s.dbx.err" "${COLLECTION_DIR}" "$_COREFILE")

        gencore "$_PID" "$_COREPATH" > "${COLLECTION_DIR}/gencore-$_PID.output" 2>&1
        echo "- [[./gencore-$_PID.output][output from gencore $_PID $_COREPATH]]" >> "${COLLECTION_REPORT}"
        echo "- [[./$_COREFILE][core from $_PID]]" >> "${COLLECTION_REPORT}"

        observation=$(echo "where" | dbx "$_PROC" "$_COREPATH" 2> "$_DBX_ERR_LOG")
        echo " - backtrace:" >> "${COLLECTION_REPORT}"
        printf ' #+begin_example\n%s\n#+end_example\n' "${observation}" >> "${COLLECTION_REPORT}"
        echo " - dbx stderr: [[./$(basename $_DBX_ERR_LOG)][dbx $_PROC $_COREPATH]]" >> "${COLLECTION_REPORT}"
        # POSIX 'kill -s' takes the signal name without the SIG prefix.
        kill -s KILL "$_PID" > "${COLLECTION_DIR}/kill_$_PID.txt" 2>&1
        echo " - [[./kill_$_PID.txt][~kill -s KILL $_PID~]]" >> "${COLLECTION_REPORT}"

    done < "${COLLECTION_DIR}/cf-procs.txt"

    echo "- Purged outputs (don't want them to trigger pathology remediation more than once)" >> "${COLLECTION_REPORT}"
    for each in $(ls -A "${CFENGINE_WORKDIR}/outputs/"); do
        rm "${CFENGINE_WORKDIR}/outputs/${each}"
    done

    # Switch to more intelligent cf-check repair in 3.12.3
    echo "- Purged LMDBs" >> "${COLLECTION_REPORT}"
    for each in "${CFENGINE_WORKDIR}"/state/*.lmdb*; do
        # Skip the literal pattern when the glob matched nothing.
        [ -e "${each}" ] || continue
        rm "${each}"
    done

    tar -c -f "${COLLECTION_DIR}/sys.workdir-after-remediation.tar" -C "${CFENGINE_WORKDIR}" state outputs
    echo "- [[./sys.workdir-after-remediation.tar][=sys.workdir-after-remediation.tar=]] - CFEngine WORKDIR artifacts after remediation" >> "${COLLECTION_REPORT}"

####+begin_critical
    # We will re-try up to 10 times to get all the daemons running
    echo "$(date) Initiating anastasis" | tee -a "${LOGFILE}"
    for try in 1 2 3 4 5 6 7 8 9 10; do
        echo "- Anastasis [[./remediation-re-start-try-${try}.log][try ${try}]]" >> "${COLLECTION_REPORT}"
        if all_cfengine_daemons_running > "${COLLECTION_DIR}/remediation-re-start-try-${try}.log"; then
            break
        fi
        if [ "${try}" -ne 10 ]; then
            echo "Attempt $((1 + try)) of 10 in 10 seconds ..." >> "${LOGFILE}"
            sleep 10
        else
            echo "$(date) Failed to bring all services online after 10 retries, giving up" | tee -a "${LOGFILE}"
            echo "- Failed to bring all services online after 10 retries, giving up" >> "${COLLECTION_REPORT}"
        fi
    done
####+end_critical

    tar -c -f "${COLLECTION_DIR}/sys.workdir-after-restarting.tar" -C "${CFENGINE_WORKDIR}" state outputs
    echo "- [[./sys.workdir-after-restarting.tar][=sys.workdir-after-restarting.tar=]] - CFEngine WORKDIR artifacts after restarting" >> "${COLLECTION_REPORT}"

    cp "$0" "${COLLECTION_DIR}/watchdog"
    echo "- [[./watchdog][=watchdog=]] - The watchdog that generated this report" >> "${COLLECTION_REPORT}"

    # Package the artifacts together.  Compute the archive name exactly once:
    # two separate 'date +%s' calls could straddle a second boundary and log
    # a path that does not match the file actually created.
    _ARCHIVE="${ARCHIVE_DIR}/$(date +%Y-%m-%d_%s).tar.gz"
    tar -c -f "${COLLECTION_DIR}.tar" -C "${COLLECTION_DIR}" .
    gzip "${COLLECTION_DIR}.tar"
    mv "${COLLECTION_DIR}.tar.gz" "${_ARCHIVE}"
    echo "$(date) Collected artifacts stored in ${_ARCHIVE}" | tee -a "${LOGFILE}"
elif [ "${PATHOLOGY_COUNT}" -gt 0 ]; then
    echo "$(date) Found ${PATHOLOGY_COUNT} symptoms, threshold (${PATHOLOGY_THRESHOLD}) not breached." | tee -a "${LOGFILE}"
else
    echo "$(date) Found ${PATHOLOGY_COUNT} symptoms, threshold (${PATHOLOGY_THRESHOLD}) not breached, no remediation or collection performed" >> "${LOGFILE}"
fi
echo "$(date) DONE watchdog $$" >> "${LOGFILE}"

cleanup