#!/usr/bin/sh

# PID file used to prevent concurrent watchdog runs.
PIDFILE="/var/cfengine/watchdog.pid"

# Clean-up handler; defined before the trap below so the function already
# exists when a signal arrives.
cleanup()
{
    # Don't leave behind junk if the script is killed
    if [ -d "${COLLECTION_DIR}" ]; then
        rm -rf "${COLLECTION_DIR}"
    fi
}
trap cleanup SIGHUP SIGINT SIGQUIT SIGABRT SIGTERM

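# Restart the CFEngine service and verify that cf-execd, cf-serverd and
# cf-monitord are all running afterwards. Returns 0 when everything is up;
# otherwise it logs what is missing, stops the service again and returns 1.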
all_cfengine_daemons_running()
{
    /etc/rc.d/init.d/cfengine3 restart > "${COLLECTION_DIR}/etc_rc_d_init_d_cfengine3_restart.$(date +%s).log" 2>&1

    sleep 3

    # Log if any expected daemon is not running
    if [ $(ps -ef -o args | grep -c "^/var/cfengine/bin/[c]f-execd") -lt 1 ]; then
        echo "- *cf-execd is not running after service restart*" >> ${COLLECTION_REPORT}
        echo "$(date) cf-execd is not running after service restart" | tee -a ${LOGFILE}
    fi
    if [ $(ps -ef -o args | grep -c "^/var/cfengine/bin/[c]f-serverd") -lt 1 ]; then
        echo "- *cf-serverd is not running after service restart*" >> ${COLLECTION_REPORT}
        echo "$(date) cf-serverd is not running after service restart" | tee -a ${LOGFILE}
    fi
    if [ $(ps -ef -o args | grep -c "^/var/cfengine/bin/[c]f-monitord") -lt 1 ]; then
        echo "- *cf-monitord is not running after service restart*" >> ${COLLECTION_REPORT}
        echo "$(date) cf-monitord is not running after service restart" | tee -a ${LOGFILE}
    fi

    if [ $(ps -ef -o args | grep -c "^/var/cfengine/bin/[c]f-") -lt 1 ]; then
        echo "- *No cf- processes running after service restart*" >> ${COLLECTION_REPORT}
        echo "$(date) No cf- processes running after service restart" | tee -a ${LOGFILE}
    fi

    for each in execd serverd monitord; do
        if [ $(ps -ef -o args | grep -c "^/var/cfengine/bin/[c]f-${each}") -lt 1 ]; then
            /etc/rc.d/init.d/cfengine3 stop > "${COLLECTION_DIR}/etc_rc_d_init_d_cfengine3_stop.$(date +%s).log" 2>&1
            return 1
        fi
    done

    return 0
}

# Watchdog events are logged here.
LOGFILE="/var/cfengine/watchdog.log"
echo "$(date) Initiating watchdog $$" >> ${LOGFILE}

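# Abort if another watchdog instance is already running; a stale PID file
# (no live process with that PID) is simply overwritten.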
if [ -s $PIDFILE ]; then
    ps -p $(cat $PIDFILE) > /dev/null 2>&1
    _ret=$?
    if [ "${_ret}" -eq 0 ] ; then
        echo "$(date) Aborting execution of watchdog $$, existing watchdog process $(cat $PIDFILE) running" >> ${LOGFILE}
        exit 1
    else
        # No current process matching pid in file
        echo $$ > $PIDFILE
    fi
else
    echo $$ > $PIDFILE
fi

TMPDIR="/tmp"
mkdir -p $TMPDIR
CFENGINE_WORKDIR="/var/cfengine"
CFENGINE_WORKDIR_COLLECTION=""
OUTPUTS_DIR="${CFENGINE_WORKDIR}/outputs"
ARCHIVE_DIR="/var/cfengine/watchdog-archives"
mkdir -p "${ARCHIVE_DIR}"
PATHOLOGY_COUNT=0
PATHOLOGY_THRESHOLD=0
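# Any count of symptoms above PATHOLOGY_THRESHOLD triggers collection and
# remediation below; with the default of 0 a single symptom is enough.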

# Collection Dir
# - We create a directory in order to collect artifacts about our observations
# - If there are enough pathology indicators found to warrant a report, the
#   directory will be packaged for sending
# Portable mktemp: https://stackoverflow.com/questions/10224921/how-to-create-a-temporary-file-with-portable-shell-in-a-secure-way#comment86787877_10235393
# Adjusted, known to work on aix 7.1.0.0
if [ -r "/dev/urandom" ]; then
    RNDM="/dev/urandom"
else
    RNDM="/dev/random"
fi
length=7; safetemp=$(od -An -N${length} -tx1 ${RNDM} | tr -d ' \t').cfengine-watchdog
COLLECTION_DIR="${TMPDIR}/${safetemp}"
mkdir -p "${COLLECTION_DIR}"
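# The 7 random bytes give 14 hex characters, so COLLECTION_DIR ends up looking
# like /tmp/0f3a9c1b2d4e5f.cfengine-watchdog (illustrative value).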
COLLECTION_REPORT="${COLLECTION_DIR}/README.org"
echo "#+Title: CFEngine Watchdog Engineering Summary [$(date '+%Y-%m-%d %a')]" >> ${COLLECTION_REPORT}
echo "* Pathologies" >> ${COLLECTION_REPORT}

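# On AIX (the platform this script targets), the third column of `df -k` output
# is the free space in 1024-byte blocks, so 500000 here is roughly 500MB.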
ARCHIVE_DIR_FREEk=$(df -k ${ARCHIVE_DIR} | tail -n 1 | awk '{print $3}')
if [ "${ARCHIVE_DIR_FREEk}" -lt 500000 ]; then
    echo "$(date) ${ARCHIVE_DIR} has less than 500MB free space, initiating watchdog archive cleanup" | tee -a ${LOGFILE}
    # Archive dir has less than 500MB free; clean up, keeping the oldest and newest archives.
    # Archive names are date-prefixed, so plain ls order is chronological (oldest first).
    files=$(ls ${ARCHIVE_DIR})
    count=$(echo $files | wc -w)
    _counter=0
    for i in $files; do
        if [ "${_counter}" -eq 0 ]; then
            echo "Not deleting $i; it's the oldest and may contain valuable information about the first event"
        elif [ "${_counter}" -eq "$(( $count - 1))" ]; then
            echo "Not deleting $i; it's the most recent"
        else
            echo "Delete $i"
            rm "${ARCHIVE_DIR}/$i"
        fi
        _counter=$((1 + ${_counter}))
    done
    ARCHIVE_DIR_FREEk=$(df -k ${ARCHIVE_DIR} | tail -n 1 | awk '{print $3}')
    if [ "${ARCHIVE_DIR_FREEk}" -lt 500000 ]; then
        echo "$(date) ${ARCHIVE_DIR} still has less than 500MB free space after cleaning up archives." | tee -a ${LOGFILE}
        echo "$(date) Aborting watchdog $$" | tee -a ${LOGFILE}
        cleanup
        exit 1
    fi
fi

# Check free space in tmp second, in case tmp shares a filesystem with the
# archives and the cleanup above already freed space.
TMPDIR_FREEk=$(df -k ${TMPDIR} | tail -n 1 | awk '{print $3}')
if [ "${TMPDIR_FREEk}" -lt 500000 ]; then
    echo "$(date) ${TMPDIR} has less than 500MB free space" | tee -a ${LOGFILE}
    echo "$(date) Aborting watchdog $$" | tee -a ${LOGFILE}
    cleanup
    exit 1
fi

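# Pathology checks: each finding below is recorded in the report and, except
# for the cf-check integrity check, increments PATHOLOGY_COUNT; exceeding
# PATHOLOGY_THRESHOLD triggers the collection and remediation section further down.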
# Pathology #1: cf-execd is not running.
# While not strictly a pathology, it is non-standard to run cf-agent without cf-execd.
ps -ef -o args | grep "^/var/cfengine/bin/[c]f-execd" > "${COLLECTION_DIR}/ps_grep_cf-execd.txt"
_COUNT_CF_EXECD_PROCS="$(wc -l < ${COLLECTION_DIR}/ps_grep_cf-execd.txt)"
if [ "${_COUNT_CF_EXECD_PROCS}" -lt "1" ]; then
    echo "$(date) Found cf-execd not running" >> ${LOGFILE}
    echo "- cf-execd not running" >> ${COLLECTION_REPORT}
    PATHOLOGY_COUNT=$((${PATHOLOGY_COUNT}+1))
fi

# Pathology #1.5: More than one cf-execd is running.
if [ "${_COUNT_CF_EXECD_PROCS}" -gt "1" ]; then
    echo "$(date) Found ${_COUNT_CF_EXECD_PROCS} cf-execd processes running" >> ${LOGFILE}
    echo "- Found ${_COUNT_CF_EXECD_PROCS} cf-execd processes running" >> ${COLLECTION_REPORT}
    PATHOLOGY_COUNT=$((${PATHOLOGY_COUNT}+1))
fi

# Pathology #2: cf-agent execution times exceed body executor control agent_expireafter
# cf-agent processes launched by cf-execd have not communicated back to cf-execd
# within $(agent_expireafter). This may be OK, but may also indicate that
# cf-agent is hung.
# Outputs could be empty because:
# - cf-execd hasn't executed cf-agent since purging outputs
# - cf-agent is not producing output (the expected normal state)
if [ "$(ls -A ${OUTPUTS_DIR})" ]; then
    observation=$(find "${OUTPUTS_DIR}" -type f ! -name previous | xargs grep "cf-execd: timeout waiting for output from agent")
    if [ -n "$observation" ]; then
        # The command substitution strips the trailing newline, so wc -l
        # under-counts by one; hence the +1.
        count=$(expr 1 + $(echo -n "${observation}" | wc -l))
        echo "$(date) Found ${count} occurrences of cf-execd terminating unresponsive cf-agent" >> ${LOGFILE}
        echo "- ${count} cf-agent terminations" >> ${COLLECTION_REPORT}
        PATHOLOGY_COUNT=$((${PATHOLOGY_COUNT}+1))
    fi
fi

# Pathology #3: cf-agent process(es) running longer than expected
# While not strictly a pathology, this may indicate a hung agent.
# On AIX, etime (and etimes) is displayed in human readable form,
# e.g.:
#    USER      PID     ELAPSED COMMAND
#    root 10551366  5-00:35:58 /var/cfengine/bin/cf-agent
# We have to convert that into seconds so that we can determine if it's been
# running longer than we expect; that's what the second awk command is for.
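# For example, an ELAPSED of 5-00:35:58 (5 days, 0 hours, 35 minutes, 58 seconds)
# converts to ((5*24+0)*60+35)*60+58 = 434158 seconds, well over the 300s threshold.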
observation=$(ps -e -o user,pid,etime,args | awk 'FNR == 1 {next} /\/var\/cfengine\/bin\/cf-agent/ {print $3}' | awk 'BEGIN { FS = ":" } \
{
  if (NF == 2) {
    etimes = $1*60 + $2
  } else if (NF == 3) {
    split($1, a, "-");
    if (a[2] != "" ) {
      etimes = ((a[1]*24+a[2])*60 + $2) * 60 + $3;
    } else {
      etimes = ($1*60 + $2) * 60 + $3;
    }
  }
  if (etimes > 300) {
    print
  }
}')
if [ ${#observation} -gt 0 ]; then
    count=$(expr 1 + $(echo -n "${observation}" | wc -l))
    echo "$(date) Found ${count} cf-agent processes running longer than 300s" >> ${LOGFILE}
    echo "- ${count} cf-agent processes running longer than 300s" >> ${COLLECTION_REPORT}
    PATHOLOGY_COUNT=$((${PATHOLOGY_COUNT}+1))
fi

# Pathology #4: High number of concurrent cf-agent processes
# While cf-agent is designed to be able to run concurrently, having many
# concurrent cf-agent processes may indicate an issue such as an agent pile-up.
observation=$(ps -e -o pid,etime,args | awk '/\/var\/cfengine\/bin\/cf-agent/ {print}')
if [ ${#observation} -gt 0 ]; then
    count=$(expr 1 + $(echo -n "${observation}" | wc -l))
    if [ ${count} -gt 3 ]; then
      echo "$(date) Found ${count} concurrently running agents" >> ${LOGFILE}
      echo "- ${count} concurrently running cf-agent processes" >> ${COLLECTION_REPORT}
      PATHOLOGY_COUNT=$((${PATHOLOGY_COUNT}+1))
    fi
fi

# Pathology #5: cf-check has encountered a critical issue
# A non-zero exit indicates one or more integrity issues in the local LMDBs
if [ -x /var/cfengine/bin/cf-check ]; then
    observation=$(/var/cfengine/bin/cf-check diagnose /var/cfengine/state/*.lmdb)
    if [ $? -ne 0 ]; then
      echo "$(date) cf-check observed critical integrity issues" >> ${LOGFILE}
      echo "- cf-check observed critical integrity issues" >> ${COLLECTION_REPORT}
      echo "  #+begin_example\n${observation}\n#+end_example" >> ${COLLECTION_REPORT}
    fi
fi
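# Note: unlike the other checks, a cf-check failure is recorded in the report
# but does not increment PATHOLOGY_COUNT, so on its own it will not trigger
# remediation.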

if [ "${PATHOLOGY_COUNT}" -gt "${PATHOLOGY_THRESHOLD}" ]; then

    echo "$(date) Found ${PATHOLOGY_COUNT} symptoms, threshold (${PATHOLOGY_THRESHOLD}) breached." | tee -a ${LOGFILE}
    echo "* Observations" >> ${COLLECTION_REPORT}

    ps auxwww > "${COLLECTION_DIR}/ps_auxwww.txt"
    echo "- [[./ps_auxwww.txt][~ps auxwww~]]" >> ${COLLECTION_REPORT}

    ps -elf > "${COLLECTION_DIR}/ps_-elf.txt"
    echo "- [[./ps_-elf.txt][~ps -elf~]]" >> ${COLLECTION_REPORT}

    find "${CFENGINE_WORKDIR}" >> "${COLLECTION_DIR}/find__var_cfengine.txt"
    echo "- [[./find__var_cfengine.txt][=/var/cfengine= file list]] - Before remediation" >> ${COLLECTION_REPORT}

    tar -c -f "${COLLECTION_DIR}/sys.workdir-before-remediation.tar" -C "${CFENGINE_WORKDIR}" state outputs
    echo "- [[./sys.workdir-before-remediation.tar][=sys.workdir-before-remediation.tar=]] - CFEngine WORKDIR artifacts before remediation" >> ${COLLECTION_REPORT}

    # The [c]f- pattern keeps the grep process itself out of the match list
    ps -efl | grep "[c]f-" > "${COLLECTION_DIR}/cf-procs.txt"
    echo "- [[./cf-procs.txt][~ps -efl | grep cf-~]]" >> ${COLLECTION_REPORT}

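    # For each collected cf- process: capture a core with gencore, pull a
    # backtrace out of it with dbx, then SIGKILL the process. The PID is taken
    # from field 4 and the command path from field 15 of the ps -efl output
    # above (the column layout observed on the AIX releases this watchdog targets).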
    echo "$(date) Initiating apoptosis" | tee -a ${LOGFILE}
    while IFS= read -r proc; do
        _PID=$(echo $proc | awk '{print $4}')
        _PROC=$(echo $proc | awk '{print $15}')
        _PROCFILE=$(echo "$_PROC" | sed 's,/,_,g')
        _COREFILE=$(printf "%s_%s.core" "$_PID" "$_PROCFILE")
        _COREPATH=$(printf "%s/%s" "${COLLECTION_DIR}" "$_COREFILE")
        _DBX_ERR_LOG=$(printf "%s/%s.dbx.err" "${COLLECTION_DIR}" "$_COREFILE")

        gencore "$_PID" "$_COREPATH" > "${COLLECTION_DIR}/gencore-$_PID.output" 2>&1
        echo "- [[./gencore-$_PID.output][output from gencore $_PID $_COREPATH]]" >> ${COLLECTION_REPORT}
        echo "- [[./$_COREFILE][core from $_PID]]" >> ${COLLECTION_REPORT}

        observation=$(echo "where" | dbx "$_PROC" "$_COREPATH" 2> "$_DBX_ERR_LOG")
        echo "  - backtrace:" >> ${COLLECTION_REPORT}
        echo "    #+begin_example\n${observation}\n#+end_example" >> ${COLLECTION_REPORT}
        echo "  - dbx stderr: [[./$(basename $_DBX_ERR_LOG)][dbx $_PROC $_COREPATH]]" >> ${COLLECTION_REPORT}
        kill -s SIGKILL "$_PID" > "${COLLECTION_DIR}/kill_$_PID.txt" 2>&1
        echo "  - [[./kill_$_PID.txt][~kill -s SIGKILL $_PID~]]" >> ${COLLECTION_REPORT}

    done < "${COLLECTION_DIR}/cf-procs.txt"

    echo "- Purged outputs (don't want them to trigger pathology remediation more than once)" >> ${COLLECTION_REPORT}
    for each in $(ls -A "${CFENGINE_WORKDIR}/outputs/"); do
        rm "${CFENGINE_WORKDIR}/outputs/${each}"
    done

    # Switch to more intelligent cf-check repair in 3.12.3
    echo "- Purged LMDBs" >> ${COLLECTION_REPORT}
    for each in ${CFENGINE_WORKDIR}/state/*.lmdb*; do
        [ -e "${each}" ] || continue  # skip the literal pattern if nothing matched
        rm "${each}"
    done

    tar -c -f "${COLLECTION_DIR}/sys.workdir-after-remediation.tar" -C "${CFENGINE_WORKDIR}" state outputs
    echo "- [[./sys.workdir-after-remediation.tar][=sys.workdir-after-remediation.tar=]] - CFEngine WORKDIR artifacts after remediation" >> ${COLLECTION_REPORT}

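    # Anastasis: bring the CFEngine daemons back up by retrying the
    # all_cfengine_daemons_running helper defined at the top of the script.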
####+begin_critical
    # We will re-try up to 10 times to get all the daemons running
    echo "$(date) Initiating anastasis" | tee -a ${LOGFILE}
    for try in 1 2 3 4 5 6 7 8 9 10; do
        echo "- Anastasis [[./remediation-re-start-try-${try}.log][try ${try}]]" >> ${COLLECTION_REPORT}
        if all_cfengine_daemons_running > "${COLLECTION_DIR}/remediation-re-start-try-${try}.log"; then break; fi
        if [ "${try}" -ne 10 ]; then
            echo "Attempt $((1 + ${try})) of 10 in 10 seconds ..." >> ${LOGFILE}
            sleep 10
        else
            echo "$(date) Failed to bring all services online after 10 retries, giving up" | tee -a ${LOGFILE}
            echo "- Failed to bring all services online after 10 retries, giving up" >> ${COLLECTION_REPORT}
        fi
    done
####+end_critical


    tar -c -f "${COLLECTION_DIR}/sys.workdir-after-restarting.tar" -C "${CFENGINE_WORKDIR}" state outputs
    echo "- [[./sys.workdir-after-restarting.tar][=sys.workdir-after-restarting.tar=]] - CFEngine WORKDIR artifacts after restarting" >> ${COLLECTION_REPORT}

    cp $0 ${COLLECTION_DIR}/watchdog
    echo "- [[./watchdog][=watchdog=]] - The watchdog that generated this report" >> ${COLLECTION_REPORT}

    # Package the artifacts together
    tar -c -f "${COLLECTION_DIR}.tar" -C "${COLLECTION_DIR}" .
    gzip "${COLLECTION_DIR}.tar"
    # Compute the archive name once so the log message matches the file actually written
    _ARCHIVE_PATH="${ARCHIVE_DIR}/$(date +%Y-%m-%d_%s).tar.gz"
    mv "${COLLECTION_DIR}.tar.gz" "${_ARCHIVE_PATH}"
    echo "$(date) Collected artifacts stored in ${_ARCHIVE_PATH}" | tee -a ${LOGFILE}
elif [ "${PATHOLOGY_COUNT}" -gt 0 ]; then
    echo "$(date) Found ${PATHOLOGY_COUNT} symptoms, threshold (${PATHOLOGY_THRESHOLD}) not breached." | tee -a ${LOGFILE}
else
    echo "$(date) Found ${PATHOLOGY_COUNT} symptoms, threshold (${PATHOLOGY_THRESHOLD}) not breached, no remediation or collection performed" >> ${LOGFILE}
fi
echo "$(date) DONE watchdog $$" >> ${LOGFILE}

cleanup
314