1 /*
2 * apcaction.c
3 *
4 * Actions taken when something happens to the UPS.
5 */
6
7 /*
8 * Copyright (C) 2000-2004 Kern Sibbald
9 * Copyright (C) 1996-1999 Andre M. Hedrick <andre@suse.com>
10 * Copyright (C) 1999-2000 Riccardo Facchetti <riccardo@master.oasi.gpa.it>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of version 2 of the GNU General
14 * Public License as published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the Free
23 * Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
24 * MA 02110-1335, USA.
25 */
26
27 #include "apc.h"
28
29 extern int kill_on_powerfail;
30 static void do_shutdown(UPSINFO *ups, int cmdtype);
31
32 /*
33 * These are the commands understood by the apccontrol shell script.
34 * You _must_ keep the the commands[] array in sync with the defines in
35 * include/apc_defines.h
36 */
37 UPSCOMMANDS ups_event[] = {
38 {"powerout", 0}, /* CMDPOWEROUT */
39 {"onbattery", 0}, /* CMDONBATTERY */
40 {"failing", 0}, /* CMDFAILING */
41 {"timeout", 0}, /* CMDTIMEOUT */
42 {"loadlimit", 0}, /* CMDLOADLIMIT */
43 {"runlimit", 0}, /* CMDRUNLIMIT */
44 {"doshutdown", 0}, /* CMDDOSHUTDOWN */
45 {"mainsback", 0}, /* CMDMAINSBACK */
46 {"annoyme", 0}, /* CMDANNOYME */
47 {"emergency", 0}, /* CMDEMERGENCY */
48 {"changeme", 0}, /* CMDCHANGEME */
49 {"remotedown", 0}, /* CMDREMOTEDOWN */
50 {"commfailure", 0}, /* CMDCOMMFAILURE */
51 {"commok", 0}, /* CMDCOMMOK */
52 {"startselftest", 0}, /* CMDSTARTSELFTEST */
53 {"endselftest", 0}, /* CMDENDSELFTEST */
54 {"offbattery", 0}, /* CMDOFFBATTERY */
55 {"battdetach", 0}, /* CMDBATTDETACH */
56 {"battattach", 0} /* CMDBATTATTACH */
57 };
58
59 /*
60 * These messages must be kept in sync with the above array
61 * and the defines in include/apc_defines.h
62 */
63 UPSCMDMSG event_msg[] = {
64 {LOG_CRIT, "Power failure."},
65 {LOG_CRIT, "Running on UPS batteries."},
66 {LOG_ALERT, "Battery power exhausted."},
67 {LOG_ALERT, "Reached run time limit on batteries."},
68 {LOG_ALERT, "Battery charge below low limit."},
69 {LOG_ALERT, "Reached remaining time percentage limit on batteries."},
70 {LOG_ALERT, "Initiating system shutdown!"},
71 {LOG_ALERT, "Power is back. UPS running on mains."},
72 {LOG_ALERT, "Users requested to logoff."},
73 {LOG_ALERT, "Battery failure. Emergency."},
74 {LOG_CRIT, "UPS battery must be replaced."},
75 {LOG_CRIT, "Remote shutdown requested."},
76 {LOG_WARNING, "Communications with UPS lost."},
77 {LOG_WARNING, "Communications with UPS restored."},
78 {LOG_WARNING, "UPS Self Test switch to battery."},
79 {LOG_WARNING, "UPS Self Test completed."},
80 {LOG_CRIT, "Mains returned. No longer on UPS batteries."},
81 {LOG_CRIT, "Battery disconnected."},
82 {LOG_CRIT, "Battery reattached."}
83 };
84
generate_event(UPSINFO * ups,int event)85 void generate_event(UPSINFO *ups, int event)
86 {
87 /* Log message and execute script for this event */
88 log_event(ups, event_msg[event].level, event_msg[event].msg);
89 Dmsg(80, "calling execute_ups_event %s event=%d\n", ups_event[event], event);
90 execute_command(ups, ups_event[event]);
91
92 /*
93 * Additional possible actions. For certain, we now do a
94 * shutdown
95 */
96 switch (event) {
97 /*
98 * For the following, in addition to the basic,
99 * message logged and executed above, we do a
100 * system shutdown.
101 */
102 case CMDFAILING:
103 case CMDTIMEOUT:
104 case CMDRUNLIMIT:
105 case CMDLOADLIMIT:
106 case CMDEMERGENCY:
107 case CMDREMOTEDOWN:
108 log_event(ups, event_msg[CMDDOSHUTDOWN].level,
109 event_msg[CMDDOSHUTDOWN].msg);
110 do_shutdown(ups, CMDDOSHUTDOWN);
111 break;
112
113 /* For the following, everything is already done. */
114 case CMDSTARTSELFTEST:
115 case CMDENDSELFTEST:
116 case CMDCOMMFAILURE:
117 case CMDCOMMOK:
118 case CMDCHANGEME:
119 case CMDANNOYME:
120 case CMDMAINSBACK:
121 case CMDDOSHUTDOWN: /* Already shutdown, don't recall */
122 case CMDPOWEROUT:
123 case CMDONBATTERY:
124 case CMDOFFBATTERY:
125 case CMDBATTDETACH:
126 case CMDBATTATTACH:
127 default:
128 break;
129
130 }
131 }
132
133 /*
134 * Closes procfile and logfile to preserve information.
135 *
136 * ok = 1 => power is back
137 * ok = 2 => power failure
138 * ok = 3 => remote shutdown
139 */
powerfail(int ok)140 static void powerfail(int ok)
141 {
142 /*
143 * If apcctrl terminates here, it will never get a chance to
144 * report the event of returning mains-power. I think apcctrl
145 * has no need to force terminate() by itself. It will receive
146 * a SIGTERM from init, when system goes down. This signal is
147 * trapped and will trigger apcctrl's terminate() function.
148 */
149
150 if (ok == 2) {
151 clear_files();
152 if (terminate_on_powerfail) {
153 /*
154 * This sends a SIGTERM signal to itself.
155 * The SIGTERM is bound to apcctrl_ or apctest_terminate(),
156 * depending on which program is running this code, so it will
157 * do in anyway the right thing.
158 */
159 sendsig_terminate();
160 }
161 }
162
163 /*
164 * For network slaves, apcctrl needs to terminate here for now.
165 * This is sloppy, but it works. If you are networked, then the
166 * master must fall also. This is required so that the UPS
167 * can reboot the slaves.
168 */
169 if (ok == 3)
170 sendsig_terminate();
171 }
172
173 /*
174 * If called with zero, prevent users from logging in.
175 * If called with one, allow users to login.
176 */
logonfail(UPSINFO * ups,int ok)177 static void logonfail(UPSINFO *ups, int ok)
178 {
179 int lgnfd;
180
181 unlink(ups->nologinpath);
182
183 if (ok == 0 &&
184 ((lgnfd = open(ups->nologinpath, O_CREAT | O_WRONLY | O_CLOEXEC, 0644)) >= 0)) {
185 write(lgnfd, POWERFAIL, strlen(POWERFAIL));
186 close(lgnfd);
187 }
188 }
189
prohibit_logins(UPSINFO * ups)190 static void prohibit_logins(UPSINFO *ups)
191 {
192 if (ups->nologin_file)
193 return; /* already done */
194
195 logonfail(ups, 0);
196 ups->nologin_file = true;
197
198 log_event(ups, LOG_ALERT, "User logins prohibited");
199 }
200
do_shutdown(UPSINFO * ups,int cmdtype)201 static void do_shutdown(UPSINFO *ups, int cmdtype)
202 {
203 if (ups->is_shutdown())
204 return; /* already done */
205
206 ups->ShutDown = time(NULL);
207 ups->set_shutdown();
208 delete_lockfile(ups);
209 ups->set_fastpoll();
210 make_file(ups, ups->pwrfailpath);
211 prohibit_logins(ups);
212
213 if (!ups->is_slave()) {
214 /*
215 * Note, try avoid using this option if at all possible
216 * as it will shutoff the UPS power, and you cannot
217 * be guaranteed that the shutdown command will have
218 * succeeded. This PROBABLY should be executed AFTER
219 * the shutdown command is given (the execute_command below).
220 */
221 if (kill_on_powerfail)
222 initiate_hibernate(ups);
223 }
224
225 /* Now execute the shutdown command */
226 execute_command(ups, ups_event[cmdtype]);
227
228 /*
229 * On some systems we may stop on the previous
230 * line if a SIGTERM signal is sent to us.
231 */
232
233 if (cmdtype == CMDREMOTEDOWN)
234 powerfail(3);
235 else
236 powerfail(2);
237 }
238
/*
 * The different "states" that the UPS can be in, as classified by
 * get_state() and dispatched on in do_action().
 */
enum a_state {
   st_PowerFailure,   /* on battery, and the on-battery bit just changed */
   st_SelfTest,       /* on battery with ups->SelfTest set */
   st_OnBattery,      /* on battery, not a self test */
   st_MainsBack,      /* off battery, and the on-battery bit just changed */
   st_OnMains,        /* off battery, no change */
   st_Calibration     /* on battery while the calibration flag is set */
};
248
249 /*
250 * Figure out what "state" the UPS is in and
251 * return it for use in do_action()
252 */
get_state(UPSINFO * ups,time_t now)253 static enum a_state get_state(UPSINFO *ups, time_t now)
254 {
255 enum a_state state;
256
257 /* If we're on battery for calibration, treat as not on battery */
258 if (ups->is_onbatt()) {
259 if (ups->is_calibration()) {
260 state = st_Calibration;
261 } else if (ups->chg_onbatt()) {
262 state = st_PowerFailure; /* Power failure just detected */
263 } else {
264 if (ups->SelfTest) /* see if UPS is doing self test */
265 state = st_SelfTest; /* yes */
266 else
267 state = st_OnBattery; /* No, this must be real power failure */
268 }
269 } else {
270 if (ups->chg_onbatt()) /* if we were on batteries */
271 state = st_MainsBack; /* then we just got power back */
272 else
273 state = st_OnMains; /* Solid on mains, normal condition */
274 }
275 return state;
276 }
277
testresult_to_string(SelfTestResult res)278 static const char *testresult_to_string(SelfTestResult res)
279 {
280 switch (res) {
281 case TEST_NA:
282 return "Not supported";
283 case TEST_NONE:
284 return "No test results available";
285 case TEST_FAILED:
286 return "Test failed";
287 case TEST_WARNING:
288 return "Warning";
289 case TEST_INPROGRESS:
290 return "In progress";
291 case TEST_PASSED:
292 return "Battery OK";
293 case TEST_FAILCAP:
294 return "Test failed -- insufficient battery capacity";
295 case TEST_FAILLOAD:
296 return "Test failed -- battery overloaded";
297 case TEST_UNKNOWN:
298 default:
299 return "Unknown";
300 }
301 }
302
303 /*
304 * Carl Lindberg <lindberg@clindberg.org> patch applied 24Dec04
305 *
306 * The APC network management cards have options to shut down, reboot, or
307 * "sleep" (really just a delayed reboot) the UPS. For all of these, it
308 * has a "graceful" option, meaning it gives the PowerChute software a
309 * chance to cleanly shutdown the machine before the UPS is shut down. To
310 * do this, the card sets the ONBATT and LOWBATT statuses at the same
311 * time, waits several minutes, then cuts power. PowerChute (presumably)
312 * notices this and shuts the machine down, but unfortunately apcctrl did
313 * not.
314 *
315 * The problem happens because in this situation, apcctrl sets the
316 * UPS_prev_battlow status before testing for it. In the do_action()
317 * function, apcctrl notices the ONBATT status, and uses the
318 * "st_PowerFailure" state to send off an initial power failure event.
319 * After a short delay, do_action() is invoked again. If ONBATT is
320 * still set, the "st_OnBattery" state is used, and the onbattery event
321 * (among other things) is sent.
322 *
323 * The test for LOWBATT to see if shutdown is needed is only done in the
324 * st_OnBattery state, and it's done if LOWBATT is set but
325 * UPS_prev_battlow is not set yet. In normal operation, LOWBATT will
326 * only come on after a period of ONBATT, and this situation works fine.
327 * However, since ONBATT and LOWBATT were set simultaneously, the
328 * UPS_prev_battlow was set the first time through, when the
329 * st_PowerFailure was used, meaning the test for LOWBATT was not
330 * performed. The second time through in st_OnBattery, UPS_prev_battlow
331 * is already set, meaning apcctrl is assuming that the needed shutdown
332 * has already been invoked.
333 *
334 * The code fix just moves setting of the UPS_prev_battlow status to
335 * inside the block that tests for it, ensuring that LOWBATT will never be
336 * ignored. Clearing the UPS_prev_battlow status remains where it is in
337 * the code, and it will always be turned off if LOWBATT is no longer set.
338 *
339 * After the fix, UPS_prev_battlow is not prematurely set, and apcctrl
340 * catches the signal from the management card to shut down. I've had the
341 * code in for over a month, and it's worked fine, both from using the
342 * management card and regular pull-the-plug tests as well. This was
343 * only tested with a serial UPS, but I assume it would be a problem with
344 * USB and SNMP connections as well.
345 */
346
347 /*********************************************************************/
do_action(UPSINFO * ups)348 void do_action(UPSINFO *ups)
349 {
350 time_t now;
351 static int requested_logoff = 0; /* asked user to logoff */
352 static int first = 1;
353 enum a_state state;
354
355 write_lock(ups);
356
357 time(&now); /* get current time */
358 if (first) {
359 first = 0;
360 ups->last_time_nologon = ups->last_time_annoy = now;
361 ups->last_time_on_line = now;
362
363 /*
364 * This is cheating slightly. We want to initialize the previous
365 * status to zero so all set bits in current status will appear
366 * as changes, thus allowing us to handle starting up when power
367 * has already failed, for instance. However, we don't want to
368 * get a BATTATTACHED event every time the daemon starts, so we
369 * set the UPS_battpresent bit in the previous status.
370 */
371 ups->PrevStatus = UPS_battpresent;
372 }
373
374 if (ups->is_replacebatt()) { /* Replace battery */
375 /*
376 * Complain every 9 hours, this causes the complaint to
377 * cycle around the clock and hopefully be more noticable
378 * without being too annoying. Also, ignore all change battery
379 * indications for the first 10 minutes of running time to
380 * prevent false alerts. Finally, issue the event 5 times, then
381 * clear the flag to silence false alarms. If the battery is
382 * really dead, the flag will be reset in apcsmart.c
383 *
384 * UPS_replacebatt is a flag. To count use a static local counter.
385 * The counter is initialized only one time at startup.
386 */
387 if (now - ups->start_time < 60 * 10 || ups->ChangeBattCounter > 5) {
388 ups->clear_replacebatt();
389 ups->ChangeBattCounter = 0;
390 } else if (now - ups->last_time_changeme > 60 * 60 * 9) {
391 generate_event(ups, CMDCHANGEME);
392 ups->last_time_changeme = now;
393 ups->ChangeBattCounter++;
394 }
395 }
396
397 /* Remote is shutting down, so must we. */
398 if (ups->is_shut_remote()) {
399 if (ups->chg_shut_remote()) {
400 generate_event(ups, CMDREMOTEDOWN);
401 }
402 ups->PrevStatus = ups->Status;
403 write_unlock(ups);
404 return;
405 }
406
407 /* Generate event if battery is disconnected or reattached */
408 if (ups->chg_battpresent()) {
409 if (ups->is_battpresent())
410 generate_event(ups, CMDBATTATTACH);
411 else
412 generate_event(ups, CMDBATTDETACH);
413 }
414
415 /*
416 * Did BattLow bit go high? If so, start the battlow shutdown
417 * timer. We will only act on this timer if we switch to battery
418 * (or are already on battery). It is possible that this event occurs
419 * at the same time as or even slightly before we switch to battery.
420 * Therefore we must check it every time we get new status.
421 */
422 if (ups->chg_battlow()) {
423 if (ups->is_battlow()) {
424 Dmsg(100, "BATTLOW asserted\n");
425 ups->start_shut_lbatt = now;
426 } else {
427 Dmsg(100, "BATTLOW glitch\n");
428 }
429 }
430
431 state = get_state(ups, now);
432 switch (state) {
433 case st_OnMains:
434 /* If power is good, update the timers. */
435 ups->last_time_nologon = ups->last_time_annoy = now;
436 ups->last_time_on_line = now;
437 ups->clear_fastpoll();
438 break;
439
440 case st_PowerFailure:
441 /* This is our first indication of a power problem */
442 ups->set_fastpoll(); /* speed up polling */
443
444 /* Check if selftest */
445 Dmsg(80, "Power failure detected. 0x%x\n", ups->Status);
446 device_entry_point(ups, DEVICE_CMD_CHECK_SELFTEST, NULL);
447
448 if (ups->SelfTest)
449 generate_event(ups, CMDSTARTSELFTEST);
450 else
451 generate_event(ups, CMDPOWEROUT);
452
453 ups->last_time_nologon = ups->last_time_annoy = now;
454 ups->last_time_on_line = now;
455 ups->last_onbatt_time = now;
456 ups->num_xfers++;
457
458 /* Enable DTR on dumb UPSes with CUSTOM_SIMPLE cable. */
459 device_entry_point(ups, DEVICE_CMD_DTR_ENABLE, NULL);
460 break;
461
462 case st_SelfTest:
463 /* allow 40 seconds max for selftest */
464 if (now - ups->SelfTest < 40 && !ups->is_battlow())
465 break;
466
467 /* Cancel self test, announce power failure */
468 ups->SelfTest = 0;
469 Dmsg(80, "UPS Self Test cancelled, fall-thru to On Battery. 0x%x\n",
470 ups->Status);
471 break;
472
473 /* ...FALL-THRU to st_OnBattery... */
474
475 case st_OnBattery:
476 /* Did the second test verify the power is failing? */
477 if (!ups->is_onbatt_msg() &&
478 time(NULL) - ups->last_time_on_line >= ups->onbattdelay) {
479 ups->set_onbatt_msg(); /* it is confirmed, we are on batteries */
480 generate_event(ups, CMDONBATTERY);
481 ups->last_time_nologon = ups->last_time_annoy = now;
482 ups->last_time_on_line = now;
483 break;
484 }
485
486 /* shutdown requested but still running */
487 if (ups->is_shutdown()) {
488 if (ups->killdelay && now - ups->ShutDown >= ups->killdelay) {
489 if (!ups->is_slave())
490 initiate_hibernate(ups);
491 ups->ShutDown = now; /* wait a bit before doing again */
492 ups->set_shutdown();
493 }
494 } else { /* not shutdown yet */
495 /*
496 * Did MaxTimeOnBattery Expire? (TIMEOUT in apcctrl.conf)
497 * Normal Power down during Power Failure: Shutdown immediately.
498 */
499 if ((ups->maxtime > 0) && ((now - ups->last_time_on_line) > ups->maxtime)) {
500 ups->set_shut_btime();
501 generate_event(ups, CMDTIMEOUT);
502 break;
503 }
504
505 /*
506 * Did Battery Charge or Runtime go below percent cutoff?
507 * Normal Power down during Power Failure: Start shutdown timer.
508 */
509 if (ups->UPS_Cap[CI_BATTLEV] && ups->BattChg <= ups->percent) {
510 if (!ups->is_shut_load()) {
511 Dmsg(100, "CI_BATTLEV shutdown\n");
512 ups->set_shut_load();
513 ups->start_shut_load = now;
514 }
515 } else {
516 if (ups->UPS_Cap[CI_BATTLEV] && ups->is_shut_load())
517 Dmsg(100, "CI_BATTLEV glitch\n");
518 ups->clear_shut_load();
519 }
520
521 if (ups->UPS_Cap[CI_RUNTIM] && ups->TimeLeft <= ups->runtime) {
522 if (!ups->is_shut_ltime()) {
523 Dmsg(100, "CI_RUNTIM shutdown\n");
524 ups->set_shut_ltime();
525 ups->start_shut_ltime = now;
526 }
527 } else {
528 if (ups->UPS_Cap[CI_RUNTIM] && ups->is_shut_ltime())
529 Dmsg(100, "CI_RUNTIM glitch\n");
530 ups->clear_shut_ltime();
531 }
532
533 /*
534 * Check for expired shutdown timers and act on them.
535 */
536 if (ups->is_battlow() && ((now - ups->start_shut_lbatt) >= 5)) {
537 generate_event(ups, CMDFAILING);
538 break;
539 }
540 if (ups->is_shut_load() && ((now - ups->start_shut_load) >= 5)) {
541 generate_event(ups, CMDLOADLIMIT);
542 break;
543 }
544 if (ups->is_shut_ltime() && ((now - ups->start_shut_ltime) >= 5)) {
545 generate_event(ups, CMDRUNLIMIT);
546 break;
547 }
548
549 /*
550 * We are on batteries, the battery is low, and the power is not
551 * down ==> the battery is dead. KES Sept 2000
552 *
553 * Then the battery has failed!!!
554 * Must do Emergency Shutdown NOW
555 */
556 if (ups->is_battlow() && ups->is_online()) {
557 ups->set_shut_emerg();
558 generate_event(ups, CMDEMERGENCY);
559 }
560
561 /* Announce to LogOff, with initial delay. */
562 if (((now - ups->last_time_on_line) > ups->annoydelay) &&
563 ((now - ups->last_time_annoy) > ups->annoy) && ups->nologin_file) {
564 if (!requested_logoff) {
565 /* generate log message once */
566 generate_event(ups, CMDANNOYME);
567 } else {
568 /* but execute script every time */
569 execute_command(ups, ups_event[CMDANNOYME]);
570 }
571
572 time(&ups->last_time_annoy);
573 requested_logoff = true;
574 }
575
576 /* Delay NoLogons. */
577 if (!ups->nologin_file) {
578 switch (ups->nologin.type) {
579 case NEVER:
580 break;
581 case TIMEOUT:
582 if ((now - ups->last_time_nologon) > ups->nologin_time)
583 prohibit_logins(ups);
584 break;
585 case PERCENT:
586 if (ups->UPS_Cap[CI_BATTLEV] && ups->nologin_time >= ups->BattChg)
587 prohibit_logins(ups);
588 break;
589 case MINUTES:
590 if (ups->UPS_Cap[CI_RUNTIM] && ups->nologin_time >= ups->TimeLeft)
591 prohibit_logins(ups);
592 break;
593 case ALWAYS:
594 default:
595 prohibit_logins(ups);
596 break;
597 }
598 }
599 }
600 break;
601
602 case st_MainsBack:
603 /* The power is back after a power failure or a self test */
604 if (ups->is_onbatt_msg()) {
605 ups->clear_onbatt_msg();
606 generate_event(ups, CMDOFFBATTERY);
607 }
608
609 if (ups->SelfTest) {
610 ups->LastSelfTest = ups->SelfTest;
611 ups->SelfTest = 0;
612
613 /* Get last selftest results, only for smart UPSes. */
614 device_entry_point(ups, DEVICE_CMD_GET_SELFTEST_MSG, NULL);
615 log_event(ups, LOG_ALERT, "UPS Self Test completed: %s",
616 testresult_to_string(ups->testresult));
617 execute_command(ups, ups_event[CMDENDSELFTEST]);
618 } else {
619 generate_event(ups, CMDMAINSBACK);
620 }
621
622 if (ups->nologin_file)
623 log_event(ups, LOG_ALERT, "Allowing logins");
624
625 logonfail(ups, 1);
626 ups->nologin_file = false;
627 requested_logoff = false;
628 device_entry_point(ups, DEVICE_CMD_DTR_ST_DISABLE, NULL);
629 ups->last_offbatt_time = now;
630
631 /*
632 * Sanity check. Sometimes only first power problem trips
633 * thus last_onbatt_time is not set when we get here.
634 */
635 if (ups->last_onbatt_time <= 0)
636 ups->last_onbatt_time = ups->last_offbatt_time;
637
638 ups->cum_time_on_batt += (ups->last_offbatt_time - ups->last_onbatt_time);
639 break;
640
641 case st_Calibration:
642 /*
643 * During calibration we ignore battery level, runtime remaining, etc.
644 * since the UPS will switch us back online when it is done. We also have
645 * no timeout here since we can't predict how long the calibration will
646 * take.
647 */
648 break;
649
650 default:
651 break;
652 }
653
654 /* Do a non-blocking wait on any exec()ed children */
655 if (ups->num_execed_children > 0) {
656 while (waitpid(-1, NULL, WNOHANG) > 0)
657 ups->num_execed_children--;
658 }
659
660 /* Remember status */
661 ups->PrevStatus = ups->Status;
662
663 write_unlock(ups);
664 }
665