1 /* Copyright © 2012 Brandon L Black <blblack@gmail.com>
2  *
3  * This file is part of gdnsd.
4  *
5  * gdnsd is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * gdnsd is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with gdnsd.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  */
19 
20 #include <config.h>
21 #include <gdnsd/mon.h>
22 #include <gdnsd-prot/mon.h>
23 
24 #include "plugapi.h"
25 
26 #include <gdnsd-prot/plugapi.h>
27 #include <gdnsd/alloc.h>
28 #include <gdnsd/log.h>
29 #include <gdnsd/paths.h>
30 #include <gdnsd/plugapi.h>
31 #include <gdnsd/prcu.h>
32 #include <gdnsd/vscf.h>
33 #include <gdnsd/misc.h>
34 
35 #include <string.h>
36 #include <unistd.h>
37 #include <fnmatch.h>
38 
39 #include <ev.h>
40 
41 typedef struct {
42     const char* name;
43     const plugin_t* plugin;
44     unsigned up_thresh;
45     unsigned ok_thresh;
46     unsigned down_thresh;
47     unsigned interval;
48     unsigned timeout;
49 } service_type_t;
50 
51 // if type is NULL, there is no real monitoring,
52 //   it's only possible to administratively change
53 //   the state.  This means a virtual resource
54 //   via mon_add_admin(), and cname/addr/dname
55 //   are invalid.  Otherwise is_cname flags which
56 //   member of the union is valid.
57 typedef struct {
58     const char* desc;
59     service_type_t* type;
60     char* cname; // normalized text form of addr or dname below
61     union {
62         dmn_anysin_t addr;
63         const uint8_t* dname; // dname-form of a CNAME
64     };
65     unsigned n_failure;
66     unsigned n_success;
67     bool is_cname;
68     gdnsd_sttl_t real_sttl;
69 } smgr_t;
70 
71 static unsigned num_svc_types = 0;
72 static service_type_t* service_types = NULL;
73 
74 static unsigned num_smgrs = 0;
75 static smgr_t* smgrs = NULL;
76 
77 // There are two copies of the sttl table.
78 // The "consumer" copy is always ready for consumption
79 //   (via prcu deref) by other threads, and does not
80 //   get mutated directly.  The updates flow into
81 //   the non-consumer table and the tables are later
82 //   prcu swapped (with the old copy updated to new values)
83 // There are only ever the two chunks of memory that are
84 //   first allocated for these, they just get swapped around
85 //   and copied over each other.
86 // (see sttl_table_update() below)
87 static gdnsd_sttl_t* smgr_sttl = NULL;
88 gdnsd_sttl_t* smgr_sttl_consumer_ = NULL;
89 
90 static unsigned max_stats_len = 0;
91 
92 static bool initial_round = false;
93 static bool testsuite_nodelay = false;
94 
95 static struct ev_loop* mon_loop = NULL;
96 static ev_timer* sttl_update_timer = NULL;
97 
// Defaults applied to a service type before its config options are read
enum {
    DEF_UP_THRESH   = 20,
    DEF_OK_THRESH   = 10,
    DEF_DOWN_THRESH = 10,
    DEF_INTERVAL    = 10
};
102 
103 F_NONNULL
sttl_table_update(struct ev_loop * loop V_UNUSED,ev_timer * w V_UNUSED,int revents V_UNUSED)104 static void sttl_table_update(struct ev_loop* loop V_UNUSED, ev_timer* w V_UNUSED, int revents V_UNUSED) {
105     dmn_assert(w == sttl_update_timer);
106     dmn_assert(revents == EV_TIMER);
107 
108     // prcu-swap of the two tables
109     gdnsd_sttl_t* saved_old_consumer = smgr_sttl_consumer_;
110     gdnsd_prcu_upd_lock();
111     gdnsd_prcu_upd_assign(smgr_sttl_consumer_, smgr_sttl);
112     gdnsd_prcu_upd_unlock();
113     smgr_sttl = saved_old_consumer;
114 
115     // now copy the (new) consumer table back over the old one
116     //   that we're using for future offline updates until the next swap
117     memcpy(smgr_sttl, smgr_sttl_consumer_, sizeof(gdnsd_sttl_t) * num_smgrs);
118 }
119 
120 // anything that ends up changing a value in smgr_sttl[] calls
121 //   this to push the updates towards visibility to consumers.
122 // the timer coalesces rapid-fire updates into at most one table-swap
123 //   per second, at the cost of a second of latency on updates.
kick_sttl_update_timer(void)124 static void kick_sttl_update_timer(void) {
125     if(testsuite_nodelay) {
126         sttl_table_update(mon_loop, sttl_update_timer, EV_TIMER);
127     }
128     else if(!ev_is_active(sttl_update_timer) && !ev_is_pending(sttl_update_timer)) {
129         ev_timer_set(sttl_update_timer, 1.0, 0.0);
130         ev_timer_start(mon_loop, sttl_update_timer);
131     }
132 }
133 
gdnsd_logf_sttl(const gdnsd_sttl_t s)134 const char* gdnsd_logf_sttl(const gdnsd_sttl_t s) {
135     // the maximal length here is "DOWN/268435455"
136     // the minimal is "UP/1"
137     // 4-14 bytes not counting NUL
138     char tmpbuf[15];
139     const unsigned ttl = s & GDNSD_STTL_TTL_MASK;
140     const char* state = (s & GDNSD_STTL_DOWN) ? "DOWN" : "UP";
141     int snp_rv;
142     if(!ttl || ttl == GDNSD_STTL_TTL_MAX)
143         snp_rv = snprintf(tmpbuf, 15, "%s/%s", state, ttl ? "MAX" : "MIN");
144     else
145         snp_rv = snprintf(tmpbuf, 15, "%s/%u", state, ttl);
146 
147     dmn_assert(snp_rv >= 4 && snp_rv <= 14);
148     const unsigned snp_len = (unsigned)snp_rv;
149     char* out = dmn_fmtbuf_alloc(snp_len + 1U);
150     memcpy(out, tmpbuf, snp_len + 1U);
151     return out;
152 }
153 
154 //--------------------------------------------------
155 // admin state-force stuff
156 //--------------------------------------------------
157 
158 static ev_stat* admin_file_watcher = NULL;
159 static ev_timer* admin_quiesce_timer = NULL;
160 
161 // shared with plugin_extfile!
gdnsd_mon_parse_sttl(const char * sttl_str,gdnsd_sttl_t * sttl_out,unsigned def_ttl)162 bool gdnsd_mon_parse_sttl(const char* sttl_str, gdnsd_sttl_t* sttl_out, unsigned def_ttl) {
163     bool failed = true;
164     gdnsd_sttl_t out = def_ttl;
165     assert_valid_sttl(out);
166 
167     const char* ttl_suffix = NULL;
168     if(!strncasecmp(sttl_str, "UP", 2)) {
169         ttl_suffix = sttl_str + 2;
170     }
171     else if(!strncasecmp(sttl_str, "DOWN", 4)) {
172         out |= GDNSD_STTL_DOWN;
173         ttl_suffix = sttl_str + 4;
174     }
175 
176     if(ttl_suffix) {
177         char slash = *ttl_suffix++;
178         if(!slash) {
179             failed = false; // no TTL suffix
180             *sttl_out = out;
181         }
182         else if(slash == '/' && *ttl_suffix) {
183             char* endptr = NULL;
184             unsigned long ttl_tmp = strtoul(ttl_suffix, &endptr, 10);
185             if(endptr && !*endptr) { // strtoul finished the string successfully
186                 if(ttl_tmp <= GDNSD_STTL_TTL_MAX) {
187                     out = (out & ~GDNSD_STTL_TTL_MASK) | ttl_tmp;
188                     assert_valid_sttl(out);
189                     *sttl_out = out;
190                     failed = false;
191                 }
192             }
193         }
194     }
195 
196     return failed;
197 }
198 
199 F_NONNULL
admin_process_entry(const char * matchme,gdnsd_sttl_t * updates,gdnsd_sttl_t update_val)200 static bool admin_process_entry(const char* matchme, gdnsd_sttl_t* updates, gdnsd_sttl_t update_val) {
201     assert_valid_sttl(update_val);
202     dmn_assert(update_val & GDNSD_STTL_FORCED);
203 
204     bool success = true;
205     bool matched = false;
206 
207     for(unsigned i = 0; i < num_smgrs; i++) {
208         smgr_t* smgr = &smgrs[i];
209         int err = fnmatch(matchme, smgr->desc, 0);
210         if(err && err != FNM_NOMATCH) {
211             log_err("admin_state: fnmatch() failed with error code %i: probably glob-parsing error on '%s'", err, matchme);
212             success = false;
213             break;
214         }
215         if(!err) { // matched!
216             matched = true;
217             updates[i] = update_val;
218         }
219     }
220 
221     if(success && !matched)
222         log_warn("admin_state: glob '%s' did not match anything!", matchme);
223 
224     return success;
225 }
226 
227 F_NONNULL
admin_process_hash(vscf_data_t * raw,const bool check_only)228 static bool admin_process_hash(vscf_data_t* raw, const bool check_only) {
229     dmn_assert(vscf_is_hash(raw));
230 
231     bool success = true;
232 
233     gdnsd_sttl_t updates[num_smgrs];
234     memset(updates, 0, sizeof(updates));
235 
236     const unsigned num_raw = vscf_hash_get_len(raw);
237     for(unsigned i = 0; i < num_raw; i++) {
238         const char* matchme = vscf_hash_get_key_byindex(raw, i, NULL);
239         vscf_data_t* val = vscf_hash_get_data_byindex(raw, i);
240         if(!vscf_is_simple(val)) {
241             log_err("admin_state: value for '%s' must be a simple string!", matchme);
242             success = false;
243             break;
244         }
245         else {
246             gdnsd_sttl_t update_val;
247             if(gdnsd_mon_parse_sttl(vscf_simple_get_data(val), &update_val, GDNSD_STTL_TTL_MAX)) {
248                 log_err("admin_state: value for '%s' must be of the form STATE[/TTL] (where STATE is 'UP' or 'DOWN', and the optional TTL is an unsigned integer in the range 0 - %u)", matchme, GDNSD_STTL_TTL_MAX);
249                 success = false;
250                 break;
251             }
252             else {
253                 update_val |= GDNSD_STTL_FORCED; // all admin-states are forced
254                 if(!admin_process_entry(matchme, updates, update_val)) {
255                     success = false;
256                     break;
257                 }
258             }
259         }
260     }
261 
262     if(success && !check_only) {
263         bool affected = false;
264 
265         for(unsigned i = 0; i < num_smgrs; i++) {
266             if(updates[i]) { // some entry wants to affect this slot
267                 if(smgr_sttl[i] != updates[i]) { // new state change
268                     if(smgr_sttl[i] != smgrs[i].real_sttl) // already forced
269                         log_info("admin_state: state of '%s' re-forced from %s to %s, real state is %s", smgrs[i].desc, logf_sttl(smgr_sttl[i]), logf_sttl(updates[i]), smgrs[i].type ? logf_sttl(smgrs[i].real_sttl) : "NA");
270                     else
271                         log_info("admin_state: state of '%s' forced to %s, real state is %s", smgrs[i].desc, logf_sttl(updates[i]), smgrs[i].type ? logf_sttl(smgrs[i].real_sttl) : "NA");
272                     smgr_sttl[i] = updates[i];
273                     affected = true;
274                 }
275             }
276             else if(smgr_sttl[i] & GDNSD_STTL_FORCED) { // was forced before, isn't now
277                 log_info("admin_state: state of '%s' no longer forced (was forced to %s), real and current state is %s", smgrs[i].desc, logf_sttl(smgr_sttl[i]), smgrs[i].type ? logf_sttl(smgrs[i].real_sttl) : "NA");
278                 smgr_sttl[i] = smgrs[i].real_sttl;
279                 dmn_assert(!(smgr_sttl[i] & GDNSD_STTL_FORCED));
280                 affected = true;
281             }
282         }
283 
284         if(affected) {
285             if(!initial_round)
286                 kick_sttl_update_timer();
287             log_info("admin_state: load complete");
288         }
289         else {
290             log_info("admin_state: load complete, no net changes");
291         }
292     }
293 
294     return success;
295 }
296 
297 F_NONNULL
admin_process_file(const char * pathname,const bool check_only)298 static bool admin_process_file(const char* pathname, const bool check_only) {
299     if(check_only)
300         log_info("admin_state: checking state file '%s'...", pathname);
301     else
302         log_info("admin_state: (re-)loading state file '%s'...", pathname);
303 
304     bool success = false;
305 
306     vscf_data_t* raw = vscf_scan_filename(pathname);
307     if(!raw) {
308         log_err("admin_state: Loading file '%s' failed", pathname);
309     }
310     else {
311         if(!vscf_is_hash(raw))
312             log_err("admin_state: top level of file '%s' must be a hash", pathname);
313         else
314             success = admin_process_hash(raw, check_only);
315         vscf_destroy(raw);
316     }
317 
318     if(!success && !check_only)
319         log_err("admin_state: file '%s' had errors; all contents were ignored and any current forced states are unaffected", pathname);
320 
321     return success;
322 }
323 
324 F_NONNULL
admin_deleted_file(const char * pathname)325 static void admin_deleted_file(const char* pathname) {
326     log_info("admin_state: state file '%s' deleted, clearing any forced states...", pathname);
327     bool affected = false;
328     for(unsigned i = 0; i < num_smgrs; i++) {
329         if(smgr_sttl[i] & GDNSD_STTL_FORCED) {
330             log_info("admin_state: state of '%s' no longer forced (was forced to %s), real and current state is %s", smgrs[i].desc, logf_sttl(smgr_sttl[i]), smgrs[i].type ? logf_sttl(smgrs[i].real_sttl) : "NA");
331             smgr_sttl[i] = smgrs[i].real_sttl;
332             dmn_assert(!(smgr_sttl[i] & GDNSD_STTL_FORCED));
333             affected = true;
334         }
335     }
336     if(affected)
337         kick_sttl_update_timer();
338 }
339 
340 F_NONNULL
admin_timer_cb(struct ev_loop * loop,ev_timer * w,int revents V_UNUSED)341 static void admin_timer_cb(struct ev_loop* loop, ev_timer* w, int revents V_UNUSED) {
342     dmn_assert(revents == EV_TIMER);
343     ev_timer_stop(loop, w);
344     if(admin_file_watcher->attr.st_nlink)
345         admin_process_file(admin_file_watcher->path, false);
346     else
347         admin_deleted_file(admin_file_watcher->path);
348 }
349 
350 F_NONNULL
admin_file_cb(struct ev_loop * loop,ev_stat * w V_UNUSED,int revents V_UNUSED)351 static void admin_file_cb(struct ev_loop* loop, ev_stat* w V_UNUSED, int revents V_UNUSED) {
352     dmn_assert(revents == EV_STAT);
353     if(testsuite_nodelay)
354         admin_timer_cb(loop, admin_quiesce_timer, EV_TIMER);
355     else
356         ev_timer_again(loop, admin_quiesce_timer);
357 }
358 
359 // Note this invoked *after* the initial round of monitoring,
360 //   but before the main loop begins runtime execution.
361 F_NONNULL
admin_init(struct ev_loop * mloop)362 static void admin_init(struct ev_loop* mloop) {
363     char* pathname = gdnsd_resolve_path_state("admin_state", NULL);
364 
365     admin_quiesce_timer = xmalloc(sizeof(ev_timer));
366     ev_timer_init(admin_quiesce_timer, admin_timer_cb, 0.0, 1.02);
367     admin_file_watcher = xmalloc(sizeof(ev_stat));
368     memset(&admin_file_watcher->attr, 0, sizeof(admin_file_watcher->attr));
369     ev_stat_init(admin_file_watcher, admin_file_cb, pathname,
370         testsuite_nodelay ? 0.01 : 3.0);
371     ev_stat_start(mloop, admin_file_watcher);
372 
373     // ev_stat_start stat()'s the file for ->attr, use that
374     //   to process the file initially if it exists.
375     if(admin_file_watcher->attr.st_nlink)
376         admin_process_file(pathname, false);
377     else
378         log_info("admin_state: state file '%s' does not yet exist at startup", pathname);
379 }
380 
381 //--------------------------------------------------
382 // core monitoring stuff
383 //--------------------------------------------------
384 
385 // public interface to just check admin_state parsing
gdnsd_mon_check_admin_file(void)386 void gdnsd_mon_check_admin_file(void) {
387     struct stat st;
388     char* pathname = gdnsd_resolve_path_state("admin_state", NULL);
389 
390     if(!stat(pathname, &st)) {
391         if(!admin_process_file(pathname, true))
392             log_fatal("%s has errors!", pathname);
393     }
394     else if(errno != ENOENT) {
395         log_fatal("Error checking admin_state pathname '%s': %s",
396             pathname, dmn_logf_errno());
397     }
398 
399     free(pathname);
400 }
401 
402 // Called once after all servicetypes and monitored stuff
403 //  have been configured, from main thread.  mloop happens
404 //  to be the default loop currently, and should be empty of
405 //  events at this point so that we can fall out after the
406 //  initial round of monitoring.
gdnsd_mon_start(struct ev_loop * mloop)407 void gdnsd_mon_start(struct ev_loop* mloop) {
408     // Fall out quickly if nothing to monitor
409     if(!num_smgrs) return;
410 
411     if(getenv("GDNSD_TESTSUITE_NODELAY"))
412         testsuite_nodelay = true;
413 
414     // saved for timer usage later
415     mon_loop = mloop;
416 
417     // Run the loop once until all events drain, which will
418     // be one full monitoring cycle of each resource (without
419     // any artificial delays).
420     log_info("Starting initial round of monitoring ...");
421     initial_round = true;
422     gdnsd_plugins_action_init_monitors(mloop);
423     ev_run(mloop, 0);
424     log_info("Initial round of monitoring complete");
425 
426     // initialize admin_state stuff
427     admin_init(mloop);
428 
429     // this flag prevents table update timers for admin_init stuff as well!
430     initial_round = false;
431 
432     // set up the table-update coalescing timer
433     sttl_update_timer = xmalloc(sizeof(ev_timer));
434     ev_timer_init(sttl_update_timer, sttl_table_update, 1.0, 0.0);
435 
436     // trigger it once manually to invoke prcu stuff
437     //   for the initial round results to ensure there's
438     //   no confusion.
439     sttl_table_update(mloop, sttl_update_timer, EV_TIMER);
440 
441     // add real watchers to the monitor loop for runtime
442     //   (the loop itself begins execution later back in main.c)
443     gdnsd_plugins_action_start_monitors(mloop);
444 }
445 
446 // We only have to check the address, because the port
447 //  is determined by service type.
448 F_NONNULL
addr_eq(const dmn_anysin_t * a,const dmn_anysin_t * b)449 static bool addr_eq(const dmn_anysin_t* a, const dmn_anysin_t* b) {
450     dmn_assert(a->sa.sa_family == AF_INET || a->sa.sa_family == AF_INET6);
451 
452     bool rv = false;
453     if(a->sa.sa_family == b->sa.sa_family) {
454         if(a->sa.sa_family == AF_INET)
455             rv = (a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr);
456         else
457             rv = !memcmp(a->sin6.sin6_addr.s6_addr, b->sin6.sin6_addr.s6_addr, 16);
458     }
459     return rv;
460 }
461 
462 F_NONNULLX(1)
mon_thing(const char * svctype_name,const dmn_anysin_t * addr,const char * cname,const uint8_t * dname)463 static unsigned mon_thing(const char* svctype_name, const dmn_anysin_t* addr, const char* cname, const uint8_t* dname) {
464     if(addr)
465         dmn_assert(!cname && !dname);
466     else
467         dmn_assert(cname && dname);
468 
469     // first, sort out what svctype_name actually means to us
470     service_type_t* this_svc = NULL;
471     for(unsigned i = 0; i < num_svc_types; i++) {
472         if(!strcmp(svctype_name, service_types[i].name)) {
473             this_svc = &service_types[i];
474             break;
475         }
476     }
477 
478     if(!this_svc)
479         log_fatal("Invalid service type '%s' in monitoring request for '%s'",
480             svctype_name, addr ? dmn_logf_anysin_noport(addr) : cname);
481 
482     // next, check if this is a duplicate of a request issued earlier
483     //   by some other plugin/resource, in which case we can just give
484     //   them the existing index
485     if(addr) {
486         for(unsigned i = 0; i < num_smgrs; i++) {
487             smgr_t* that_smgr = &smgrs[i];
488             if(!that_smgr->is_cname && addr_eq(addr, &that_smgr->addr) && this_svc == that_smgr->type)
489                 return i;
490         }
491     }
492     else {
493         for(unsigned i = 0; i < num_smgrs; i++) {
494             smgr_t* that_smgr = &smgrs[i];
495             if(that_smgr->is_cname && !gdnsd_dname_cmp(dname, that_smgr->dname) && this_svc == that_smgr->type)
496                 return i;
497         }
498     }
499 
500     // allocate the new smgr/sttl
501     const unsigned idx = num_smgrs++;
502     smgrs = xrealloc(smgrs, sizeof(smgr_t) * num_smgrs);
503     smgr_t* this_smgr = &smgrs[idx];
504     this_smgr->type = this_svc;
505 
506     // for a new stype+addr combo, check that the plugin supports addr monitoring
507     if(addr) {
508         if(this_svc->plugin && !this_svc->plugin->add_mon_addr)
509             log_fatal("Service type '%s' does not support address monitoring for '%s'",
510                 svctype_name, dmn_logf_anysin_noport(addr));
511 
512         // construct desc for this new unique monitor
513         char addr_str[INET6_ADDRSTRLEN];
514         int name_err = dmn_anysin2str_noport(addr, addr_str);
515         // this should basically never happen since the same family of functions will
516         //   have already converted it from dmn_anysin_t -> text earlier, but if it does,
517         //   we really don't have much we can do about logging it informatively...
518         if(name_err)
519             log_fatal("Error converting address back to text form: %s", gai_strerror(errno));
520 
521         this_smgr->desc = gdnsd_str_combine_n(3, addr_str, "/", svctype_name);
522         this_smgr->is_cname = false;
523         this_smgr->cname = strdup(addr_str);
524         gdnsd_downcase_str(this_smgr->cname);
525         memcpy(&this_smgr->addr, addr, sizeof(dmn_anysin_t));
526     }
527     else { // cname
528         if(this_svc->plugin && !this_svc->plugin->add_mon_cname)
529             log_fatal("Service type '%s' does not support CNAME monitoring for '%s'",
530                 svctype_name, cname);
531         this_smgr->desc = gdnsd_str_combine_n(3, cname, "/", svctype_name);
532         this_smgr->is_cname = true;
533         this_smgr->cname = strdup(cname);
534         gdnsd_downcase_str(this_smgr->cname);
535         this_smgr->dname = gdnsd_dname_dup(dname, true);
536     }
537 
538     this_smgr->n_failure = 0;
539     this_smgr->n_success = 0;
540     this_smgr->real_sttl = GDNSD_STTL_TTL_MAX;
541 
542     // the "down" special gets a different default than the rest
543     if(!strcmp(svctype_name, "down"))
544         this_smgr->real_sttl |= GDNSD_STTL_DOWN;
545 
546     smgr_sttl = xrealloc(smgr_sttl, sizeof(gdnsd_sttl_t) * num_smgrs);
547     smgr_sttl_consumer_ = xrealloc(smgr_sttl_consumer_, sizeof(gdnsd_sttl_t) * num_smgrs);
548     smgr_sttl_consumer_[idx] = smgr_sttl[idx] = this_smgr->real_sttl;
549 
550     return idx;
551 }
552 
553 // Called from plugins once per monitored service type+IP combination
554 //  to request monitoring and initialize various data/state.
gdnsd_mon_addr(const char * svctype_name,const dmn_anysin_t * addr)555 unsigned gdnsd_mon_addr(const char* svctype_name, const dmn_anysin_t* addr) {
556     return mon_thing(svctype_name, addr, NULL, NULL);
557 }
558 
// CNAME counterpart of gdnsd_mon_addr(): cname is the text form and
//  dname the dname form of the same name.
unsigned gdnsd_mon_cname(const char* svctype_name, const char* cname, const uint8_t* dname) {
    const unsigned idx = mon_thing(svctype_name, NULL, cname, dname);
    return idx;
}
563 
564 // .. for virtual entities (e.g. datacenters), which have no service_type
gdnsd_mon_admin(const char * desc)565 unsigned gdnsd_mon_admin(const char* desc) {
566     const unsigned idx = num_smgrs++;
567     smgrs = xrealloc(smgrs, sizeof(smgr_t) * num_smgrs);
568     smgr_sttl = xrealloc(smgr_sttl, sizeof(gdnsd_sttl_t) * num_smgrs);
569     smgr_sttl_consumer_ = xrealloc(smgr_sttl_consumer_, sizeof(gdnsd_sttl_t) * num_smgrs);
570     smgr_t* this_smgr = &smgrs[idx];
571     memset(this_smgr, 0, sizeof(smgr_t));
572     this_smgr->desc = strdup(desc);
573     this_smgr->real_sttl = GDNSD_STTL_TTL_MAX;
574     smgr_sttl_consumer_[idx] = smgr_sttl[idx] = this_smgr->real_sttl;
575     return idx;
576 }
577 
// Reads the optional unsigned-integer option named _loc from the config
//   hash _hash into this_svc->_loc (a variable named this_svc must be in
//   scope at the expansion site).  If the key is absent, this_svc->_loc
//   is left untouched.  A value that is not a simple integer, or that is
//   outside [_min, _max], is fatal.  _typnam is only used in messages.
#define SVC_OPT_UINT(_hash, _typnam, _loc, _min, _max) \
    do { \
        vscf_data_t* _opt_cfg = vscf_hash_get_data_byconstkey((_hash), #_loc, true); \
        if(_opt_cfg) { \
            unsigned long _opt_val; \
            if(!(vscf_is_simple(_opt_cfg) \
            && vscf_simple_get_as_ulong(_opt_cfg, &_opt_val))) \
                log_fatal("Service type '%s': option '%s': Value must be a positive integer", (_typnam), #_loc); \
            if(_opt_val < (_min) || _opt_val > (_max)) \
                log_fatal("Service type '%s': option '%s': Value out of range (%lu, %lu)", (_typnam), #_loc, (_min), (_max)); \
            this_svc->_loc = (unsigned)_opt_val; \
        } \
    } while(0)
591 
592 F_NONNULL
bad_svc_opt(const char * key,unsigned klen V_UNUSED,vscf_data_t * d V_UNUSED,const void * svcname_asvoid)593 static bool bad_svc_opt(const char* key, unsigned klen V_UNUSED, vscf_data_t* d V_UNUSED, const void* svcname_asvoid) {
594     const char* svcname = svcname_asvoid;
595     log_fatal("Service type '%s', bad option '%s'", svcname, key);
596 }
597 
// Phase 1 of service type configuration: builds the service_types table
//   from the optional service_types config hash and resolves each entry's
//   plugin.  Two built-in types, "up" and "down", always occupy the last
//   two slots.  Per-type numeric options are handled later, in phase 2.
void gdnsd_mon_cfg_stypes_p1(vscf_data_t* svctypes_cfg) {
    unsigned n_cfg = 0;

    if(svctypes_cfg) {
        if(!vscf_is_hash(svctypes_cfg))
            log_fatal("service_types, if defined, must have a hash value");
        n_cfg = vscf_hash_get_len(svctypes_cfg);
    }

    // configured types first, then the fixed "up" and "down"
    num_svc_types = n_cfg + 2;
    service_types = xcalloc(num_svc_types, sizeof(service_type_t));
    service_types[n_cfg].name = "up";
    service_types[n_cfg + 1].name = "down";

    // n_cfg is only non-zero when svctypes_cfg is a valid hash,
    //   so the loop body may use it freely
    for(unsigned idx = 0; idx < n_cfg; idx++) {
        service_type_t* stype = &service_types[idx];

        stype->name = strdup(vscf_hash_get_key_byindex(svctypes_cfg, idx, NULL));
        const bool reserved = !strcmp(stype->name, "up") || !strcmp(stype->name, "down");
        if(reserved)
            log_fatal("Explicit service type name '%s' not allowed", stype->name);

        vscf_data_t* stype_cfg = vscf_hash_get_data_byindex(svctypes_cfg, idx);
        if(!vscf_is_hash(stype_cfg))
            log_fatal("Definition of service type '%s' must be a hash", stype->name);

        vscf_data_t* pname_cfg = vscf_hash_get_data_byconstkey(stype_cfg, "plugin", true);
        if(!pname_cfg)
            log_fatal("Service type '%s': 'plugin' must be defined", stype->name);
        if(!(vscf_is_simple(pname_cfg) && vscf_simple_get_len(pname_cfg)))
            log_fatal("Service type '%s': 'plugin' must be a string", stype->name);

        const char* pname = vscf_simple_get_data(pname_cfg);
        stype->plugin = gdnsd_plugin_find_or_load(pname);
        if(!stype->plugin->add_svctype)
            log_fatal("Service type '%s' references plugin '%s', which does not support service monitoring (lacks add_svctype func)", stype->name, pname);
    }
}
635 
// Phase 2 of service type configuration, run after resolver plugins have
//   made their monitoring requests.  Unless no resource uses a service
//   type at all, this:
//   1) parses the numeric options of every configured service type and
//      hands the type to its plugin's add_svctype,
//   2) fills in dummy parameters for the built-in "up" and "down" types,
//   3) passes every already-requested monitor on to its plugin through
//      add_mon_addr / add_mon_cname.
void gdnsd_mon_cfg_stypes_p2(vscf_data_t* svctypes_cfg) {

    // If no plugins actually used any plugin-monitored services, there's
    //   no point in setting up the remainder of this.  At the very least
    //   it lets us skip loading http_status.
    bool need_p2 = false;
    for(unsigned i = 0; i < num_smgrs; i++) {
        if(smgrs[i].type) {
            need_p2 = true;
            break;
        }
    }
    if(!need_p2)
        return;

    dmn_assert(num_svc_types > 1); // up, down always exist

    for(unsigned i = 0; i < (num_svc_types - 2); i++) {
        dmn_assert(svctypes_cfg);
        service_type_t* this_svc = &service_types[i];

        // assert same ordering as _p1
        dmn_assert(!strcmp(this_svc->name, vscf_hash_get_key_byindex(svctypes_cfg, i, NULL)));
        dmn_assert(this_svc->plugin);

        vscf_data_t* svctype_cfg = vscf_hash_get_data_byindex(svctypes_cfg, i);
        dmn_assert(svctype_cfg);

        // start from the defaults, then let explicit options override them
        this_svc->up_thresh = DEF_UP_THRESH;
        this_svc->ok_thresh = DEF_OK_THRESH;
        this_svc->down_thresh = DEF_DOWN_THRESH;
        this_svc->interval = DEF_INTERVAL;
        SVC_OPT_UINT(svctype_cfg, this_svc->name, up_thresh, 1LU, 65535LU);
        SVC_OPT_UINT(svctype_cfg, this_svc->name, ok_thresh, 1LU, 65535LU);
        SVC_OPT_UINT(svctype_cfg, this_svc->name, down_thresh, 1LU, 65535LU);
        SVC_OPT_UINT(svctype_cfg, this_svc->name, interval, 2LU, 3600LU);
        this_svc->timeout = this_svc->interval >> 1U; // default timeout is half of interval
        SVC_OPT_UINT(svctype_cfg, this_svc->name, timeout, 1LU, 300LU);
        if(this_svc->timeout >= this_svc->interval)
            log_fatal("Service type '%s': timeout must be less than interval", this_svc->name);

        // the plugin gets its chance at the remaining options first; the
        //   iteration afterwards reports leftover keys as bad options
        this_svc->plugin->add_svctype(this_svc->name, svctype_cfg, this_svc->interval, this_svc->timeout);
        vscf_hash_iterate_const(svctype_cfg, true, bad_svc_opt, this_svc->name);
    }

    // dummy config for up+down
    for(unsigned i = (num_svc_types - 2); i < num_svc_types; i++) {
        service_type_t* this_svc = &service_types[i];
        this_svc->plugin = NULL;
        this_svc->up_thresh = DEF_UP_THRESH;
        this_svc->ok_thresh = DEF_OK_THRESH;
        this_svc->down_thresh = DEF_DOWN_THRESH;
        this_svc->interval = DEF_INTERVAL;
        this_svc->timeout = 1;
    }

    // now that we've solved the chicken-and-egg, finish processing
    //   the monitoring requests resolver plugins asked about earlier
    for(unsigned i = 0; i < num_smgrs; i++) {
        smgr_t* this_smgr = &smgrs[i];

        // virtuals (mon_admin) get no service_type at all
        if(!this_smgr->type)
            continue;

        // down/up get no plugin
        const plugin_t* plugin = this_smgr->type->plugin;
        if(!plugin)
            continue;

        if(this_smgr->is_cname) {
            dmn_assert(plugin->add_mon_cname);
            plugin->add_mon_cname(this_smgr->desc, this_smgr->type->name, this_smgr->cname, i);
        }
        else {
            dmn_assert(plugin->add_mon_addr);
            plugin->add_mon_addr(this_smgr->desc, this_smgr->type->name, this_smgr->cname, &this_smgr->addr, i);
        }
    }
}
710 
711 F_NONNULL
raw_sttl_update(smgr_t * smgr,unsigned idx,gdnsd_sttl_t new_sttl)712 static void raw_sttl_update(smgr_t* smgr, unsigned idx, gdnsd_sttl_t new_sttl) {
713     dmn_assert(idx < num_smgrs);
714 
715     // Note that the updater interfaces from monitoring plugins cannot set
716     //  the FORCED bit - only the admin-state interface can do that.
717     assert_valid_sttl(new_sttl);
718     dmn_assert(!(new_sttl & GDNSD_STTL_FORCED));
719 
720     if(initial_round) {
721         log_info("state of '%s' initialized to %s", smgr->desc, logf_sttl(new_sttl));
722         smgr_sttl[idx] = smgr->real_sttl = new_sttl;
723         // table update taken care of in gdnsd_mon_start()
724         //  after all initial monitors complete
725     }
726     else if(new_sttl != smgr->real_sttl) {
727         if((new_sttl & GDNSD_STTL_DOWN) != (smgr->real_sttl & GDNSD_STTL_DOWN)) {
728             if(smgr_sttl[idx] & GDNSD_STTL_FORCED)
729                 log_info("state of '%s' changed from %s to %s,"
730                     " effective state remains administratively forced to %s",
731                     smgr->desc, logf_sttl(smgr->real_sttl), logf_sttl(new_sttl),
732                     logf_sttl(smgr_sttl[idx]));
733             else
734                 log_info("state of '%s' changed from %s to %s",
735                     smgr->desc, logf_sttl(smgr->real_sttl), logf_sttl(new_sttl));
736         }
737         smgr->real_sttl = new_sttl;
738         if(new_sttl != smgr_sttl[idx] && !(smgr_sttl[idx] & GDNSD_STTL_FORCED)) {
739             smgr_sttl[idx] = new_sttl;
740             kick_sttl_update_timer();
741         }
742     }
743 }
744 
// Public entry point for handing a complete state+TTL value to the
//  monitored service at index idx; simply forwards to raw_sttl_update()
//  with that service's entry.
void gdnsd_mon_sttl_updater(unsigned idx, gdnsd_sttl_t new_sttl) {
    dmn_assert(idx < num_smgrs);
    smgr_t* const target = smgrs + idx;
    raw_sttl_update(target, idx, new_sttl);
}
749 
gdnsd_mon_state_updater(unsigned idx,const bool latest)750 void gdnsd_mon_state_updater(unsigned idx, const bool latest) {
751     dmn_assert(idx < num_smgrs);
752     smgr_t* smgr = &smgrs[idx];
753 
754     // a bit spammy to leave in all debug builds, but handy at times...
755     //log_debug("'%s' new monitor result: %s", smgr->desc, latest ? "OK" : "FAIL");
756 
757     bool down;
758 
759     // XXX think up a better way to set TTL on initial monitoring round?
760     //   may involve a whole new counting system, or at least
761     //   a count of rounds_since_start until some period has passed?
762     //  The idea would be to serve a shorter TTL until stability has
763     //   been demonstrated.  For now, just going with pretending initial
764     //   state is stable.
765     if(initial_round) {
766         dmn_assert(!smgr->n_failure);
767         dmn_assert(!smgr->n_success);
768         down = !latest;
769     }
770     else {
771         // First handle basic up/down state and the counters
772         down = smgr->real_sttl & GDNSD_STTL_DOWN;
773         if(down) { // Currently DOWN
774             if(latest) { // New Success
775                 if(++smgr->n_success == smgr->type->up_thresh) {
776                     smgr->n_success = 0;
777                     smgr->n_failure = 0;
778                     down = false;
779                 }
780             }
781             else { // New failure when already down, reset for up_thresh
782                 smgr->n_success = 0;
783             }
784         }
785         else { // Currently UP
786             if(latest) { // New Success
787                 // Was UP with some intermittent failure history, but has cleared ok_thresh...
788                 if(smgr->n_failure && (++smgr->n_success == smgr->type->ok_thresh)) {
789                     smgr->n_failure = 0;
790                     smgr->n_success = 0;
791                 }
792             }
793             else { // New Failure
794                 smgr->n_success = 0;
795                 if(++smgr->n_failure == smgr->type->down_thresh) { // Fail threshold check on failure
796                     smgr->n_failure = 0;
797                     down = true;
798                 }
799             }
800         }
801     }
802 
803     // calculate new TTL based on counters + interval
804     const unsigned count_to_change = down
805         ? smgr->type->up_thresh - smgr->n_success
806         : smgr->type->down_thresh - smgr->n_failure;
807     gdnsd_sttl_t new_sttl = smgr->type->interval * count_to_change;
808     if(new_sttl > GDNSD_STTL_TTL_MAX)
809         new_sttl = GDNSD_STTL_TTL_MAX;
810     if(down)
811         new_sttl |= GDNSD_STTL_DOWN;
812 
813     raw_sttl_update(smgr, idx, new_sttl);
814 }
815 
816 //--------------------------------------------------
817 // stats code from here to the end
818 //--------------------------------------------------
819 
// Fixed text and printf-style templates for the three stats output formats.
// Each *_len is the number of literal characters contributed by the string:
//  the trailing NUL is excluded, and for the templates the two-character
//  "%s" placeholders are excluded as well.

// HTML: table header, one row per service, table footer
static const char http_head[] =
    "<p><span class='bold big'>Monitored Service States:</span></p><table>\r\n"
    "<tr><th>Service</th><th>State</th><th>Real State</th></tr>\r\n";
static const unsigned http_head_len = sizeof(http_head) - 1;

static const char http_tmpl[] =
    "<tr><td>%s</td><td class='%s'>%s</td><td class='%s'>%s</td></tr>\r\n";
static const unsigned http_tmpl_len = sizeof(http_tmpl) - 1 - (5 * 2); // NUL + 5x%s

static const char http_foot[] = "</table>\r\n";
static const unsigned http_foot_len = sizeof(http_foot) - 1;

// CSV: header line, one line per service
static const char csv_head[] = "Service,State,RealState\r\n";
static const unsigned csv_head_len = sizeof(csv_head) - 1;

static const char csv_tmpl[] = "%s,%s,%s\r\n";

// JSON: "services" array of objects, with separators between them
static const char json_head[] = "\t\"services\": [\r\n";
static const unsigned json_head_len = sizeof(json_head) - 1;

static const char json_tmpl[] =
    "\t\t{\r\n"
    "\t\t\t\"service\": \"%s\",\r\n"
    "\t\t\t\"state\": \"%s\",\r\n"
    "\t\t\t\"real_state\": \"%s\"\r\n"
    "\t\t}";
static const unsigned json_tmpl_len = sizeof(json_tmpl) - 1 - (3 * 2); // NUL + 3x%s

static const char json_sep[] = ",\r\n";
static const unsigned json_sep_len = sizeof(json_sep) - 1;

static const char json_nl[] = "\r\n";
static const unsigned json_nl_len = sizeof(json_nl) - 1;

static const char json_foot[] = "\r\n\t]\r\n";
static const unsigned json_foot_len = sizeof(json_foot) - 1;
845 
846 // statio calls this at the appropriate time (long after all
847 //  basic setup is done, but before monio_start() time).
848 // monio's job here is to inform statio of the maximum possible
849 //  size of its stats output
gdnsd_mon_stats_get_max_len(void)850 unsigned gdnsd_mon_stats_get_max_len(void) {
851     // overall length calculations.
852     //   Note that *_var_len doesn't include the service name length,
853     //     and that 5 is the longest state_txt string "DOWN!"
854     //   CSV is not included because it is very obviously shorter than
855     //     either of these in all possible cases
856 
857     const unsigned html_fixed_len = http_head_len + http_foot_len;
858     const unsigned html_var_len = http_tmpl_len + (5 * 4);
859     const unsigned html_len = html_fixed_len + (num_smgrs * html_var_len);
860 
861     const unsigned json_fixed_len = json_head_len + json_sep_len + json_foot_len;
862     const unsigned json_var_len = json_tmpl_len + (5 * 2) + json_sep_len;
863     const unsigned json_len = json_fixed_len + (num_smgrs * json_var_len);
864 
865     max_stats_len = html_len > json_len ? html_len : json_len;
866 
867     for(unsigned i = 0; i < num_smgrs; i++)
868         max_stats_len += strlen(smgrs[i].desc);
869 
870     max_stats_len++; // leave room for trailing pointless sprintf \0, JIC
871 
872     return max_stats_len;
873 }
874 
// State text lookup, indexed as [has service_type][forced][down].
// Entries without a service_type that are not forced read "NA";
//  forced states carry a trailing '!'.
static const char* state_str_map[2][2][2] = {
    [0] = { // no service_type
        [0] = { [0] = "NA",  [1] = "NA"    }, // not forced: up, down
        [1] = { [0] = "UP!", [1] = "DOWN!" }, // forced:     up, down
    },
    [1] = { // has service_type
        [0] = { [0] = "UP",  [1] = "DOWN"  }, // not forced: up, down
        [1] = { [0] = "UP!", [1] = "DOWN!" }, // forced:     up, down
    },
};
898 
// Class-name lookup used for the HTML output's class='...' attributes,
//  indexed as [has service_type][forced][down].
// Any forced state maps to "FORCE" regardless of up/down.
static const char* class_str_map[2][2][2] = {
    [0] = { // no service_type
        [0] = { [0] = "UP",    [1] = "DOWN"  }, // not forced: up, down
        [1] = { [0] = "FORCE", [1] = "FORCE" }, // forced:     up, down
    },
    [1] = { // has service_type
        [0] = { [0] = "UP",    [1] = "DOWN"  }, // not forced: up, down
        [1] = { [0] = "FORCE", [1] = "FORCE" }, // forced:     up, down
    },
};
922 
923 F_NONNULL
get_state_texts(const unsigned i,const char ** cur_state_out,const char ** real_state_out)924 static void get_state_texts(const unsigned i, const char** cur_state_out, const char** real_state_out) {
925     dmn_assert(i < num_smgrs);
926 
927     *cur_state_out = state_str_map
928         [!!smgrs[i].type]
929         [!!(smgr_sttl[i] & GDNSD_STTL_FORCED)]
930         [!!(smgr_sttl[i] & GDNSD_STTL_DOWN)];
931     *real_state_out = state_str_map
932         [!!smgrs[i].type]
933         [!!(smgrs[i].real_sttl & GDNSD_STTL_FORCED)]
934         [!!(smgrs[i].real_sttl & GDNSD_STTL_DOWN)];
935 }
936 
937 F_NONNULL
get_class_texts(const unsigned i,const char ** cur_class_out,const char ** real_class_out)938 static void get_class_texts(const unsigned i, const char** cur_class_out, const char** real_class_out) {
939     dmn_assert(i < num_smgrs);
940 
941     *cur_class_out = class_str_map
942         [!!smgrs[i].type]
943         [!!(smgr_sttl[i] & GDNSD_STTL_FORCED)]
944         [!!(smgr_sttl[i] & GDNSD_STTL_DOWN)];
945     *real_class_out = class_str_map
946         [!!smgrs[i].type]
947         [!!(smgrs[i].real_sttl & GDNSD_STTL_FORCED)]
948         [!!(smgrs[i].real_sttl & GDNSD_STTL_DOWN)];
949 }
950 
951 // Output our stats in html form to buf, returning
952 //  how many characters we added to the buf.
gdnsd_mon_stats_out_html(char * buf)953 unsigned gdnsd_mon_stats_out_html(char* buf) {
954     if(!num_smgrs) return 0;
955     dmn_assert(max_stats_len);
956 
957     const char* const buf_start = buf;
958     unsigned avail = max_stats_len;
959 
960     if(avail <= http_head_len)
961         log_fatal("BUG: monio stats buf miscalculated (html mon head)");
962     memcpy(buf, http_head, http_head_len);
963     buf += http_head_len;
964     avail -= http_head_len;
965 
966     for(unsigned i = 0; i < num_smgrs; i++) {
967         const char* cur_st;
968         const char* real_st;
969         const char* cur_class;
970         const char* real_class;
971         get_state_texts(i, &cur_st, &real_st);
972         get_class_texts(i, &cur_class, &real_class);
973         const int snp_rv = snprintf(buf, avail, http_tmpl, smgrs[i].desc, cur_class, cur_st, real_class, real_st);
974         dmn_assert(snp_rv > 0);
975         const unsigned written = (unsigned)snp_rv;
976         if(written >= avail)
977             log_fatal("BUG: monio stats buf miscalculated (html mon data)");
978         buf += written;
979         avail -= written;
980     }
981 
982     if(avail <= http_foot_len)
983         log_fatal("BUG: monio stats buf miscalculated (html mon foot)");
984 
985     memcpy(buf, http_foot, http_foot_len);
986     buf += http_foot_len;
987 
988     return (buf - buf_start);
989 }
990 
991 // Output our stats in csv form to buf, returning
992 //  how many characters we added to the buf.
gdnsd_mon_stats_out_csv(char * buf)993 unsigned gdnsd_mon_stats_out_csv(char* buf) {
994     if(!num_smgrs) return 0;
995     dmn_assert(max_stats_len);
996 
997     const char* const buf_start = buf;
998     unsigned avail = max_stats_len;
999 
1000     if(avail <= csv_head_len)
1001         log_fatal("BUG: monio stats buf miscalculated (csv mon head)");
1002     memcpy(buf, csv_head, csv_head_len);
1003     buf += csv_head_len;
1004     avail -= csv_head_len;
1005 
1006     for(unsigned i = 0; i < num_smgrs; i++) {
1007         const char* cur_st;
1008         const char* real_st;
1009         get_state_texts(i, &cur_st, &real_st);
1010         const int snp_rv = snprintf(buf, avail, csv_tmpl, smgrs[i].desc, cur_st, real_st);
1011         dmn_assert(snp_rv > 0);
1012         const unsigned written = (unsigned)snp_rv;
1013         if(written >= avail)
1014             log_fatal("BUG: monio stats buf miscalculated (csv data)");
1015         buf += written;
1016         avail -= written;
1017     }
1018 
1019     return (buf - buf_start);
1020 }
1021 
gdnsd_mon_stats_out_json(char * buf)1022 unsigned gdnsd_mon_stats_out_json(char* buf) {
1023     dmn_assert(max_stats_len);
1024     unsigned avail = max_stats_len;
1025 
1026     const char* const buf_start = buf;
1027 
1028     if(avail <= json_sep_len + json_head_len)
1029         log_fatal("BUG: monio stats buf miscalculated (json mon head)");
1030 
1031     if(num_smgrs == 0) {
1032         memcpy(buf, json_nl, json_nl_len);
1033         buf += json_nl_len;
1034         return (buf - buf_start);
1035     } else {
1036         memcpy(buf, json_sep, json_sep_len);
1037         buf += json_sep_len;
1038         avail -= json_sep_len;
1039     }
1040 
1041     memcpy(buf, json_head, json_head_len);
1042     buf += json_head_len;
1043     avail -= json_head_len;
1044 
1045     for(unsigned i = 0; i < num_smgrs; i++) {
1046         const char* cur_st;
1047         const char* real_st;
1048         get_state_texts(i, &cur_st, &real_st);
1049         const int snp_rv = snprintf(buf, avail, json_tmpl, smgrs[i].desc, cur_st, real_st);
1050         dmn_assert(snp_rv > 0);
1051         const unsigned written = (unsigned)snp_rv;
1052         if(written >= avail)
1053             log_fatal("BUG: monio stats buf miscalculated (json mon data)");
1054         buf += written;
1055         avail -= written;
1056         if( i < num_smgrs -1 ) {
1057             if(avail <= json_sep_len)
1058                 log_fatal("BUG: monio stats buf miscalculated (json mon data-sep)");
1059             memcpy(buf, json_sep, json_sep_len);
1060             buf += json_sep_len;
1061             avail -= json_sep_len;
1062         }
1063     }
1064 
1065     if(avail <= json_foot_len)
1066         log_fatal("BUG: monio stats buf miscalculated (json mon footer)");
1067 
1068     memcpy(buf, json_foot, json_foot_len);
1069     buf += json_foot_len;
1070 
1071     return (buf - buf_start);
1072 }
1073