1 /** @file
2 
3   Function defs for the Alarms keeper.
4 
5   @section license License
6 
7   Licensed to the Apache Software Foundation (ASF) under one
8   or more contributor license agreements.  See the NOTICE file
9   distributed with this work for additional information
10   regarding copyright ownership.  The ASF licenses this file
11   to you under the Apache License, Version 2.0 (the
12   "License"); you may not use this file except in compliance
13   with the License.  You may obtain a copy of the License at
14 
15       http://www.apache.org/licenses/LICENSE-2.0
16 
17   Unless required by applicable law or agreed to in writing, software
18   distributed under the License is distributed on an "AS IS" BASIS,
19   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20   See the License for the specific language governing permissions and
21   limitations under the License.
22  */
23 
24 #include "tscore/ink_platform.h"
25 #include "tscore/ink_string.h"
26 #include "tscore/ink_file.h"
27 #include "tscore/ink_time.h"
28 #include "MgmtUtils.h"
29 #include "Alarms.h"
30 
31 #include "records/P_RecCore.h"
32 
// Display text for each alarm, indexed by alarm id (see Alarms::getAlarmText).
// Entry 0 doubles as the text returned for ids outside the table.
const char *alarmText[] = {
  /*  0 */ "Unknown Alarm",
  /*  1 */ "[TrafficManager] Traffic Server process was reset.",
  /*  2 */ "[TrafficManager] Traffic Server process established.",
  /*  3 */ "Invalid Configuration",
  /*  4 */ "System Error",
  /*  5 */ "Cache Error",
  /*  6 */ "Cache Warning",
  /*  7 */ "Logging Error",
  /*  8 */ "Logging Warning",
  /*  9 */ "Alarms configuration update failed",
  /* 10 */ "Librecords",               // unclear if needed / used
  /* 11 */ "Plugin set configuration", // unclear if needed / used
};

// Number of entries in alarmText.
const int alarmTextNum = sizeof(alarmText) / sizeof(alarmText[0]);
49 
50 // Return the alarm script directory. Use proxy.config.alarm.abs_path if it is
51 // set, falling back to proxy.config.bin_path otherwise.
52 static char *
alarm_script_dir()53 alarm_script_dir()
54 {
55   char *path;
56 
57   path = REC_readString("proxy.config.alarm.abs_path", nullptr);
58   if (path && *path) {
59     return path;
60   }
61 
62   return ats_stringdup(RecConfigReadBinDir());
63 }
64 
Alarms()65 Alarms::Alarms()
66 {
67   cur_cb = 0;
68   ink_mutex_init(&mutex);
69 } /* End Alarms::Alarms */
70 
~Alarms()71 Alarms::~Alarms()
72 {
73   for (auto &&it : local_alarms) {
74     ats_free(it.second);
75   }
76   for (auto &&it : remote_alarms) {
77     ats_free(it.second);
78   }
79   ink_mutex_destroy(&mutex);
80 } /* End Alarms::Alarms */
81 
82 void
registerCallback(AlarmCallbackFunc func)83 Alarms::registerCallback(AlarmCallbackFunc func)
84 {
85   char cb_buf[80];
86 
87   ink_mutex_acquire(&mutex);
88   snprintf(cb_buf, sizeof(cb_buf), "%d", cur_cb++);
89   Debug("alarm", "[Alarms::registerCallback] Registering Alarms callback");
90   cblist.emplace(cb_buf, func);
91   ink_mutex_release(&mutex);
92 } /* End Alarms::registerCallback */
93 
94 bool
isCurrentAlarm(alarm_t a,char * ip)95 Alarms::isCurrentAlarm(alarm_t a, char *ip)
96 {
97   bool ret = false;
98   char buf[80];
99 
100   ink_mutex_acquire(&mutex);
101   if (!ip) {
102     snprintf(buf, sizeof(buf), "%d", a);
103   } else {
104     snprintf(buf, sizeof(buf), "%d-%s", a, ip);
105   }
106 
107   if (!ip && local_alarms.find(buf) != local_alarms.end()) {
108     ret = true;
109   } else if (ip && remote_alarms.find(buf) != remote_alarms.end()) {
110     ret = true;
111   }
112   ink_mutex_release(&mutex);
113   return ret;
114 } /* End Alarms::isCurrentAlarm */
115 
116 void
resolveAlarm(alarm_t a,char * ip)117 Alarms::resolveAlarm(alarm_t a, char *ip)
118 {
119   char buf[80];
120 
121   ink_mutex_acquire(&mutex);
122   if (!ip) {
123     snprintf(buf, sizeof(buf), "%d", a);
124   } else {
125     snprintf(buf, sizeof(buf), "%d-%s", a, ip);
126   }
127 
128   if (!ip && local_alarms.find(buf) != local_alarms.end()) {
129     Alarm *hash_value = local_alarms[buf];
130     local_alarms.erase(buf);
131     ats_free(hash_value->description);
132     ats_free(hash_value);
133   } else if (ip && remote_alarms.find(buf) != remote_alarms.end()) {
134     Alarm *hash_value = remote_alarms[buf];
135     remote_alarms.erase(buf);
136     ats_free(hash_value->description);
137     ats_free(hash_value);
138   }
139   ink_mutex_release(&mutex);
140 
141   return;
142 } /* End Alarms::resolveAlarm */
143 
144 void
signalAlarm(alarm_t a,const char * desc,const char * ip)145 Alarms::signalAlarm(alarm_t a, const char *desc, const char *ip)
146 {
147   static time_t last_sent           = 0;
148   static char prev_alarm_text[2048] = "";
149 
150   int priority;
151   char buf[80];
152   Alarm *atmp;
153 
154   /* Assign correct priorities */
155   switch (a) {
156   case MGMT_ALARM_PROXY_CACHE_ERROR:
157     priority = 1; // INKqa07595
158     break;
159   case MGMT_ALARM_PROXY_CACHE_WARNING:
160     return;
161   case MGMT_ALARM_PROXY_PROCESS_DIED:
162     priority = 1;
163     break;
164   case MGMT_ALARM_PROXY_PROCESS_BORN:
165     mgmt_log("[Alarms::signalAlarm] Server Process born\n");
166     return;
167   default:
168     priority = 2;
169     break;
170   }
171 
172   /* Quick hack to buffer repeat alarms and only send every 15 min */
173   if (desc && (priority == 1 || priority == 2) && !ip) {
174     if (strcmp(prev_alarm_text, desc) == 0) { /* a repeated alarm */
175       time_t time_delta = time(nullptr) - last_sent;
176       if (time_delta < 900) {
177         mgmt_log("[Alarms::signalAlarm] Skipping Alarm: '%s'\n", desc);
178         return;
179       } else {
180         last_sent = time(nullptr);
181       }
182     } else {
183       ink_strlcpy(prev_alarm_text, desc, sizeof(prev_alarm_text));
184       last_sent = time(nullptr);
185     }
186   }
187 
188   Debug("alarm", "[Alarms::signalAlarm] Sending Alarm: '%s'", desc);
189 
190   if (!desc) {
191     desc = const_cast<char *>(getAlarmText(a));
192   }
193 
194   /*
195    * Exec alarm bin for priority alarms every time, regardless if they are
196    * potentially duplicates. However, only exec this for you own alarms,
197    * don't want every node in the cluster reporting the same alarm.
198    */
199   if (priority == 1 && !ip) {
200     execAlarmBin(desc);
201   }
202 
203   ink_mutex_acquire(&mutex);
204   if (!ip) {
205     snprintf(buf, sizeof(buf), "%d", a);
206     if (local_alarms.find(buf) != local_alarms.end()) {
207       ink_mutex_release(&mutex);
208       return;
209     }
210   } else {
211     snprintf(buf, sizeof(buf), "%d-%s", a, ip);
212     if (auto it = remote_alarms.find(buf); it != remote_alarms.end()) {
213       // Reset the seen flag so that we know the remote alarm is
214       //   still active
215       atmp       = it->second;
216       atmp->seen = true;
217       ink_mutex_release(&mutex);
218       return;
219     }
220   }
221 
222   atmp              = static_cast<Alarm *>(ats_malloc(sizeof(Alarm)));
223   atmp->type        = a;
224   atmp->linger      = true;
225   atmp->seen        = true;
226   atmp->priority    = priority;
227   atmp->description = nullptr;
228 
229   if (!ip) {
230     atmp->local        = true;
231     atmp->inet_address = 0;
232     local_alarms.emplace(buf, atmp);
233   } else {
234     atmp->local        = false;
235     atmp->inet_address = inet_addr(ip);
236     local_alarms.emplace(buf, atmp);
237   }
238 
239   // Swap desc with time-stamped description.  Kinda hackish
240   // Temporary until we get a new
241   // alarm system in place.  TS 5.0.0, 02/08/2001
242   time_t my_time_t;
243   char my_ctime_str[32];
244   time(&my_time_t);
245   ink_ctime_r(&my_time_t, my_ctime_str);
246   char *p = my_ctime_str;
247   while (*p != '\n' && *p != '\0') {
248     p++;
249   }
250   if (*p == '\n') {
251     *p = '\0';
252   }
253   const size_t sz = sizeof(char) * (strlen(desc) + strlen(my_ctime_str) + 4);
254   ats_free(atmp->description);
255   atmp->description = static_cast<char *>(ats_malloc(sz));
256   snprintf(atmp->description, sz, "[%s] %s", my_ctime_str, desc);
257 
258   ink_mutex_release(&mutex);
259 
260   for (auto &&it : cblist) {
261     AlarmCallbackFunc func = it.second;
262     Debug("alarm", "[Alarms::signalAlarm] invoke callback for %d", a);
263     (*(func))(a, ip, desc);
264   }
265 
266   /* Priority 2 alarms get signaled if they are the first unsolved occurrence. */
267   if (priority == 2 && !ip) {
268     execAlarmBin(desc);
269   }
270 
271 } /* End Alarms::signalAlarm */
272 
273 /*
274  * resetSeenFlag(...)
275  *   Function resets the "seen" flag for a given peer's alarms. This allows
276  * us to flush alarms that may have expired naturally or were dealt.
277  */
278 void
resetSeenFlag(char * ip)279 Alarms::resetSeenFlag(char *ip)
280 {
281   ink_mutex_acquire(&mutex);
282   for (auto &&it : remote_alarms) {
283     std::string const &key = it.first;
284     Alarm *tmp             = it.second;
285     if (key.find(ip) != std::string::npos) {
286       tmp->seen = false;
287     }
288   }
289   ink_mutex_release(&mutex);
290   return;
291 } /* End Alarms::resetSeenFlag */
292 
293 /*
294  * clearUnSeen(...)
295  *   This function is a sweeper function to clean up those alarms that have
296  * been taken care of through other local managers or at the peer itself.
297  */
298 void
clearUnSeen(char * ip)299 Alarms::clearUnSeen(char *ip)
300 {
301   ink_mutex_acquire(&mutex);
302   for (auto &&it : remote_alarms) {
303     std::string const &key = it.first;
304     Alarm *tmp             = it.second;
305 
306     if (key.find(ip) != std::string::npos) { /* Make sure alarm is for correct ip */
307       if (!tmp->seen) {                      /* Make sure we did not see it in peer's report */
308         remote_alarms.erase(key);
309         ats_free(tmp->description);
310         ats_free(tmp);
311       }
312     }
313   }
314   ink_mutex_release(&mutex);
315   return;
316 } /* End Alarms::clearUnSeen */
317 
318 void
execAlarmBin(const char * desc)319 Alarms::execAlarmBin(const char *desc)
320 {
321   ats_scoped_str bindir(alarm_script_dir());
322   char cmd_line[MAXPATHLEN];
323 
324   ats_scoped_str alarm_bin(REC_readString("proxy.config.alarm.bin", nullptr));
325   ats_scoped_str alarm_email_from_name;
326   ats_scoped_str alarm_email_from_addr;
327   ats_scoped_str alarm_email_to_addr;
328 
329   pid_t pid;
330 
331   // If there's no alarm script configured, don't even bother.
332   if (!alarm_bin || *alarm_bin == '\0') {
333     return;
334   }
335 
336   ink_filepath_make(cmd_line, sizeof(cmd_line), bindir, alarm_bin);
337 
338 #ifdef POSIX_THREAD
339   if ((pid = fork()) < 0)
340 #else
341   if ((pid = fork1()) < 0)
342 #endif
343   {
344     mgmt_elog(errno, "[Alarms::execAlarmBin] Unable to fork1 process\n");
345   } else if (pid > 0) { /* Parent */
346     int status;
347     bool script_done = false;
348     time_t timeout   = static_cast<time_t>(REC_readInteger("proxy.config.alarm.script_runtime", nullptr));
349     if (!timeout) {
350       timeout = 5; // default time = 5 secs
351     }
352     time_t time_delta = 0;
353     time_t first_time = time(nullptr);
354     while (time_delta <= timeout) {
355       // waitpid will return child's pid if status is available
356       // or -1 if there is some problem; returns 0 if child status
357       // is not available
358       if (waitpid(pid, &status, WNOHANG) != 0) {
359         Debug("alarm", "[Alarms::execAlarmBin] child pid %" PRId64 " has status", (int64_t)pid);
360         script_done = true;
361         break;
362       }
363       time_delta = time(nullptr) - first_time;
364     }
365     // need to kill the child script process if it's not complete
366     if (!script_done) {
367       Debug("alarm", "[Alarms::execAlarmBin] kill child pid %" PRId64 "", (int64_t)pid);
368       kill(pid, SIGKILL);
369       waitpid(pid, &status, 0); // to reap the thread
370     }
371   } else {
372     int res = execl(cmd_line, (const char *)alarm_bin, desc, (char *)nullptr);
373 
374     _exit(res);
375   }
376 }
377 
378 //
379 // getAlarmText
380 //
381 // returns the corresponding text for the alarm id
382 //
383 const char *
getAlarmText(alarm_t id)384 Alarms::getAlarmText(alarm_t id)
385 {
386   if (id < alarmTextNum) {
387     return alarmText[id];
388   } else {
389     return alarmText[0]; // "Unknown Alarm";
390   }
391 }
392