1 /** @file
2
3 Function defs for the Alarms keeper.
4
5 @section license License
6
7 Licensed to the Apache Software Foundation (ASF) under one
8 or more contributor license agreements. See the NOTICE file
9 distributed with this work for additional information
10 regarding copyright ownership. The ASF licenses this file
11 to you under the Apache License, Version 2.0 (the
12 "License"); you may not use this file except in compliance
13 with the License. You may obtain a copy of the License at
14
15 http://www.apache.org/licenses/LICENSE-2.0
16
17 Unless required by applicable law or agreed to in writing, software
18 distributed under the License is distributed on an "AS IS" BASIS,
19 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 See the License for the specific language governing permissions and
21 limitations under the License.
22 */
23
24 #include "tscore/ink_platform.h"
25 #include "tscore/ink_string.h"
26 #include "tscore/ink_file.h"
27 #include "tscore/ink_time.h"
28 #include "MgmtUtils.h"
29 #include "Alarms.h"
30
31 #include "records/P_RecCore.h"
32
// Human-readable text for each alarm id; the array index is the alarm id.
// Alarms::getAlarmText() falls back to entry 0 for ids outside this table.
const char *alarmText[] = {
  "Unknown Alarm",                                        // id 0
  "[TrafficManager] Traffic Server process was reset.",   // id 1
  "[TrafficManager] Traffic Server process established.", // id 2
  "Invalid Configuration",                                // id 3
  "System Error",                                         // id 4
  "Cache Error",                                          // id 5
  "Cache Warning",                                        // id 6
  "Logging Error",                                        // id 7
  "Logging Warning",                                      // id 8
  "Alarms configuration update failed",                   // id 9
  "Librecords",                                           // id 10 (unclear if needed / used)
  "Plugin set configuration",                             // id 11 (unclear if needed / used)
};

// Number of entries in alarmText, derived from the table itself.
const int alarmTextNum = sizeof(alarmText) / sizeof(alarmText[0]);
49
50 // Return the alarm script directory. Use proxy.config.alarm.abs_path if it is
51 // set, falling back to proxy.config.bin_path otherwise.
52 static char *
alarm_script_dir()53 alarm_script_dir()
54 {
55 char *path;
56
57 path = REC_readString("proxy.config.alarm.abs_path", nullptr);
58 if (path && *path) {
59 return path;
60 }
61
62 return ats_stringdup(RecConfigReadBinDir());
63 }
64
Alarms()65 Alarms::Alarms()
66 {
67 cur_cb = 0;
68 ink_mutex_init(&mutex);
69 } /* End Alarms::Alarms */
70
~Alarms()71 Alarms::~Alarms()
72 {
73 for (auto &&it : local_alarms) {
74 ats_free(it.second);
75 }
76 for (auto &&it : remote_alarms) {
77 ats_free(it.second);
78 }
79 ink_mutex_destroy(&mutex);
80 } /* End Alarms::Alarms */
81
82 void
registerCallback(AlarmCallbackFunc func)83 Alarms::registerCallback(AlarmCallbackFunc func)
84 {
85 char cb_buf[80];
86
87 ink_mutex_acquire(&mutex);
88 snprintf(cb_buf, sizeof(cb_buf), "%d", cur_cb++);
89 Debug("alarm", "[Alarms::registerCallback] Registering Alarms callback");
90 cblist.emplace(cb_buf, func);
91 ink_mutex_release(&mutex);
92 } /* End Alarms::registerCallback */
93
94 bool
isCurrentAlarm(alarm_t a,char * ip)95 Alarms::isCurrentAlarm(alarm_t a, char *ip)
96 {
97 bool ret = false;
98 char buf[80];
99
100 ink_mutex_acquire(&mutex);
101 if (!ip) {
102 snprintf(buf, sizeof(buf), "%d", a);
103 } else {
104 snprintf(buf, sizeof(buf), "%d-%s", a, ip);
105 }
106
107 if (!ip && local_alarms.find(buf) != local_alarms.end()) {
108 ret = true;
109 } else if (ip && remote_alarms.find(buf) != remote_alarms.end()) {
110 ret = true;
111 }
112 ink_mutex_release(&mutex);
113 return ret;
114 } /* End Alarms::isCurrentAlarm */
115
116 void
resolveAlarm(alarm_t a,char * ip)117 Alarms::resolveAlarm(alarm_t a, char *ip)
118 {
119 char buf[80];
120
121 ink_mutex_acquire(&mutex);
122 if (!ip) {
123 snprintf(buf, sizeof(buf), "%d", a);
124 } else {
125 snprintf(buf, sizeof(buf), "%d-%s", a, ip);
126 }
127
128 if (!ip && local_alarms.find(buf) != local_alarms.end()) {
129 Alarm *hash_value = local_alarms[buf];
130 local_alarms.erase(buf);
131 ats_free(hash_value->description);
132 ats_free(hash_value);
133 } else if (ip && remote_alarms.find(buf) != remote_alarms.end()) {
134 Alarm *hash_value = remote_alarms[buf];
135 remote_alarms.erase(buf);
136 ats_free(hash_value->description);
137 ats_free(hash_value);
138 }
139 ink_mutex_release(&mutex);
140
141 return;
142 } /* End Alarms::resolveAlarm */
143
144 void
signalAlarm(alarm_t a,const char * desc,const char * ip)145 Alarms::signalAlarm(alarm_t a, const char *desc, const char *ip)
146 {
147 static time_t last_sent = 0;
148 static char prev_alarm_text[2048] = "";
149
150 int priority;
151 char buf[80];
152 Alarm *atmp;
153
154 /* Assign correct priorities */
155 switch (a) {
156 case MGMT_ALARM_PROXY_CACHE_ERROR:
157 priority = 1; // INKqa07595
158 break;
159 case MGMT_ALARM_PROXY_CACHE_WARNING:
160 return;
161 case MGMT_ALARM_PROXY_PROCESS_DIED:
162 priority = 1;
163 break;
164 case MGMT_ALARM_PROXY_PROCESS_BORN:
165 mgmt_log("[Alarms::signalAlarm] Server Process born\n");
166 return;
167 default:
168 priority = 2;
169 break;
170 }
171
172 /* Quick hack to buffer repeat alarms and only send every 15 min */
173 if (desc && (priority == 1 || priority == 2) && !ip) {
174 if (strcmp(prev_alarm_text, desc) == 0) { /* a repeated alarm */
175 time_t time_delta = time(nullptr) - last_sent;
176 if (time_delta < 900) {
177 mgmt_log("[Alarms::signalAlarm] Skipping Alarm: '%s'\n", desc);
178 return;
179 } else {
180 last_sent = time(nullptr);
181 }
182 } else {
183 ink_strlcpy(prev_alarm_text, desc, sizeof(prev_alarm_text));
184 last_sent = time(nullptr);
185 }
186 }
187
188 Debug("alarm", "[Alarms::signalAlarm] Sending Alarm: '%s'", desc);
189
190 if (!desc) {
191 desc = const_cast<char *>(getAlarmText(a));
192 }
193
194 /*
195 * Exec alarm bin for priority alarms every time, regardless if they are
196 * potentially duplicates. However, only exec this for you own alarms,
197 * don't want every node in the cluster reporting the same alarm.
198 */
199 if (priority == 1 && !ip) {
200 execAlarmBin(desc);
201 }
202
203 ink_mutex_acquire(&mutex);
204 if (!ip) {
205 snprintf(buf, sizeof(buf), "%d", a);
206 if (local_alarms.find(buf) != local_alarms.end()) {
207 ink_mutex_release(&mutex);
208 return;
209 }
210 } else {
211 snprintf(buf, sizeof(buf), "%d-%s", a, ip);
212 if (auto it = remote_alarms.find(buf); it != remote_alarms.end()) {
213 // Reset the seen flag so that we know the remote alarm is
214 // still active
215 atmp = it->second;
216 atmp->seen = true;
217 ink_mutex_release(&mutex);
218 return;
219 }
220 }
221
222 atmp = static_cast<Alarm *>(ats_malloc(sizeof(Alarm)));
223 atmp->type = a;
224 atmp->linger = true;
225 atmp->seen = true;
226 atmp->priority = priority;
227 atmp->description = nullptr;
228
229 if (!ip) {
230 atmp->local = true;
231 atmp->inet_address = 0;
232 local_alarms.emplace(buf, atmp);
233 } else {
234 atmp->local = false;
235 atmp->inet_address = inet_addr(ip);
236 local_alarms.emplace(buf, atmp);
237 }
238
239 // Swap desc with time-stamped description. Kinda hackish
240 // Temporary until we get a new
241 // alarm system in place. TS 5.0.0, 02/08/2001
242 time_t my_time_t;
243 char my_ctime_str[32];
244 time(&my_time_t);
245 ink_ctime_r(&my_time_t, my_ctime_str);
246 char *p = my_ctime_str;
247 while (*p != '\n' && *p != '\0') {
248 p++;
249 }
250 if (*p == '\n') {
251 *p = '\0';
252 }
253 const size_t sz = sizeof(char) * (strlen(desc) + strlen(my_ctime_str) + 4);
254 ats_free(atmp->description);
255 atmp->description = static_cast<char *>(ats_malloc(sz));
256 snprintf(atmp->description, sz, "[%s] %s", my_ctime_str, desc);
257
258 ink_mutex_release(&mutex);
259
260 for (auto &&it : cblist) {
261 AlarmCallbackFunc func = it.second;
262 Debug("alarm", "[Alarms::signalAlarm] invoke callback for %d", a);
263 (*(func))(a, ip, desc);
264 }
265
266 /* Priority 2 alarms get signaled if they are the first unsolved occurrence. */
267 if (priority == 2 && !ip) {
268 execAlarmBin(desc);
269 }
270
271 } /* End Alarms::signalAlarm */
272
273 /*
274 * resetSeenFlag(...)
275 * Function resets the "seen" flag for a given peer's alarms. This allows
276 * us to flush alarms that may have expired naturally or were dealt.
277 */
278 void
resetSeenFlag(char * ip)279 Alarms::resetSeenFlag(char *ip)
280 {
281 ink_mutex_acquire(&mutex);
282 for (auto &&it : remote_alarms) {
283 std::string const &key = it.first;
284 Alarm *tmp = it.second;
285 if (key.find(ip) != std::string::npos) {
286 tmp->seen = false;
287 }
288 }
289 ink_mutex_release(&mutex);
290 return;
291 } /* End Alarms::resetSeenFlag */
292
293 /*
294 * clearUnSeen(...)
295 * This function is a sweeper function to clean up those alarms that have
296 * been taken care of through other local managers or at the peer itself.
297 */
298 void
clearUnSeen(char * ip)299 Alarms::clearUnSeen(char *ip)
300 {
301 ink_mutex_acquire(&mutex);
302 for (auto &&it : remote_alarms) {
303 std::string const &key = it.first;
304 Alarm *tmp = it.second;
305
306 if (key.find(ip) != std::string::npos) { /* Make sure alarm is for correct ip */
307 if (!tmp->seen) { /* Make sure we did not see it in peer's report */
308 remote_alarms.erase(key);
309 ats_free(tmp->description);
310 ats_free(tmp);
311 }
312 }
313 }
314 ink_mutex_release(&mutex);
315 return;
316 } /* End Alarms::clearUnSeen */
317
318 void
execAlarmBin(const char * desc)319 Alarms::execAlarmBin(const char *desc)
320 {
321 ats_scoped_str bindir(alarm_script_dir());
322 char cmd_line[MAXPATHLEN];
323
324 ats_scoped_str alarm_bin(REC_readString("proxy.config.alarm.bin", nullptr));
325 ats_scoped_str alarm_email_from_name;
326 ats_scoped_str alarm_email_from_addr;
327 ats_scoped_str alarm_email_to_addr;
328
329 pid_t pid;
330
331 // If there's no alarm script configured, don't even bother.
332 if (!alarm_bin || *alarm_bin == '\0') {
333 return;
334 }
335
336 ink_filepath_make(cmd_line, sizeof(cmd_line), bindir, alarm_bin);
337
338 #ifdef POSIX_THREAD
339 if ((pid = fork()) < 0)
340 #else
341 if ((pid = fork1()) < 0)
342 #endif
343 {
344 mgmt_elog(errno, "[Alarms::execAlarmBin] Unable to fork1 process\n");
345 } else if (pid > 0) { /* Parent */
346 int status;
347 bool script_done = false;
348 time_t timeout = static_cast<time_t>(REC_readInteger("proxy.config.alarm.script_runtime", nullptr));
349 if (!timeout) {
350 timeout = 5; // default time = 5 secs
351 }
352 time_t time_delta = 0;
353 time_t first_time = time(nullptr);
354 while (time_delta <= timeout) {
355 // waitpid will return child's pid if status is available
356 // or -1 if there is some problem; returns 0 if child status
357 // is not available
358 if (waitpid(pid, &status, WNOHANG) != 0) {
359 Debug("alarm", "[Alarms::execAlarmBin] child pid %" PRId64 " has status", (int64_t)pid);
360 script_done = true;
361 break;
362 }
363 time_delta = time(nullptr) - first_time;
364 }
365 // need to kill the child script process if it's not complete
366 if (!script_done) {
367 Debug("alarm", "[Alarms::execAlarmBin] kill child pid %" PRId64 "", (int64_t)pid);
368 kill(pid, SIGKILL);
369 waitpid(pid, &status, 0); // to reap the thread
370 }
371 } else {
372 int res = execl(cmd_line, (const char *)alarm_bin, desc, (char *)nullptr);
373
374 _exit(res);
375 }
376 }
377
378 //
379 // getAlarmText
380 //
381 // returns the corresponding text for the alarm id
382 //
383 const char *
getAlarmText(alarm_t id)384 Alarms::getAlarmText(alarm_t id)
385 {
386 if (id < alarmTextNum) {
387 return alarmText[id];
388 } else {
389 return alarmText[0]; // "Unknown Alarm";
390 }
391 }
392