1 /**
2 * parser.c -- web log parsing
3 * ______ ___
4 * / ____/___ / | _____________ __________
5 * / / __/ __ \/ /| |/ ___/ ___/ _ \/ ___/ ___/
6 * / /_/ / /_/ / ___ / /__/ /__/ __(__ |__ )
7 * \____/\____/_/ |_\___/\___/\___/____/____/
8 *
9 * The MIT License (MIT)
10 * Copyright (c) 2009-2020 Gerardo Orellana <hello @ goaccess.io>
11 *
12 * Permission is hereby granted, free of charge, to any person obtaining a copy
13 * of this software and associated documentation files (the "Software"), to deal
14 * in the Software without restriction, including without limitation the rights
15 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 * copies of the Software, and to permit persons to whom the Software is
17 * furnished to do so, subject to the following conditions:
18 *
19 * The above copyright notice and this permission notice shall be included in all
20 * copies or substantial portions of the Software.
21 *
22 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 * SOFTWARE.
29 */
30
31 /*
32 * "_XOPEN_SOURCE" is required for the GNU libc to export "strptime(3)"
33 * correctly.
34 */
35 #define _LARGEFILE_SOURCE
36 #define _LARGEFILE64_SOURCE
37 #define _FILE_OFFSET_BITS 64
38
39 #define _XOPEN_SOURCE 700
40 #define _DEFAULT_SOURCE
41
42 #include <ctype.h>
43 #include <errno.h>
44
45 #if HAVE_CONFIG_H
46 #include <config.h>
47 #endif
48
49 #include <arpa/inet.h>
50 #include <stddef.h>
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <strings.h>
55 #include <sys/stat.h>
56 #include <sys/types.h>
57 #include <unistd.h>
58 #include <sys/stat.h>
59 #include <fcntl.h>
60
61 #include "gkhash.h"
62
63 #ifdef HAVE_GEOLOCATION
64 #include "geoip1.h"
65 #endif
66
67 #include "parser.h"
68
69 #include "browsers.h"
70 #include "error.h"
71 #include "goaccess.h"
72 #include "opesys.h"
73 #include "pdjson.h"
74 #include "util.h"
75 #include "websocket.h"
76 #include "xmalloc.h"
77
78 /* private prototypes */
79 /* key/data generators for each module */
80
81 static int gen_visitor_key (GKeyData * kdata, GLogItem * logitem);
82 static int gen_404_key (GKeyData * kdata, GLogItem * logitem);
83 static int gen_browser_key (GKeyData * kdata, GLogItem * logitem);
84 static int gen_host_key (GKeyData * kdata, GLogItem * logitem);
85 static int gen_keyphrase_key (GKeyData * kdata, GLogItem * logitem);
86 static int gen_os_key (GKeyData * kdata, GLogItem * logitem);
87 static int gen_vhost_key (GKeyData * kdata, GLogItem * logitem);
88 static int gen_remote_user_key (GKeyData * kdata, GLogItem * logitem);
89 static int gen_cache_status_key (GKeyData * kdata, GLogItem * logitem);
90 static int gen_referer_key (GKeyData * kdata, GLogItem * logitem);
91 static int gen_ref_site_key (GKeyData * kdata, GLogItem * logitem);
92 static int gen_request_key (GKeyData * kdata, GLogItem * logitem);
93 static int gen_static_request_key (GKeyData * kdata, GLogItem * logitem);
94 static int gen_status_code_key (GKeyData * kdata, GLogItem * logitem);
95 static int gen_visit_time_key (GKeyData * kdata, GLogItem * logitem);
96 #ifdef HAVE_GEOLOCATION
97 static int gen_geolocation_key (GKeyData * kdata, GLogItem * logitem);
98 #endif
99 /* UMS */
100 static int gen_mime_type_key (GKeyData * kdata, GLogItem * logitem);
101 static int gen_tls_type_key (GKeyData * kdata, GLogItem * logitem);
102
103 /* insertion metric routines */
104 static void insert_data (GModule module, GKeyData * kdata);
105 static void insert_rootmap (GModule module, GKeyData * kdata);
106 static void insert_root (GModule module, GKeyData * kdata);
107 static void insert_hit (GModule module, GKeyData * kdata);
108 static void insert_visitor (GModule module, GKeyData * kdata);
109 static void insert_bw (GModule module, GKeyData * kdata, uint64_t size);
110 static void insert_cumts (GModule module, GKeyData * kdata, uint64_t ts);
111 static void insert_maxts (GModule module, GKeyData * kdata, uint64_t ts);
112 static void insert_method (GModule module, GKeyData * kdata, const char *data);
113 static void insert_protocol (GModule module, GKeyData * kdata, const char *data);
114 static void insert_agent (GModule module, GKeyData * kdata, uint32_t agent_nkey);
115
116 /* *INDENT-OFF* */
117 static GParse paneling[] = {
118 {
119 VISITORS,
120 gen_visitor_key,
121 insert_data,
122 NULL,
123 insert_hit,
124 insert_visitor,
125 insert_bw,
126 insert_cumts,
127 insert_maxts,
128 NULL,
129 NULL,
130 NULL,
131 }, {
132 REQUESTS,
133 gen_request_key,
134 insert_data,
135 NULL,
136 insert_hit,
137 insert_visitor,
138 insert_bw,
139 insert_cumts,
140 insert_maxts,
141 insert_method,
142 insert_protocol,
143 NULL,
144 }, {
145 REQUESTS_STATIC,
146 gen_static_request_key,
147 insert_data,
148 NULL,
149 insert_hit,
150 insert_visitor,
151 insert_bw,
152 insert_cumts,
153 insert_maxts,
154 insert_method,
155 insert_protocol,
156 NULL,
157 }, {
158 NOT_FOUND,
159 gen_404_key,
160 insert_data,
161 NULL,
162 insert_hit,
163 insert_visitor,
164 insert_bw,
165 insert_cumts,
166 insert_maxts,
167 insert_method,
168 insert_protocol,
169 NULL,
170 }, {
171 HOSTS,
172 gen_host_key,
173 insert_data,
174 NULL,
175 insert_hit,
176 insert_visitor,
177 insert_bw,
178 insert_cumts,
179 insert_maxts,
180 NULL,
181 NULL,
182 insert_agent,
183 }, {
184 OS,
185 gen_os_key,
186 insert_data,
187 insert_rootmap,
188 insert_hit,
189 insert_visitor,
190 insert_bw,
191 insert_cumts,
192 insert_maxts,
193 insert_method,
194 insert_protocol,
195 NULL,
196 }, {
197 BROWSERS,
198 gen_browser_key,
199 insert_data,
200 insert_rootmap,
201 insert_hit,
202 insert_visitor,
203 insert_bw,
204 insert_cumts,
205 insert_maxts,
206 NULL,
207 NULL,
208 NULL,
209 }, {
210 REFERRERS,
211 gen_referer_key,
212 insert_data,
213 NULL,
214 insert_hit,
215 insert_visitor,
216 insert_bw,
217 insert_cumts,
218 insert_maxts,
219 NULL,
220 NULL,
221 NULL,
222 }, {
223 REFERRING_SITES,
224 gen_ref_site_key,
225 insert_data,
226 NULL,
227 insert_hit,
228 insert_visitor,
229 insert_bw,
230 insert_cumts,
231 insert_maxts,
232 NULL,
233 NULL,
234 NULL,
235 }, {
236 KEYPHRASES,
237 gen_keyphrase_key,
238 insert_data,
239 NULL,
240 insert_hit,
241 insert_visitor,
242 insert_bw,
243 insert_cumts,
244 insert_maxts,
245 NULL,
246 NULL,
247 NULL,
248 },
249 #ifdef HAVE_GEOLOCATION
250 {
251 GEO_LOCATION,
252 gen_geolocation_key,
253 insert_data,
254 insert_rootmap,
255 insert_hit,
256 insert_visitor,
257 insert_bw,
258 insert_cumts,
259 insert_maxts,
260 NULL,
261 NULL,
262 NULL,
263 },
264 #endif
265 {
266 STATUS_CODES,
267 gen_status_code_key,
268 insert_data,
269 insert_rootmap,
270 insert_hit,
271 insert_visitor,
272 insert_bw,
273 insert_cumts,
274 insert_maxts,
275 NULL,
276 NULL,
277 NULL,
278 }, {
279 VISIT_TIMES,
280 gen_visit_time_key,
281 insert_data,
282 NULL,
283 insert_hit,
284 insert_visitor,
285 insert_bw,
286 insert_cumts,
287 insert_maxts,
288 NULL,
289 NULL,
290 NULL,
291 }, {
292 VIRTUAL_HOSTS,
293 gen_vhost_key,
294 insert_data,
295 NULL,
296 insert_hit,
297 insert_visitor,
298 insert_bw,
299 insert_cumts,
300 insert_maxts,
301 NULL,
302 NULL,
303 NULL,
304 }, {
305 REMOTE_USER,
306 gen_remote_user_key,
307 insert_data,
308 NULL,
309 insert_hit,
310 insert_visitor,
311 insert_bw,
312 insert_cumts,
313 insert_maxts,
314 NULL,
315 NULL,
316 NULL,
317 }, {
318 CACHE_STATUS,
319 gen_cache_status_key,
320 insert_data,
321 NULL,
322 insert_hit,
323 insert_visitor,
324 insert_bw,
325 insert_cumts,
326 insert_maxts,
327 NULL,
328 NULL,
329 NULL,
330 }, {
331 MIME_TYPE,
332 gen_mime_type_key,
333 insert_data,
334 insert_rootmap,
335 insert_hit,
336 insert_visitor,
337 insert_bw,
338 insert_cumts,
339 insert_maxts,
340 NULL, /*method*/
341 NULL, /*protocol*/
342 NULL, /*agent*/
343 }, {
344 TLS_TYPE,
345 gen_tls_type_key,
346 insert_data,
347 insert_rootmap,
348 insert_hit,
349 insert_visitor,
350 insert_bw,
351 insert_cumts,
352 insert_maxts,
353 NULL,
354 NULL,
355 NULL,
356 },
357 };
358 /* *INDENT-ON* */
359
360 /* Initialize a new GKeyData instance */
361 static void
new_modulekey(GKeyData * kdata)362 new_modulekey (GKeyData * kdata) {
363 GKeyData key = {
364 .data = NULL,
365 .data_key = NULL,
366 .data_nkey = 0,
367 .root = NULL,
368 .root_key = NULL,
369 .root_nkey = 0,
370 .uniq_key = NULL,
371 .uniq_nkey = 0,
372 };
373 *kdata = key;
374 }
375
376 /* Get a panel from the GParse structure given a module.
377 *
378 * On error, or if not found, NULL is returned.
379 * On success, the panel value is returned. */
380 static GParse *
panel_lookup(GModule module)381 panel_lookup (GModule module) {
382 int i, num_panels = ARRAY_SIZE (paneling);
383
384 for (i = 0; i < num_panels; i++) {
385 if (paneling[i].module == module)
386 return &paneling[i];
387 }
388 return NULL;
389 }
390
391 /* Allocate memory for a new GRawData instance.
392 *
393 * On success, the newly allocated GRawData is returned . */
394 GRawData *
new_grawdata(void)395 new_grawdata (void) {
396 GRawData *raw_data = xmalloc (sizeof (*raw_data));
397 memset (raw_data, 0, sizeof *raw_data);
398
399 return raw_data;
400 }
401
402 /* Allocate memory for a new GRawDataItem instance.
403 *
404 * On success, the newly allocated GRawDataItem is returned . */
405 GRawDataItem *
new_grawdata_item(unsigned int size)406 new_grawdata_item (unsigned int size) {
407 GRawDataItem *item = xcalloc (size, sizeof (*item));
408 return item;
409 }
410
411 /* Free memory allocated for a GRawData and GRawDataItem instance. */
412 void
free_raw_data(GRawData * raw_data)413 free_raw_data (GRawData * raw_data) {
414 free (raw_data->items);
415 free (raw_data);
416 }
417
418 /* Reset an instance of GLog structure. */
419 void
reset_struct(Logs * logs)420 reset_struct (Logs * logs) {
421 int i = 0;
422
423 for (i = 0; i < logs->size; ++i)
424 logs->glog[i].invalid = logs->glog[i].processed = 0;
425 }
426
427 /* Allocate memory for a new set of Logs including a GLog instance.
428 *
429 * On success, the newly allocated Logs is returned . */
430 Logs *
init_logs(int size)431 init_logs (int size) {
432 Logs *logs = NULL;
433 GLog *glog = NULL;
434 int i = 0;
435
436 /* if no logs no a pipe nor restoring, nothing to do then */
437 if (!size && !conf.restore)
438 return NULL;
439
440 /* If no logs nor a pipe but restoring, we still need an minimal instance of
441 * logs and a glog */
442 logs = xcalloc (1, sizeof (*logs));
443 if (!size) {
444 logs->glog = xcalloc (1, sizeof (*glog));
445 logs->processed = &(logs->glog[0].processed);
446 return logs;
447 }
448
449 glog = xcalloc (size, sizeof (*glog));
450 for (i = 0; i < size; ++i) {
451 glog[i].errors = xcalloc (MAX_LOG_ERRORS, sizeof (char *));
452 glog[i].filename = xstrdup (conf.filenames[i]);
453
454 logs->processed = &(glog[i].processed);
455 logs->filename = glog[i].filename;
456 }
457
458 logs->glog = glog;
459 logs->size = size;
460
461 return logs;
462 }
463
464 /* Free all log errors stored during parsing. */
465 void
free_logerrors(GLog * glog)466 free_logerrors (GLog * glog) {
467 int i;
468
469 if (!glog->log_erridx)
470 return;
471
472 for (i = 0; i < glog->log_erridx; ++i)
473 free (glog->errors[i]);
474 glog->log_erridx = 0;
475 }
476
477 /* Free all log containers. */
478 void
free_logs(Logs * logs)479 free_logs (Logs * logs) {
480 GLog *glog = NULL;
481 int i;
482
483 for (i = 0; i < logs->size; ++i) {
484 glog = &logs->glog[i];
485
486 free (glog->filename);
487 free_logerrors (glog);
488 free (glog->errors);
489 if (glog->pipe) {
490 fclose (glog->pipe);
491 }
492 }
493
494 free (logs->glog);
495 free (logs);
496 }
497
498 /* Initialize a new GLogItem instance.
499 *
500 * On success, the new GLogItem instance is returned. */
501 GLogItem *
init_log_item(GLog * glog)502 init_log_item (GLog * glog) {
503 time_t now = time (0);
504 GLogItem *logitem;
505 glog->items = xmalloc (sizeof (GLogItem));
506 logitem = glog->items;
507 memset (logitem, 0, sizeof *logitem);
508
509 logitem->agent = NULL;
510 logitem->browser = NULL;
511 logitem->browser_type = NULL;
512 logitem->continent = NULL;
513 logitem->country = NULL;
514 logitem->date = NULL;
515 logitem->errstr = NULL;
516 logitem->host = NULL;
517 logitem->keyphrase = NULL;
518 logitem->method = NULL;
519 logitem->os = NULL;
520 logitem->os_type = NULL;
521 logitem->protocol = NULL;
522 logitem->qstr = NULL;
523 logitem->ref = NULL;
524 logitem->req_key = NULL;
525 logitem->req = NULL;
526 logitem->resp_size = 0LL;
527 logitem->serve_time = 0;
528 logitem->status = NULL;
529 logitem->time = NULL;
530 logitem->uniq_key = NULL;
531 logitem->vhost = NULL;
532 logitem->userid = NULL;
533 logitem->cache_status = NULL;
534
535 /* UMS */
536 logitem->mime_type = NULL;
537 logitem->tls_type = NULL;
538 logitem->tls_cypher = NULL;
539 logitem->tls_type_cypher = NULL;
540
541 memset (logitem->site, 0, sizeof (logitem->site));
542 localtime_r (&now, &logitem->dt);
543
544 return logitem;
545 }
546
547 /* Free all members of a GLogItem */
548 static void
free_glog(GLogItem * logitem)549 free_glog (GLogItem * logitem) {
550 if (logitem->agent != NULL)
551 free (logitem->agent);
552 if (logitem->browser != NULL)
553 free (logitem->browser);
554 if (logitem->browser_type != NULL)
555 free (logitem->browser_type);
556 if (logitem->continent != NULL)
557 free (logitem->continent);
558 if (logitem->country != NULL)
559 free (logitem->country);
560 if (logitem->date != NULL)
561 free (logitem->date);
562 if (logitem->errstr != NULL)
563 free (logitem->errstr);
564 if (logitem->host != NULL)
565 free (logitem->host);
566 if (logitem->keyphrase != NULL)
567 free (logitem->keyphrase);
568 if (logitem->method != NULL)
569 free (logitem->method);
570 if (logitem->os != NULL)
571 free (logitem->os);
572 if (logitem->os_type != NULL)
573 free (logitem->os_type);
574 if (logitem->protocol != NULL)
575 free (logitem->protocol);
576 if (logitem->qstr != NULL)
577 free (logitem->qstr);
578 if (logitem->ref != NULL)
579 free (logitem->ref);
580 if (logitem->req_key != NULL)
581 free (logitem->req_key);
582 if (logitem->req != NULL)
583 free (logitem->req);
584 if (logitem->status != NULL)
585 free (logitem->status);
586 if (logitem->time != NULL)
587 free (logitem->time);
588 if (logitem->uniq_key != NULL)
589 free (logitem->uniq_key);
590 if (logitem->userid != NULL)
591 free (logitem->userid);
592 if (logitem->cache_status != NULL)
593 free (logitem->cache_status);
594 if (logitem->vhost != NULL)
595 free (logitem->vhost);
596
597 if (logitem->mime_type != NULL)
598 free (logitem->mime_type);
599 if (logitem->tls_type != NULL)
600 free (logitem->tls_type);
601 if (logitem->tls_cypher != NULL)
602 free (logitem->tls_cypher);
603 if (logitem->tls_type_cypher != NULL)
604 free (logitem->tls_type_cypher);
605
606 free (logitem);
607 }
608
/* Decode the percent-encoded sequences (%XX) of a URL-encoded string.
 *
 * `url` is the NUL-terminated input and `out` receives the decoded,
 * NUL-terminated result. `out` must be at least strlen(url) + 1 bytes; it
 * may be the same buffer as `url`, since the output never gets ahead of
 * the input. A '%' that is not followed by two hex digits is copied
 * verbatim. Note that "%00" decodes to a NUL byte, which ends the output
 * string early for any caller treating it as a C string.
 *
 * Bytes are cast to unsigned char before being handed to the <ctype.h>
 * functions; passing a negative plain char is undefined behavior. */
#define B16210(x) (((x) >= '0' && (x) <= '9') ? ((x) - '0') : (toupper((unsigned char) (x)) - 'A' + 10))
static void
decode_hex (char *url, char *out) {
  char *ptr = out;
  const char *c;

  for (c = url; *c; c++) {
    if (*c == '%' && isxdigit ((unsigned char) c[1]) && isxdigit ((unsigned char) c[2])) {
      *ptr++ = (char) ((B16210 (c[1]) * 16) + (B16210 (c[2])));
      c += 2;   /* skip the two hex digits just consumed */
    } else {
      *ptr++ = *c;
    }
  }
  *ptr = '\0';
}
628
629 /* Entry point to decode the given URL-encoded string.
630 *
631 * On success, the decoded trimmed string is assigned to the output
632 * buffer. */
633 static char *
decode_url(char * url)634 decode_url (char *url) {
635 char *out, *decoded;
636
637 if ((url == NULL) || (*url == '\0'))
638 return NULL;
639
640 out = decoded = xstrdup (url);
641 decode_hex (url, out);
642 /* double encoded URL? */
643 if (conf.double_decode)
644 decode_hex (decoded, out);
645 strip_newlines (out);
646
647 return trim_str (char_replace (out, '+', ' '));
648 }
649
/* Extract the search keyphrase from a Google search, cache or translate
 * referer. The referer has not been decoded at this point since the
 * search query itself may contain '&'.
 *
 * Note that `ref` is modified: the query value is cut at its terminating
 * '&' (or "%26" when the referer is still percent-encoded).
 *
 * On error, or if no keyphrase is found, 1 is returned.
 * On success, the extracted keyphrase is assigned and 0 is returned. */
static int
extract_keyphrase (char *ref, char **keyphrase) {
  static const char *const engines[] = {
    "http://www.google.",
    "http://webcache.googleusercontent.com/",
    "http://translate.googleusercontent.com/",
    "https://www.google.",
    "https://webcache.googleusercontent.com/",
    "https://translate.googleusercontent.com/",
  };
  char *query = NULL, *stop = NULL, *plus = NULL, *phrase = NULL;
  int encoded = 0, known = 0;
  size_t i;

  for (i = 0; i < sizeof (engines) / sizeof (engines[0]); i++) {
    if (strstr (ref, engines[i]) != NULL) {
      known = 1;
      break;
    }
  }
  if (!known)
    return 1;

  /* webcache.googleusercontent */
  if (strstr (ref, "/+&") != NULL)
    return 1;

  if ((query = strstr (ref, "/+")) != NULL) {
    /* webcache.googleusercontent */
    query += 2;
  } else if ((query = strstr (ref, "q=cache:")) != NULL) {
    /* webcache.googleusercontent: the phrase follows the first '+' */
    if ((plus = strchr (query, '+')) != NULL)
      query = plus + 1;
  } else if ((query = strstr (ref, "&q=")) != NULL || (query = strstr (ref, "?q=")) != NULL) {
    /* www.google.* or translate.googleusercontent */
    query += 3;
  } else if ((query = strstr (ref, "%26q%3D")) != NULL || (query = strstr (ref, "%3Fq%3D")) != NULL) {
    encoded = 1;
    query += 7;
  } else {
    return 1;
  }

  /* terminate the query value at the next parameter separator */
  stop = encoded ? strstr (query, "%26") : strchr (query, '&');
  if (stop != NULL)
    *stop = '\0';

  phrase = decode_url (query);
  if (phrase == NULL || *phrase == '\0') {
    free (phrase);
    return 1;
  }

  phrase = char_replace (phrase, '+', ' ');
  *keyphrase = trim_str (phrase);

  return 0;
}
705
#ifdef HAVE_GEOLOCATION
/* Look up the country and continent of the log item's host.
 *
 * If is_geoip_resource() reports no GeoIP resource, 1 is returned and
 * neither buffer is touched.
 * Otherwise both lookups are run against the caller's buffers and 0 is
 * returned. */
static int
extract_geolocation (GLogItem * logitem, char *continent, char *country) {
  if (is_geoip_resource ()) {
    geoip_get_country (logitem->host, country, logitem->type_ip);
    geoip_get_continent (logitem->host, continent, logitem->type_ip);
    return 0;
  }

  return 1;
}
#endif
723
724
725 /* Parse a URI and extracts the *host* part from it
726 * i.e., //www.example.com/path?googleguy > www.example.com
727 *
728 * On error, 1 is returned.
729 * On success, the extracted referer is set and 0 is returned. */
730 static int
extract_referer_site(const char * referer,char * host)731 extract_referer_site (const char *referer, char *host) {
732 char *url, *begin, *end;
733 int len = 0;
734
735 if ((referer == NULL) || (*referer == '\0'))
736 return 1;
737
738 url = strdup (referer);
739 if ((begin = strstr (url, "//")) == NULL)
740 goto clean;
741
742 begin += 2;
743 if ((len = strlen (begin)) == 0)
744 goto clean;
745
746 if ((end = strchr (begin, '/')) != NULL)
747 len = end - begin;
748
749 if (len == 0)
750 goto clean;
751
752 if (len >= REF_SITE_LEN)
753 len = REF_SITE_LEN;
754
755 memcpy (host, begin, len);
756 host[len] = '\0';
757 free (url);
758 return 0;
759 clean:
760 free (url);
761
762 return 1;
763 }
764
765 /* Determine if the given request is static (e.g., jpg, css, js, etc).
766 *
767 * On error, or if not static, 0 is returned.
768 * On success, the 1 is returned. */
769 static int
verify_static_content(const char * req)770 verify_static_content (const char *req) {
771 const char *nul = req + strlen (req);
772 const char *ext = NULL, *pch = NULL;
773 int elen = 0, i;
774
775 if (strlen (req) < conf.static_file_max_len)
776 return 0;
777
778 for (i = 0; i < conf.static_file_idx; ++i) {
779 ext = conf.static_files[i];
780 if (ext == NULL || *ext == '\0')
781 continue;
782
783 elen = strlen (ext);
784 if (conf.all_static_files && (pch = strchr (req, '?')) != NULL && pch - req > elen) {
785 pch -= elen;
786 if (0 == strncasecmp (ext, pch, elen))
787 return 1;
788 continue;
789 }
790
791 if (!strncasecmp (nul - elen, ext, elen))
792 return 1;
793 }
794
795 return 0;
796 }
797
/* Extract the HTTP (or WebDAV) method the given token starts with.
 *
 * The match is a case-sensitive prefix comparison against the all-upper
 * and all-lower spellings listed below; the first entry that is a prefix
 * of `token` wins.
 *
 * If no method matches, NULL is returned.
 * On success, a pointer to the matching method name is returned; it
 * refers to static storage and must not be freed or modified. */
static const char *
extract_method (const char *token) {
  /* Pair each name with its length, computed at compile time, so nothing
   * has to be measured or initialized at run time. */
#define HTTP_METHOD(s) { (s), sizeof (s) - 1 }
  static const struct {
    const char *name;
    size_t len;
  } methods[] = {
    HTTP_METHOD ("OPTIONS"), HTTP_METHOD ("GET"), HTTP_METHOD ("HEAD"),
    HTTP_METHOD ("POST"), HTTP_METHOD ("PUT"), HTTP_METHOD ("DELETE"),
    HTTP_METHOD ("TRACE"), HTTP_METHOD ("CONNECT"), HTTP_METHOD ("PATCH"),
    HTTP_METHOD ("options"), HTTP_METHOD ("get"), HTTP_METHOD ("head"),
    HTTP_METHOD ("post"), HTTP_METHOD ("put"), HTTP_METHOD ("delete"),
    HTTP_METHOD ("trace"), HTTP_METHOD ("connect"), HTTP_METHOD ("patch"),
    /* WebDAV */
    HTTP_METHOD ("PROPFIND"), HTTP_METHOD ("PROPPATCH"), HTTP_METHOD ("MKCOL"),
    HTTP_METHOD ("COPY"), HTTP_METHOD ("MOVE"), HTTP_METHOD ("LOCK"),
    HTTP_METHOD ("UNLOCK"), HTTP_METHOD ("VERSION-CONTROL"), HTTP_METHOD ("REPORT"),
    HTTP_METHOD ("CHECKOUT"), HTTP_METHOD ("CHECKIN"), HTTP_METHOD ("UNCHECKOUT"),
    HTTP_METHOD ("MKWORKSPACE"), HTTP_METHOD ("UPDATE"), HTTP_METHOD ("LABEL"),
    HTTP_METHOD ("MERGE"), HTTP_METHOD ("BASELINE-CONTROL"), HTTP_METHOD ("MKACTIVITY"),
    HTTP_METHOD ("ORDERPATCH"),
    HTTP_METHOD ("propfind"), HTTP_METHOD ("proppatch"), HTTP_METHOD ("mkcol"),
    HTTP_METHOD ("copy"), HTTP_METHOD ("move"), HTTP_METHOD ("lock"),
    HTTP_METHOD ("unlock"), HTTP_METHOD ("version-control"), HTTP_METHOD ("report"),
    HTTP_METHOD ("checkout"), HTTP_METHOD ("checkin"), HTTP_METHOD ("uncheckout"),
    HTTP_METHOD ("mkworkspace"), HTTP_METHOD ("update"), HTTP_METHOD ("label"),
    HTTP_METHOD ("merge"), HTTP_METHOD ("baseline-control"), HTTP_METHOD ("mkactivity"),
    HTTP_METHOD ("orderpatch"),
  };
#undef HTTP_METHOD
  size_t i;

  for (i = 0; i < sizeof (methods) / sizeof (methods[0]); i++) {
    if (strncmp (token, methods[i].name, methods[i].len) == 0)
      return methods[i].name;
  }

  return NULL;
}
839
840 /* Determine if time-served data was stored on-disk. */
841 static void
contains_usecs(void)842 contains_usecs (void) {
843 if (conf.serve_usecs)
844 return;
845 conf.serve_usecs = 1; /* flag */
846 }
847
/* Determine if the given token is one of the recognized cache status
 * values (compared case-insensitively). Despite the name, every listed
 * status is accepted, not only "HIT".
 *
 * If the token is recognized, 1 is returned.
 * Otherwise, 0 is returned. */
static int
is_cache_hit (const char *tkn) {
  static const char *const statuses[] = {
    "MISS", "BYPASS", "EXPIRED", "STALE", "UPDATING", "REVALIDATED", "HIT",
  };
  size_t i;

  for (i = 0; i < sizeof (statuses) / sizeof (statuses[0]); i++) {
    if (strcasecmp (statuses[i], tkn) == 0)
      return 1;
  }

  return 0;
}
866
/* Determine which known HTTP protocol the given token starts with.
 *
 * The comparison is a case-sensitive prefix match, so e.g. "HTTP/2.0"
 * is reported as "HTTP/2".
 *
 * If the token does not start with a known protocol, NULL is returned.
 * On success, a pointer to the matching protocol string is returned; it
 * refers to static storage and must not be freed. */
static const char *
extract_protocol (const char *token) {
  static const char *const known[] = { "HTTP/1.0", "HTTP/1.1", "HTTP/2" };
  size_t i;

  for (i = 0; i < sizeof (known) / sizeof (known[0]); i++) {
    if (strncmp (token, known[i], strlen (known[i])) == 0)
      return known[i];
  }

  return NULL;
}
881
882 /* Parse a request containing the method and protocol.
883 *
884 * On error, or unable to parse, NULL is returned.
885 * On success, the HTTP request is returned and the method and
886 * protocol are assigned to the corresponding buffers. */
887 static char *
parse_req(char * line,char ** method,char ** protocol)888 parse_req (char *line, char **method, char **protocol) {
889 char *req = NULL, *request = NULL, *dreq = NULL, *ptr = NULL;
890 const char *meth, *proto;
891 ptrdiff_t rlen;
892
893 meth = extract_method (line);
894
895 /* couldn't find a method, so use the whole request line */
896 if (meth == NULL) {
897 request = xstrdup (line);
898 }
899 /* method found, attempt to parse request */
900 else {
901 req = line + strlen (meth);
902 if (!(ptr = strrchr (req, ' ')) || !(proto = extract_protocol (++ptr)))
903 return alloc_string ("-");
904
905 req++;
906 if ((rlen = ptr - req) <= 0)
907 return alloc_string ("-");
908
909 request = xmalloc (rlen + 1);
910 strncpy (request, req, rlen);
911 request[rlen] = 0;
912
913 if (conf.append_method)
914 (*method) = strtoupper (xstrdup (meth));
915
916 if (conf.append_protocol)
917 (*protocol) = strtoupper (xstrdup (proto));
918 }
919
920 if (!(dreq = decode_url (request)))
921 return request;
922 else if (*dreq == '\0') {
923 free (dreq);
924 return request;
925 }
926
927 free (request);
928 return dreq;
929 }
930
#if defined(HAVE_LIBSSL) && defined(HAVE_CIPHER_STD_NAME)
/* Translate a decimal TLS cipher-suite code into its standard cipher
 * name and the protocol version it belongs to.
 *
 * `tkn` holds the decimal code and is ALWAYS freed by this function,
 * on success and on failure alike. The code must fit the two-byte cipher
 * id passed to SSL_CIPHER_find(), so values above 0xFFFF are rejected
 * instead of being silently truncated.
 *
 * On error, 1 is returned and *cipher / *tls_version are left untouched.
 * On success, malloc'd copies are stored in *cipher and *tls_version and
 * 0 is returned. */
static int
extract_tls_version_cipher (char *tkn, char **cipher, char **tls_version) {
  SSL_CTX *ctx = NULL;
  SSL *ssl = NULL;
  const SSL_CIPHER *c = NULL;
  const char *sn = NULL;
  unsigned long long code = 0;
  unsigned short code_be;
  unsigned char cipherid[3];
  char *bEnd = NULL;
  int ret = 1;

  errno = 0;
  code = strtoull (tkn, &bEnd, 10);
  if (tkn == bEnd || *bEnd != '\0' || errno == ERANGE || code > 0xFFFFULL) {
    LOG_DEBUG (("unable to convert cipher code to a valid decimal."));
    goto out;
  }

  /* ssl context */
  if (!(ctx = SSL_CTX_new (SSLv23_server_method ()))) {
    LOG_DEBUG (("Unable to create a new SSL_CTX_new to extact TLS."));
    goto out;
  }
  if (!(ssl = SSL_new (ctx))) {
    LOG_DEBUG (("Unable to create a new instace of SSL_new to extact TLS."));
    goto out;
  }

  /* the lookup takes the id as two bytes in network (big-endian) order */
  code_be = htobe16 ((unsigned short) code);
  memcpy (cipherid, &code_be, 2);
  cipherid[2] = 0;

  if (!(c = SSL_CIPHER_find (ssl, cipherid))) {
    LOG_DEBUG (("Unable to find cipher to extact TLS."));
    goto out;
  }

  if (!(sn = SSL_CIPHER_standard_name (c))) {
    LOG_DEBUG (("Unable to get cipher standard name to extact TLS."));
    goto out;
  }

  *cipher = xstrdup (sn);
  *tls_version = xstrdup (SSL_CIPHER_get_version (c));
  ret = 0;

out:
  /* single exit: release whatever was acquired on every path */
  if (ssl)
    SSL_free (ssl);
  if (ctx)
    SSL_CTX_free (ctx);
  free (tkn);

  return ret;
}
#endif
987
/* Copy the delimiter that follows the current log-format character into
 * the destination buffer.
 *
 * `p` points at the current format character; the delimiter is the
 * character right after it. Only dest[0] is ever written, so the caller
 * must have zeroed the rest of `dest` for it to be usable as a string.
 *
 * If `p` is at, or one character before, the end of the format, dest[0]
 * is set to NUL (empty delimiter).
 * Otherwise dest[0] is set to the delimiter. */
static void
get_delim (char *dest, const char *p) {
  const int has_next = (p[0] != '\0' && p[1] != '\0');

  dest[0] = has_next ? p[1] : '\0';
}
1003
/* Copy the text between *str and pch (exclusive) into a new buffer.
 *
 * When move_ptr is non-zero, *str is advanced to pch.
 *
 * On success, the malloc'd token is returned after being passed through
 * trim_str(). */
static char *
parsed_string (const char *pch, char **str, int move_ptr) {
  const size_t span = (size_t) (pch - *str);
  char *tok = xmalloc (span + 1);

  memcpy (tok, *str, span);
  tok[span] = '\0';

  if (move_ptr)
    *str += span;

  return trim_str (tok);
}
1020
/* Find and extract a token given a log format rule.
 *
 * The delimiter actually used is the first character of `delims` that
 * occurs in *str; the token ends at its cnt-th occurrence, or at the end
 * of the string. A character preceded by a backslash is skipped and so
 * never counts as a delimiter. With an empty `delims` the token runs to
 * the end of the string.
 *
 * If none of `delims` occurs in *str, or the scan ends right after a
 * trailing backslash, NULL is returned.
 * On success, the malloc'd token is returned and *str is advanced past
 * it. */
static char *
parse_string (char **str, const char *delims, int cnt) {
  char *cur = *str, *first = NULL;
  char delim = 0x0;
  int seen = 0;

  if (*delims != 0x0) {
    if ((first = strpbrk (*str, delims)) == NULL)
      return NULL;
    delim = *first;
  }

  for (;;) {
    /* match number of delims */
    if (*cur == delim)
      seen++;

    /* delim found, parse string then */
    if ((*cur == delim && seen == cnt) || *cur == '\0')
      return parsed_string (cur, str, 1);

    /* advance to the first unescaped delim */
    if (*cur == '\\')
      cur++;

    /* test the current char, then step past it */
    if (*cur++ == '\0')
      break;
  }

  return NULL;
}
1049
/* Advance *str past any leading whitespace, leaving it on the first
 * non-space (!isspace) character or on the terminating NUL.
 *
 * Each byte is cast to unsigned char before the isspace() call; log
 * lines may contain bytes >= 0x80, and passing a negative plain char to
 * a <ctype.h> function is undefined behavior. */
static void
find_alpha (char **str) {
  char *s = *str;

  while (*s != '\0' && isspace ((unsigned char) *s))
    s++;

  *str = s;
}
1063
/* Count the whitespace characters at the start of the given string,
 * i.e., how many characters precede the first non-space (!isspace)
 * character. The string itself is not modified.
 *
 * Each byte is cast to unsigned char before the isspace() call, since
 * passing a negative plain char to a <ctype.h> function is undefined
 * behavior.
 *
 * The number of leading whitespace characters is returned. */
static int
find_alpha_count (char *str) {
  int cnt = 0;

  while (str[cnt] != '\0' && isspace ((unsigned char) str[cnt]))
    cnt++;

  return cnt;
}
1078
1079 /* Format the broken-down time tm to a numeric date format.
1080 *
1081 * On error, or unable to format the given tm, 1 is returned.
1082 * On success, a malloc'd format is returned. */
1083 #pragma GCC diagnostic ignored "-Wformat-nonliteral"
1084 static int
set_date(char ** fdate,struct tm tm)1085 set_date (char **fdate, struct tm tm) {
1086 char buf[DATE_LEN] = ""; /* Ymd */
1087
1088 memset (buf, 0, sizeof (buf));
1089 if (strftime (buf, DATE_LEN, conf.date_num_format, &tm) <= 0)
1090 return 1;
1091 *fdate = xstrdup (buf);
1092
1093 return 0;
1094 }
1095
1096 /* Format the broken-down time tm to a numeric time format.
1097 *
1098 * On error, or unable to format the given tm, 1 is returned.
1099 * On success, a malloc'd format is returned. */
1100 static int
set_time(char ** ftime,struct tm tm)1101 set_time (char **ftime, struct tm tm) {
1102 char buf[TIME_LEN] = "";
1103
1104 memset (buf, 0, sizeof (buf));
1105 if (strftime (buf, TIME_LEN, "%H:%M:%S", &tm) <= 0)
1106 return 1;
1107 *ftime = xstrdup (buf);
1108
1109 return 0;
1110 }
1111
1112 /* Determine the parsing specifier error and construct a message out
1113 * of it.
1114 *
1115 * On success, a malloc'd error message is assigned to the log
1116 * structure and 1 is returned. */
1117 static int
spec_err(GLogItem * logitem,int code,const char spec,const char * tkn)1118 spec_err (GLogItem * logitem, int code, const char spec, const char *tkn) {
1119 char *err = NULL;
1120 const char *fmt = NULL;
1121
1122 switch (code) {
1123 case SPEC_TOKN_NUL:
1124 fmt = "Token for '%%%c' specifier is NULL.";
1125 err = xmalloc (snprintf (NULL, 0, fmt, spec) + 1);
1126 sprintf (err, fmt, spec);
1127 break;
1128 case SPEC_TOKN_INV:
1129 fmt = "Token '%s' doesn't match specifier '%%%c'";
1130 err = xmalloc (snprintf (NULL, 0, fmt, (tkn ? tkn : "-"), spec) + 1);
1131 sprintf (err, fmt, (tkn ? tkn : "-"), spec);
1132 break;
1133 case SPEC_SFMT_MIS:
1134 fmt = "Missing braces '%s' and ignore chars for specifier '%%%c'";
1135 err = xmalloc (snprintf (NULL, 0, fmt, (tkn ? tkn : "-"), spec) + 1);
1136 sprintf (err, fmt, (tkn ? tkn : "-"), spec);
1137 break;
1138 }
1139 logitem->errstr = err;
1140
1141 return code;
1142 }
1143
1144 static void
set_tm_dt_logitem(GLogItem * logitem,struct tm tm)1145 set_tm_dt_logitem (GLogItem * logitem, struct tm tm) {
1146 logitem->dt.tm_year = tm.tm_year;
1147 logitem->dt.tm_mon = tm.tm_mon;
1148 logitem->dt.tm_mday = tm.tm_mday;
1149 }
1150
1151 static void
set_tm_tm_logitem(GLogItem * logitem,struct tm tm)1152 set_tm_tm_logitem (GLogItem * logitem, struct tm tm) {
1153 logitem->dt.tm_hour = tm.tm_hour;
1154 logitem->dt.tm_min = tm.tm_min;
1155 logitem->dt.tm_sec = tm.tm_sec;
1156 }
1157
/* Convert the given date string with str2int() and store the result in
 * *numdate. A result of -1 is reported through FATAL(). */
static void
set_numeric_date (uint32_t * numdate, const char *date) {
  const int parsed = str2int (date);

  if (parsed == -1)
    FATAL ("Unable to parse date to integer %s", date);

  *numdate = parsed;
}
1165
1166 #pragma GCC diagnostic warning "-Wformat-nonliteral"
1167
1168 /* Parse the log string given log format rule.
1169 *
1170 * On error, or unable to parse it, 1 is returned.
1171 * On success, the malloc'd token is assigned to a GLogItem member. */
1172 static int
parse_specifier(GLogItem * logitem,char ** str,const char * p,const char * end)1173 parse_specifier (GLogItem * logitem, char **str, const char *p, const char *end) {
1174 struct tm tm;
1175 time_t now = time (0);
1176 const char *dfmt = conf.date_format;
1177 const char *tfmt = conf.time_format;
1178
1179 char *pch, *sEnd, *bEnd, *tkn = NULL;
1180 double serve_secs = 0.0;
1181 uint64_t bandw = 0, serve_time = 0;
1182 long status = 0L;
1183 int dspc = 0, fmtspcs = 0;
1184
1185 errno = 0;
1186 memset (&tm, 0, sizeof (tm));
1187 localtime_r (&now, &tm);
1188
1189 switch (*p) {
1190 /* date */
1191 case 'd':
1192 if (logitem->date)
1193 return 0;
1194
1195 /* Attempt to parse date format containing spaces,
1196 * i.e., syslog date format (Jul\s15, Nov\s\s2).
1197 * Note that it's possible a date could contain some padding, e.g.,
1198 * Dec\s\s2 vs Nov\s22, so we attempt to take that into consideration by looking
1199 * ahead the log string and counting the # of spaces until we find an alphanum char. */
1200 if ((fmtspcs = count_matches (dfmt, ' ')) && (pch = strchr (*str, ' ')))
1201 dspc = find_alpha_count (pch);
1202
1203 if (!(tkn = parse_string (&(*str), end, MAX (dspc, fmtspcs) + 1)))
1204 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1205
1206 if (str_to_time (tkn, dfmt, &tm) != 0 || set_date (&logitem->date, tm) != 0) {
1207 spec_err (logitem, SPEC_TOKN_INV, *p, tkn);
1208 free (tkn);
1209 return 1;
1210 }
1211
1212 set_numeric_date (&logitem->numdate, logitem->date);
1213 set_tm_dt_logitem (logitem, tm);
1214 free (tkn);
1215 break;
1216 /* time */
1217 case 't':
1218 if (logitem->time)
1219 return 0;
1220 if (!(tkn = parse_string (&(*str), end, 1)))
1221 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1222
1223 if (str_to_time (tkn, tfmt, &tm) != 0 || set_time (&logitem->time, tm) != 0) {
1224 spec_err (logitem, SPEC_TOKN_INV, *p, tkn);
1225 free (tkn);
1226 return 1;
1227 }
1228
1229 set_tm_tm_logitem (logitem, tm);
1230 free (tkn);
1231 break;
1232 /* date/time as decimal, i.e., timestamps, ms/us */
1233 case 'x':
1234 if (logitem->time && logitem->date)
1235 return 0;
1236 if (!(tkn = parse_string (&(*str), end, 1)))
1237 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1238
1239 if (str_to_time (tkn, tfmt, &tm) != 0 || set_date (&logitem->date, tm) != 0 ||
1240 set_time (&logitem->time, tm) != 0) {
1241 spec_err (logitem, SPEC_TOKN_INV, *p, tkn);
1242 free (tkn);
1243 return 1;
1244 }
1245 set_numeric_date (&logitem->numdate, logitem->date);
1246 set_tm_dt_logitem (logitem, tm);
1247 set_tm_tm_logitem (logitem, tm);
1248 free (tkn);
1249 break;
1250 /* Virtual Host */
1251 case 'v':
1252 if (logitem->vhost)
1253 return 0;
1254 tkn = parse_string (&(*str), end, 1);
1255 if (tkn == NULL)
1256 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1257 logitem->vhost = tkn;
1258 break;
1259 /* remote user */
1260 case 'e':
1261 if (logitem->userid)
1262 return 0;
1263 tkn = parse_string (&(*str), end, 1);
1264 if (tkn == NULL)
1265 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1266 logitem->userid = tkn;
1267 break;
1268 /* cache status */
1269 case 'C':
1270 if (logitem->cache_status)
1271 return 0;
1272 tkn = parse_string (&(*str), end, 1);
1273 if (tkn == NULL)
1274 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1275 if (is_cache_hit (tkn))
1276 logitem->cache_status = tkn;
1277 else
1278 free (tkn);
1279 break;
1280 /* remote hostname (IP only) */
1281 case 'h':
1282 if (logitem->host)
1283 return 0;
1284 if (!(tkn = parse_string (&(*str), end, 1)))
1285 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1286
1287 if (!conf.no_ip_validation && invalid_ipaddr (tkn, &logitem->type_ip)) {
1288 spec_err (logitem, SPEC_TOKN_INV, *p, tkn);
1289 free (tkn);
1290 return 1;
1291 }
1292 logitem->host = tkn;
1293 break;
1294 /* request method */
1295 case 'm':
1296 if (logitem->method)
1297 return 0;
1298 if (!(tkn = parse_string (&(*str), end, 1)))
1299 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1300
1301 if (!extract_method (tkn)) {
1302 spec_err (logitem, SPEC_TOKN_INV, *p, tkn);
1303 free (tkn);
1304 return 1;
1305 }
1306 logitem->method = tkn;
1307 break;
1308 /* request not including method or protocol */
1309 case 'U':
1310 if (logitem->req)
1311 return 0;
1312 tkn = parse_string (&(*str), end, 1);
1313 if (tkn == NULL || *tkn == '\0') {
1314 free (tkn);
1315 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1316 }
1317
1318 if ((logitem->req = decode_url (tkn)) == NULL) {
1319 spec_err (logitem, SPEC_TOKN_INV, *p, tkn);
1320 free (tkn);
1321 return 1;
1322 }
1323 free (tkn);
1324 break;
1325 /* query string alone, e.g., ?param=goaccess&tbm=shop */
1326 case 'q':
1327 if (logitem->qstr)
1328 return 0;
1329 tkn = parse_string (&(*str), end, 1);
1330 if (tkn == NULL || *tkn == '\0') {
1331 free (tkn);
1332 return 0;
1333 }
1334
1335 if ((logitem->qstr = decode_url (tkn)) == NULL) {
1336 spec_err (logitem, SPEC_TOKN_INV, *p, tkn);
1337 free (tkn);
1338 return 1;
1339 }
1340 free (tkn);
1341 break;
1342 /* request protocol */
1343 case 'H':
1344 if (logitem->protocol)
1345 return 0;
1346 if (!(tkn = parse_string (&(*str), end, 1)))
1347 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1348
1349 if (!extract_protocol (tkn)) {
1350 spec_err (logitem, SPEC_TOKN_INV, *p, tkn);
1351 free (tkn);
1352 return 1;
1353 }
1354 logitem->protocol = tkn;
1355 break;
1356 /* request, including method + protocol */
1357 case 'r':
1358 if (logitem->req)
1359 return 0;
1360 if (!(tkn = parse_string (&(*str), end, 1)))
1361 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1362
1363 logitem->req = parse_req (tkn, &logitem->method, &logitem->protocol);
1364 free (tkn);
1365 break;
1366 /* Status Code */
1367 case 's':
1368 if (logitem->status)
1369 return 0;
1370 if (!(tkn = parse_string (&(*str), end, 1)))
1371 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1372
1373 /* do not validate HTTP status code */
1374 if (conf.no_strict_status) {
1375 logitem->status = tkn;
1376 break;
1377 }
1378
1379 status = strtol (tkn, &sEnd, 10);
1380 if (tkn == sEnd || *sEnd != '\0' || errno == ERANGE || status < 100 || status > 599) {
1381 spec_err (logitem, SPEC_TOKN_INV, *p, tkn);
1382 free (tkn);
1383 return 1;
1384 }
1385 logitem->status = tkn;
1386 break;
1387 /* size of response in bytes - excluding HTTP headers */
1388 case 'b':
1389 if (logitem->resp_size)
1390 return 0;
1391 if (!(tkn = parse_string (&(*str), end, 1)))
1392 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1393
1394 bandw = strtoull (tkn, &bEnd, 10);
1395 if (tkn == bEnd || *bEnd != '\0' || errno == ERANGE)
1396 bandw = 0;
1397 logitem->resp_size = bandw;
1398 conf.bandwidth = 1;
1399 free (tkn);
1400 break;
1401 /* referrer */
1402 case 'R':
1403 if (logitem->ref)
1404 return 0;
1405
1406 if (!(tkn = parse_string (&(*str), end, 1)))
1407 tkn = alloc_string ("-");
1408 if (*tkn == '\0') {
1409 free (tkn);
1410 tkn = alloc_string ("-");
1411 }
1412 if (strcmp (tkn, "-") != 0) {
1413 extract_keyphrase (tkn, &logitem->keyphrase);
1414 extract_referer_site (tkn, logitem->site);
1415
1416 /* hide referrers from report */
1417 if (hide_referer (logitem->site)) {
1418 logitem->site[0] = '\0';
1419 free (tkn);
1420 } else
1421 logitem->ref = tkn;
1422 break;
1423 }
1424 logitem->ref = tkn;
1425
1426 break;
1427 /* user agent */
1428 case 'u':
1429 if (logitem->agent)
1430 return 0;
1431
1432 tkn = parse_string (&(*str), end, 1);
1433 if (tkn != NULL && *tkn != '\0') {
1434 /* Make sure the user agent is decoded (i.e.: CloudFront)
1435 * and replace all '+' with ' ' (i.e.: w3c) */
1436 logitem->agent = decode_url (tkn);
1437 free (tkn);
1438 break;
1439 } else if (tkn != NULL && *tkn == '\0') {
1440 free (tkn);
1441 tkn = alloc_string ("-");
1442 }
1443 /* must be null */
1444 else {
1445 tkn = alloc_string ("-");
1446 }
1447 logitem->agent = tkn;
1448 break;
1449 /* time taken to serve the request, in milliseconds as a decimal number */
1450 case 'L':
1451 /* ignore it if we already have served time */
1452 if (logitem->serve_time)
1453 return 0;
1454 if (!(tkn = parse_string (&(*str), end, 1)))
1455 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1456
1457 serve_secs = strtoull (tkn, &bEnd, 10);
1458 if (tkn == bEnd || *bEnd != '\0' || errno == ERANGE)
1459 serve_secs = 0;
1460 /* convert it to microseconds */
1461 logitem->serve_time = (serve_secs > 0) ? serve_secs * MILS : 0;
1462
1463 contains_usecs (); /* set flag */
1464 free (tkn);
1465 break;
1466 /* time taken to serve the request, in seconds with a milliseconds
1467 * resolution */
1468 case 'T':
1469 /* ignore it if we already have served time */
1470 if (logitem->serve_time)
1471 return 0;
1472 if (!(tkn = parse_string (&(*str), end, 1)))
1473 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1474
1475 if (strchr (tkn, '.') != NULL)
1476 serve_secs = strtod (tkn, &bEnd);
1477 else
1478 serve_secs = strtoull (tkn, &bEnd, 10);
1479
1480 if (tkn == bEnd || *bEnd != '\0' || errno == ERANGE)
1481 serve_secs = 0;
1482 /* convert it to microseconds */
1483 logitem->serve_time = (serve_secs > 0) ? serve_secs * SECS : 0;
1484
1485 contains_usecs (); /* set flag */
1486 free (tkn);
1487 break;
1488 /* time taken to serve the request, in microseconds */
1489 case 'D':
1490 /* ignore it if we already have served time */
1491 if (logitem->serve_time)
1492 return 0;
1493 if (!(tkn = parse_string (&(*str), end, 1)))
1494 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1495
1496 serve_time = strtoull (tkn, &bEnd, 10);
1497 if (tkn == bEnd || *bEnd != '\0' || errno == ERANGE)
1498 serve_time = 0;
1499 logitem->serve_time = serve_time;
1500
1501 contains_usecs (); /* set flag */
1502 free (tkn);
1503 break;
1504
1505 /* UMS: Krypto (TLS) "ECDHE-RSA-AES128-GCM-SHA256" */
1506 case 'k':
1507 /* error to set this twice */
1508 if (logitem->tls_cypher)
1509 return 0;
1510 if (!(tkn = parse_string (&(*str), end, 1)))
1511 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1512
1513 #if defined(HAVE_LIBSSL) && defined(HAVE_CIPHER_STD_NAME)
1514 {
1515 char *tmp = NULL;
1516 for (tmp = tkn; isdigit (*tmp); tmp++);
1517 if (!strlen (tmp))
1518 extract_tls_version_cipher (tkn, &logitem->tls_cypher, &logitem->tls_type);
1519 else
1520 logitem->tls_cypher = tkn;
1521 }
1522 #else
1523 logitem->tls_cypher = tkn;
1524 #endif
1525
1526 break;
1527
1528 /* UMS: Krypto (TLS) parameters like "TLSv1.2" */
1529 case 'K':
1530 /* error to set this twice */
1531 if (logitem->tls_type)
1532 return 0;
1533 if (!(tkn = parse_string (&(*str), end, 1)))
1534 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1535
1536 logitem->tls_type = tkn;
1537 break;
1538
1539 /* UMS: Mime-Type like "text/html" */
1540 case 'M':
1541 /* error to set this twice */
1542 if (logitem->mime_type)
1543 return 0;
1544 if (!(tkn = parse_string (&(*str), end, 1)))
1545 return spec_err (logitem, SPEC_TOKN_NUL, *p, NULL);
1546
1547 logitem->mime_type = tkn;
1548
1549 break;
1550 /* move forward through str until not a space */
1551 case '~':
1552 find_alpha (&(*str));
1553 break;
1554 /* everything else skip it */
1555 default:
1556 if ((pch = strchr (*str, p[1])) != NULL)
1557 *str += pch - *str;
1558 }
1559
1560 return 0;
1561 }
1562
/* Extract the character set enclosed in curly braces from the log format,
 * i.e., the characters to reject when attempting to parse the XFF field.
 *
 * The scan starts at *p and stops at the first unescaped '}'. A brace
 * that directly follows a backslash is not taken as a boundary.
 *
 * If either brace is missing, or nothing is enclosed, NULL is returned and
 * *p is left untouched.
 * On success, the malloc'd reject set is returned and *p is moved to the
 * character right after the closing brace. */
static char *
extract_braces (char **p) {
  char *open = NULL, *close = NULL, *set = NULL, *cur = NULL;
  int escaped = 0;
  ptrdiff_t n = 0;

  /* walk the log format until the closing brace is located */
  for (cur = *p; *cur != '\0' && close == NULL; cur++) {
    if (*cur == '\\')
      escaped = 1;
    else if (!escaped && *cur == '{')
      open = cur;
    else if (!escaped && *cur == '}')
      close = cur;
    else
      escaped = 0;
  }

  if (open == NULL || close == NULL)
    return NULL;

  n = close - (open + 1);
  if (n <= 0)
    return NULL;

  /* copy what sits between the braces */
  set = xmalloc (n + 1);
  memcpy (set, open + 1, n);
  set[n] = '\0';
  *p = close + 1;

  return set;
}
1601
1602 /* Attempt to extract the client IP from an X-Forwarded-For (XFF) field.
1603 *
1604 * If no IP is found, 1 is returned.
1605 * On success, the malloc'd token is assigned to a GLogItem->host and
1606 * 0 is returned. */
1607 static int
find_xff_host(GLogItem * logitem,char ** str,char ** p)1608 find_xff_host (GLogItem * logitem, char **str, char **p) {
1609 char *ptr = NULL, *tkn = NULL, *skips = NULL;
1610 int invalid_ip = 1, len = 0, type_ip = TYPE_IPINV;
1611 int idx = 0, skips_len = 0;
1612
1613 if (!(skips = extract_braces (p)))
1614 return spec_err (logitem, SPEC_SFMT_MIS, **p, "{}");
1615
1616 skips_len = strlen (skips);
1617 ptr = *str;
1618 while (*ptr != '\0') {
1619 if ((len = strcspn (ptr, skips)) == 0) {
1620 len++, ptr++, idx++;
1621 goto move;
1622 }
1623 /* If our index does not match the number of delimiters and we have already a
1624 * valid client IP, then we assume we have reached the length of the XFF */
1625 if (idx < skips_len && logitem->host)
1626 break;
1627
1628 ptr += len;
1629 /* extract possible IP */
1630 if (!(tkn = parsed_string (ptr, str, 0)))
1631 break;
1632
1633 invalid_ip = invalid_ipaddr (tkn, &type_ip);
1634 /* done, already have IP and current token is not a host */
1635 if (logitem->host && invalid_ip) {
1636 free (tkn);
1637 break;
1638 }
1639 if (!logitem->host && !invalid_ip) {
1640 logitem->host = xstrdup (tkn);
1641 logitem->type_ip = type_ip;
1642 }
1643 free (tkn);
1644 idx = 0;
1645
1646 move:
1647 *str += len;
1648 }
1649
1650 free (skips);
1651
1652 return logitem->host == NULL;
1653 }
1654
1655 /* Handle special specifiers.
1656 *
1657 * On error, or unable to parse it, 1 is returned.
1658 * On success, the malloc'd token is assigned to a GLogItem member and
1659 * 0 is returned. */
1660 static int
special_specifier(GLogItem * logitem,char ** str,char ** p)1661 special_specifier (GLogItem * logitem, char **str, char **p) {
1662 switch (**p) {
1663 /* XFF remote hostname (IP only) */
1664 case 'h':
1665 if (logitem->host)
1666 return 0;
1667 if (find_xff_host (logitem, str, p))
1668 return spec_err (logitem, SPEC_TOKN_NUL, 'h', NULL);
1669 break;
1670 }
1671
1672 return 0;
1673 }
1674
1675 /* Iterate over the given log format.
1676 *
1677 * On error, or unable to parse it, 1 is returned.
1678 * On success, the malloc'd token is assigned to a GLogItem member and
1679 * 0 is returned. */
1680 static int
parse_format(GLogItem * logitem,char * str,char * lfmt)1681 parse_format (GLogItem * logitem, char *str, char *lfmt) {
1682 char end[2 + 1] = { 0 };
1683 char *p = NULL;
1684 int perc = 0, tilde = 0, ret = 0;
1685
1686 if (str == NULL || *str == '\0')
1687 return 1;
1688
1689 /* iterate over the log format */
1690 for (p = lfmt; *p; p++) {
1691 if (*p == '%') {
1692 perc++;
1693 continue;
1694 }
1695 if (*p == '~' && perc == 0) {
1696 tilde++;
1697 continue;
1698 }
1699 if (*str == '\n')
1700 return 0;
1701
1702 if (tilde && *p != '\0') {
1703 if ((str == NULL) || (*str == '\0'))
1704 return 0;
1705 if (special_specifier (logitem, &str, &p) == 1)
1706 return 1;
1707 tilde = 0;
1708 }
1709 /* %h */
1710 else if (perc && *p != '\0') {
1711 if ((str == NULL) || (*str == '\0'))
1712 return 0;
1713
1714 memset (end, 0, sizeof end);
1715 get_delim (end, p);
1716 /* attempt to parse format specifiers */
1717 if ((ret = parse_specifier (logitem, &str, p, end)))
1718 return ret;
1719 perc = 0;
1720 } else if (perc && isspace (p[0])) {
1721 return 1;
1722 } else {
1723 str++;
1724 }
1725 }
1726
1727 return 0;
1728 }
1729
/* Determine if the log string is worth parsing.
 *
 * A NULL or empty line, a comment (leading '#') or a line that starts
 * with a newline is rejected.
 *
 * On error, or invalid, 1 is returned.
 * On success, or valid line, 0 is returned. */
static int
valid_line (char *line) {
  if (line == NULL)
    return 1;

  switch (*line) {
  case '\0':   /* empty */
  case '#':    /* comment */
  case '\n':   /* blank */
    return 1;
  default:
    return 0;
  }
}
1745
1746 /* Determine if we need to lock the mutex. */
1747 static void
lock_spinner(void)1748 lock_spinner (void) {
1749 if (parsing_spinner != NULL && parsing_spinner->state == SPN_RUN)
1750 pthread_mutex_lock (&parsing_spinner->mutex);
1751 }
1752
1753 /* Determine if we need to unlock the mutex. */
1754 static void
unlock_spinner(void)1755 unlock_spinner (void) {
1756 if (parsing_spinner != NULL && parsing_spinner->state == SPN_RUN)
1757 pthread_mutex_unlock (&parsing_spinner->mutex);
1758 }
1759
/* Cut the request at its query string, in place. e.g.,
 * /index.php?timestamp=1454385289 becomes /index.php
 *
 * A request that starts with '?' or that has no '?' is left as is. */
static void
strip_qstring (char *req) {
  char *qmark = strchr (req, '?');

  /* no query string, or nothing would be left of the request */
  if (qmark == NULL || qmark == req)
    return;

  *qmark = '\0';
}
1770
/* Increment the overall bandwidth.
 *
 * Thin wrapper: forwards the numeric date and the response size, as
 * given, to ht_inc_cnt_bw(). */
static void
count_bw (int numdate, uint64_t resp_size) {
  ht_inc_cnt_bw (numdate, resp_size);
}
1776
1777 /* Output all log errors stored during parsing. */
1778 void
output_logerrors(Logs * logs)1779 output_logerrors (Logs * logs) {
1780 GLog *glog = NULL;
1781 int pid = getpid (), i;
1782
1783 for (i = 0; i < logs->size; ++i) {
1784 glog = &logs->glog[i];
1785 if (!glog->log_erridx)
1786 continue;
1787
1788 fprintf (stderr, "==%d== GoAccess - Copyright (C) 2009-2020 by Gerardo Orellana\n", pid);
1789 fprintf (stderr, "==%d== https://goaccess.io - <hello@goaccess.io>\n", pid);
1790 fprintf (stderr, "==%d== Released under the MIT License.\n", pid);
1791 fprintf (stderr, "==%d==\n", pid);
1792 fprintf (stderr, "==%d== FILE: %s\n", pid, glog->filename);
1793 fprintf (stderr, "==%d== ", pid);
1794 fprintf (stderr, ERR_PARSED_NLINES, glog->log_erridx);
1795 fprintf (stderr, " %s:\n", ERR_PARSED_NLINES_DESC);
1796 fprintf (stderr, "==%d==\n", pid);
1797 for (i = 0; i < glog->log_erridx; ++i)
1798 fprintf (stderr, "==%d== %s\n", pid, glog->errors[i]);
1799 }
1800 fprintf (stderr, "==%d==\n", pid);
1801 fprintf (stderr, "==%d== %s\n", pid, ERR_FORMAT_HEADER);
1802 }
1803
1804 /* Ensure we have the following fields. */
1805 static int
verify_missing_fields(GLogItem * logitem)1806 verify_missing_fields (GLogItem * logitem) {
1807 /* must have the following fields */
1808 if (logitem->host == NULL)
1809 logitem->errstr = xstrdup ("IPv4/6 is required.");
1810 else if (logitem->date == NULL)
1811 logitem->errstr = xstrdup ("A valid date is required.");
1812 else if (logitem->req == NULL)
1813 logitem->errstr = xstrdup ("A request is required.");
1814
1815 return logitem->errstr != NULL;
1816 }
1817
1818 /* Keep track of all invalid log strings. */
1819 static void
count_invalid(GLog * glog,const char * line)1820 count_invalid (GLog * glog, const char *line) {
1821 glog->invalid++;
1822 ht_inc_cnt_overall ("failed_requests", 1);
1823
1824 if (conf.invalid_requests_log) {
1825 LOG_INVALID (("%s", line));
1826 }
1827
1828 if (glog->items->errstr && glog->invalid < MAX_LOG_ERRORS) {
1829 glog->errors[glog->log_erridx++] = xstrdup (glog->items->errstr);
1830 }
1831 }
1832
1833 /* Count down the number of invalids hits.
1834 * Note: Upon performing a log test, invalid hits are counted, since
1835 * no valid records were found, then we count down by the number of
1836 * tests ran.
1837 */
1838 static void
uncount_invalid(GLog * glog)1839 uncount_invalid (GLog * glog) {
1840 if (glog->invalid > conf.num_tests)
1841 glog->invalid -= conf.num_tests;
1842 else
1843 glog->invalid = 0;
1844 }
1845
1846 /* Count down the number of processed hits.
1847 * Note: Upon performing a log test, processed hits are counted, since
1848 * no valid records were found, then we count down by the number of
1849 * tests ran.
1850 */
1851 static void
uncount_processed(GLog * glog)1852 uncount_processed (GLog * glog) {
1853 lock_spinner ();
1854 if (glog->processed > conf.num_tests)
1855 glog->processed -= conf.num_tests;
1856 else
1857 glog->processed = 0;
1858 unlock_spinner ();
1859 }
1860
/* Keep track of all valid log strings.
 *
 * Increments the valid counter for the given numeric date by one through
 * ht_inc_cnt_valid(), wrapped in lock_spinner()/unlock_spinner(). */
static void
count_valid (int numdate) {
  lock_spinner ();
  ht_inc_cnt_valid (numdate, 1);
  unlock_spinner ();
}
1868
/* Keep track of all valid and processed log strings.
 *
 * Increments glog->processed and the overall "total_requests" counter by
 * one, wrapped in lock_spinner()/unlock_spinner(). */
static void
count_process (GLog * glog) {
  lock_spinner ();
  glog->processed++;
  ht_inc_cnt_overall ("total_requests", 1);
  unlock_spinner ();
}
1877
/* Count the given log string as processed and as invalid.
 *
 * Calls count_process() first and count_invalid() second. */
static void
count_process_and_invalid (GLog * glog, const char *line) {
  count_process (glog);
  count_invalid (glog, line);
}
1883
1884 /* Keep track of all excluded log strings (IPs).
1885 *
1886 * If IP not range, 1 is returned.
1887 * If IP is excluded, 0 is returned. */
1888 static int
excluded_ip(GLogItem * logitem)1889 excluded_ip (GLogItem * logitem) {
1890 if (conf.ignore_ip_idx && ip_in_range (logitem->host)) {
1891 ht_inc_cnt_overall ("excluded_ip", 1);
1892 return 0;
1893 }
1894 return 1;
1895 }
1896
1897 /* Determine if the request is from a robot or spider and check if we
1898 * need to ignore or show crawlers only.
1899 *
1900 * If the request line is not ignored, 0 is returned.
1901 * If the request line is ignored, 1 is returned. */
1902 static int
handle_crawler(const char * agent)1903 handle_crawler (const char *agent) {
1904 int bot = 0;
1905
1906 if (!conf.ignore_crawlers && !conf.crawlers_only)
1907 return 1;
1908
1909 bot = is_crawler (agent);
1910 return (conf.ignore_crawlers && bot) || (conf.crawlers_only && !bot) ? 0 : 1;
1911 }
1912
/* A wrapper function to determine if the request is static.
 *
 * Returns whatever verify_static_content() returns for the request:
 * If the request is not static, 0 is returned.
 * If the request is static, 1 is returned. */
static int
is_static (const char *req) {
  return verify_static_content (req);
}
1921
1922 /* Determine if the request of the given status code needs to be
1923 * ignored.
1924 *
1925 * If the status code is not within the ignore-array, 0 is returned.
1926 * If the status code is within the ignore-array, 1 is returned. */
1927 static int
ignore_status_code(const char * status)1928 ignore_status_code (const char *status) {
1929 if (conf.ignore_status_idx == 0)
1930 return 0;
1931
1932 if (str_inarray (status, conf.ignore_status, conf.ignore_status_idx) != -1)
1933 return 1;
1934 return 0;
1935 }
1936
1937 /* Determine if static file request should be ignored
1938 *
1939 * If the request line is not ignored, 0 is returned.
1940 * If the request line is ignored, 1 is returned. */
1941 static int
ignore_static(const char * req)1942 ignore_static (const char *req) {
1943 if (conf.ignore_statics && is_static (req))
1944 return 1;
1945 return 0;
1946 }
1947
1948 /* Determine if the request status code is a 404.
1949 *
1950 * If the request is not a 404, 0 is returned.
1951 * If the request is a 404, 1 is returned. */
1952 static int
is_404(GLogItem * logitem)1953 is_404 (GLogItem * logitem) {
1954 /* is this a 404? */
1955 if (logitem->status && !memcmp (logitem->status, "404", 3))
1956 return 1;
1957 /* treat 444 as 404? */
1958 else if (logitem->status && !memcmp (logitem->status, "444", 3) && conf.code444_as_404)
1959 return 1;
1960 return 0;
1961 }
1962
1963 /* A wrapper function to determine if a log line needs to be ignored.
1964 *
1965 * If the request line is not ignored, 0 is returned.
1966 * If the request line is ignored, IGNORE_LEVEL_PANEL is returned.
1967 * If the request line is only not counted as valid, IGNORE_LEVEL_REQ is returned. */
1968 static int
ignore_line(GLogItem * logitem)1969 ignore_line (GLogItem * logitem) {
1970 if (excluded_ip (logitem) == 0)
1971 return IGNORE_LEVEL_PANEL;
1972 if (handle_crawler (logitem->agent) == 0)
1973 return IGNORE_LEVEL_PANEL;
1974 if (ignore_referer (logitem->ref))
1975 return IGNORE_LEVEL_PANEL;
1976 if (ignore_status_code (logitem->status))
1977 return IGNORE_LEVEL_PANEL;
1978 if (ignore_static (logitem->req))
1979 return conf.ignore_statics; // IGNORE_LEVEL_PANEL or IGNORE_LEVEL_REQ
1980
1981 /* check if we need to remove the request's query string */
1982 if (conf.ignore_qstr)
1983 strip_qstring (logitem->req);
1984
1985 return 0;
1986 }
1987
/* A wrapper function to insert a data keymap string key.
 *
 * Forwards the module, kdata->numdate, the string key kdata->data_key and
 * the address of kdata->cdnkey to ht_insert_keymap() and returns its
 * result as is.
 *
 * If the given key exists, its value is returned.
 * On error, -1 is returned.
 * On success the value of the key inserted is returned */
static int
insert_dkeymap (GModule module, GKeyData * kdata) {
  return ht_insert_keymap (module, kdata->numdate, kdata->data_key, &kdata->cdnkey);
}
1997
/* A wrapper function to insert a root keymap string key.
 *
 * Forwards the module, kdata->numdate, the string key kdata->root_key and
 * the address of kdata->crnkey to ht_insert_keymap() and returns its
 * result as is.
 *
 * If the given key exists, its value is returned.
 * On error, -1 is returned.
 * On success the value of the key inserted is returned */
static int
insert_rkeymap (GModule module, GKeyData * kdata) {
  return ht_insert_keymap (module, kdata->numdate, kdata->root_key, &kdata->crnkey);
}
2007
/* A wrapper function to insert a datamap uint32_t key and string value.
 *
 * Forwards kdata->data_nkey (key), kdata->data (value) and kdata->cdnkey
 * for the given module and kdata->numdate to ht_insert_datamap(). */
static void
insert_data (GModule module, GKeyData * kdata) {
  ht_insert_datamap (module, kdata->numdate, kdata->data_nkey, kdata->data, kdata->cdnkey);
}
2013
/* A wrapper function to insert a uniqmap string key.
 *
 * Forwards the module, kdata->numdate, kdata->data_nkey and the given
 * uniq_nkey to ht_insert_uniqmap() and returns its result as is.
 *
 * If the given key exists, 0 is returned.
 * On error, -1 is returned.
 * On success the value of the key inserted is returned */
static int
insert_uniqmap (GModule module, GKeyData * kdata, uint32_t uniq_nkey) {
  return ht_insert_uniqmap (module, kdata->numdate, kdata->data_nkey, uniq_nkey);
}
2023
/* A wrapper function to insert a rootmap uint32_t key from the keymap
 * store mapped to its string value.
 *
 * Forwards kdata->root_nkey (key), kdata->root (value) and kdata->crnkey
 * for the given module and kdata->numdate to ht_insert_rootmap(). */
static void
insert_rootmap (GModule module, GKeyData * kdata) {
  ht_insert_rootmap (module, kdata->numdate, kdata->root_nkey, kdata->root, kdata->crnkey);
}
2030
/* A wrapper function to insert a data uint32_t key mapped to the
 * corresponding uint32_t root key.
 *
 * Forwards kdata->data_nkey, kdata->root_nkey, kdata->cdnkey and
 * kdata->crnkey for the given module and kdata->numdate to
 * ht_insert_root(). */
static void
insert_root (GModule module, GKeyData * kdata) {
  ht_insert_root (module, kdata->numdate, kdata->data_nkey, kdata->root_nkey, kdata->cdnkey,
                  kdata->crnkey);
}
2038
/* A wrapper function to increase hits counter from an uint32_t key.
 *
 * Adds 1 for kdata->data_nkey through ht_insert_hits() and 1 to the
 * module's "hits" meta data entry for kdata->numdate. */
static void
insert_hit (GModule module, GKeyData * kdata) {
  ht_insert_hits (module, kdata->numdate, kdata->data_nkey, 1, kdata->cdnkey);
  ht_insert_meta_data (module, kdata->numdate, "hits", 1);
}
2045
/* A wrapper function to increase visitors counter from an uint32_t
 * key.
 *
 * Adds 1 for kdata->data_nkey through ht_insert_visitor() and 1 to the
 * module's "visitors" meta data entry for kdata->numdate. */
static void
insert_visitor (GModule module, GKeyData * kdata) {
  ht_insert_visitor (module, kdata->numdate, kdata->data_nkey, 1, kdata->cdnkey);
  ht_insert_meta_data (module, kdata->numdate, "visitors", 1);
}
2053
/* A wrapper function to increase the bandwidth counter from an uint32_t
 * key.
 *
 * Adds size for kdata->data_nkey through ht_insert_bw() and size to the
 * module's "bytes" meta data entry for kdata->numdate. */
static void
insert_bw (GModule module, GKeyData * kdata, uint64_t size) {
  ht_insert_bw (module, kdata->numdate, kdata->data_nkey, size, kdata->cdnkey);
  ht_insert_meta_data (module, kdata->numdate, "bytes", size);
}
2061
/* A wrapper call to increase the cumulative time served counter
 * from an uint32_t key.
 *
 * Passes ts for kdata->data_nkey to ht_insert_cumts() and to the
 * module's "cumts" meta data entry for kdata->numdate. */
static void
insert_cumts (GModule module, GKeyData * kdata, uint64_t ts) {
  ht_insert_cumts (module, kdata->numdate, kdata->data_nkey, ts, kdata->cdnkey);
  ht_insert_meta_data (module, kdata->numdate, "cumts", ts);
}
2069
/* A wrapper call to insert the maximum time served counter from
 * an uint32_t key.
 *
 * Passes ts for kdata->data_nkey to ht_insert_maxts() and to the
 * module's "maxts" meta data entry for kdata->numdate. */
static void
insert_maxts (GModule module, GKeyData * kdata, uint64_t ts) {
  ht_insert_maxts (module, kdata->numdate, kdata->data_nkey, ts, kdata->cdnkey);
  ht_insert_meta_data (module, kdata->numdate, "maxts", ts);
}
2077
/* A wrapper call to insert a method given an uint32_t key and string
 * value.
 *
 * A NULL method is stored as "---". */
static void
insert_method (GModule module, GKeyData * kdata, const char *data) {
  ht_insert_method (module, kdata->numdate, kdata->data_nkey, data ? data : "---",
                    kdata->cdnkey);
}
2083
/* A wrapper call to insert a protocol given an uint32_t key and string
 * value.
 *
 * A NULL protocol is stored as "---". */
static void
insert_protocol (GModule module, GKeyData * kdata, const char *data) {
  ht_insert_protocol (module, kdata->numdate, kdata->data_nkey, data ? data : "---",
                      kdata->cdnkey);
}
2091
/* A wrapper call to insert an agent for a hostname given an uint32_t
 * key and uint32_t value.
 *
 * Forwards kdata->data_nkey (key) and agent_nkey (value) for the given
 * module and kdata->numdate to ht_insert_agent(). */
static void
insert_agent (GModule module, GKeyData * kdata, uint32_t agent_nkey) {
  ht_insert_agent (module, kdata->numdate, kdata->data_nkey, agent_nkey);
}
2098
2099 /* The following generates a unique key to identity unique visitors.
2100 * The key is made out of the IP, date, and user agent.
2101 * Note that for readability, doing a simple snprintf/sprintf should
2102 * suffice, however, memcpy is the fastest solution
2103 *
2104 * On success the new unique visitor key is returned */
2105 static char *
get_uniq_visitor_key(GLogItem * logitem)2106 get_uniq_visitor_key (GLogItem * logitem) {
2107 char *ua = NULL, *key = NULL;
2108 size_t s1, s2, s3;
2109
2110 ua = deblank (xstrdup (logitem->agent));
2111
2112 s1 = strlen (logitem->date);
2113 s2 = strlen (logitem->host);
2114 s3 = strlen (ua);
2115
2116 /* includes terminating null */
2117 key = xcalloc (s1 + s2 + s3 + 3, sizeof (char));
2118
2119 memcpy (key, logitem->date, s1);
2120
2121 key[s1] = '|';
2122 memcpy (key + s1 + 1, logitem->host, s2 + 1);
2123
2124 key[s1 + s2 + 1] = '|';
2125 memcpy (key + s1 + s2 + 2, ua, s3 + 1);
2126
2127 free (ua);
2128 return key;
2129 }
2130
2131 /* The following generates a unique key to identity unique requests.
2132 * The key is made out of the actual request, and if available, the
2133 * method and the protocol. Note that for readability, doing a simple
2134 * snprintf/sprintf should suffice, however, memcpy is the fastest
2135 * solution
2136 *
2137 * On success the new unique request key is returned */
2138 static char *
gen_unique_req_key(GLogItem * logitem)2139 gen_unique_req_key (GLogItem * logitem) {
2140 char *key = NULL;
2141 size_t s1 = 0, s2 = 0, s3 = 0, nul = 1, sep = 0;
2142
2143 /* nothing to do */
2144 if (!conf.append_method && !conf.append_protocol)
2145 return xstrdup (logitem->req);
2146 /* still nothing to do */
2147 if (!logitem->method && !logitem->protocol)
2148 return xstrdup (logitem->req);
2149
2150 s1 = strlen (logitem->req);
2151 if (logitem->method && conf.append_method) {
2152 s2 = strlen (logitem->method);
2153 nul++;
2154 }
2155 if (logitem->protocol && conf.append_protocol) {
2156 s3 = strlen (logitem->protocol);
2157 nul++;
2158 }
2159
2160 /* includes terminating null */
2161 key = xcalloc (s1 + s2 + s3 + nul, sizeof (char));
2162 /* append request */
2163 memcpy (key, logitem->req, s1);
2164
2165 if (logitem->method && conf.append_method) {
2166 key[s1] = '|';
2167 sep++;
2168 memcpy (key + s1 + sep, logitem->method, s2 + 1);
2169 }
2170 if (logitem->protocol && conf.append_protocol) {
2171 key[s1 + s2 + sep] = '|';
2172 sep++;
2173 memcpy (key + s1 + s2 + sep, logitem->protocol, s3 + 1);
2174 }
2175
2176 return key;
2177 }
2178
/* Concatenate the query string to the request.
 *
 * The buffer pointed to by *req is freed and replaced by a newly
 * allocated "<req>?<qstr>" string. The '?' is only inserted when qstr
 * does not already begin with one. */
static void
append_query_string (char **req, const char *qstr) {
  const size_t rlen = strlen (*req);
  const size_t qlen = strlen (qstr);
  /* 1 when a '?' has to be inserted between URL and query string */
  const size_t need_qm = (*qstr == '?') ? 0 : 1;
  char *joined = xmalloc (rlen + qlen + need_qm + 1);
  char *tail = joined + rlen;

  memcpy (joined, *req, rlen);
  if (need_qm)
    *tail++ = '?';
  /* the +1 also copies the terminating NUL */
  memcpy (tail, qstr, qlen + 1);

  free (*req);
  *req = joined;
}
2202
2203 /* A wrapper to assign the given data key and the data item to the key
2204 * data structure */
2205 static void
get_kdata(GKeyData * kdata,char * data_key,char * data)2206 get_kdata (GKeyData * kdata, char *data_key, char *data) {
2207 /* inserted in keymap */
2208 kdata->data_key = data_key;
2209 /* inserted in datamap */
2210 kdata->data = data;
2211 }
2212
2213 /* Generate a visitor's key given the date specificity. For instance,
2214 * if the specificity if set to hours, then a generated key would
2215 * look like: 03/Jan/2016:09 */
2216 static void
set_spec_visitor_key(char ** fdate,const char * ftime)2217 set_spec_visitor_key (char **fdate, const char *ftime) {
2218 size_t dlen = 0, tlen = 0;
2219 char *key = NULL, *tkey = NULL, *pch = NULL;
2220
2221 tkey = xstrdup (ftime);
2222 if (conf.date_spec_hr && (pch = strchr (tkey, ':')) && (pch - tkey) > 0)
2223 *pch = '\0';
2224
2225 dlen = strlen (*fdate);
2226 tlen = strlen (tkey);
2227
2228 key = xmalloc (dlen + tlen + 1);
2229 memcpy (key, *fdate, dlen);
2230 memcpy (key + dlen, tkey, tlen + 1);
2231
2232 free (*fdate);
2233 free (tkey);
2234 *fdate = key;
2235 }
2236
2237 /* Generate a unique key for the visitors panel from the given logitem
2238 * structure and assign it to the output key data structure.
2239 *
2240 * On error, or if no date is found, 1 is returned.
2241 * On success, the date key is assigned to our key data structure.
2242 */
2243 static int
gen_visitor_key(GKeyData * kdata,GLogItem * logitem)2244 gen_visitor_key (GKeyData * kdata, GLogItem * logitem) {
2245 if (!logitem->date || !logitem->time)
2246 return 1;
2247
2248 /* Append time specificity to date */
2249 if (conf.date_spec_hr)
2250 set_spec_visitor_key (&logitem->date, logitem->time);
2251
2252 get_kdata (kdata, logitem->date, logitem->date);
2253 kdata->numdate = logitem->numdate;
2254
2255 return 0;
2256 }
2257
2258 /* Generate a unique key for the requests panel from the given logitem
2259 * structure and assign it to out key data structure.
2260 *
2261 * On success, the generated request key is assigned to our key data
2262 * structure.
2263 */
2264 static int
gen_req_key(GKeyData * kdata,GLogItem * logitem)2265 gen_req_key (GKeyData * kdata, GLogItem * logitem) {
2266 if (!logitem->req)
2267 return 1;
2268
2269 if (logitem->qstr)
2270 append_query_string (&logitem->req, logitem->qstr);
2271 logitem->req_key = gen_unique_req_key (logitem);
2272
2273 get_kdata (kdata, logitem->req_key, logitem->req);
2274 kdata->numdate = logitem->numdate;
2275
2276 return 0;
2277 }
2278
2279 /* A wrapper to generate a unique key for the request panel.
2280 *
2281 * On error, or if the request is static or a 404, 1 is returned.
2282 * On success, the generated request key is assigned to our key data
2283 * structure.
2284 */
2285 static int
gen_request_key(GKeyData * kdata,GLogItem * logitem)2286 gen_request_key (GKeyData * kdata, GLogItem * logitem) {
2287 if (!logitem->req || logitem->is_404 || logitem->is_static)
2288 return 1;
2289
2290 return gen_req_key (kdata, logitem);
2291 }
2292
2293 /* A wrapper to generate a unique key for the request panel.
2294 *
2295 * On error, or if the request is not a 404, 1 is returned.
2296 * On success, the generated request key is assigned to our key data
2297 * structure. */
2298 static int
gen_404_key(GKeyData * kdata,GLogItem * logitem)2299 gen_404_key (GKeyData * kdata, GLogItem * logitem) {
2300 if (logitem->req && logitem->is_404)
2301 return gen_req_key (kdata, logitem);
2302 return 1;
2303 }
2304
2305 /* A wrapper to generate a unique key for the request panel.
2306 *
2307 * On error, or if the request is not a static request, 1 is returned.
2308 * On success, the generated request key is assigned to our key data
2309 * structure. */
2310 static int
gen_static_request_key(GKeyData * kdata,GLogItem * logitem)2311 gen_static_request_key (GKeyData * kdata, GLogItem * logitem) {
2312 if (logitem->req && logitem->is_static)
2313 return gen_req_key (kdata, logitem);
2314 return 1;
2315 }
2316
2317 /* A wrapper to generate a unique key for the virtual host panel.
2318 *
2319 * On error, 1 is returned.
2320 * On success, the generated vhost key is assigned to our key data
2321 * structure. */
2322 static int
gen_vhost_key(GKeyData * kdata,GLogItem * logitem)2323 gen_vhost_key (GKeyData * kdata, GLogItem * logitem) {
2324 if (!logitem->vhost)
2325 return 1;
2326
2327 get_kdata (kdata, logitem->vhost, logitem->vhost);
2328 kdata->numdate = logitem->numdate;
2329
2330 return 0;
2331 }
2332
2333 /* A wrapper to generate a unique key for the virtual host panel.
2334 *
2335 * On error, 1 is returned.
2336 * On success, the generated userid key is assigned to our key data
2337 * structure. */
2338 static int
gen_remote_user_key(GKeyData * kdata,GLogItem * logitem)2339 gen_remote_user_key (GKeyData * kdata, GLogItem * logitem) {
2340 if (!logitem->userid)
2341 return 1;
2342
2343 get_kdata (kdata, logitem->userid, logitem->userid);
2344 kdata->numdate = logitem->numdate;
2345
2346 return 0;
2347 }
2348
2349 /* A wrapper to generate a unique key for the cache status panel.
2350 *
2351 * On error, 1 is returned.
2352 * On success, the generated cache status key is assigned to our key data
2353 * structure. */
2354 static int
gen_cache_status_key(GKeyData * kdata,GLogItem * logitem)2355 gen_cache_status_key (GKeyData * kdata, GLogItem * logitem) {
2356 if (!logitem->cache_status)
2357 return 1;
2358
2359 get_kdata (kdata, logitem->cache_status, logitem->cache_status);
2360 kdata->numdate = logitem->numdate;
2361
2362 return 0;
2363 }
2364
2365 /* A wrapper to generate a unique key for the hosts panel.
2366 *
2367 * On error, 1 is returned.
2368 * On success, the generated host key is assigned to our key data
2369 * structure. */
2370 static int
gen_host_key(GKeyData * kdata,GLogItem * logitem)2371 gen_host_key (GKeyData * kdata, GLogItem * logitem) {
2372 if (!logitem->host)
2373 return 1;
2374
2375 get_kdata (kdata, logitem->host, logitem->host);
2376 kdata->numdate = logitem->numdate;
2377
2378 return 0;
2379 }
2380
2381 /* Generate a browser unique key for the browser's panel given a user
2382 * agent and assign the browser type/category as a root element.
2383 *
2384 * On error, 1 is returned.
2385 * On success, the generated browser key is assigned to our key data
2386 * structure. */
2387 static int
gen_browser_key(GKeyData * kdata,GLogItem * logitem)2388 gen_browser_key (GKeyData * kdata, GLogItem * logitem) {
2389 char *agent = NULL;
2390 char browser_type[BROWSER_TYPE_LEN] = "";
2391
2392 if (logitem->agent == NULL || *logitem->agent == '\0')
2393 return 1;
2394
2395 agent = xstrdup (logitem->agent);
2396 logitem->browser = verify_browser (agent, browser_type);
2397 logitem->browser_type = xstrdup (browser_type);
2398
2399 /* e.g., Firefox 11.12 */
2400 kdata->data = logitem->browser;
2401 kdata->data_key = logitem->browser;
2402
2403 /* Firefox */
2404 kdata->root = logitem->browser_type;
2405 kdata->root_key = logitem->browser_type;
2406 kdata->numdate = logitem->numdate;
2407
2408 free (agent);
2409
2410 return 0;
2411 }
2412
2413 /* Generate an operating system unique key for the OS' panel given a
2414 * user agent and assign the OS type/category as a root element.
2415 *
2416 * On error, 1 is returned.
2417 * On success, the generated OS key is assigned to our key data
2418 * structure. */
2419 static int
gen_os_key(GKeyData * kdata,GLogItem * logitem)2420 gen_os_key (GKeyData * kdata, GLogItem * logitem) {
2421 char *agent = NULL;
2422 char os_type[OPESYS_TYPE_LEN] = "";
2423
2424 if (logitem->agent == NULL || *logitem->agent == '\0')
2425 return 1;
2426
2427 agent = xstrdup (logitem->agent);
2428 logitem->os = verify_os (agent, os_type);
2429 logitem->os_type = xstrdup (os_type);
2430
2431 /* e.g., Linux,Ubuntu 10.12 */
2432 kdata->data = logitem->os;
2433 kdata->data_key = logitem->os;
2434
2435 /* Linux */
2436 kdata->root = logitem->os_type;
2437 kdata->root_key = logitem->os_type;
2438 kdata->numdate = logitem->numdate;
2439
2440 free (agent);
2441
2442 return 0;
2443 }
2444
/* Determine if the given token starts with a valid MIME major type.
 *
 * The comparison is a case-sensitive prefix match, so "text/html" matches
 * "text". Candidates are tried in the order listed below.
 *
 * If not valid, NULL is returned.
 * If valid, the matching constant string is returned. */
static const char *
extract_mimemajor (const char *token) {
  /* official IANA registries as per https://www.iana.org/assignments/media-types/ */
  static const char *const majors[] = {
    "application",
    "audio",
    "font",
    "example",                  /* unlikely */
    "image",
    "message",                  /* unlikely */
    "model",
    "multipart",
    "text",
    "video",
  };
  size_t i;

  for (i = 0; i < sizeof (majors) / sizeof (majors[0]); i++) {
    if (strncmp (token, majors[i], strlen (majors[i])) == 0)
      return majors[i];
  }

  return NULL;
}
2471
2472 /* UMS: generate an Mime-Type unique key
2473 *
2474 * On error, 1 is returned.
2475 * On success, the generated key is assigned to our key data structure.
2476 */
2477 static int
gen_mime_type_key(GKeyData * kdata,GLogItem * logitem)2478 gen_mime_type_key (GKeyData * kdata, GLogItem * logitem) {
2479 const char *major = NULL;
2480
2481 if (!logitem->mime_type)
2482 return 1;
2483
2484 /* redirects and the like only register as "-", ignore those */
2485 major = extract_mimemajor (logitem->mime_type);
2486 if (!major)
2487 return 1;
2488
2489 kdata->data = logitem->mime_type;
2490 kdata->data_key = logitem->mime_type;
2491 kdata->numdate = logitem->numdate;
2492
2493 kdata->root = major;
2494 kdata->root_key = major;
2495
2496 return 0;
2497 }
2498
/* Determine if the given token starts with one of the usual TLS/SSL
 * protocol strings.
 *
 * The comparison is a case-sensitive prefix match and candidates are
 * tried in the order listed below.
 *
 * If not valid, NULL is returned.
 * If valid, the matching constant string is returned. */
static const char *
extract_tlsmajor (const char *token) {
  static const char *const protocols[] = {
    "SSLv3",
    "TLSv1.1",
    "TLSv1.2",
    "TLSv1.3",
    /* Nope, it's not 1.0; the bare prefix goes last so that the dotted
     * versions above get a chance to match first */
    "TLSv1",
  };
  size_t i;

  for (i = 0; i < sizeof (protocols) / sizeof (protocols[0]); i++) {
    if (strncmp (token, protocols[i], strlen (protocols[i])) == 0)
      return protocols[i];
  }

  return NULL;
}
2516
2517 /* UMS: generate a TLS settings unique key
2518 *
2519 * On error, 1 is returned.
2520 * On success, the generated key is assigned to our key data structure.
2521 */
2522 static int
gen_tls_type_key(GKeyData * kdata,GLogItem * logitem)2523 gen_tls_type_key (GKeyData * kdata, GLogItem * logitem) {
2524 const char *tls;
2525 size_t tlen = 0, clen = 0;
2526
2527 if (!logitem->tls_type)
2528 return 1;
2529
2530 /* '-' means no TLS at all, just ignore for the panel? */
2531 tls = extract_tlsmajor (logitem->tls_type);
2532
2533 if (!tls)
2534 return 1;
2535
2536 kdata->numdate = logitem->numdate;
2537 if (!logitem->tls_cypher) {
2538 kdata->data_key = kdata->data = kdata->root = kdata->root_key = tls;
2539 return 0;
2540 }
2541
2542 clen = strlen (logitem->tls_cypher);
2543 tlen = strlen (tls);
2544
2545 logitem->tls_type_cypher = xmalloc (tlen + clen + 2);
2546 memcpy (logitem->tls_type_cypher, tls, tlen);
2547 logitem->tls_type_cypher[tlen] = '/';
2548 /* includes terminating null */
2549 memcpy (logitem->tls_type_cypher + tlen + 1, logitem->tls_cypher, clen + 1);
2550
2551 kdata->data = logitem->tls_type_cypher;
2552 kdata->data_key = logitem->tls_type_cypher;
2553
2554 kdata->root = tls;
2555 kdata->root_key = tls;
2556
2557 return 0;
2558 }
2559
2560
2561 /* A wrapper to generate a unique key for the referrers panel.
2562 *
2563 * On error, 1 is returned.
2564 * On success, the generated referrer key is assigned to our key data
2565 * structure. */
2566 static int
gen_referer_key(GKeyData * kdata,GLogItem * logitem)2567 gen_referer_key (GKeyData * kdata, GLogItem * logitem) {
2568 if (!logitem->ref)
2569 return 1;
2570
2571 get_kdata (kdata, logitem->ref, logitem->ref);
2572 kdata->numdate = logitem->numdate;
2573
2574 return 0;
2575 }
2576
2577 /* A wrapper to generate a unique key for the referring sites panel.
2578 *
2579 * On error, 1 is returned.
2580 * On success, the generated referring site key is assigned to our key data
2581 * structure. */
2582 static int
gen_ref_site_key(GKeyData * kdata,GLogItem * logitem)2583 gen_ref_site_key (GKeyData * kdata, GLogItem * logitem) {
2584 if (logitem->site[0] == '\0')
2585 return 1;
2586
2587 get_kdata (kdata, logitem->site, logitem->site);
2588 kdata->numdate = logitem->numdate;
2589
2590 return 0;
2591 }
2592
2593 /* A wrapper to generate a unique key for the keyphrases panel.
2594 *
2595 * On error, 1 is returned.
2596 * On success, the generated keyphrase key is assigned to our key data
2597 * structure. */
2598 static int
gen_keyphrase_key(GKeyData * kdata,GLogItem * logitem)2599 gen_keyphrase_key (GKeyData * kdata, GLogItem * logitem) {
2600 if (!logitem->keyphrase)
2601 return 1;
2602
2603 get_kdata (kdata, logitem->keyphrase, logitem->keyphrase);
2604 kdata->numdate = logitem->numdate;
2605
2606 return 0;
2607 }
2608
/* Generate the key for the geolocation panel, using the continent as the
 * root element and the country as the data.
 *
 * Non-empty country/continent strings produced by extract_geolocation()
 * are duplicated into logitem->country and logitem->continent.
 *
 * If extract_geolocation() returns 1, 1 is returned.
 * On success, the generated geolocation key is assigned to our key data
 * structure and 0 is returned. */
#ifdef HAVE_GEOLOCATION
static int
gen_geolocation_key (GKeyData * kdata, GLogItem * logitem) {
  char country[COUNTRY_LEN] = "";
  char continent[CONTINENT_LEN] = "";

  if (extract_geolocation (logitem, continent, country) == 1)
    return 1;

  if (*country != '\0')
    logitem->country = xstrdup (country);
  if (*continent != '\0')
    logitem->continent = xstrdup (continent);

  kdata->root = logitem->continent;
  kdata->root_key = logitem->continent;

  kdata->data = logitem->country;
  kdata->data_key = logitem->country;

  kdata->numdate = logitem->numdate;

  return 0;
}
#endif
2639
2640 /* A wrapper to generate a unique key for the status code panel.
2641 *
2642 * On error, 1 is returned.
2643 * On success, the generated status code key is assigned to our key
2644 * data structure. */
2645 static int
gen_status_code_key(GKeyData * kdata,GLogItem * logitem)2646 gen_status_code_key (GKeyData * kdata, GLogItem * logitem) {
2647 const char *status = NULL, *type = NULL;
2648
2649 if (!logitem->status)
2650 return 1;
2651
2652 type = verify_status_code_type (logitem->status);
2653 status = verify_status_code (logitem->status);
2654
2655 kdata->data = (char *) status;
2656 kdata->data_key = (char *) status;
2657
2658 kdata->root = (char *) type;
2659 kdata->root_key = (char *) type;
2660 kdata->numdate = logitem->numdate;
2661
2662 return 0;
2663 }
2664
2665 /* Given a time string containing at least %H:%M, extract either the
2666 * tenth of a minute or an hour.
2667 *
2668 * On error, the given string is not modified.
2669 * On success, the conf specificity is extracted. */
2670 static void
parse_time_specificity_string(char * hmark,char * ftime)2671 parse_time_specificity_string (char *hmark, char *ftime) {
2672 /* tenth of a minute specificity - e.g., 18:2 */
2673 if (conf.hour_spec_min && hmark[1] != '\0') {
2674 hmark[2] = '\0';
2675 return;
2676 }
2677
2678 /* hour specificity (default) */
2679 if ((hmark - ftime) > 0)
2680 *hmark = '\0';
2681 }
2682
2683 /* A wrapper to generate a unique key for the time distribution panel.
2684 *
2685 * On error, 1 is returned.
2686 * On success, the generated time key is assigned to our key data
2687 * structure. */
2688 static int
gen_visit_time_key(GKeyData * kdata,GLogItem * logitem)2689 gen_visit_time_key (GKeyData * kdata, GLogItem * logitem) {
2690 char *hmark = NULL;
2691 char hour[HRMI_LEN] = ""; /* %H:%M */
2692 if (!logitem->time)
2693 return 1;
2694
2695 /* if not a timestamp, then it must be a string containing the hour.
2696 * this is faster than actual date conversion */
2697 if (!has_timestamp (conf.time_format) && (hmark = strchr (logitem->time, ':'))) {
2698 parse_time_specificity_string (hmark, logitem->time);
2699
2700 kdata->numdate = logitem->numdate;
2701 get_kdata (kdata, logitem->time, logitem->time);
2702 return 0;
2703 }
2704
2705 /* otherwise it attempts to convert the date given a time format,
2706 * though this is slower */
2707 memset (hour, 0, sizeof *hour);
2708 if (convert_date (hour, logitem->time, "%T", "%H:%M", HRMI_LEN) != 0)
2709 return 1;
2710
2711 if (*hour == '\0')
2712 return 1;
2713
2714 if ((hmark = strchr (hour, ':')))
2715 parse_time_specificity_string (hmark, hour);
2716
2717 free (logitem->time);
2718 logitem->time = xstrdup (hour);
2719
2720 get_kdata (kdata, logitem->time, logitem->time);
2721 kdata->numdate = logitem->numdate;
2722
2723 return 0;
2724 }
2725
2726 /* Determine if 404s need to be added to the unique visitors count.
2727 *
2728 * If it needs to be added, 0 is returned else 1 is returned. */
2729 static int
include_uniq(GLogItem * logitem)2730 include_uniq (GLogItem * logitem) {
2731 int u = conf.client_err_to_unique_count;
2732
2733 if (!logitem->status || logitem->status[0] != '4' || (u && logitem->status[0] == '4'))
2734 return 1;
2735 return 0;
2736 }
2737
2738 /* Determine which data metrics need to be set and set them. */
2739 static void
set_datamap(GLogItem * logitem,GKeyData * kdata,const GParse * parse)2740 set_datamap (GLogItem * logitem, GKeyData * kdata, const GParse * parse) {
2741 GModule module;
2742 module = parse->module;
2743
2744 /* insert data */
2745 parse->datamap (module, kdata);
2746
2747 /* insert rootmap and root-data map */
2748 if (parse->rootmap && kdata->root) {
2749 parse->rootmap (module, kdata);
2750 insert_root (module, kdata);
2751 }
2752 /* insert hits */
2753 if (parse->hits)
2754 parse->hits (module, kdata);
2755 /* insert visitors */
2756 if (parse->visitor && kdata->uniq_nkey == 1)
2757 parse->visitor (module, kdata);
2758 /* insert bandwidth */
2759 if (parse->bw)
2760 parse->bw (module, kdata, logitem->resp_size);
2761 /* insert averages time served */
2762 if (parse->cumts)
2763 parse->cumts (module, kdata, logitem->serve_time);
2764 /* insert averages time served */
2765 if (parse->maxts)
2766 parse->maxts (module, kdata, logitem->serve_time);
2767 /* insert method */
2768 if (parse->method && conf.append_method)
2769 parse->method (module, kdata, logitem->method);
2770 /* insert protocol */
2771 if (parse->protocol && conf.append_protocol)
2772 parse->protocol (module, kdata, logitem->protocol);
2773 /* insert agent */
2774 if (parse->agent && conf.list_agents)
2775 parse->agent (module, kdata, logitem->agent_nkey);
2776 }
2777
2778 /* Set data mapping and metrics. */
2779 static void
map_log(GLogItem * logitem,const GParse * parse,GModule module)2780 map_log (GLogItem * logitem, const GParse * parse, GModule module) {
2781 GKeyData kdata;
2782
2783 new_modulekey (&kdata);
2784 /* set key data into out structure */
2785 if (parse->key_data (&kdata, logitem) == 1)
2786 return;
2787
2788 /* each module requires a data key/value */
2789 if (parse->datamap && kdata.data_key)
2790 kdata.data_nkey = insert_dkeymap (module, &kdata);
2791
2792 /* each module contains a uniq visitor key/value */
2793 if (parse->visitor && logitem->uniq_key && include_uniq (logitem))
2794 kdata.uniq_nkey = insert_uniqmap (module, &kdata, logitem->uniq_nkey);
2795
2796 /* root keys are optional */
2797 if (parse->rootmap && kdata.root_key)
2798 kdata.root_nkey = insert_rkeymap (module, &kdata);
2799
2800 /* each module requires a root key/value */
2801 if (parse->datamap && kdata.data_key)
2802 set_datamap (logitem, &kdata, parse);
2803 }
2804
2805 static void
ins_agent_key_val(GLogItem * logitem,uint32_t numdate)2806 ins_agent_key_val (GLogItem * logitem, uint32_t numdate) {
2807 logitem->agent_nkey = ht_insert_agent_key (numdate, logitem->agent);
2808 /* insert UA key and get a numeric value */
2809 if (logitem->agent_nkey != 0) {
2810 /* insert a numeric key and map it to a UA string */
2811 ht_insert_agent_value (numdate, logitem->agent_nkey, logitem->agent);
2812 }
2813 }
2814
2815 static int
clean_old_data_by_date(uint32_t numdate)2816 clean_old_data_by_date (uint32_t numdate) {
2817 uint32_t *dates = NULL;
2818 uint32_t idx, len = 0;
2819
2820 if (ht_get_size_dates () < conf.keep_last)
2821 return 1;
2822
2823 dates = get_sorted_dates (&len);
2824
2825 /* If currently parsed date is in the set of dates, keep inserting it.
2826 * We count down since more likely the currently parsed date is at the last pos */
2827 for (idx = len; idx-- > 0;) {
2828 if (dates[idx] == numdate) {
2829 free (dates);
2830 return 1;
2831 }
2832 }
2833
2834 /* ignore older dates */
2835 if (dates[0] > numdate) {
2836 free (dates);
2837 return -1;
2838 }
2839
2840 /* invalidate the first date we inserted then */
2841 invalidate_date (dates[0]);
2842 /* rebuild all existing dates and let new data
2843 * be added upon existing cache */
2844 rebuild_rawdata_cache ();
2845 free (dates);
2846
2847 return 0;
2848 }
2849
2850 /* Process a log line and set the data into the corresponding data
2851 * structure. */
2852 static void
process_log(GLogItem * logitem)2853 process_log (GLogItem * logitem) {
2854 GModule module;
2855 const GParse *parse = NULL;
2856 size_t idx = 0;
2857 uint32_t numdate = logitem->numdate;
2858
2859 if (conf.keep_last > 0 && clean_old_data_by_date (numdate) == -1)
2860 return;
2861
2862 /* insert date and start partitioning tables */
2863 if (ht_insert_date (numdate) == -1)
2864 return;
2865
2866 /* Insert one unique visitor key per request to avoid the
2867 * overhead of storing one key per module */
2868 if ((logitem->uniq_nkey = ht_insert_unique_key (numdate, logitem->uniq_key)) == 0)
2869 return;
2870
2871 /* If we need to store user agents per IP, then we store them and retrieve
2872 * its numeric key.
2873 * It maintains two maps, one for key -> value, and another
2874 * map for value -> key*/
2875 if (conf.list_agents)
2876 ins_agent_key_val (logitem, numdate);
2877
2878 FOREACH_MODULE (idx, module_list) {
2879 module = module_list[idx];
2880 if (!(parse = panel_lookup (module)))
2881 continue;
2882 map_log (logitem, parse, module);
2883 }
2884
2885 count_bw (numdate, logitem->resp_size);
2886 /* don't ignore line but neither count as valid */
2887 if (logitem->ignorelevel != IGNORE_LEVEL_REQ)
2888 count_valid (numdate);
2889 }
2890
2891 /* Determine if the current log has the content from the last time it was
2892 * parsed. It does this by comparing READ_BYTES against the beginning of the
2893 * log.
2894 *
2895 * Returns 1 if the content is likely the same or no data to compare
2896 * Returns 0 if it has different content */
2897 static int
is_likely_same_log(GLog * glog,const GLastParse * lp)2898 is_likely_same_log (GLog * glog, const GLastParse * lp) {
2899 size_t size = 0;
2900
2901 if (!lp->size)
2902 return 1;
2903
2904 /* Must be a LOG */
2905 size = MIN (glog->snippetlen, lp->snippetlen);
2906 if (glog->snippet[0] != '\0' && lp->snippet[0] != '\0' &&
2907 memcmp (glog->snippet, lp->snippet, size) == 0)
2908 return 1;
2909
2910 return 0;
2911 }
2912
2913 /* Determine if we should insert new record or if it's a duplicate record from
2914 * a previoulsy persisted dataset
2915 *
2916 * Returns 1 if it thinks the record it's being restored from disk
2917 * Returns 0 if we need to parse the record */
2918 static int
should_restore_from_disk(GLog * glog)2919 should_restore_from_disk (GLog * glog) {
2920 GLastParse lp = { 0 };
2921
2922 if (!conf.restore)
2923 return 0;
2924
2925 lp = ht_get_last_parse (glog->inode);
2926
2927 /* No last parse timestamp, continue parsing as we got nothing to compare
2928 * against */
2929 if (!lp.ts)
2930 return 0;
2931
2932 /* If our current line is greater or equal (zero indexed) to the last parsed
2933 * line and have equal timestamps, then keep parsing then */
2934 if (glog->inode && is_likely_same_log (glog, &lp)) {
2935 if (glog->size > lp.size && glog->read >= lp.line)
2936 return 0;
2937 return 1;
2938 }
2939
2940 /* No inode (probably a pipe), prior or equal timestamps means restore from
2941 * disk (exclusive) */
2942 if (!glog->inode && lp.ts >= glog->lp.ts)
2943 return 1;
2944
2945 /* If not likely the same content, then fallback to the following checks */
2946 /* If timestamp is greater than last parsed, read the line then */
2947 if (glog->lp.ts > lp.ts)
2948 return 0;
2949
2950 /* Check if current log size is smaller than the one last parsed, if it is,
2951 * it was possibly truncated and thus it may be smaller, so fallback to
2952 * timestamp even if they are equal to the last parsed timestamp */
2953 else if (glog->size < lp.size && glog->lp.ts == lp.ts)
2954 return 0;
2955
2956 /* Everything else we ignore it. For instance, we if current log size is
2957 * greater than the one last parsed, if the timestamp are equal, we ignore the
2958 * request.
2959 *
2960 * **NOTE* We try to play safe here as we would rather miss a few lines
2961 * than double-count a few. */
2962 return 1;
2963 }
2964
2965 static void
process_invalid(GLog * glog,GLogItem * logitem,const char * line)2966 process_invalid (GLog * glog, GLogItem * logitem, const char *line) {
2967 GLastParse lp = { 0 };
2968
2969 /* if not restoring from disk, then count entry as proceeded and invalid */
2970 if (!conf.restore) {
2971 count_process_and_invalid (glog, line);
2972 return;
2973 }
2974
2975 lp = ht_get_last_parse (glog->inode);
2976
2977 /* If our current line is greater or equal (zero indexed) to the last parsed
2978 * line then keep parsing then */
2979 if (glog->inode && is_likely_same_log (glog, &lp)) {
2980 /* only count invalids if we're past the last parsed line */
2981 if (glog->size > lp.size && glog->read >= lp.line)
2982 count_process_and_invalid (glog, line);
2983 return;
2984 }
2985
2986 /* no timestamp to compare against, just count the invalid then */
2987 if (!logitem->numdate) {
2988 count_process_and_invalid (glog, line);
2989 return;
2990 }
2991
2992 /* if there's a valid timestamp, count only if greater than last parsed ts */
2993 if ((glog->lp.ts = mktime (&logitem->dt)) == -1)
2994 return;
2995
2996 /* check if we were able to at least parse the date/time, if no date/time
2997 * then we simply don't count the entry as proceed & invalid to attempt over
2998 * counting restored data */
2999 if (should_restore_from_disk (glog) == 0)
3000 count_process_and_invalid (glog, line);
3001 }
3002
3003 static int
parse_json_specifier(void * ptr_data,char * key,char * str)3004 parse_json_specifier (void *ptr_data, char *key, char *str) {
3005 GLogItem *logitem = (GLogItem *) ptr_data;
3006 char *spec = NULL;
3007 int ret = 0;
3008
3009 if (!(spec = ht_get_json_logfmt (key)) || 0 == strlen (str))
3010 return 0;
3011
3012 ret = parse_format (logitem, str, spec);
3013 free (spec);
3014
3015 return ret;
3016 }
3017
3018 static int
parse_json_format(GLogItem * logitem,char * str)3019 parse_json_format (GLogItem * logitem, char *str) {
3020 return parse_json_string (logitem, str, parse_json_specifier);
3021 }
3022
3023 /* Process a line from the log and store it accordingly taking into
3024 * account multiple parsing options prior to setting data into the
3025 * corresponding data structure.
3026 *
3027 * On success, 0 is returned */
3028 int
pre_process_log(GLog * glog,char * line,int dry_run)3029 pre_process_log (GLog * glog, char *line, int dry_run) {
3030 GLogItem *logitem;
3031 int ret = 0;
3032 char *fmt = conf.log_format;
3033
3034 /* soft ignore these lines */
3035 if (valid_line (line))
3036 return -1;
3037
3038 logitem = init_log_item (glog);
3039
3040 /* Parse a line of log, and fill structure with appropriate values */
3041 if (conf.is_json_log_format)
3042 ret = parse_json_format (logitem, line);
3043 else
3044 ret = parse_format (logitem, line, fmt);
3045
3046 if (ret || (ret = verify_missing_fields (logitem))) {
3047 process_invalid (glog, logitem, line);
3048 goto cleanup;
3049 }
3050
3051 if ((glog->lp.ts = mktime (&logitem->dt)) == -1)
3052 goto cleanup;
3053
3054 if (should_restore_from_disk (glog))
3055 goto cleanup;
3056
3057 count_process (glog);
3058
3059 /* agent will be null in cases where %u is not specified */
3060 if (logitem->agent == NULL)
3061 logitem->agent = alloc_string ("-");
3062
3063 /* testing log only */
3064 if (dry_run)
3065 goto cleanup;
3066
3067 logitem->ignorelevel = ignore_line (logitem);
3068 /* ignore line */
3069 if (logitem->ignorelevel == IGNORE_LEVEL_PANEL)
3070 goto cleanup;
3071
3072 if (is_404 (logitem))
3073 logitem->is_404 = 1;
3074 else if (is_static (logitem->req))
3075 logitem->is_static = 1;
3076
3077 logitem->uniq_key = get_uniq_visitor_key (logitem);
3078
3079 process_log (logitem);
3080
3081 cleanup:
3082 free_glog (logitem);
3083
3084 return ret;
3085 }
3086
3087 /* Entry point to process the given live from the log.
3088 *
3089 * On error, 1 is returned.
3090 * On success or soft ignores, 0 is returned. */
3091 static int
read_line(GLog * glog,char * line,int * test,int * cnt,int dry_run)3092 read_line (GLog * glog, char *line, int *test, int *cnt, int dry_run) {
3093 int ret = 0;
3094
3095 /* start processing log line */
3096 if ((ret = pre_process_log (glog, line, dry_run)) == 0 && *test)
3097 *test = 0;
3098
3099 /* soft ignores */
3100 if (ret == -1)
3101 return 0;
3102
3103 /* reached num of lines to test and no valid records were found, log
3104 * format is likely not matching */
3105 if (conf.num_tests && ++(*cnt) == (int) conf.num_tests && *test) {
3106 uncount_processed (glog);
3107 uncount_invalid (glog);
3108 return 1;
3109 }
3110
3111 return 0;
3112 }
3113
3114 /* A replacement for GNU getline() to dynamically expand fgets buffer.
3115 *
3116 * On error, NULL is returned.
3117 * On success, the malloc'd line is returned. */
3118 char *
fgetline(FILE * fp)3119 fgetline (FILE * fp) {
3120 char buf[LINE_BUFFER] = { 0 };
3121 char *line = NULL, *tmp = NULL;
3122 size_t linelen = 0, len = 0;
3123
3124 while (1) {
3125 if (!fgets (buf, sizeof (buf), fp)) {
3126 if (conf.process_and_exit && errno == EAGAIN) {
3127 nanosleep ((const struct timespec[]) { {0, 100000000L} }, NULL);
3128 continue;
3129 } else
3130 break;
3131 }
3132
3133 len = strlen (buf);
3134
3135 /* overflow check */
3136 if (SIZE_MAX - len - 1 < linelen)
3137 break;
3138
3139 if ((tmp = realloc (line, linelen + len + 1)) == NULL)
3140 break;
3141
3142 line = tmp;
3143 /* append */
3144 strcpy (line + linelen, buf);
3145 linelen += len;
3146
3147 if (feof (fp) || buf[len - 1] == '\n')
3148 return line;
3149 }
3150 free (line);
3151
3152 return NULL;
3153 }
3154
/* Iterate over the log and read it line by line, relying on fgetline() to
 * hand back each whole line in a malloc'd buffer.
 *
 * On error, 1 is returned.
 * On success, 0 is returned. */
#ifdef WITH_GETLINE
static int
read_lines (FILE * fp, GLog * glog, int dry_run) {
  char *buf = NULL;
  int rc = 0, tested = 0, testing = (conf.num_tests > 0);

  glog->bytes = 0;
  for (buf = fgetline (fp); buf != NULL; buf = fgetline (fp)) {
    /* handle SIGINT */
    if (conf.stop_processing)
      goto bail;

    rc = read_line (glog, buf, &testing, &tested, dry_run);
    if (rc)
      goto bail;

    /* a dry run only looks at the first NUM_TESTS lines */
    if (dry_run && tested == NUM_TESTS)
      goto bail;

    glog->bytes += strlen (buf);
    free (buf);
    glog->read++;
  }

  /* buf is NULL from here on, fgetline() had nothing else to return */

  /* if no data was available to read from (probably from a pipe) and
   * still in test mode, we simply return until data becomes available */
  if ((errno == EAGAIN || errno == EWOULDBLOCK) && testing)
    return 0;

  /* fails if read_line() failed, or if we ran out of input with the test
   * flag still set after having processed lines */
  return rc || (testing && glog->processed);

bail:
  free (buf);
  /* stopped in the middle of the log: fails if the test flag is still set or
   * if read_line() failed */
  return testing || rc;
}
#endif
3196
3197 /* Iterate over the log and read line by line (uses a buffer of fixed size).
3198 *
3199 * On error, 1 is returned.
3200 * On success, 0 is returned. */
3201 #ifndef WITH_GETLINE
3202 static int
read_lines(FILE * fp,GLog * glog,int dry_run)3203 read_lines (FILE * fp, GLog * glog, int dry_run) {
3204 char *s = NULL;
3205 char line[LINE_BUFFER] = { 0 };
3206 int ret = 0, cnt = 0, test = conf.num_tests > 0 ? 1 : 0;
3207
3208 glog->bytes = 0;
3209 while ((s = fgets (line, LINE_BUFFER, fp)) != NULL) {
3210 /* handle SIGINT */
3211 if (conf.stop_processing)
3212 break;
3213 if ((ret = read_line (glog, line, &test, &cnt, dry_run)))
3214 break;
3215 if (dry_run && NUM_TESTS == cnt)
3216 break;
3217 glog->bytes += strlen (line);
3218 glog->read++;
3219 }
3220
3221 /* if no data was available to read from (probably from a pipe) and
3222 * still in test mode, we simply return until data becomes available */
3223 if (!s && (errno == EAGAIN || errno == EWOULDBLOCK) && test)
3224 return 0;
3225
3226 /* fails if
3227 - we're still reading the log but the test flag was still set
3228 - ret flag is not 0, read_line failed
3229 - reached the end of file, test flag was still set and we processed lines */
3230 return (s && test) || ret || (!s && test && glog->processed);
3231 }
3232 #endif
3233
3234 /* Read the given log file and attempt to mmap a fixed number of bytes so we
3235 * can compare its content on future runs.
3236 *
3237 * On error, 1 is returned.
3238 * On success, 0 is returned. */
3239 int
set_initial_persisted_data(GLog * glog,FILE * fp,const char * fn)3240 set_initial_persisted_data (GLog * glog, FILE * fp, const char *fn) {
3241 size_t len;
3242
3243 /* reset the snippet */
3244 memset (glog->snippet, 0, sizeof (glog->snippet));
3245 glog->snippetlen = 0;
3246
3247 if (glog->size == 0)
3248 return 1;
3249
3250 len = MIN (glog->size, READ_BYTES);
3251 if ((fread (glog->snippet, len, 1, fp)) != 1 && ferror (fp))
3252 FATAL ("Unable to fread the specified log file '%s'", fn);
3253 glog->snippetlen = len;
3254
3255 fseek (fp, 0, SEEK_SET);
3256
3257 return 0;
3258 }
3259
3260 static void
persist_last_parse(GLog * glog)3261 persist_last_parse (GLog * glog) {
3262 /* insert last parsed data for the recently file parsed */
3263 if (glog->inode && glog->size) {
3264 glog->lp.line = glog->read;
3265 glog->lp.snippetlen = glog->snippetlen;
3266
3267 memcpy (glog->lp.snippet, glog->snippet, glog->snippetlen);
3268
3269 ht_insert_last_parse (glog->inode, glog->lp);
3270 }
3271 /* probably from a pipe */
3272 else if (!glog->inode) {
3273 ht_insert_last_parse (0, glog->lp);
3274 }
3275 }
3276
3277 /* Read the given log line by line and process its data.
3278 *
3279 * On error, 1 is returned.
3280 * On success, 0 is returned. */
3281 static int
read_log(GLog * glog,int dry_run)3282 read_log (GLog * glog, int dry_run) {
3283 FILE *fp = NULL;
3284 int piping = 0;
3285 struct stat fdstat;
3286
3287 /* Ensure we have a valid pipe to read from stdin. Only checking for
3288 * conf.read_stdin without verifying for a valid FILE pointer would certainly
3289 * lead to issues. */
3290 if (glog->filename[0] == '-' && glog->filename[1] == '\0' && glog->pipe) {
3291 fp = glog->pipe;
3292 glog->piping = piping = 1;
3293 }
3294
3295 /* make sure we can open the log (if not reading from stdin) */
3296 if (!piping && (fp = fopen (glog->filename, "r")) == NULL)
3297 FATAL ("Unable to open the specified log file '%s'. %s", glog->filename, strerror (errno));
3298
3299 /* grab the inode of the file being parsed */
3300 if (!piping && stat (glog->filename, &fdstat) == 0) {
3301 glog->inode = fdstat.st_ino;
3302 glog->size = glog->lp.size = fdstat.st_size;
3303 set_initial_persisted_data (glog, fp, glog->filename);
3304 }
3305
3306 /* read line by line */
3307 if (read_lines (fp, glog, dry_run)) {
3308 if (!piping)
3309 fclose (fp);
3310 return 1;
3311 }
3312
3313 persist_last_parse (glog);
3314
3315 /* close log file if not a pipe */
3316 if (!piping)
3317 fclose (fp);
3318
3319 return 0;
3320 }
3321
3322 static void
set_log_processing(Logs * logs,GLog * glog)3323 set_log_processing (Logs * logs, GLog * glog) {
3324 lock_spinner ();
3325 logs->processed = &(glog->processed);
3326 logs->filename = glog->filename;
3327 unlock_spinner ();
3328 }
3329
3330 /* Entry point to parse the log line by line.
3331 *
3332 * On error, 1 is returned.
3333 * On success, 0 is returned. */
3334 int
parse_log(Logs * logs,int dry_run)3335 parse_log (Logs * logs, int dry_run) {
3336 GLog *glog = NULL;
3337 const char *err_log = NULL;
3338 int idx;
3339
3340 /* verify that we have the required formats */
3341 if ((err_log = verify_formats ()))
3342 FATAL ("%s", err_log);
3343
3344 /* no data piped, no logs passed, load from disk only then */
3345 if (conf.restore && !logs->restored)
3346 logs->restored = rebuild_rawdata_cache ();
3347
3348 /* no data piped, no logs passed, load from disk only then */
3349 if (conf.restore && !conf.filenames_idx && !conf.read_stdin) {
3350 logs->load_from_disk_only = 1;
3351 return 0;
3352 }
3353
3354 for (idx = 0; idx < logs->size; ++idx) {
3355 glog = &logs->glog[idx];
3356 set_log_processing (logs, glog);
3357
3358 if (read_log (glog, dry_run))
3359 return 1;
3360
3361 glog->length = glog->bytes;
3362 }
3363
3364 return 0;
3365 }
3366
3367 /* Ensure we have valid hits
3368 *
3369 * On error, an array of pointers containing the error strings.
3370 * On success, NULL is returned. */
3371 char **
test_format(Logs * logs,int * len)3372 test_format (Logs * logs, int *len) {
3373 char **errors = NULL;
3374 GLog *glog = NULL;
3375 int i;
3376
3377 if (parse_log (logs, 1) == 0)
3378 return NULL;
3379
3380 for (i = 0; i < logs->size; ++i) {
3381 glog = &logs->glog[i];
3382 if (!glog->log_erridx)
3383 continue;
3384 break;
3385 }
3386
3387 errors = xcalloc (glog->log_erridx, sizeof (char *));
3388 *len = glog->log_erridx;
3389 for (i = 0; i < glog->log_erridx; ++i)
3390 errors[i] = xstrdup (glog->errors[i]);
3391 free_logerrors (glog);
3392
3393 return errors;
3394 }
3395