1 /* visitors -- very fast web logs analyzer.
2 *
3 * Copyright (C) 2004-2006 Salvatore Sanfilippo <antirez@invece.org>
4 * All Rights Reserved.
5 *
6 * This software is released under the terms of the GPL license version 2.
7 * Read the COPYING file in this distribution for more details. */
8
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <time.h>
13 #include <stdarg.h>
14 #include <errno.h>
15 #include <locale.h>
16 #include <ctype.h>
17
18 #include "aht.h"
19 #include "antigetopt.h"
20 #include "sleep.h"
21 #include "blacklist.h"
22
/* Max length of an error stored in the visitors handle */
#define VI_ERROR_MAX 1024
/* Max length of a log line */
#define VI_LINE_MAX 4096
/* Max number of filenames in the command line */
#define VI_FILENAMES_MAX 1024
/* Max number of prefixes in the command line */
#define VI_PREFIXES_MAX 1024
/* Max number of --grep --exclude patterns in the command line */
#define VI_GREP_PATTERNS_MAX 1024
/* Abbreviation length for HTML outputs */
#define VI_HTML_ABBR_LEN 100
/* Max length of a log entry date */
#define VI_DATE_MAX 64
/* Version as a string */
#define VI_VERSION_STR "0.7"
39
40 /*------------------------------- data structures ----------------------------*/
41
/* visitors handle: the state accumulated while logs are processed.
 * Created by vi_new() and released by vi_free(). */
struct vih {
    int startt; /* set to time(NULL) by vi_new() */
    int endt; /* set to time(NULL) by vi_new() */
    int processed; /* counter, zeroed by vi_new() */
    int invalid; /* counter, zeroed by vi_new() */
    int blacklisted; /* incremented by vi_is_blacklisted_url() on a match */
    int hour[24]; /* zeroed by vi_reset_combined_maps() */
    int weekday[7]; /* zeroed by vi_reset_combined_maps() */
    int weekdayhour[7][24]; /* hour and weekday combined data */
    int monthday[12][31]; /* month and day combined data */
    /* The tables below are all set up by vi_ht_init() in vi_new() and
     * destroyed by vi_reset_hashtables(). What each one stores is decided
     * by code outside this declaration. */
    struct hashtable visitors;
    struct hashtable googlevisitors;
    struct hashtable pages;
    struct hashtable images;
    struct hashtable error404;
    struct hashtable pageviews;
    struct hashtable pageviews_grouped;
    struct hashtable referers;
    struct hashtable referersage;
    struct hashtable date;
    struct hashtable googledate;
    struct hashtable adsensed;
    struct hashtable month;
    struct hashtable googlemonth;
    struct hashtable agents;
    struct hashtable googled;
    struct hashtable googlevisits;
    struct hashtable googlekeyphrases;
    struct hashtable googlekeyphrasesage;
    struct hashtable trails;
    struct hashtable tld;
    struct hashtable os;
    struct hashtable browsers;
    struct hashtable robots;
    struct hashtable googlehumanlanguage;
    struct hashtable screenres;
    struct hashtable screendepth;
    /* Heap-allocated message stored by vi_set_error(), or NULL when no
     * error is set (see vi_get_error() and vi_clear_error()). */
    char *error;
};
82
/* info associated with a line of log; filled by vi_parse_line().
 * NOTE(review): the string fields presumably point inside the parsed
 * line buffer rather than owning memory -- confirm in vi_parse_line(). */
struct logline {
    char *host;
    char *date;
    char *hour;
    char *timezone;
    char *req;
    char *ref;
    char *agent;
    time_t time; /* value returned by parse_date() for the date field */
    struct tm tm; /* broken-down time filled by the same parse_date() call */
};
95
/* output module structure. See below for the definition of
 * the text and html output modules.
 *
 * Each member is a callback that receives the output stream as its
 * first argument; the global 'Output' pointer selects the module in use. */
struct outputmodule {
    void (*print_header)(FILE *fp);
    void (*print_footer)(FILE *fp);
    void (*print_title)(FILE *fp, char *title);
    void (*print_subtitle)(FILE *fp, char *title);
    void (*print_numkey_info)(FILE *fp, char *key, int val);
    void (*print_keykey_entry)(FILE *fp, char *key1, char *key2, int num);
    void (*print_numkey_entry)(FILE *fp, char *key, int val, char *link,
            int num);
    void (*print_numkeybar_entry)(FILE *fp, char *key, int max, int tot,
            int this);
    void (*print_numkeycomparativebar_entry)(FILE *fp, char *key, int tot,
            int this);
    void (*print_bidimentional_map)(FILE *fp, int xlen, int ylen,
            char **xlabel, char **ylabel, int *value);
    void (*print_hline)(FILE *fp);
    void (*print_credits)(FILE *fp);
    void (*print_report_link)(FILE *fp, char *report);
};
117
/* Just a string with cached length (used for the Config_prefix table,
 * where 'len' bounds the comparison done by vi_is_internal_link()). */
struct vistring {
    char *str;
    int len; /* length of 'str' */
};
123
/* Grep pattern for --grep --exclude */
#define VI_PATTERNTYPE_GREP 0
#define VI_PATTERNTYPE_EXCLUDE 1
struct greppat {
    int type; /* one of the VI_PATTERNTYPE_* values */
    char *pattern; /* heap string built by ConfigAddGrepPattern(): "*<text>*" */
};
131
/* ---------------------- global configuration parameters ------------------- */
int Config_debug = 0;
int Config_max_referers = 20;
int Config_max_referers_age = 20;
int Config_max_pages = 20;
int Config_max_images = 20;
int Config_max_error404 = 20;
int Config_max_agents = 20;
int Config_max_googled = 20;
int Config_max_adsensed = 20;
int Config_max_google_keyphrases = 20;
int Config_max_google_keyphrases_age = 20;
int Config_max_trails = 20;
int Config_max_tld = 20;
int Config_max_robots = 20;
int Config_process_agents = 0;
int Config_process_google = 0;
int Config_process_google_keyphrases = 0;
int Config_process_google_keyphrases_age = 0;
int Config_process_google_human_language = 0;
int Config_process_web_trails = 0;
int Config_process_weekdayhour_map = 0;
int Config_process_monthday_map = 0;
int Config_process_referers_age = 0;
int Config_process_tld = 0;
int Config_process_os = 0;
int Config_process_browsers = 0;
int Config_process_error404 = 0;
int Config_process_pageviews = 0;
int Config_process_monthly_visitors = 1;
int Config_process_robots = 0;
int Config_process_screen_info = 0;
int Config_graphviz_mode = 0;
int Config_graphviz_ignorenode_google = 0;
int Config_graphviz_ignorenode_external = 0;
int Config_graphviz_ignorenode_noreferer = 0;
int Config_tail_mode = 0;
int Config_stream_mode = 0;
int Config_update_every = 60*10; /* update every 10 minutes by default. */
int Config_reset_every = 0; /* never reset by default */
int Config_time_delta = 0; /* adjustable time difference; parse_date()
                              multiplies it by 3600, i.e. it is in hours */
int Config_filter_spam = 0;
int Config_ignore_404 = 0;
char *Config_output_file = NULL; /* stdout if not set. */
struct outputmodule *Output = NULL; /* initialized to 'text' in main() */

/* Prefixes */
int Config_prefix_num = 0; /* number of set prefixes */
struct vistring Config_prefix[VI_PREFIXES_MAX];

/* Grep/Exclude array */
struct greppat Config_grep_pattern[VI_GREP_PATTERNS_MAX];
int Config_grep_pattern_num = 0; /* number of set patterns */
185
/*----------------------------------- Tables ---------------------------------*/
/* Two-letter weekday labels, Monday first.
 * NOTE(review): struct tm's tm_wday counts from Sunday (0), so users of
 * this table presumably remap the index -- confirm at the call sites. */
static char *vi_wdname[7] = {"Mo", "Tu", "We", "Th", "Fr", "Sa", "Su"};
#if 0
/* Days per month (February as 29); currently compiled out. */
static int vi_monthdays[12] = {31, 29, 31, 30, 31, 30 , 31, 31, 30, 31, 30, 31};
#endif
191
/* -------------------------------- prototypes ------------------------------ */
/* Forward declarations for functions used before their definition. */
void vi_clear_error(struct vih *vih);
void vi_tail(int filec, char **filev);
195
196 /*------------------- Options parsing help functions ------------------------ */
ConfigAddGrepPattern(char * pattern,int type)197 void ConfigAddGrepPattern(char *pattern, int type)
198 {
199 char *s;
200 int len = strlen(pattern);
201
202 if (Config_grep_pattern_num == VI_GREP_PATTERNS_MAX) {
203 fprintf(stderr, "Too many grep/exclude options specified\n");
204 exit(1);
205 }
206 s = malloc(strlen(pattern)+3);
207 s[0] = '*';
208 memcpy(s+1, pattern, len);
209 s[len+1] = '*';
210 s[len+2] = '\0';
211 Config_grep_pattern[Config_grep_pattern_num].type = type;
212 Config_grep_pattern[Config_grep_pattern_num].pattern = s;
213 Config_grep_pattern_num++;
214 }
215
216 /*------------------------------ support functions -------------------------- */
/* Tell whether 's' looks like a link to Google.
 * Only the "http://www.google." prefix is compared, so anything sharing
 * that prefix is accepted. Returns 1 on match, 0 otherwise. */
int vi_is_google_link(char *s)
{
    static const char google_prefix[] = "http://www.google.";

    return strncmp(s, google_prefix, sizeof(google_prefix)-1) == 0;
}
224
/* Returns 1 if the user agent contains "Googlebot" or "googlebot",
 * 0 otherwise. */
int vi_is_googlebot_agent(char *agent)
{
    return strstr(agent, "Googlebot") != NULL ||
           strstr(agent, "googlebot") != NULL;
}
231
/* Returns 1 if the user agent contains "Mediapartners-Google" (the
 * AdSense crawler), 0 otherwise. */
int vi_is_adsensebot_agent(char *agent)
{
    return strstr(agent, "Mediapartners-Google") != NULL;
}
237
/* Returns 1 if the user agent contains "Yahoo! Slurp", 0 otherwise. */
int vi_is_yahoobot_agent(char *agent)
{
    return strstr(agent, "Yahoo! Slurp") != NULL;
}
242
/* Returns 1 if the user agent contains "msn.com/msnbot.htm", 0 otherwise. */
int vi_is_msbot_agent(char *agent)
{
    return strstr(agent, "msn.com/msnbot.htm") != NULL;
}
247
/* Guess whether an agent string belongs to a crawler/bot of some kind.
 * The check MUST stay conservative: false negatives are acceptable,
 * false positives are not. Returns 1 when one of the known markers is
 * found inside 'agent', 0 otherwise. */
int vi_is_genericbot_agent(char *agent)
{
    static const char *const markers[] = {
        "crawler", "Crawler", "bot/", "Bot/", "bot.htm", "+http://"
    };
    size_t j;

    for (j = 0; j < sizeof(markers)/sizeof(markers[0]); j++) {
        if (strstr(agent, markers[j]) != NULL)
            return 1;
    }
    return 0;
}
260
/* Returns 1 if the agent is recognized as one of the specifically known
 * bots (Googlebot, AdSense, Yahoo, MSN), 0 otherwise. The generic bot
 * heuristic is not consulted here. */
int vi_is_bot_agent(char *agent)
{
    if (vi_is_googlebot_agent(agent)) return 1;
    if (vi_is_adsensebot_agent(agent)) return 1;
    if (vi_is_yahoobot_agent(agent)) return 1;
    if (vi_is_msbot_agent(agent)) return 1;
    return 0;
}
268
269 /* Returns non-zero if the url matches some user-specified prefix.
270 * being a link "internal" to the site. Otherwise zero is returned.
271 *
272 * When there is a match, the value returned is the length of
273 * the matching prefix. */
vi_is_internal_link(char * url)274 int vi_is_internal_link(char *url)
275 {
276 int i, l;
277
278 if (!Config_prefix_num) return 0; /* no prefixes set? */
279 l = strlen(url);
280 for (i = 0; i < Config_prefix_num; i++) {
281 if (Config_prefix[i].len <= l &&
282 !strncasecmp(url, Config_prefix[i].str,
283 Config_prefix[i].len))
284 {
285 return Config_prefix[i].len;
286 }
287 }
288 return 0;
289 }
290
/* Returns 1 if the URL 's' ends with one of the known image / CSS /
 * script / flash extensions (all-lowercase or all-uppercase spelling),
 * 0 otherwise. URLs shorter than five characters never match. */
int vi_is_image(char *s)
{
    static const char *const exts[] = {
        ".css", ".jpg", ".gif", ".png", ".ico", ".swf", ".js", ".jpeg",
        ".CSS", ".JPG", ".GIF", ".PNG", ".ICO", ".SWF", ".JS", ".JPEG"
    };
    int len = strlen(s);
    size_t j;

    if (len < 5) return 0;
    for (j = 0; j < sizeof(exts)/sizeof(exts[0]); j++) {
        size_t extlen = strlen(exts[j]);

        /* compare the tail of the URL against this extension */
        if (memcmp(s + len - extlen, exts[j], extlen) == 0)
            return 1;
    }
    return 0;
}
316
/* Returns non-zero if the URL 's' seems a real page.
 *
 * A URL is considered a page when:
 *  - it ends with '/', or
 *  - it ends with one of the known page extensions (all-lowercase or
 *    all-uppercase spelling) preceded by at least one character, or
 *  - its last path component has no extension at all (no '.' in the
 *    URL, or the last '/' comes after the last '.').
 * The empty string has no '.', so it is reported as a page. */
int vi_is_pageview(char *s)
{
    static const char *const page_exts[] = {
        ".html", ".htm", ".php", ".asp", ".jsp", ".xdl", ".xhtml",
        ".xml", ".cgi", ".pl", ".shtml",
        ".HTML", ".HTM", ".PHP", ".ASP", ".JSP", ".XDL", ".XHTML",
        ".XML", ".CGI", ".PL", ".SHTML"
    };
    size_t l = strlen(s);
    size_t i;
    char *dot, *slash;

    /* guard l > 0: never look before the start of an empty string */
    if (l > 0 && s[l-1] == '/') return 1;
    for (i = 0; i < sizeof(page_exts)/sizeof(page_exts[0]); i++) {
        size_t n = strlen(page_exts[i]);

        /* each extension is bounds-checked against its own length */
        if (l > n && memcmp(s + l - n, page_exts[i], n) == 0)
            return 1;
    }
    dot = strrchr(s, '.');
    if (!dot) return 1;
    slash = strrchr(s, '/');
    if (slash && slash > dot) return 1;
    return 0;
}
354
/* Returns 1 if 'ip' looks like a numeric address such as "1.2.3.4".
 * Since 'ip' is always either an IP or a hostname, the test is simply
 * whether the whole string is made of characters in the "[0-9.]" set
 * (so the empty string also passes). */
int vi_is_numeric_address(char *ip)
{
    /* strspn() stops at the first char outside the set: the string is
     * numeric exactly when that stop position is the nul terminator. */
    return ip[strspn(ip, "0123456789.")] == '\0';
}
364
365 /* returns the time converted into a time_t value.
366 * On error (time_t) -1 is returned.
367 * Note that this function is specific for the following format:
368 * "10/May/2004:04:15:33". Works if the month is not an abbreviation, or if the
369 * year is abbreviated to only the last two digits.
370 * The time can be omitted like in "10/May/2004". */
parse_date(char * s,struct tm * tmptr)371 time_t parse_date(char *s, struct tm *tmptr)
372 {
373 struct tm tm;
374 time_t t;
375 char *months[] = {
376 "jan", "feb", "mar", "apr", "may", "jun",
377 "jul", "aug", "sep", "oct", "nov", "dec",
378 };
379 char *day, *month, *year, *time = NULL;
380 char monthaux[32];
381 int i, len;
382
383 /* make a copy to mess with it */
384 len = strlen(s);
385 if (len >= 32) goto fmterr;
386 memcpy(monthaux, s, len);
387 monthaux[len] = '\0';
388
389 /* Inizialize the tm structure. We just fill three fields */
390 tm.tm_sec = 0;
391 tm.tm_min = 0;
392 tm.tm_hour = 0;
393 tm.tm_mday = 0;
394 tm.tm_mon = 0;
395 tm.tm_year = 0;
396 tm.tm_wday = 0;
397 tm.tm_yday = 0;
398 tm.tm_isdst = -1;
399
400 /* search delimiters */
401 day = monthaux;
402 if ((month = strchr(day, '/')) == NULL) goto fmterr;
403 *month++ = '\0';
404 if ((year = strchr(month, '/')) == NULL) goto fmterr;
405 *year++ = '\0';
406 /* time, optional for this parser. */
407 if ((time = strchr(year, ':')) != NULL) {
408 *time++ = '\0';
409 }
410 /* convert day */
411 tm.tm_mday = atoi(day);
412 if (tm.tm_mday < 1 || tm.tm_mday > 31) goto fmterr;
413 /* convert month */
414 if (strlen(month) < 3) goto fmterr;
415 month[0] = tolower(month[0]);
416 month[1] = tolower(month[1]);
417 month[2] = tolower(month[2]);
418 for (i = 0; i < 12; i++) {
419 if (memcmp(month, months[i], 3) == 0) break;
420 }
421 if (i == 12) goto fmterr;
422 tm.tm_mon = i;
423 /* convert year */
424 tm.tm_year = atoi(year);
425 if (tm.tm_year > 100) {
426 if (tm.tm_year < 1900 || tm.tm_year > 2500) goto fmterr;
427 tm.tm_year -= 1900;
428 } else {
429 /* if the year is in two-digits form, the 0 - 68 range
430 * is converted to 2000 - 2068 */
431 if (tm.tm_year < 69)
432 tm.tm_year += 100;
433 }
434 /* convert time */
435 if (time) { /* format is HH:MM:SS */
436 if (strlen(time) < 8) goto fmterr;
437 tm.tm_hour = ((time[0]-'0')*10)+(time[1]-'0');
438 if (tm.tm_hour < 0 || tm.tm_hour > 23) goto fmterr;
439 tm.tm_min = ((time[3]-'0')*10)+(time[4]-'0');
440 if (tm.tm_min < 0 || tm.tm_min > 59) goto fmterr;
441 tm.tm_sec = ((time[6]-'0')*10)+(time[7]-'0');
442 if (tm.tm_sec < 0 || tm.tm_sec > 60) goto fmterr;
443 }
444 t = mktime(&tm);
445 if (t == (time_t)-1) goto fmterr;
446 t += (Config_time_delta*3600);
447 if (tmptr) {
448 struct tm *auxtm;
449
450 if ((auxtm = localtime(&t)) != NULL)
451 *tmptr = *auxtm;
452 }
453 return t;
454
455 fmterr: /* format error */
456 return (time_t) -1;
457 }
458
/* Returns 1 if the date string 's' (in the format accepted by
 * parse_date()) falls on a Saturday or a Sunday.
 * Zero is returned otherwise, including when the date cannot be parsed. */
int vi_is_weekend(char *s)
{
    struct tm when;

    if (parse_date(s, &when) == (time_t)-1)
        return 0;
    /* tm_wday: 0 is Sunday, 6 is Saturday */
    return (when.tm_wday == 0 || when.tm_wday == 6) ? 1 : 0;
}
471
472 #if 0
/* Returns 1 if 'year' is a leap year, 0 otherwise: divisible by 4,
 * except centuries, which must also be divisible by 400. */
int isleap(int year)
{
    if (year % 4 != 0) return 0;
    if (year % 100 != 0) return 1;
    return (year % 400) == 0;
}
483 #endif
484
/* URL decoding and white spaces trimming function.
 * Input: the encoded string 's'.
 * Output: the decoded string written at 'd' that has room for at least 'n'
 * bytes of data (nul term included). Nothing is written if n < 1.
 *
 * '+' is decoded as a space and "%XX" as the byte with hex value XX.
 * A '%' that is not followed by two hexadecimal digits is copied as it
 * is. Leading and trailing spaces of the decoded string are removed.
 * Note that "%00" decodes to a nul byte, which ends the output string
 * as seen by the usual string functions. */
void vi_urldecode(char *d, char *s, int n)
{
    char *start = d;

    if (n < 1) return;
    while(*s && n > 1) {
        int c = *s;

        if (c == '+') {
            c = ' ';
        } else if (c == '%' &&
                   isxdigit((unsigned char)s[1]) &&
                   isxdigit((unsigned char)s[2]))
        {
            /* both chars are verified hex digits, so after toupper()
             * they are in '0'-'9' or 'A'-'F' */
            int high = toupper((unsigned char)s[1]);
            int low = toupper((unsigned char)s[2]);

            high = (high <= '9') ? high - '0' : (high - 'A') + 10;
            low = (low <= '9') ? low - '0' : (low - 'A') + 10;
            c = (high << 4)+low;
            s += 2;
        }
        /* Left trim: skip spaces while nothing was emitted yet */
        if (c != ' ' || d != start) {
            *d++ = c;
            n--;
        }
        s++;
    }
    /* Right trim, without ever stepping before 'start' */
    *d = '\0';
    while (d > start && d[-1] == ' ')
        *--d = '\0';
}
525
/* URL encoding function
 * Input: the unencoded string 's'.
 * Output: the url-encoded string written at 'd' that has room for at least 'n'
 * bytes of data (nul term included). Nothing is written if n < 1.
 *
 * ASCII letters and digits are copied, a space becomes '+', a newline
 * becomes "%0d%0a", any other byte becomes "%xx" (lowercase hex).
 * Encoding stops early when the next encoded unit does not fit. */
void vi_urlencode(char *d, char *s, int n)
{
    static const char hexdigits[] = "0123456789abcdef";
    int room;

    if (n < 1) return;
    room = n - 1; /* keep one byte for the nul term */
    for (; *s != '\0' && room > 0; s++) {
        int ch = *s;
        unsigned int bits;

        if ((ch >= 'A' && ch <= 'Z') ||
            (ch >= 'a' && ch <= 'z') ||
            (ch >= '0' && ch <= '9'))
        {
            *d++ = ch;
            room--;
            continue;
        }
        if (ch == ' ') {
            *d++ = '+';
            room--;
            continue;
        }
        if (ch == '\n') {
            if (room < 6) break;
            memcpy(d, "%0d%0a", 6);
            d += 6;
            room -= 6;
            continue;
        }
        if (room < 3) break;
        bits = (unsigned) ch;
        *d++ = '%';
        *d++ = hexdigits[(bits >> 4) & 0x0F];
        *d++ = hexdigits[bits & 0x0F];
        room -= 3;
    }
    *d = '\0';
}
565
/* Convert a nul-term string to lowercase in place.
 * Every byte is passed to tolower() as an unsigned char value, as
 * <ctype.h> requires, so bytes outside the ASCII range are safe even
 * where plain char is signed. */
void vi_strtolower(char *s)
{
    while (*s) {
        *s = tolower((unsigned char)*s);
        s++;
    }
}
574
/* Note: the following function strlcat and strlcpy are (possibly) modified
 * version of OpenBSD's functions. Original copyright notice:
 * Copyright (c) 1998 Todd C. Miller <Todd.Miller@courtesan.com>
 * Originally under the BSD license. */

/* Copy 'src' into 'dst', a buffer of 'siz' bytes: at most siz-1 bytes
 * are copied and the result is nul terminated whenever siz is not zero.
 * Returns the length of 'src', so a return value >= siz tells the
 * caller that the copy was truncated. */
int vi_strlcpy(char *dst, char *src, int siz)
{
    const char *from = src;
    char *to = dst;
    int room = siz;

    if (room != 0) {
        /* Copy as many bytes as will fit */
        while (--room != 0) {
            char ch = *from++;

            *to++ = ch;
            if (ch == '\0')
                return(from - src - 1); /* everything fit */
        }
        /* Not enough room in dst: terminate what was copied */
        *to = '\0';
    }
    /* Walk the rest of src just to measure it */
    while (*from++)
        ;
    return(from - src - 1); /* count does not include NUL */
}
601
/* Append 'src' to the string in 'dst', a buffer of 'siz' bytes in total.
 * At most siz-strlen(dst)-1 bytes are appended and the result is nul
 * terminated, unless no terminator is found in the first 'siz' bytes of
 * 'dst', in which case 'dst' is left untouched.
 * Returns the length the full concatenation would have had: the initial
 * length of 'dst' (capped at siz) plus the length of 'src'. */
int vi_strlcat(char *dst, const char *src, int siz)
{
    size_t cap = siz;
    size_t used = 0;
    size_t room;
    const char *in = src;
    char *out;

    /* Find the end of dst but don't go past the end of the buffer */
    while (used < cap && dst[used] != '\0')
        used++;
    room = cap - used;
    if (room == 0)
        return(used + strlen(src));

    out = dst + used;
    for (; *in != '\0'; in++) {
        if (room != 1) { /* the last byte is kept for the nul term */
            *out++ = *in;
            room--;
        }
    }
    *out = '\0';

    return(used + (in - src)); /* count does not include NUL */
}
628
629 /* Returns non-zero if the url matches one of the keywords in
630 * blacklist.h, otherwise zero is returned. Warning!!! This function
631 * run time is proportional to the size of blacklist.h, so it is
632 * very slow. */
vi_is_blacklisted_url(struct vih * vih,char * url)633 int vi_is_blacklisted_url(struct vih *vih, char *url)
634 {
635 unsigned int i;
636
637 for (i = 0; i < VI_BLACKLIST_LEN; i++) {
638 if (strstr(url, vi_blacklist[i])) {
639 vih->blacklisted++;
640 return 1;
641 }
642 }
643 return 0;
644 }
645
/* Glob-style pattern matching.
 *
 * Matches the first 'stringLen' bytes of 'string' against the first
 * 'patternLen' bytes of 'pattern'. Supported syntax:
 *   *        any sequence of characters, including the empty one
 *   ?        exactly one character
 *   [abc]    one character in the set; ranges like a-z are allowed,
 *            a leading ^ negates the set, \ quotes the next character
 *   \x       the character x taken literally
 * If 'nocase' is non-zero, literal characters and ranges are compared
 * case-insensitively.
 *
 * Returns 1 if the whole string matches the whole pattern, 0 otherwise.
 * Neither buffer is read beyond the given length, and a construct that
 * needs a character never matches once the string is exhausted. */
int vi_match_len(const char *pattern, int patternLen,
        const char *string, int stringLen, int nocase)
{
    while(patternLen) {
        switch(pattern[0]) {
        case '*':
            /* collapse runs of '*' */
            while (patternLen > 1 && pattern[1] == '*') {
                pattern++;
                patternLen--;
            }
            if (patternLen == 1)
                return 1; /* match */
            /* try the rest of the pattern at every string position */
            while(stringLen) {
                if (vi_match_len(pattern+1, patternLen-1,
                            string, stringLen, nocase))
                    return 1; /* match */
                string++;
                stringLen--;
            }
            return 0; /* no match */
            break;
        case '?':
            if (stringLen == 0)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        case '[':
        {
            int not, match;

            /* a class always consumes one character */
            if (stringLen == 0)
                return 0; /* no match */
            pattern++;
            patternLen--;
            not = patternLen > 0 && pattern[0] == '^';
            if (not) {
                pattern++;
                patternLen--;
            }
            match = 0;
            while(1) {
                if (patternLen >= 2 && pattern[0] == '\\') {
                    pattern++;
                    patternLen--;
                    if (pattern[0] == string[0])
                        match = 1;
                } else if (patternLen == 0) {
                    /* unterminated class: step back so that the
                     * common advance below lands on the end */
                    pattern--;
                    patternLen++;
                    break;
                } else if (pattern[0] == ']') {
                    break;
                } else if (patternLen >= 3 && pattern[1] == '-') {
                    int start = pattern[0];
                    int end = pattern[2];
                    int c = string[0];
                    if (start > end) {
                        int t = start;
                        start = end;
                        end = t;
                    }
                    if (nocase) {
                        start = tolower(start);
                        end = tolower(end);
                        c = tolower(c);
                    }
                    pattern += 2;
                    patternLen -= 2;
                    if (c >= start && c <= end)
                        match = 1;
                } else {
                    if (!nocase) {
                        if (pattern[0] == string[0])
                            match = 1;
                    } else {
                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
                            match = 1;
                    }
                }
                pattern++;
                patternLen--;
            }
            if (not)
                match = !match;
            if (!match)
                return 0; /* no match */
            string++;
            stringLen--;
            break;
        }
        case '\\':
            if (patternLen >= 2) {
                pattern++;
                patternLen--;
            }
            /* fall through */
        default:
            /* a literal always consumes one character */
            if (stringLen == 0)
                return 0; /* no match */
            if (!nocase) {
                if (pattern[0] != string[0])
                    return 0; /* no match */
            } else {
                if (tolower((int)pattern[0]) != tolower((int)string[0]))
                    return 0; /* no match */
            }
            string++;
            stringLen--;
            break;
        }
        pattern++;
        patternLen--;
        if (stringLen == 0) {
            /* string consumed: only trailing '*' may still match */
            while(patternLen > 0 && *pattern == '*') {
                pattern++;
                patternLen--;
            }
            break;
        }
    }
    if (patternLen == 0 && stringLen == 0)
        return 1;
    return 0;
}
768
/* Like vi_match_len but handier when used against nul-term strings:
 * the lengths of 'pattern' and 'string' are computed with strlen(). */
int vi_match(const char *pattern, const char *string, int nocase)
{
    return vi_match_len(pattern, (int)strlen(pattern),
                        string, (int)strlen(string), nocase);
}
776
777 /*-------------------------- visitors handler functions --------------------- */
/* Init the hashtable with methods suitable for an "occurrences counter":
 * string hashing and comparison for the keys, keys released with the
 * library "free" destructor, values left alone (no destructor), since
 * the value slot is used to hold a plain number rather than a pointer
 * (see vi_counter_incr()). */
void vi_ht_init(struct hashtable *ht)
{
    ht_init(ht);
    ht_set_hash(ht, ht_hash_string);
    ht_set_key_destructor(ht, ht_destructor_free);
    ht_set_val_destructor(ht, ht_no_destructor);
    ht_set_key_compare(ht, ht_compare_string);
}
787
788 /* Reset the weekday/hour info in the visitors handler. */
vi_reset_combined_maps(struct vih * vih)789 void vi_reset_combined_maps(struct vih *vih)
790 {
791 int i, j;
792
793 for (i = 0; i < 24; i++) {
794 vih->hour[i] = 0;
795 for (j = 0; j < 7; j++)
796 vih->weekdayhour[j][i] = 0;
797 }
798 for (i = 0; i < 7; i++) vih->weekday[i] = 0;
799 for (i = 0; i < 31; i++)
800 for (j = 0; j < 12; j++)
801 vih->monthday[j][i] = 0;
802 }
803
804 /* Reset the hashtables from the handler, that are left
805 * in a reusable state (but all empty). */
vi_reset_hashtables(struct vih * vih)806 void vi_reset_hashtables(struct vih *vih)
807 {
808 ht_destroy(&vih->visitors);
809 ht_destroy(&vih->googlevisitors);
810 ht_destroy(&vih->pages);
811 ht_destroy(&vih->images);
812 ht_destroy(&vih->error404);
813 ht_destroy(&vih->pageviews);
814 ht_destroy(&vih->pageviews_grouped);
815 ht_destroy(&vih->referers);
816 ht_destroy(&vih->referersage);
817 ht_destroy(&vih->agents);
818 ht_destroy(&vih->googled);
819 ht_destroy(&vih->adsensed);
820 ht_destroy(&vih->googlekeyphrases);
821 ht_destroy(&vih->googlekeyphrasesage);
822 ht_destroy(&vih->googlevisits);
823 ht_destroy(&vih->trails);
824 ht_destroy(&vih->tld);
825 ht_destroy(&vih->os);
826 ht_destroy(&vih->browsers);
827 ht_destroy(&vih->date);
828 ht_destroy(&vih->googledate);
829 ht_destroy(&vih->month);
830 ht_destroy(&vih->googlemonth);
831 ht_destroy(&vih->robots);
832 ht_destroy(&vih->googlehumanlanguage);
833 ht_destroy(&vih->screenres);
834 ht_destroy(&vih->screendepth);
835 }
836
/* Reset handler informations to support --reset option in
 * stream mode: the time-distribution counters are zeroed and all the
 * hashtables are emptied. The other fields of the handle (startt, endt,
 * processed, invalid, blacklisted, error) are not touched here. */
void vi_reset(struct vih *vih)
{
    vi_reset_combined_maps(vih);
    vi_reset_hashtables(vih);
}
844
845 /* Return a new visitors handle.
846 * On out of memory NULL is returned.
847 * The handle obtained with this call must be released with vi_free()
848 * when no longer useful. */
vi_new(void)849 struct vih *vi_new(void)
850 {
851 struct vih *vih;
852
853 if ((vih = malloc(sizeof(*vih))) == NULL)
854 return NULL;
855 /* Initialization */
856 vih->startt = vih->endt = time(NULL);
857 vih->processed = 0;
858 vih->invalid = 0;
859 vih->blacklisted = 0;
860 vi_reset_combined_maps(vih);
861 vih->error = NULL;
862 vi_ht_init(&vih->visitors);
863 vi_ht_init(&vih->googlevisitors);
864 vi_ht_init(&vih->pages);
865 vi_ht_init(&vih->images);
866 vi_ht_init(&vih->error404);
867 vi_ht_init(&vih->pageviews);
868 vi_ht_init(&vih->pageviews_grouped);
869 vi_ht_init(&vih->referers);
870 vi_ht_init(&vih->referersage);
871 vi_ht_init(&vih->agents);
872 vi_ht_init(&vih->googled);
873 vi_ht_init(&vih->adsensed);
874 vi_ht_init(&vih->googlevisits);
875 vi_ht_init(&vih->googlekeyphrases);
876 vi_ht_init(&vih->googlekeyphrasesage);
877 vi_ht_init(&vih->trails);
878 vi_ht_init(&vih->tld);
879 vi_ht_init(&vih->os);
880 vi_ht_init(&vih->browsers);
881 vi_ht_init(&vih->date);
882 vi_ht_init(&vih->month);
883 vi_ht_init(&vih->googledate);
884 vi_ht_init(&vih->googlemonth);
885 vi_ht_init(&vih->robots);
886 vi_ht_init(&vih->googlehumanlanguage);
887 vi_ht_init(&vih->screenres);
888 vi_ht_init(&vih->screendepth);
889 return vih;
890 }
891
/* Release a handle created with vi_new(): the hashtables are destroyed,
 * the stored error (if any) is freed, then the handle itself.
 * Passing NULL is allowed and does nothing. */
void vi_free(struct vih *vih)
{
    if (vih != NULL) {
        vi_reset_hashtables(vih);
        vi_clear_error(vih);
        free(vih);
    }
}
900
901 /* Add a new entry in the counter hashtable. If the key does not
902 * exists creates a new entry with "1" as number of hits, otherwise
903 * increment the old value.
904 *
905 * Return the value of hits after the increment or creation. If the
906 * returned value is greater than one, the key was already seen.
907 *
908 * Return 0 on out of memory.
909 *
910 * NOTE: the pointer of the "value" part of the hashtable entry is
911 * used as a counter casting it to a "long" integer. */
vi_counter_incr(struct hashtable * ht,char * key)912 int vi_counter_incr(struct hashtable *ht, char *key)
913 {
914 char *k;
915 unsigned int idx;
916 int r;
917 long val;
918
919 r = ht_search(ht, key, &idx);
920 if (r == HT_NOTFOUND) {
921 k = strdup(key);
922 if (k == NULL) return 0;
923 if (ht_add(ht, k, (void*)1) != HT_OK) {
924 free(k);
925 return 0;
926 }
927 return 1;
928 } else {
929 val = (long) ht_value(ht, idx);
930 val++;
931 ht_value(ht, idx) = (void*) val;
932 return val;
933 }
934 }
935
936 /* Similar to vi_counter_incr, but only read the old value of
937 * the counter without to alter it. If the specified key does not
938 * exists zero is returned. */
vi_counter_val(struct hashtable * ht,char * key)939 int vi_counter_val(struct hashtable *ht, char *key)
940 {
941 unsigned int idx;
942 int r;
943 long val;
944
945 r = ht_search(ht, key, &idx);
946 if (r == HT_NOTFOUND) {
947 return 0;
948 } else {
949 val = (long) ht_value(ht, idx);
950 return val;
951 }
952 }
953
954 /* Set a key/value pair inside the hash table with
955 * a create-else-replace semantic.
956 *
957 * Return non-zero on out of memory. */
vi_replace(struct hashtable * ht,char * key,char * value)958 int vi_replace(struct hashtable *ht, char *key, char *value)
959 {
960 char *k, *v;
961
962 k = strdup(key);
963 v = strdup(value);
964 if (!k || !v) goto err;
965 if (ht_replace(ht, k, v) != HT_OK)
966 goto err;
967 return 0;
968 err:
969 if (k) free(k);
970 if (v) free(v);
971 return 1;
972 }
973
974 /* Replace the time value of the given key with the new one if this
975 * is newer/older of the old one. If the key is new, it's just added
976 * to the hash table with the specified time as value.
977 *
978 * If the 'ifolder' flag is set, values are replaced with older one,
979 * otherwise with newer.
980 * This function is only used by wrappers replace_if_older() and
981 * replace_if_newer().
982 *
983 * Return 0 on success, non-zero on out of memory. */
vi_replace_time(struct hashtable * ht,char * key,time_t time,int ifolder)984 int vi_replace_time(struct hashtable *ht, char *key, time_t time, int ifolder)
985 {
986 char *k = NULL;
987 unsigned int idx;
988 int r;
989
990 r = ht_search(ht, key, &idx);
991 if (r == HT_NOTFOUND) {
992 k = strdup(key);
993 if (!k) goto err;
994 if (ht_add(ht, k, (void*)time) != HT_OK) goto err;
995 } else {
996 time_t oldt = (time_t) ht_value(ht, idx);
997 /* Update the date if this one is older/nwer. */
998 if (ifolder) {
999 if (time < oldt)
1000 ht_value(ht, idx) = (void*) time;
1001 } else {
1002 if (time > oldt)
1003 ht_value(ht, idx) = (void*) time;
1004 }
1005 }
1006 return 0;
1007 err:
1008 if (k) free(k);
1009 return 1;
1010 }
1011
/* Store 'time' for 'key' unless an older (or equal) time is already
 * stored. See vi_replace_time. Return 0 on success, non-zero on out
 * of memory. */
int vi_replace_if_older(struct hashtable *ht, char *key, time_t time)
{
    const int keep_older = 1;

    return vi_replace_time(ht, key, time, keep_older);
}
1017
/* Store 'time' for 'key' unless a newer (or equal) time is already
 * stored. See vi_replace_time. Return 0 on success, non-zero on out
 * of memory. */
int vi_replace_if_newer(struct hashtable *ht, char *key, time_t time)
{
    const int keep_older = 0;

    return vi_replace_time(ht, key, time, keep_older);
}
1023
1024 /* Set an error in the visitors handle */
vi_set_error(struct vih * vih,char * fmt,...)1025 void vi_set_error(struct vih *vih, char *fmt, ...)
1026 {
1027 va_list ap;
1028 char buf[VI_ERROR_MAX];
1029
1030 va_start(ap, fmt);
1031 vsnprintf(buf, VI_ERROR_MAX, fmt, ap);
1032 buf[VI_ERROR_MAX-1] = '\0';
1033 free(vih->error);
1034 vih->error = strdup(buf);
1035 va_end(ap);
1036 }
1037
1038 /* Get the error */
vi_get_error(struct vih * vih)1039 char *vi_get_error(struct vih *vih)
1040 {
1041 if (!vih->error) {
1042 return "No error";
1043 }
1044 return vih->error;
1045 }
1046
/* Clear the error: release the stored message (if any) and mark the
 * handle as having no error, so that vi_get_error() returns "No error". */
void vi_clear_error(struct vih *vih)
{
    free(vih->error); /* free(NULL) is a no-op */
    vih->error = NULL;
}
1053
1054 /*----------------------------------- parsing ----------------------------- */
1055 /* Parse a line of log, and fill the logline structure with
1056 * appropriate values. On error (bad line format) non-zero is returned. */
vi_parse_line(struct logline * ll,char * l)1057 int vi_parse_line(struct logline *ll, char *l)
1058 {
1059 char *date, *hour, *timezone, *host, *agent, *req, *ref, *p;
1060 char *agent_start = NULL, *req_end = NULL, *ref_end = NULL;
1061 int agent_without_parens = 0;
1062
1063 /* Seek the start of the different components */
1064
1065 /* host */
1066 host = l;
1067 /* date */
1068 if ((date = strchr(l, '[')) == NULL) return 1;
1069 date++;
1070 /* Identify user-agent start char. */
1071 if ((agent = strchr(l, '(')) == NULL) {
1072 /* Bad... user agent without (...) string, makes
1073 * the detection a bit slower and guessworkish. */
1074
1075 /* Check if the count of '"' chars in the string
1076 * is equal to six. If so, it's very likely that the
1077 * last field inside "" is the User Agent string, so
1078 * we get it. */
1079 char *aux = l, *last = NULL;
1080 int count = 0;
1081
1082 /* Count '"' chars, save the last occurence found. */
1083 while (*aux) {
1084 if (*aux == '"') {
1085 count++;
1086 last = aux;
1087 }
1088 aux++;
1089 }
1090
1091 if (count == 6) {
1092 /* Ok! it seems like Combined log format.
1093 * Set a flag and get it later when the
1094 * rest of the log file is splitted. Now it's
1095 * too early to add \0 chars inside the line. */
1096 agent_without_parens = 1;
1097 agent_start = last-1;
1098 while(*agent_start != '"')
1099 agent_start--;
1100 } else {
1101 /* No way... no user agent detected in this line. */
1102 agent = "";
1103 }
1104 } else {
1105 /* User agent with () inside. Simple to detect, just
1106 * search the left and the right '"' chars enclosing
1107 * it. */
1108 p = agent;
1109 while (p >= l) {
1110 if (*p == '"') {
1111 agent_start = p;
1112 break;
1113 }
1114 p--;
1115 }
1116 }
1117 /* req */
1118 if ((req = strstr(l, "\"GET")) != NULL ||
1119 (req = strstr(l, "\"POST")) != NULL ||
1120 (req = strstr(l, "\"HEAD")) != NULL ||
1121 (req = strstr(l, "\"get")) != NULL ||
1122 (req = strstr(l, "\"post")) != NULL ||
1123 (req = strstr(l, "\"head")) != NULL)
1124 {
1125 req++;
1126 } else {
1127 req = "";
1128 }
1129 /* ref */
1130 if ((ref = strstr(l, "\"http")) != NULL ||
1131 (ref = strstr(l, "\"HTTP")) != NULL)
1132 {
1133 ref++;
1134 } else {
1135 ref = "";
1136 }
1137
1138 /* Nul-term the components */
1139
1140 /* host */
1141 if ((p = strchr(host, ' ')) == NULL) return 1;
1142 *p = '\0';
1143 /* date */
1144 if ((p = strchr(date, ']')) == NULL) return 1;
1145 *p = '\0';
1146 ll->time = parse_date(date, &ll->tm);
1147 if (ll->time == (time_t)-1) return 1;
1148 /* hour */
1149 if ((p = strchr(date, ':')) == NULL) return 1;
1150 hour = p+1;
1151 *p = '\0';
1152 /* timezone */
1153 if ((p = strchr(hour, ' ')) == NULL) return 1;
1154 timezone = p+1;
1155 *p = '\0';
1156 /* req */
1157 if ((p = strchr(req, '"')) == NULL) {
1158 req = "";
1159 } else {
1160 req_end = p;
1161 *p = '\0';
1162 if ((p = strchr(req, ' ')) != NULL) {
1163 req = p+1;
1164 if ((p = strchr(req, ' ')) != NULL)
1165 *p = '\0';
1166 }
1167 }
1168 /* ref */
1169 if ((p = strchr(ref, '"')) == NULL) {
1170 ref = "";
1171 } else {
1172 ref_end = p;
1173 *p = '\0';
1174 }
1175 /* agent */
1176 if (agent_without_parens) {
1177 /* User agent without (...) inside in a string with six '"' chars.
1178 * Just search for the end. */
1179 char *aux = strchr(agent_start+1, '"');
1180 if (!aux) {
1181 /* No way! */
1182 agent = "";
1183 } else {
1184 *aux = '\0';
1185 agent = agent_start+1;
1186 }
1187 } else if ((p = strchr(agent, ')')) == NULL) {
1188 agent = "";
1189 } else {
1190 char *aux;
1191
1192 aux = strchr(p, '"');
1193 if (aux)
1194 *aux = '\0';
1195 else
1196 *(p+1) = '\0';
1197 if (agent_start) {
1198 if ((!req_end || (req_end != agent_start)) &&
1199 (!ref_end || (ref_end != agent_start))) {
1200 agent = agent_start+1;
1201 }
1202 }
1203 }
1204
1205 /* Fill the struture */
1206 ll->host = host;
1207 ll->date = date;
1208 ll->hour = hour;
1209 ll->timezone = timezone;
1210 ll->agent = agent;
1211 ll->req = req;
1212 ll->ref = ref;
1213 return 0;
1214 }
1215
1216 /* process the weekday and hour information */
vi_process_date_and_hour(struct vih * vih,int weekday,int hour)1217 void vi_process_date_and_hour(struct vih *vih, int weekday, int hour)
1218 {
1219 /* Note, the following sanity check is useless in theory. */
1220 if (weekday < 0 || weekday > 6 || hour < 0 || hour > 23) return;
1221 vih->weekday[weekday]++;
1222 vih->hour[hour]++;
1223 /* store the combined info. We always compute this information
1224 * even if the report is disabled because it's cheap. */
1225 vih->weekdayhour[weekday][hour]++;
1226 }
1227
1228 /* process the month and day information */
vi_process_month_and_day(struct vih * vih,int month,int day)1229 void vi_process_month_and_day(struct vih *vih, int month, int day)
1230 {
1231 if (month < 0 || month > 11 || day < 0 || day > 30) return;
1232 vih->monthday[month][day]++;
1233 }
1234
1235 /* Process unique visitors populating the relative hash table.
1236 * Return non-zero on out of memory. This is also used to populate
1237 * the hashtable used for the "pageviews per user" statistics.
1238 *
1239 * Note that the last argument 'seen', is an integer passed by reference
1240 * that is set to '1' if this is not a new visit (otherwise it's set to zero) */
vi_process_visitors_per_day(struct vih * vih,char * host,char * agent,char * date,char * ref,char * req,int * seen)1241 int vi_process_visitors_per_day(struct vih *vih, char *host, char *agent, char *date, char *ref, char *req, int *seen)
1242 {
1243 char visday[VI_LINE_MAX], *p, *month = "fixme if I'm here!";
1244 char buf[64];
1245 int res, host_len, agent_len, date_len, hash_len;
1246 unsigned long h;
1247
1248 /* Ignore visits from Bots */
1249 if (vi_is_bot_agent(agent)) {
1250 if (seen != NULL) seen = 0;
1251 return 0;
1252 }
1253
1254 /* Build an unique identifier for this visit
1255 * adding together host, date and hash(user agent) */
1256 host_len = strlen(host);
1257 agent_len = strlen(agent);
1258 date_len = strlen(date);
1259 h = djb_hash((unsigned char*) agent, agent_len);
1260 sprintf(buf, "%lu", h);
1261 hash_len = strlen(buf);
1262 if (host_len+agent_len+date_len+4 > VI_LINE_MAX)
1263 return 0;
1264 p = visday;
1265 memcpy(p, host, host_len); p += host_len;
1266 *p++ = '|';
1267 memcpy(p, date, date_len); p += date_len;
1268 *p++ = '|';
1269 memcpy(p, buf, hash_len); p += hash_len;
1270 *p = '\0';
1271 /* fprintf(stderr, "%s\n", visday); */
1272
1273 if (Config_process_monthly_visitors) {
1274 /* Skip the day number. */
1275 month = strchr(date, '/');
1276 if (!month) return 0; /* should never happen */
1277 month++;
1278 }
1279
1280 /* Visits with Google as referer are also stored in another hash
1281 * table. */
1282 if (vi_is_google_link(ref)) {
1283 res = vi_counter_incr(&vih->googlevisitors, visday);
1284 if (res == 0) return 1; /* out of memory */
1285 if (res == 1) { /* new visit! */
1286 res = vi_counter_incr(&vih->googledate, date);
1287 if (res == 0) return 1; /* out of memory */
1288 if (Config_process_monthly_visitors) {
1289 res = vi_counter_incr(&vih->googlemonth, month);
1290 if (res == 0) return 1; /* out of memory */
1291 }
1292 }
1293 }
1294 /* Populate the 'pageviews per visitor' hash table */
1295 if (Config_process_pageviews && vi_is_pageview(req)) {
1296 res = vi_counter_incr(&vih->pageviews, visday);
1297 if (res == 0) return 1; /* out of memory */
1298 }
1299 /* Mark the visit in the non-google-specific hashtable */
1300 res = vi_counter_incr(&vih->visitors, visday);
1301 if (res == 0) return 1; /* out of memory */
1302 if (res > 1) {
1303 if (seen) *seen = 1;
1304 return 0; /* visit alredy seen. */
1305 }
1306 if (seen) *seen = 0; /* new visitor */
1307 res = vi_counter_incr(&vih->date, date);
1308 if (res == 0) return 1;
1309 if (Config_process_monthly_visitors) {
1310 res = vi_counter_incr(&vih->month, month);
1311 if (res == 0) return 1;
1312 }
1313 return 0;
1314 }
1315
1316 /* Process referers populating the relative hash tables.
1317 * Return non-zero on out of memory. */
vi_process_referer(struct vih * vih,char * ref,time_t age)1318 int vi_process_referer(struct vih *vih, char *ref, time_t age)
1319 {
1320 int res;
1321
1322 /* Check the url against the blacklist if needed
1323 * this can be very slow... */
1324 if (Config_filter_spam && vi_is_blacklisted_url(vih, ref))
1325 return 0;
1326 /* Don't count internal referer (specified by the user
1327 * using --prefix options), nor google referers. */
1328 if (vi_is_internal_link(ref))
1329 return !vi_counter_incr(&vih->referers, "Internal Link");
1330 if (vi_is_google_link(ref))
1331 return !vi_counter_incr(&vih->referers, "Google Search Engine");
1332 res = vi_counter_incr(&vih->referers, ref);
1333 if (res == 0) return 1;
1334 /* Process the referers age if enabled */
1335 if (Config_process_referers_age) {
1336 if (vi_replace_if_older(&vih->referersage, ref, age)) return 1;
1337 }
1338 return 0;
1339 }
1340
1341 /* Process requested URLs. Split the entries in two hash tables,
1342 * one for pages and one for images.
1343 * Return non-zero on out of memory. */
vi_process_page_request(struct vih * vih,char * url)1344 int vi_process_page_request(struct vih *vih, char *url)
1345 {
1346 int res;
1347 char urldecoded[VI_LINE_MAX];
1348
1349 vi_urldecode(urldecoded, url, VI_LINE_MAX);
1350 if (vi_is_image(url))
1351 res = vi_counter_incr(&vih->images, urldecoded);
1352 else
1353 res = vi_counter_incr(&vih->pages, urldecoded);
1354 if (res == 0) return 1;
1355 return 0;
1356 }
1357
1358 /* Process log lines for 404 errors report. */
vi_process_error404(struct vih * vih,char * l,char * url,int * is404)1359 int vi_process_error404(struct vih *vih, char *l, char *url, int *is404)
1360 {
1361 char urldecoded[VI_LINE_MAX];
1362
1363 if (is404) *is404 = 0;
1364 vi_urldecode(urldecoded, url, VI_LINE_MAX);
1365 if (strstr(l, " 404 ") && !strstr(l, " 200 ")) {
1366 if (is404) *is404 = 1;
1367 return !vi_counter_incr(&vih->error404, urldecoded);
1368 }
1369 return 0;
1370 }
1371
1372 /* Process agents populating the relative hash table.
1373 * Return non-zero on out of memory. */
vi_process_agents(struct vih * vih,char * agent)1374 int vi_process_agents(struct vih *vih, char *agent)
1375 {
1376 int res;
1377
1378 res = vi_counter_incr(&vih->agents, agent);
1379 if (res == 0) return 1;
1380 return 0;
1381 }
1382
/* Match the list of keywords 't' against the string 's', and if
 * a match is found increment the matching keyword in the hashtable.
 *
 * 't' is a sequence of (pattern, label) pairs terminated by a NULL
 * pattern. The first pattern that is a substring of 's' wins; an empty
 * pattern matches everything. The counter incremented is the one of
 * the label, or of the pattern itself when the label is NULL.
 * Return zero on success, non-zero on out of memory. */
int vi_counter_incr_matchtable(struct hashtable *ht, char *s, char **t)
{
    char **pair;

    for (pair = t; pair[0] != NULL; pair += 2) {
        char *pattern = pair[0];
        char *label;

        if (pattern[0] != '\0' && strstr(s, pattern) == NULL)
            continue; /* no match, try the next pair */

        label = pair[1];
        if (label == NULL) label = pattern;
        return vi_counter_incr(ht, label) == 0;
    }
    return 0;
}
1400
1401 /* Process Operating Systems populating the relative hash table.
1402 * Return non-zero on out of memory. */
vi_process_os(struct vih * vih,char * agent)1403 int vi_process_os(struct vih *vih, char *agent)
1404 {
1405 /* Order may matter. */
1406 char *oslist[] = {
1407 "Windows", NULL,
1408 "Win98", "Windows",
1409 "Win95", "Windows",
1410 "WinNT", "Windows",
1411 "Win32", "Windows",
1412 "Linux", NULL,
1413 "-linux-", "Linux",
1414 "Macintosh", NULL,
1415 "Mac_PowerPC", "Macintosh",
1416 "SunOS", NULL,
1417 "FreeBSD", NULL,
1418 "OpenBSD", NULL,
1419 "NetBSD", NULL,
1420 "BEOS", NULL,
1421 "", "Unknown",
1422 NULL, NULL,
1423 };
1424 return vi_counter_incr_matchtable(&vih->os, agent, oslist);
1425 }
1426
1427 /* Process browsers information. */
vi_process_browsers(struct vih * vih,char * agent)1428 int vi_process_browsers(struct vih *vih, char *agent)
1429 {
1430 /* Note that the order matters. For example Safari
1431 * send an user agent where there is the string "Gecko"
1432 * so it must be before Gecko. */
1433 char *browserslist[] = {
1434 "Opera", NULL,
1435 "MSIE 4", "Explorer 4.x",
1436 "MSIE 5", "Explorer 5.x",
1437 "MSIE 6", "Explorer 6.x",
1438 "MSIE", "Explorer unknown version",
1439 "Safari", NULL,
1440 "Konqueror", NULL,
1441 "Galeon", NULL,
1442 "Firefox", NULL,
1443 "MultiZilla", NULL,
1444 "Gecko", "Other Mozilla based",
1445 "Wget", NULL,
1446 "Lynx", NULL,
1447 "Links ", "Links",
1448 "ELinks ", "Links",
1449 "Elinks ", "Links",
1450 "w3m", "W3M",
1451 "NATSU-MICAN", NULL,
1452 "msnbot", "MSNbot",
1453 "Slurp", "Yahoo Slurp",
1454 "Jeeves", "Ask Jeeves",
1455 "ZyBorg", NULL,
1456 "asteria", NULL,
1457 "contype", "Explorer",
1458 "Gigabot", NULL,
1459 "Windows-Media-Player", "Windows-MP",
1460 "NSPlayer", NULL,
1461 "Googlebot", "GoogleBot",
1462 "googlebot", "GoogleBot",
1463 "", "Unknown",
1464 NULL, NULL,
1465 };
1466 return vi_counter_incr_matchtable(&vih->browsers, agent, browserslist);
1467 }
1468
1469 /* Process req/agents to get information about pages retrivied by Google.
1470 * Return non-zero on out of memory. */
vi_process_googled(struct vih * vih,char * req,char * agent,time_t age)1471 int vi_process_googled(struct vih *vih, char *req, char *agent, time_t age)
1472 {
1473 if (vi_is_googlebot_agent(agent)) {
1474 return vi_replace_if_newer(&vih->googled, req, age);
1475 } else if (vi_is_adsensebot_agent(agent)) {
1476 return vi_replace_if_newer(&vih->adsensed, req, age);
1477 }
1478 return 0;
1479 }
1480
1481 /* Process screen resolution and color depth info, if the javascript
1482 * code needed was inserted in the pages (see the README file). */
vi_process_screen_info(struct vih * vih,char * req)1483 int vi_process_screen_info(struct vih *vih, char *req) {
1484 char *p;
1485
1486 if ((p = strstr(req, "visitors-screen-res-check.jpg?"))) {
1487 char buf[64];
1488
1489 p += 30;
1490 if (p[0] == '\0' || strstr(p, "undefined")) goto parseerror;
1491 vi_strlcpy(buf, p, 64);
1492 /* The string is somethink like: 1024x768x32, so we
1493 * search for the second 'x' char. */
1494 p = strchr(buf,'x'); if (!p) goto parseerror;
1495 p = strchr(p+1,'x'); if (!p) goto parseerror;
1496 *p = '\0'; p++;
1497 /* Populate the screen resolution hash table */
1498 if (vi_counter_incr(&vih->screenres, buf) == 0)
1499 return 1;
1500 /* ... and the screen color depth one. */
1501 if (vi_counter_incr(&vih->screendepth, p) == 0)
1502 return 1;
1503 }
1504 parseerror:
1505 return 0;
1506 }
1507
1508 /* Process accesses with the referer from google.
1509 * This is used to populate the keyphrases hashtable.
1510 * TODO: url decoding */
vi_process_google_keyphrases(struct vih * vih,char * ref,time_t age)1511 int vi_process_google_keyphrases(struct vih *vih, char *ref, time_t age)
1512 {
1513 char *s, *p, *e;
1514 int res, page;
1515 char urldecoded[VI_LINE_MAX];
1516 char buf[64];
1517
1518 if (!vi_is_google_link(ref)) return 0;
1519 /* Try to process gogoe human language info first. */
1520 if (Config_process_google_human_language) {
1521 s = strstr(ref+18, "&hl=");
1522 if (s == NULL) s = strstr(ref+18, "?hl=");
1523 if (s && s[4] && s[5]) {
1524 buf[0] = s[4];
1525 buf[1] = s[5];
1526 buf[2] = '\0';
1527 if (vi_counter_incr(&vih->googlehumanlanguage, buf) == 0)
1528 return 1;
1529 }
1530 }
1531
1532 /* It's possible to start the search for the query 18 chars
1533 * after the start of the referer because all the
1534 * google links will start with "http://www.google.". */
1535 if ((s = strstr(ref+18, "?q=")) == NULL &&
1536 (s = strstr(ref+18, "&q=")) == NULL) return 0;
1537 if ((p = strstr(ref+18, "&start=")) == NULL)
1538 p = strstr(ref+18, "?start=");
1539 if ((e = strchr(s+3, '&')) != NULL)
1540 *e = '\0';
1541 if (p && (e = strchr(p+7, '&')) != NULL)
1542 *e = '\0';
1543 if (!strncmp(s+3, "cache:", 6))
1544 return !vi_counter_incr(&vih->googlekeyphrases, "Google Cache Access");
1545 vi_urldecode(urldecoded, s+3, VI_LINE_MAX);
1546 vi_strtolower(urldecoded);
1547 page = p ? (1+(atoi(p+7)/10)) : 1;
1548 snprintf(buf, 64, " (page %d)", page);
1549 buf[63] = '\0';
1550 vi_strlcat(urldecoded, buf, VI_LINE_MAX);
1551 res = vi_counter_incr(&vih->googlekeyphrases, urldecoded);
1552 if (e) *e = '&';
1553 if (res == 0) return 1;
1554 /* Process keyphrases by first time */
1555 if (Config_process_google_keyphrases_age) {
1556 if (vi_replace_if_older(&vih->googlekeyphrasesage,
1557 urldecoded, age)) return 1;
1558 }
1559 return 0;
1560 }
1561
1562 /* Process robots information. For visitors every client accessing
1563 * to robots.txt is considered a robot.
1564 * Returns 1 on out of memory, otherwise zero is returned. */
vi_process_robots(struct vih * vih,char * req,char * agent)1565 int vi_process_robots(struct vih *vih, char *req, char *agent)
1566 {
1567 if (strncmp(req, "/robots.txt", 11) != 0) return 0;
1568 if (strstr(agent, "MSIECrawler")) return 0;
1569 return !vi_counter_incr(&vih->robots, agent);
1570 }
1571
1572 /* Process referer -> request pairs for web trails */
vi_process_web_trails(struct vih * vih,char * ref,char * req)1573 int vi_process_web_trails(struct vih *vih, char *ref, char *req)
1574 {
1575 int res, plen, google;
1576 char buf[VI_LINE_MAX];
1577 char *src;
1578
1579 if (vi_is_image(req)) return 0;
1580 plen = vi_is_internal_link(ref);
1581 google = vi_is_google_link(ref);
1582 if (plen) {
1583 src = (ref[plen] == '\0') ? "/" : ref+plen;
1584 } else if (google) {
1585 if (Config_graphviz_ignorenode_google) return 0;
1586 src = "Google";
1587 } else if (ref[0] != '\0') {
1588 if (Config_graphviz_ignorenode_external) return 0;
1589 src = "External Link";
1590 } else {
1591 if (Config_graphviz_ignorenode_noreferer) return 0;
1592 src = "No Referer";
1593 }
1594 if (!strcmp(src, req)) return 0; /* avoid self references */
1595
1596 snprintf(buf, VI_LINE_MAX, "%s -> %s", src, req);
1597 buf[VI_LINE_MAX-1] = '\0';
1598 res = vi_counter_incr(&vih->trails, buf);
1599 if (res == 0) return 1;
1600 return 0;
1601 }
1602
1603 /* Process Top Level Domains.
1604 * Returns zero on success. Non zero is returned on out of memory. */
vi_process_tld(struct vih * vih,char * hostname)1605 int vi_process_tld(struct vih *vih, char *hostname)
1606 {
1607 char *tld;
1608 int res;
1609
1610 if (vi_is_numeric_address(hostname)) {
1611 tld = "numeric IP";
1612 } else {
1613 tld = strrchr(hostname, '.');
1614 if (!tld) return 0;
1615 tld++;
1616 }
1617 res = vi_counter_incr(&vih->tld, tld);
1618 if (res == 0) return 1;
1619 return 0;
1620 }
1621
1622 /* Match a log line against --grep and --exclude patters to check
1623 * if the line must be processed or not. */
vi_match_line(char * line)1624 int vi_match_line(char *line)
1625 {
1626 int i;
1627
1628 for (i = 0; i < Config_grep_pattern_num; i++) {
1629 char *pattern = Config_grep_pattern[i].pattern;
1630 int nocase = 1;
1631
1632 /* Patterns starting with 'cs:' are matched in a case-sensitive
1633 * way after the 'cs:' prefix is discarded. */
1634 if (pattern[0] == 'c' && pattern[1] == 's' && pattern[2] == ':') {
1635 nocase = 0;
1636 pattern += 3;
1637 }
1638 if (vi_match(Config_grep_pattern[i].pattern, line, nocase)) {
1639 if (Config_grep_pattern[i].type == VI_PATTERNTYPE_EXCLUDE)
1640 return 0;
1641 } else {
1642 if (Config_grep_pattern[i].type == VI_PATTERNTYPE_GREP)
1643 return 0;
1644 }
1645 }
1646 return 1;
1647 }
1648
1649 /* Process a line of log. Returns non-zero on error. */
vi_process_line(struct vih * vih,char * l)1650 int vi_process_line(struct vih *vih, char *l)
1651 {
1652 struct logline ll;
1653 char origline[VI_LINE_MAX];
1654
1655 /* Test the line against --grep --exclude patterns before
1656 * to process it. */
1657 if (Config_grep_pattern_num) {
1658 if (vi_match_line(l) == 0)
1659 return 0; /* No match? skip. */
1660 }
1661
1662 vih->processed++;
1663 /* Take a copy of the original log line before to
1664 * copy it. Will be useful for some processing.
1665 * Do it only if required in order to speedup. */
1666 if (Config_process_error404 || Config_debug)
1667 vi_strlcpy(origline, l, VI_LINE_MAX);
1668 /* Split the line and run all the selected processing. */
1669 if (vi_parse_line(&ll, l) == 0) {
1670 int seen, is404;
1671
1672 /* We process 404 errors first, in order to skip
1673 * all the other reports if --ignore-404 option is active. */
1674 if (Config_process_error404 &&
1675 vi_process_error404(vih, origline, ll.req, &is404))
1676 goto oom;
1677 /* Process screen info if needed. */
1678 if (Config_process_screen_info && is404)
1679 if (vi_process_screen_info(vih, ll.req)) goto oom;
1680 /* 404 error AND --ignore-404? Stop processing of this line. */
1681 if (Config_ignore_404 && is404)
1682 return 0;
1683
1684 /* Now it's time to process unique visitors. The 'save'
1685 * local var saves if this log line is about a new visit
1686 * or not. Some report is generated only against the first
1687 * line of every visitor, other reports are generated
1688 * for every single log line. */
1689 if (vi_process_visitors_per_day(vih, ll.host, ll.agent,
1690 ll.date, ll.ref, ll.req, &seen))
1691 goto oom;
1692
1693 /* The following are processed for every log line */
1694 if (vi_process_page_request(vih, ll.req)) goto oom;
1695 if (Config_process_google &&
1696 vi_process_googled(vih, ll.req, ll.agent, ll.time))
1697 goto oom;
1698 if (Config_process_web_trails &&
1699 vi_process_web_trails(vih, ll.ref, ll.req)) goto oom;
1700 if (Config_process_google_keyphrases &&
1701 vi_process_google_keyphrases(vih, ll.ref, ll.time))
1702 goto oom;
1703
1704 /* The following are processed only for new visits */
1705 if (seen) return 0;
1706 vi_process_date_and_hour(vih, (ll.tm.tm_wday+6)%7,
1707 ll.tm.tm_hour);
1708 vi_process_month_and_day(vih, ll.tm.tm_mon, ll.tm.tm_mday-1);
1709 if (vi_process_referer(vih, ll.ref, ll.time)) goto oom;
1710 if (Config_process_agents &&
1711 vi_process_agents(vih, ll.agent)) goto oom;
1712 if (Config_process_os &&
1713 vi_process_os(vih, ll.agent)) goto oom;
1714 if (Config_process_browsers &&
1715 vi_process_browsers(vih, ll.agent)) goto oom;
1716 if (Config_process_tld &&
1717 vi_process_tld(vih, ll.host)) goto oom;
1718 if (Config_process_robots &&
1719 vi_process_robots(vih, ll.req, ll.agent)) goto oom;
1720 return 0;
1721 } else {
1722 vih->invalid++;
1723 if (Config_debug)
1724 fprintf(stderr, "Invalid line: %s\n", origline);
1725 return 0;
1726 }
1727 oom:
1728 vi_set_error(vih, "Out of memory processing data");
1729 return 1;
1730 }
1731
1732 /* Process the specified log file. Returns zero on success.
1733 * On error non zero is returned and an error is set in the handle. */
vi_scan(struct vih * vih,char * filename)1734 int vi_scan(struct vih *vih, char *filename)
1735 {
1736 FILE *fp;
1737 char buf[VI_LINE_MAX];
1738 int use_stdin = 0;
1739
1740 if (filename[0] == '-' && filename[1] == '\0') {
1741 /* If we are in stream mode, just return. Stdin
1742 * is implicit in this mode and will be read
1743 * after all the other files are processed. */
1744 if (Config_stream_mode) return 0;
1745 fp = stdin;
1746 use_stdin = 1;
1747 } else {
1748 if ((fp = fopen(filename, "r")) == NULL) {
1749 vi_set_error(vih, "Unable to open '%s': '%s'", filename, strerror(errno));
1750 return 1;
1751 }
1752 }
1753 while (fgets(buf, VI_LINE_MAX, fp) != NULL) {
1754 if (vi_process_line(vih, buf)) {
1755 fclose(fp);
1756 fprintf(stderr, "%s: %s\n", filename, vi_get_error(vih));
1757 return 1;
1758 }
1759 }
1760 if (!use_stdin)
1761 fclose(fp);
1762 vih->endt = time(NULL);
1763 return 0;
1764 }
1765
1766 /* Postprocessing of pageviews per visit data.
1767 * The source hashtable entries are in the form: uniqe-visitor -> pageviews.
1768 * After the postprocessing we obtain another hashtable in the form:
1769 * pageviews-range -> quantity. This hashtable can be used directly
1770 * with generic output functions to generate the output. */
vi_postprocess_pageviews(struct vih * vih)1771 int vi_postprocess_pageviews(struct vih *vih)
1772 {
1773 void **table;
1774 int len = ht_used(&vih->pageviews), i;
1775
1776 if ((table = ht_get_array(&vih->pageviews)) == NULL) {
1777 fprintf(stderr, "Out of memory in vi_postprocess_pageviews()\n");
1778 return 1;
1779 }
1780 /* Run the hashtable in order to populate 'pageviews_grouped' */
1781 for (i = 0; i < len; i++) {
1782 int pv = (long) table[(i*2)+1]; /* pageviews of visit */
1783 int res;
1784 char *key;
1785
1786 if (pv == 1) key = "1";
1787 else if (pv == 2) key = "2";
1788 else if (pv == 3) key = "3";
1789 else if (pv == 4) key = "4";
1790 else if (pv == 5) key = "5";
1791 else if (pv == 6) key = "6";
1792 else if (pv == 7) key = "7";
1793 else if (pv == 8) key = "8";
1794 else if (pv == 9) key = "9";
1795 else if (pv == 10) key = "10";
1796 else if (pv >= 11 && pv <= 20) key = "11-20";
1797 else if (pv >= 21 && pv <= 30) key = "21-30";
1798 else key = "> 30";
1799
1800 res = vi_counter_incr(&vih->pageviews_grouped, key);
1801 if (res == 0) {
1802 free(table);
1803 return 1; /* out of memory */
1804 }
1805 }
1806 free(table);
1807 return 0;
1808 }
1809
/* This function is called from vi_print_report() in order to run the
 * postprocessing of the raw data collected that is needed to generate
 * the reports. Returns zero on success. On failure an "Out of memory"
 * error is set in the handle and non-zero is returned. */
int vi_postprocess(struct vih *vih)
{
    if (vi_postprocess_pageviews(vih) == 0)
        return 0;
    vi_set_error(vih, "Out of memory");
    return 1;
}
1820
1821 /* ---------------------------- text output module -------------------------- */
/* The text output has no header: nothing is written. */
void om_text_print_header(FILE *fp)
{
    (void) fp; /* unused */
}
1827
/* The text output has no footer: nothing is written. */
void om_text_print_footer(FILE *fp)
{
    (void) fp; /* unused */
}
1833
/* Print the title of a report, in the form "=== title ===". */
void om_text_print_title(FILE *fp, char *title)
{
    fputs("=== ", fp);
    fprintf(fp, "%s", title);
    fputs(" ===\n", fp);
}
1838
/* Print the subtitle of a report, in the form "--- subtitle". */
void om_text_print_subtitle(FILE *fp, char *subtitle)
{
    fputs("--- ", fp);
    fprintf(fp, "%s", subtitle);
    fputc('\n', fp);
}
1843
/* Print a "key: numeric value" information line, as "* key: val". */
void om_text_print_numkey_info(FILE *fp, char *key, int val)
{
    fprintf(fp, "* %s: ", key);
    fprintf(fp, "%d\n", val);
}
1848
/* Print a numbered "key1: key2" entry, as "num) key1: key2". */
void om_text_print_keykey_entry(FILE *fp, char *key1, char *key2, int num)
{
    fprintf(fp, "%d) ", num);
    fprintf(fp, "%s: %s\n", key1, key2);
}
1853
/* Print a numbered "key: numeric value" entry, as "num) key: val".
 * The 'link' argument is not used by the text output. */
void om_text_print_numkey_entry(FILE *fp, char *key, int val, char *link,
        int num)
{
    (void) link; /* unused */
    fprintf(fp, "%d) ", num);
    fprintf(fp, "%s: %d\n", key, val);
}
1860
/* Print a bar, c1 and c2 are the chars used for the left (filled) and
 * right (empty) parts. Max is the maximum value of the bar, the filled
 * length is printed to be proportional to this/max over 'cols' chars.
 * tot is the "total" needed to compute the percentage value, that is
 * printed after the bar.
 *
 * The filled length is clamped to the 0..cols range, so values of
 * 'this' that are negative or greater than 'max' can't write outside
 * the bar buffer. Nothing is printed if the buffer can't be allocated. */
void om_text_print_bar(FILE *fp, int max, int tot, int this, int cols,
        char c1, char c2)
{
    int l;
    float p;
    char *bar;

    if (cols < 0) cols = 0;
    /* avoid divisions by zero */
    if (tot == 0) tot++;
    if (max == 0) max++;
    l = ((float)(cols*this))/max;
    if (l < 0) l = 0;
    if (l > cols) l = cols;
    p = ((float)(100*this))/tot;
    bar = malloc(cols+1);
    if (!bar) return;
    memset(bar, c2, cols);
    memset(bar, c1, l);
    bar[cols] = '\0';
    fprintf(fp, "%s %02.1f%%", bar, p);
    free(bar);
}
1883
/* Print a "key: value" entry followed by a bar of 44 columns that is
 * proportional to this/max, and by the percentage of 'this' over
 * 'tot'. */
void om_text_print_numkeybar_entry(FILE *fp, char *key, int max, int tot, int this)
{
    enum { bar_cols = 44 };

    fprintf(fp, " %-12s: %-9d |", key, this);
    om_text_print_bar(fp, max, tot, this, bar_cols, '#', ' ');
    fputc('\n', fp);
}
1890
/* Print a "key: value" entry followed by a bar of 44 columns where the
 * filled part is proportional to this/tot, and by the percentage of
 * 'this' over 'tot'. */
void om_text_print_numkeycomparativebar_entry(FILE *fp, char *key, int tot, int this)
{
    enum { bar_cols = 44 };

    fprintf(fp, " %s: %-10d |", key, this);
    om_text_print_bar(fp, tot, tot, this, bar_cols, '#', '.');
    fputc('\n', fp);
}
1897
/* Print a bidimensional map of values as ASCII art.
 *
 * 'value' is an array of xlen*ylen integers stored row by row, so
 * value[(y*xlen)+x] is the cell at column x of row y. Every row is
 * printed on a line, prefixed by its label in 'ylabel', using for
 * every cell a char of the " .-+#" palette selected proportionally to
 * the greatest value in the map. The labels in 'xlabel' are then
 * printed vertically under the map, one char per line.
 *
 * If the memory needed to print the x-labels can't be allocated, the
 * map is printed without them. */
void om_text_print_bidimentional_map(FILE *fp, int xlen, int ylen,
        char **xlabel, char **ylabel, int *value)
{
    char *asciipal = " .-+#";
    int pallen = strlen(asciipal);
    int x, y, l, max = 0;
    char **p;

    /* Get the max value */
    l = xlen*ylen;
    for (x = 0; x < l; x++)
        if (max < value[x])
            max = value[x];
    if (max == 0) max++; /* avoid division by zero */
    /* print the map */
    for (y = 0; y < ylen; y++) {
        fprintf(fp, "%15s: ", ylabel[y]);
        for (x = 0; x < xlen; x++) {
            int coloridx;
            int val = value[(y*xlen)+x];

            coloridx = ((pallen-1)*val)/max;
            fputc(asciipal[coloridx], fp);
        }
        fprintf(fp, "\n");
    }
    fprintf(fp, "\n");

    /* print the x-labels in vertical */
    p = malloc(sizeof(char*)*xlen);
    /* A NULL result is an error only if there is something to store:
     * with no columns the array is never accessed. */
    if (p == NULL && xlen > 0) return;
    /* The 'p' pointers array is initialized at the
     * start of all the x-labels. */
    for (x = 0; x < xlen; x++)
        p[x] = xlabel[x];
    while(1) {
        /* number of labels that still had a char to print */
        int sentinel = 0;

        fprintf(fp, "%15s ", "");
        for (x = 0; x < xlen; x++) {
            if (*(p[x]) != '\0') {
                fputc(*(p[x]), fp);
                p[x]++;
                sentinel++;
            } else {
                fputc(' ', fp);
            }
        }
        fputc('\n', fp);
        if (sentinel == 0) break;
    }
    free(p);
}
1949
/* The horizontal line of the text output is just an empty line. */
void om_text_print_hline(FILE *fp)
{
    fputc('\n', fp);
}
1954
om_text_print_credits(FILE * fp)1955 void om_text_print_credits(FILE *fp)
1956 {
1957 fprintf(fp, "Statistics generated with VISITORS version %s\n"
1958 "http://www.hping.org/visitors for more information\n",
1959 VI_VERSION_STR);
1960 }
1961
/* Print the name of a report as an entry of the reports index, in the
 * form "-> report". */
void om_text_print_report_link(FILE *fp, char *report)
{
    fputs("-> ", fp);
    fprintf(fp, "%s", report);
    fputc('\n', fp);
}
1967
1968 struct outputmodule OutputModuleText = {
1969 om_text_print_header,
1970 om_text_print_footer,
1971 om_text_print_title,
1972 om_text_print_subtitle,
1973 om_text_print_numkey_info,
1974 om_text_print_keykey_entry,
1975 om_text_print_numkey_entry,
1976 om_text_print_numkeybar_entry,
1977 om_text_print_numkeycomparativebar_entry,
1978 om_text_print_bidimentional_map,
1979 om_text_print_hline,
1980 om_text_print_credits,
1981 om_text_print_report_link,
1982 };
1983
1984 /* ---------------------------- html output module -------------------------- */
/* Use html entities for special chars. Abbreviates at 'maxlen' if needed.
 *
 * The string 's' is written to 'fp' with the chars ' " & < > replaced
 * by the corresponding HTML entities. At most 'maxlen' chars of the
 * source string are written: if the string is longer, "..." is
 * appended in place of the remaining part. A negative 'maxlen'
 * disables the abbreviation. */
void om_html_entities_abbr(FILE *fp, char *s, int maxlen)
{
    for (; *s != '\0'; s++) {
        if (maxlen-- == 0) {
            fputs("...", fp);
            break;
        }
        switch(*s) {
        case '\'': fputs("&#39;", fp); break;
        case '"': fputs("&quot;", fp); break;
        case '&': fputs("&amp;", fp); break;
        case '<': fputs("&lt;", fp); break;
        case '>': fputs("&gt;", fp); break;
        default: fputc(*s, fp); break;
        }
    }
}
2004
2005 /* A wrapper to om_html_entities_abbr() with a fixed abbreviation length */
om_html_entities(FILE * fp,char * s)2006 void om_html_entities(FILE *fp, char *s)
2007 {
2008 om_html_entities_abbr(fp, s, VI_HTML_ABBR_LEN);
2009 }
2010
/* Print the header of the HTML output: the head of the document with
 * the embedded style sheet used by all the reports, and the opening
 * tags of the body and of the main table. */
void om_html_print_header(FILE *fp)
{
    /* The text contains no '%' char, so it can be written verbatim. */
    static const char page_head[] =
        "<html>\n"
        "<head>\n"
        "<style>\n"
        "BODY, TD, B, LI, U, DIV, SPAN {\n"
        " background-color: #ffffff;\n"
        " color: #000000;\n"
        " font-family: Verdana, Arial, Helvetica, Sans-Serif;\n"
        " font-size: 10px;\n"
        "}\n"
        "A {\n"
        " color: #0066ff;\n"
        " text-decoration: none;\n"
        "}\n"
        "A:visited {\n"
        " color: #000099;\n"
        " text-decoration: none;\n"
        "}\n"
        "A:active {\n"
        " color: #26a0be;\n"
        " text-decoration: none;\n"
        "}\n"
        "A:hover {\n"
        " color: #ffffff;\n"
        " text-decoration: none;\n"
        " background-color: #26a0be;\n"
        "}\n"
        ".barfill {\n"
        " background-color: #96ef94;\n"
        " border-left: 1px;\n"
        " border-right: 1px;\n"
        " border-top: 1px;\n"
        " border-bottom: 1px;\n"
        " border-color: #4c934a;\n"
        " border-style: solid;\n"
        " font-size: 10px;\n"
        " height: 3px;\n"
        " line-height: 4px;\n"
        "}\n"
        ".barempty {\n"
        " font-size: 10px;\n"
        " line-height: 4px;\n"
        "}\n"
        ".barleft {\n"
        " background-color: #ff9696;\n"
        " border-left: 1px;\n"
        " border-right: 1px;\n"
        " border-top: 1px;\n"
        " border-bottom: 1px;\n"
        " border-color: #4c934a;\n"
        " border-style: solid;\n"
        " font-size: 10px;\n"
        " height: 3px;\n"
        " line-height: 4px;\n"
        "}\n"
        ".barright {\n"
        " background-color: #f8f8f8;\n"
        " border-left: 0px;\n"
        " border-right: 1px;\n"
        " border-top: 1px;\n"
        " border-bottom: 1px;\n"
        " border-color: #4c934a;\n"
        " border-style: solid;\n"
        " font-size: 10px;\n"
        " height: 3px;\n"
        " line-height: 4px;\n"
        "}\n"
        ".title {\n"
        " background-color: #007f9e;\n"
        " font-size: 12px;\n"
        " font-weight: bold;\n"
        " padding: 3px;\n"
        " color: #ffffff;\n"
        "}\n"
        ".reportlink {\n"
        " background-color: #ffffff;\n"
        " font-size: 12px;\n"
        " font-weight: bold;\n"
        " color: #000000;\n"
        " padding-left: 3px;\n"
        "}\n"
        ".subtitle {\n"
        " background-color: #007f9e;\n"
        " font-size: 12px;\n"
        " font-weight: normal;\n"
        " padding: 3px;\n"
        " color: #ffffff;\n"
        "}\n"
        ".info {\n"
        " background-color: #badfee;\n"
        " font-size: 12px;\n"
        " padding-left: 3px;\n"
        " padding-right: 3px;\n"
        "}\n"
        ".keyentry {\n"
        " font-size: 10px;\n"
        " padding-left: 2px;\n"
        " border-bottom: 1px dashed #bcbcbc;\n"
        "}\n"
        ".keyentrywe {\n"
        " background-color: #f0f090;\n"
        " font-size: 10px;\n"
        " padding-left: 2px;\n"
        " border-bottom: 1px dashed #bcbcbc;\n"
        "}\n"
        ".valueentry {\n"
        " font-size: 10px;\n"
        " padding-left: 2px;\n"
        " color: #905d14;\n"
        " border-bottom: 1px dashed #f6c074;\n"
        "}\n"
        ".credits {\n"
        " font-size: 12px;\n"
        " font-weight: bold;\n"
        "}\n"
        ".maintable {\n"
        " border-style: solid;\n"
        " border-color: #0b4b5b;\n"
        " border-width: 1px;\n"
        "}\n"
        "</style>\n"
        "</head>\n"
        "<body><table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" class=\"maintable\">\n";

    fputs(page_head, fp);
}
2138
/* Write the closing </table>, </body> and </html> tags to 'fp'. */
void om_html_print_footer(FILE *fp)
{
    fputs("</table></body></html>\n", fp);
}
2143
/* Print a report title row. The raw title is also used as the name of an
 * empty anchor, while the visible text goes through om_html_entities(). */
void om_html_print_title(FILE *fp, char *title)
{
    fprintf(fp,
            "<tr><td align=\"center\" class=\"title\" colspan=\"3\">"
            "<a name=\"%s\"></a>",
            title);
    om_html_entities(fp, title);
    fputs("</td></tr>\n", fp);
}
2150
/* Print a full-width, centered subtitle row; the text is written with
 * om_html_entities(). */
void om_html_print_subtitle(FILE *fp, char *subtitle)
{
    fputs("<tr><td align=\"center\" class=\"subtitle\" colspan=\"3\">", fp);
    om_html_entities(fp, subtitle);
    fputs("</td></tr>\n", fp);
}
2157
/* Print an informational row made of a label ('key', written with
 * om_html_entities()) followed by the integer 'val'. */
void om_html_print_numkey_info(FILE *fp, char *key, int val)
{
    fputs("<tr><td align=\"left\" colspan=\"3\" class=\"info\">", fp);
    om_html_entities(fp, key);
    fprintf(fp, " %d</td></tr>\n", val);
}
2165
/* Write 's' as the value of a double-quoted HTML attribute, replacing the
 * characters that could end the attribute or start new markup. Keys and
 * links come from log data, so they must never reach the page raw. */
static void om_html_attr_escape(FILE *fp, const char *s)
{
    for (; *s != '\0'; s++) {
        switch (*s) {
        case '&':  fputs("&amp;", fp);  break;
        case '<':  fputs("&lt;", fp);   break;
        case '>':  fputs("&gt;", fp);   break;
        case '"':  fputs("&quot;", fp); break;
        case '\'': fputs("&#39;", fp);  break;
        default:   fputc(*s, fp);       break;
        }
    }
}

/* Return non-zero when 's' starts with "http://". */
static int om_html_is_http_url(const char *s)
{
    return strncmp(s, "http://", 7) == 0;
}

/* Print 'text' with om_html_entities(); when 'href' is not NULL the text
 * is wrapped in a link whose target is the attribute-escaped 'href'. */
static void om_html_print_linked_text(FILE *fp, char *href, char *text)
{
    if (href == NULL) {
        om_html_entities(fp, text);
        return;
    }
    fputs("<a class=\"url\" href=\"", fp);
    om_html_attr_escape(fp, href);
    fputs("\">", fp);
    om_html_entities(fp, text);
    fputs("</a>", fp);
}

/* Print a numbered row with two textual columns.
 *
 * fp:   output stream
 * key1: text of the first column
 * key2: text of the second column; it becomes a link to itself when it
 *       starts with "http://"
 * num:  ordinal shown in the leading cell as "num)" */
void om_html_print_keykey_entry(FILE *fp, char *key1, char *key2, int num)
{
    /* The ordinal cell is explicitly closed before the next one starts. */
    fprintf(fp, "<tr><td align=\"left\" class=\"keyentry\">%d)</td>", num);
    fputs("<td align=\"left\" class=\"valueentry\">", fp);
    om_html_entities(fp, key1);
    fputs("</td><td align=\"left\" class=\"keyentry\">", fp);
    om_html_print_linked_text(fp,
                              om_html_is_http_url(key2) ? key2 : NULL,
                              key2);
    fputs("</td></tr>\n", fp);
}

/* Print a numbered row with a numeric column and a textual column.
 *
 * fp:   output stream
 * key:  text of the last column
 * val:  integer shown in the middle column
 * link: optional link target for 'key'; when NULL, 'key' links to itself
 *       if it starts with "http://", otherwise it is printed as plain text
 * num:  ordinal shown in the leading cell as "num)" */
void om_html_print_numkey_entry(FILE *fp, char *key, int val, char *link,
                                int num)
{
    char *href = link;

    if (href == NULL && om_html_is_http_url(key))
        href = key;

    /* The ordinal cell is explicitly closed before the next one starts. */
    fprintf(fp, "<tr><td align=\"left\" class=\"keyentry\">%d)</td>", num);
    fprintf(fp, "<td align=\"left\" class=\"valueentry\">%d", val);
    fputs("</td><td align=\"left\" class=\"keyentry\">", fp);
    om_html_print_linked_text(fp, href, key);
    fputs("</td></tr>\n", fp);
}
2204
/* Print a 400 pixel wide, two-cell table used as a horizontal bar.
 *
 * fp:         output stream
 * l:          width of the left cell in percent; values outside 0..100 are
 *             clamped so that neither cell gets a negative or >100% width
 * leftclass:  CSS class of the left cell
 * rightclass: CSS class of the right cell (its width is 100-l percent)
 *
 * A cell with a non-zero width is filled with "&nbsp;": a cell holding
 * only ordinary whitespace is treated as empty when rendered. */
void om_html_print_bar(FILE *fp, int l, char *leftclass, char *rightclass)
{
    if (l < 0)
        l = 0;
    else if (l > 100)
        l = 100;

    fputs("<table cellpadding=\"0\" cellspacing=\"0\" width=\"400\" border=\"0\">\n", fp);
    fprintf(fp, "<tr><td align=\"center\" class=\"%s\" width=\"%d%%\">%s</td>\n",
            leftclass, l, (l != 0) ? "&nbsp;" : "");
    fprintf(fp, "<td align=\"center\" class=\"%s\" width=\"%d%%\">%s</td></tr>\n",
            rightclass, 100 - l, (l != 100) ? "&nbsp;" : "");
    fputs("</table>\n", fp);
}
2212
/* Print a row with a key, a counter with its percentage, and a bar.
 *
 * fp:   output stream
 * key:  row label; when vi_is_weekend(key) is true the "keyentrywe" class
 *       is used for the label cell instead of "keyentry"
 * max:  value that corresponds to a full bar (0 is treated as 1)
 * tot:  value used as 100% for the printed percentage (0 is treated as 1)
 * this: value of this row */
void om_html_print_numkeybar_entry(FILE *fp, char *key, int max, int tot, int this)
{
    const char *row_open;
    float percent;
    int barlen;

    /* Avoid dividing by zero. */
    if (tot == 0) tot = 1;
    if (max == 0) max = 1;
    barlen = ((float)(100*this))/max;
    percent = ((float)(100*this))/tot;

    row_open = vi_is_weekend(key)
        ? "<tr><td align=\"left\" class=\"keyentrywe\">"
        : "<tr><td align=\"left\" class=\"keyentry\">";
    fputs(row_open, fp);
    om_html_entities(fp, key);
    fputs(" </td><td align=\"left\" class=\"valueentry\">", fp);
    fprintf(fp, "%d (%02.1f%%)", this, percent);
    fputs("</td><td align=\"left\" class=\"bar\">", fp);
    om_html_print_bar(fp, barlen, "barfill", "barempty");
    fputs("</td></tr>\n", fp);
}
2235
/* Print a row with a key, a counter with its percentage of 'tot', and a
 * two-colour bar whose left part is that percentage.
 *
 * fp:   output stream
 * key:  row label; when vi_is_weekend(key) is true the "keyentrywe" class
 *       is used for the label cell instead of "keyentry"
 * tot:  value used as 100% (0 is treated as 1)
 * this: value of this row */
void om_html_print_numkeycomparativebar_entry(FILE *fp, char *key, int tot, int this)
{
    const char *row_open;
    float percent;
    int barlen;

    if (tot == 0) tot = 1; /* avoid dividing by zero */
    percent = ((float)(100*this))/tot;
    barlen = (int) percent;

    row_open = vi_is_weekend(key)
        ? "<tr><td align=\"left\" class=\"keyentrywe\">"
        : "<tr><td align=\"left\" class=\"keyentry\">";
    fputs(row_open, fp);
    om_html_entities(fp, key);
    fputs(" </td><td align=\"left\" class=\"valueentry\">", fp);
    fprintf(fp, "%d (%02.1f%%)", this, percent);
    fputs("</td><td align=\"left\" class=\"bar\">", fp);
    om_html_print_bar(fp, barlen, "barleft", "barright");
    fputs("</td></tr>\n", fp);
}
2257
/* Print a two-dimensional map as a table of coloured cells.
 *
 * fp:     output stream
 * xlen:   number of columns
 * ylen:   number of rows
 * xlabel: 'xlen' column labels, printed in a final row under the map
 * ylabel: 'ylen' row labels, printed in the first cell of every row
 * value:  xlen*ylen values stored row by row (index y*xlen + x)
 *
 * Every cell colour is #AABBFF scaled by value/max, where max is the
 * largest value of the map. Cells contain "&nbsp;" because a cell holding
 * only ordinary whitespace is treated as empty and collapses. */
void om_html_print_bidimentional_map(FILE *fp, int xlen, int ylen,
                                     char **xlabel, char **ylabel, int *value)
{
    int x, y, cells, max = 0;

    /* Get the max value */
    cells = xlen*ylen;
    for (x = 0; x < cells; x++) {
        if (value[x] > max)
            max = value[x];
    }
    if (max == 0) max = 1; /* avoid division by zero */

    /* Print the map */
    fputs("<tr><td colspan=\"3\" align=\"center\">", fp);
    fputs("<table border=\"0\" cellpadding=\"0\" cellspacing=\"0\">", fp);
    for (y = 0; y < ylen; y++) {
        fputs("<tr>", fp);
        fprintf(fp, "<td class=\"valueentry\">%s</td>", ylabel[y]);
        for (x = 0; x < xlen; x++) {
            /* Scale in a wider type: 0xFF*val can exceed the range of
             * int when the counters are large. Negative values are
             * drawn as zero. */
            long long val = value[(y*xlen)+x];
            unsigned int r, g, b;

            if (val < 0) val = 0;
            r = (unsigned int)((0xAA*val)/max);
            g = (unsigned int)((0xBB*val)/max);
            b = (unsigned int)((0xFF*val)/max);
            fprintf(fp, "<td style=\"background-color: #%02X%02X%02X;\">&nbsp;</td>\n", r, g, b);
        }
        fputs("</tr>\n", fp);
    }

    /* Column labels, preceded by an empty cell under the row labels */
    fputs("<tr><td>&nbsp;</td>", fp);
    for (x = 0; x < xlen; x++)
        fprintf(fp, "<td class=\"keyentry\">%s</td>", xlabel[x]);
    fputs("</tr></table></td></tr>", fp);
}
2292
/* Print an empty full-width row used as a vertical separator between
 * reports. The cell holds "&nbsp;" so the row keeps the height of a text
 * line: a cell with only ordinary whitespace is treated as empty. */
void om_html_print_hline(FILE *fp)
{
    fputs("<tr><td colspan=\"3\">&nbsp;</td></tr>", fp);
}
2297
om_html_print_credits(FILE * fp)2298 void om_html_print_credits(FILE *fp)
2299 {
2300 fprintf(fp, "<tr><td colspan=\"3\" align=\"center\" class=\"credits\">Statistics generated with <a href=\"http://www.hping.org/visitors\">VISITORS Web Log Analyzer</a> version %s\n</td></tr>", VI_VERSION_STR);
2301 }
2302
/* Print a row containing a link to the anchor named after 'report'. The
 * raw name is used as the "#fragment" target, while the visible text is
 * written with om_html_entities(). */
void om_html_print_report_link(FILE *fp, char *report)
{
    fprintf(fp,
            "<tr><td align=\"left\" class=\"reportlink\" colspan=\"3\">"
            "<a href=\"#%s\">",
            report);
    om_html_entities(fp, report);
    fputs("</a></td></tr>\n", fp);
}
2310
2311 struct outputmodule OutputModuleHtml = {
2312 om_html_print_header,
2313 om_html_print_footer,
2314 om_html_print_title,
2315 om_html_print_subtitle,
2316 om_html_print_numkey_info,
2317 om_html_print_keykey_entry,
2318 om_html_print_numkey_entry,
2319 om_html_print_numkeybar_entry,
2320 om_html_print_numkeycomparativebar_entry,
2321 om_html_print_bidimentional_map,
2322 om_html_print_hline,
2323 om_html_print_credits,
2324 om_html_print_report_link,
2325 };
2326
2327
2328 /* ---------------------------------- output -------------------------------- */
vi_print_statistics(struct vih * vih)2329 void vi_print_statistics(struct vih *vih)
2330 {
2331 time_t elapsed = vih->endt - vih->startt;
2332
2333 if (elapsed == 0) elapsed++;
2334 fprintf(stderr, "--\n%d lines processed in %ld seconds\n"
2335 "%d invalid lines, %d blacklisted referers\n",
2336 vih->processed, (long) elapsed,
2337 vih->invalid, vih->blacklisted);
2338 }
2339
vi_print_hours_report(FILE * fp,struct vih * vih)2340 void vi_print_hours_report(FILE *fp, struct vih *vih)
2341 {
2342 int i, max = 0, tot = 0;
2343 for (i = 0; i < 24; i++) {
2344 if (vih->hour[i] > max)
2345 max = vih->hour[i];
2346 tot += vih->hour[i];
2347 }
2348 Output->print_title(fp, "Hours distribution");
2349 Output->print_subtitle(fp, "Percentage of hits in every hour of the day");
2350 for (i = 0; i < 24; i++) {
2351 char buf[8];
2352 sprintf(buf, "%02d", i);
2353 Output->print_numkeybar_entry(fp, buf, max, tot, vih->hour[i]);
2354 }
2355 }
2356
vi_print_weekdays_report(FILE * fp,struct vih * vih)2357 void vi_print_weekdays_report(FILE *fp, struct vih *vih)
2358 {
2359 int i, max = 0, tot = 0;
2360 for (i = 0; i < 7; i++) {
2361 if (vih->weekday[i] > max)
2362 max = vih->weekday[i];
2363 tot += vih->weekday[i];
2364 }
2365 Output->print_title(fp, "Weekdays distribution");
2366 Output->print_subtitle(fp, "Percentage of hits in every day of the week");
2367 for (i = 0; i < 7; i++) {
2368 Output->print_numkeybar_entry(fp, vi_wdname[i], max, tot, vih->weekday[i]);
2369 }
2370 }
2371
/* Generic date comparison for qsort(3), used through the wrappers below.
 *
 * 'a' and 'b' point to arrays of pointers; the element at index 'off' of
 * each is a date string handed to parse_date(). The result is the usual
 * <0, 0, >0 ordering multiplied by 'mul', so mul = -1 reverses the order.
 * A date that parse_date() rejects (result (time_t)-1) compares greater
 * than any valid one; two rejected dates compare equal. */
int qsort_cmp_dates_generic(const void *a, const void *b, int off, int mul)
{
    const time_t invalid = (time_t)-1;
    char *first = ((void**) a)[off];
    char *second = ((void**) b)[off];
    time_t tfirst = parse_date(first, NULL);
    time_t tsecond = parse_date(second, NULL);

    if (tfirst == invalid || tsecond == invalid) {
        if (tfirst == tsecond) return 0; /* both invalid */
        return (tfirst == invalid) ? 1*mul : -1*mul;
    }
    if (tfirst == tsecond) return 0;
    return (tfirst > tsecond) ? 1*mul : -1*mul;
}
2391
/* qsort(3) callback comparing dates in the log format stored in the key
 * part (index 0) of each entry, in the direct order given by
 * qsort_cmp_dates_generic(). */
int qsort_cmp_dates_key(const void *a, const void *b)
{
    const int key_slot = 0;
    const int direct_order = 1;

    return qsort_cmp_dates_generic(a, b, key_slot, direct_order);
}
2397
2398 /* Compare dates (only the month/year part) in the log format:
2399 * hashtable key part version */
qsort_cmp_months_key(const void * a,const void * b)2400 int qsort_cmp_months_key(const void *a, const void *b)
2401 {
2402 int ret;
2403 char dateA[VI_DATE_MAX];
2404 char dateB[VI_DATE_MAX];
2405 void *savedA, *savedB; /* backups of the original pointers */
2406 void **A = (void**) a;
2407 void **B = (void**) b;
2408
2409 /* We use an hack here, in order to call qsort_cmp_dates_generic
2410 * even in this case, we substitute the hashtable entries
2411 * with versions of the strings prefixed with "01", so they
2412 * will be parseble by parse_date().
2413 * In pratice for "May/2004" we instead put "01/May/2004" and so on. */
2414 savedA = *A;
2415 savedB = *B;
2416 dateA[0] = dateB[0] = '0';
2417 dateA[1] = dateB[1] = '1';
2418 dateA[2] = dateB[2] = '/';
2419 dateA[3] = dateB[3] = '\0';
2420 vi_strlcat(dateA, (char*)*A, VI_DATE_MAX);
2421 vi_strlcat(dateB, (char*)*B, VI_DATE_MAX);
2422 *A = dateA;
2423 *B = dateB;
2424 ret = qsort_cmp_dates_generic(a, b, 0, 1);
2425 /* Restore */
2426 *A = savedA;
2427 *B = savedB;
2428 return ret;
2429 }
2430
/* qsort(3) callback comparing dates in the log format stored in the value
 * part (index 1) of each entry. The order is reversed: more recent dates
 * sort first. */
int qsort_cmp_dates_value(const void *a, const void *b)
{
    const int value_slot = 1;
    const int reversed_order = -1;

    return qsort_cmp_dates_generic(a, b, value_slot, reversed_order);
}
2437
/* qsort(3) callback for entries made of two pointers, where the second
 * one holds a long stored directly in the pointer. Entries with a larger
 * value sort first. */
int qsort_cmp_long_value(const void *a, const void *b)
{
    long first = (long) ((void**) a)[1];
    long second = (long) ((void**) b)[1];

    /* -1 when first is larger, 1 when it is smaller, 0 when equal. */
    return (first < second) - (first > second);
}
2448
/* qsort(3) callback for entries made of two pointers, where the second
 * one holds a time_t stored directly in the pointer. Entries with a
 * larger (more recent) time sort first. */
int qsort_cmp_time_value(const void *a, const void *b)
{
    time_t first = (time_t) ((void**) a)[1];
    time_t second = (time_t) ((void**) b)[1];

    if (first == second)
        return 0;
    return (first > second) ? -1 : 1;
}
2459
vi_print_visits_report(FILE * fp,struct vih * vih)2460 void vi_print_visits_report(FILE *fp, struct vih *vih)
2461 {
2462 int days = ht_used(&vih->date), i, tot = 0, max = 0;
2463 int months;
2464 void **table;
2465
2466 Output->print_title(fp, "Unique visitors in each day");
2467 Output->print_subtitle(fp, "Multiple hits with the same IP, user agent and access day, are considered a single visit");
2468 Output->print_numkey_info(fp, "Number of unique visitors",
2469 ht_used(&vih->visitors));
2470 Output->print_numkey_info(fp, "Different days in logfile",
2471 ht_used(&vih->date));
2472
2473 if ((table = ht_get_array(&vih->date)) == NULL) {
2474 fprintf(stderr, "Out Of Memory in print_visits_report()\n");
2475 return;
2476 }
2477 qsort(table, days, sizeof(void*)*2, qsort_cmp_dates_key);
2478 for (i = 0; i < days; i++) {
2479 long value = (long) table[(i*2)+1];
2480 if (value > max)
2481 max = value;
2482 tot += value;
2483 }
2484 for (i = 0; i < days; i++) {
2485 char *key = table[i*2];
2486 long value = (long) table[(i*2)+1];
2487 Output->print_numkeybar_entry(fp, key, max, tot, value);
2488 }
2489 free(table);
2490 Output->print_hline(fp);
2491
2492 /* Montly */
2493 if (Config_process_monthly_visitors == 0) return;
2494 tot = max = 0;
2495 months = ht_used(&vih->month);
2496 Output->print_title(fp, "Unique visitors in each month");
2497 Output->print_subtitle(fp, "Multiple hits with the same IP, user agent and access day, are considered a single visit");
2498 Output->print_numkey_info(fp, "Number of unique visitors",
2499 ht_used(&vih->visitors));
2500 Output->print_numkey_info(fp, "Different months in logfile",
2501 ht_used(&vih->month));
2502
2503 if ((table = ht_get_array(&vih->month)) == NULL) {
2504 fprintf(stderr, "Out Of Memory in print_visits_report()\n");
2505 return;
2506 }
2507 qsort(table, months, sizeof(void*)*2, qsort_cmp_months_key);
2508 for (i = 0; i < months; i++) {
2509 long value = (long) table[(i*2)+1];
2510 if (value > max)
2511 max = value;
2512 tot += value;
2513 }
2514 for (i = 0; i < months; i++) {
2515 char *key = table[i*2];
2516 long value = (long) table[(i*2)+1];
2517 Output->print_numkeybar_entry(fp, key, max, tot, value);
2518 }
2519 free(table);
2520 }
2521
2522 /* A report to compare visits originating from google VS all the rest. */
vi_print_googlevisits_report(FILE * fp,struct vih * vih)2523 void vi_print_googlevisits_report(FILE *fp, struct vih *vih)
2524 {
2525 int days = ht_used(&vih->date), i, months;
2526 void **table;
2527
2528 Output->print_title(fp, "Unique visitors from Google in each day");
2529 Output->print_subtitle(fp, "The red part of the bar expresses the percentage of visits originated from Google");
2530 Output->print_numkey_info(fp, "Number of unique visitors",
2531 ht_used(&vih->visitors));
2532 Output->print_numkey_info(fp, "Number of unique visitors from google",
2533 ht_used(&vih->googlevisitors));
2534 Output->print_numkey_info(fp, "Different days in logfile",
2535 ht_used(&vih->date));
2536
2537 if ((table = ht_get_array(&vih->date)) == NULL) {
2538 fprintf(stderr, "Out Of Memory in print_visits_report()\n");
2539 return;
2540 }
2541 qsort(table, days, sizeof(void*)*2, qsort_cmp_dates_key);
2542 for (i = 0; i < days; i++) {
2543 char *key = table[i*2];
2544 long value = (long) table[(i*2)+1];
2545 long googlevalue;
2546
2547 googlevalue = vi_counter_val(&vih->googledate, key);
2548 Output->print_numkeycomparativebar_entry(fp, key, value, googlevalue);
2549 }
2550 free(table);
2551 Output->print_hline(fp);
2552
2553 /* Montly */
2554 if (Config_process_monthly_visitors == 0) return;
2555 months = ht_used(&vih->month);
2556 Output->print_title(fp, "Unique visitors from Google in each month");
2557 Output->print_subtitle(fp, "The red part of the bar expresses the percentage of visits originated from Google");
2558 Output->print_numkey_info(fp, "Number of unique visitors",
2559 ht_used(&vih->visitors));
2560 Output->print_numkey_info(fp, "Number of unique visitors from google",
2561 ht_used(&vih->googlevisitors));
2562 Output->print_numkey_info(fp, "Different months in logfile",
2563 ht_used(&vih->month));
2564
2565 if ((table = ht_get_array(&vih->month)) == NULL) {
2566 fprintf(stderr, "Out Of Memory in print_visits_report()\n");
2567 return;
2568 }
2569 qsort(table, months, sizeof(void*)*2, qsort_cmp_months_key);
2570 for (i = 0; i < months; i++) {
2571 char *key = table[i*2];
2572 long value = (long) table[(i*2)+1];
2573 long googlevalue;
2574
2575 googlevalue = vi_counter_val(&vih->googlemonth, key);
2576 Output->print_numkeycomparativebar_entry(fp, key, value, googlevalue);
2577 }
2578 free(table);
2579 }
2580
vi_print_generic_keyval_report(FILE * fp,char * title,char * subtitle,char * info,int maxlines,struct hashtable * ht,int (* compar)(const void *,const void *))2581 void vi_print_generic_keyval_report(FILE *fp, char *title, char *subtitle,
2582 char *info, int maxlines,
2583 struct hashtable *ht,
2584 int(*compar)(const void *, const void *))
2585 {
2586 int items = ht_used(ht), i;
2587 void **table;
2588
2589 Output->print_title(fp, title);
2590 Output->print_subtitle(fp, subtitle);
2591 Output->print_numkey_info(fp, info, items);
2592 if ((table = ht_get_array(ht)) == NULL) {
2593 fprintf(stderr, "Out of memory in print_generic_report()\n");
2594 return;
2595 }
2596 qsort(table, items, sizeof(void*)*2, compar);
2597 for (i = 0; i < items; i++) {
2598 char *key = table[i*2];
2599 long value = (long) table[(i*2)+1];
2600 if (i >= maxlines) break;
2601 if (key[0] == '\0')
2602 Output->print_numkey_entry(fp, "none", value, NULL,
2603 i+1);
2604 else
2605 Output->print_numkey_entry(fp, key, value, NULL, i+1);
2606 }
2607 free(table);
2608 }
2609
vi_print_generic_keyvalbar_report(FILE * fp,char * title,char * subtitle,char * info,int maxlines,struct hashtable * ht,int (* compar)(const void *,const void *))2610 void vi_print_generic_keyvalbar_report(FILE *fp, char *title, char *subtitle,
2611 char *info, int maxlines,
2612 struct hashtable *ht,
2613 int(*compar)(const void *, const void *))
2614 {
2615 int items = ht_used(ht), i, max = 0, tot = 0;
2616 void **table;
2617
2618 Output->print_title(fp, title);
2619 Output->print_subtitle(fp, subtitle);
2620 Output->print_numkey_info(fp, info, items);
2621 if ((table = ht_get_array(ht)) == NULL) {
2622 fprintf(stderr, "Out of memory in print_generic_report()\n");
2623 return;
2624 }
2625 qsort(table, items, sizeof(void*)*2, compar);
2626 for (i = 0; i < items; i++) {
2627 long value = (long) table[(i*2)+1];
2628 tot += value;
2629 if (value > max) max = value;
2630 }
2631 for (i = 0; i < items; i++) {
2632 char *key = table[i*2];
2633 long value = (long) table[(i*2)+1];
2634 if (i >= maxlines) break;
2635 if (key[0] == '\0')
2636 Output->print_numkeybar_entry(fp, "none", max, tot, value);
2637 else
2638 Output->print_numkeybar_entry(fp, key, max, tot, value);
2639 }
2640 free(table);
2641 }
2642
2643 /* This is similar to the generic key/val report, but
2644 * different enough to be better served by a specific function. */
vi_print_keyphrases_report(FILE * fp,char * title,char * subtitle,char * info,int maxlines,struct hashtable * ht,int (* compar)(const void *,const void *))2645 void vi_print_keyphrases_report(FILE *fp, char *title, char *subtitle,
2646 char *info, int maxlines,
2647 struct hashtable *ht,
2648 int(*compar)(const void *, const void *))
2649 {
2650 int items = ht_used(ht), i;
2651 void **table;
2652
2653 Output->print_title(fp, title);
2654 Output->print_subtitle(fp, subtitle);
2655 Output->print_numkey_info(fp, info, items);
2656 if ((table = ht_get_array(ht)) == NULL) {
2657 fprintf(stderr, "Out of memory in print_keyphrases_report()\n");
2658 return;
2659 }
2660 qsort(table, items, sizeof(void*)*2, compar);
2661 for (i = 0; i < items; i++) {
2662 char *key = table[i*2];
2663 long value = (long) table[(i*2)+1];
2664 if (i >= maxlines) break;
2665 if (key[0] == '\0')
2666 Output->print_numkey_entry(fp, "none", value, NULL,
2667 i+1);
2668 else {
2669 char *p;
2670 char link[VI_LINE_MAX];
2671 char aux[VI_LINE_MAX];
2672 char encodedkey[VI_LINE_MAX];
2673
2674 vi_strlcpy(link, "http://www.google.com/search?q=", VI_LINE_MAX);
2675 vi_strlcpy(aux, key, VI_LINE_MAX);
2676 p = strrchr(aux, '(');
2677 if (p) {
2678 if (p > aux) p--; /* seek the space on left */
2679 *p = '\0';
2680 }
2681 vi_urlencode(encodedkey, aux, VI_LINE_MAX);
2682 vi_strlcat(link, encodedkey, VI_LINE_MAX);
2683 Output->print_numkey_entry(fp, key, value, link, i+1);
2684 }
2685 }
2686 free(table);
2687 }
2688
vi_print_referers_report(FILE * fp,struct vih * vih)2689 void vi_print_referers_report(FILE *fp, struct vih *vih)
2690 {
2691 vi_print_generic_keyval_report(
2692 fp,
2693 "Referers",
2694 "Referers ordered by visits (google excluded)",
2695 "Different referers",
2696 Config_max_referers,
2697 &vih->referers,
2698 qsort_cmp_long_value);
2699 }
2700
vi_print_pages_report(FILE * fp,struct vih * vih)2701 void vi_print_pages_report(FILE *fp, struct vih *vih)
2702 {
2703 vi_print_generic_keyval_report(
2704 fp,
2705 "Requested pages",
2706 "Page requests ordered by hits",
2707 "Different pages requested",
2708 Config_max_pages,
2709 &vih->pages,
2710 qsort_cmp_long_value);
2711 }
2712
vi_print_error404_report(FILE * fp,struct vih * vih)2713 void vi_print_error404_report(FILE *fp, struct vih *vih)
2714 {
2715 vi_print_generic_keyval_report(
2716 fp,
2717 "404 Errors",
2718 "Requests for missing documents",
2719 "Different missing documents requested",
2720 Config_max_error404,
2721 &vih->error404,
2722 qsort_cmp_long_value);
2723 }
2724
vi_print_pageviews_report(FILE * fp,struct vih * vih)2725 void vi_print_pageviews_report(FILE *fp, struct vih *vih)
2726 {
2727 vi_print_generic_keyvalbar_report(
2728 fp,
2729 "Pageviews per visit",
2730 "Number of pages requested per visit",
2731 "Only documents are counted (not images). Reported ranges:",
2732 100,
2733 &vih->pageviews_grouped,
2734 qsort_cmp_long_value);
2735 }
2736
vi_print_images_report(FILE * fp,struct vih * vih)2737 void vi_print_images_report(FILE *fp, struct vih *vih)
2738 {
2739 vi_print_generic_keyval_report(
2740 fp,
2741 "Requested images and CSS",
2742 "Images and CSS requests ordered by hits",
2743 "Different images and CSS requested",
2744 Config_max_images,
2745 &vih->images,
2746 qsort_cmp_long_value);
2747 }
2748
vi_print_agents_report(FILE * fp,struct vih * vih)2749 void vi_print_agents_report(FILE *fp, struct vih *vih)
2750 {
2751 vi_print_generic_keyval_report(
2752 fp,
2753 "User agents",
2754 "The entire user agent string ordered by visits",
2755 "Different agents",
2756 Config_max_agents,
2757 &vih->agents,
2758 qsort_cmp_long_value);
2759 }
2760
vi_print_os_report(FILE * fp,struct vih * vih)2761 void vi_print_os_report(FILE *fp, struct vih *vih)
2762 {
2763 vi_print_generic_keyvalbar_report(
2764 fp,
2765 "Operating Systems",
2766 "Operating Systems by visits",
2767 "Different operating systems listed",
2768 100,
2769 &vih->os,
2770 qsort_cmp_long_value);
2771 }
2772
vi_print_browsers_report(FILE * fp,struct vih * vih)2773 void vi_print_browsers_report(FILE *fp, struct vih *vih)
2774 {
2775 vi_print_generic_keyvalbar_report(
2776 fp,
2777 "Browsers",
2778 "Browsers used by visits",
2779 "Different browsers listed",
2780 100,
2781 &vih->browsers,
2782 qsort_cmp_long_value);
2783 }
2784
vi_print_trails_report(FILE * fp,struct vih * vih)2785 void vi_print_trails_report(FILE *fp, struct vih *vih)
2786 {
2787 vi_print_generic_keyval_report(
2788 fp,
2789 "Web trails",
2790 "Referer -> Target common moves",
2791 "Total number of trails",
2792 Config_max_trails,
2793 &vih->trails,
2794 qsort_cmp_long_value);
2795 }
2796
vi_print_google_keyphrases_report(FILE * fp,struct vih * vih)2797 void vi_print_google_keyphrases_report(FILE *fp, struct vih *vih)
2798 {
2799 vi_print_keyphrases_report(
2800 fp,
2801 "Google Keyphrases",
2802 "Keyphrases used in google searches ordered by visits",
2803 "Total number of keyphrases",
2804 Config_max_google_keyphrases,
2805 &vih->googlekeyphrases,
2806 qsort_cmp_long_value);
2807 }
2808
vi_print_tld_report(FILE * fp,struct vih * vih)2809 void vi_print_tld_report(FILE *fp, struct vih *vih)
2810 {
2811 vi_print_generic_keyvalbar_report(
2812 fp,
2813 "Domains",
2814 "Top Level Domains sorted by visits",
2815 "Total number of Top Level Domains",
2816 Config_max_tld,
2817 &vih->tld,
2818 qsort_cmp_long_value);
2819 }
2820
vi_print_robots_report(FILE * fp,struct vih * vih)2821 void vi_print_robots_report(FILE *fp, struct vih *vih)
2822 {
2823 vi_print_generic_keyval_report(
2824 fp,
2825 "Robots and web spiders",
2826 "Agents requesting robots.txt. MSIECrawler excluded.",
2827 "Total number of different robots",
2828 Config_max_robots,
2829 &vih->robots,
2830 qsort_cmp_long_value);
2831 }
2832
2833 /* Print a generic report where the two report items are strings
2834 * (usually url and date). Used to print the 'googled' and 'referers age'
2835 * reports. */
vi_print_generic_keytime_report(FILE * fp,char * title,char * subtitle,char * info,int maxlines,struct hashtable * ht,int (* compar)(const void *,const void *))2836 void vi_print_generic_keytime_report(FILE *fp, char *title, char *subtitle,
2837 char *info, int maxlines,
2838 struct hashtable *ht,
2839 int(*compar)(const void *, const void *))
2840 {
2841 int items = ht_used(ht), i;
2842 void **table;
2843
2844 Output->print_title(fp, title);
2845 Output->print_subtitle(fp, subtitle);
2846 Output->print_numkey_info(fp, info, items);
2847 if ((table = ht_get_array(ht)) == NULL) {
2848 fprintf(stderr, "Out Of Memory in print_generic_keytime_report()\n");
2849 return;
2850 }
2851 qsort(table, items, sizeof(void*)*2, compar);
2852 for (i = 0; i < items; i++) {
2853 struct tm *tm;
2854 char ftime[1024];
2855 char *url = table[i*2];
2856 time_t time = (time_t) table[(i*2)+1];
2857 if (i >= maxlines) break;
2858 tm = localtime(&time);
2859 if (tm) {
2860 ftime[0] = '\0';
2861 strftime(ftime, 1024, "%d/%b/%Y", tm);
2862 Output->print_keykey_entry(fp, ftime,
2863 (url[0] == '\0') ? "none" : url, i+1);
2864 }
2865 }
2866 free(table);
2867 }
2868
vi_print_googled_report(FILE * fp,struct vih * vih)2869 void vi_print_googled_report(FILE *fp, struct vih *vih)
2870 {
2871 vi_print_generic_keytime_report(
2872 fp,
2873 "Googled pages",
2874 "Pages accessed by the Google crawler, last access reported",
2875 "Number of pages googled",
2876 Config_max_googled,
2877 &vih->googled,
2878 qsort_cmp_time_value);
2879 }
2880
vi_print_adsensed_report(FILE * fp,struct vih * vih)2881 void vi_print_adsensed_report(FILE *fp, struct vih *vih)
2882 {
2883 vi_print_generic_keytime_report(
2884 fp,
2885 "Adsensed pages",
2886 "Pages accessed by the Adsense crawler, last access reported",
2887 "Number of pages adsensed",
2888 Config_max_adsensed,
2889 &vih->adsensed,
2890 qsort_cmp_time_value);
2891 }
2892
vi_print_referers_age_report(FILE * fp,struct vih * vih)2893 void vi_print_referers_age_report(FILE *fp, struct vih *vih)
2894 {
2895 vi_print_generic_keytime_report(
2896 fp,
2897 "Referers by first time",
2898 "Referers ordered by first time date, newer on top (referers from google excluded)",
2899 "Different referers",
2900 Config_max_referers_age,
2901 &vih->referersage,
2902 qsort_cmp_time_value);
2903 }
2904
vi_print_google_keyphrases_age_report(FILE * fp,struct vih * vih)2905 void vi_print_google_keyphrases_age_report(FILE *fp, struct vih *vih)
2906 {
2907 vi_print_generic_keytime_report(
2908 fp,
2909 "Google Keyphrases by first time",
2910 "Keyphrases ordered by first time date, newer on top",
2911 "Different referers",
2912 Config_max_google_keyphrases_age,
2913 &vih->googlekeyphrasesage,
2914 qsort_cmp_time_value);
2915 }
2916
vi_print_google_human_language_report(FILE * fp,struct vih * vih)2917 void vi_print_google_human_language_report(FILE *fp, struct vih *vih)
2918 {
2919 vi_print_generic_keyval_report(
2920 fp,
2921 "Google Human Language",
2922 "The 'hl' field in the query string of google searches",
2923 "Different human languages",
2924 1000,
2925 &vih->googlehumanlanguage,
2926 qsort_cmp_long_value);
2927 }
2928
vi_print_screen_res_report(FILE * fp,struct vih * vih)2929 void vi_print_screen_res_report(FILE *fp, struct vih *vih) {
2930 vi_print_generic_keyval_report(
2931 fp,
2932 "Screen resolution",
2933 "user screen width x height resolution",
2934 "Different resolutions",
2935 1000,
2936 &vih->screenres,
2937 qsort_cmp_long_value);
2938 }
2939
vi_print_screen_depth_report(FILE * fp,struct vih * vih)2940 void vi_print_screen_depth_report(FILE *fp, struct vih *vih) {
2941 vi_print_generic_keyval_report(
2942 fp,
2943 "Screen color depth",
2944 "user screen color depth in bits per pixel",
2945 "Different color depths",
2946 1000,
2947 &vih->screendepth,
2948 qsort_cmp_long_value);
2949 }
2950
vi_print_information_report(FILE * fp,struct vih * vih)2951 void vi_print_information_report(FILE *fp, struct vih *vih)
2952 {
2953 char buf[VI_LINE_MAX];
2954 time_t now = time(NULL);
2955 snprintf(buf, VI_LINE_MAX, "Generated: %s", ctime(&now));
2956 Output->print_title(fp, "General information");
2957 Output->print_subtitle(fp, "Information about analyzed log files");
2958 Output->print_subtitle(fp, buf);
2959 Output->print_numkey_info(fp, "Number of entries processed", vih->processed);
2960 Output->print_numkey_info(fp, "Number of invalid entries", vih->invalid);
2961 Output->print_numkey_info(fp, "Processing time in seconds", (vih->endt)-(vih->startt));
2962 }
2963
vi_print_report_links(FILE * fp)2964 void vi_print_report_links(FILE *fp)
2965 {
2966 void *l[] = {
2967 "Unique visitors in each day", NULL,
2968 "Unique visitors in each month", &Config_process_monthly_visitors,
2969 "Unique visitors from Google in each day", NULL,
2970 "Unique visitors from Google in each month", &Config_process_monthly_visitors,
2971 "Pageviews per visit", &Config_process_pageviews,
2972 "Weekday-Hour combined map", &Config_process_weekdayhour_map,
2973 "Month-Day combined map", &Config_process_monthday_map,
2974 "Requested pages", NULL,
2975 "Requested images and CSS", NULL,
2976 "Referers", NULL,
2977 "Referers by first time", &Config_process_referers_age,
2978 "Robots and web spiders", &Config_process_robots,
2979 "User agents", &Config_process_agents,
2980 "Operating Systems", &Config_process_os,
2981 "Browsers", &Config_process_browsers,
2982 "404 Errors", &Config_process_error404,
2983 "Domains", &Config_process_tld,
2984 "Googled pages", &Config_process_google,
2985 "Adsensed pages", &Config_process_google,
2986 "Google Keyphrases", &Config_process_google_keyphrases,
2987 "Google Keyphrases by first time", &Config_process_google_keyphrases_age,
2988 "Google Human Language", &Config_process_google_human_language,
2989 "Screen resolution", &Config_process_screen_info,
2990 "Screen color depth", &Config_process_screen_info,
2991 "Web trails", &Config_process_web_trails,
2992 "Weekday distribution", NULL,
2993 "Hours distribution", NULL,
2994 };
2995 unsigned int i, num = 0;
2996
2997 Output->print_title(fp, "Generated reports");
2998 Output->print_subtitle(fp, "Click on the report name you want to see");
2999 for (i = 0; i < sizeof(l)/sizeof(void*); i += 2) {
3000 int active = l[i+1] == NULL ? 1 : *((int*)l[i+1]);
3001 if (active) num++;
3002 }
3003 Output->print_numkey_info(fp, "Number of reports generated", num);
3004 for (i = 0; i < sizeof(l)/sizeof(void*); i += 2) {
3005 int active = l[i+1] == NULL ? 1 : *((int*)l[i+1]);
3006 if (active)
3007 Output->print_report_link(fp, (char*)l[i]);
3008 }
3009 }
3010
vi_print_weekdayhour_map_report(FILE * fp,struct vih * vih)3011 void vi_print_weekdayhour_map_report(FILE *fp, struct vih *vih)
3012 {
3013 char *xlabel[24] = {
3014 "00", "01", "02", "03", "04", "05", "06", "07",
3015 "08", "09", "10", "11", "12", "13", "14", "15",
3016 "16", "17", "18", "19", "20", "21", "22", "23"};
3017 char **ylabel = vi_wdname;
3018 int j, minj = 0, maxj = 0;
3019 int *hw = (int*) vih->weekdayhour;
3020 char buf[VI_LINE_MAX];
3021
3022 /* Check idexes of minimum and maximum in the array. */
3023 for (j = 0; j < 24*7; j++) {
3024 if (hw[j] > hw[maxj])
3025 maxj = j;
3026 if (hw[j] < hw[minj])
3027 minj = j;
3028 }
3029
3030 Output->print_title(fp, "Weekday-Hour combined map");
3031 Output->print_subtitle(fp, "Brighter means higher level of hits");
3032 snprintf(buf, VI_LINE_MAX, "Hour with max traffic starting at %s %s:00 with hits",
3033 ylabel[maxj/24], xlabel[maxj%24]);
3034 Output->print_numkey_info(fp, buf, hw[maxj]);
3035 snprintf(buf, VI_LINE_MAX, "Hour with min traffic starting at %s %s:00 with hits",
3036 ylabel[minj/24], xlabel[minj%24]);
3037 Output->print_numkey_info(fp, buf, hw[minj]);
3038 Output->print_hline(fp);
3039 Output->print_bidimentional_map(fp, 24, 7, xlabel, ylabel, hw);
3040 }
3041
vi_print_monthday_map_report(FILE * fp,struct vih * vih)3042 void vi_print_monthday_map_report(FILE *fp, struct vih *vih)
3043 {
3044 char *xlabel[31] = {
3045 "01", "02", "03", "04", "05", "06", "07", "08",
3046 "09", "10", "11", "12", "13", "14", "15", "16",
3047 "17", "18", "19", "20", "21", "22", "23", "24",
3048 "25", "26", "27", "28", "29", "30", "31"};
3049 char *ylabel[12] = {
3050 "Jan", "Feb", "Mar", "Apr", "May", "Jun",
3051 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
3052 };
3053 int j, minj = 0, maxj = 0;
3054 int *md = (int*) vih->monthday;
3055 char buf[VI_LINE_MAX];
3056
3057 /* Check idexes of minimum and maximum in the array. */
3058 for (j = 0; j < 12*31; j++) {
3059 if (md[j] > md[maxj])
3060 maxj = j;
3061 if (md[j] != 0 && (md[j] < md[minj] || md[minj] == 0))
3062 minj = j;
3063 }
3064
3065 Output->print_title(fp, "Month-Day combined map");
3066 Output->print_subtitle(fp, "Brighter means higher level of hits");
3067 snprintf(buf, VI_LINE_MAX, "Day with max traffic is %s %s with hits",
3068 ylabel[maxj/31], xlabel[maxj%31]);
3069 Output->print_numkey_info(fp, buf, md[maxj]);
3070 snprintf(buf, VI_LINE_MAX, "Day with min traffic is %s %s with hits",
3071 ylabel[minj/31], xlabel[minj%31]);
3072 Output->print_numkey_info(fp, buf, md[minj]);
3073 Output->print_hline(fp);
3074 Output->print_bidimentional_map(fp, 31, 12, xlabel, ylabel, md);
3075 }
3076
vi_print_hline(FILE * fp)3077 void vi_print_hline(FILE *fp)
3078 {
3079 Output->print_hline(fp);
3080 }
3081
vi_print_credits(FILE * fp)3082 void vi_print_credits(FILE *fp)
3083 {
3084 Output->print_credits(fp);
3085 }
3086
vi_print_header(FILE * fp)3087 void vi_print_header(FILE *fp)
3088 {
3089 Output->print_header(fp);
3090 }
3091
vi_print_footer(FILE * fp)3092 void vi_print_footer(FILE *fp)
3093 {
3094 Output->print_footer(fp);
3095 }
3096
3097 /* Generate the report writing it to the output file 'of'.
3098 * If op is NULL, output the report to standard output.
3099 * On success zero is returned. Otherwise the function returns
3100 * non-zero and set an error in the vih handler. */
vi_print_report(char * of,struct vih * vih)3101 int vi_print_report(char *of, struct vih *vih)
3102 {
3103 FILE *fp;
3104
3105 if (of == NULL) {
3106 fp = stdout;
3107 } else {
3108 fp = fopen(of, "w");
3109 if (fp == NULL) {
3110 vi_set_error(vih, "Writing the report to '%s': %s",
3111 of, strerror(errno));
3112 return 1;
3113 }
3114 }
3115
3116 /* Disable specific reports when there is no data. */
3117 if (ht_used(&vih->screenres) == 0)
3118 Config_process_screen_info = 0;
3119 /* Do some data postprocessing needed to generate reports */
3120 if (vi_postprocess(vih))
3121 return 1;
3122 /* Report generation */
3123 vi_print_header(fp);
3124 vi_print_credits(fp);
3125 vi_print_hline(fp);
3126 vi_print_information_report(fp, vih);
3127 vi_print_hline(fp);
3128 vi_print_report_links(fp);
3129 vi_print_hline(fp);
3130 vi_print_visits_report(fp, vih);
3131 vi_print_hline(fp);
3132 vi_print_googlevisits_report(fp, vih);
3133 vi_print_hline(fp);
3134 if (Config_process_weekdayhour_map) {
3135 vi_print_weekdayhour_map_report(fp, vih);
3136 vi_print_hline(fp);
3137 }
3138 if (Config_process_monthday_map) {
3139 vi_print_monthday_map_report(fp, vih);
3140 vi_print_hline(fp);
3141 }
3142 if (Config_process_pageviews) {
3143 vi_print_pageviews_report(fp, vih);
3144 vi_print_hline(fp);
3145 }
3146 vi_print_pages_report(fp, vih);
3147 vi_print_hline(fp);
3148 vi_print_images_report(fp, vih);
3149 vi_print_hline(fp);
3150 vi_print_referers_report(fp, vih);
3151 vi_print_hline(fp);
3152 if (Config_process_referers_age) {
3153 vi_print_referers_age_report(fp, vih);
3154 vi_print_hline(fp);
3155 }
3156 if (Config_process_robots) {
3157 vi_print_robots_report(fp, vih);
3158 vi_print_hline(fp);
3159 }
3160 if (Config_process_agents) {
3161 vi_print_agents_report(fp, vih);
3162 vi_print_hline(fp);
3163 }
3164 if (Config_process_os) {
3165 vi_print_os_report(fp, vih);
3166 vi_print_hline(fp);
3167 }
3168 if (Config_process_browsers) {
3169 vi_print_browsers_report(fp, vih);
3170 vi_print_hline(fp);
3171 }
3172 if (Config_process_error404) {
3173 vi_print_error404_report(fp, vih);
3174 vi_print_hline(fp);
3175 }
3176 if (Config_process_tld) {
3177 vi_print_tld_report(fp, vih);
3178 vi_print_hline(fp);
3179 }
3180 if (Config_process_google) {
3181 vi_print_googled_report(fp, vih);
3182 vi_print_hline(fp);
3183 vi_print_adsensed_report(fp, vih);
3184 vi_print_hline(fp);
3185 }
3186 if (Config_process_google_keyphrases) {
3187 vi_print_google_keyphrases_report(fp, vih);
3188 vi_print_hline(fp);
3189 }
3190 if (Config_process_google_keyphrases) {
3191 vi_print_google_keyphrases_age_report(fp, vih);
3192 vi_print_hline(fp);
3193 }
3194 if (Config_process_google_human_language) {
3195 vi_print_google_human_language_report(fp, vih);
3196 vi_print_hline(fp);
3197 }
3198 if (Config_process_screen_info) {
3199 vi_print_screen_res_report(fp, vih);
3200 vi_print_hline(fp);
3201 vi_print_screen_depth_report(fp, vih);
3202 vi_print_hline(fp);
3203 }
3204 if (Config_process_web_trails) {
3205 vi_print_trails_report(fp, vih);
3206 vi_print_hline(fp);
3207 }
3208 vi_print_weekdays_report(fp, vih);
3209 vi_print_hline(fp);
3210 vi_print_hours_report(fp, vih);
3211 vi_print_hline(fp);
3212 vi_print_credits(fp);
3213 vi_print_hline(fp);
3214 vi_print_footer(fp);
3215 if (of != NULL)
3216 fclose(fp);
3217 return 0;
3218 }
3219
3220 /* ------------------------- graphviz graph generation ---------------------- */
vi_print_graphviz(struct vih * vih)3221 void vi_print_graphviz(struct vih *vih)
3222 {
3223 int items = ht_used(&vih->trails), i, max = 0, tot = 0;
3224 void **table;
3225
3226 printf("digraph webtrails {\n");
3227 printf("\tgraph [splines=true overlap=false rankdir=LR]\n");
3228 printf("\tnode [color=lightblue2,style=\"filled\"]\n");
3229 printf("\tedge [style=bold]\n");
3230 if ((table = ht_get_array(&vih->trails)) == NULL) {
3231 fprintf(stderr, "Out of memory in vi_print_graphviz()\n");
3232 return;
3233 }
3234 qsort(table, items, sizeof(void*)*2, qsort_cmp_long_value);
3235 for (i = 0; i < items; i++) {
3236 long value = (long) table[(i*2)+1];
3237 tot += value;
3238 if (i > Config_max_trails) continue;
3239 if (max < value)
3240 max = value;
3241 }
3242 if (max == 0) max = 1; /* avoid division by zero */
3243 if (tot == 0) tot = 1;
3244 for (i = 0; i < items; i++) {
3245 int color;
3246 char *key = table[i*2];
3247 char *t;
3248 long value = (long) table[(i*2)+1];
3249 float percentage = ((float)value/tot)*100;
3250 if (i > Config_max_trails) break;
3251 color = (value*255)/max;
3252 t = strstr(key, " -> ");
3253 *t = '\0'; /* alter */
3254 printf("\t\"%s\" -> \"%s\" [color=\"#%02X00%02X\" label=\"%.2f\"]\n", key, t+4, color, 255-color, percentage);
3255 *t = ' '; /* restore */
3256 }
3257 if (!Config_graphviz_ignorenode_google)
3258 printf("\tGoogle [color=\"#c0ffc0\"]\n");
3259 if (!Config_graphviz_ignorenode_external)
3260 printf("\t\"External Link\" [color=\"#c0ffc0\"]\n");
3261 if (!Config_graphviz_ignorenode_noreferer)
3262 printf("\t\"No Referer\" [color=\"#c0ffc0\"]\n");
3263 free(table);
3264 printf("}\n");
3265 }
3266
3267 /* -------------------------------- stream mode ----------------------------- */
vi_stream_mode(struct vih * vih)3268 void vi_stream_mode(struct vih *vih)
3269 {
3270 time_t lastupdate_t, lastreset_t, now_t;
3271
3272 lastupdate_t = lastreset_t = time(NULL);
3273 while(1) {
3274 char buf[VI_LINE_MAX];
3275
3276 if (fgets(buf, VI_LINE_MAX, stdin) == NULL) {
3277 vi_sleep(1);
3278 continue;
3279 }
3280 if (vi_process_line(vih, buf)) {
3281 fprintf(stderr, "%s\n", vi_get_error(vih));
3282 }
3283 now_t = time(NULL);
3284 /* update */
3285 if ((now_t - lastupdate_t) >= Config_update_every) {
3286 lastupdate_t = now_t;
3287 if (vi_print_report(Config_output_file, vih)) {
3288 fprintf(stderr, "%s\n", vi_get_error(vih));
3289 }
3290 }
3291 /* reset */
3292 if (Config_reset_every &&
3293 ((now_t - lastreset_t) >= Config_reset_every))
3294 {
3295 lastreset_t = now_t;
3296 vi_reset(vih);
3297 }
3298 }
3299 }
3300
3301 /* ----------------------------------- main --------------------------------- */
3302
/* Command line switch IDs, as returned by antigetopt() for the entries of
 * the options table. The order of the enumerators defines their values. */
enum {
    OPT_MAXREFERERS,
    OPT_MAXPAGES,
    OPT_MAXIMAGES,
    OPT_USERAGENTS,
    OPT_ALL,
    OPT_MAXLINES,
    OPT_GOOGLE,
    OPT_MAXGOOGLED,
    OPT_MAXUSERAGENTS,
    OPT_OUTPUT,
    OPT_VERSION,
    OPT_HELP,
    OPT_PREFIX,
    OPT_TRAILS,
    OPT_GOOGLEKEYPHRASES,
    OPT_GOOGLEKEYPHRASESAGE,
    OPT_MAXGOOGLEKEYPHRASES,
    OPT_MAXGOOGLEKEYPHRASESAGE,
    OPT_MAXTRAILS,
    OPT_GRAPHVIZ,
    OPT_WEEKDAYHOUR_MAP,
    OPT_MONTHDAY_MAP,
    OPT_REFERERSAGE,
    OPT_MAXREFERERSAGE,
    OPT_TAIL,
    OPT_TLD,
    OPT_MAXTLD,
    OPT_STREAM,
    OPT_OUTPUTFILE,
    OPT_UPDATEEVERY,
    OPT_RESETEVERY,
    OPT_OS,
    OPT_BROWSERS,
    OPT_ERROR404,
    OPT_MAXERROR404,
    OPT_TIMEDELTA,
    OPT_PAGEVIEWS,
    OPT_ROBOTS,
    OPT_MAXROBOTS,
    OPT_GRAPHVIZ_ignorenode_GOOGLE,
    OPT_GRAPHVIZ_ignorenode_EXTERNAL,
    OPT_GRAPHVIZ_ignorenode_NOREFERER,
    OPT_GOOGLEHUMANLANGUAGE,
    OPT_FILTERSPAM,
    OPT_MAXADSENSED,
    OPT_GREP,
    OPT_EXCLUDE,
    OPT_IGNORE404,
    OPT_DEBUG,
    OPT_SCREENINFO
};
3305
3306 /* command line switches definition:
3307 * the rule with short options is to take upper case the
3308 * 'special' options (the option a normal user should not use) */
3309 static struct ago_optlist visitors_optlist[] = {
3310 { 'A', "all", OPT_ALL, AGO_NOARG},
3311 { 'T', "trails", OPT_TRAILS, AGO_NOARG},
3312 { 'G', "google", OPT_GOOGLE, AGO_NOARG},
3313 { 'K', "google-keyphrases", OPT_GOOGLEKEYPHRASES, AGO_NOARG},
3314 { 'Z', "google-keyphrases-age", OPT_GOOGLEKEYPHRASESAGE, AGO_NOARG},
3315 { 'H', "google-human-language", OPT_GOOGLEHUMANLANGUAGE, AGO_NOARG},
3316 { 'U', "user-agents", OPT_USERAGENTS, AGO_NOARG},
3317 { 'W', "weekday-hour-map", OPT_WEEKDAYHOUR_MAP, AGO_NOARG},
3318 { 'M', "month-day-map", OPT_MONTHDAY_MAP, AGO_NOARG},
3319 { 'R', "referers-age", OPT_REFERERSAGE, AGO_NOARG},
3320 { 'D', "domains", OPT_TLD, AGO_NOARG},
3321 { 'O', "operating-systems", OPT_OS, AGO_NOARG},
3322 { 'B', "browsers", OPT_BROWSERS, AGO_NOARG},
3323 { 'X', "error404", OPT_ERROR404, AGO_NOARG},
3324 { 'Y', "pageviews", OPT_PAGEVIEWS, AGO_NOARG},
3325 { 'S', "robots", OPT_ROBOTS, AGO_NOARG},
3326 { '\0', "screen-info", OPT_SCREENINFO, AGO_NOARG},
3327 { '\0', "stream", OPT_STREAM, AGO_NOARG},
3328 { '\0', "update-every", OPT_UPDATEEVERY, AGO_NEEDARG},
3329 { '\0', "reset-every", OPT_RESETEVERY, AGO_NEEDARG},
3330 { 'f', "output-file", OPT_OUTPUTFILE, AGO_NEEDARG},
3331 { 'm', "max-lines", OPT_MAXLINES, AGO_NEEDARG},
3332 { 'r', "max-referers", OPT_MAXREFERERS, AGO_NEEDARG},
3333 { 'p', "max-pages", OPT_MAXPAGES, AGO_NEEDARG},
3334 { 'i', "max-images", OPT_MAXIMAGES, AGO_NEEDARG},
3335 { 'x', "max-error404", OPT_MAXERROR404, AGO_NEEDARG},
3336 { 'u', "max-useragents", OPT_MAXUSERAGENTS, AGO_NEEDARG},
3337 { 't', "max-trails", OPT_MAXTRAILS, AGO_NEEDARG},
3338 { 'g', "max-googled", OPT_MAXGOOGLED, AGO_NEEDARG},
3339 { '\0', "max-adsensed", OPT_MAXADSENSED, AGO_NEEDARG},
3340 { 'k', "max-google-keyphrases",OPT_MAXGOOGLEKEYPHRASES,AGO_NEEDARG},
3341 { 'z', "max-google-keyphrases-age",OPT_MAXGOOGLEKEYPHRASESAGE,
3342 AGO_NEEDARG},
3343 { 'a', "max-referers-age", OPT_MAXREFERERSAGE, AGO_NEEDARG},
3344 { 'd', "max-domains", OPT_MAXTLD, AGO_NEEDARG},
3345 { 's', "max-robots", OPT_MAXROBOTS, AGO_NEEDARG},
3346 { '\0', "grep", OPT_GREP, AGO_NEEDARG},
3347 { '\0', "exclude", OPT_EXCLUDE, AGO_NEEDARG},
3348 { 'P', "prefix", OPT_PREFIX, AGO_NEEDARG},
3349 { 'o', "output", OPT_OUTPUT, AGO_NEEDARG},
3350 { 'V', "graphviz", OPT_GRAPHVIZ, AGO_NOARG},
3351 { '\0', "graphviz-ignorenode-google", OPT_GRAPHVIZ_ignorenode_GOOGLE,
3352 AGO_NOARG},
3353 { '\0', "graphviz-ignorenode-external", OPT_GRAPHVIZ_ignorenode_EXTERNAL,
3354 AGO_NOARG},
3355 { '\0', "graphviz-ignorenode-noreferer", OPT_GRAPHVIZ_ignorenode_NOREFERER,
3356 AGO_NOARG},
3357 { 'v', "version", OPT_VERSION, AGO_NOARG},
3358 { '\0', "tail", OPT_TAIL, AGO_NOARG},
3359 { '\0', "time-delta", OPT_TIMEDELTA, AGO_NEEDARG},
3360 { '\0', "filter-spam", OPT_FILTERSPAM, AGO_NOARG},
3361 { '\0', "ignore-404", OPT_IGNORE404, AGO_NOARG},
3362 { '\0', "debug", OPT_DEBUG, AGO_NOARG},
3363 { 'h', "help", OPT_HELP, AGO_NOARG},
3364 AGO_LIST_TERM
3365 };
3366
visitors_show_help(void)3367 void visitors_show_help(void)
3368 {
3369 int i;
3370
3371 printf("Usage: visitors [options] <filename> [<filename> ...]\n");
3372 printf("Available options:\n");
3373 for (i = 0; visitors_optlist[i].ao_long != NULL; i++) {
3374 if (visitors_optlist[i].ao_short != '\0') {
3375 printf(" -%c ", visitors_optlist[i].ao_short);
3376 } else {
3377 printf(" ");
3378 }
3379 printf("--%-30s %s\n",
3380 visitors_optlist[i].ao_long,
3381 (visitors_optlist[i].ao_flags & AGO_NEEDARG) ?
3382 "<argument>" : "");
3383 }
3384 printf("\nNOTE: --filter-spam can be *very* slow. Use with care.\n\n");
3385 printf("For more information visit http://www.hping.org/visitors\n"
3386 "Visitors is Copyright(C) 2004-2006 Salvatore Sanfilippo <antirez@invece.org>\n");
3387 }
3388
main(int argc,char ** argv)3389 int main(int argc, char **argv)
3390 {
3391 int i, o;
3392 struct vih *vih;
3393 char *filenames[VI_FILENAMES_MAX];
3394 int filenamec = 0;
3395
3396 /* Handle command line options */
3397 while((o = antigetopt(argc, argv, visitors_optlist)) != AGO_EOF) {
3398 switch(o) {
3399 case AGO_UNKNOWN:
3400 case AGO_REQARG:
3401 case AGO_AMBIG:
3402 ago_gnu_error("visitors", o);
3403 visitors_show_help();
3404 exit(1);
3405 break;
3406 case OPT_HELP:
3407 visitors_show_help();
3408 exit(0);
3409 break;
3410 case OPT_VERSION:
3411 printf("Visitors %s\n", VI_VERSION_STR);
3412 exit(0);
3413 case OPT_MAXREFERERS:
3414 Config_max_referers = atoi(ago_optarg);
3415 break;
3416 case OPT_MAXPAGES:
3417 Config_max_pages = atoi(ago_optarg);
3418 break;
3419 case OPT_MAXIMAGES:
3420 Config_max_images = atoi(ago_optarg);
3421 break;
3422 case OPT_MAXERROR404:
3423 Config_max_error404 = atoi(ago_optarg);
3424 break;
3425 case OPT_MAXUSERAGENTS:
3426 Config_max_agents = atoi(ago_optarg);
3427 break;
3428 case OPT_MAXTRAILS:
3429 Config_max_trails = atoi(ago_optarg);
3430 break;
3431 case OPT_MAXGOOGLED:
3432 Config_max_googled = atoi(ago_optarg);
3433 break;
3434 case OPT_MAXADSENSED:
3435 Config_max_adsensed = atoi(ago_optarg);
3436 break;
3437 case OPT_MAXGOOGLEKEYPHRASES:
3438 Config_max_google_keyphrases = atoi(ago_optarg);
3439 break;
3440 case OPT_MAXGOOGLEKEYPHRASESAGE:
3441 Config_max_google_keyphrases_age = atoi(ago_optarg);
3442 break;
3443 case OPT_MAXREFERERSAGE:
3444 Config_max_referers_age = atoi(ago_optarg);
3445 break;
3446 case OPT_MAXTLD:
3447 Config_max_tld = atoi(ago_optarg);
3448 break;
3449 case OPT_MAXROBOTS:
3450 Config_max_robots = atoi(ago_optarg);
3451 break;
3452 case OPT_USERAGENTS:
3453 Config_process_agents = 1;
3454 break;
3455 case OPT_GOOGLE:
3456 Config_process_google = 1;
3457 break;
3458 case OPT_GOOGLEKEYPHRASES:
3459 Config_process_google_keyphrases = 1;
3460 break;
3461 case OPT_GOOGLEKEYPHRASESAGE:
3462 Config_process_google_keyphrases_age = 1;
3463 break;
3464 case OPT_GOOGLEHUMANLANGUAGE:
3465 Config_process_google_keyphrases = 1;
3466 Config_process_google_human_language = 1;
3467 break;
3468 case OPT_TLD:
3469 Config_process_tld = 1;
3470 break;
3471 case OPT_OS:
3472 Config_process_os = 1;
3473 break;
3474 case OPT_BROWSERS:
3475 Config_process_browsers = 1;
3476 break;
3477 case OPT_ERROR404:
3478 Config_process_error404 = 1;
3479 break;
3480 case OPT_PAGEVIEWS:
3481 Config_process_pageviews = 1;
3482 break;
3483 case OPT_ROBOTS:
3484 Config_process_robots = 1;
3485 break;
3486 case OPT_ALL:
3487 Config_process_agents = 1;
3488 Config_process_google = 1;
3489 Config_process_google_keyphrases = 1;
3490 Config_process_google_keyphrases_age = 1;
3491 Config_process_google_human_language = 1;
3492 Config_process_weekdayhour_map = 1;
3493 Config_process_monthday_map = 1;
3494 Config_process_referers_age = 1;
3495 Config_process_tld = 1;
3496 Config_process_os = 1;
3497 Config_process_browsers = 1;
3498 Config_process_error404 = 1;
3499 Config_process_pageviews = 1;
3500 Config_process_robots = 1;
3501 Config_process_screen_info = 1;
3502 break;
3503 case OPT_PREFIX:
3504 if (Config_prefix_num < VI_PREFIXES_MAX) {
3505 Config_prefix[Config_prefix_num].str = ago_optarg;
3506 Config_prefix[Config_prefix_num].len = strlen(ago_optarg);
3507 Config_prefix_num++;
3508 } else {
3509 fprintf(stderr, "Error: too many prefixes specified\n");
3510 exit(1);
3511 }
3512 break;
3513 case OPT_TRAILS:
3514 Config_process_web_trails = 1;
3515 break;
3516 case OPT_MAXLINES:
3517 {
3518 int aux = atoi(ago_optarg);
3519 Config_max_referers = aux;
3520 Config_max_pages = aux;
3521 Config_max_images = aux;
3522 Config_max_error404 = aux;
3523 Config_max_agents = aux;
3524 Config_max_googled = aux;
3525 Config_max_adsensed = aux;
3526 Config_max_trails = aux;
3527 Config_max_google_keyphrases = aux;
3528 Config_max_google_keyphrases_age = aux;
3529 Config_max_referers_age = aux;
3530 Config_max_tld = aux;
3531 Config_max_robots = aux;
3532 }
3533 break;
3534 case OPT_OUTPUT:
3535 if (!strcasecmp(ago_optarg, "text"))
3536 Output = &OutputModuleText;
3537 else if (!strcasecmp(ago_optarg, "html"))
3538 Output = &OutputModuleHtml;
3539 else {
3540 fprintf(stderr, "Unknown output module '%s'\n",
3541 ago_optarg);
3542 exit(1);
3543 }
3544 break;
3545 case OPT_GRAPHVIZ:
3546 Config_graphviz_mode = 1;
3547 Config_process_web_trails = 1;
3548 break;
3549 case OPT_GRAPHVIZ_ignorenode_GOOGLE:
3550 Config_graphviz_ignorenode_google = 1;
3551 break;
3552 case OPT_GRAPHVIZ_ignorenode_EXTERNAL:
3553 Config_graphviz_ignorenode_external= 1;
3554 break;
3555 case OPT_GRAPHVIZ_ignorenode_NOREFERER:
3556 Config_graphviz_ignorenode_noreferer = 1;
3557 break;
3558 case OPT_TAIL:
3559 Config_tail_mode = 1;
3560 break;
3561 case OPT_WEEKDAYHOUR_MAP:
3562 Config_process_weekdayhour_map = 1;
3563 break;
3564 case OPT_MONTHDAY_MAP:
3565 Config_process_monthday_map = 1;
3566 break;
3567 case OPT_REFERERSAGE:
3568 Config_process_referers_age = 1;
3569 break;
3570 case OPT_STREAM:
3571 Config_stream_mode = 1;
3572 break;
3573 case OPT_OUTPUTFILE:
3574 Config_output_file = ago_optarg;
3575 break;
3576 case OPT_UPDATEEVERY:
3577 Config_update_every = atoi(ago_optarg);
3578 break;
3579 case OPT_RESETEVERY:
3580 Config_reset_every = atoi(ago_optarg);
3581 break;
3582 case OPT_TIMEDELTA:
3583 Config_time_delta = atoi(ago_optarg);
3584 break;
3585 case OPT_FILTERSPAM:
3586 Config_filter_spam = 1;
3587 break;
3588 case OPT_GREP:
3589 ConfigAddGrepPattern(ago_optarg, VI_PATTERNTYPE_GREP);
3590 break;
3591 case OPT_EXCLUDE:
3592 ConfigAddGrepPattern(ago_optarg, VI_PATTERNTYPE_EXCLUDE);
3593 break;
3594 case OPT_IGNORE404:
3595 Config_ignore_404 = 1;
3596 break;
3597 case OPT_DEBUG:
3598 Config_debug = 1;
3599 break;
3600 case OPT_SCREENINFO:
3601 Config_process_screen_info = 1;
3602 break;
3603 case AGO_ALONE:
3604 if (filenamec < VI_FILENAMES_MAX)
3605 filenames[filenamec++] = ago_optarg;
3606 break;
3607 }
3608 }
3609 /* If the user specified the 'tail' mode, we
3610 * just emulate a "tail -f" for the specified files. */
3611 if (Config_tail_mode) {
3612 vi_tail(filenamec, filenames);
3613 return 0;
3614 }
3615 /* Check if at least one file was specified */
3616 if (filenamec == 0 && !Config_stream_mode) {
3617 fprintf(stderr, "No logfile specified\n");
3618 visitors_show_help();
3619 exit(1);
3620 }
3621 /* If the prefix was not set, but the user asks for
3622 * web trails, notify it and exit. */
3623 if (Config_process_web_trails && !Config_prefix_num) {
3624 fprintf(stderr, "At least one prefix must be specified (using --prefix) for web trails\nExample: --prefix http://your.site.org\n");
3625 exit(1);
3626 }
3627 /* If screen-info is enabled, error 404 must be too, auto-enable it. */
3628 if (Config_process_screen_info && !Config_process_error404) {
3629 fprintf(stderr, "Note: 404 error processing enabled for screen-info report\n");
3630 Config_process_error404 = 1;
3631 }
3632 /* If stream-mode is enabled, --output-file should be specified. */
3633 if (Config_stream_mode && Config_output_file == NULL) {
3634 fprintf(stderr, "--stream requires --output-file\n");
3635 exit(1);
3636 }
3637 /* Set the default output module */
3638 if (Output == NULL)
3639 Output = &OutputModuleHtml;
3640 /* Change to "C" locale for date/time related functions */
3641 setlocale(LC_ALL, "C");
3642 /* Process all the log files specified. */
3643 vih = vi_new();
3644 for (i = 0; i < filenamec; i++) {
3645 if (vi_scan(vih, filenames[i])) {
3646 fprintf(stderr, "%s: %s\n", filenames[i], vi_get_error(vih));
3647 exit(1);
3648 }
3649 }
3650 if (Config_graphviz_mode) {
3651 vi_print_graphviz(vih);
3652 } else {
3653 if (vi_print_report(Config_output_file, vih)) {
3654 fprintf(stderr, "%s\n", vi_get_error(vih));
3655 exit(1);
3656 }
3657 if (Config_stream_mode) {
3658 vi_stream_mode(vih);
3659 }
3660 }
3661 vi_print_statistics(vih);
3662 /* The following is commented in releases as to free the hashtable
3663 * memory is very slow, it's better to just exit the program.
3664 * Still it is important to be able to re-enable a good cleanup
3665 * in order to run visitors against valgrind to check for memory
3666 * leaks. */
3667 /* vi_free(vih); */
3668 return 0;
3669 }
3670