1 #include <cstdio>
2 #include <unordered_map>
3 #include <cstdlib>
4 #include <map>
5 #include <cstring>
6 #include <unistd.h>
7 #include <cctype>
8 #include <string>
9 
10 #include "logtools.h"
11 
12 #define BUF_SIZE 4096
13 
14 using namespace std;
15 #if !defined(_LIBCPP_VERSION)
16 using namespace __gnu_cxx;
17 #endif
18 
// Hash functor for the C-string keys of the month lookup table.
// Only the first two characters of the key contribute to the hash value.
struct hashing_func {
  // key must point to a NUL-terminated string.
  // Returns (first byte << 8) + second byte, or 0 for an empty key.
  unsigned long operator()(const char *key) const
  {
    // Convert through unsigned char so that bytes >= 0x80 cannot turn into
    // negative values (left-shifting a negative int is undefined before C++20).
    const unsigned long first = static_cast<unsigned char>(key[0]);

    // An empty key has no storage after its terminator, so key[1] must not
    // be read in that case.
    if(first == 0)
      return 0;

    const unsigned long second = static_cast<unsigned char>(key[1]);
    return (first << 8) + second;
  }
};
23 
// Equality predicate for C-string keys: two keys are equal when their
// characters match, regardless of where the strings are stored.
struct eqstr
{
  bool operator()(const char *s1, const char *s2) const
  {
    const int difference = strcmp(s1, s2);
    return !difference;
  }
};
29 
30 unordered_map<const char *, const char *, hashing_func, eqstr> months;
31 
32 class LogFile
33 {
34 public:
35   LogFile();
~LogFile()36   ~LogFile(){ if(m_fp) fclose(m_fp); }
37 
38   // open a file, return 0 for success, 1 for minor error (0 length file)
39   // and 2 for serious error (file not found)
40   int open(const char *name, bool domain_mangling);
41 
42   // get the date string of the current item
date() const43   const char *date() const { return m_date; }
44   // get the full data for the current item
line()45   const char *line() { m_valid = false; return m_line; }
46 
47   // get a line from the file and parse it
48   // return 0 for success and 1 for EOF
49   int getLine();
50 
valid() const51   bool valid() const { return m_valid; }
52 
53   bool verbose;
54 
55 private:
56   bool m_valid;
57   FILE *m_fp;
58   char m_date[17];
59   char m_lineBuf[BUF_SIZE];
60   char m_lineBuf2[BUF_SIZE + 7];
61   char *m_line;
62   bool m_web_first;
63 
64   // store the date in numeric format so that strcmp() can be used to compare
65   // dates.  Returns 1 for error and 0 for OK.
66   int setDate();
67 
68   LogFile(const LogFile&);
69   LogFile & operator=(const LogFile&);
70 };
71 
LogFile()72 LogFile::LogFile()
73  : m_valid(false)
74  , m_fp(NULL)
75 {
76   m_line = m_lineBuf;
77   m_lineBuf[0] = '\0';
78 }
79 
80 // Common log format:
81 // 1.2.3.4 - - [23/Aug/2000:12:00:32 +0200] etc
setDate()82 int LogFile::setDate()
83 {
84   unsigned int i;
85   // find '[' or abort if we can't
86   for(i = 0; m_lineBuf[i] != '[' && m_lineBuf[i] != '\0'; i++)
87   { }
88 
89   // if not enough data left for the full date then return
90   if(i + 21 > strlen(m_lineBuf))
91     return 1;
92   memcpy(m_date, &m_lineBuf[i + 8], 4);
93   char mon[4];
94   memcpy(mon, &m_lineBuf[i + 4], 3);
95   mon[3] = '\0';
96   const char *m = months[mon];
97   if(!m) return 1;
98   strcpy(&m_date[4], m);
99   memcpy(&m_date[6], &m_lineBuf[i + 1], 2);
100   memcpy(&m_date[8], &m_lineBuf[i + 13], 8);
101   m_date[16] = '\0';
102   if(m_web_first) // make m_lineBuf2 have the data for the mangled line
103   {
104     unsigned int end_webname;
105     // find where the domain name ends
106     for(end_webname = 0; m_lineBuf[end_webname] != ' ' && m_lineBuf[end_webname] != '\0'; end_webname++)
107     { }
108 
109     if(end_webname >= i)
110       return 1;
111     for(i = 0; i < end_webname; i++)
112       m_lineBuf[end_webname] = tolower(m_lineBuf[end_webname]);
113 
114     // there will be more than 40 chars in between
115     unsigned int start_url = end_webname + 40;
116     // search for the start quote character
117     for(; m_lineBuf[start_url] != '\"' && m_lineBuf[start_url] != '\0'; start_url++)
118     { }
119     // search for the space in the web request
120     for(; m_lineBuf[start_url] != ' ' && m_lineBuf[start_url] != '\0'; start_url++)
121     { }
122 
123     if(strlen(&m_lineBuf[start_url]) < 6) return 1;
124 
125     memcpy(m_lineBuf2, &m_lineBuf[end_webname + 1], start_url - end_webname);
126     m_line = &m_lineBuf2[start_url - end_webname]; // m_line points to next char
127     if(strncmp(&m_lineBuf[start_url + 1], "http://", 7))
128     {
129       strcpy(m_line, "http://");
130       m_line += 7;
131       memcpy(m_line, m_lineBuf, end_webname);
132       m_line += end_webname;
133       if(m_lineBuf[start_url + 1] != '/')
134       {
135         // if URL doesn't start with a '/' then we add one
136         *m_line = '/';
137         m_line++;
138       }
139     }
140     strcpy(m_line, &m_lineBuf[start_url + 1]);
141     m_line = m_lineBuf2;
142   }
143   return 0;
144 }
145 
open(const char * name,bool domain_mangling)146 int LogFile::open(const char *name, bool domain_mangling)
147 {
148   m_web_first = domain_mangling;
149   m_fp = fopen(name, "r");
150   if(!m_fp)
151   {
152     fprintf(stderr, "Can't open %s.\n", name);
153     return 2;
154   }
155   if(getLine())
156     return 1;
157   return 0;
158 }
159 
getLine()160 int LogFile::getLine()
161 {
162   while(1)
163   {
164     // if can't get more data then return 1
165     if(!fgets(m_lineBuf, sizeof(m_lineBuf) - 1, m_fp))
166       return 1;
167     m_lineBuf[sizeof(m_lineBuf) - 1] = '\0';
168     m_line = m_lineBuf;
169     strtok(m_line, "\n\r");
170     // if setDate() returns 1 then we can't parse the line so we keep looping
171     // if setDate() returns 0 then return success!
172     if(!setDate())
173     {
174       m_valid = true;
175       return 0;
176     }
177     if(verbose)
178       fprintf(stderr, "Skipping bad line: %s\n", m_line);
179   }
180   return 0; // to make compilers happy - will not be reached
181 }
182 
183 typedef LogFile *PLogFile;
184 
item_compare(const void * a,const void * b)185 int item_compare(const void *a, const void *b)
186 {
187   const LogFile * const left = *(LogFile * const *)a;
188   const LogFile * const right = *(LogFile * const *)b;
189   return strcmp(left->date(), right->date());
190 }
191 
// Strict "less than" ordering for the keys of the output multimap, based on
// strcmp() of the underlying character data.
struct ltstr
{
  // The arguments are taken by const reference: this functor runs for every
  // comparison the map performs, and passing by value would copy both
  // strings each time.
  bool operator()(const std::string &s1, const std::string &s2) const
  {
    return std::strcmp(s1.c_str(), s2.c_str()) < 0;
  }
};
199 
usage(const char * const arg)200 void usage(const char *const arg)
201 {
202   fprintf(stderr, "usage: %s [OPTION] [filenames]", arg);
203   fprintf(stderr, "\n"
204   "This program merges web logs in common log format into a single stream\n"
205   "on standard output.  It reads from multiple input files and outputs the\n"
206   "data in-order as much as is possible.  If there is only a single input\n"
207   "file it will re-order it (with a 1000 line buffer size) to deal with web\n"
208   "servers that output data out of order.\n"
209   "\nVersion: " VERSION "\n");
210   exit(ERR_PARAM);
211 }
212 
main(int argc,char ** argv)213 int main(int argc, char **argv)
214 {
215   if(argc == 1)
216     return 0;
217 
218   unsigned int map_items = 0;
219   bool set_map_items = false, domain_mangling = false, verbose = false;
220   int int_c;
221   optind = 1;
222   while(-1 != (int_c = getopt(argc, argv, "b:hdv")) )
223   {
224     switch(char(int_c))
225     {
226       case '?':
227       case ':':
228       case 'h':
229         usage(argv[0]);
230       break;
231       case 'b':
232         set_map_items = true;
233         map_items = atoi(optarg);
234       break;
235       case 'd':
236         domain_mangling = true;
237       case 'v':
238         verbose = true;
239       break;
240     }
241   }
242   months["Jan"] = "01";
243   months["Feb"] = "02";
244   months["Mar"] = "03";
245   months["Apr"] = "04";
246   months["May"] = "05";
247   months["Jun"] = "06";
248   months["Jul"] = "07";
249   months["Aug"] = "08";
250   months["Sep"] = "09";
251   months["Oct"] = "10";
252   months["Nov"] = "11";
253   months["Dec"] = "12";
254 
255   multimap<const string, const string, ltstr> outputMap;
256 
257   LogFile **items = new PLogFile[argc - optind];
258 
259   unsigned int item_count = 0;
260   int i;
261   for(i = optind; i < argc; i++)
262   {
263     items[item_count] = new LogFile;
264     items[item_count]->verbose = verbose;
265     int rc = items[item_count]->open(argv[i], domain_mangling);
266     // if rc==2 then file not found, if rc==1 then 0 length file
267     if(rc > 1)
268       return ERR_INPUT;
269     if(rc == 1)
270       delete items[item_count];
271     else
272       item_count++;
273   }
274 
275   if(!set_map_items)
276   {
277     map_items = item_count * 400;
278     if(map_items < 4000)
279       map_items = 4000;
280   }
281   while(item_count > 1)
282   {
283     qsort(items, item_count, sizeof(LogFile *), item_compare);
284     while(items[0]->valid() && strcmp(items[0]->date(), items[1]->date()) <= 0)
285     {
286       if(map_items > 0)
287       {
288         outputMap.insert(pair<string, string>(items[0]->date(), items[0]->line()));
289         while(outputMap.size() > map_items)
290         {
291           printf("%s\n", outputMap.begin()->second.c_str());
292           outputMap.erase(outputMap.begin());
293         }
294       }
295       else
296       {
297         printf("%s\n", items[0]->line());
298       }
299       if(items[0]->getLine())
300       {
301         delete(items[0]);
302         item_count--;
303         items[0] = items[item_count];
304         break;
305       }
306     }
307   }
308   if(item_count == 1)
309   {
310     if(map_items > 0)
311     {
312       do
313       {
314         outputMap.insert(pair<string, string>(items[0]->date(), items[0]->line()));
315       } while(!items[0]->getLine() && outputMap.size() < map_items);
316 
317       if(items[0]->valid())
318       {
319         do
320         {
321           outputMap.insert(pair<string, string>(items[0]->date(), items[0]->line()));
322           CPCCHAR tmp = outputMap.begin()->second.c_str();
323           if(printf("%s\n", tmp) != int(strlen(tmp) + 1))
324           {
325             fprintf(stderr, "Can't write output!\n");
326               return ERR_OUTPUT;
327           }
328           outputMap.erase(outputMap.begin());
329         } while(!items[0]->getLine());
330       }
331       delete items[0];
332       while(!outputMap.empty())
333       {
334         CPCCHAR tmp = outputMap.begin()->second.c_str();
335         if(printf("%s\n", tmp) != int(strlen(tmp) + 1))
336         {
337           fprintf(stderr, "Can't write output!\n");
338             return ERR_OUTPUT;
339         }
340         outputMap.erase(outputMap.begin());
341       }
342     }
343     else
344     {
345       do
346       {
347         CPCCHAR tmp = items[0]->line();
348         if(printf("%s\n", tmp) != int(strlen(tmp) + 1))
349         {
350           fprintf(stderr, "Can't write output!\n");
351             return ERR_OUTPUT;
352         }
353       } while(!items[0]->getLine());
354     }
355   }
356   delete[] items;
357   return 0;
358 }
359