1 #include <cstdio>
2 #include <unordered_map>
3 #include <cstdlib>
4 #include <map>
5 #include <cstring>
6 #include <unistd.h>
7 #include <cctype>
8 #include <string>
9
10 #include "logtools.h"
11
12 #define BUF_SIZE 4096
13
14 using namespace std;
15 #if !defined(_LIBCPP_VERSION)
16 using namespace __gnu_cxx;
17 #endif
18
// Hash functor for the NUL-terminated C-string keys of the month table.
// Only the first two bytes of the key take part in the hash; keys that share
// them collide and are told apart by the equality functor.
struct hashing_func {
  // key: NUL-terminated string (must not be a null pointer).
  // Returns (first byte << 8) + second byte, or 0 for an empty string.
  unsigned long operator()(const char *key) const
  {
    // Go through unsigned char so bytes >= 0x80 are not sign-extended.
    const unsigned char first = static_cast<unsigned char>(key[0]);
    if(first == '\0')
      return 0; // empty key: key[1] would lie past the terminator
    const unsigned char second = static_cast<unsigned char>(key[1]);
    return (static_cast<unsigned long>(first) << 8) + second;
  }
};
23
// Equality functor for C-string keys: two keys are equal when the text they
// point at is identical, regardless of the pointer values.
struct eqstr
{
  bool operator()(const char *s1, const char *s2) const
  {
    const int difference = strcmp(s1, s2);
    return !difference;
  }
};
29
30 unordered_map<const char *, const char *, hashing_func, eqstr> months;
31
32 class LogFile
33 {
34 public:
35 LogFile();
~LogFile()36 ~LogFile(){ if(m_fp) fclose(m_fp); }
37
38 // open a file, return 0 for success, 1 for minor error (0 length file)
39 // and 2 for serious error (file not found)
40 int open(const char *name, bool domain_mangling);
41
42 // get the date string of the current item
date() const43 const char *date() const { return m_date; }
44 // get the full data for the current item
line()45 const char *line() { m_valid = false; return m_line; }
46
47 // get a line from the file and parse it
48 // return 0 for success and 1 for EOF
49 int getLine();
50
valid() const51 bool valid() const { return m_valid; }
52
53 bool verbose;
54
55 private:
56 bool m_valid;
57 FILE *m_fp;
58 char m_date[17];
59 char m_lineBuf[BUF_SIZE];
60 char m_lineBuf2[BUF_SIZE + 7];
61 char *m_line;
62 bool m_web_first;
63
64 // store the date in numeric format so that strcmp() can be used to compare
65 // dates. Returns 1 for error and 0 for OK.
66 int setDate();
67
68 LogFile(const LogFile&);
69 LogFile & operator=(const LogFile&);
70 };
71
LogFile()72 LogFile::LogFile()
73 : m_valid(false)
74 , m_fp(NULL)
75 {
76 m_line = m_lineBuf;
77 m_lineBuf[0] = '\0';
78 }
79
80 // Common log format:
81 // 1.2.3.4 - - [23/Aug/2000:12:00:32 +0200] etc
setDate()82 int LogFile::setDate()
83 {
84 unsigned int i;
85 // find '[' or abort if we can't
86 for(i = 0; m_lineBuf[i] != '[' && m_lineBuf[i] != '\0'; i++)
87 { }
88
89 // if not enough data left for the full date then return
90 if(i + 21 > strlen(m_lineBuf))
91 return 1;
92 memcpy(m_date, &m_lineBuf[i + 8], 4);
93 char mon[4];
94 memcpy(mon, &m_lineBuf[i + 4], 3);
95 mon[3] = '\0';
96 const char *m = months[mon];
97 if(!m) return 1;
98 strcpy(&m_date[4], m);
99 memcpy(&m_date[6], &m_lineBuf[i + 1], 2);
100 memcpy(&m_date[8], &m_lineBuf[i + 13], 8);
101 m_date[16] = '\0';
102 if(m_web_first) // make m_lineBuf2 have the data for the mangled line
103 {
104 unsigned int end_webname;
105 // find where the domain name ends
106 for(end_webname = 0; m_lineBuf[end_webname] != ' ' && m_lineBuf[end_webname] != '\0'; end_webname++)
107 { }
108
109 if(end_webname >= i)
110 return 1;
111 for(i = 0; i < end_webname; i++)
112 m_lineBuf[end_webname] = tolower(m_lineBuf[end_webname]);
113
114 // there will be more than 40 chars in between
115 unsigned int start_url = end_webname + 40;
116 // search for the start quote character
117 for(; m_lineBuf[start_url] != '\"' && m_lineBuf[start_url] != '\0'; start_url++)
118 { }
119 // search for the space in the web request
120 for(; m_lineBuf[start_url] != ' ' && m_lineBuf[start_url] != '\0'; start_url++)
121 { }
122
123 if(strlen(&m_lineBuf[start_url]) < 6) return 1;
124
125 memcpy(m_lineBuf2, &m_lineBuf[end_webname + 1], start_url - end_webname);
126 m_line = &m_lineBuf2[start_url - end_webname]; // m_line points to next char
127 if(strncmp(&m_lineBuf[start_url + 1], "http://", 7))
128 {
129 strcpy(m_line, "http://");
130 m_line += 7;
131 memcpy(m_line, m_lineBuf, end_webname);
132 m_line += end_webname;
133 if(m_lineBuf[start_url + 1] != '/')
134 {
135 // if URL doesn't start with a '/' then we add one
136 *m_line = '/';
137 m_line++;
138 }
139 }
140 strcpy(m_line, &m_lineBuf[start_url + 1]);
141 m_line = m_lineBuf2;
142 }
143 return 0;
144 }
145
open(const char * name,bool domain_mangling)146 int LogFile::open(const char *name, bool domain_mangling)
147 {
148 m_web_first = domain_mangling;
149 m_fp = fopen(name, "r");
150 if(!m_fp)
151 {
152 fprintf(stderr, "Can't open %s.\n", name);
153 return 2;
154 }
155 if(getLine())
156 return 1;
157 return 0;
158 }
159
getLine()160 int LogFile::getLine()
161 {
162 while(1)
163 {
164 // if can't get more data then return 1
165 if(!fgets(m_lineBuf, sizeof(m_lineBuf) - 1, m_fp))
166 return 1;
167 m_lineBuf[sizeof(m_lineBuf) - 1] = '\0';
168 m_line = m_lineBuf;
169 strtok(m_line, "\n\r");
170 // if setDate() returns 1 then we can't parse the line so we keep looping
171 // if setDate() returns 0 then return success!
172 if(!setDate())
173 {
174 m_valid = true;
175 return 0;
176 }
177 if(verbose)
178 fprintf(stderr, "Skipping bad line: %s\n", m_line);
179 }
180 return 0; // to make compilers happy - will not be reached
181 }
182
183 typedef LogFile *PLogFile;
184
item_compare(const void * a,const void * b)185 int item_compare(const void *a, const void *b)
186 {
187 const LogFile * const left = *(LogFile * const *)a;
188 const LogFile * const right = *(LogFile * const *)b;
189 return strcmp(left->date(), right->date());
190 }
191
// Strict "less than" ordering for string keys, using strcmp() on the
// character data.
struct ltstr
{
  // The arguments are taken by const reference: taking them by value copied
  // both strings on every comparison the container made.
  bool operator()(const std::string &s1, const std::string &s2) const
  {
    return strcmp(s1.c_str(), s2.c_str()) < 0;
  }
};
199
usage(const char * const arg)200 void usage(const char *const arg)
201 {
202 fprintf(stderr, "usage: %s [OPTION] [filenames]", arg);
203 fprintf(stderr, "\n"
204 "This program merges web logs in common log format into a single stream\n"
205 "on standard output. It reads from multiple input files and outputs the\n"
206 "data in-order as much as is possible. If there is only a single input\n"
207 "file it will re-order it (with a 1000 line buffer size) to deal with web\n"
208 "servers that output data out of order.\n"
209 "\nVersion: " VERSION "\n");
210 exit(ERR_PARAM);
211 }
212
main(int argc,char ** argv)213 int main(int argc, char **argv)
214 {
215 if(argc == 1)
216 return 0;
217
218 unsigned int map_items = 0;
219 bool set_map_items = false, domain_mangling = false, verbose = false;
220 int int_c;
221 optind = 1;
222 while(-1 != (int_c = getopt(argc, argv, "b:hdv")) )
223 {
224 switch(char(int_c))
225 {
226 case '?':
227 case ':':
228 case 'h':
229 usage(argv[0]);
230 break;
231 case 'b':
232 set_map_items = true;
233 map_items = atoi(optarg);
234 break;
235 case 'd':
236 domain_mangling = true;
237 case 'v':
238 verbose = true;
239 break;
240 }
241 }
242 months["Jan"] = "01";
243 months["Feb"] = "02";
244 months["Mar"] = "03";
245 months["Apr"] = "04";
246 months["May"] = "05";
247 months["Jun"] = "06";
248 months["Jul"] = "07";
249 months["Aug"] = "08";
250 months["Sep"] = "09";
251 months["Oct"] = "10";
252 months["Nov"] = "11";
253 months["Dec"] = "12";
254
255 multimap<const string, const string, ltstr> outputMap;
256
257 LogFile **items = new PLogFile[argc - optind];
258
259 unsigned int item_count = 0;
260 int i;
261 for(i = optind; i < argc; i++)
262 {
263 items[item_count] = new LogFile;
264 items[item_count]->verbose = verbose;
265 int rc = items[item_count]->open(argv[i], domain_mangling);
266 // if rc==2 then file not found, if rc==1 then 0 length file
267 if(rc > 1)
268 return ERR_INPUT;
269 if(rc == 1)
270 delete items[item_count];
271 else
272 item_count++;
273 }
274
275 if(!set_map_items)
276 {
277 map_items = item_count * 400;
278 if(map_items < 4000)
279 map_items = 4000;
280 }
281 while(item_count > 1)
282 {
283 qsort(items, item_count, sizeof(LogFile *), item_compare);
284 while(items[0]->valid() && strcmp(items[0]->date(), items[1]->date()) <= 0)
285 {
286 if(map_items > 0)
287 {
288 outputMap.insert(pair<string, string>(items[0]->date(), items[0]->line()));
289 while(outputMap.size() > map_items)
290 {
291 printf("%s\n", outputMap.begin()->second.c_str());
292 outputMap.erase(outputMap.begin());
293 }
294 }
295 else
296 {
297 printf("%s\n", items[0]->line());
298 }
299 if(items[0]->getLine())
300 {
301 delete(items[0]);
302 item_count--;
303 items[0] = items[item_count];
304 break;
305 }
306 }
307 }
308 if(item_count == 1)
309 {
310 if(map_items > 0)
311 {
312 do
313 {
314 outputMap.insert(pair<string, string>(items[0]->date(), items[0]->line()));
315 } while(!items[0]->getLine() && outputMap.size() < map_items);
316
317 if(items[0]->valid())
318 {
319 do
320 {
321 outputMap.insert(pair<string, string>(items[0]->date(), items[0]->line()));
322 CPCCHAR tmp = outputMap.begin()->second.c_str();
323 if(printf("%s\n", tmp) != int(strlen(tmp) + 1))
324 {
325 fprintf(stderr, "Can't write output!\n");
326 return ERR_OUTPUT;
327 }
328 outputMap.erase(outputMap.begin());
329 } while(!items[0]->getLine());
330 }
331 delete items[0];
332 while(!outputMap.empty())
333 {
334 CPCCHAR tmp = outputMap.begin()->second.c_str();
335 if(printf("%s\n", tmp) != int(strlen(tmp) + 1))
336 {
337 fprintf(stderr, "Can't write output!\n");
338 return ERR_OUTPUT;
339 }
340 outputMap.erase(outputMap.begin());
341 }
342 }
343 else
344 {
345 do
346 {
347 CPCCHAR tmp = items[0]->line();
348 if(printf("%s\n", tmp) != int(strlen(tmp) + 1))
349 {
350 fprintf(stderr, "Can't write output!\n");
351 return ERR_OUTPUT;
352 }
353 } while(!items[0]->getLine());
354 }
355 }
356 delete[] items;
357 return 0;
358 }
359