1 /* Licensed to the Apache Software Foundation (ASF) under one or more
2  * contributor license agreements.  See the NOTICE file distributed with
3  * this work for additional information regarding copyright ownership.
4  * The ASF licenses this file to You under the Apache License, Version 2.0
5  * (the "License"); you may not use this file except in compliance with
6  * the License.  You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /*
18  * logresolve 2.0
19  *
20  * Tom Rathborne - tomr uunet.ca - http://www.uunet.ca/~tomr/
21  * UUNET Canada, April 16, 1995
22  *
23  * Rewritten by David Robinson. (drtr ast.cam.ac.uk)
24  * Rewritten again, and ported to APR by Colm MacCarthaigh
25  *
26  * Usage: logresolve [-s filename] [-c] < access_log > new_log
27  *
28  * Arguments:
29  *    -s filename     name of a file to record statistics
30  *    -c              check the DNS for a matching A record for the host.
31  *
32  * Notes:             (For historical interest)
33  *
34  * To generate meaningful statistics from an HTTPD log file, it's good
35  * to have the domain name of each machine that accessed your site, but
36  * doing this on the fly can slow HTTPD down.
37  *
38  * Compiling NCSA HTTPD with the -DMINIMAL_DNS flag turns IP#->hostname
39  * resolution off. Before running your stats program, just run your log
40  * file through this program (logresolve) and all of your IP numbers will
41  * be resolved into hostnames (where possible).
42  *
43  * logresolve takes an HTTPD access log (in the COMMON log file format,
44  * or any other format that has the IP number/domain name as the first
45  * field for that matter), and outputs the same file with all of the
46  * domain names looked up. Where no domain name can be found, the IP
47  * number is left in.
48  *
49  * To minimize impact on your nameserver, logresolve has its very own
50  * internal hash-table cache. This means that each IP number will only
51  * be looked up the first time it is found in the log file.
52  *
53  * The -c option causes logresolve to apply the same check as httpd
54  * compiled with -DMAXIMUM_DNS; after finding the hostname from the IP
55  * address, it looks up the IP addresses for the hostname and checks
56  * that one of these matches the original address.
57  */
58 
59 #include "apr.h"
60 #include "apr_lib.h"
61 #include "apr_hash.h"
62 #include "apr_getopt.h"
63 #include "apr_strings.h"
64 #include "apr_file_io.h"
65 #include "apr_network_io.h"
66 
67 #if APR_HAVE_STDLIB_H
68 #include <stdlib.h>
69 #endif
70 
/*
 * Sizes, in bytes, of the stdin buffer, the stdout buffer and the buffer
 * that holds one log line.  Each expansion is parenthesized so that it
 * behaves as a single operand wherever it is used (for example on the
 * right-hand side of a division).
 */
#define READ_BUF_SIZE  (128 * 1024)
#define WRITE_BUF_SIZE (128 * 1024)
#define LINE_BUF_SIZE  (128 * 1024)
74 
75 static apr_file_t *errfile;
76 static const char *shortname = "logresolve";
77 static apr_hash_t *cache;
78 
79 /* Statistics */
80 static int cachehits = 0;
81 static int cachesize = 0;
82 static int entries = 0;
83 static int resolves = 0;
84 static int withname = 0;
85 static int doublefailed = 0;
86 static int noreverse = 0;
87 
88 /*
89  * prints various statistics to output
90  */
91 #define NL APR_EOL_STR
print_statistics(apr_file_t * output)92 static void print_statistics (apr_file_t *output)
93 {
94     apr_file_printf(output, "logresolve Statistics:" NL);
95     apr_file_printf(output, "Entries: %d" NL, entries);
96     apr_file_printf(output, "    With name   : %d" NL, withname);
97     apr_file_printf(output, "    Resolves    : %d" NL, resolves);
98 
99     if (noreverse) {
100         apr_file_printf(output, "    - No reverse : %d" NL,
101                         noreverse);
102     }
103 
104     if (doublefailed) {
105         apr_file_printf(output, "    - Double lookup failed : %d" NL,
106                         doublefailed);
107     }
108 
109     apr_file_printf(output, "Cache hits      : %d" NL, cachehits);
110     apr_file_printf(output, "Cache size      : %d" NL, cachesize);
111 }
112 
113 /*
114  * usage info
115  */
usage(void)116 static void usage(void)
117 {
118     apr_file_printf(errfile,
119     "%s -- Resolve IP-addresses to hostnames in Apache log files."           NL
120     "Usage: %s [-s STATFILE] [-c]"                                           NL
121                                                                              NL
122     "Options:"                                                               NL
123     "  -s   Record statistics to STATFILE when finished."                    NL
124                                                                              NL
125     "  -c   Perform double lookups when resolving IP addresses."             NL,
126     shortname, shortname);
127     exit(1);
128 }
129 #undef NL
130 
main(int argc,const char * const argv[])131 int main(int argc, const char * const argv[])
132 {
133     apr_file_t * outfile;
134     apr_file_t * infile;
135     apr_getopt_t * o;
136     apr_pool_t * pool;
137     apr_pool_t *pline;
138     apr_status_t status;
139     const char * arg;
140     char * stats = NULL;
141     char * inbuffer;
142     char * outbuffer;
143     char * line;
144     int doublelookups = 0;
145 
146     if (apr_app_initialize(&argc, &argv, NULL) != APR_SUCCESS) {
147         return 1;
148     }
149     atexit(apr_terminate);
150 
151     if (argc) {
152         shortname = apr_filepath_name_get(argv[0]);
153     }
154 
155     if (apr_pool_create(&pool, NULL) != APR_SUCCESS) {
156         return 1;
157     }
158     apr_file_open_stderr(&errfile, pool);
159     apr_getopt_init(&o, pool, argc, argv);
160 
161     while (1) {
162         char opt;
163         status = apr_getopt(o, "s:c", &opt, &arg);
164         if (status == APR_EOF) {
165             break;
166         }
167         else if (status != APR_SUCCESS) {
168             usage();
169         }
170         else {
171             switch (opt) {
172             case 'c':
173                 if (doublelookups) {
174                     usage();
175                 }
176                 doublelookups = 1;
177                 break;
178             case 's':
179                 if (stats) {
180                     usage();
181                 }
182                 stats = apr_pstrdup(pool, arg);
183                 break;
184             } /* switch */
185         } /* else */
186     } /* while */
187 
188     apr_file_open_stdout(&outfile, pool);
189     apr_file_open_stdin(&infile, pool);
190 
191     /* Allocate two new 10k file buffers */
192     if (   (outbuffer = apr_palloc(pool, WRITE_BUF_SIZE)) == NULL
193         || (inbuffer  = apr_palloc(pool, READ_BUF_SIZE))  == NULL
194         || (line      = apr_palloc(pool, LINE_BUF_SIZE))  == NULL) {
195         return 1;
196     }
197 
198     /* Set the buffers */
199     apr_file_buffer_set(infile, inbuffer, READ_BUF_SIZE);
200     apr_file_buffer_set(outfile, outbuffer, WRITE_BUF_SIZE);
201 
202     cache = apr_hash_make(pool);
203     if (apr_pool_create(&pline, pool) != APR_SUCCESS) {
204         return 1;
205     }
206 
207     while (apr_file_gets(line, LINE_BUF_SIZE, infile) == APR_SUCCESS) {
208         char *hostname;
209         char *space;
210         apr_sockaddr_t *ip;
211         apr_sockaddr_t *ipdouble;
212         char dummy[] = " " APR_EOL_STR;
213 
214         if (line[0] == '\0') {
215             continue;
216         }
217 
218         /* Count our log entries */
219         entries++;
220 
221         /* Check if this could even be an IP address */
222         if (!apr_isxdigit(line[0]) && line[0] != ':') {
223             withname++;
224             apr_file_puts(line, outfile);
225             continue;
226         }
227 
228         /* Terminate the line at the next space */
229         if ((space = strchr(line, ' ')) != NULL) {
230             *space = '\0';
231         }
232         else {
233             space = dummy;
234         }
235 
236         /* See if we have it in our cache */
237         hostname = (char *) apr_hash_get(cache, line, APR_HASH_KEY_STRING);
238         if (hostname) {
239             apr_file_printf(outfile, "%s %s", hostname, space + 1);
240             cachehits++;
241             continue;
242         }
243 
244         /* Parse the IP address */
245         status = apr_sockaddr_info_get(&ip, line, APR_UNSPEC, 0, 0, pline);
246         if (status != APR_SUCCESS) {
247             /* Not an IP address */
248             withname++;
249             *space = ' ';
250             apr_file_puts(line, outfile);
251             continue;
252         }
253 
254         /* This does not make much sense, but historically "resolves" means
255          * "parsed as an IP address". It does not mean we actually resolved
256          * the IP address into a hostname.
257          */
258         resolves++;
259 
260         /* From here on our we cache each result, even if it was not
261          * successful
262          */
263         cachesize++;
264 
265         /* Try and perform a reverse lookup */
266         status = apr_getnameinfo(&hostname, ip, 0) != APR_SUCCESS;
267         if (status || hostname == NULL) {
268             /* Could not perform a reverse lookup */
269             *space = ' ';
270             apr_file_puts(line, outfile);
271             noreverse++;
272 
273             /* Add to cache */
274             *space = '\0';
275             apr_hash_set(cache, line, APR_HASH_KEY_STRING,
276                          apr_pstrdup(apr_hash_pool_get(cache), line));
277             continue;
278         }
279 
280         /* Perform a double lookup */
281         if (doublelookups) {
282             /* Do a forward lookup on our hostname, and see if that matches our
283              * original IP address.
284              */
285             status = apr_sockaddr_info_get(&ipdouble, hostname, ip->family, 0,
286                                            0, pline);
287             if (status != APR_SUCCESS ||
288                 memcmp(ipdouble->ipaddr_ptr, ip->ipaddr_ptr, ip->ipaddr_len)) {
289                 /* Double-lookup failed  */
290                 *space = ' ';
291                 apr_file_puts(line, outfile);
292                 doublefailed++;
293 
294                 /* Add to cache */
295                 *space = '\0';
296                 apr_hash_set(cache, line, APR_HASH_KEY_STRING,
297                              apr_pstrdup(apr_hash_pool_get(cache), line));
298                 continue;
299             }
300         }
301 
302         /* Output the resolved name */
303         apr_file_printf(outfile, "%s %s", hostname, space + 1);
304 
305         /* Store it in the cache */
306         apr_hash_set(cache, line, APR_HASH_KEY_STRING,
307                      apr_pstrdup(apr_hash_pool_get(cache), hostname));
308 
309         apr_pool_clear(pline);
310     }
311 
312     /* Flush any remaining output */
313     apr_file_flush(outfile);
314 
315     if (stats) {
316         apr_file_t *statsfile;
317         if (apr_file_open(&statsfile, stats,
318                           APR_FOPEN_WRITE | APR_FOPEN_CREATE | APR_FOPEN_TRUNCATE,
319                           APR_OS_DEFAULT, pool) != APR_SUCCESS) {
320             apr_file_printf(errfile, "%s: Could not open %s for writing.",
321                             shortname, stats);
322             return 1;
323         }
324         print_statistics(statsfile);
325         apr_file_close(statsfile);
326     }
327 
328     return 0;
329 }
330