1 
2 /* Web Polygraph       http://www.web-polygraph.org/
3  * Copyright 2003-2011 The Measurement Factory
4  * Licensed under the Apache License, Version 2.0 */
5 
6 #include "base/polygraph.h"
7 
8 #include "xstd/h/iostream.h"
9 #include <fstream>
10 #include <ctype.h>
11 
12 #include "xstd/NetAddr.h"
13 #include "base/AddrParsers.h"
14 #include "base/ForeignTrace.h"
15 
16 
ForeignTrace()17 ForeignTrace::ForeignTrace(): doIgnoreBad(true) {
18 }
19 
configure(const String & aName,bool ignoreBad)20 void ForeignTrace::configure(const String &aName, bool ignoreBad) {
21 	theName = aName;
22 	doIgnoreBad = ignoreBad;
23 }
24 
25 
gatherUrls(Array<String * > & urls) const26 int ForeignTrace::gatherUrls(Array<String*> &urls) const {
27 	ifstream is(theName.cstr());
28 	if (open(is)) {
29 		while (String *url = getUrl(is)) {
30 			urls.append(url);
31 			theMemSize += url->len();     // content
32 			theMemSize += SizeOf(String);  // overhead
33 			theMemSize += SizeOf(String*); // overhead
34 		}
35 		close(urls.count());
36 	}
37 	return urls.count();
38 }
39 
gatherHosts(Array<NetAddr * > & hosts) const40 int ForeignTrace::gatherHosts(Array<NetAddr*> &hosts) const {
41 	ifstream is(theName.cstr());
42 	if (open(is)) {
43 		while (String *url = getUrl(is)) {
44 			// extract host from the URL
45 			NetAddr host;
46 			const char *urlB = url->cstr();
47 			const char *urlE = urlB + url->len();
48 			if (const char *hostEnd = SkipHostInUri(urlB, urlE, host)) {
49 				hosts.append(new NetAddr(host));
50 				theMemSize += hostEnd - urlB;   // content
51 				theMemSize += SizeOf(NetAddr);  // overhead
52 				theMemSize += SizeOf(NetAddr*); // overhead
53 			}
54 			delete url;
55 		}
56 		close(hosts.count());
57 	}
58 	return hosts.count();
59 }
60 
getUrl(istream & is) const61 String *ForeignTrace::getUrl(istream &is) const {
62 	char line[16*1024];
63 	while (is.good() && is.getline(line, sizeof(line))) {
64 
65 		// delete comments
66 		if (char *comment = strchr(line, '#'))
67 			*comment = '\0';
68 
69 		// find first URL on the line
70 		const char *urlB;
71 		if ((urlB = strstr(line, "http://")) ||
72 			(urlB = strstr(line, "https://")) ||
73 			(urlB = strstr(line, "ftp://"))) {
74 			// find URL end (white space or eol)
75 			const char *urlE = urlB;
76 			while (*urlE && !isspace(*urlE))
77 				++urlE;
78 
79 			String *url = new String;
80 			url->append(urlB, urlE - urlB);
81 			++theEntryCount;
82 			return url;
83 		}
84 
85 		// skip leading spaces to avoid warning about empty lines
86 		urlB = line;
87 		while (*urlB && isspace(*urlB))
88 			++urlB;
89 
90 		if (*urlB && !doIgnoreBad) {
91 			cerr << here <<
92 				"error: all trace URLs must follow " <<
93 				"http|https|ftp://host/path format; " << endl <<
94 				"\tfound: " << urlB << endl;
95 		}
96 	}
97 
98 	return 0;
99 }
100 
open(istream & is) const101 bool ForeignTrace::open(istream &is) const {
102 	theMemSize = 0;
103 	theEntryCount = 0;
104 	if (!is) {
105 		cerr << "failed to open '" << theName << "' trace for reading: " <<
106 			Error::Last() << endl;
107 		return false;
108 	}
109 	return true;
110 }
111 
close(const int goodCount) const112 void ForeignTrace::close(const int goodCount) const {
113 	if (goodCount > 0)
114 		clog << "fyi: loaded trace from ";
115 	else
116 		clog << "warning: empty trace in ";
117 
118 	clog << "'" << theName << "': " <<
119 		"used " << goodCount << " entries out of " << theEntryCount << ", " <<
120 		"spent at least " << theMemSize << " bytes" <<
121 		endl;
122 }
123