1 #include "Uri.h"
2 
3 #include "wincstring.h"
4 #include <sstream>
5 #include <cstdlib>
6 #include <cassert>
7 #include "tld.h"
8 
9 //#define DEBUG
10 #include "debug.h"
11 
12 using namespace std;
13 using namespace htmlcxx;
14 
/**
 * One row of the scheme table: a URI scheme name paired with the port that
 * scheme uses by default.
 */
struct schemes_t
{
    const char *name;            /**< scheme name, e.g. "http" */
    unsigned int default_port;   /**< default port for that scheme */
};
22 
/* Some WWW schemes and their default ports; this is basically /etc/services.
 * This will become global when the protocol abstraction comes.
 *
 * The table is searched linearly, so entries are ordered by their expected
 * frequency. The final entry has a NULL name and terminates the table; the
 * lookup functions in this file stop there and return 0, so the 0xFFFF stored
 * in the terminator is never returned by them.
 */
static schemes_t schemes[] =
{
    {"http",     Uri::URI_HTTP_DEFAULT_PORT},
    {"ftp",      Uri::URI_FTP_DEFAULT_PORT},
    {"https",    Uri::URI_HTTPS_DEFAULT_PORT},
    {"gopher",   Uri::URI_GOPHER_DEFAULT_PORT},
    {"ldap",     Uri::URI_LDAP_DEFAULT_PORT},
    {"nntp",     Uri::URI_NNTP_DEFAULT_PORT},
    {"snews",    Uri::URI_SNEWS_DEFAULT_PORT},
    {"imap",     Uri::URI_IMAP_DEFAULT_PORT},
    {"pop",      Uri::URI_POP_DEFAULT_PORT},
    {"sip",      Uri::URI_SIP_DEFAULT_PORT},
    {"rtsp",     Uri::URI_RTSP_DEFAULT_PORT},
    {"wais",     Uri::URI_WAIS_DEFAULT_PORT},
    {"z39.50r",  Uri::URI_WAIS_DEFAULT_PORT},
    {"z39.50s",  Uri::URI_WAIS_DEFAULT_PORT},
    {"prospero", Uri::URI_PROSPERO_DEFAULT_PORT},
    {"nfs",      Uri::URI_NFS_DEFAULT_PORT},
    {"tip",      Uri::URI_TIP_DEFAULT_PORT},
    {"acap",     Uri::URI_ACAP_DEFAULT_PORT},
    {"telnet",   Uri::URI_TELNET_DEFAULT_PORT},
    {"ssh",      Uri::URI_SSH_DEFAULT_PORT},
    { NULL, 0xFFFF }     /* table terminator (unknown scheme) */
};
51 
port_of_Scheme(const char * scheme_str)52 static unsigned int port_of_Scheme(const char *scheme_str)
53 {
54     schemes_t *scheme;
55 
56     if (scheme_str) {
57         for (scheme = schemes; scheme->name != NULL; ++scheme) {
58             if (strcasecmp(scheme_str, scheme->name) == 0) {
59                 return scheme->default_port;
60             }
61         }
62     }
63     return 0;
64 }
65 
/* uri_delims (below) is a 256-entry table indexed by character value; each
 * entry holds the T_* flag of the delimiter that character represents, or 0
 * when the character is not one of the interesting delimiters.  Note that we
 * even get compares for NUL for free -- it's just another delimiter.
 */

#define T_COLON           0x01        /* ':' */
#define T_SLASH           0x02        /* '/' */
#define T_QUESTION        0x04        /* '?' */
#define T_HASH            0x08        /* '#' */
#define T_NUL             0x80        /* '\0' */
76 
/* Delimiter table, one entry per possible byte value.  The comments this code
 * was imported with say the table is generated by gen_uri_delims.c and should
 * not be edited by hand.
 * Non-zero entries: '\0' (0), '#' (35), '/' (47), ':' (58), '?' (63).
 */
static const unsigned char uri_delims[256] = {
    T_NUL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,T_HASH,0,0,0,0,
    0,0,0,0,0,0,0,T_SLASH,0,0,0,0,0,0,0,0,0,0,T_COLON,0,
    0,0,0,T_QUESTION,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
94 
/* it works like this:
    if (uri_delims[ch] & NOTEND_foobar) {
        then ch is a delimiter that ends foobar
    }
   The scanning loops in Uri::init() therefore advance while the masked value
   is 0.  Despite the NOTEND_ prefix, each mask lists the characters that DO
   end the component.
*/

/* Note that we optimize the scheme scanning here, we cheat and let the
 * compiler know that it doesn't have to do the & masking: with a mask of
 * 0xff every non-zero table entry (any delimiter, including NUL) ends the
 * scheme.
 */
#define NOTEND_SCHEME     (0xff)
#define NOTEND_HOSTINFO   (T_SLASH | T_QUESTION | T_HASH | T_NUL)
#define NOTEND_PATH       (T_QUESTION | T_HASH | T_NUL)
107 
108 
/* Forward declaration: Uri::unparse() calls this helper before its definition
 * appears further down in this file.
 */
static size_t wwwPrefixOffset(const std::string& hostname);
110 
Uri()111 Uri::Uri()
112 : mScheme(), mUser(), mPassword(), mHostname(), mPath(), mQuery(), mFragment(), mExistsQuery(false), mExistsFragment(false), mPort(0)
113 {}
114 
Uri(const string & uri_str)115 Uri::Uri(const string &uri_str)
116 : mScheme(), mUser(), mPassword(), mHostname(), mPath(), mQuery(), mFragment(), mExistsQuery(false), mExistsFragment(false), mPort(0)
117 {
118 	init(uri_str);
119 }
120 
init(const string & uri_str)121 void Uri::init(const string &uri_str)
122 {
123 	DEBUGP("Parsing uri %s\n", uri_str.c_str());
124 
125 	if(uri_str.empty()) return;
126 	const char *uri = uri_str.c_str();
127 	const char *s;
128 	const char *s1;
129 	const char *hostinfo;
130 	char *endstr;
131 
132 	/* We assume the processor has a branch predictor like most --
133 	 * it assumes forward branches are untaken and backwards are taken.  That's
134 	 * the reason for the gotos.  -djg
135 	 */
136 	if (uri[0] == '/') {
137 		deal_with_path:
138 		DEBUGP("Dealing with path\n");
139 		/* we expect uri to point to first character of path ... remember
140 		 * that the path could be empty -- http://foobar?query for example
141 		 */
142 		s = uri;
143 		while ((uri_delims[*(unsigned char *)s] & NOTEND_PATH) == 0) {
144 			++s;
145 		}
146 		if (s != uri) {
147 			mPath.assign(uri, s - uri);
148 			DEBUGP("Path is %s\n", mPath.c_str());
149 		}
150 		if (*s == 0) {
151 			return;
152 		}
153 		if (*s == '?') {
154 			++s;
155 			s1 = strchr(s, '#');
156 			if (s1) {
157 				mFragment.assign(s1 + 1);
158 				mExistsFragment = true;
159 				DEBUGP("Fragment is %s\n", mFragment.c_str());
160 				mQuery.assign(s, s1 - s);
161 				mExistsQuery = true;
162 				DEBUGP("Query is %s\n", mQuery.c_str());
163 			}
164 			else {
165 				mQuery.assign(s);
166 				mExistsQuery = true;
167 				DEBUGP("Query is %s\n", mQuery.c_str());
168 			}
169 			return;
170 		}
171 		/* otherwise it's a fragment */
172 		mFragment.assign(s + 1);
173 		mExistsFragment = true;
174 		DEBUGP("Fragment is %s\n", mFragment.c_str());
175 		return;
176 	}
177 
178 	DEBUGP("Dealing with scheme\n");
179 	/* find the scheme: */
180 	if (!isalpha(*uri)) goto deal_with_path;
181 	s = uri;
182 	while ((uri_delims[*(unsigned char *)s] & NOTEND_SCHEME) == 0) {
183 		++s;
184 	}
185 	/* scheme must be non-empty and followed by :// */
186 	if (s == uri || s[0] != ':' || s[1] != '/' || s[2] != '/') {
187 		goto deal_with_path;        /* backwards predicted taken! */
188 	}
189 
190 	mScheme.assign(uri, s - uri);
191 	DEBUGP("Scheme is %s\n", mScheme.c_str());
192 	s += 3;
193 
194 	DEBUGP("Finding hostinfo\n");
195 	hostinfo = s;
196 	DEBUGP("Hostinfo is %s\n", hostinfo);
197 	while ((uri_delims[*(unsigned char *)s] & NOTEND_HOSTINFO) == 0) {
198 		++s;
199 	}
200 	uri = s;        /* whatever follows hostinfo is start of uri */
201 //	mHostinfo.assign(hostinfo, uri - hostinfo);
202 
203 	/* If there's a username:password@host:port, the @ we want is the last @...
204 	 * too bad there's no memrchr()... For the C purists, note that hostinfo
205 	 * is definately not the first character of the original uri so therefore
206 	 * &hostinfo[-1] < &hostinfo[0] ... and this loop is valid C.
207 	 */
208 	do {
209 		--s;
210 	} while (s >= hostinfo && *s != '@');
211 	if (s < hostinfo) {
212 		/* again we want the common case to be fall through */
213 deal_with_host:
214 		DEBUGP("Dealing with host\n");
215 		/* We expect hostinfo to point to the first character of
216 		 * the hostname.  If there's a port it is the first colon.
217 		 */
218 		s = (char *)memchr(hostinfo, ':', uri - hostinfo);
219 		if (s == NULL) {
220 			/* we expect the common case to have no port */
221 			mHostname.assign(hostinfo, uri - hostinfo);
222 			DEBUGP("Hostname is %s\n", mHostname.c_str());
223 			goto deal_with_path;
224 		}
225 		mHostname.assign(hostinfo, s - hostinfo);
226 		DEBUGP("Hostname is %s\n", mHostname.c_str());
227 		++s;
228 		if (uri != s) {
229 			mPortStr.assign(s, uri - s);
230 			mPort = strtol(mPortStr.c_str(), &endstr, 10);
231 			if (*endstr == '\0') {
232 				goto deal_with_path;
233 			}
234 			/* Invalid characters after ':' found */
235 			DEBUGP("Throwing invalid url exception\n");
236 			throw Exception("Invalid character after ':'");
237 		}
238 		this->mPort = port_of_Scheme(mScheme.c_str());
239 		goto deal_with_path;
240 	}
241 
242 	/* first colon delimits username:password */
243 	s1 = (char *)memchr(hostinfo, ':', s - hostinfo);
244 	if (s1) {
245 		mUser.assign(hostinfo, s1 - hostinfo);
246 		++s1;
247 		mPassword.assign(s1, s - s1);
248 	}
249 	else {
250 		mUser.assign(hostinfo, s - hostinfo);
251 	}
252 	hostinfo = s + 1;
253 	goto deal_with_host;
254 }
255 
~Uri()256 Uri::~Uri() {
257 }
258 
scheme() const259 string Uri::scheme() const { return mScheme; }
260 
scheme(string scheme)261 void Uri::scheme(string scheme) {
262 	mScheme = scheme;
263 }
user() const264 string Uri::user() const { return mUser; }
user(string user)265 void Uri::user(string user) {
266 	mUser = user;
267 }
password() const268 string Uri::password() const { return mPassword; }
password(string password)269 void Uri::password(string password) {
270 	mPassword = password;
271 }
hostname() const272 string Uri::hostname() const { return mHostname; }
hostname(string hostname)273 void Uri::hostname(string hostname) {
274 	mHostname = hostname;
275 }
path() const276 string Uri::path() const { return mPath; }
path(string path)277 void Uri::path(string path) {
278 	mPath = path;
279 }
existsFragment() const280 bool Uri::existsFragment() const { return mExistsFragment; }
existsFragment(bool existsFragment)281 void Uri::existsFragment(bool existsFragment) {
282 	mExistsFragment = existsFragment;
283 }
existsQuery() const284 bool Uri::existsQuery() const { return mExistsQuery; }
existsQuery(bool existsQuery)285 void Uri::existsQuery(bool existsQuery) {
286 	mExistsQuery = existsQuery;
287 }
query() const288 string Uri::query() const { return mQuery; }
query(string query)289 void Uri::query(string query) {
290 	mQuery = query;
291 }
fragment() const292 string Uri::fragment() const { return mFragment; }
fragment(string fragment)293 void Uri::fragment(string fragment) {
294 	mFragment = fragment;
295 }
port() const296 unsigned int Uri::port() const { return mPort; }
port(unsigned int port)297 void Uri::port(unsigned int port) { mPort = port; }
298 
/* File name stems that Uri::unparse() treats as a directory's default
 * document when REMOVE_DEFAULT_FILENAMES is set.  NULL-terminated.
 */
static const char *default_filenames[] = {
	"index",
	"default",
	NULL
};
/* Extensions a default document may carry.  NULL-terminated; searched in
 * this order.
 */
static const char *default_extensions[] = {
	".html",
	".htm",
	".php",
	".shtml",
	".asp",
	".cgi",
	NULL
};
301 
default_port_for_scheme(const char * scheme_str)302 static unsigned short default_port_for_scheme(const char *scheme_str)
303 {
304 	schemes_t *scheme;
305 
306 	if (scheme_str == NULL)
307 		return 0;
308 
309 	for (scheme = schemes; scheme->name != NULL; ++scheme)
310 		if (strcasecmp(scheme_str, scheme->name) == 0)
311 			return scheme->default_port;
312 
313 	return 0;
314 }
315 
/**
 * Resolve this (possibly relative) Uri against base.
 *
 * - If this Uri has a scheme it is already absolute: a copy is returned, with
 *   an empty path replaced by "/".
 * - Otherwise the result starts as a copy of base (empty path replaced by
 *   "/") and is then overridden by what this Uri carries:
 *     - a path starting with '/' replaces the base path;
 *     - any other non-empty path replaces everything after the last '/' of
 *       the base path (it is appended as-is when the base path has no '/');
 *     - when a path or a query is present, query and fragment (and their
 *       "exists" flags) are taken from this Uri;
 *     - when only a fragment is present, just the fragment is taken.
 *
 * "." and ".." segments are not collapsed.
 *
 * @param base Uri that supplies the components this Uri lacks
 * @return the resolved Uri
 */
Uri Uri::absolute(const Uri &base) const
{
	if (!mScheme.empty())
	{
		// Already absolute; only make sure the path is not empty.
		Uri self(*this);
		if (self.mPath.empty())
		{
			self.mPath = "/";
		}
		return self;
	}

	Uri resolved(base);
	if (resolved.mPath.empty())
	{
		resolved.mPath = "/";
	}

	const bool hasPath = !mPath.empty();
	if (hasPath)
	{
		if (mPath[0] == '/')
		{
			resolved.mPath = mPath;
		}
		else
		{
			// Keep the base path up to and including its last '/', then append.
			string merged(resolved.mPath);
			const string::size_type lastSlash = merged.rfind('/');
			if (lastSlash != string::npos)
			{
				merged.erase(lastSlash + 1);
			}
			merged += mPath;
			resolved.mPath = merged;
		}
	}

	if (hasPath || mExistsQuery)
	{
		resolved.mQuery = mQuery;
		resolved.mExistsQuery = mExistsQuery;
		resolved.mFragment = mFragment;
		resolved.mExistsFragment = mExistsFragment;
	}
	else if (mExistsFragment)
	{
		resolved.mFragment = mFragment;
		resolved.mExistsFragment = mExistsFragment;
	}

	return resolved;
}
374 
unparse(int flags) const375 string Uri::unparse(int flags ) const
376 {
377 	string ret;
378 	ret.reserve(mScheme.length() + mUser.length() + mPassword.length() + mHostname.length() + mPath.length() + mQuery.length() + mFragment.length() + mPortStr.length());
379 
380 	DEBUGP("Unparsing scheme\n");
381 	if(!(Uri::REMOVE_SCHEME & flags)) {
382 		if(!mScheme.empty()) {
383 			ret +=  mScheme;
384 			ret += "://";
385 		}
386 	}
387 	DEBUGP("Unparsing hostname\n");
388 	if(!mHostname.empty()) {
389 		size_t offset = 0;
390 		if(flags & Uri::REMOVE_WWW_PREFIX && mHostname.length() > 3) {
391 			offset = wwwPrefixOffset(mHostname);
392 		}
393 		ret += (mHostname.c_str() + offset);
394 	}
395 	DEBUGP("Unparsing port\n");
396 	if (!mPortStr.empty() && !(!mScheme.empty() && mPort == default_port_for_scheme(mScheme.c_str())))
397 	{
398 		ret += ':';
399 		ret += mPortStr;
400 	}
401 	DEBUGP("Unparsing path\n");
402 	if(!mPath.empty())
403 	{
404 		char *buf = new char[mPath.length() + 1];
405 		memcpy(buf, mPath.c_str(), mPath.length() + 1);
406 		if(flags & Uri::REMOVE_DEFAULT_FILENAMES) {
407 			const char **ptr = default_extensions;
408 			char *end = buf + mPath.length();
409 			size_t offset = 0;
410 			while(*ptr != NULL) {
411 				size_t len = strlen(*ptr);
412 				if((strcmp(end - len, *ptr)) == 0) {
413 					offset = len;
414 					break;
415 				}
416 				++ptr;
417 			}
418 			if(offset == 0) goto remove_bar;
419 			ptr = default_filenames;
420 			bool found = false;
421 			while(*ptr != NULL) {
422 				size_t len = strlen(*ptr);
423 				if(strncmp(end - offset - len, *ptr, len) == 0) {
424 					offset += len;
425 					found = true;
426 					break;
427 				}
428 				++ptr;
429 			}
430 			if(found) {
431 				*(end - offset) = 0; //cut filename
432 			}
433 
434 		}
435 		remove_bar:
436 		if(flags & Uri::REMOVE_TRAILING_BAR) {
437 			if(strlen(buf) > 1 && buf[strlen(buf) - 1] == '/') { //do not remove if path is only the bar
438 				buf[strlen(buf) - 1] = 0;
439 			}
440 		}
441 		ret += buf;
442 		delete [] buf;
443 	}
444 	DEBUGP("Unparsing query\n");
445 	if(!(flags & Uri::REMOVE_QUERY) && mExistsQuery) {
446 		ret += '?';
447 		if(flags & Uri::REMOVE_QUERY_VALUES) {
448 			const char *ptr = mQuery.c_str();
449 			bool inside = false;
450 			while(*ptr) {
451 				if(*ptr == '=') {
452 					inside = true;
453 				}
454 				if(*ptr == '&') {
455 					inside = false;
456 				}
457 				if(inside) {
458 					++ptr;
459 				} else {
460 					ret += *ptr;
461 					++ptr;
462 				}
463 			}
464 		} else {
465 			ret += mQuery;
466 		}
467 	}
468 	DEBUGP("Unparsing fragment\n");
469 	if(!(flags & Uri::REMOVE_FRAGMENT) && mExistsFragment)
470 	{
471 		ret += '#';
472 		ret += mFragment;
473 	}
474 
475 	return ret;
476 }
477 
/**
 * Length of a leading "www." or "www<digit>." prefix of hostname.
 *
 * The "www" part is matched case-insensitively.
 *
 * @param hostname host name to inspect
 * @return 4 for "www.", 5 for "www" + one digit + ".", otherwise 0
 */
static size_t wwwPrefixOffset(const std::string& hostname)
{
	const std::string::size_type len = hostname.length();
	// Both recognised prefixes need at least one character after "www".
	if(len <= 3 || strncasecmp("www", hostname.c_str(), 3) != 0)
	{
		return 0;
	}
	if(hostname[3] == '.')
	{
		return 4;
	}
	// isdigit() must not receive a negative value; plain char may be signed.
	if(len > 4 && isdigit(static_cast<unsigned char>(hostname[3])) && hostname[4] == '.')
	{
		return 5;
	}
	return 0;
}
494 
495 
496 
497 
canonicalHostname(unsigned int maxDepth) const498 std::string Uri::canonicalHostname(unsigned int maxDepth) const
499 {
500 
501 	size_t prefixOffset = wwwPrefixOffset(mHostname);
502 	size_t suffixOffset = tldOffset(mHostname.c_str());
503 	unsigned int depth = 0;
504 	string::const_iterator canonicalStart = mHostname.begin() + prefixOffset;
505 	string::const_iterator ptr = mHostname.begin();
506 	ptr += mHostname.length() - suffixOffset;
507 	while (depth < maxDepth && ptr > canonicalStart)
508 	{
509 		--ptr;
510 		if (*ptr == '.') ++depth;
511 	}
512 	if (*ptr == '.') ++ptr;
513 	return string(ptr, mHostname.end());
514 }
515 
decode(const std::string & uri)516 std::string Uri::decode(const std::string &uri)
517 {
518     //Note from RFC1630:  "Sequences which start with a percent sign
519     //but are not followed by two hexadecimal characters (0-9,A-F) are reserved
520     //for future extension"
521 	const unsigned char *ptr = (const unsigned char *)uri.c_str();
522 	string ret;
523 	ret.reserve(uri.length());
524 	for (; *ptr; ++ptr)
525 	{
526 		if (*ptr == '%')
527 		{
528 			if (*(ptr + 1))
529 			{
530 				char a = *(ptr + 1);
531 				char b = *(ptr + 2);
532 				if (!((a >= 0x30 && a < 0x40) || (a >= 0x41 && a < 0x47))) continue;
533 				if (!((b >= 0x30 && b < 0x40) || (b >= 0x41 && b < 0x47))) continue;
534 				char buf[3];
535 				buf[0] = a;
536 				buf[1] = b;
537 				buf[2] = 0;
538 				ret += (char)strtoul(buf, NULL, 16);
539 				ptr += 2;
540 				continue;
541 			}
542 		}
543 		ret += *ptr;
544 	}
545 	return ret;
546 }
547 
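//This table is generated by safechars.py; please do not edit it by hand. It is indexed by byte value: Uri::encode() copies a byte unchanged when its entry is non-zero and percent-escapes it otherwise.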
549 static const char safe[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
550 
551 
encode(const std::string & uri)552 std::string Uri::encode(const std::string &uri)
553 {
554 	string ret;
555 	const unsigned char *ptr = (const unsigned char *)uri.c_str();
556 	ret.reserve(uri.length());
557 
558 	for (; *ptr ; ++ptr)
559 	{
560 		if (!safe[*ptr])
561 		{
562 			char buf[5];
563 			memset(buf, 0, 5);
564 			snprintf(buf, 5, "%%%X", (*ptr));
565 			ret.append(buf);
566 		}
567 		else
568 		{
569 			ret += *ptr;
570 		}
571 	}
572 	return ret;
573 }
574 
575