1 #include "Uri.h"
2
3 #include "wincstring.h"
4 #include <sstream>
5 #include <cstdlib>
6 #include <cassert>
7 #include "tld.h"
8
9 //#define DEBUG
10 #include "debug.h"
11
12 using namespace std;
13 using namespace htmlcxx;
14
/** Associates a URI scheme name with the port that scheme uses by default. */
struct schemes_t
{
    const char *name;           /**< Scheme name; NULL in the entry that terminates a table. */
    unsigned int default_port;  /**< Default port for the scheme. */
};
22
23 /* Some WWW schemes and their default ports; this is basically /etc/services */
24 /* This will become global when the protocol abstraction comes */
25 /* As the schemes are searched by a linear search, */
26 /* they are sorted by their expected frequency */
27 static schemes_t schemes[] =
28 {
29 {"http", Uri::URI_HTTP_DEFAULT_PORT},
30 {"ftp", Uri::URI_FTP_DEFAULT_PORT},
31 {"https", Uri::URI_HTTPS_DEFAULT_PORT},
32 {"gopher", Uri::URI_GOPHER_DEFAULT_PORT},
33 {"ldap", Uri::URI_LDAP_DEFAULT_PORT},
34 {"nntp", Uri::URI_NNTP_DEFAULT_PORT},
35 {"snews", Uri::URI_SNEWS_DEFAULT_PORT},
36 {"imap", Uri::URI_IMAP_DEFAULT_PORT},
37 {"pop", Uri::URI_POP_DEFAULT_PORT},
38 {"sip", Uri::URI_SIP_DEFAULT_PORT},
39 {"rtsp", Uri::URI_RTSP_DEFAULT_PORT},
40 {"wais", Uri::URI_WAIS_DEFAULT_PORT},
41 {"z39.50r", Uri::URI_WAIS_DEFAULT_PORT},
42 {"z39.50s", Uri::URI_WAIS_DEFAULT_PORT},
43 {"prospero", Uri::URI_PROSPERO_DEFAULT_PORT},
44 {"nfs", Uri::URI_NFS_DEFAULT_PORT},
45 {"tip", Uri::URI_TIP_DEFAULT_PORT},
46 {"acap", Uri::URI_ACAP_DEFAULT_PORT},
47 {"telnet", Uri::URI_TELNET_DEFAULT_PORT},
48 {"ssh", Uri::URI_SSH_DEFAULT_PORT},
49 { NULL, 0xFFFF } /* unknown port */
50 };
51
port_of_Scheme(const char * scheme_str)52 static unsigned int port_of_Scheme(const char *scheme_str)
53 {
54 schemes_t *scheme;
55
56 if (scheme_str) {
57 for (scheme = schemes; scheme->name != NULL; ++scheme) {
58 if (strcasecmp(scheme_str, scheme->name) == 0) {
59 return scheme->default_port;
60 }
61 }
62 }
63 return 0;
64 }
65
/* uri_delims[] is indexed by an (unsigned) character value and tells us
 * whether that character is one of the interesting delimiters.  Note that we
 * even get the compare against NUL for free -- it is just another delimiter.
 */

#define T_COLON 0x01 /* ':' */
#define T_SLASH 0x02 /* '/' */
#define T_QUESTION 0x04 /* '?' */
#define T_HASH 0x08 /* '#' */
#define T_NUL 0x80 /* '\0' */

/* The table below was originally produced by a generator (gen_uri_delims);
 * it is laid out here as 16 rows of 16 entries, one row per high nibble.
 */
static const unsigned char uri_delims[256] = {
    /* 0x00 */ T_NUL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,T_HASH,0,0,0,0,0,0,0,0,0,0,0,T_SLASH,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,T_COLON,0,0,0,0,T_QUESTION,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xD0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xE0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xF0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};

/* The masks are used like this:
     while ((uri_delims[ch] & NOTEND_foobar) == 0) {
         ch is not a delimiter that ends foobar, keep scanning
     }
   A non-zero result means ch terminates the component.
 */

/* Note that we optimize the scheme scanning here: the mask has every bit
 * set, so any delimiter ends the scheme and the compiler does not actually
 * have to do the & masking.
 */
#define NOTEND_SCHEME (0xff)
#define NOTEND_HOSTINFO (T_SLASH | T_QUESTION | T_HASH | T_NUL)
#define NOTEND_PATH (T_QUESTION | T_HASH | T_NUL)
107
108
/* Forward declaration: the definition appears further down, after its first use. */
static size_t wwwPrefixOffset(const std::string &hostname);
110
Uri()111 Uri::Uri()
112 : mScheme(), mUser(), mPassword(), mHostname(), mPath(), mQuery(), mFragment(), mExistsQuery(false), mExistsFragment(false), mPort(0)
113 {}
114
Uri(const string & uri_str)115 Uri::Uri(const string &uri_str)
116 : mScheme(), mUser(), mPassword(), mHostname(), mPath(), mQuery(), mFragment(), mExistsQuery(false), mExistsFragment(false), mPort(0)
117 {
118 init(uri_str);
119 }
120
init(const string & uri_str)121 void Uri::init(const string &uri_str)
122 {
123 DEBUGP("Parsing uri %s\n", uri_str.c_str());
124
125 if(uri_str.empty()) return;
126 const char *uri = uri_str.c_str();
127 const char *s;
128 const char *s1;
129 const char *hostinfo;
130 char *endstr;
131
132 /* We assume the processor has a branch predictor like most --
133 * it assumes forward branches are untaken and backwards are taken. That's
134 * the reason for the gotos. -djg
135 */
136 if (uri[0] == '/') {
137 deal_with_path:
138 DEBUGP("Dealing with path\n");
139 /* we expect uri to point to first character of path ... remember
140 * that the path could be empty -- http://foobar?query for example
141 */
142 s = uri;
143 while ((uri_delims[*(unsigned char *)s] & NOTEND_PATH) == 0) {
144 ++s;
145 }
146 if (s != uri) {
147 mPath.assign(uri, s - uri);
148 DEBUGP("Path is %s\n", mPath.c_str());
149 }
150 if (*s == 0) {
151 return;
152 }
153 if (*s == '?') {
154 ++s;
155 s1 = strchr(s, '#');
156 if (s1) {
157 mFragment.assign(s1 + 1);
158 mExistsFragment = true;
159 DEBUGP("Fragment is %s\n", mFragment.c_str());
160 mQuery.assign(s, s1 - s);
161 mExistsQuery = true;
162 DEBUGP("Query is %s\n", mQuery.c_str());
163 }
164 else {
165 mQuery.assign(s);
166 mExistsQuery = true;
167 DEBUGP("Query is %s\n", mQuery.c_str());
168 }
169 return;
170 }
171 /* otherwise it's a fragment */
172 mFragment.assign(s + 1);
173 mExistsFragment = true;
174 DEBUGP("Fragment is %s\n", mFragment.c_str());
175 return;
176 }
177
178 DEBUGP("Dealing with scheme\n");
179 /* find the scheme: */
180 if (!isalpha(*uri)) goto deal_with_path;
181 s = uri;
182 while ((uri_delims[*(unsigned char *)s] & NOTEND_SCHEME) == 0) {
183 ++s;
184 }
185 /* scheme must be non-empty and followed by :// */
186 if (s == uri || s[0] != ':' || s[1] != '/' || s[2] != '/') {
187 goto deal_with_path; /* backwards predicted taken! */
188 }
189
190 mScheme.assign(uri, s - uri);
191 DEBUGP("Scheme is %s\n", mScheme.c_str());
192 s += 3;
193
194 DEBUGP("Finding hostinfo\n");
195 hostinfo = s;
196 DEBUGP("Hostinfo is %s\n", hostinfo);
197 while ((uri_delims[*(unsigned char *)s] & NOTEND_HOSTINFO) == 0) {
198 ++s;
199 }
200 uri = s; /* whatever follows hostinfo is start of uri */
201 // mHostinfo.assign(hostinfo, uri - hostinfo);
202
203 /* If there's a username:password@host:port, the @ we want is the last @...
204 * too bad there's no memrchr()... For the C purists, note that hostinfo
205 * is definately not the first character of the original uri so therefore
206 * &hostinfo[-1] < &hostinfo[0] ... and this loop is valid C.
207 */
208 do {
209 --s;
210 } while (s >= hostinfo && *s != '@');
211 if (s < hostinfo) {
212 /* again we want the common case to be fall through */
213 deal_with_host:
214 DEBUGP("Dealing with host\n");
215 /* We expect hostinfo to point to the first character of
216 * the hostname. If there's a port it is the first colon.
217 */
218 s = (char *)memchr(hostinfo, ':', uri - hostinfo);
219 if (s == NULL) {
220 /* we expect the common case to have no port */
221 mHostname.assign(hostinfo, uri - hostinfo);
222 DEBUGP("Hostname is %s\n", mHostname.c_str());
223 goto deal_with_path;
224 }
225 mHostname.assign(hostinfo, s - hostinfo);
226 DEBUGP("Hostname is %s\n", mHostname.c_str());
227 ++s;
228 if (uri != s) {
229 mPortStr.assign(s, uri - s);
230 mPort = strtol(mPortStr.c_str(), &endstr, 10);
231 if (*endstr == '\0') {
232 goto deal_with_path;
233 }
234 /* Invalid characters after ':' found */
235 DEBUGP("Throwing invalid url exception\n");
236 throw Exception("Invalid character after ':'");
237 }
238 this->mPort = port_of_Scheme(mScheme.c_str());
239 goto deal_with_path;
240 }
241
242 /* first colon delimits username:password */
243 s1 = (char *)memchr(hostinfo, ':', s - hostinfo);
244 if (s1) {
245 mUser.assign(hostinfo, s1 - hostinfo);
246 ++s1;
247 mPassword.assign(s1, s - s1);
248 }
249 else {
250 mUser.assign(hostinfo, s - hostinfo);
251 }
252 hostinfo = s + 1;
253 goto deal_with_host;
254 }
255
~Uri()256 Uri::~Uri() {
257 }
258
scheme() const259 string Uri::scheme() const { return mScheme; }
260
scheme(string scheme)261 void Uri::scheme(string scheme) {
262 mScheme = scheme;
263 }
user() const264 string Uri::user() const { return mUser; }
user(string user)265 void Uri::user(string user) {
266 mUser = user;
267 }
password() const268 string Uri::password() const { return mPassword; }
password(string password)269 void Uri::password(string password) {
270 mPassword = password;
271 }
hostname() const272 string Uri::hostname() const { return mHostname; }
hostname(string hostname)273 void Uri::hostname(string hostname) {
274 mHostname = hostname;
275 }
path() const276 string Uri::path() const { return mPath; }
path(string path)277 void Uri::path(string path) {
278 mPath = path;
279 }
existsFragment() const280 bool Uri::existsFragment() const { return mExistsFragment; }
existsFragment(bool existsFragment)281 void Uri::existsFragment(bool existsFragment) {
282 mExistsFragment = existsFragment;
283 }
existsQuery() const284 bool Uri::existsQuery() const { return mExistsQuery; }
existsQuery(bool existsQuery)285 void Uri::existsQuery(bool existsQuery) {
286 mExistsQuery = existsQuery;
287 }
query() const288 string Uri::query() const { return mQuery; }
query(string query)289 void Uri::query(string query) {
290 mQuery = query;
291 }
fragment() const292 string Uri::fragment() const { return mFragment; }
fragment(string fragment)293 void Uri::fragment(string fragment) {
294 mFragment = fragment;
295 }
port() const296 unsigned int Uri::port() const { return mPort; }
port(unsigned int port)297 void Uri::port(unsigned int port) { mPort = port; }
298
/* File names (without extension) regarded as a directory's default document.
 * NULL terminates the list.
 */
static const char *default_filenames[] = {
    "index",
    "default",
    NULL
};

/* Extensions a default document may carry.  NULL terminates the list. */
static const char *default_extensions[] = {
    ".html",
    ".htm",
    ".php",
    ".shtml",
    ".asp",
    ".cgi",
    NULL
};
301
default_port_for_scheme(const char * scheme_str)302 static unsigned short default_port_for_scheme(const char *scheme_str)
303 {
304 schemes_t *scheme;
305
306 if (scheme_str == NULL)
307 return 0;
308
309 for (scheme = schemes; scheme->name != NULL; ++scheme)
310 if (strcasecmp(scheme_str, scheme->name) == 0)
311 return scheme->default_port;
312
313 return 0;
314 }
315
/**
 * Resolves this URI against base and returns the result as a new Uri.
 *
 * - If this URI has a scheme it is returned as is, except that an empty
 *   path is replaced by "/".
 * - Otherwise the result starts as a copy of base (with an empty base path
 *   replaced by "/") and is then adjusted:
 *     - empty path here: base's path is kept; query and fragment are taken
 *       from this URI if it has a query, only the fragment if it merely has
 *       a fragment, and nothing is changed if it has neither;
 *     - path starting with '/': path, query and fragment come from this URI;
 *     - any other path: it is appended to base's path after everything
 *       behind the last '/' has been cut off; query and fragment come from
 *       this URI.
 *
 * "." and ".." segments are not collapsed.
 *
 * @param base the URI relative references are resolved against
 * @return the resolved URI
 */
Uri Uri::absolute(const Uri &base) const
{
    // A URI that carries its own scheme does not depend on the base.
    if (!mScheme.empty())
    {
        if (!mPath.empty()) return *this;

        Uri withRootPath(*this);
        withRootPath.mPath = "/";
        return withRootPath;
    }

    Uri resolved(base);
    if (resolved.mPath.empty()) resolved.mPath = "/";

    if (mPath.empty())
    {
        if (!mExistsQuery)
        {
            // Fragment-only (or completely empty) reference.
            if (mExistsFragment)
            {
                resolved.mFragment = mFragment;
                resolved.mExistsFragment = mExistsFragment;
            }
            return resolved;
        }
        // Query-only reference: keep the base path, replace query and fragment below.
    }
    else if (mPath[0] == '/')
    {
        resolved.mPath = mPath;
    }
    else
    {
        // Relative path: drop the last segment of the base path, then append.
        const string::size_type lastSlash = resolved.mPath.rfind("/");
        if (lastSlash != string::npos) resolved.mPath.erase(lastSlash + 1);
        resolved.mPath += mPath;
    }

    resolved.mQuery = mQuery;
    resolved.mExistsQuery = mExistsQuery;
    resolved.mFragment = mFragment;
    resolved.mExistsFragment = mExistsFragment;
    return resolved;
}
374
unparse(int flags) const375 string Uri::unparse(int flags ) const
376 {
377 string ret;
378 ret.reserve(mScheme.length() + mUser.length() + mPassword.length() + mHostname.length() + mPath.length() + mQuery.length() + mFragment.length() + mPortStr.length());
379
380 DEBUGP("Unparsing scheme\n");
381 if(!(Uri::REMOVE_SCHEME & flags)) {
382 if(!mScheme.empty()) {
383 ret += mScheme;
384 ret += "://";
385 }
386 }
387 DEBUGP("Unparsing hostname\n");
388 if(!mHostname.empty()) {
389 size_t offset = 0;
390 if(flags & Uri::REMOVE_WWW_PREFIX && mHostname.length() > 3) {
391 offset = wwwPrefixOffset(mHostname);
392 }
393 ret += (mHostname.c_str() + offset);
394 }
395 DEBUGP("Unparsing port\n");
396 if (!mPortStr.empty() && !(!mScheme.empty() && mPort == default_port_for_scheme(mScheme.c_str())))
397 {
398 ret += ':';
399 ret += mPortStr;
400 }
401 DEBUGP("Unparsing path\n");
402 if(!mPath.empty())
403 {
404 char *buf = new char[mPath.length() + 1];
405 memcpy(buf, mPath.c_str(), mPath.length() + 1);
406 if(flags & Uri::REMOVE_DEFAULT_FILENAMES) {
407 const char **ptr = default_extensions;
408 char *end = buf + mPath.length();
409 size_t offset = 0;
410 while(*ptr != NULL) {
411 size_t len = strlen(*ptr);
412 if((strcmp(end - len, *ptr)) == 0) {
413 offset = len;
414 break;
415 }
416 ++ptr;
417 }
418 if(offset == 0) goto remove_bar;
419 ptr = default_filenames;
420 bool found = false;
421 while(*ptr != NULL) {
422 size_t len = strlen(*ptr);
423 if(strncmp(end - offset - len, *ptr, len) == 0) {
424 offset += len;
425 found = true;
426 break;
427 }
428 ++ptr;
429 }
430 if(found) {
431 *(end - offset) = 0; //cut filename
432 }
433
434 }
435 remove_bar:
436 if(flags & Uri::REMOVE_TRAILING_BAR) {
437 if(strlen(buf) > 1 && buf[strlen(buf) - 1] == '/') { //do not remove if path is only the bar
438 buf[strlen(buf) - 1] = 0;
439 }
440 }
441 ret += buf;
442 delete [] buf;
443 }
444 DEBUGP("Unparsing query\n");
445 if(!(flags & Uri::REMOVE_QUERY) && mExistsQuery) {
446 ret += '?';
447 if(flags & Uri::REMOVE_QUERY_VALUES) {
448 const char *ptr = mQuery.c_str();
449 bool inside = false;
450 while(*ptr) {
451 if(*ptr == '=') {
452 inside = true;
453 }
454 if(*ptr == '&') {
455 inside = false;
456 }
457 if(inside) {
458 ++ptr;
459 } else {
460 ret += *ptr;
461 ++ptr;
462 }
463 }
464 } else {
465 ret += mQuery;
466 }
467 }
468 DEBUGP("Unparsing fragment\n");
469 if(!(flags & Uri::REMOVE_FRAGMENT) && mExistsFragment)
470 {
471 ret += '#';
472 ret += mFragment;
473 }
474
475 return ret;
476 }
477
/**
 * Returns the length of a leading "www." or "wwwN." (N a single decimal
 * digit) in hostname, so the caller can skip it.
 *
 * The "www" part is matched case-insensitively.
 *
 * @param hostname host name to inspect; may be empty
 * @return 4 for "www.", 5 for "www" + digit + ".", 0 otherwise
 */
static size_t wwwPrefixOffset(const std::string& hostname)
{
    const std::string::size_type len = hostname.length();

    /* c_str() is NUL terminated, so comparing 3 characters is safe even for
     * shorter host names: the comparison stops at the terminator.
     */
    if (strncasecmp("www", hostname.c_str(), 3) != 0)
    {
        return 0;
    }
    if (len > 3 && hostname[3] == '.')
    {
        return 4;
    }
    /* isdigit() must not be handed a negative char value, hence the cast. */
    if (len > 4 && isdigit(static_cast<unsigned char>(hostname[3])) && hostname[4] == '.')
    {
        return 5;
    }
    return 0;
}
494
495
496
497
canonicalHostname(unsigned int maxDepth) const498 std::string Uri::canonicalHostname(unsigned int maxDepth) const
499 {
500
501 size_t prefixOffset = wwwPrefixOffset(mHostname);
502 size_t suffixOffset = tldOffset(mHostname.c_str());
503 unsigned int depth = 0;
504 string::const_iterator canonicalStart = mHostname.begin() + prefixOffset;
505 string::const_iterator ptr = mHostname.begin();
506 ptr += mHostname.length() - suffixOffset;
507 while (depth < maxDepth && ptr > canonicalStart)
508 {
509 --ptr;
510 if (*ptr == '.') ++depth;
511 }
512 if (*ptr == '.') ++ptr;
513 return string(ptr, mHostname.end());
514 }
515
decode(const std::string & uri)516 std::string Uri::decode(const std::string &uri)
517 {
518 //Note from RFC1630: "Sequences which start with a percent sign
519 //but are not followed by two hexadecimal characters (0-9,A-F) are reserved
520 //for future extension"
521 const unsigned char *ptr = (const unsigned char *)uri.c_str();
522 string ret;
523 ret.reserve(uri.length());
524 for (; *ptr; ++ptr)
525 {
526 if (*ptr == '%')
527 {
528 if (*(ptr + 1))
529 {
530 char a = *(ptr + 1);
531 char b = *(ptr + 2);
532 if (!((a >= 0x30 && a < 0x40) || (a >= 0x41 && a < 0x47))) continue;
533 if (!((b >= 0x30 && b < 0x40) || (b >= 0x41 && b < 0x47))) continue;
534 char buf[3];
535 buf[0] = a;
536 buf[1] = b;
537 buf[2] = 0;
538 ret += (char)strtoul(buf, NULL, 16);
539 ptr += 2;
540 continue;
541 }
542 }
543 ret += *ptr;
544 }
545 return ret;
546 }
547
//This vector is generated by safechars.py. Please do not edit by hand.
//Indexed by byte value: 1 means the byte is copied as is when encoding,
//0 means it is percent-escaped. One row per high nibble.
static const char safe[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
    /* 0x60 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xD0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xE0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xF0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
550
551
encode(const std::string & uri)552 std::string Uri::encode(const std::string &uri)
553 {
554 string ret;
555 const unsigned char *ptr = (const unsigned char *)uri.c_str();
556 ret.reserve(uri.length());
557
558 for (; *ptr ; ++ptr)
559 {
560 if (!safe[*ptr])
561 {
562 char buf[5];
563 memset(buf, 0, 5);
564 snprintf(buf, 5, "%%%X", (*ptr));
565 ret.append(buf);
566 }
567 else
568 {
569 ret += *ptr;
570 }
571 }
572 return ret;
573 }
574
575