1 /********************************************************************************
2 * *
3 * U R L M a n i p u l a t i o n *
4 * *
5 *********************************************************************************
6 * Copyright (C) 2000,2020 by Jeroen van der Zijp. All Rights Reserved. *
7 *********************************************************************************
8 * This library is free software; you can redistribute it and/or modify *
9 * it under the terms of the GNU Lesser General Public License as published by *
10 * the Free Software Foundation; either version 3 of the License, or *
11 * (at your option) any later version. *
12 * *
13 * This library is distributed in the hope that it will be useful, *
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
16 * GNU Lesser General Public License for more details. *
17 * *
18 * You should have received a copy of the GNU Lesser General Public License *
19 * along with this program. If not, see <http://www.gnu.org/licenses/> *
20 ********************************************************************************/
21 #include "xincs.h"
22 #include "fxver.h"
23 #include "fxdefs.h"
24 #include "fxmath.h"
25 #include "fxascii.h"
26 #include "FXArray.h"
27 #include "FXHash.h"
28 #include "FXStream.h"
29 #include "FXString.h"
30 #include "FXPath.h"
31 #include "FXSystem.h"
32 #include "FXURL.h"
33
34
35 /*
36 Notes:
37
38 - Functions contributed by Sean Hubbell and Sander Jansen.
39
  - About drive letters in URLs, Daniel Gehriger has done some
    empirical tests, and determined the following:
42
43 NS = works on Netscape
44 IE = works on IE
45 O = works on Opera
46
47 - file:///C|/TEMP/ NS, IE, O
48 - file:///C:/TEMP/ NS, IE, O
49
50 - file://localhost/C:/TEMP/ NS, IE, O
51 - file://localhost/C|/TEMP/ NS, IE, O
52
53 - file://C:/TEMP/ NS, IE, --
54 - file:///C/TEMP/ --, --, --
55
56 The conclusion seems to be we should probably try to handle all
57 of these possibilities, although keeping the ':' seems favorable.
58
59 - Syntax (as per rfc3986):
60
61 URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
62
63 hier-part = "//" authority path-abempty
64 / path-absolute
65 / path-rootless
66 / path-empty
67
68 URI-reference = URI / relative-ref
69
70 absolute-URI = scheme ":" hier-part [ "?" query ]
71
72 relative-ref = relative-part [ "?" query ] [ "#" fragment ]
73
74 relative-part = "//" authority path-abempty
75 / path-absolute
76 / path-noscheme
77 / path-empty
78
79 scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
80
81 authority = [ userinfo "@" ] host [ ":" port ]
82
83 userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
84
85 host = IP-literal / IPv4address / reg-name
86
87 IP-literal = "[" ( IPv6address / IPvFuture ) "]"
88
89 IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
90
91 IPv6address = 6( h16 ":" ) ls32
92 / "::" 5( h16 ":" ) ls32
93 / [ h16 ] "::" 4( h16 ":" ) ls32
94 / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
95 / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
96 / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
97 / [ *4( h16 ":" ) h16 ] "::" ls32
98 / [ *5( h16 ":" ) h16 ] "::" h16
99 / [ *6( h16 ":" ) h16 ] "::"
100
101 ls32 = ( h16 ":" h16 ) / IPv4address ; least-significant 32 bits of address
102
103 h16 = 1*4HEXDIG ; 16 bits of address represented in hexadecimal
104
105
106 IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
107
108 dec-octet = DIGIT ; 0-9
109 / %x31-39 DIGIT ; 10-99
110 / "1" 2DIGIT ; 100-199
111 / "2" %x30-34 DIGIT ; 200-249
112 / "25" %x30-35 ; 250-255
113
114 reg-name = *( unreserved / pct-encoded / sub-delims )
115
116 port = *DIGIT
117
118 path = path-abempty ; begins with "/" or is empty
119 / path-absolute ; begins with "/" but not "//"
120 / path-noscheme ; begins with a non-colon segment
121 / path-rootless ; begins with a segment
122 / path-empty ; zero characters
123
124 path-abempty = *( "/" segment )
125
126 path-absolute = "/" [ segment-nz *( "/" segment ) ]
127
128 path-noscheme = segment-nz-nc *( "/" segment )
129
130 path-rootless = segment-nz *( "/" segment )
131
132 path-empty = 0<pchar>
133
134 segment = *pchar
135
136 segment-nz = 1*pchar
137
138 segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) ; non-zero-length segment without any colon ":"
139
140 pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
141
142 query = *( pchar / "/" / "?" )
143
144 fragment = *( pchar / "/" / "?" )
145
146 pct-encoded = "%" HEXDIG HEXDIG
147
148 unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
149
150 reserved = gen-delims / sub-delims
151
152 gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
153
154 sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
155 / "*" / "+" / "," / ";" / "="
156
157 - Also, encode all non-ascii bytes from a string.
158 */
159
// Extra characters that fileToURL() asks encode() to %-escape when turning a
// filename into a URL.  encode() itself additionally escapes control
// characters (below 0x20), bytes of 128 and above, and '%' regardless of
// this set.
#define ENCODE_THESE "<>#%{}|^~[]`\"?$&'*,;=" // Encode these for pathnames
161
162 using namespace FX;
163
164 /*******************************************************************************/
165
166 namespace FX {
167
// Character class bits; each entry of the properties table is an OR of these
enum {
  UNRESERVED = 1<<0,    // 0x01
  PERCENT    = 1<<1,    // 0x02
  SUBDELIM   = 1<<2,    // 0x04
  GENDELIM   = 1<<3,    // 0x08
  PATHCHAR   = 1<<4,    // 0x10
  QUERYCHAR  = 1<<5     // 0x20
  };
177
178
// Table of character classes.
// Indexed by byte value (0..255); each entry is a bitwise OR of the character
// class bits UNRESERVED(0x01), PERCENT(0x02), SUBDELIM(0x04), GENDELIM(0x08),
// PATHCHAR(0x10) and QUERYCHAR(0x20).  Each row below covers 16 consecutive
// byte values.  The URL parser tests entries against a mask of these bits to
// decide whether a character may continue the part being scanned.
static const FXuchar properties[256]={
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    // 0x00-0x0F: control characters, no class
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    // 0x10-0x1F: control characters, no class
  0x10,0x04,0x00,0x08,0x04,0x32,0x04,0x04,0x04,0x04,0x04,0x04,0x04,0x01,0x01,0x38,    // 0x20-0x2F: space ! " # $ % & ' ( ) * + , - . /
  0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x38,0x04,0x00,0x04,0x00,0x28,    // 0x30-0x3F: 0-9 : ; < = > ?
  0x38,0x03,0x03,0x03,0x03,0x03,0x03,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,    // 0x40-0x4F: @ A-O (A-F also carry PERCENT)
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x08,0x10,0x08,0x00,0x01,    // 0x50-0x5F: P-Z [ \ ] ^ _
  0x00,0x03,0x03,0x03,0x03,0x03,0x03,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,    // 0x60-0x6F: ` a-o (a-f also carry PERCENT)
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x00,0x00,0x00,0x01,0x00,    // 0x70-0x7F: p-z { | } ~ DEL
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,    // 0x80-0x8F: non-ascii bytes marked UNRESERVED
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,    // 0x90-0x9F
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,    // 0xA0-0xAF
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,    // 0xB0-0xBF
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,    // 0xC0-0xCF
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,    // 0xD0-0xDF
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,    // 0xE0-0xEF
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    // 0xF0-0xFF: 0xF8-0xFF have no class
  };
198
199
200 // URL parts
201 class URL {
202 public:
203 FXint prot[2];
204 FXint user[2];
205 FXint pass[2];
206 FXint host[2];
207 FXint port[2];
208 FXint path[2];
209 FXint quer[2];
210 FXint frag[2];
211 public:
212 URL(const FXString& string);
213 };
214
215
216 // Parse string to url parts
URL(const FXString & string)217 URL::URL(const FXString& string){
218 FXint s=0;
219 FXuchar c;
220
221 prot[0]=prot[1]=0;
222
223 // Parse protocol
224 if(Ascii::isLetter(string[0])){
225 s++;
226
227 // Scan till end of scheme name
228 while(Ascii::isAlphaNumeric(string[s]) || string[s]=='+' || string[s]=='-' || string[s]=='.') s++;
229
230 // Scheme end found
231 if(string[s]==':' && s>1){
232 prot[1]=s++;
233 }
234 else{
235 s=prot[0]; // Reset:- wasn't protocol after all since no ':' found
236 }
237 }
238
239 user[0]=user[1]=s;
240 pass[0]=pass[1]=s;
241 host[0]=host[1]=s;
242 port[0]=port[1]=s;
243
244 // Parse hier part
245 if(string[s]=='/' && string[s+1]=='/'){
246 s+=2;
247
248 // Parse username
249 user[0]=s;
250 while((c=string[s])!='\0' && (properties[c]&(UNRESERVED|SUBDELIM|PERCENT))){
251 s++;
252 }
253
254 // Parse password
255 user[1]=pass[0]=s;
256 if(string[s]==':'){
257 pass[0]=++s;
258 while((c=string[s])!='\0' && (properties[c]&(UNRESERVED|SUBDELIM|PERCENT))){
259 s++;
260 }
261 }
262 pass[1]=s;
263
264 // Check for @ after user:pass
265 if(string[s]=='@'){
266 s++;
267 }
268 else{
269 s=pass[0]=pass[1]=user[1]=user[0]; // Reset:- wasn't user:pass after all since no '@' found
270 }
271
272 // Parse hostname
273 host[0]=s;
274 while((c=string[s])!='\0' && (properties[c]&(UNRESERVED|SUBDELIM|PERCENT))){
275 s++;
276 }
277
278 // Parse port number
279 host[1]=port[0]=s;
280 if(string[s]==':'){
281 port[0]=++s;
282 while(Ascii::isDigit(string[s])) s++;
283 }
284 port[1]=s;
285 }
286
287 // Parse path, allowing for \ path delimiters (legacy urls)
288 path[0]=s;
289 while((c=string[s])!='\0' && (properties[c]&(UNRESERVED|SUBDELIM|PATHCHAR))){
290 s++;
291 }
292
293 // Parse query
294 path[1]=quer[0]=s;
295 if(string[s]=='?'){
296 quer[0]=++s;
297 while((c=string[s])!='\0' && (properties[c]&(UNRESERVED|SUBDELIM|QUERYCHAR))){
298 s++;
299 }
300 }
301
302 // Parse fragment
303 quer[1]=frag[0]=s;
304 if(string[s]=='#'){
305 frag[0]=++s;
306 while((c=string[s])!='\0' && (properties[c]&(UNRESERVED|SUBDELIM|QUERYCHAR))){
307 s++;
308 }
309 }
310 frag[1]=s;
311 }
312
313
314 // Encode control characters and characters from set using %-encoding
encode(const FXString & url,const FXchar * set)315 FXString FXURL::encode(const FXString& url,const FXchar* set){
316 FXString result;
317 if(!url.empty()){
318 FXint p,q,c;
319 for(p=q=0; p<url.length(); ++p){
320 c=(FXuchar)url[p];
321 if(c<0x20 || 128<=c || c=='%' || (set && strchr(set,c))){
322 q+=3;
323 continue;
324 }
325 q++;
326 }
327 result.length(q);
328 for(p=q=0; p<url.length(); ++p){
329 c=(FXuchar)url[p];
330 if(c<0x20 || 128<=c || c=='%' || (set && strchr(set,c))){
331 result[q++]='%';
332 result[q++]=FXString::value2Digit[c>>4];
333 result[q++]=FXString::value2Digit[c&15];
334 continue;
335 }
336 result[q++]=c;
337 }
338 }
339 return result;
340 }
341
342
343 // Decode string containing %-encoded characters
decode(const FXString & url)344 FXString FXURL::decode(const FXString& url){
345 FXString result;
346 if(!url.empty()){
347 FXint p,q,c;
348 for(p=q=0; p<url.length(); ++p){
349 c=(FXuchar)url[p];
350 if(c=='%' && Ascii::isHexDigit(url[p+1]) && Ascii::isHexDigit(url[p+2])){
351 p+=2;
352 }
353 q++;
354 }
355 result.length(q);
356 for(p=q=0; p<url.length(); ++p){
357 c=(FXuchar)url[p];
358 if(c=='%' && Ascii::isHexDigit(url[p+1]) && Ascii::isHexDigit(url[p+2])){
359 c=(Ascii::digitValue(url[p+1])<<4)+Ascii::digitValue(url[p+2]);
360 p+=2;
361 }
362 result[q++]=c;
363 }
364 }
365 return result;
366 }
367
368 /*******************************************************************************/
369
370 // Convert path from using 'sepfm' to use 'septo' path-separators
convertPathSep(const FXString & file,FXchar septo,FXchar sepfm)371 static FXString convertPathSep(const FXString& file,FXchar septo,FXchar sepfm){
372 if(!file.empty()){
373 FXString result(file);
374 FXint p=0;
375 FXint q=0;
376 #if defined(WIN32)
377 if(result[q]==sepfm || result[q]==septo){ // UNC
378 result[p++]=septo; q++;
379 if(result[q]==sepfm || result[q]==septo){
380 result[p++]=septo; q++;
381 while(result[q]==sepfm || result[q]==septo) q++;
382 }
383 }
384 else if(Ascii::isLetter(result[q]) && result[q+1]==':'){ // C:
385 result[p++]=result[q++];
386 result[p++]=':'; q++;
387 if(result[q]==sepfm || result[q]==septo){
388 result[p++]=septo; q++;
389 while(result[q]==sepfm || result[q]==septo) q++;
390 }
391 }
392 while(result[q]){
393 if(result[q]==sepfm || result[q]==septo){ // FIXME don't convert escaped path separators!!
394 result[p++]=septo; q++;
395 while(result[q]==sepfm || result[q]==septo) q++;
396 continue;
397 }
398 result[p++]=result[q++];
399 }
400 return result.trunc(p);
401 #else
402 if(result[q]==sepfm || result[q]==septo){
403 result[p++]=septo; q++;
404 while(result[q]==sepfm || result[q]==septo) q++;
405 }
406 while(result[q]){
407 if(result[q]==sepfm || result[q]==septo){ // FIXME don't convert escaped path separators!!
408 result[p++]=septo; q++;
409 while(result[q]==sepfm || result[q]==septo) q++;
410 continue;
411 }
412 result[p++]=result[q++];
413 }
414 return result.trunc(p);
415 #endif
416 }
417 return FXString::null;
418 }
419
420
421 /*******************************************************************************/
422
423 // Return URL of filename
fileToURL(const FXString & file)424 FXString FXURL::fileToURL(const FXString& file){
425 #ifdef WIN32
426 if(ISPATHSEP(file[0]) && ISPATHSEP(file[1])){
427 return "file:"+encode(convertPathSep(file,'/','\\'),ENCODE_THESE); // file://share/path-with-slashes
428 }
429 if(Ascii::isLetter(file[0]) && file[1]==':'){
430 return "file:///"+encode(convertPathSep(file,'/','\\'),ENCODE_THESE); // file:///c:/path-with-slashes
431 }
432 return "file://"+encode(convertPathSep(file,'/','\\'),ENCODE_THESE); // file://path-with-slashes
433 #else
434 return "file://"+encode(file,ENCODE_THESE); // file://path
435 #endif
436 }
437
438
439 // Return filename from URL, empty if url is not a local file
fileFromURL(const FXString & string)440 FXString FXURL::fileFromURL(const FXString& string){
441 if(comparecase(string,"file:",5)==0){
442 #ifdef WIN32
443 URL url(string);
444 if(url.host[0]<url.host[1]){
445 return "\\\\"+string.mid(url.host[0],url.host[1]-url.host[0])+decode(convertPathSep(string.mid(url.path[0],url.path[1]-url.path[0]),'\\','/'));
446 }
447 return decode(convertPathSep(string.mid(url.path[0],url.path[1]-url.path[0]),'\\','/'));
448 #else
449 URL url(string);
450 return decode(string.mid(url.path[0],url.path[1]-url.path[0]));
451 #endif
452 }
453 return FXString::null;
454 }
455
456 /*******************************************************************************/
457
458 // Make URI list from array of filenames
filesToURIList(const FXString * files)459 FXString FXURL::filesToURIList(const FXString* files){
460 FXString result;
461 if(files){
462 FXint n=0;
463 while(!files[n].empty()){
464 result.append(FXURL::fileToURL(files[n++]));
465 result.append("\r\n");
466 }
467 }
468 return result;
469 }
470
471
472 // Make array of filenames from URI list
filesFromURIList(const FXString & urilist)473 FXString* FXURL::filesFromURIList(const FXString& urilist){
474 FXString* result=NULL;
475 if(!urilist.empty()){
476 FXint beg,end,n=0;
477 result=new FXString [urilist.contains("\r\n")+2];
478 for(beg=n=0; beg<urilist.length(); beg=end+2){
479 if((end=urilist.find("\r\n",beg))<0) end=urilist.length();
480 result[n++]=FXURL::fileFromURL(urilist.mid(beg,end-beg));
481 }
482 }
483 return result;
484 }
485
486 /*******************************************************************************/
487
488 // Parse scheme from url
scheme(const FXString & string)489 FXString FXURL::scheme(const FXString& string){
490 URL url(string);
491 return string.mid(url.prot[0],url.prot[1]-url.prot[0]);
492 }
493
494
495 // Parse username from string containing url
username(const FXString & string)496 FXString FXURL::username(const FXString& string){
497 URL url(string);
498 return string.mid(url.user[0],url.user[1]-url.user[0]);
499 }
500
501
502 // Parse password from string containing url
password(const FXString & string)503 FXString FXURL::password(const FXString& string){
504 URL url(string);
505 return string.mid(url.pass[0],url.pass[1]-url.pass[0]);
506 }
507
508
509 // Parse hostname from string containing url
host(const FXString & string)510 FXString FXURL::host(const FXString& string){
511 URL url(string);
512 return string.mid(url.host[0],url.host[1]-url.host[0]);
513 }
514
515
516 // Parse port number from string containing url
port(const FXString & string,FXint def)517 FXint FXURL::port(const FXString& string,FXint def){
518 FXint result=def;
519 URL url(string);
520 if(url.port[0]<url.port[1]){
521 result=Ascii::digitValue(string[url.port[0]++]);
522 while(url.port[0]<url.port[1]){
523 result=result*10+Ascii::digitValue(string[url.port[0]++]);
524 }
525 }
526 return result;
527 }
528
529
530 // Parse path from string containing url
path(const FXString & string)531 FXString FXURL::path(const FXString& string){
532 URL url(string);
533 return string.mid(url.path[0],url.path[1]-url.path[0]);
534 }
535
536
537 // Parse query from string containing url
query(const FXString & string)538 FXString FXURL::query(const FXString& string){
539 URL url(string);
540 return string.mid(url.quer[0],url.quer[1]-url.quer[0]);
541 }
542
543
544 // Parse fragment from string containing url
fragment(const FXString & string)545 FXString FXURL::fragment(const FXString& string){
546 URL url(string);
547 return string.mid(url.frag[0],url.frag[1]-url.frag[0]);
548 }
549
550 }
551