1 /********************************************************************************
2 *                                                                               *
3 *                       U R L   M a n i p u l a t i o n                         *
4 *                                                                               *
5 *********************************************************************************
6 * Copyright (C) 2000,2020 by Jeroen van der Zijp.   All Rights Reserved.        *
7 *********************************************************************************
8 * This library is free software; you can redistribute it and/or modify          *
9 * it under the terms of the GNU Lesser General Public License as published by   *
10 * the Free Software Foundation; either version 3 of the License, or             *
11 * (at your option) any later version.                                           *
12 *                                                                               *
13 * This library is distributed in the hope that it will be useful,               *
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of                *
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                 *
16 * GNU Lesser General Public License for more details.                           *
17 *                                                                               *
18 * You should have received a copy of the GNU Lesser General Public License      *
19 * along with this program.  If not, see <http://www.gnu.org/licenses/>          *
20 ********************************************************************************/
21 #include "xincs.h"
22 #include "fxver.h"
23 #include "fxdefs.h"
24 #include "fxmath.h"
25 #include "fxascii.h"
26 #include "FXArray.h"
27 #include "FXHash.h"
28 #include "FXStream.h"
29 #include "FXString.h"
30 #include "FXPath.h"
31 #include "FXSystem.h"
32 #include "FXURL.h"
33 
34 
35 /*
36   Notes:
37 
38   - Functions contributed by Sean Hubbell and Sander Jansen.
39 
  - About drive letters in URL's, Daniel Gehriger has done some
    empirical tests, and determined the following:
42 
43      NS = works on Netscape
44      IE = works on IE
45      O  = works on Opera
46 
47      - file:///C|/TEMP/                    NS, IE, O
48      - file:///C:/TEMP/                    NS, IE, O
49 
50      - file://localhost/C:/TEMP/           NS, IE, O
51      - file://localhost/C|/TEMP/           NS, IE, O
52 
53      - file://C:/TEMP/                     NS, IE, --
54      - file:///C/TEMP/                     --, --, --
55 
56     The conclusion seems to be we should probably try to handle all
57     of these possibilities, although keeping the ':' seems favorable.
58 
59   - Syntax (as per rfc3986):
60 
61       URI           =  scheme ":" hier-part [ "?" query ] [ "#" fragment ]
62 
63       hier-part     =  "//" authority path-abempty
64                     /  path-absolute
65                     /  path-rootless
66                     /  path-empty
67 
68       URI-reference =  URI / relative-ref
69 
70       absolute-URI  =  scheme ":" hier-part [ "?" query ]
71 
72       relative-ref  =  relative-part [ "?" query ] [ "#" fragment ]
73 
74       relative-part =  "//" authority path-abempty
75                     /  path-absolute
76                     /  path-noscheme
77                     /  path-empty
78 
79       scheme        =  ALPHA  *( ALPHA / DIGIT / "+" / "-" / "." )
80 
81       authority     =  [ userinfo "@" ] host [ ":" port ]
82 
83       userinfo      =  *( unreserved / pct-encoded / sub-delims / ":" )
84 
85       host          =  IP-literal / IPv4address / reg-name
86 
87       IP-literal    =  "[" ( IPv6address / IPvFuture  ) "]"
88 
89       IPvFuture     =  "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
90 
91       IPv6address   =                             6( h16 ":" ) ls32
92                     /                        "::" 5( h16 ":" ) ls32
93                     /  [               h16 ] "::" 4( h16 ":" ) ls32
94                     /  [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
95                     /  [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
96                     /  [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
97                     /  [ *4( h16 ":" ) h16 ] "::"              ls32
98                     /  [ *5( h16 ":" ) h16 ] "::"              h16
99                     /  [ *6( h16 ":" ) h16 ] "::"
100 
101       ls32          =  ( h16 ":" h16 ) / IPv4address                   ; least-significant 32 bits of address
102 
103       h16           = 1*4HEXDIG                                        ; 16 bits of address represented in hexadecimal
104 
105 
106       IPv4address   =  dec-octet "." dec-octet "." dec-octet "." dec-octet
107 
108       dec-octet     =  DIGIT                                            ; 0-9
109                     /  %x31-39 DIGIT                                    ; 10-99
110                     /  "1" 2DIGIT                                       ; 100-199
111                     /  "2" %x30-34 DIGIT                                ; 200-249
112                     /  "25" %x30-35                                     ; 250-255
113 
114       reg-name      = *( unreserved / pct-encoded / sub-delims )
115 
116       port          =  *DIGIT
117 
118       path          =  path-abempty                                     ; begins with "/" or is empty
119                     /  path-absolute                                    ; begins with "/" but not "//"
120                     /  path-noscheme                                    ; begins with a non-colon segment
121                     /  path-rootless                                    ; begins with a segment
122                     /  path-empty                                       ; zero characters
123 
124       path-abempty  =  *( "/" segment )
125 
126       path-absolute =  "/" [ segment-nz *( "/" segment ) ]
127 
128       path-noscheme =  segment-nz-nc *( "/" segment )
129 
130       path-rootless =  segment-nz *( "/" segment )
131 
132       path-empty    =  0<pchar>
133 
134       segment       =  *pchar
135 
136       segment-nz    =  1*pchar
137 
138       segment-nz-nc =  1*( unreserved / pct-encoded / sub-delims / "@" ) ; non-zero-length segment without any colon ":"
139 
140       pchar         =  unreserved / pct-encoded / sub-delims / ":" / "@"
141 
142       query         =  *( pchar / "/" / "?" )
143 
144       fragment      =  *( pchar / "/" / "?" )
145 
146       pct-encoded   =  "%" HEXDIG HEXDIG
147 
148       unreserved    =  ALPHA / DIGIT / "-" / "." / "_" / "~"
149 
150       reserved      =  gen-delims / sub-delims
151 
152       gen-delims    =  ":" / "/" / "?" / "#" / "[" / "]" / "@"
153 
154       sub-delims    =  "!" / "$" / "&" / "'" / "(" / ")"
155                     /  "*" / "+" / "," / ";" / "="
156 
157   - Also, encode all non-ascii bytes from a string.
158 */
159 
// Extra characters which fileToURL() hands to encode() as the set to be
// %-encoded in pathnames; encode() additionally escapes control characters,
// non-ascii bytes and '%' itself regardless of this set.
#define ENCODE_THESE "<>#%{}|^~[]`\"?$&'*,;="           // Encode these for pathnames
161 
162 using namespace FX;
163 
164 /*******************************************************************************/
165 
166 namespace FX {
167 
// Character class bits; every entry of the properties table below is
// a combination of zero or more of these flags.
enum {
  UNRESERVED = 0x01,    // Letters, digits, '-', '.', '_', '~' and bytes 0x80..0xF7
  PERCENT    = 0x02,    // '%' and the hexadecimal digits that may follow it
  SUBDELIM   = 0x04,    // One of ! $ & ' ( ) * + , ; =
  GENDELIM   = 0x08,    // One of # / : ? @ [ ]
  PATHCHAR   = 0x10,    // Additionally accepted in a path: space % / : @ and backslash
  QUERYCHAR  = 0x20     // Additionally accepted in a query or fragment: % / : ? @
  };
177 
178 
// Table of character classes, indexed by byte value (0..255).
// Each entry is a bitwise-or of the character class flags declared above;
// an entry of 0x00 means the byte belongs to none of the classes, so every
// scan loop in the URL parser stops when it meets such a byte.
static const FXuchar properties[256]={
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  // 0x00-0x0F: control characters
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  // 0x10-0x1F: control characters
  0x10,0x04,0x00,0x08,0x04,0x32,0x04,0x04,0x04,0x04,0x04,0x04,0x04,0x01,0x01,0x38,  // 0x20-0x2F: space and punctuation up to '/'
  0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x38,0x04,0x00,0x04,0x00,0x28,  // 0x30-0x3F: digits, then ':' through '?'
  0x38,0x03,0x03,0x03,0x03,0x03,0x03,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,  // 0x40-0x4F: '@', 'A'..'O' (A-F are hex digits)
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x08,0x10,0x08,0x00,0x01,  // 0x50-0x5F: 'P'..'Z', then '[' through '_'
  0x00,0x03,0x03,0x03,0x03,0x03,0x03,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,  // 0x60-0x6F: '`', 'a'..'o' (a-f are hex digits)
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x00,0x00,0x00,0x01,0x00,  // 0x70-0x7F: 'p'..'z', '{' through '~', DEL
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,  // 0x80-0x8F: non-ascii bytes
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,  // 0x90-0x9F
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,  // 0xA0-0xAF
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,  // 0xB0-0xBF
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,  // 0xC0-0xCF
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,  // 0xD0-0xDF
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,  // 0xE0-0xEF
  0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,  // 0xF0-0xFF: 0xF8 and up are in no class
  };
198 
199 
200 // URL parts
201 class URL {
202 public:
203   FXint prot[2];
204   FXint user[2];
205   FXint pass[2];
206   FXint host[2];
207   FXint port[2];
208   FXint path[2];
209   FXint quer[2];
210   FXint frag[2];
211 public:
212   URL(const FXString& string);
213   };
214 
215 
216 // Parse string to url parts
URL(const FXString & string)217 URL::URL(const FXString& string){
218   FXint s=0;
219   FXuchar c;
220 
221   prot[0]=prot[1]=0;
222 
223   // Parse protocol
224   if(Ascii::isLetter(string[0])){
225     s++;
226 
227     // Scan till end of scheme name
228     while(Ascii::isAlphaNumeric(string[s]) || string[s]=='+' || string[s]=='-' || string[s]=='.') s++;
229 
230     // Scheme end found
231     if(string[s]==':' && s>1){
232       prot[1]=s++;
233       }
234     else{
235       s=prot[0];                                // Reset:- wasn't protocol after all since no ':' found
236       }
237     }
238 
239   user[0]=user[1]=s;
240   pass[0]=pass[1]=s;
241   host[0]=host[1]=s;
242   port[0]=port[1]=s;
243 
244   // Parse hier part
245   if(string[s]=='/' && string[s+1]=='/'){
246     s+=2;
247 
248     // Parse username
249     user[0]=s;
250     while((c=string[s])!='\0' && (properties[c]&(UNRESERVED|SUBDELIM|PERCENT))){
251       s++;
252       }
253 
254     // Parse password
255     user[1]=pass[0]=s;
256     if(string[s]==':'){
257       pass[0]=++s;
258       while((c=string[s])!='\0' && (properties[c]&(UNRESERVED|SUBDELIM|PERCENT))){
259         s++;
260         }
261       }
262     pass[1]=s;
263 
264     // Check for @ after user:pass
265     if(string[s]=='@'){
266       s++;
267       }
268     else{
269       s=pass[0]=pass[1]=user[1]=user[0];        // Reset:- wasn't user:pass after all since no '@' found
270       }
271 
272     // Parse hostname
273     host[0]=s;
274     while((c=string[s])!='\0' && (properties[c]&(UNRESERVED|SUBDELIM|PERCENT))){
275       s++;
276       }
277 
278     // Parse port number
279     host[1]=port[0]=s;
280     if(string[s]==':'){
281       port[0]=++s;
282       while(Ascii::isDigit(string[s])) s++;
283       }
284     port[1]=s;
285     }
286 
287   // Parse path, allowing for \ path delimiters (legacy urls)
288   path[0]=s;
289   while((c=string[s])!='\0' && (properties[c]&(UNRESERVED|SUBDELIM|PATHCHAR))){
290     s++;
291     }
292 
293   // Parse query
294   path[1]=quer[0]=s;
295   if(string[s]=='?'){
296     quer[0]=++s;
297     while((c=string[s])!='\0' && (properties[c]&(UNRESERVED|SUBDELIM|QUERYCHAR))){
298       s++;
299       }
300     }
301 
302   // Parse fragment
303   quer[1]=frag[0]=s;
304   if(string[s]=='#'){
305     frag[0]=++s;
306     while((c=string[s])!='\0' && (properties[c]&(UNRESERVED|SUBDELIM|QUERYCHAR))){
307       s++;
308       }
309     }
310   frag[1]=s;
311   }
312 
313 
314 // Encode control characters and characters from set using %-encoding
encode(const FXString & url,const FXchar * set)315 FXString FXURL::encode(const FXString& url,const FXchar* set){
316   FXString result;
317   if(!url.empty()){
318     FXint p,q,c;
319     for(p=q=0; p<url.length(); ++p){
320       c=(FXuchar)url[p];
321       if(c<0x20 || 128<=c || c=='%' || (set && strchr(set,c))){
322         q+=3;
323         continue;
324         }
325       q++;
326       }
327     result.length(q);
328     for(p=q=0; p<url.length(); ++p){
329       c=(FXuchar)url[p];
330       if(c<0x20 || 128<=c || c=='%' || (set && strchr(set,c))){
331         result[q++]='%';
332         result[q++]=FXString::value2Digit[c>>4];
333         result[q++]=FXString::value2Digit[c&15];
334         continue;
335         }
336       result[q++]=c;
337       }
338     }
339   return result;
340   }
341 
342 
343 // Decode string containing %-encoded characters
decode(const FXString & url)344 FXString FXURL::decode(const FXString& url){
345   FXString result;
346   if(!url.empty()){
347     FXint p,q,c;
348     for(p=q=0; p<url.length(); ++p){
349       c=(FXuchar)url[p];
350       if(c=='%' && Ascii::isHexDigit(url[p+1]) && Ascii::isHexDigit(url[p+2])){
351         p+=2;
352         }
353       q++;
354       }
355     result.length(q);
356     for(p=q=0; p<url.length(); ++p){
357       c=(FXuchar)url[p];
358       if(c=='%' && Ascii::isHexDigit(url[p+1]) && Ascii::isHexDigit(url[p+2])){
359         c=(Ascii::digitValue(url[p+1])<<4)+Ascii::digitValue(url[p+2]);
360         p+=2;
361         }
362       result[q++]=c;
363       }
364     }
365   return result;
366   }
367 
368 /*******************************************************************************/
369 
370 // Convert path from using 'sepfm' to use 'septo' path-separators
convertPathSep(const FXString & file,FXchar septo,FXchar sepfm)371 static FXString convertPathSep(const FXString& file,FXchar septo,FXchar sepfm){
372   if(!file.empty()){
373     FXString result(file);
374     FXint p=0;
375     FXint q=0;
376 #if defined(WIN32)
377     if(result[q]==sepfm || result[q]==septo){                   // UNC
378       result[p++]=septo; q++;
379       if(result[q]==sepfm || result[q]==septo){
380         result[p++]=septo; q++;
381         while(result[q]==sepfm || result[q]==septo) q++;
382         }
383       }
384     else if(Ascii::isLetter(result[q]) && result[q+1]==':'){    // C:
385       result[p++]=result[q++];
386       result[p++]=':'; q++;
387       if(result[q]==sepfm || result[q]==septo){
388         result[p++]=septo; q++;
389         while(result[q]==sepfm || result[q]==septo) q++;
390         }
391       }
392     while(result[q]){
393       if(result[q]==sepfm || result[q]==septo){                 // FIXME don't convert escaped path separators!!
394         result[p++]=septo; q++;
395         while(result[q]==sepfm || result[q]==septo) q++;
396         continue;
397         }
398       result[p++]=result[q++];
399       }
400     return result.trunc(p);
401 #else
402     if(result[q]==sepfm || result[q]==septo){
403       result[p++]=septo; q++;
404       while(result[q]==sepfm || result[q]==septo) q++;
405       }
406     while(result[q]){
407       if(result[q]==sepfm || result[q]==septo){                 // FIXME don't convert escaped path separators!!
408         result[p++]=septo; q++;
409         while(result[q]==sepfm || result[q]==septo) q++;
410         continue;
411         }
412       result[p++]=result[q++];
413       }
414     return result.trunc(p);
415 #endif
416     }
417   return FXString::null;
418   }
419 
420 
421 /*******************************************************************************/
422 
423 // Return URL of filename
fileToURL(const FXString & file)424 FXString FXURL::fileToURL(const FXString& file){
425 #ifdef WIN32
426   if(ISPATHSEP(file[0]) && ISPATHSEP(file[1])){
427     return "file:"+encode(convertPathSep(file,'/','\\'),ENCODE_THESE);         // file://share/path-with-slashes
428     }
429   if(Ascii::isLetter(file[0]) && file[1]==':'){
430     return "file:///"+encode(convertPathSep(file,'/','\\'),ENCODE_THESE);      // file:///c:/path-with-slashes
431     }
432   return "file://"+encode(convertPathSep(file,'/','\\'),ENCODE_THESE);         // file://path-with-slashes
433 #else
434   return "file://"+encode(file,ENCODE_THESE);                                   // file://path
435 #endif
436   }
437 
438 
439 // Return filename from URL, empty if url is not a local file
fileFromURL(const FXString & string)440 FXString FXURL::fileFromURL(const FXString& string){
441   if(comparecase(string,"file:",5)==0){
442 #ifdef WIN32
443     URL url(string);
444     if(url.host[0]<url.host[1]){
445       return "\\\\"+string.mid(url.host[0],url.host[1]-url.host[0])+decode(convertPathSep(string.mid(url.path[0],url.path[1]-url.path[0]),'\\','/'));
446       }
447     return decode(convertPathSep(string.mid(url.path[0],url.path[1]-url.path[0]),'\\','/'));
448 #else
449     URL url(string);
450     return decode(string.mid(url.path[0],url.path[1]-url.path[0]));
451 #endif
452     }
453   return FXString::null;
454   }
455 
456 /*******************************************************************************/
457 
458 // Make URI list from array of filenames
filesToURIList(const FXString * files)459 FXString FXURL::filesToURIList(const FXString* files){
460   FXString result;
461   if(files){
462     FXint n=0;
463     while(!files[n].empty()){
464       result.append(FXURL::fileToURL(files[n++]));
465       result.append("\r\n");
466       }
467     }
468   return result;
469   }
470 
471 
472 // Make array of filenames from URI list
filesFromURIList(const FXString & urilist)473 FXString* FXURL::filesFromURIList(const FXString& urilist){
474   FXString* result=NULL;
475   if(!urilist.empty()){
476     FXint beg,end,n=0;
477     result=new FXString [urilist.contains("\r\n")+2];
478     for(beg=n=0; beg<urilist.length(); beg=end+2){
479       if((end=urilist.find("\r\n",beg))<0) end=urilist.length();
480       result[n++]=FXURL::fileFromURL(urilist.mid(beg,end-beg));
481       }
482     }
483   return result;
484   }
485 
486 /*******************************************************************************/
487 
488 // Parse scheme from url
scheme(const FXString & string)489 FXString FXURL::scheme(const FXString& string){
490   URL url(string);
491   return string.mid(url.prot[0],url.prot[1]-url.prot[0]);
492   }
493 
494 
495 // Parse username from string containing url
username(const FXString & string)496 FXString FXURL::username(const FXString& string){
497   URL url(string);
498   return string.mid(url.user[0],url.user[1]-url.user[0]);
499   }
500 
501 
502 // Parse password from string containing url
password(const FXString & string)503 FXString FXURL::password(const FXString& string){
504   URL url(string);
505   return string.mid(url.pass[0],url.pass[1]-url.pass[0]);
506   }
507 
508 
509 // Parse hostname from string containing url
host(const FXString & string)510 FXString FXURL::host(const FXString& string){
511   URL url(string);
512   return string.mid(url.host[0],url.host[1]-url.host[0]);
513   }
514 
515 
516 // Parse port number from string containing url
port(const FXString & string,FXint def)517 FXint FXURL::port(const FXString& string,FXint def){
518   FXint result=def;
519   URL url(string);
520   if(url.port[0]<url.port[1]){
521     result=Ascii::digitValue(string[url.port[0]++]);
522     while(url.port[0]<url.port[1]){
523       result=result*10+Ascii::digitValue(string[url.port[0]++]);
524       }
525     }
526   return result;
527   }
528 
529 
530 // Parse path from string containing url
path(const FXString & string)531 FXString FXURL::path(const FXString& string){
532   URL url(string);
533   return string.mid(url.path[0],url.path[1]-url.path[0]);
534   }
535 
536 
537 // Parse query from string containing url
query(const FXString & string)538 FXString FXURL::query(const FXString& string){
539   URL url(string);
540   return string.mid(url.quer[0],url.quer[1]-url.quer[0]);
541   }
542 
543 
544 // Parse fragment from string containing url
fragment(const FXString & string)545 FXString FXURL::fragment(const FXString& string){
546   URL url(string);
547   return string.mid(url.frag[0],url.frag[1]-url.frag[0]);
548   }
549 
550 }
551