1 /*
2     GSK - a library to write servers
3     Copyright (C) 1999-2000 Dave Benson
4 
5     This library is free software; you can redistribute it and/or
6     modify it under the terms of the GNU Lesser General Public
7     License as published by the Free Software Foundation; either
8     version 2 of the License, or (at your option) any later version.
9 
10     This library is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13     Lesser General Public License for more details.
14 
15     You should have received a copy of the GNU Lesser General Public
16     License along with this library; if not, write to the Free Software
17     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
18 
19     Contact:
20         daveb@ffem.org <Dave Benson>
21 */
22 
23 
24 #include <ctype.h>
25 #include <string.h>
26 #include <stdlib.h>
27 #include "gskurl.h"
28 #include "../gskmacros.h"
29 
30 static GObjectClass *parent_class = NULL;
31 
/* True if `c' may appear in a URL scheme after its first letter
 * (RFC 2396, Section 3.1: letters, digits, '+', '-' and '.').
 *
 * The argument is converted to unsigned char before being handed to
 * isalnum(): passing a negative plain-char value to a <ctype.h>
 * function is undefined behavior.  Note that `c' is evaluated more
 * than once, so it must not have side effects.
 */
#define IS_SCHEME_CHAR(c)			\
	(isalnum((unsigned char) (c)) || (c) == '+' || (c) == '-' || (c) == '.')
34 
/* How the scheme-specific part of a URL (the text after "scheme:")
 * is to be parsed.  gsk_url_new_from_scheme_specific() picks one of
 * these from the scheme and the number of leading slashes.
 */
typedef enum
{
  GSK_URL_INTERPRETATION_RELATIVE, /* relative url */
  GSK_URL_INTERPRETATION_ABSOLUTE, /* same host, absolute url */
  GSK_URL_INTERPRETATION_REMOTE,   /* url on remote host */
  GSK_URL_INTERPRETATION_UNKNOWN   /* could not be classified; parsing fails */
} GskUrlInterpretation;
42 
43 static const char *
gsk_url_scheme_name(GskUrlScheme scheme)44 gsk_url_scheme_name (GskUrlScheme scheme)
45 {
46   switch (scheme)
47     {
48       case GSK_URL_SCHEME_FILE: return "file";
49       case GSK_URL_SCHEME_HTTP: return "http";
50       case GSK_URL_SCHEME_HTTPS: return "https";
51       case GSK_URL_SCHEME_FTP: return "ftp";
52       case GSK_URL_SCHEME_OTHER: return "?other?";
53       default: return NULL;
54     }
55 }
56 
57 /* general sanity check */
58 gboolean
gsk_url_is_valid_hostname(const char * hostname,char * bad_char_out)59 gsk_url_is_valid_hostname (const char *hostname, char *bad_char_out)
60 {
61   while (*hostname)
62     {
63       if (!isalnum (*hostname)
64        && *hostname != '-'
65        && *hostname != '-'
66        && *hostname != '.')
67         {
68           *bad_char_out = *hostname;
69           return FALSE;
70         }
71       hostname++;
72     }
73   return TRUE;
74 }
75 
76 gboolean
gsk_url_is_valid_generic_component(const char * str,char * bad_char_out)77 gsk_url_is_valid_generic_component (const char *str, char *bad_char_out)
78 {
79   while (33 <= *str && *str <= 126)
80     str++;
81   if (*str == 0)
82     return TRUE;
83   *bad_char_out = *str;
84   return FALSE;
85 }
86 
87 static inline gboolean
url_check_is_valid(GskUrl * url,GError ** error)88 url_check_is_valid (GskUrl *url, GError **error)
89 {
90   char bad_char;
91   if (url->host && !gsk_url_is_valid_hostname (url->host, &bad_char))
92     {
93       g_set_error (error, GSK_G_ERROR_DOMAIN, GSK_ERROR_INVALID_ARGUMENT,
94 	           "URL %s constructed uses unallowed character '%c' (0x%02x)",
95                    "host", bad_char, bad_char);
96       return FALSE;
97     }
98   if (url->path && !gsk_url_is_valid_path (url->path, &bad_char))
99     {
100       g_set_error (error, GSK_G_ERROR_DOMAIN, GSK_ERROR_INVALID_ARGUMENT,
101 	           "URL %s constructed uses unallowed character '%c' (0x%02x)",
102                    "path", bad_char, bad_char);
103       return FALSE;
104     }
105   if (url->query && !gsk_url_is_valid_query (url->query, &bad_char))
106     {
107       g_set_error (error, GSK_G_ERROR_DOMAIN, GSK_ERROR_INVALID_ARGUMENT,
108 	           "URL %s constructed uses unallowed character '%c' (0x%02x)",
109                    "query", bad_char, bad_char);
110       return FALSE;
111     }
112   if (url->fragment && !gsk_url_is_valid_fragment (url->fragment, &bad_char))
113     {
114       g_set_error (error, GSK_G_ERROR_DOMAIN, GSK_ERROR_INVALID_ARGUMENT,
115 	           "URL %s constructed uses unallowed character '%c' (0x%02x)",
116                    "fragment", bad_char, bad_char);
117       return FALSE;
118     }
119   return TRUE;
120 }
121 
122 static GskUrl *
gsk_url_new_from_scheme_specific(GskUrlScheme scheme,const char * spec,GError ** error)123 gsk_url_new_from_scheme_specific  (GskUrlScheme       scheme,
124 				   const char        *spec,
125 				   GError           **error)
126 {
127   int num_slashes;
128   const char *start = spec;
129   GskUrlInterpretation interpretation = GSK_URL_INTERPRETATION_UNKNOWN;
130   GskUrl *url;
131 
132   char *host, *user_name, *password, *path, *query, *fragment;
133   int port;
134 
135   num_slashes = 0;
136   while (*spec == '/')
137     {
138       num_slashes++;
139       spec++;
140     }
141   if (scheme == GSK_URL_SCHEME_FILE)
142     interpretation = GSK_URL_INTERPRETATION_ABSOLUTE;
143   else
144     switch (num_slashes)
145       {
146 	case 0:
147 	  interpretation = GSK_URL_INTERPRETATION_RELATIVE;
148 	  break;
149 	case 1:
150 	  interpretation = GSK_URL_INTERPRETATION_ABSOLUTE;
151 	  break;
152 	case 2:
153 	  /* ``schemes including a top hierarchical element for a naming
154 	   *   authority'' (Section 3.2)
155 	   */
156 	  interpretation = GSK_URL_INTERPRETATION_REMOTE;
157 	  break;
158 	case 3:
159 	  /* File urls (well those are now handled above so this
160 	   * is pretty dubious)
161 	   */
162 	  interpretation = GSK_URL_INTERPRETATION_ABSOLUTE;
163 	  break;
164 	default:
165 	  /* syntax error? */
166 	  break;
167       }
168 
169 
170   host = NULL;
171   port = 0;
172   user_name = NULL;
173   path = NULL;
174   query = NULL;
175   fragment = NULL;
176   password = NULL;
177 
178   switch (interpretation)
179     {
180       case GSK_URL_INTERPRETATION_REMOTE:
181 	/* rfc 2396, section 3.2.2. */
182 	{
183 	  const char *end_hostport;
184 	  const char *host_start;
185 	  const char *host_end;
186 	  const char *at_sign;
187 	  const char *colon;
188 	  /* basically the syntax is:
189            *    USER@HOST:PORT/
190            *        ^    |    ^
191            *     at_sign ^  end_hostport
192            *            colon
193            */
194 	  end_hostport = strchr (spec, '/');
195 	  if (end_hostport == NULL)
196 #if 1
197             end_hostport = strchr (spec, 0);
198 #else           /* too strict for casual use ;) */
199 	    {
200 	      /* TODO: it's kinda hard to pinpoint where this
201 		 is specified.  See Section 3 in RFC 2396. */
202 	      g_set_error (error, GSK_G_ERROR_DOMAIN,
203 			   GSK_ERROR_INVALID_ARGUMENT,
204 			   _("missing / after host in URL"));
205 	      return NULL;
206 	    }
207 #endif
208 	  at_sign = memchr (spec, '@', end_hostport - spec);
209 	  host_start = at_sign != NULL ? (at_sign + 1) : spec;
210 	  colon = memchr (host_start, ':', end_hostport - host_start);
211 	  if (at_sign != NULL)
212 	    {
213               const char *password_sep = memchr (spec, ':', at_sign - spec);
214               if (password_sep)
215                 {
216                   user_name = g_strndup (spec, password_sep - spec);
217                   password = g_strndup (password_sep + 1,
218                                         at_sign - (password_sep + 1));
219                 }
220               else
221                 {
222                   user_name = g_strndup (spec, at_sign - spec);
223                 }
224 	      /* XXX: should validate username against
225 	       *         GSK_URL_USERNAME_CHARSET
226 	       */
227 	    }
228 	  host_end = colon != NULL ? colon : end_hostport;
229 	  host = g_strndup (host_start, host_end - host_start);
230 
231 	  if (colon != NULL)
232 	    port = atoi (colon + 1);
233 
234 	  spec = end_hostport;
235           if (*spec == 0)
236             {
237               GskUrl *url;
238               url = gsk_url_new_from_parts (scheme, host, port,
239                                             NULL, NULL, "/", NULL, NULL);
240               g_free (host);
241               return url;
242             }
243 	}
244 
245 	/* fall through to parse the host-specific part of the url */
246       case GSK_URL_INTERPRETATION_RELATIVE:
247       case GSK_URL_INTERPRETATION_ABSOLUTE:
248         {
249 	  const char *query_start;
250 	  const char *frag_start;
251 	  if (num_slashes > 0
252            && interpretation == GSK_URL_INTERPRETATION_ABSOLUTE)
253 	    spec--;
254 	  query_start = strchr (spec, '?');
255 	  frag_start = strchr (query_start != NULL ? query_start : spec, '#');
256 	  if (query_start != NULL)
257 	    path = g_strndup (spec, query_start - spec);
258 	  else if (frag_start != NULL)
259 	    path = g_strndup (spec, frag_start - spec);
260 	  else
261 	    path = g_strdup (spec);
262 	  if (query_start != NULL)
263 	    {
264 	      if (frag_start != NULL)
265 		query = g_strndup ((query_start+1), frag_start - (query_start+1));
266 	      else
267 		query = g_strdup (query_start + 1);
268 	    }
269 	  if (frag_start != NULL)
270 	    fragment = g_strdup (frag_start + 1);
271 	  break;
272 	}
273       case GSK_URL_INTERPRETATION_UNKNOWN:
274         {
275 	  g_set_error (error, GSK_G_ERROR_DOMAIN, GSK_ERROR_BAD_FORMAT,
276 		       _("cannot guess how to interpret %s:%s"),
277 	  	       gsk_url_scheme_name (scheme), start);
278 	  goto error;
279 	}
280     }
281 
282   if (interpretation == GSK_URL_INTERPRETATION_REMOTE
283   && (host == NULL || host[0] == '\0' || !isalnum (host[0])))
284     {
285       g_set_error (error, GSK_G_ERROR_DOMAIN, GSK_ERROR_BAD_FORMAT,
286 		   _("malformed host: should begin with a letter or number (%s)"),
287 		   host);
288       goto error;
289     }
290 
291 
292 
293   url = g_object_new (GSK_TYPE_URL, NULL);
294   url->scheme = scheme;
295   if (scheme == GSK_URL_SCHEME_OTHER)
296     url->scheme_name = NULL;
297   else
298     url->scheme_name = (char *) gsk_url_scheme_name (scheme);
299   url->host = host;
300   url->user_name = user_name;
301   url->password = password;
302   url->query = query;
303   url->fragment = fragment;
304   url->port = port;
305   url->path = path;
306 
307   if (!url_check_is_valid (url, error))
308     {
309       g_object_unref (url);
310       return NULL;
311     }
312   return url;
313 
314 error:
315   g_free (host);
316   g_free (user_name);
317   g_free (password);
318   g_free (query);
319   g_free (fragment);
320   g_free (path);
321   return NULL;
322 }
323 
324 /**
325  * gsk_url_get_relative_path:
326  * @url: the URL to get the host-relative path from.
327  *
328  * Obtain the path portion of a URL without
329  * the initial slash (/) character.
330  *
331  * The query component and fragment are also returned.
332  *
333  * returns: the URL as a string.  This must be freed by the caller.
334  */
335 char *
gsk_url_get_relative_path(GskUrl * url)336 gsk_url_get_relative_path (GskUrl *url)
337 {
338   GString *string = g_string_new ("");
339   g_string_append (string, url->path);
340   if (url->query != NULL)
341     {
342       g_string_append_c (string, '?');
343       g_string_append (string, url->query);
344     }
345   if (url->fragment != NULL)
346     {
347       g_string_append_c (string, '#');
348       g_string_append (string, url->fragment);
349     }
350   return g_string_free (string, FALSE);
351 }
352 
353 /**
354  * gsk_url_new_from_parts:
355  * @scheme: the type of URL being created.
356  * @host: the name (or numeric address as ASCII digits and dots) of the host.
357  * This is called the Authority by RFC 2396, Section 3.2.
358  * @port: the port number to use for the service, or 0 to use the default port
359  * for this type of URL scheme.  For FTP, this is the control port and the data port
360  * will default to the next integer.
361  * @user_name: optional username identifier from the client.
362  * @password: optional password to authenticate.
363  * @path: the host-relative path for the URL
364  * @query: optional query string for URL.
365  * @fragment: optional information about a sublocation in the resource.
366  *
367  * Allocate a new URL from a bunch of pieces.
368  *
369  * returns: a reference to a new URL object.
370  */
371 GskUrl *
gsk_url_new_from_parts(GskUrlScheme scheme,const char * host,int port,const char * user_name,const char * password,const char * path,const char * query,const char * fragment)372 gsk_url_new_from_parts      (GskUrlScheme     scheme,
373 			     const char      *host,
374 			     int              port,
375 			     const char      *user_name,
376 			     const char      *password,
377 			     const char      *path,
378 			     const char      *query,
379 			     const char      *fragment)
380 {
381   GskUrl *url = g_object_new (GSK_TYPE_URL, NULL);
382   url->scheme = scheme;
383   url->scheme_name = (char *) gsk_url_scheme_name (scheme);
384   url->host = g_strdup (host);
385   url->port = port;
386   url->user_name = g_strdup (user_name);
387   url->password = g_strdup (password);
388   url->path = g_strdup (path);
389   url->query = g_strdup (query);
390   url->fragment = g_strdup (fragment);
391   return url;
392 }
393 
394 static void
gsk_url_finalize(GObject * object)395 gsk_url_finalize(GObject *object)
396 {
397   GskUrl *url = GSK_URL (object);
398   if (url->scheme == GSK_URL_SCHEME_OTHER)
399     g_free (url->scheme_name);
400   g_free (url->host);
401   g_free (url->user_name);
402   g_free (url->path);
403   g_free (url->query);
404   g_free (url->fragment);
405   (*parent_class->finalize) (object);
406 }
407 
/* One row of the scheme-name lookup table used by
 * lookup_scheme_from_name().
 *
 * `name' must stay the first member: pstrcmp() compares entries by
 * reading a `char *' from the start of each one.
 */
typedef struct _UrlSchemeTableEntry UrlSchemeTableEntry;
struct _UrlSchemeTableEntry
{
  char        *name;	/* lower-case scheme name */
  GskUrlScheme scheme;	/* the scheme that name denotes */
};
414 
/* Advance *ptr past a URL scheme name, if one starts there.
 *
 * A scheme is a letter followed by any number of scheme characters
 * (RFC 2396, Section 3.1).  If *ptr does not start with a letter it
 * is left unchanged.  The ':' that normally follows a scheme is not
 * consumed.
 */
static void
skip_scheme (const char **ptr)
{
  /* <ctype.h> functions need an unsigned char value; a negative plain
   * char would be undefined behavior. */
  if (!isalpha ((unsigned char) **ptr))
    return;
  (*ptr)++;
  while (**ptr && (IS_SCHEME_CHAR ((unsigned char) **ptr)))
    (*ptr)++;
}
426 
/* Comparison callback in the shape bsearch()/qsort() expect: each
 * argument points at a `char *', and the two strings pointed to are
 * ordered with strcmp().
 */
static int pstrcmp (const void *a, const void *b)
{
  const char *lhs = *(const char *const *) a;
  const char *rhs = *(const char *const *) b;

  return strcmp (lhs, rhs);
}
431 
lookup_scheme_from_name(const char * scheme_start,const char * scheme_end,GskUrlScheme * scheme_out)432 static gboolean lookup_scheme_from_name (const char     *scheme_start,
433                                          const char     *scheme_end,
434 					 GskUrlScheme   *scheme_out)
435 {
436   static UrlSchemeTableEntry table[] = {
437     /* MUST BE SORTED */
438     { "file", GSK_URL_SCHEME_FILE },
439     { "ftp", GSK_URL_SCHEME_FTP },
440     { "http", GSK_URL_SCHEME_HTTP },
441     { "https", GSK_URL_SCHEME_HTTPS },
442   };
443   int i;
444   UrlSchemeTableEntry tmp;
445   UrlSchemeTableEntry *entry;
446   #define NUM_SCHEMES 		G_N_ELEMENTS (table)
447   tmp.name = alloca (scheme_end - scheme_start + 1);
448   for (i = 0; i < scheme_end - scheme_start; i++)
449     tmp.name[i] = tolower (scheme_start[i]);
450   tmp.name[i] = '\0';
451   entry = bsearch (&tmp, table, G_N_ELEMENTS (table),
452 		   sizeof (UrlSchemeTableEntry), pstrcmp);
453   if (entry == NULL)
454     return FALSE;
455   *scheme_out = entry->scheme;
456   return TRUE;
457 }
458 
459 /**
460  * gsk_url_new:
461  * @spec: standard string representation of the URL.
462  * @error: place to store a #GError if an error occurs.
463  *
464  * Parse a URL object from a string.
465  *
466  * returns: a reference to a new URL object, or NULL if an error occurred.
467  */
gsk_url_new(const char * spec,GError ** error)468 GskUrl       *gsk_url_new           (const char      *spec,
469 				     GError         **error)
470 {
471   const char *scheme_start;
472   const char *scheme_end;
473   GskUrlScheme scheme;
474 
475   scheme_start = spec;
476   skip_scheme (&spec);
477   scheme_end = spec;
478 
479   if (*spec != ':')
480     {
481       /* Url scheme did not end in ':' */
482       g_set_error (error, GSK_G_ERROR_DOMAIN, GSK_ERROR_BAD_FORMAT,
483                    "URL did not begin scheme:");
484       return NULL;
485     }
486   scheme_end = spec;
487   /* skip the colon */
488   spec++;
489   if (!lookup_scheme_from_name (scheme_start, scheme_end, &scheme))
490     {
491       g_set_error (error, GSK_G_ERROR_DOMAIN, GSK_ERROR_BAD_FORMAT,
492                    "URL did not begin with known scheme");
493       return NULL;
494     }
495 
496   return gsk_url_new_from_scheme_specific (scheme, spec, error);
497 }
498 
499 /**
500  * gsk_url_new_in_context:
501  * @spec: rough URL specification.  This may be a complete URL,
502  * or it may have an implied scheme.
503  * @context: default scheme for URL's in your context.
504  * @error: place to store a #GError if something goes wrong.
505  *
506  * For places where you expect a certain type of URL,
507  * soemtimes people get lazy and drop the scheme.
508  * We support this here, by allowing a "backup scheme"
509  * to be specified.
510  *
511  * To be fully paranoid in such a situation, you may wish to
512  * if there appears to be a scheme, use gsk_url_new();
513  * otherwise call gsk_url_new_from_scheme_specific() directly.
514  * Alternately, it may be easier just to call
515  * gsk_url_new_in_context() directly all the time.
516  *
517  * See also gsk_url_new_relative().
518  *
519  * returns: a newly allocated URL object.
520  */
gsk_url_new_in_context(const char * spec,GskUrlScheme context,GError ** error)521 GskUrl       *gsk_url_new_in_context(const char      *spec,
522                                      GskUrlScheme     context,
523 				     GError         **error)
524 {
525   const char *scheme_start;
526   const char *scheme_end;
527   GskUrlScheme scheme;
528   scheme_start = spec;
529   skip_scheme (&spec);
530   scheme_end = spec;
531   if (scheme_start == scheme_end)
532     scheme = context;
533   else
534     {
535       if (!lookup_scheme_from_name (scheme_start, scheme_end, &scheme))
536 	{
537 	  g_set_error (error, GSK_G_ERROR_DOMAIN, GSK_ERROR_INVALID_ARGUMENT,
538 		       _("unknown url scheme (start of '%s')"), scheme_start);
539 	  return NULL;
540 	}
541       /* skip the colon */
542       spec++;
543     }
544 
545   return gsk_url_new_from_scheme_specific (scheme, spec, error);
546 }
547 
548 /**
549  * gsk_url_new_relative:
550  * @base_url: context of the @spec found.  This tells where @spec
551  * may be relative to.
552  * @location: the possibly relative spec.
553  * @error: place to store a #GError if something goes wrong.
554  *
555  * Allocate a new URL, which will be taken
556  * to be relative to @base_url if the @location
557  * is not obviously an absolute URL.
558  *
559  * Note that there is some ambiguity in how relative urls are
560  * interpreted.  Note especially that
561  *    /foo + /bar = /bar.
562  *    /foo +  bar = /bar.
563  *    /foo/ + bar = /foo/bar.
564  * That is, a symbol with a trailing slash is a directory,
565  * otherwise the last piece of the url is assumed to be a file.
566  *
567  * returns: a newly allocated URL object.
568  */
569 GskUrl *
gsk_url_new_relative(GskUrl * base_url,const char * location,GError ** error)570 gsk_url_new_relative  (GskUrl     *base_url,
571 		       const char *location,
572 		       GError    **error)
573 {
574   /* XXX: what is the right way to determine if a string is
575    * 	    a absolute v. relative url???
576    * XXX: definitely NOT this, which doesn't have
577    *      http:foo/bar.html
578    */
579 /* TODO: See RFC 2396 section 5, "Relative URI References"? */
580 
581   /* if we have a ':' before a '/' character,
582      then assume a full url. */
583   const char *tmp;
584 
585   GSK_SKIP_WHITESPACE (location);
586 
587   if (*location == 0)
588     {
589       g_set_error (error, GSK_G_ERROR_DOMAIN, GSK_ERROR_INVALID_ARGUMENT,
590                    "gsk_url_new_relative: location was empty");
591       return NULL;
592     }
593 
594   tmp = location;
595   while (*tmp && *tmp != '/' && *tmp != ':')
596     tmp++;
597   if (*tmp == ':')
598     {
599       /* absolute redirect */
600       return gsk_url_new (location, error);
601     }
602   else
603     {
604       const char *query_start = strchr (location, '?');
605       const char *frag_start = strchr (query_start ? query_start : location, '#');
606       const char *location_end = query_start ? query_start
607                                 : frag_start ? frag_start
608                                 : strchr (location, 0);
609       char *query = NULL;
610       char *fragment = NULL;
611       char *path;
612       guint path_len;
613       char bad_char;
614       GskUrl *rv;
615       if (query_start)
616 	{
617           query_start++;
618 	  query = g_alloca (strlen (query_start));
619 	  if (frag_start)
620 	    {
621 	      memcpy (query, query_start + 1, frag_start - query_start);
622 	      query[frag_start - query_start] = 0;
623 	    }
624 	  else
625 	    {
626 	      strcpy (query, query_start);
627 	    }
628 	}
629       if (frag_start)
630 	fragment = strcpy (g_alloca (strlen (frag_start)), frag_start + 1);
631 
632       path_len = location_end - location;
633       if (*location == '/')
634 	{
635 	  path = memcpy (g_alloca (path_len + 1), location, path_len);
636 	  path[path_len] = 0;
637 	}
638       else
639 	{
640 	  const char *last_slash = strrchr (base_url->path, '/');
641 	  guint len, total_len;
642           guint location_len = location_end - location;
643 	  if (!last_slash)
644 	    len = strlen (base_url->path);
645 	  else
646 	    len = last_slash - base_url->path;
647 
648 	  /* TODO: deal with '.' and '..' */
649 
650 	  total_len = len + 1 + location_len;
651 	  path = g_alloca (total_len + 1);
652 	  memcpy (path, base_url->path, len);
653 	  path[len] = '/';
654 	  memcpy (path + len + 1, location, location_len);
655           path[len + 1 + location_len] = '\0';
656 	}
657       if (path && !gsk_url_is_valid_path (path, &bad_char))
658         {
659           g_set_error (error, GSK_G_ERROR_DOMAIN, GSK_ERROR_INVALID_ARGUMENT,
660 	           "URL %s constructed uses unallowed character '%c' (0x%02x)",
661                    "relative path", bad_char, bad_char);
662           return NULL;
663         }
664       if (query && !gsk_url_is_valid_query (query, &bad_char))
665         {
666           g_set_error (error, GSK_G_ERROR_DOMAIN, GSK_ERROR_INVALID_ARGUMENT,
667 	           "URL %s constructed uses unallowed character '%c' (0x%02x)",
668                    "query", bad_char, bad_char);
669           return NULL;
670         }
671       if (fragment && !gsk_url_is_valid_fragment (fragment, &bad_char))
672         {
673           g_set_error (error, GSK_G_ERROR_DOMAIN, GSK_ERROR_INVALID_ARGUMENT,
674 	           "URL %s constructed uses unallowed character '%c' (0x%02x)",
675                    "fragment", bad_char, bad_char);
676           return NULL;
677         }
678       rv = gsk_url_new_from_parts (base_url->scheme,
679                                    base_url->host,
680                                    base_url->port,
681                                    base_url->user_name,
682                                    base_url->password,
683                                    path, query, fragment);
684       if (!url_check_is_valid (rv, error))
685         {
686           g_object_unref (rv);
687           return NULL;
688         }
689       return rv;
690     }
691 }
692 
693 /**
694  * gsk_url_to_string:
695  * @url: the URL to stringify.
696  *
697  * Convert the URL to a string.
698  *
699  * returns: the newly allocated string.
700  */
701 char *
gsk_url_to_string(const GskUrl * url)702 gsk_url_to_string (const GskUrl *url)
703 {
704   guint len = strlen (url->scheme_name)
705             + 4         /* :/// (max) */
706             + (url->host ? strlen (url->host) : 0)
707             + 3
708             + (url->password ? strlen (url->password) : 0)
709             + 10        /* port */
710             + (url->user_name ? strlen (url->user_name) : 0)
711             + 1
712             + (url->path ? strlen (url->path) : 0)
713             + 1
714             + (url->query ? strlen (url->query) : 0)
715             + 1
716             + (url->fragment ? strlen (url->fragment) : 0)
717             + 10        /* extra! */
718             ;
719   char *rv = g_malloc (len);
720   char *at = rv;
721 #define ADD_STR(str)    \
722   G_STMT_START{ strcpy(at,str); at=strchr(at,0); }G_STMT_END
723 #define ADD_CHAR(c)    \
724   G_STMT_START{ *at++ = c; }G_STMT_END
725   ADD_STR (url->scheme_name);
726   if (url->scheme == GSK_URL_SCHEME_FILE)
727     ADD_STR ("://");    /* note: the path typically includes one more '/' */
728   else if (url->host != NULL)
729     ADD_STR ("://");
730   else
731     ADD_STR (":");
732   if (url->user_name)
733     {
734       ADD_STR (url->user_name);
735       if (url->password)
736         {
737           ADD_CHAR (':');
738           ADD_STR (url->password);
739         }
740       ADD_CHAR ('@');
741     }
742   if (url->host)
743     {
744       ADD_STR (url->host);
745     }
746   if (url->port)
747     {
748       char buf[64];
749       g_snprintf(buf,sizeof(buf),":%u", url->port);
750       ADD_STR (buf);
751     }
752   if (url->path)
753     ADD_STR (url->path);
754   if (url->query)
755     {
756       ADD_CHAR ('?');
757       ADD_STR (url->query);
758     }
759   if (url->fragment)
760     {
761       ADD_CHAR ('#');
762       ADD_STR (url->fragment);
763     }
764   *at = 0;
765 
766   return rv;
767 }
768 
769 /**
770  * gsk_url_get_port:
771  * @url: the URL whose port is desired.
772  *
773  * Returns the port.  If the port is 0, the default port
774  * for the type of scheme is returned (80 for HTTP, 21 for FTP
775  * and 443 for HTTP/SSL).  If no default exists, 0 is returned.
776  *
777  * returns: the port as an integer, or 0 if no port could be computed.
778  */
779 guint
gsk_url_get_port(const GskUrl * url)780 gsk_url_get_port (const GskUrl *url)
781 {
782   if (url->port == 0)
783     {
784       switch (url->scheme)
785 	{
786 	case GSK_URL_SCHEME_HTTP:
787 	  return 80;
788 	case GSK_URL_SCHEME_HTTPS:
789 	  return 443;
790 	case GSK_URL_SCHEME_FTP:
791 	  return 21;
792 
793 	case GSK_URL_SCHEME_FILE:
794 	case GSK_URL_SCHEME_OTHER:
795 	  return 0;
796 	}
797     }
798   return url->port;
799 }
800 
/* --- arguments --- */
/* Property ids for the GObject properties that gsk_url_class_init()
 * installs and that gsk_url_get_property() / gsk_url_set_property()
 * dispatch on.  PROP_0 is a placeholder and is never installed.
 */
enum
{
  PROP_0,
  PROP_HOST,
  PROP_PASSWORD,
  PROP_PORT,
  PROP_USER_NAME,
  PROP_PATH,
  PROP_QUERY,
  PROP_FRAGMENT,
};
813 
814 static void
gsk_url_get_property(GObject * object,guint property_id,GValue * value,GParamSpec * pspec)815 gsk_url_get_property (GObject        *object,
816 		      guint           property_id,
817 		      GValue         *value,
818 		      GParamSpec     *pspec)
819 {
820   GskUrl *url = GSK_URL (object);
821   switch (property_id)
822     {
823     case PROP_HOST:
824       g_value_set_string (value, url->host);
825       break;
826     case PROP_PASSWORD:
827       g_value_set_string (value, url->password);
828       break;
829     case PROP_PORT:
830       g_value_set_uint (value, gsk_url_get_port (url));
831       break;
832     case PROP_USER_NAME:
833       g_value_set_string (value, url->user_name);
834       break;
835     case PROP_PATH:
836       g_value_set_string (value, url->path);
837       break;
838     case PROP_QUERY:
839       g_value_set_string (value, url->query);
840       break;
841     case PROP_FRAGMENT:
842       g_value_set_string (value, url->fragment);
843       break;
844     default:
845       G_OBJECT_WARN_INVALID_PROPERTY_ID (object, property_id, pspec);
846       break;
847     }
848 }
849 
850 
851 static void
gsk_url_set_property(GObject * object,guint property_id,const GValue * value,GParamSpec * pspec)852 gsk_url_set_property (GObject        *object,
853 		      guint           property_id,
854 		      const GValue   *value,
855 		      GParamSpec     *pspec)
856 {
857   GskUrl *url = GSK_URL (object);
858   switch (property_id)
859     {
860     case PROP_HOST:
861       g_free (url->host);
862       url->host = g_strdup (g_value_get_string (value));
863       break;
864     case PROP_PASSWORD:
865       g_free (url->password);
866       url->password = g_strdup (g_value_get_string (value));
867       break;
868     case PROP_PORT:
869       url->port = g_value_get_uint (value);
870       break;
871     case PROP_USER_NAME:
872       g_free (url->user_name);
873       url->user_name = g_strdup (g_value_get_string (value));
874       break;
875     case PROP_PATH:
876       g_free (url->path);
877       url->path = g_strdup (g_value_get_string (value));
878       break;
879     case PROP_QUERY:
880       g_free (url->query);
881       url->query = g_strdup (g_value_get_string (value));
882       break;
883     case PROP_FRAGMENT:
884       g_free (url->fragment);
885       url->fragment = g_strdup (g_value_get_string (value));
886       break;
887     default:
888       G_OBJECT_WARN_INVALID_PROPERTY_ID (object, property_id, pspec);
889       break;
890     }
891 }
892 
893 static void
gsk_url_init(GskUrl * url)894 gsk_url_init (GskUrl *url)
895 {
896   url->scheme = GSK_URL_SCHEME_OTHER;
897 }
898 
899 static void
gsk_url_class_init(GskUrlClass * class)900 gsk_url_class_init (GskUrlClass *class)
901 {
902   GObjectClass *object_class = G_OBJECT_CLASS (class);
903   GParamSpec *pspec;
904   parent_class = g_type_class_peek_parent (class);
905   object_class->set_property = gsk_url_set_property;
906   object_class->get_property = gsk_url_get_property;
907   object_class->finalize = gsk_url_finalize;
908   pspec = g_param_spec_string ("host",
909 			       _("Host Name"),
910 			       _("name of host having resource"),
911 			       NULL,
912                                G_PARAM_READWRITE);
913   g_object_class_install_property (object_class, PROP_HOST, pspec);
914 
915   pspec = g_param_spec_string ("password",
916 			       _("Password"),
917 			       _("password protecting resource"),
918 			       NULL,
919                                G_PARAM_READWRITE);
920   g_object_class_install_property (object_class, PROP_PASSWORD, pspec);
921 
922   pspec = g_param_spec_uint ("port",
923 			     _("Port"),
924 			     _("port for resource (or 0 for default)"),
925 			     0, 65536, 0,
926 			     G_PARAM_READWRITE);
927   g_object_class_install_property (object_class, PROP_PORT, pspec);
928 
929   pspec = g_param_spec_string ("user-name",
930 			       _("Username"),
931 			       _("username for resource"),
932 			       NULL,
933                                G_PARAM_READWRITE);
934   g_object_class_install_property (object_class, PROP_USER_NAME, pspec);
935 
936   pspec = g_param_spec_string ("path",
937 			       _("Path"),
938 			       _("Path on the server to the resource"),
939 			       NULL,
940                                G_PARAM_READWRITE);
941   g_object_class_install_property (object_class, PROP_PATH, pspec);
942 
943   pspec = g_param_spec_string ("query",
944 			       _("Query"),
945 			       _("Query (for HTTP resources)"),
946 			       NULL,
947                                G_PARAM_READWRITE);
948   g_object_class_install_property (object_class, PROP_QUERY, pspec);
949 
950   pspec = g_param_spec_string ("fragment",
951 			       _("Fragment"),
952 			       _("Fragment (for HTTP resources)"),
953 			       NULL,
954                                G_PARAM_READWRITE);
955   g_object_class_install_property (object_class, PROP_FRAGMENT, pspec);
956 }
957 
958 GType
gsk_url_get_type()959 gsk_url_get_type()
960 {
961   static GType url_type = 0;
962   if (!url_type)
963     {
964       static const GTypeInfo url_info =
965       {
966 	sizeof(GskUrlClass),
967 	(GBaseInitFunc) NULL,
968 	(GBaseFinalizeFunc) NULL,
969 	(GClassInitFunc) gsk_url_class_init,
970 	NULL,		/* class_finalize */
971 	NULL,		/* class_data */
972 	sizeof (GskUrl),
973 	0,		/* n_preallocs */
974 	(GInstanceInitFunc) gsk_url_init,
975 	NULL		/* value_table */
976       };
977       url_type = g_type_register_static (G_TYPE_OBJECT,
978                                                   "GskUrl",
979 						  &url_info, 0);
980     }
981   return url_type;
982 }
983 
984 /**
985  * gsk_url_hash:
986  * @url: a url.
987  *
988  * Compute a randomish hash code based on the URL.
989  *
990  * You can create a GHashTable that's keyed off of URLs with:
991  *   g_hash_table_new((GHashFunc)gsk_url_hash,
992  *                    (GEqualFunc)gsk_url_equal);
993  *
994  * returns: the hash code.
995  */
gsk_url_hash(const GskUrl * url)996 guint           gsk_url_hash                (const GskUrl    *url)
997 {
998   guint rv = 0;
999   rv += g_str_hash (url->scheme_name);
1000   if (url->host)
1001     rv += 33 * g_str_hash (url->host);
1002   if (url->password)
1003     rv += 1001 * g_str_hash (url->password);
1004   rv += 11 * url->port;
1005   if (url->user_name)
1006     rv ^= g_str_hash (url->user_name);
1007   if (url->path)
1008     rv ^= 101 * g_str_hash (url->path);
1009   if (url->query)
1010     rv ^= 10009 * g_str_hash (url->query);
1011   if (url->fragment)
1012     rv += 100001 * g_str_hash (url->fragment);
1013   return rv;
1014 }
safe_strs_equal(const char * a,const char * b)1015 static inline gboolean safe_strs_equal (const char *a, const char *b)
1016 {
1017   if (a == NULL && b == NULL)
1018     return TRUE;
1019   if (a == NULL || b == NULL)
1020     return FALSE;
1021   return strcmp (a,b) == 0;
1022 }
1023 
1024 /**
1025  * gsk_url_equal:
1026  * @a: a url.
1027  * @b: another url.
1028  *
1029  * Test to see if two URLs are the same.
1030  *
1031  * returns: whether the URLs are the same.
1032  */
gsk_url_equal(const GskUrl * a,const GskUrl * b)1033 gboolean gsk_url_equal (const GskUrl *a,
1034                         const GskUrl *b)
1035 {
1036   return safe_strs_equal (a->scheme_name, b->scheme_name)
1037       && safe_strs_equal (a->host, b->host)
1038       && safe_strs_equal (a->password, b->password)
1039       && a->port == b->port
1040       && safe_strs_equal (a->user_name, b->user_name)
1041       && safe_strs_equal (a->path, b->path)
1042       && safe_strs_equal (a->query, b->query)
1043       && safe_strs_equal (a->fragment, b->fragment);
1044 }
1045 
1046 /*
1047  * True if the ascii character c should be escaped within a URI.
1048  * See RFC 2396, section 2.
1049  *
1050  * According to section 2.4: "data must be escaped if it does not have a
1051  * representation using an unreserved character," where unreserved
1052  * characters are (section 2.3): "upper and lower case letters, decimal
1053  * digits, and a limited set of punctuation marks and symbols" [see below].
1054  */
1055 
1056 static guint8 should_be_escaped_data[16] =
1057 {
1058   0xff, 0xff, 0xff, 0xff, 0x7d, 0x98, 0x00, 0xfc,
1059   0x01, 0x00, 0x00, 0x78, 0x01, 0x00, 0x00, 0xb8,
1060 };
1061 static inline gboolean
should_be_escaped(char c)1062 should_be_escaped (char c)
1063 {
1064   if (c & 0x80)
1065     return TRUE;
1066   return (should_be_escaped_data[c>>3] & (1<<(7&c))) != 0;
1067 }
1068 
1069 static const char *hex_characters = "0123456789abcdef";
1070 
1071 /**
1072  * gsk_url_encode:
1073  * @decoded: decoded data to escape.
1074  *
1075  * Encode characters to be passed in a URL.
1076  * Basically, "unsafe" characters are converted
1077  * to %xx where 'x' is a hexidecimal digit.
1078  *
1079  * See RFC 2396 Section 2.
1080  *
1081  * returns: a newly allocated string.
1082  */
1083 char *
gsk_url_encode(const char * raw)1084 gsk_url_encode (const char      *raw)
1085 {
1086   int length = 0;
1087   const char *at;
1088   char *out;
1089   char *rv;
1090   for (at = raw; *at != '\0'; at++)
1091     if (should_be_escaped (*at))
1092       length += 3;
1093     else
1094       length += 1;
1095   rv = g_new (char, length + 1);
1096   out = rv;
1097   for (at = raw; *at != '\0'; at++)
1098     if (should_be_escaped (*at))
1099       {
1100         *out++ = '%';
1101 	*out++ = hex_characters [((guint8) *at) >> 4];
1102 	*out++ = hex_characters [((guint8) *at) & 0xf];
1103       }
1104     else
1105       {
1106 	*out = *at;
1107         out++;
1108       }
1109   *out = '\0';
1110   return rv;
1111 }
1112 
1113 /**
1114  * gsk_url_decode:
1115  * @encoded: encoded URL to convert to plaintext.
1116  *
1117  * Decode characters to be passed in a URL.
1118  * Basically, any %xx string is changed to the
1119  * character whose ASCII code is xx, treating xx as
1120  * a hexidecimal 2-digit number.
1121  *
1122  * See RFC ??, Section ??.
1123  *
1124  * returns: a newly allocated string.
1125  */
1126 char *
gsk_url_decode(const char * encoded)1127 gsk_url_decode  (const char      *encoded)
1128 {
1129   const char *at = encoded;
1130   int length = 0;
1131   char *rv;
1132   char *out;
1133   while (*at != '\0')
1134     {
1135       if (*at == '%')
1136         {
1137 	  if (at[1] == '\0' || at[2] == '\0')
1138 	    {
1139 	      g_warning ("malformed URL encoded string");
1140 	      return NULL;
1141 	    }
1142 	  at += 3;
1143 	  length++;
1144 	}
1145       else
1146 	{
1147 	  at++;
1148 	  length++;
1149 	}
1150     }
1151   rv = g_new (char, length + 1);
1152   out = rv;
1153   at = encoded;
1154   while (*at != '\0')
1155     {
1156       if (*at == '%')
1157         {
1158 	  char hex[3];
1159 	  hex[0] = at[1];
1160 	  hex[1] = at[2];
1161 	  hex[2] = '\0';
1162 	  if (at[1] == '\0' || at[2] == '\0')
1163 	    return NULL;
1164 	  at += 3;
1165 	  *out++ = (char) strtol (hex, NULL, 16);
1166 	}
1167       else
1168 	{
1169 	  *out++ = *at++;
1170 	  length++;
1171 	}
1172     }
1173   *out = '\0';
1174   return rv;
1175 }
1176 
1177 /**
1178  * gsk_url_encode_http:
1179  * @decoded: the raw url text; this is treated as raw 8-bit data,
1180  * not UTF-8.
1181  *
1182  * Do what is typically thought of
1183  * as "url encoding" in http-land... namely SPACE maps to '+'
1184  * and funny characters are encoded
1185  * as %xx where 'x' denotes a single hex-digit.
1186  *
1187  * returns: a newly allocated encoded string that the caller
1188  * must free.
1189  */
1190 char *
gsk_url_encode_http(const char * decoded)1191 gsk_url_encode_http (const char *decoded)
1192 {
1193   const char *at;
1194   guint len = 0;
1195   char *rv;
1196   char *rv_at;
1197   for (at = decoded; *at != '\0'; at++)
1198     {
1199       if (*at != ' ' && should_be_escaped (*at))
1200 	len += 3;
1201       else
1202 	len++;
1203     }
1204 
1205   rv = g_malloc (len + 1);
1206   rv_at = rv;
1207   for (at = decoded; *at != '\0'; at++)
1208     {
1209       if (*at == ' ')
1210 	*rv_at++ = '+';
1211       else if (should_be_escaped (*at))
1212 	{
1213 	  *rv_at++ = '%';
1214 	  *rv_at++ = hex_characters [((guint8) *at) >> 4];
1215 	  *rv_at++ = hex_characters [((guint8) *at) & 0xf];
1216 	}
1217       else
1218 	*rv_at++ = *at;
1219     }
1220   *rv_at = '\0';
1221   return rv;
1222 }
1223 
1224 /**
1225  * gsk_url_encode_http_binary:
1226  * @decoded: the raw binary data: may contain NULs.
1227  * @length: length of the binary data, in bytes.
1228  *
1229  * Do what is typically thought of
1230  * as "url encoding" in http-land... namely SPACE maps to '+'
1231  * and funny characters are encoded
1232  * as %xx where 'x' denotes a single hex-digit.
1233  *
1234  * returns: a newly allocated encoded string that the caller
1235  * must free.
1236  */
1237 char *
gsk_url_encode_http_binary(const guint8 * decoded,guint length)1238 gsk_url_encode_http_binary (const guint8 *decoded,
1239                             guint         length)
1240 {
1241   guint rv_len = length;
1242   char *rv;
1243   char *at;
1244   guint i;
1245   for (i = 0; i < length; i++)
1246     if (should_be_escaped (decoded[i]))
1247       rv_len += 2;
1248   rv = g_malloc (rv_len + 1);
1249   at = rv;
1250   for (i = 0; i < length; i++)
1251     if (should_be_escaped (decoded[i]))
1252       {
1253         *at++ = '%';
1254         *at++ = hex_characters[decoded[i] >> 4];
1255         *at++ = hex_characters[decoded[i] & 0xf];
1256       }
1257     else
1258       *at++ = decoded[i];
1259   *at = 0;
1260   return rv;
1261 }
1262 
1263 /**
1264  * gsk_url_decode_http:
1265  * @encoded: the encoded url text.
1266  *
1267  * Do what is typically thought of
1268  * as "url decoding" in http-land... namely '+' maps to SPACE
1269  * and %xx, where 'x' denotes a single hex-digit, maps to the character
1270  * given as hexidecimal.  (warning: the resulting string is not UTF-8)
1271  *
1272  * returns: a newly allocated encoded string that the caller
1273  * must free (the empty string "" when unable to decode hex).
1274  */
1275 char *
gsk_url_decode_http(const char * encoded)1276 gsk_url_decode_http (const char *encoded)
1277 {
1278   const char *at;
1279   guint len = 0;
1280   char *rv;
1281   char *rv_at;
1282   for (at = encoded; *at != '\0'; at++)
1283     {
1284       if (*at == '%')
1285 	{
1286 	  at++;
1287 	  if (!isxdigit(*at))
1288 	    return g_strdup ("");
1289 	  at++;
1290 	  if (!isxdigit(*at))
1291 	    return g_strdup ("");
1292 	  len++;
1293 	}
1294       else
1295 	{
1296 	  len++;
1297 	}
1298     }
1299   rv = g_malloc (len + 1);
1300   rv_at = rv;
1301   for (at = encoded; *at != '\0'; at++)
1302     {
1303       if (*at == '%')
1304 	{
1305 	  char hex[3];
1306 	  hex[0] = *(++at);
1307 	  hex[1] = *(++at);
1308 	  hex[2] = 0;
1309 	  *rv_at++ = (char) strtol (hex, NULL, 16);
1310 	}
1311       else if (*at == '+')
1312 	*rv_at++ = ' ';
1313       else
1314 	*rv_at++ = *at;
1315     }
1316   *rv_at = '\0';
1317   return rv;
1318 }
1319 
1320 /* gsk_url_split_form_urlencoded:
1321  * @encoded_query: the encoded form data
1322  *
1323  * Split an "application/x-www-form-urlencoded"
1324  * format query string into key-value pairs.
1325  *
1326  * See RFC 1866, section 8.2.1.
1327  *
1328  * returns: a null-terminated array of strings: key, value, ... NULL.
1329  * Caller must free result with g_strfreev.
1330  */
1331 char **
gsk_url_split_form_urlencoded(const char * encoded_query)1332 gsk_url_split_form_urlencoded (const char *encoded_query)
1333 {
1334   enum { START, GOT_OTHER, GOT_EQUALS, INVALID } state = START;
1335   guint num_pairs = 0;
1336   const char *query_at;
1337   char **rv, **rv_at;
1338   char *copy, *copy_at;
1339   const char *name = "", *value = "";
1340 
1341   g_return_val_if_fail (encoded_query, NULL);
1342 
1343   /* Scan for valid pairs:
1344    * one more more [^&=]; =; zero or more [^&=]; & or end.
1345    */
1346   for (query_at = encoded_query; ; ++query_at)
1347     switch (*query_at)
1348       {
1349 	case '\0':
1350 	  if (state == GOT_EQUALS)
1351 	    ++num_pairs;
1352 	  goto DONE_SCANNING;
1353 	case '&':
1354 	  if (state == GOT_EQUALS)
1355 	    ++num_pairs;
1356 	  state = START;
1357 	  break;
1358 	case '=':
1359 	  state = GOT_OTHER ? GOT_EQUALS : INVALID;
1360 	  break;
1361 	default:
1362 	  if (state == START)
1363 	    state = GOT_OTHER;
1364 	  break;
1365       }
1366 DONE_SCANNING:
1367   /* num_pairs * (name, value) + terminating NULL */
1368   rv = g_new (gchar *, (num_pairs << 1) + 1);
1369 
1370   copy = g_strdup (encoded_query);
1371   for (state = START, rv_at = rv, copy_at = copy; ; ++copy_at)
1372     switch (*copy_at)
1373       {
1374 	case '\0':
1375 	  if (state == GOT_EQUALS)
1376 	    {
1377 	      *rv_at++ = gsk_url_decode_http (name);
1378 	      *rv_at++ = gsk_url_decode_http (value);
1379 	    }
1380 	  goto DONE;
1381 	case '&':
1382 	  if (state == GOT_EQUALS)
1383 	    {
1384 	      *copy_at = 0;
1385 	      *rv_at++ = gsk_url_decode_http (name);
1386 	      *rv_at++ = gsk_url_decode_http (value);
1387 	    }
1388 	  state = START;
1389 	  break;
1390 	case '=':
1391 	  if (state == GOT_OTHER)
1392 	    {
1393 	      state = GOT_EQUALS;
1394 	      *copy_at = 0;
1395 	      value = copy_at + 1;
1396 	    }
1397 	  else
1398 	    state = INVALID;
1399 	  break;
1400 	default:
1401 	  if (state == START)
1402 	    {
1403 	      state = GOT_OTHER;
1404 	      name = copy_at;
1405 	    }
1406 	  break;
1407       }
1408 DONE:
1409   g_free (copy);
1410   *rv_at = NULL;
1411   return rv;
1412 }
1413