1 /* tld.c --- Declarations for TLD restriction checking.
2    Copyright (C) 2004-2016 Simon Josefsson.
3    Copyright (C) 2003-2014, 2016 Free Software Foundation, Inc.
4 
5    Author: Thomas Jacob, Internet24.de
6 
7    This file is part of GNU Libidn.
8 
9    GNU Libidn is free software: you can redistribute it and/or
10    modify it under the terms of either:
11 
12      * the GNU Lesser General Public License as published by the Free
13        Software Foundation; either version 3 of the License, or (at
14        your option) any later version.
15 
16    or
17 
18      * the GNU General Public License as published by the Free
19        Software Foundation; either version 2 of the License, or (at
20        your option) any later version.
21 
22    or both in parallel, as here.
23 
24    GNU Libidn is distributed in the hope that it will be useful,
25    but WITHOUT ANY WARRANTY; without even the implied warranty of
26    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
27    General Public License for more details.
28 
29    You should have received copies of the GNU General Public License and
30    the GNU Lesser General Public License along with this program.  If
31    not, see <http://www.gnu.org/licenses/>. */
32 
33 #include <config.h>
34 
35 /* Get stringprep_utf8_to_ucs4, stringprep_locale_to_utf8. */
36 #include <stringprep.h>
37 
38 /* Get strcmp(). */
39 #include <string.h>
40 
41 /* Get specifications. */
42 #include <tld.h>
43 
44 /* Array of built-in domain restriction structures.  See tlds.c.  */
45 extern const Tld_table *_tld_tables[];
46 
47 /**
48  * tld_get_table:
49  * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
50  * @tables: Zero terminated array of #Tld_table info-structures for
51  *   TLDs.
52  *
53  * Get the TLD table for a named TLD by searching through the given
54  * TLD table array.
55  *
56  * Return value: Return structure corresponding to TLD @tld by going
57  *   thru @tables, or return %NULL if no such structure is found.
58  */
59 const Tld_table *
tld_get_table(const char * tld,const Tld_table ** tables)60 tld_get_table (const char *tld, const Tld_table ** tables)
61 {
62   const Tld_table **tldtable = NULL;
63 
64   if (!tld || !tables)
65     return NULL;
66 
67   for (tldtable = tables; *tldtable; tldtable++)
68     if (!strcmp ((*tldtable)->name, tld))
69       return *tldtable;
70 
71   return NULL;
72 }
73 
74 /**
75  * tld_default_table:
76  * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
77  * @overrides: Additional zero terminated array of #Tld_table
78  *   info-structures for TLDs, or %NULL to only use library deault
79  *   tables.
80  *
81  * Get the TLD table for a named TLD, using the internal defaults,
82  * possibly overrided by the (optional) supplied tables.
83  *
84  * Return value: Return structure corresponding to TLD @tld_str, first
85  *   looking through @overrides then thru built-in list, or %NULL if
86  *   no such structure found.
87  */
88 const Tld_table *
tld_default_table(const char * tld,const Tld_table ** overrides)89 tld_default_table (const char *tld, const Tld_table ** overrides)
90 {
91   const Tld_table *tldtable = NULL;
92 
93   if (!tld)
94     return NULL;
95 
96   if (overrides)
97     tldtable = tld_get_table (tld, overrides);
98 
99   if (!tldtable)
100     tldtable = tld_get_table (tld, _tld_tables);
101 
102   return tldtable;
103 }
104 
/* Non-zero if code point C is one of the four characters treated as a
   label separating dot: U+002E (full stop), U+3002 (ideographic full
   stop), U+FF0E (fullwidth full stop) or U+FF61 (halfwidth
   ideographic full stop).  C is evaluated several times, so it must
   not have side effects.  */
#define DOTP(c)                 \
  ((c) == 0x002E                \
   || (c) == 0x3002             \
   || (c) == 0xFF0E             \
   || (c) == 0xFF61)
107 
108 /**
109  * tld_get_4:
110  * @in: Array of unicode code points to process. Does not need to be
111  *   zero terminated.
112  * @inlen: Number of unicode code points.
113  * @out: Zero terminated ascii result string pointer.
114  *
115  * Isolate the top-level domain of @in and return it as an ASCII
116  * string in @out.
117  *
118  * Return value: Return %TLD_SUCCESS on success, or the corresponding
119  *   #Tld_rc error code otherwise.
120  */
121 int
tld_get_4(const uint32_t * in,size_t inlen,char ** out)122 tld_get_4 (const uint32_t * in, size_t inlen, char **out)
123 {
124   const uint32_t *ipos;
125   size_t olen;
126 
127   *out = NULL;
128   if (!in || inlen == 0)
129     return TLD_NODATA;
130 
131   ipos = &in[inlen - 1];
132   olen = 0;
133   /* Scan backwards for non(latin)letters. */
134   while (ipos >= in && ((*ipos >= 0x41 && *ipos <= 0x5A) ||
135 			(*ipos >= 0x61 && *ipos <= 0x7A)))
136     ipos--, olen++;
137 
138   if (olen > 0 && ipos >= in && DOTP (*ipos))
139     {
140       /* Found something that appears a TLD. */
141       char *out_s = malloc (sizeof (char) * (olen + 1));
142       char *opos = out_s;
143 
144       if (!opos)
145 	return TLD_MALLOC_ERROR;
146 
147       ipos++;
148       /* Transcribe to lowercase ascii string. */
149       for (; ipos < &in[inlen]; ipos++, opos++)
150 	*opos = *ipos > 0x5A ? *ipos : *ipos + 0x20;
151       *opos = 0;
152       *out = out_s;
153       return TLD_SUCCESS;
154     }
155 
156   return TLD_NO_TLD;
157 }
158 
159 /**
160  * tld_get_4z:
161  * @in: Zero terminated array of unicode code points to process.
162  * @out: Zero terminated ascii result string pointer.
163  *
164  * Isolate the top-level domain of @in and return it as an ASCII
165  * string in @out.
166  *
167  * Return value: Return %TLD_SUCCESS on success, or the corresponding
168  *   #Tld_rc error code otherwise.
169  */
170 int
tld_get_4z(const uint32_t * in,char ** out)171 tld_get_4z (const uint32_t * in, char **out)
172 {
173   const uint32_t *ipos = in;
174 
175   if (!in)
176     return TLD_NODATA;
177 
178   while (*ipos)
179     ipos++;
180 
181   return tld_get_4 (in, ipos - in, out);
182 }
183 
184 /**
185  * tld_get_z:
186  * @in: Zero terminated character array to process.
187  * @out: Zero terminated ascii result string pointer.
188  *
189  * Isolate the top-level domain of @in and return it as an ASCII
190  * string in @out.  The input string @in may be UTF-8, ISO-8859-1 or
191  * any ASCII compatible character encoding.
192  *
193  * Return value: Return %TLD_SUCCESS on success, or the corresponding
194  *   #Tld_rc error code otherwise.
195  */
196 int
tld_get_z(const char * in,char ** out)197 tld_get_z (const char *in, char **out)
198 {
199   uint32_t *iucs;
200   size_t i, ilen;
201   int rc;
202 
203   ilen = strlen (in);
204   iucs = calloc (ilen, sizeof (*iucs));
205 
206   if (!iucs)
207     return TLD_MALLOC_ERROR;
208 
209   for (i = 0; i < ilen; i++)
210     iucs[i] = in[i];
211 
212   rc = tld_get_4 (iucs, ilen, out);
213 
214   free (iucs);
215 
216   return rc;
217 }
218 
219 /*
220  * tld_checkchar - verify that character is permitted
221  * @ch: 32 bit unicode character to check.
222  * @tld: A #Tld_table data structure to check @ch against.
223  *
224  * Verify if @ch is either in [a-z0-9-.] or mentioned as a valid
225  * character in @tld.
226  *
227  * Return value: Return the #Tld_rc value %TLD_SUCCESS if @ch is a
228  *   valid character for the TLD @tld or if @tld is %NULL,
229  *   %TLD_INVALID if @ch is invalid as defined by @tld.
230  */
231 static int
_tld_checkchar(uint32_t ch,const Tld_table * tld)232 _tld_checkchar (uint32_t ch, const Tld_table * tld)
233 {
234   const Tld_table_element *s, *e, *m;
235 
236   if (!tld)
237     return TLD_SUCCESS;
238 
239   /* Check for [-a-z0-9.]. */
240   if ((ch >= 0x61 && ch <= 0x7A) ||
241       (ch >= 0x30 && ch <= 0x39) || ch == 0x2D || DOTP (ch))
242     return TLD_SUCCESS;
243 
244   s = tld->valid;
245   e = s + tld->nvalid;
246   while (s < e)
247     {
248       m = s + ((e - s) >> 1);
249       if (ch < m->start)
250 	e = m;
251       else if (ch > m->end)
252 	s = m + 1;
253       else
254 	return TLD_SUCCESS;
255     }
256 
257   return TLD_INVALID;
258 }
259 
260 /**
261  * tld_check_4t:
262  * @in: Array of unicode code points to process. Does not need to be
263  *   zero terminated.
264  * @inlen: Number of unicode code points.
265  * @errpos: Position of offending character is returned here.
266  * @tld: A #Tld_table data structure representing the restrictions for
267  *   which the input should be tested.
268  *
269  * Test each of the code points in @in for whether or not
270  * they are allowed by the data structure in @tld, return
271  * the position of the first character for which this is not
272  * the case in @errpos.
273  *
274  * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
275  *   points are valid or when @tld is null, %TLD_INVALID if a
276  *   character is not allowed, or additional error codes on general
277  *   failure conditions.
278  */
279 int
tld_check_4t(const uint32_t * in,size_t inlen,size_t * errpos,const Tld_table * tld)280 tld_check_4t (const uint32_t * in, size_t inlen, size_t * errpos,
281 	      const Tld_table * tld)
282 {
283   const uint32_t *ipos;
284   int rc;
285 
286   if (!tld)			/* No data for TLD so everything is valid. */
287     return TLD_SUCCESS;
288 
289   ipos = in;
290   while (ipos < &in[inlen])
291     {
292       rc = _tld_checkchar (*ipos, tld);
293       if (rc != TLD_SUCCESS)
294 	{
295 	  if (errpos)
296 	    *errpos = ipos - in;
297 	  return rc;
298 	}
299       ipos++;
300     }
301   return TLD_SUCCESS;
302 }
303 
304 /**
305  * tld_check_4tz:
306  * @in: Zero terminated array of unicode code points to process.
307  * @errpos: Position of offending character is returned here.
308  * @tld: A #Tld_table data structure representing the restrictions for
309  *   which the input should be tested.
310  *
311  * Test each of the code points in @in for whether or not
312  * they are allowed by the data structure in @tld, return
313  * the position of the first character for which this is not
314  * the case in @errpos.
315  *
316  * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
317  *   points are valid or when @tld is null, %TLD_INVALID if a
318  *   character is not allowed, or additional error codes on general
319  *   failure conditions.
320  */
321 int
tld_check_4tz(const uint32_t * in,size_t * errpos,const Tld_table * tld)322 tld_check_4tz (const uint32_t * in, size_t * errpos, const Tld_table * tld)
323 {
324   const uint32_t *ipos = in;
325 
326   if (!ipos)
327     return TLD_NODATA;
328 
329   while (*ipos)
330     ipos++;
331 
332   return tld_check_4t (in, ipos - in, errpos, tld);
333 }
334 
335 /**
336  * tld_check_4:
337  * @in: Array of unicode code points to process. Does not need to be
338  *   zero terminated.
339  * @inlen: Number of unicode code points.
340  * @errpos: Position of offending character is returned here.
341  * @overrides: A #Tld_table array of additional domain restriction
342  *  structures that complement and supersede the built-in information.
343  *
344  * Test each of the code points in @in for whether or not they are
345  * allowed by the information in @overrides or by the built-in TLD
346  * restriction data. When data for the same TLD is available both
347  * internally and in @overrides, the information in @overrides takes
348  * precedence. If several entries for a specific TLD are found, the
349  * first one is used.  If @overrides is %NULL, only the built-in
350  * information is used.  The position of the first offending character
351  * is returned in @errpos.
352  *
353  * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
354  *   points are valid or when @tld is null, %TLD_INVALID if a
355  *   character is not allowed, or additional error codes on general
356  *   failure conditions.
357  */
358 int
tld_check_4(const uint32_t * in,size_t inlen,size_t * errpos,const Tld_table ** overrides)359 tld_check_4 (const uint32_t * in, size_t inlen, size_t * errpos,
360 	     const Tld_table ** overrides)
361 {
362   const Tld_table *tld;
363   char *domain;
364   int rc;
365 
366   if (errpos)
367     *errpos = 0;
368 
369   /* Get TLD name. */
370   rc = tld_get_4 (in, inlen, &domain);
371 
372   if (rc != TLD_SUCCESS)
373     {
374       if (rc == TLD_NO_TLD)	/* No TLD, say OK */
375 	return TLD_SUCCESS;
376       else
377 	return rc;
378     }
379 
380   /* Retrieve appropriate data structure. */
381   tld = tld_default_table (domain, overrides);
382   free (domain);
383 
384   return tld_check_4t (in, inlen, errpos, tld);
385 }
386 
387 /**
388  * tld_check_4z:
389  * @in: Zero-terminated array of unicode code points to process.
390  * @errpos: Position of offending character is returned here.
391  * @overrides: A #Tld_table array of additional domain restriction
392  *   structures that complement and supersede the built-in information.
393  *
394  * Test each of the code points in @in for whether or not they are
395  * allowed by the information in @overrides or by the built-in TLD
396  * restriction data. When data for the same TLD is available both
397  * internally and in @overrides, the information in @overrides takes
398  * precedence. If several entries for a specific TLD are found, the
399  * first one is used.  If @overrides is %NULL, only the built-in
400  * information is used.  The position of the first offending character
401  * is returned in @errpos.
402  *
403  * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
404  *   points are valid or when @tld is null, %TLD_INVALID if a
405  *   character is not allowed, or additional error codes on general
406  *   failure conditions.
407  */
408 int
tld_check_4z(const uint32_t * in,size_t * errpos,const Tld_table ** overrides)409 tld_check_4z (const uint32_t * in, size_t * errpos,
410 	      const Tld_table ** overrides)
411 {
412   const uint32_t *ipos = in;
413 
414   if (!ipos)
415     return TLD_NODATA;
416 
417   while (*ipos)
418     ipos++;
419 
420   return tld_check_4 (in, ipos - in, errpos, overrides);
421 }
422 
423 /**
424  * tld_check_8z:
425  * @in: Zero-terminated UTF8 string to process.
426  * @errpos: Position of offending character is returned here.
427  * @overrides: A #Tld_table array of additional domain restriction
428  *   structures that complement and supersede the built-in information.
429  *
430  * Test each of the characters in @in for whether or not they are
431  * allowed by the information in @overrides or by the built-in TLD
432  * restriction data. When data for the same TLD is available both
433  * internally and in @overrides, the information in @overrides takes
434  * precedence. If several entries for a specific TLD are found, the
435  * first one is used.  If @overrides is %NULL, only the built-in
436  * information is used.  The position of the first offending character
437  * is returned in @errpos.  Note that the error position refers to the
438  * decoded character offset rather than the byte position in the
439  * string.
440  *
441  * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all
442  *   characters are valid or when @tld is null, %TLD_INVALID if a
443  *   character is not allowed, or additional error codes on general
444  *   failure conditions.
445  */
446 int
tld_check_8z(const char * in,size_t * errpos,const Tld_table ** overrides)447 tld_check_8z (const char *in, size_t * errpos, const Tld_table ** overrides)
448 {
449   uint32_t *iucs;
450   size_t ilen;
451   int rc;
452 
453   if (!in)
454     return TLD_NODATA;
455 
456   iucs = stringprep_utf8_to_ucs4 (in, -1, &ilen);
457 
458   if (!iucs)
459     return TLD_MALLOC_ERROR;
460 
461   rc = tld_check_4 (iucs, ilen, errpos, overrides);
462 
463   free (iucs);
464 
465   return rc;
466 }
467 
468 /**
469  * tld_check_lz:
470  * @in: Zero-terminated string in the current locales encoding to process.
471  * @errpos: Position of offending character is returned here.
472  * @overrides: A #Tld_table array of additional domain restriction
473  *   structures that complement and supersede the built-in information.
474  *
475  * Test each of the characters in @in for whether or not they are
476  * allowed by the information in @overrides or by the built-in TLD
477  * restriction data. When data for the same TLD is available both
478  * internally and in @overrides, the information in @overrides takes
479  * precedence. If several entries for a specific TLD are found, the
480  * first one is used.  If @overrides is %NULL, only the built-in
481  * information is used.  The position of the first offending character
482  * is returned in @errpos.  Note that the error position refers to the
483  * decoded character offset rather than the byte position in the
484  * string.
485  *
486  * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all
487  *   characters are valid or when @tld is null, %TLD_INVALID if a
488  *   character is not allowed, or additional error codes on general
489  *   failure conditions.
490  */
491 int
tld_check_lz(const char * in,size_t * errpos,const Tld_table ** overrides)492 tld_check_lz (const char *in, size_t * errpos, const Tld_table ** overrides)
493 {
494   char *utf8;
495   int rc;
496 
497   if (!in)
498     return TLD_NODATA;
499 
500   utf8 = stringprep_locale_to_utf8 (in);
501   if (!utf8)
502     return TLD_ICONV_ERROR;
503 
504 
505   rc = tld_check_8z (utf8, errpos, overrides);
506 
507   free (utf8);
508 
509   return rc;
510 }
511 
512 /**
513  * Tld_rc:
514  * @TLD_SUCCESS: Successful operation.  This value is guaranteed to
515  *   always be zero, the remaining ones are only guaranteed to hold
516  *   non-zero values, for logical comparison purposes.
517  * @TLD_INVALID: Invalid character found.
518  * @TLD_NODATA: No input data was provided.
519  * @TLD_MALLOC_ERROR: Error during memory allocation.
520  * @TLD_ICONV_ERROR: Error during iconv string conversion.
521  * @TLD_NO_TLD: No top-level domain found in domain string.
522  * @TLD_NOTLD: Same as @TLD_NO_TLD, for compatibility
523  *   with typo in earlier versions.
524  *
525  * Enumerated return codes of the TLD checking functions.
526  * The value 0 is guaranteed to always correspond to success.
527  */
528