1 /* tld.c --- Declarations for TLD restriction checking.
2 Copyright (C) 2004-2016 Simon Josefsson.
3 Copyright (C) 2003-2014, 2016 Free Software Foundation, Inc.
4
5 Author: Thomas Jacob, Internet24.de
6
7 This file is part of GNU Libidn.
8
9 GNU Libidn is free software: you can redistribute it and/or
10 modify it under the terms of either:
11
12 * the GNU Lesser General Public License as published by the Free
13 Software Foundation; either version 3 of the License, or (at
14 your option) any later version.
15
16 or
17
18 * the GNU General Public License as published by the Free
19 Software Foundation; either version 2 of the License, or (at
20 your option) any later version.
21
22 or both in parallel, as here.
23
24 GNU Libidn is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27 General Public License for more details.
28
29 You should have received copies of the GNU General Public License and
30 the GNU Lesser General Public License along with this program. If
31 not, see <http://www.gnu.org/licenses/>. */
32
33 #include <config.h>
34
35 /* Get stringprep_utf8_to_ucs4, stringprep_locale_to_utf8. */
36 #include <stringprep.h>
37
38 /* Get strcmp(). */
39 #include <string.h>
40
41 /* Get specifications. */
42 #include <tld.h>
43
44 /* Array of built-in domain restriction structures. See tlds.c. */
45 extern const Tld_table *_tld_tables[];
46
47 /**
48 * tld_get_table:
49 * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
50 * @tables: Zero terminated array of #Tld_table info-structures for
51 * TLDs.
52 *
53 * Get the TLD table for a named TLD by searching through the given
54 * TLD table array.
55 *
56 * Return value: Return structure corresponding to TLD @tld by going
57 * thru @tables, or return %NULL if no such structure is found.
58 */
59 const Tld_table *
tld_get_table(const char * tld,const Tld_table ** tables)60 tld_get_table (const char *tld, const Tld_table ** tables)
61 {
62 const Tld_table **tldtable = NULL;
63
64 if (!tld || !tables)
65 return NULL;
66
67 for (tldtable = tables; *tldtable; tldtable++)
68 if (!strcmp ((*tldtable)->name, tld))
69 return *tldtable;
70
71 return NULL;
72 }
73
74 /**
75 * tld_default_table:
76 * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
77 * @overrides: Additional zero terminated array of #Tld_table
78 * info-structures for TLDs, or %NULL to only use library deault
79 * tables.
80 *
81 * Get the TLD table for a named TLD, using the internal defaults,
82 * possibly overrided by the (optional) supplied tables.
83 *
84 * Return value: Return structure corresponding to TLD @tld_str, first
85 * looking through @overrides then thru built-in list, or %NULL if
86 * no such structure found.
87 */
88 const Tld_table *
tld_default_table(const char * tld,const Tld_table ** overrides)89 tld_default_table (const char *tld, const Tld_table ** overrides)
90 {
91 const Tld_table *tldtable = NULL;
92
93 if (!tld)
94 return NULL;
95
96 if (overrides)
97 tldtable = tld_get_table (tld, overrides);
98
99 if (!tldtable)
100 tldtable = tld_get_table (tld, _tld_tables);
101
102 return tldtable;
103 }
104
/* Non-zero when code point C is one of the four dot code points
   treated as label separators in this file: U+002E, U+3002, U+FF0E
   or U+FF61.  C is expanded several times, so the argument must not
   have side effects. */
#define DOTP(c)                 \
  ((c) == 0x002E                \
   || (c) == 0x3002             \
   || (c) == 0xFF0E             \
   || (c) == 0xFF61)
107
108 /**
109 * tld_get_4:
110 * @in: Array of unicode code points to process. Does not need to be
111 * zero terminated.
112 * @inlen: Number of unicode code points.
113 * @out: Zero terminated ascii result string pointer.
114 *
115 * Isolate the top-level domain of @in and return it as an ASCII
116 * string in @out.
117 *
118 * Return value: Return %TLD_SUCCESS on success, or the corresponding
119 * #Tld_rc error code otherwise.
120 */
121 int
tld_get_4(const uint32_t * in,size_t inlen,char ** out)122 tld_get_4 (const uint32_t * in, size_t inlen, char **out)
123 {
124 const uint32_t *ipos;
125 size_t olen;
126
127 *out = NULL;
128 if (!in || inlen == 0)
129 return TLD_NODATA;
130
131 ipos = &in[inlen - 1];
132 olen = 0;
133 /* Scan backwards for non(latin)letters. */
134 while (ipos >= in && ((*ipos >= 0x41 && *ipos <= 0x5A) ||
135 (*ipos >= 0x61 && *ipos <= 0x7A)))
136 ipos--, olen++;
137
138 if (olen > 0 && ipos >= in && DOTP (*ipos))
139 {
140 /* Found something that appears a TLD. */
141 char *out_s = malloc (sizeof (char) * (olen + 1));
142 char *opos = out_s;
143
144 if (!opos)
145 return TLD_MALLOC_ERROR;
146
147 ipos++;
148 /* Transcribe to lowercase ascii string. */
149 for (; ipos < &in[inlen]; ipos++, opos++)
150 *opos = *ipos > 0x5A ? *ipos : *ipos + 0x20;
151 *opos = 0;
152 *out = out_s;
153 return TLD_SUCCESS;
154 }
155
156 return TLD_NO_TLD;
157 }
158
159 /**
160 * tld_get_4z:
161 * @in: Zero terminated array of unicode code points to process.
162 * @out: Zero terminated ascii result string pointer.
163 *
164 * Isolate the top-level domain of @in and return it as an ASCII
165 * string in @out.
166 *
167 * Return value: Return %TLD_SUCCESS on success, or the corresponding
168 * #Tld_rc error code otherwise.
169 */
170 int
tld_get_4z(const uint32_t * in,char ** out)171 tld_get_4z (const uint32_t * in, char **out)
172 {
173 const uint32_t *ipos = in;
174
175 if (!in)
176 return TLD_NODATA;
177
178 while (*ipos)
179 ipos++;
180
181 return tld_get_4 (in, ipos - in, out);
182 }
183
184 /**
185 * tld_get_z:
186 * @in: Zero terminated character array to process.
187 * @out: Zero terminated ascii result string pointer.
188 *
189 * Isolate the top-level domain of @in and return it as an ASCII
190 * string in @out. The input string @in may be UTF-8, ISO-8859-1 or
191 * any ASCII compatible character encoding.
192 *
193 * Return value: Return %TLD_SUCCESS on success, or the corresponding
194 * #Tld_rc error code otherwise.
195 */
196 int
tld_get_z(const char * in,char ** out)197 tld_get_z (const char *in, char **out)
198 {
199 uint32_t *iucs;
200 size_t i, ilen;
201 int rc;
202
203 ilen = strlen (in);
204 iucs = calloc (ilen, sizeof (*iucs));
205
206 if (!iucs)
207 return TLD_MALLOC_ERROR;
208
209 for (i = 0; i < ilen; i++)
210 iucs[i] = in[i];
211
212 rc = tld_get_4 (iucs, ilen, out);
213
214 free (iucs);
215
216 return rc;
217 }
218
219 /*
220 * tld_checkchar - verify that character is permitted
221 * @ch: 32 bit unicode character to check.
222 * @tld: A #Tld_table data structure to check @ch against.
223 *
224 * Verify if @ch is either in [a-z0-9-.] or mentioned as a valid
225 * character in @tld.
226 *
227 * Return value: Return the #Tld_rc value %TLD_SUCCESS if @ch is a
228 * valid character for the TLD @tld or if @tld is %NULL,
229 * %TLD_INVALID if @ch is invalid as defined by @tld.
230 */
231 static int
_tld_checkchar(uint32_t ch,const Tld_table * tld)232 _tld_checkchar (uint32_t ch, const Tld_table * tld)
233 {
234 const Tld_table_element *s, *e, *m;
235
236 if (!tld)
237 return TLD_SUCCESS;
238
239 /* Check for [-a-z0-9.]. */
240 if ((ch >= 0x61 && ch <= 0x7A) ||
241 (ch >= 0x30 && ch <= 0x39) || ch == 0x2D || DOTP (ch))
242 return TLD_SUCCESS;
243
244 s = tld->valid;
245 e = s + tld->nvalid;
246 while (s < e)
247 {
248 m = s + ((e - s) >> 1);
249 if (ch < m->start)
250 e = m;
251 else if (ch > m->end)
252 s = m + 1;
253 else
254 return TLD_SUCCESS;
255 }
256
257 return TLD_INVALID;
258 }
259
260 /**
261 * tld_check_4t:
262 * @in: Array of unicode code points to process. Does not need to be
263 * zero terminated.
264 * @inlen: Number of unicode code points.
265 * @errpos: Position of offending character is returned here.
266 * @tld: A #Tld_table data structure representing the restrictions for
267 * which the input should be tested.
268 *
269 * Test each of the code points in @in for whether or not
270 * they are allowed by the data structure in @tld, return
271 * the position of the first character for which this is not
272 * the case in @errpos.
273 *
274 * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
275 * points are valid or when @tld is null, %TLD_INVALID if a
276 * character is not allowed, or additional error codes on general
277 * failure conditions.
278 */
279 int
tld_check_4t(const uint32_t * in,size_t inlen,size_t * errpos,const Tld_table * tld)280 tld_check_4t (const uint32_t * in, size_t inlen, size_t * errpos,
281 const Tld_table * tld)
282 {
283 const uint32_t *ipos;
284 int rc;
285
286 if (!tld) /* No data for TLD so everything is valid. */
287 return TLD_SUCCESS;
288
289 ipos = in;
290 while (ipos < &in[inlen])
291 {
292 rc = _tld_checkchar (*ipos, tld);
293 if (rc != TLD_SUCCESS)
294 {
295 if (errpos)
296 *errpos = ipos - in;
297 return rc;
298 }
299 ipos++;
300 }
301 return TLD_SUCCESS;
302 }
303
304 /**
305 * tld_check_4tz:
306 * @in: Zero terminated array of unicode code points to process.
307 * @errpos: Position of offending character is returned here.
308 * @tld: A #Tld_table data structure representing the restrictions for
309 * which the input should be tested.
310 *
311 * Test each of the code points in @in for whether or not
312 * they are allowed by the data structure in @tld, return
313 * the position of the first character for which this is not
314 * the case in @errpos.
315 *
316 * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
317 * points are valid or when @tld is null, %TLD_INVALID if a
318 * character is not allowed, or additional error codes on general
319 * failure conditions.
320 */
321 int
tld_check_4tz(const uint32_t * in,size_t * errpos,const Tld_table * tld)322 tld_check_4tz (const uint32_t * in, size_t * errpos, const Tld_table * tld)
323 {
324 const uint32_t *ipos = in;
325
326 if (!ipos)
327 return TLD_NODATA;
328
329 while (*ipos)
330 ipos++;
331
332 return tld_check_4t (in, ipos - in, errpos, tld);
333 }
334
335 /**
336 * tld_check_4:
337 * @in: Array of unicode code points to process. Does not need to be
338 * zero terminated.
339 * @inlen: Number of unicode code points.
340 * @errpos: Position of offending character is returned here.
341 * @overrides: A #Tld_table array of additional domain restriction
342 * structures that complement and supersede the built-in information.
343 *
344 * Test each of the code points in @in for whether or not they are
345 * allowed by the information in @overrides or by the built-in TLD
346 * restriction data. When data for the same TLD is available both
347 * internally and in @overrides, the information in @overrides takes
348 * precedence. If several entries for a specific TLD are found, the
349 * first one is used. If @overrides is %NULL, only the built-in
350 * information is used. The position of the first offending character
351 * is returned in @errpos.
352 *
353 * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
354 * points are valid or when @tld is null, %TLD_INVALID if a
355 * character is not allowed, or additional error codes on general
356 * failure conditions.
357 */
358 int
tld_check_4(const uint32_t * in,size_t inlen,size_t * errpos,const Tld_table ** overrides)359 tld_check_4 (const uint32_t * in, size_t inlen, size_t * errpos,
360 const Tld_table ** overrides)
361 {
362 const Tld_table *tld;
363 char *domain;
364 int rc;
365
366 if (errpos)
367 *errpos = 0;
368
369 /* Get TLD name. */
370 rc = tld_get_4 (in, inlen, &domain);
371
372 if (rc != TLD_SUCCESS)
373 {
374 if (rc == TLD_NO_TLD) /* No TLD, say OK */
375 return TLD_SUCCESS;
376 else
377 return rc;
378 }
379
380 /* Retrieve appropriate data structure. */
381 tld = tld_default_table (domain, overrides);
382 free (domain);
383
384 return tld_check_4t (in, inlen, errpos, tld);
385 }
386
387 /**
388 * tld_check_4z:
389 * @in: Zero-terminated array of unicode code points to process.
390 * @errpos: Position of offending character is returned here.
391 * @overrides: A #Tld_table array of additional domain restriction
392 * structures that complement and supersede the built-in information.
393 *
394 * Test each of the code points in @in for whether or not they are
395 * allowed by the information in @overrides or by the built-in TLD
396 * restriction data. When data for the same TLD is available both
397 * internally and in @overrides, the information in @overrides takes
398 * precedence. If several entries for a specific TLD are found, the
399 * first one is used. If @overrides is %NULL, only the built-in
400 * information is used. The position of the first offending character
401 * is returned in @errpos.
402 *
403 * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
404 * points are valid or when @tld is null, %TLD_INVALID if a
405 * character is not allowed, or additional error codes on general
406 * failure conditions.
407 */
408 int
tld_check_4z(const uint32_t * in,size_t * errpos,const Tld_table ** overrides)409 tld_check_4z (const uint32_t * in, size_t * errpos,
410 const Tld_table ** overrides)
411 {
412 const uint32_t *ipos = in;
413
414 if (!ipos)
415 return TLD_NODATA;
416
417 while (*ipos)
418 ipos++;
419
420 return tld_check_4 (in, ipos - in, errpos, overrides);
421 }
422
423 /**
424 * tld_check_8z:
425 * @in: Zero-terminated UTF8 string to process.
426 * @errpos: Position of offending character is returned here.
427 * @overrides: A #Tld_table array of additional domain restriction
428 * structures that complement and supersede the built-in information.
429 *
430 * Test each of the characters in @in for whether or not they are
431 * allowed by the information in @overrides or by the built-in TLD
432 * restriction data. When data for the same TLD is available both
433 * internally and in @overrides, the information in @overrides takes
434 * precedence. If several entries for a specific TLD are found, the
435 * first one is used. If @overrides is %NULL, only the built-in
436 * information is used. The position of the first offending character
437 * is returned in @errpos. Note that the error position refers to the
438 * decoded character offset rather than the byte position in the
439 * string.
440 *
441 * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all
442 * characters are valid or when @tld is null, %TLD_INVALID if a
443 * character is not allowed, or additional error codes on general
444 * failure conditions.
445 */
446 int
tld_check_8z(const char * in,size_t * errpos,const Tld_table ** overrides)447 tld_check_8z (const char *in, size_t * errpos, const Tld_table ** overrides)
448 {
449 uint32_t *iucs;
450 size_t ilen;
451 int rc;
452
453 if (!in)
454 return TLD_NODATA;
455
456 iucs = stringprep_utf8_to_ucs4 (in, -1, &ilen);
457
458 if (!iucs)
459 return TLD_MALLOC_ERROR;
460
461 rc = tld_check_4 (iucs, ilen, errpos, overrides);
462
463 free (iucs);
464
465 return rc;
466 }
467
468 /**
469 * tld_check_lz:
470 * @in: Zero-terminated string in the current locales encoding to process.
471 * @errpos: Position of offending character is returned here.
472 * @overrides: A #Tld_table array of additional domain restriction
473 * structures that complement and supersede the built-in information.
474 *
475 * Test each of the characters in @in for whether or not they are
476 * allowed by the information in @overrides or by the built-in TLD
477 * restriction data. When data for the same TLD is available both
478 * internally and in @overrides, the information in @overrides takes
479 * precedence. If several entries for a specific TLD are found, the
480 * first one is used. If @overrides is %NULL, only the built-in
481 * information is used. The position of the first offending character
482 * is returned in @errpos. Note that the error position refers to the
483 * decoded character offset rather than the byte position in the
484 * string.
485 *
486 * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all
487 * characters are valid or when @tld is null, %TLD_INVALID if a
488 * character is not allowed, or additional error codes on general
489 * failure conditions.
490 */
491 int
tld_check_lz(const char * in,size_t * errpos,const Tld_table ** overrides)492 tld_check_lz (const char *in, size_t * errpos, const Tld_table ** overrides)
493 {
494 char *utf8;
495 int rc;
496
497 if (!in)
498 return TLD_NODATA;
499
500 utf8 = stringprep_locale_to_utf8 (in);
501 if (!utf8)
502 return TLD_ICONV_ERROR;
503
504
505 rc = tld_check_8z (utf8, errpos, overrides);
506
507 free (utf8);
508
509 return rc;
510 }
511
512 /**
513 * Tld_rc:
514 * @TLD_SUCCESS: Successful operation. This value is guaranteed to
515 * always be zero, the remaining ones are only guaranteed to hold
516 * non-zero values, for logical comparison purposes.
517 * @TLD_INVALID: Invalid character found.
518 * @TLD_NODATA: No input data was provided.
519 * @TLD_MALLOC_ERROR: Error during memory allocation.
520 * @TLD_ICONV_ERROR: Error during iconv string conversion.
521 * @TLD_NO_TLD: No top-level domain found in domain string.
522 * @TLD_NOTLD: Same as @TLD_NO_TLD, for compatibility
523 * with typo in earlier versions.
524 *
525 * Enumerated return codes of the TLD checking functions.
526 * The value 0 is guaranteed to always correspond to success.
527 */
528