/* -*- Mode: c; c-basic-offset: 2 -*-
 *
 * raptor_utf8.c - Raptor UTF-8 and Unicode support
 *
 * Copyright (C) 2002-2007, David Beckett http://www.dajobe.org/
 * Copyright (C) 2002-2004, University of Bristol, UK http://www.bristol.ac.uk/
 *
 * This package is Free Software and part of Redland http://librdf.org/
 *
 * It is licensed under the following three licenses as alternatives:
 *   1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
 *   2. GNU General Public License (GPL) V2 or any newer version
 *   3. Apache License, V2.0 or any newer version
 *
 * You may not use this file except in compliance with at least one of
 * the above three licenses.
 *
 * See LICENSE.html or LICENSE.txt at the top of this package for the
 * complete terms and further detail along with the license texts for
 * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
 *
 *
 */
24 
25 
#ifdef HAVE_CONFIG_H
#include <raptor_config.h>
#endif

#ifdef WIN32
#include <win32_raptor_config.h>
#endif

#include <stdio.h>
#include <stdarg.h>
#include <stddef.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif

/* Raptor includes */
#include "raptor.h"
#include "raptor_internal.h"
#ifdef RAPTOR_NFC_CHECK
#include "raptor_nfc.h"
#endif
46 
47 
/**
 * raptor_unicode_char_to_utf8:
 * @c: Unicode character
 * @output: UTF-8 string buffer or NULL
 *
 * Convert a Unicode character to UTF-8 encoding.
 *
 * If @output is NULL, only the number of bytes needed to encode @c is
 * calculated and nothing is written.  Otherwise @output must have room
 * for the returned number of bytes (at most 6); no terminating NUL is
 * written.
 *
 * Return value: bytes encoded to output buffer (or bytes required when
 * @output is NULL), or <0 on failure (@c is 0x80000000 or larger)
 **/
int
raptor_unicode_char_to_utf8(raptor_unichar c, unsigned char *output)
{
  int size=0;

  if      (c < 0x00000080)
    size=1;
  else if (c < 0x00000800)
    size=2;
  else if (c < 0x00010000)
    size=3;
  else if (c < 0x00200000)
    size=4;
  else if (c < 0x04000000)
    size=5;
  else if (c < 0x80000000)
    size=6;
  else
    return -1;

  /* length-only query: the documented NULL buffer must not be written */
  if(!output)
    return size;

  /* Emit the trailing bytes last-to-first, 6 payload bits at a time.
   * Each case also ORs in one marker bit positioned so that, after the
   * remaining shifts, it lands in the lead byte written by case 1.
   */
  switch(size) {
    case 6:
      output[5]=0x80 | (unsigned char)(c & 0x3F);
      c= c >> 6;
      /* becomes bit 2 of the lead byte after four more shifts */
      c |= 0x4000000; /* 0x4000000 = 0x04 << 24 */
      /* FALLTHROUGH */
    case 5:
      output[4]=0x80 | (unsigned char)(c & 0x3F);
      c= c >> 6;
      /* becomes bit 3 of the lead byte after three more shifts */
      c |= 0x200000; /* 0x200000 = 0x08 << 18 */
      /* FALLTHROUGH */
    case 4:
      output[3]=0x80 | (unsigned char)(c & 0x3F);
      c= c >> 6;
      /* becomes bit 4 of the lead byte after two more shifts */
      c |= 0x10000; /* 0x10000 = 0x10 << 12 */
      /* FALLTHROUGH */
    case 3:
      output[2]=0x80 | (unsigned char)(c & 0x3F);
      c= c >> 6;
      /* becomes bit 5 of the lead byte after one more shift */
      c |= 0x800; /* 0x800 = 0x20 << 6 */
      /* FALLTHROUGH */
    case 2:
      output[1]=0x80 | (unsigned char)(c & 0x3F);
      c= c >> 6;
      /* set bits 7,6 of the lead byte */
      c |= 0xc0;
      /* FALLTHROUGH */
    case 1:
      output[0]=(unsigned char)c;
      break;
    default:
      break;
  }

  return size;
}
118 
119 
/**
 * raptor_utf8_to_unicode_char:
 * @output: Pointer to the Unicode character or NULL
 * @input: UTF-8 string buffer
 * @length: buffer size
 *
 * Convert an UTF-8 encoded buffer to a Unicode character.
 *
 * If output is NULL, then will calculate the number of bytes that
 * will be used from the input buffer and not perform the conversion.
 * In that case only the first byte is examined and @length is only
 * required to be at least 1.
 *
 * The sequence length is taken from the first byte; the top two bits
 * of the following bytes are masked off without being checked.
 *
 * Return value: bytes used from input buffer or <0 on failure: -1 input buffer too short or length error, -2 overlong UTF-8 sequence, -3 illegal code positions, -4 code out of range U+0000 to U+10FFFF.  In cases -2, -3 and -4 the coded character is stored in the output.
 */
int
raptor_utf8_to_unicode_char(raptor_unichar *output,
                            const unsigned char *input, int length)
{
  unsigned char in;
  int size;
  raptor_unichar c=0;

  if(length < 1)
    return -1;

  /* the lead byte gives the sequence length and the top payload bits */
  in=*input++;
  if((in & 0x80) == 0) {
    size=1;
    c= in & 0x7f;
  } else if((in & 0xe0) == 0xc0) {
    size=2;
    c= in & 0x1f;
  } else if((in & 0xf0) == 0xe0) {
    size=3;
    c= in & 0x0f;
  } else if((in & 0xf8) == 0xf0) {
    size=4;
    c = in & 0x07;
  } else if((in & 0xfc) == 0xf8) {
    size=5;
    c = in & 0x03;
  } else if((in & 0xfe) == 0xfc) {
    size=6;
    c = in & 0x01;
  } else
    return -1;


  if(!output)
    return size;

  if(length < size)
    return -1;

  /* fold in 6 payload bits from each of the size-1 following bytes */
  switch(size) {
    case 6:
      in=*input++ & 0x3f;
      c= c << 6;
      c |= in;
      /* FALLTHROUGH */
    case 5:
      in=*input++ & 0x3f;
      c= c << 6;
      c |= in;
      /* FALLTHROUGH */
    case 4:
      in=*input++ & 0x3f;
      c= c << 6;
      c |= in;
      /* FALLTHROUGH */
    case 3:
      in=*input++ & 0x3f;
      c= c << 6;
      c |= in;
      /* FALLTHROUGH */
    case 2:
      in=*input++ & 0x3f;
      c= c << 6;
      c |= in;
      /* FALLTHROUGH */
    default:
      break;
  }

  /* stored before validation so the error cases can report the value */
  *output=c;

  /* check for overlong UTF-8 sequences: the value must need this many
   * bytes.  Sizes 5 and 6 are included so that a short value padded out
   * to 5 or 6 bytes is rejected instead of being accepted as valid.
   */
  switch(size) {
    case 2:
      if(c < 0x00000080)
        return -2;
      break;
    case 3:
      if(c < 0x00000800)
        return -2;
      break;
    case 4:
      if(c < 0x00010000)
        return -2;
      break;
    case 5:
      if(c < 0x00200000)
        return -2;
      break;
    case 6:
      if(c < 0x04000000)
        return -2;
      break;

    default: /* 1 */
      break;
  }


  /* check for illegal code positions:
   * U+D800 to U+DFFF (UTF-16 surrogates)
   * U+FFFE and U+FFFF
   */
  if((c > 0xD7FF && c < 0xE000) || c == 0xFFFE || c == 0xFFFF)
    return -3;

  /* Unicode 3.2 only defines U+0000 to U+10FFFF and UTF-8 encodings of it */
  /* of course this makes some 4 byte forms illegal */
  if(c > 0x10ffff)
    return -4;

  return size;
}
239 
240 
/* XML 1.0 character class helpers: declared here because the public
 * name checks below use them before their definitions.
 */
static int raptor_unicode_is_letter(long c);
static int raptor_unicode_is_basechar(long c);
static int raptor_unicode_is_ideographic(long c);
static int raptor_unicode_is_combiningchar(long c);
static int raptor_unicode_is_digit(long c);
static int raptor_unicode_is_extender(long c);
247 
248 
/**
 * raptor_unicode_is_xml11_namestartchar:
 * @c: Unicode character to check
 *
 * Check if Unicode character is legal to start an XML 1.1 Name
 *
 * Namespaces in XML 1.1 REC 2004-02-04
 *   http://www.w3.org/TR/2004/REC-xml11-20040204/#NT-NameStartChar
 * updating
 *   Extensible Markup Language (XML) 1.1 REC 2004-02-04
 *   http://www.w3.org/TR/2004/REC-xml11-20040204/ sec 2.3, [4a]
 * excluding the ':'
 *
 * Return value: non-0 if legal
 **/
int
raptor_unicode_is_xml11_namestartchar(raptor_unichar c)
{
  return (((c >= 0x0041)  && (c <= 0x005A)) || /* [A-Z] */
          (c == 0x005F) ||                     /* '_' */
          ((c >= 0x0061)  && (c <= 0x007A)) || /* [a-z] */
          ((c >= 0x00C0)  && (c <= 0x00D6)) ||
          ((c >= 0x00D8)  && (c <= 0x00F6)) ||
          ((c >= 0x00F8)  && (c <= 0x02FF)) ||
          ((c >= 0x0370)  && (c <= 0x037D)) ||
          ((c >= 0x037F)  && (c <= 0x1FFF)) ||
          ((c >= 0x200C)  && (c <= 0x200D)) ||
          ((c >= 0x2070)  && (c <= 0x218F)) ||
          ((c >= 0x2C00)  && (c <= 0x2FEF)) ||
          ((c >= 0x3001)  && (c <= 0xD7FF)) ||
          ((c >= 0xF900)  && (c <= 0xFDCF)) ||
          ((c >= 0xFDF0)  && (c <= 0xFFFD)) ||
          ((c >= 0x10000) && (c <= 0xEFFFF)));
}
283 
284 
/**
 * raptor_unicode_is_xml10_namestartchar:
 * @c: Unicode character to check
 *
 * Check if Unicode character is legal to start an XML 1.0 Name
 *
 * Namespaces in XML REC 1999-01-14
 *   http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName
 * updating
 *   Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04
 *   http://www.w3.org/TR/2004/REC-xml-20040204/
 * excluding the ':'
 *
 * Legal characters are the XML 1.0 Letter class and '_'.
 *
 * Return value: non-0 if legal
 **/
int
raptor_unicode_is_xml10_namestartchar(raptor_unichar c)
{
  /* the class helper takes a long; same conversion the call did implicitly */
  return (raptor_unicode_is_letter((long)c) ||
          (c == '_'));
}
306 
307 
/**
 * raptor_unicode_is_namestartchar:
 * @c: Unicode character to check
 *
 * Check if Unicode character is legal to start an XML Name
 *
 * Uses the XML 1.1 rules when compiled with RAPTOR_XML_1_1 defined,
 * otherwise the XML 1.0 rules.
 *
 * Return value: non-0 if the character is legal
 **/
int
raptor_unicode_is_namestartchar(raptor_unichar c)
{
#ifdef RAPTOR_XML_1_1
  return raptor_unicode_is_xml11_namestartchar(c);
#else
  return raptor_unicode_is_xml10_namestartchar(c);
#endif
}
324 
325 
/**
 * raptor_unicode_is_xml11_namechar:
 * @c: Unicode character
 *
 * Check if a Unicode codepoint is legal to continue an XML 1.1 Name
 *
 * Namespaces in XML 1.1 REC 2004-02-04
 *   http://www.w3.org/TR/2004/REC-xml11-20040204/
 * updating
 *   Extensible Markup Language (XML) 1.1 REC 2004-02-04
 *   http://www.w3.org/TR/2004/REC-xml11-20040204/ sec 2.3, [4a]
 * excluding the ':'
 *
 * Legal characters are every XML 1.1 name start character plus the
 * extra characters and ranges listed in the body.
 *
 * Return value: non-0 if legal
 **/
int
raptor_unicode_is_xml11_namechar(raptor_unichar c)
{
  return (raptor_unicode_is_xml11_namestartchar(c) ||
          (c == 0x002D) || /* '-' */
          (c == 0x002E) || /* '.' */
          (c >= 0x0030 && c <= 0x0039) || /* 0-9 */
          (c == 0x00B7) ||
          (c >= 0x0300 && c <= 0x036F) ||
          (c >= 0x203F && c <= 0x2040));
}
352 
353 
/**
 * raptor_unicode_is_xml10_namechar:
 * @c: Unicode character
 *
 * Check if a Unicode codepoint is legal to continue an XML 1.0 Name
 *
 * Namespaces in XML REC 1999-01-14
 *   http://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCNameChar
 * updating
 *   Extensible Markup Language (XML) 1.0 (Third Edition) REC 2004-02-04
 *   http://www.w3.org/TR/2004/REC-xml-20040204/
 * excluding the ':'
 *
 * Legal characters are the XML 1.0 Letter, Digit, CombiningChar and
 * Extender classes plus '.', '-' and '_'.
 *
 * Return value: non-0 if legal
 **/
int
raptor_unicode_is_xml10_namechar(raptor_unichar c)
{
  /* the class helpers take a long; same conversion the calls did implicitly */
  long lc=(long)c;

  return (raptor_unicode_is_letter(lc) ||
          raptor_unicode_is_digit(lc) ||
          (c == 0x002E) || /* '.' */
          (c == 0x002D) || /* '-' */
          (c == 0x005F) || /* '_' */
          raptor_unicode_is_combiningchar(lc) ||
          raptor_unicode_is_extender(lc));
}
380 
381 
/**
 * raptor_unicode_is_namechar:
 * @c: Unicode character to check
 *
 * Check if Unicode character is legal to continue an XML Name.
 *
 * Uses the XML 1.1 rules when compiled with RAPTOR_XML_1_1 defined,
 * otherwise the XML 1.0 rules.
 *
 * Return value: non-0 if the character is legal
 **/
int
raptor_unicode_is_namechar(raptor_unichar c)
{
#ifdef RAPTOR_XML_1_1
  return raptor_unicode_is_xml11_namechar(c);
#else
  return raptor_unicode_is_xml10_namechar(c);
#endif
}
399 
400 
/*
 * Everything below was derived by machine-transforming the character
 * classes in Appendix B of http://www.w3.org/TR/2000/REC-xml-20001006
 */
405 
/* Return non-0 if c is in the XML 1.0 Letter class, i.e. it is either
 * a BaseChar or an Ideographic.
 */
static int
raptor_unicode_is_letter(long c)
{
  return(raptor_unicode_is_basechar(c) ||
         raptor_unicode_is_ideographic(c));
}
412 
413 
/* Return non-0 if c is in the XML 1.0 BaseChar class.
 * The entries are kept in ascending code point order.
 */
static int
raptor_unicode_is_basechar(long c)
{
  /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-BaseChar */
  return((c >= 0x0041 && c <= 0x005A) ||
         (c >= 0x0061 && c <= 0x007A) ||
         (c >= 0x00C0 && c <= 0x00D6) ||
         (c >= 0x00D8 && c <= 0x00F6) ||
         (c >= 0x00F8 && c <= 0x00FF) ||
         (c >= 0x0100 && c <= 0x0131) ||
         (c >= 0x0134 && c <= 0x013E) ||
         (c >= 0x0141 && c <= 0x0148) ||
         (c >= 0x014A && c <= 0x017E) ||
         (c >= 0x0180 && c <= 0x01C3) ||
         (c >= 0x01CD && c <= 0x01F0) ||
         (c >= 0x01F4 && c <= 0x01F5) ||
         (c >= 0x01FA && c <= 0x0217) ||
         (c >= 0x0250 && c <= 0x02A8) ||
         (c >= 0x02BB && c <= 0x02C1) ||
         (c == 0x0386) ||
         (c >= 0x0388 && c <= 0x038A) ||
         (c == 0x038C) ||
         (c >= 0x038E && c <= 0x03A1) ||
         (c >= 0x03A3 && c <= 0x03CE) ||
         (c >= 0x03D0 && c <= 0x03D6) ||
         (c == 0x03DA) ||
         (c == 0x03DC) ||
         (c == 0x03DE) ||
         (c == 0x03E0) ||
         (c >= 0x03E2 && c <= 0x03F3) ||
         (c >= 0x0401 && c <= 0x040C) ||
         (c >= 0x040E && c <= 0x044F) ||
         (c >= 0x0451 && c <= 0x045C) ||
         (c >= 0x045E && c <= 0x0481) ||
         (c >= 0x0490 && c <= 0x04C4) ||
         (c >= 0x04C7 && c <= 0x04C8) ||
         (c >= 0x04CB && c <= 0x04CC) ||
         (c >= 0x04D0 && c <= 0x04EB) ||
         (c >= 0x04EE && c <= 0x04F5) ||
         (c >= 0x04F8 && c <= 0x04F9) ||
         (c >= 0x0531 && c <= 0x0556) ||
         (c == 0x0559) ||
         (c >= 0x0561 && c <= 0x0586) ||
         (c >= 0x05D0 && c <= 0x05EA) ||
         (c >= 0x05F0 && c <= 0x05F2) ||
         (c >= 0x0621 && c <= 0x063A) ||
         (c >= 0x0641 && c <= 0x064A) ||
         (c >= 0x0671 && c <= 0x06B7) ||
         (c >= 0x06BA && c <= 0x06BE) ||
         (c >= 0x06C0 && c <= 0x06CE) ||
         (c >= 0x06D0 && c <= 0x06D3) ||
         (c == 0x06D5) ||
         (c >= 0x06E5 && c <= 0x06E6) ||
         (c >= 0x0905 && c <= 0x0939) ||
         (c == 0x093D) ||
         (c >= 0x0958 && c <= 0x0961) ||
         (c >= 0x0985 && c <= 0x098C) ||
         (c >= 0x098F && c <= 0x0990) ||
         (c >= 0x0993 && c <= 0x09A8) ||
         (c >= 0x09AA && c <= 0x09B0) ||
         (c == 0x09B2) ||
         (c >= 0x09B6 && c <= 0x09B9) ||
         (c >= 0x09DC && c <= 0x09DD) ||
         (c >= 0x09DF && c <= 0x09E1) ||
         (c >= 0x09F0 && c <= 0x09F1) ||
         (c >= 0x0A05 && c <= 0x0A0A) ||
         (c >= 0x0A0F && c <= 0x0A10) ||
         (c >= 0x0A13 && c <= 0x0A28) ||
         (c >= 0x0A2A && c <= 0x0A30) ||
         (c >= 0x0A32 && c <= 0x0A33) ||
         (c >= 0x0A35 && c <= 0x0A36) ||
         (c >= 0x0A38 && c <= 0x0A39) ||
         (c >= 0x0A59 && c <= 0x0A5C) ||
         (c == 0x0A5E) ||
         (c >= 0x0A72 && c <= 0x0A74) ||
         (c >= 0x0A85 && c <= 0x0A8B) ||
         (c == 0x0A8D) ||
         (c >= 0x0A8F && c <= 0x0A91) ||
         (c >= 0x0A93 && c <= 0x0AA8) ||
         (c >= 0x0AAA && c <= 0x0AB0) ||
         (c >= 0x0AB2 && c <= 0x0AB3) ||
         (c >= 0x0AB5 && c <= 0x0AB9) ||
         (c == 0x0ABD) ||
         (c == 0x0AE0) ||
         (c >= 0x0B05 && c <= 0x0B0C) ||
         (c >= 0x0B0F && c <= 0x0B10) ||
         (c >= 0x0B13 && c <= 0x0B28) ||
         (c >= 0x0B2A && c <= 0x0B30) ||
         (c >= 0x0B32 && c <= 0x0B33) ||
         (c >= 0x0B36 && c <= 0x0B39) ||
         (c == 0x0B3D) ||
         (c >= 0x0B5C && c <= 0x0B5D) ||
         (c >= 0x0B5F && c <= 0x0B61) ||
         (c >= 0x0B85 && c <= 0x0B8A) ||
         (c >= 0x0B8E && c <= 0x0B90) ||
         (c >= 0x0B92 && c <= 0x0B95) ||
         (c >= 0x0B99 && c <= 0x0B9A) ||
         (c == 0x0B9C) ||
         (c >= 0x0B9E && c <= 0x0B9F) ||
         (c >= 0x0BA3 && c <= 0x0BA4) ||
         (c >= 0x0BA8 && c <= 0x0BAA) ||
         (c >= 0x0BAE && c <= 0x0BB5) ||
         (c >= 0x0BB7 && c <= 0x0BB9) ||
         (c >= 0x0C05 && c <= 0x0C0C) ||
         (c >= 0x0C0E && c <= 0x0C10) ||
         (c >= 0x0C12 && c <= 0x0C28) ||
         (c >= 0x0C2A && c <= 0x0C33) ||
         (c >= 0x0C35 && c <= 0x0C39) ||
         (c >= 0x0C60 && c <= 0x0C61) ||
         (c >= 0x0C85 && c <= 0x0C8C) ||
         (c >= 0x0C8E && c <= 0x0C90) ||
         (c >= 0x0C92 && c <= 0x0CA8) ||
         (c >= 0x0CAA && c <= 0x0CB3) ||
         (c >= 0x0CB5 && c <= 0x0CB9) ||
         (c == 0x0CDE) ||
         (c >= 0x0CE0 && c <= 0x0CE1) ||
         (c >= 0x0D05 && c <= 0x0D0C) ||
         (c >= 0x0D0E && c <= 0x0D10) ||
         (c >= 0x0D12 && c <= 0x0D28) ||
         (c >= 0x0D2A && c <= 0x0D39) ||
         (c >= 0x0D60 && c <= 0x0D61) ||
         (c >= 0x0E01 && c <= 0x0E2E) ||
         (c == 0x0E30) ||
         (c >= 0x0E32 && c <= 0x0E33) ||
         (c >= 0x0E40 && c <= 0x0E45) ||
         (c >= 0x0E81 && c <= 0x0E82) ||
         (c == 0x0E84) ||
         (c >= 0x0E87 && c <= 0x0E88) ||
         (c == 0x0E8A) ||
         (c == 0x0E8D) ||
         (c >= 0x0E94 && c <= 0x0E97) ||
         (c >= 0x0E99 && c <= 0x0E9F) ||
         (c >= 0x0EA1 && c <= 0x0EA3) ||
         (c == 0x0EA5) ||
         (c == 0x0EA7) ||
         (c >= 0x0EAA && c <= 0x0EAB) ||
         (c >= 0x0EAD && c <= 0x0EAE) ||
         (c == 0x0EB0) ||
         (c >= 0x0EB2 && c <= 0x0EB3) ||
         (c == 0x0EBD) ||
         (c >= 0x0EC0 && c <= 0x0EC4) ||
         (c >= 0x0F40 && c <= 0x0F47) ||
         (c >= 0x0F49 && c <= 0x0F69) ||
         (c >= 0x10A0 && c <= 0x10C5) ||
         (c >= 0x10D0 && c <= 0x10F6) ||
         (c == 0x1100) ||
         (c >= 0x1102 && c <= 0x1103) ||
         (c >= 0x1105 && c <= 0x1107) ||
         (c == 0x1109) ||
         (c >= 0x110B && c <= 0x110C) ||
         (c >= 0x110E && c <= 0x1112) ||
         (c == 0x113C) ||
         (c == 0x113E) ||
         (c == 0x1140) ||
         (c == 0x114C) ||
         (c == 0x114E) ||
         (c == 0x1150) ||
         (c >= 0x1154 && c <= 0x1155) ||
         (c == 0x1159) ||
         (c >= 0x115F && c <= 0x1161) ||
         (c == 0x1163) ||
         (c == 0x1165) ||
         (c == 0x1167) ||
         (c == 0x1169) ||
         (c >= 0x116D && c <= 0x116E) ||
         (c >= 0x1172 && c <= 0x1173) ||
         (c == 0x1175) ||
         (c == 0x119E) ||
         (c == 0x11A8) ||
         (c == 0x11AB) ||
         (c >= 0x11AE && c <= 0x11AF) ||
         (c >= 0x11B7 && c <= 0x11B8) ||
         (c == 0x11BA) ||
         (c >= 0x11BC && c <= 0x11C2) ||
         (c == 0x11EB) ||
         (c == 0x11F0) ||
         (c == 0x11F9) ||
         (c >= 0x1E00 && c <= 0x1E9B) ||
         (c >= 0x1EA0 && c <= 0x1EF9) ||
         (c >= 0x1F00 && c <= 0x1F15) ||
         (c >= 0x1F18 && c <= 0x1F1D) ||
         (c >= 0x1F20 && c <= 0x1F45) ||
         (c >= 0x1F48 && c <= 0x1F4D) ||
         (c >= 0x1F50 && c <= 0x1F57) ||
         (c == 0x1F59) ||
         (c == 0x1F5B) ||
         (c == 0x1F5D) ||
         (c >= 0x1F5F && c <= 0x1F7D) ||
         (c >= 0x1F80 && c <= 0x1FB4) ||
         (c >= 0x1FB6 && c <= 0x1FBC) ||
         (c == 0x1FBE) ||
         (c >= 0x1FC2 && c <= 0x1FC4) ||
         (c >= 0x1FC6 && c <= 0x1FCC) ||
         (c >= 0x1FD0 && c <= 0x1FD3) ||
         (c >= 0x1FD6 && c <= 0x1FDB) ||
         (c >= 0x1FE0 && c <= 0x1FEC) ||
         (c >= 0x1FF2 && c <= 0x1FF4) ||
         (c >= 0x1FF6 && c <= 0x1FFC) ||
         (c == 0x2126) ||
         (c >= 0x212A && c <= 0x212B) ||
         (c == 0x212E) ||
         (c >= 0x2180 && c <= 0x2182) ||
         (c >= 0x3041 && c <= 0x3094) ||
         (c >= 0x30A1 && c <= 0x30FA) ||
         (c >= 0x3105 && c <= 0x312C) ||
         (c >= 0xAC00 && c <= 0xD7A3)
         );
}
622 
623 
/* Return non-0 if c is in the XML 1.0 Ideographic class. */
static int
raptor_unicode_is_ideographic(long c)
{
  /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Ideographic */
  return((c >= 0x4E00 && c <= 0x9FA5) ||
         (c == 0x3007) ||
         (c >= 0x3021 && c <= 0x3029));
}
632 
633 
/* Return non-0 if c is in the XML 1.0 CombiningChar class. */
static int
raptor_unicode_is_combiningchar(long c)
{
  /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-CombiningChar */
  return((c >= 0x0300 && c <= 0x0345) ||
         (c >= 0x0360 && c <= 0x0361) ||
         (c >= 0x0483 && c <= 0x0486) ||
         (c >= 0x0591 && c <= 0x05A1) ||
         (c >= 0x05A3 && c <= 0x05B9) ||
         (c >= 0x05BB && c <= 0x05BD) ||
         (c == 0x05BF) ||
         (c >= 0x05C1 && c <= 0x05C2) ||
         (c == 0x05C4) ||
         (c >= 0x064B && c <= 0x0652) ||
         (c == 0x0670) ||
         (c >= 0x06D6 && c <= 0x06DC) ||
         (c >= 0x06DD && c <= 0x06DF) ||
         (c >= 0x06E0 && c <= 0x06E4) ||
         (c >= 0x06E7 && c <= 0x06E8) ||
         (c >= 0x06EA && c <= 0x06ED) ||
         (c >= 0x0901 && c <= 0x0903) ||
         (c == 0x093C) ||
         (c >= 0x093E && c <= 0x094C) ||
         (c == 0x094D) ||
         (c >= 0x0951 && c <= 0x0954) ||
         (c >= 0x0962 && c <= 0x0963) ||
         (c >= 0x0981 && c <= 0x0983) ||
         (c == 0x09BC) ||
         (c == 0x09BE) ||
         (c == 0x09BF) ||
         (c >= 0x09C0 && c <= 0x09C4) ||
         (c >= 0x09C7 && c <= 0x09C8) ||
         (c >= 0x09CB && c <= 0x09CD) ||
         (c == 0x09D7) ||
         (c >= 0x09E2 && c <= 0x09E3) ||
         (c == 0x0A02) ||
         (c == 0x0A3C) ||
         (c == 0x0A3E) ||
         (c == 0x0A3F) ||
         (c >= 0x0A40 && c <= 0x0A42) ||
         (c >= 0x0A47 && c <= 0x0A48) ||
         (c >= 0x0A4B && c <= 0x0A4D) ||
         (c >= 0x0A70 && c <= 0x0A71) ||
         (c >= 0x0A81 && c <= 0x0A83) ||
         (c == 0x0ABC) ||
         (c >= 0x0ABE && c <= 0x0AC5) ||
         (c >= 0x0AC7 && c <= 0x0AC9) ||
         (c >= 0x0ACB && c <= 0x0ACD) ||
         (c >= 0x0B01 && c <= 0x0B03) ||
         (c == 0x0B3C) ||
         (c >= 0x0B3E && c <= 0x0B43) ||
         (c >= 0x0B47 && c <= 0x0B48) ||
         (c >= 0x0B4B && c <= 0x0B4D) ||
         (c >= 0x0B56 && c <= 0x0B57) ||
         (c >= 0x0B82 && c <= 0x0B83) ||
         (c >= 0x0BBE && c <= 0x0BC2) ||
         (c >= 0x0BC6 && c <= 0x0BC8) ||
         (c >= 0x0BCA && c <= 0x0BCD) ||
         (c == 0x0BD7) ||
         (c >= 0x0C01 && c <= 0x0C03) ||
         (c >= 0x0C3E && c <= 0x0C44) ||
         (c >= 0x0C46 && c <= 0x0C48) ||
         (c >= 0x0C4A && c <= 0x0C4D) ||
         (c >= 0x0C55 && c <= 0x0C56) ||
         (c >= 0x0C82 && c <= 0x0C83) ||
         (c >= 0x0CBE && c <= 0x0CC4) ||
         (c >= 0x0CC6 && c <= 0x0CC8) ||
         (c >= 0x0CCA && c <= 0x0CCD) ||
         (c >= 0x0CD5 && c <= 0x0CD6) ||
         (c >= 0x0D02 && c <= 0x0D03) ||
         (c >= 0x0D3E && c <= 0x0D43) ||
         (c >= 0x0D46 && c <= 0x0D48) ||
         (c >= 0x0D4A && c <= 0x0D4D) ||
         (c == 0x0D57) ||
         (c == 0x0E31) ||
         (c >= 0x0E34 && c <= 0x0E3A) ||
         (c >= 0x0E47 && c <= 0x0E4E) ||
         (c == 0x0EB1) ||
         (c >= 0x0EB4 && c <= 0x0EB9) ||
         (c >= 0x0EBB && c <= 0x0EBC) ||
         (c >= 0x0EC8 && c <= 0x0ECD) ||
         (c >= 0x0F18 && c <= 0x0F19) ||
         (c == 0x0F35) ||
         (c == 0x0F37) ||
         (c == 0x0F39) ||
         (c == 0x0F3E) ||
         (c == 0x0F3F) ||
         (c >= 0x0F71 && c <= 0x0F84) ||
         (c >= 0x0F86 && c <= 0x0F8B) ||
         (c >= 0x0F90 && c <= 0x0F95) ||
         (c == 0x0F97) ||
         (c >= 0x0F99 && c <= 0x0FAD) ||
         (c >= 0x0FB1 && c <= 0x0FB7) ||
         (c == 0x0FB9) ||
         (c >= 0x20D0 && c <= 0x20DC) ||
         (c == 0x20E1) ||
         (c >= 0x302A && c <= 0x302F) ||
         (c == 0x3099) ||
         (c == 0x309A));
}
734 
735 
/* Return non-0 if c is in the XML 1.0 Digit class. */
static int
raptor_unicode_is_digit(long c)
{
  /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Digit */
  return((c >= 0x0030 && c <= 0x0039) ||
         (c >= 0x0660 && c <= 0x0669) ||
         (c >= 0x06F0 && c <= 0x06F9) ||
         (c >= 0x0966 && c <= 0x096F) ||
         (c >= 0x09E6 && c <= 0x09EF) ||
         (c >= 0x0A66 && c <= 0x0A6F) ||
         (c >= 0x0AE6 && c <= 0x0AEF) ||
         (c >= 0x0B66 && c <= 0x0B6F) ||
         (c >= 0x0BE7 && c <= 0x0BEF) ||
         (c >= 0x0C66 && c <= 0x0C6F) ||
         (c >= 0x0CE6 && c <= 0x0CEF) ||
         (c >= 0x0D66 && c <= 0x0D6F) ||
         (c >= 0x0E50 && c <= 0x0E59) ||
         (c >= 0x0ED0 && c <= 0x0ED9) ||
         (c >= 0x0F20 && c <= 0x0F29));
}
756 
757 
/* Return non-0 if c is in the XML 1.0 Extender class. */
static int
raptor_unicode_is_extender(long c)
{
  /* http://www.w3.org/TR/2000/REC-xml-20001006#NT-Extender */
  return((c == 0x00B7) ||
         (c == 0x02D0) ||
         (c == 0x02D1) ||
         (c == 0x0387) ||
         (c == 0x0640) ||
         (c == 0x0E46) ||
         (c == 0x0EC6) ||
         (c == 0x3005) ||
         (c >= 0x3031 && c <= 0x3035) ||
         (c >= 0x309D && c <= 0x309E) ||
         (c >= 0x30FC && c <= 0x30FE));
}
774 
775 
/**
 * raptor_utf8_is_nfc:
 * @input: UTF-8 string
 * @length: length of string
 *
 * Check a string is in Unicode Normal Form C.
 *
 * A string containing only bytes 0x00 to 0x7f is reported as NFC
 * without further checking.  Otherwise the result comes from
 * raptor_nfc_check() when compiled with RAPTOR_NFC_CHECK defined; when
 * it is not defined the string is always reported as NFC.
 *
 * Return value: Non 0 if the string is NFC
 **/
int
raptor_utf8_is_nfc(const unsigned char *input, size_t length)
{
  /* size_t so the index cannot wrap before reaching a large length */
  size_t i;
  int plain=1;

  for(i=0; i<length; i++) {
    if(input[i]>0x7f) {
      plain=0;
      break;
    }
  }

  if(plain)
    return 1;

#ifdef RAPTOR_NFC_CHECK
  return raptor_nfc_check(input, length, NULL);
#else
  return 1;
#endif
}
806 
807 
/**
 * raptor_utf8_check:
 * @string: UTF-8 string
 * @length: length of string
 *
 * Check a string is UTF-8.
 *
 * Decodes the string one character at a time with
 * raptor_utf8_to_unicode_char() and fails on the first character that
 * function rejects.  An empty string (@length 0) is accepted.
 *
 * Return value: Non 0 if the string is UTF-8
 **/
int
raptor_utf8_check(const unsigned char *string, size_t length)
{
  while(length > 0) {
    raptor_unichar unichar=0;
    /* The decoder takes an int length and never reads more than 6
     * bytes for one character, so offer at most 6.  Passing the size_t
     * directly would truncate or go negative for very long strings.
     */
    int avail=(length > 6) ? 6 : (int)length;
    int unichar_len=raptor_utf8_to_unicode_char(&unichar, string, avail);

    if(unichar_len < 0 || unichar_len > avail)
      return 0;

    if(unichar > 0x10ffff)
      return 0;

    string += unichar_len;
    length -= (size_t)unichar_len;
  }
  return 1;
}
835