1 /*
2  * This file is part of libdom.
3  * Licensed under the MIT License,
4  *			http://www.opensource.org/licenses/mit-license.php
5  * Copyright 2009 Bo Yang <struggleyb.nku@gmail.com>
6  */
7 
8 #include <inttypes.h>
9 #include <stddef.h>
10 
11 #include "utils/validate.h"
12 
13 #include <dom/core/string.h>
14 
15 #include "utils/character_valid.h"
16 #include "utils/namespace.h"
17 #include "utils/utils.h"
18 
19 #include <parserutils/charset/utf8.h>
20 
/* A combination of various character-class tests */
22 static bool is_first_char(uint32_t ch);
23 static bool is_name_char(uint32_t ch);
24 
/**
 * Test whether a code point may be the first character of an XML Name.
 * Refer http://www.w3.org/TR/REC-xml/ for detail.
 *
 * Note that ':' is accepted here.
 *
 * \param ch  The code point to test
 * \return true if ::ch may start a name, false otherwise.
 */
static bool is_first_char(uint32_t ch)
{
	/* Inclusive [lo, hi] intervals of permitted start characters */
	static const struct {
		uint32_t lo;
		uint32_t hi;
	} start_ranges[] = {
		{ 'a', 'z' },
		{ 'A', 'Z' },
		{ '_', '_' },
		{ ':', ':' },
		{ 0xC0, 0xD6 },
		{ 0xD8, 0xF6 },
		{ 0xF8, 0x2FF },
		{ 0x370, 0x37D },
		{ 0x37F, 0x1FFF },
		{ 0x200C, 0x200D },
		{ 0x2070, 0x218F },
		{ 0x2C00, 0x2FEF },
		{ 0x3001, 0xD7FF },
		{ 0xF900, 0xFDCF },
		{ 0xFDF0, 0xFFFD },
		{ 0x10000, 0xEFFFF }
	};
	size_t i;

	for (i = 0; i < sizeof(start_ranges) / sizeof(start_ranges[0]); i++) {
		if (ch >= start_ranges[i].lo && ch <= start_ranges[i].hi)
			return true;
	}

	/* Not in any interval above: fall back to the letter test.
	 * '_' and ':' need no second look, the table already covers them. */
	return is_letter(ch) ? true : false;
}
53 
/**
 * Test whether a code point may appear inside an XML Name.
 * Refer http://www.w3.org/TR/REC-xml/ for detail.
 *
 * Note that ':' is accepted here; _dom_validate_ncname() rejects it
 * separately.
 *
 * \param ch  The code point to test
 * \return true if ::ch may be part of a name, false otherwise.
 */
static bool is_name_char(uint32_t ch)
{
	/* Inclusive [lo, hi] intervals of permitted name characters.
	 * Entries tagged "!start" are not valid as a first character. */
	static const struct {
		uint32_t lo;
		uint32_t hi;
	} name_ranges[] = {
		{ 'a', 'z' },
		{ 'A', 'Z' },
		{ '0', '9' },		/* !start */
		{ '_', '_' },
		{ ':', ':' },
		{ '-', '-' },		/* !start */
		{ '.', '.' },		/* !start */
		{ 0xB7, 0xB7 },		/* !start */
		{ 0xC0, 0xD6 },
		{ 0xD8, 0xF6 },
		{ 0xF8, 0x2FF },
		{ 0x300, 0x36F },	/* !start */
		{ 0x370, 0x37D },
		{ 0x37F, 0x1FFF },
		{ 0x200C, 0x200D },
		{ 0x203F, 0x2040 },	/* !start */
		{ 0x2070, 0x218F },
		{ 0x2C00, 0x2FEF },
		{ 0x3001, 0xD7FF },
		{ 0xF900, 0xFDCF },
		{ 0xFDF0, 0xFFFD },
		{ 0x10000, 0xEFFFF }
	};
	size_t i;

	for (i = 0; i < sizeof(name_ranges) / sizeof(name_ranges[0]); i++) {
		if (ch >= name_ranges[i].lo && ch <= name_ranges[i].hi)
			return true;
	}

	/* Not in any interval above: try the character-class tests,
	 * in order, stopping at the first one that matches. */
	return is_letter(ch) == true ||
		is_digit(ch) == true ||
		is_combining_char(ch) == true ||
		is_extender(ch) == true;
}
90 
91 /**
92  * Test whether the name is a valid one according XML 1.0 standard.
93  * For the standard please refer:
94  *
95  * http://www.w3.org/TR/2004/REC-xml-20040204/
96  *
97  * \param name  The name need to be tested
98  * \return true if ::name is valid, false otherwise.
99  */
_dom_validate_name(dom_string * name)100 bool _dom_validate_name(dom_string *name)
101 {
102 	uint32_t ch;
103 	size_t clen, slen;
104 	parserutils_error err;
105 	const uint8_t *s;
106 
107 	if (name == NULL)
108 		return false;
109 
110 	slen = dom_string_length(name);
111 	if (slen == 0)
112 		return false;
113 
114 	s = (const uint8_t *) dom_string_data(name);
115 	slen = dom_string_byte_length(name);
116 
117 	err = parserutils_charset_utf8_to_ucs4(s, slen, &ch, &clen);
118 	if (err != PARSERUTILS_OK) {
119 		return false;
120 	}
121 
122 	if (is_first_char(ch) == false)
123 		return false;
124 
125 	s += clen;
126 	slen -= clen;
127 
128 	while (slen > 0) {
129 		err = parserutils_charset_utf8_to_ucs4(s, slen, &ch, &clen);
130 		if (err != PARSERUTILS_OK) {
131 			return false;
132 		}
133 
134 		if (is_name_char(ch) == false)
135 			return false;
136 
137 		s += clen;
138 		slen -= clen;
139 	}
140 
141 	return true;
142 }
143 
144 /**
145  * Validate whether the string is a legal NCName.
146  * Refer http://www.w3.org/TR/REC-xml-names/ for detail.
147  *
148  * \param str  The name to validate
149  * \return true if ::name is valid, false otherwise.
150  */
_dom_validate_ncname(dom_string * name)151 bool _dom_validate_ncname(dom_string *name)
152 {
153 	uint32_t ch;
154 	size_t clen, slen;
155 	parserutils_error err;
156 	const uint8_t *s;
157 
158 	if (name == NULL)
159 		return false;
160 
161 	slen = dom_string_length(name);
162 	if (slen == 0)
163 		return false;
164 
165 	s = (const uint8_t *) dom_string_data(name);
166 	slen = dom_string_byte_length(name);
167 
168 	err = parserutils_charset_utf8_to_ucs4(s, slen, &ch, &clen);
169 	if (err != PARSERUTILS_OK) {
170 		return false;
171 	}
172 
173 	if (is_letter(ch) == false && ch != (uint32_t) '_')
174 		return false;
175 
176 	s += clen;
177 	slen -= clen;
178 
179 	while (slen > 0) {
180 		err = parserutils_charset_utf8_to_ucs4(s, slen, &ch, &clen);
181 		if (err != PARSERUTILS_OK) {
182 			return false;
183 		}
184 
185 		if (is_name_char(ch) == false)
186 			return false;
187 
188 		if (ch == (uint32_t) ':')
189 			return false;
190 
191 		s += clen;
192 		slen -= clen;
193 	}
194 
195 	return true;
196 }
197 
198