1 /*
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000-2017 Expat development team
11 Licensed under the MIT license:
12
13 Permission is hereby granted, free of charge, to any person obtaining
14 a copy of this software and associated documentation files (the
15 "Software"), to deal in the Software without restriction, including
16 without limitation the rights to use, copy, modify, merge, publish,
17 distribute, sublicense, and/or sell copies of the Software, and to permit
18 persons to whom the Software is furnished to do so, subject to the
19 following conditions:
20
21 The above copyright notice and this permission notice shall be included
22 in all copies or substantial portions of the Software.
23
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
29 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30 USE OR OTHER DEALINGS IN THE SOFTWARE.
31 */
32
33 #ifdef _WIN32
34 # include "winconfig.h"
35 #else
36 # ifdef HAVE_EXPAT_CONFIG_H
37 # include <expat_config.h>
38 # endif
39 #endif /* ndef _WIN32 */
40
41 #include <stddef.h>
42 #include <string.h> /* memcpy */
43
44 #if defined(_MSC_VER) && (_MSC_VER <= 1700)
45 /* for vs2012/11.0/1700 and earlier Visual Studio compilers */
46 # define bool int
47 # define false 0
48 # define true 1
49 #else
50 # include <stdbool.h>
51 #endif
52
53 #include "expat_external.h"
54 #include "internal.h"
55 #include "xmltok.h"
56 #include "nametab.h"
57
/* Optional extra entry for the first scanner list in VTABLE1: the
   ignoreSectionTok scanner is only listed when XML_DTD is defined. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Positional initializer for the leading function-pointer members of an
   ENCODING: two brace-enclosed scanner lists followed by eight helper
   functions, all named through the PREFIX() macro in effect at the point
   of use.  NOTE(review): the order presumably mirrors the member order of
   ENCODING in xmltok.h -- confirm before reordering anything here. */
#define VTABLE1 \
  {PREFIX(prologTok), PREFIX(contentTok), \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE}, \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)}, \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS), \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName), \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 plus the two PREFIX()-named converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
73
/* Looks up the naming bit of the UCS-2 character with high byte hi and
   low byte lo.  pages[hi] selects a group of eight words in namingBitmap,
   the top 3 bits of lo select the word within the group and the bottom
   5 bits of lo select the bit.  Evaluates to nonzero when the bit is set.
   Every argument is parenthesized so that expressions such as `tbl + 1`
   can be passed safely. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F)))

/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
*/
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)] \
   & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)] \
         << 3) \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))

/* Dispatches on the sequence length n: 2- and 3-byte sequences are looked
   up in the bitmap, any other length evaluates to 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
       ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
       : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0))
102
103 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
104 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
105 with the additional restriction of not allowing the Unicode
106 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
107 Implementation details:
108 (A & 0x80) == 0 means A < 0x80
109 and
110 (A & 0xC0) == 0xC0 means A > 0xBF
111 */
112
/* Each macro takes a pointer to unsigned bytes and evaluates to nonzero
   when the 2-, 3- or 4-byte sequence starting there is not acceptable
   UTF-8.  The argument is parenthesized everywhere (including under the
   dereference) so pointer expressions such as `buf + 1` are handled
   correctly. */

/* Invalid if the lead byte is below 0xC2 (overlong) or the second byte is
   not a continuation byte (0x80..0xBF). */
#define UTF8_INVALID2(p) \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* Third byte must be a continuation byte, except that EF,BF only allows
   up to 0xBD (rejects U+FFFE / U+FFFF).  Second byte: E0 requires
   0xA0..0xBF (no overlongs), ED requires 0x80..0x9F (no surrogates),
   otherwise any continuation byte. */
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
   || ((*(p)) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD \
                                        : ((p)[2] & 0xC0) == 0xC0) \
   || ((*(p)) == 0xE0 \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
           : ((p)[1] & 0x80) == 0 \
                 || ((*(p)) == 0xED ? (p)[1] > 0x9F \
                                    : ((p)[1] & 0xC0) == 0xC0)))

/* Third and fourth bytes must be continuation bytes.  Second byte: F0
   requires 0x90..0xBF (no overlongs), F4 requires 0x80..0x8F (nothing
   above U+10FFFF), otherwise any continuation byte. */
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0 \
   || ((p)[2] & 0xC0) == 0xC0 \
   || ((*(p)) == 0xF0 \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
           : ((p)[1] & 0x80) == 0 \
                 || ((*(p)) == 0xF4 ? (p)[1] > 0x8F \
                                    : ((p)[1] & 0xC0) == 0xC0)))
132
133 static int PTRFASTCALL
isNever(const ENCODING * enc,const char * p)134 isNever(const ENCODING *enc, const char *p) {
135 UNUSED_P(enc);
136 UNUSED_P(p);
137 return 0;
138 }
139
140 static int PTRFASTCALL
utf8_isName2(const ENCODING * enc,const char * p)141 utf8_isName2(const ENCODING *enc, const char *p) {
142 UNUSED_P(enc);
143 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
144 }
145
146 static int PTRFASTCALL
utf8_isName3(const ENCODING * enc,const char * p)147 utf8_isName3(const ENCODING *enc, const char *p) {
148 UNUSED_P(enc);
149 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
150 }
151
152 #define utf8_isName4 isNever
153
154 static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING * enc,const char * p)155 utf8_isNmstrt2(const ENCODING *enc, const char *p) {
156 UNUSED_P(enc);
157 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
158 }
159
160 static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING * enc,const char * p)161 utf8_isNmstrt3(const ENCODING *enc, const char *p) {
162 UNUSED_P(enc);
163 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
164 }
165
166 #define utf8_isNmstrt4 isNever
167
168 static int PTRFASTCALL
utf8_isInvalid2(const ENCODING * enc,const char * p)169 utf8_isInvalid2(const ENCODING *enc, const char *p) {
170 UNUSED_P(enc);
171 return UTF8_INVALID2((const unsigned char *)p);
172 }
173
174 static int PTRFASTCALL
utf8_isInvalid3(const ENCODING * enc,const char * p)175 utf8_isInvalid3(const ENCODING *enc, const char *p) {
176 UNUSED_P(enc);
177 return UTF8_INVALID3((const unsigned char *)p);
178 }
179
180 static int PTRFASTCALL
utf8_isInvalid4(const ENCODING * enc,const char * p)181 utf8_isInvalid4(const ENCODING *enc, const char *p) {
182 UNUSED_P(enc);
183 return UTF8_INVALID4((const unsigned char *)p);
184 }
185
/* Encoding descriptor used by every encoding defined in this file: the
   generic ENCODING followed by a per-byte classification table and a set
   of per-encoding callbacks.  Code in this file casts ENCODING pointers to
   this struct (see AS_NORMAL_ENCODING), which relies on enc being the
   first member.  Instances are filled with positional initializers, so
   the member order must not change. */
struct normal_encoding {
  ENCODING enc;
  unsigned char type[256]; /* byte-type (BT_*) code, indexed by byte value */
#ifdef XML_MIN_SIZE
  /* Extra callbacks present only in XML_MIN_SIZE builds; the BYTE_TYPE,
     IS_NAME_CHAR_MINBPC, BYTE_TO_ASCII and CHAR_MATCHES macros dispatch
     through them instead of expanding inline. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name-character tests for 2-, 3- and 4-byte sequences. */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  /* Name-start-character tests for 2-, 3- and 4-byte sequences. */
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  /* Malformed-sequence tests for 2-, 3- and 4-byte sequences. */
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
206
/* Reinterprets an ENCODING pointer as a pointer to the (const)
   normal_encoding that contains it. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

/* Initializers for the XML_MIN_SIZE-only callbacks of normal_encoding,
   named with prefix E; note the trailing comma.  Expands to nothing in
   other builds, where those members do not exist. */
#ifdef XML_MIN_SIZE

#  define STANDARD_VTABLE(E) \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine multi-byte predicate callbacks, named with
   prefix E. */
#define NORMAL_VTABLE(E) \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* Same nine slots, all NULL, for encodings that supply no multi-byte
   predicates. */
#define NULL_VTABLE \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL, \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL, \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL

/* Forward declaration; the definition is not in this part of the file. */
static int FASTCALL checkCharRefNumber(int);
230
231 #include "xmltok_impl.h"
232 #include "ascii.h"
233
234 #ifdef XML_MIN_SIZE
235 # define sb_isNameMin isNever
236 # define sb_isNmstrtMin isNever
237 #endif
238
239 #ifdef XML_MIN_SIZE
240 # define MINBPC(enc) ((enc)->minBytesPerChar)
241 #else
242 /* minimum bytes per character */
243 # define MINBPC(enc) 1
244 #endif
245
246 #define SB_BYTE_TYPE(enc, p) \
247 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
248
249 #ifdef XML_MIN_SIZE
250 static int PTRFASTCALL
sb_byteType(const ENCODING * enc,const char * p)251 sb_byteType(const ENCODING *enc, const char *p) {
252 return SB_BYTE_TYPE(enc, p);
253 }
254 # define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
255 #else
256 # define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
257 #endif
258
259 #ifdef XML_MIN_SIZE
260 # define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
261 static int PTRFASTCALL
sb_byteToAscii(const ENCODING * enc,const char * p)262 sb_byteToAscii(const ENCODING *enc, const char *p) {
263 UNUSED_P(enc);
264 return *p;
265 }
266 #else
267 # define BYTE_TO_ASCII(enc, p) (*(p))
268 #endif
269
270 #define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
271 #define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
272 #define IS_INVALID_CHAR(enc, p, n) \
273 (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
274
275 #ifdef XML_MIN_SIZE
276 # define IS_NAME_CHAR_MINBPC(enc, p) \
277 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
278 # define IS_NMSTRT_CHAR_MINBPC(enc, p) \
279 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
280 #else
281 # define IS_NAME_CHAR_MINBPC(enc, p) (0)
282 # define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
283 #endif
284
285 #ifdef XML_MIN_SIZE
286 # define CHAR_MATCHES(enc, p, c) \
287 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
288 static int PTRCALL
sb_charMatches(const ENCODING * enc,const char * p,int c)289 sb_charMatches(const ENCODING *enc, const char *p, int c) {
290 UNUSED_P(enc);
291 return *p == c;
292 }
293 #else
294 /* c is an ASCII character */
295 # define CHAR_MATCHES(enc, p, c) (*(p) == c)
296 #endif
297
298 #define PREFIX(ident) normal_##ident
299 #define XML_TOK_IMPL_C
300 #include "xmltok_impl.c"
301 #undef XML_TOK_IMPL_C
302
303 #undef MINBPC
304 #undef BYTE_TYPE
305 #undef BYTE_TO_ASCII
306 #undef CHAR_MATCHES
307 #undef IS_NAME_CHAR
308 #undef IS_NAME_CHAR_MINBPC
309 #undef IS_NMSTRT_CHAR
310 #undef IS_NMSTRT_CHAR_MINBPC
311 #undef IS_INVALID_CHAR
312
/* UTF8_cvalN is the marker bit pattern of the first byte of an N-byte
   UTF-8 sequence, i.e. the value of that byte once its payload bits are
   masked off. */
enum {
  UTF8_cval1 = 0x00, /* 0xxxxxxx */
  UTF8_cval2 = 0xC0, /* 110xxxxx */
  UTF8_cval3 = 0xE0, /* 1110xxxx */
  UTF8_cval4 = 0xF0  /* 11110xxx */
};
319
/* Moves *fromLimRef backwards, if necessary, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned from its end towards `from`:
   - an ASCII byte (0xxxxxxx) ends the scan at the current position;
   - a lead byte of an N-byte character ends the scan just past that
     character when at least N bytes (lead byte included) have been seen;
     otherwise the truncated character is dropped and scanning continues
     in front of it;
   - any other byte (continuation or unrecognised) is stepped over.
   If `from` is reached without stopping, the limit becomes `from`. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *cursor = *fromLimRef;
  size_t seen = 0; /* bytes stepped over since the last reset */

  while (cursor > from) {
    const unsigned char byte = (unsigned char)cursor[-1];
    size_t seqLen; /* length announced by a lead byte, 0 if not a lead */

    if ((byte & 0x80u) == 0x00u) /* 1-byte character, 0b0xxxxxxx */
      break;

    if ((byte & 0xe0u) == 0xc0u) /* 0b110xxxxx */
      seqLen = 2;
    else if ((byte & 0xf0u) == 0xe0u) /* 0b1110xxxx */
      seqLen = 3;
    else if ((byte & 0xf8u) == 0xf0u) /* 0b11110xxx */
      seqLen = 4;
    else
      seqLen = 0;

    if (seqLen != 0) {
      if (seen + 1 >= seqLen) {
        /* Complete character: the limit goes right behind it. */
        cursor += seqLen - 1;
        break;
      }
      /* Truncated character: forget it and keep scanning. */
      seen = 0;
    }

    cursor--;
    seen++;
  }

  *fromLimRef = cursor;
}
358
359 static enum XML_Convert_Result PTRCALL
utf8_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)360 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
361 char **toP, const char *toLim) {
362 bool input_incomplete = false;
363 bool output_exhausted = false;
364
365 /* Avoid copying partial characters (due to limited space). */
366 const ptrdiff_t bytesAvailable = fromLim - *fromP;
367 const ptrdiff_t bytesStorable = toLim - *toP;
368 UNUSED_P(enc);
369 if (bytesAvailable > bytesStorable) {
370 fromLim = *fromP + bytesStorable;
371 output_exhausted = true;
372 }
373
374 /* Avoid copying partial characters (from incomplete input). */
375 {
376 const char *const fromLimBefore = fromLim;
377 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
378 if (fromLim < fromLimBefore) {
379 input_incomplete = true;
380 }
381 }
382
383 {
384 const ptrdiff_t bytesToCopy = fromLim - *fromP;
385 memcpy(*toP, *fromP, bytesToCopy);
386 *fromP += bytesToCopy;
387 *toP += bytesToCopy;
388 }
389
390 if (output_exhausted) /* needs to go first */
391 return XML_CONVERT_OUTPUT_EXHAUSTED;
392 else if (input_incomplete)
393 return XML_CONVERT_INPUT_INCOMPLETE;
394 else
395 return XML_CONVERT_COMPLETED;
396 }
397
398 static enum XML_Convert_Result PTRCALL
utf8_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)399 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
400 unsigned short **toP, const unsigned short *toLim) {
401 enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
402 unsigned short *to = *toP;
403 const char *from = *fromP;
404 while (from < fromLim && to < toLim) {
405 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
406 case BT_LEAD2:
407 if (fromLim - from < 2) {
408 res = XML_CONVERT_INPUT_INCOMPLETE;
409 goto after;
410 }
411 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
412 from += 2;
413 break;
414 case BT_LEAD3:
415 if (fromLim - from < 3) {
416 res = XML_CONVERT_INPUT_INCOMPLETE;
417 goto after;
418 }
419 *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
420 | (from[2] & 0x3f));
421 from += 3;
422 break;
423 case BT_LEAD4: {
424 unsigned long n;
425 if (toLim - to < 2) {
426 res = XML_CONVERT_OUTPUT_EXHAUSTED;
427 goto after;
428 }
429 if (fromLim - from < 4) {
430 res = XML_CONVERT_INPUT_INCOMPLETE;
431 goto after;
432 }
433 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
434 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
435 n -= 0x10000;
436 to[0] = (unsigned short)((n >> 10) | 0xD800);
437 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
438 to += 2;
439 from += 4;
440 } break;
441 default:
442 *to++ = *from++;
443 break;
444 }
445 }
446 if (from < fromLim)
447 res = XML_CONVERT_OUTPUT_EXHAUSTED;
448 after:
449 *fromP = from;
450 *toP = to;
451 return res;
452 }
453
/* UTF-8 encoding descriptors.  All four share the normal_ scanners
   (VTABLE1), the utf8_ converters and the utf8_ multi-byte predicates;
   they differ only in the table used for the first 128 type[] entries.
   NOTE(review): the three integers after the converters are presumably
   ENCODING's minBytesPerChar / isUtf8 / isUtf16 -- confirm in xmltok.h.

   The *_ns variants (XML_NS builds only) include the ASCII table as is.
   The other variants define BT_COLON as BT_NMSTRT while including it, so
   whatever the table marks as BT_COLON is classified as BT_NMSTRT. */
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
#endif

static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#ifdef XML_NS

/* Same as utf8_encoding_ns but built on iasciitab.h instead of
   asciitab.h. */
static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#endif

/* Same as utf8_encoding but built on iasciitab.h instead of asciitab.h. */
static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
495
496 static enum XML_Convert_Result PTRCALL
latin1_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)497 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
498 char **toP, const char *toLim) {
499 UNUSED_P(enc);
500 for (;;) {
501 unsigned char c;
502 if (*fromP == fromLim)
503 return XML_CONVERT_COMPLETED;
504 c = (unsigned char)**fromP;
505 if (c & 0x80) {
506 if (toLim - *toP < 2)
507 return XML_CONVERT_OUTPUT_EXHAUSTED;
508 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
509 *(*toP)++ = (char)((c & 0x3f) | 0x80);
510 (*fromP)++;
511 } else {
512 if (*toP == toLim)
513 return XML_CONVERT_OUTPUT_EXHAUSTED;
514 *(*toP)++ = *(*fromP)++;
515 }
516 }
517 }
518
519 static enum XML_Convert_Result PTRCALL
latin1_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)520 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
521 unsigned short **toP, const unsigned short *toLim) {
522 UNUSED_P(enc);
523 while (*fromP < fromLim && *toP < toLim)
524 *(*toP)++ = (unsigned char)*(*fromP)++;
525
526 if ((*toP == toLim) && (*fromP < fromLim))
527 return XML_CONVERT_OUTPUT_EXHAUSTED;
528 else
529 return XML_CONVERT_COMPLETED;
530 }
531
/* Latin-1 encoding descriptors: normal_ scanners, latin1_ converters, no
   multi-byte predicates (NULL_VTABLE).  type[] is asciitab.h followed by
   latin1tab.h.  The _ns variant exists only in XML_NS builds; the other
   variant defines BT_COLON as BT_NMSTRT while including the ASCII
   table. */
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
553
554 static enum XML_Convert_Result PTRCALL
ascii_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)555 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
556 char **toP, const char *toLim) {
557 UNUSED_P(enc);
558 while (*fromP < fromLim && *toP < toLim)
559 *(*toP)++ = *(*fromP)++;
560
561 if ((*toP == toLim) && (*fromP < fromLim))
562 return XML_CONVERT_OUTPUT_EXHAUSTED;
563 else
564 return XML_CONVERT_COMPLETED;
565 }
566
/* US-ASCII encoding descriptors: normal_ scanners, ascii_toUtf8 for UTF-8
   output and latin1_toUtf16 reused for UTF-16 output, no multi-byte
   predicates.  Only asciitab.h is included in type[]; the remaining
   entries are left to zero-initialization, which the existing comment
   equates with BT_NONXML. */
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
588
589 static int PTRFASTCALL
unicode_byte_type(char hi,char lo)590 unicode_byte_type(char hi, char lo) {
591 switch ((unsigned char)hi) {
592 /* 0xD800–0xDBFF first 16-bit code unit or high surrogate (W1) */
593 case 0xD8:
594 case 0xD9:
595 case 0xDA:
596 case 0xDB:
597 return BT_LEAD4;
598 /* 0xDC00–0xDFFF second 16-bit code unit or low surrogate (W2) */
599 case 0xDC:
600 case 0xDD:
601 case 0xDE:
602 case 0xDF:
603 return BT_TRAIL;
604 case 0xFF:
605 switch ((unsigned char)lo) {
606 case 0xFF: /* noncharacter-FFFF */
607 case 0xFE: /* noncharacter-FFFE */
608 return BT_NONXML;
609 }
610 break;
611 }
612 return BT_NONASCII;
613 }
614
/* Defines the static function E##toUtf8, which converts UTF-16 input from
   [*fromP, fromLim) to UTF-8 in [*toP, toLim).  The byte order is that of
   the GET_LO / GET_HI macros in effect where this macro is expanded.

   The input length is first rounded down to an even number of bytes.
   Each 16-bit unit is then dispatched on its high byte:
     0x00 with low byte < 0x80   -> 1 output byte
     0x00..0x07 otherwise        -> 2 output bytes
     0xD8..0xDB (high surrogate) -> combined with the following unit into
                                    4 output bytes
     anything else               -> 3 output bytes (this includes units
                                    with high byte 0xDC..0xDF, which have
                                    no case of their own)
   Output space is checked before anything is written for a character.  On
   XML_CONVERT_OUTPUT_EXHAUSTED, and on XML_CONVERT_INPUT_INCOMPLETE (a
   high surrogate without a following unit), *fromP is left at the unit
   that was not converted.  enc is unused. */
#define DEFINE_UTF16_TO_UTF8(E) \
  static enum XML_Convert_Result PTRCALL E##toUtf8( \
      const ENCODING *enc, const char **fromP, const char *fromLim, \
      char **toP, const char *toLim) { \
    const char *from = *fromP; \
    UNUSED_P(enc); \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */ \
    for (; from < fromLim; from += 2) { \
      int plane; \
      unsigned char lo2; \
      unsigned char lo = GET_LO(from); \
      unsigned char hi = GET_HI(from); \
      switch (hi) { \
      case 0: \
        if (lo < 0x80) { \
          if (*toP == toLim) { \
            *fromP = from; \
            return XML_CONVERT_OUTPUT_EXHAUSTED; \
          } \
          *(*toP)++ = lo; \
          break; \
        } \
        /* fall through */ \
      case 0x1: \
      case 0x2: \
      case 0x3: \
      case 0x4: \
      case 0x5: \
      case 0x6: \
      case 0x7: \
        if (toLim - *toP < 2) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
        *(*toP)++ = ((lo & 0x3f) | 0x80); \
        break; \
      default: \
        if (toLim - *toP < 3) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
        *(*toP)++ = ((lo & 0x3f) | 0x80); \
        break; \
      case 0xD8: \
      case 0xD9: \
      case 0xDA: \
      case 0xDB: \
        if (toLim - *toP < 4) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        if (fromLim - from < 4) { \
          *fromP = from; \
          return XML_CONVERT_INPUT_INCOMPLETE; \
        } \
        /* plane = top 4 payload bits of the high surrogate, plus 1 */ \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4); \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
        from += 2; /* step onto the second unit of the pair */ \
        lo2 = GET_LO(from); \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2) \
                     | (lo2 >> 6) | 0x80); \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
        break; \
      } \
    } \
    *fromP = from; \
    if (from < fromLim) \
      return XML_CONVERT_INPUT_INCOMPLETE; \
    else \
      return XML_CONVERT_COMPLETED; \
  }
691
/* Defines the static function E##toUtf16, which copies UTF-16 input from
   [*fromP, fromLim) into native unsigned short units in [*toP, toLim),
   assembling each unit from the bytes picked by the GET_HI / GET_LO macros
   in effect where this macro is expanded.

   The input length is first rounded down to an even number of bytes.  If
   the input holds more units than the output has room for and the high
   byte of the last input unit is in 0xD8..0xDF (the 0xF8 mask matches any
   surrogate), that unit is dropped from the range and the result becomes
   XML_CONVERT_INPUT_INCOMPLETE.  Returns XML_CONVERT_OUTPUT_EXHAUSTED
   when *toP reaches toLim with input left in the (possibly shortened)
   range.  *fromP and *toP are advanced in place; enc is unused. */
#define DEFINE_UTF16_TO_UTF16(E) \
  static enum XML_Convert_Result PTRCALL E##toUtf16( \
      const ENCODING *enc, const char **fromP, const char *fromLim, \
      unsigned short **toP, const unsigned short *toLim) { \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \
    UNUSED_P(enc); \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */ \
    /* Avoid copying first half only of surrogate */ \
    if (fromLim - *fromP > ((toLim - *toP) << 1) \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \
      fromLim -= 2; \
      res = XML_CONVERT_INPUT_INCOMPLETE; \
    } \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
    if ((*toP == toLim) && (*fromP < fromLim)) \
      return XML_CONVERT_OUTPUT_EXHAUSTED; \
    else \
      return res; \
  }
712
/* Little-endian byte accessors (low byte stored first), used to
   instantiate little2_toUtf8 and little2_toUtf16. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian byte accessors (high byte stored first), used to instantiate
   big2_toUtf8 and big2_toUtf16. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
734
/* Character helpers for little-endian UTF-16: p points at a 2-byte unit
   whose low byte is p[0] and high byte is p[1].  All macro arguments are
   parenthesized so that expression arguments (e.g. `ptr + 2`, or a
   conditional expression for c) expand correctly. */

/* Byte type of the unit: the encoding's type[] entry for the low byte
   when the high byte is 0, otherwise unicode_byte_type(). */
#define LITTLE2_BYTE_TYPE(enc, p) \
  ((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
               : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is 0, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* Nonzero when the unit has high byte 0 and low byte equal to c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Naming-bitmap lookups for name / name-start characters. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
744
745 #ifdef XML_MIN_SIZE
746
747 static int PTRFASTCALL
748 little2_byteType(const ENCODING *enc, const char *p) {
749 return LITTLE2_BYTE_TYPE(enc, p);
750 }
751
752 static int PTRFASTCALL
little2_byteToAscii(const ENCODING * enc,const char * p)753 little2_byteToAscii(const ENCODING *enc, const char *p) {
754 UNUSED_P(enc);
755 return LITTLE2_BYTE_TO_ASCII(p);
756 }
757
758 static int PTRCALL
little2_charMatches(const ENCODING * enc,const char * p,int c)759 little2_charMatches(const ENCODING *enc, const char *p, int c) {
760 UNUSED_P(enc);
761 return LITTLE2_CHAR_MATCHES(p, c);
762 }
763
764 static int PTRFASTCALL
little2_isNameMin(const ENCODING * enc,const char * p)765 little2_isNameMin(const ENCODING *enc, const char *p) {
766 UNUSED_P(enc);
767 return LITTLE2_IS_NAME_CHAR_MINBPC(p);
768 }
769
770 static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING * enc,const char * p)771 little2_isNmstrtMin(const ENCODING *enc, const char *p) {
772 UNUSED_P(enc);
773 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
774 }
775
776 # undef VTABLE
777 # define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
778
779 #else /* not XML_MIN_SIZE */
780
781 # undef PREFIX
782 # define PREFIX(ident) little2_##ident
783 # define MINBPC(enc) 2
784 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
785 # define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
786 # define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
787 # define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
788 # define IS_NAME_CHAR(enc, p, n) 0
789 # define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
790 # define IS_NMSTRT_CHAR(enc, p, n) (0)
791 # define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
792
793 # define XML_TOK_IMPL_C
794 # include "xmltok_impl.c"
795 # undef XML_TOK_IMPL_C
796
797 # undef MINBPC
798 # undef BYTE_TYPE
799 # undef BYTE_TO_ASCII
800 # undef CHAR_MATCHES
801 # undef IS_NAME_CHAR
802 # undef IS_NAME_CHAR_MINBPC
803 # undef IS_NMSTRT_CHAR
804 # undef IS_NMSTRT_CHAR_MINBPC
805 # undef IS_INVALID_CHAR
806
807 #endif /* not XML_MIN_SIZE */
808
/* Little-endian UTF-16 encoding descriptors.  They use the little2_
   function table (VTABLE), asciitab.h/iasciitab.h + latin1tab.h for the
   type[] entries consulted when the high byte is zero, and no multi-byte
   predicates.  The last integer of the ENCODING initializer is 1 only when
   BYTEORDER is 1234.  NOTE(review): the integers after VTABLE are
   presumably minBytesPerChar / isUtf8 / isUtf16 -- confirm in xmltok.h.
   _ns variants exist only in XML_NS builds; the other variants define
   BT_COLON as BT_NMSTRT while including the ASCII table. */
#ifdef XML_NS

static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif

static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

/* The internal_ variants are compiled out when BYTEORDER is 4321; they
   are built on iasciitab.h and always pass 1 as the last integer. */
#if BYTEORDER != 4321

#  ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#  endif

static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
868
/* Character helpers for big-endian UTF-16: p points at a 2-byte unit
   whose high byte is p[0] and low byte is p[1].  All macro arguments are
   parenthesized so that expression arguments (e.g. `ptr + 2`, or a
   conditional expression for c) expand correctly. */

/* Byte type of the unit: the encoding's type[] entry for the low byte
   when the high byte is 0, otherwise unicode_byte_type(). */
#define BIG2_BYTE_TYPE(enc, p) \
  ((p)[0] == 0 \
       ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
       : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is 0, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
/* Nonzero when the unit has high byte 0 and low byte equal to c. */
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Naming-bitmap lookups for name / name-start characters. */
#define BIG2_IS_NAME_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
879
880 #ifdef XML_MIN_SIZE
881
882 static int PTRFASTCALL
big2_byteType(const ENCODING * enc,const char * p)883 big2_byteType(const ENCODING *enc, const char *p) {
884 return BIG2_BYTE_TYPE(enc, p);
885 }
886
887 static int PTRFASTCALL
big2_byteToAscii(const ENCODING * enc,const char * p)888 big2_byteToAscii(const ENCODING *enc, const char *p) {
889 UNUSED_P(enc);
890 return BIG2_BYTE_TO_ASCII(p);
891 }
892
893 static int PTRCALL
big2_charMatches(const ENCODING * enc,const char * p,int c)894 big2_charMatches(const ENCODING *enc, const char *p, int c) {
895 UNUSED_P(enc);
896 return BIG2_CHAR_MATCHES(p, c);
897 }
898
899 static int PTRFASTCALL
big2_isNameMin(const ENCODING * enc,const char * p)900 big2_isNameMin(const ENCODING *enc, const char *p) {
901 UNUSED_P(enc);
902 return BIG2_IS_NAME_CHAR_MINBPC(p);
903 }
904
905 static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING * enc,const char * p)906 big2_isNmstrtMin(const ENCODING *enc, const char *p) {
907 UNUSED_P(enc);
908 return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
909 }
910
911 # undef VTABLE
912 # define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
913
914 #else /* not XML_MIN_SIZE */
915
916 # undef PREFIX
917 # define PREFIX(ident) big2_##ident
918 # define MINBPC(enc) 2
919 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
920 # define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
921 # define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
922 # define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
923 # define IS_NAME_CHAR(enc, p, n) 0
924 # define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
925 # define IS_NMSTRT_CHAR(enc, p, n) (0)
926 # define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
927
928 # define XML_TOK_IMPL_C
929 # include "xmltok_impl.c"
930 # undef XML_TOK_IMPL_C
931
932 # undef MINBPC
933 # undef BYTE_TYPE
934 # undef BYTE_TO_ASCII
935 # undef CHAR_MATCHES
936 # undef IS_NAME_CHAR
937 # undef IS_NAME_CHAR_MINBPC
938 # undef IS_NMSTRT_CHAR
939 # undef IS_NMSTRT_CHAR_MINBPC
940 # undef IS_INVALID_CHAR
941
942 #endif /* not XML_MIN_SIZE */
943
/* Big-endian UTF-16 encoding descriptors.  They use the big2_ function
   table (VTABLE), asciitab.h/iasciitab.h + latin1tab.h for the type[]
   entries consulted when the high byte is zero, and no multi-byte
   predicates.  The last integer of the ENCODING initializer is 1 only when
   BYTEORDER is 4321.  NOTE(review): the integers after VTABLE are
   presumably minBytesPerChar / isUtf8 / isUtf16 -- confirm in xmltok.h.
   _ns variants exist only in XML_NS builds; the other variants define
   BT_COLON as BT_NMSTRT while including the ASCII table. */
#ifdef XML_NS

static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif

static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

/* The internal_ variants are compiled out when BYTEORDER is 1234; they
   are built on iasciitab.h and always pass 1 as the last integer. */
#if BYTEORDER != 1234

#  ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#  endif

static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif
1003
1004 #undef PREFIX
1005
1006 static int FASTCALL
streqci(const char * s1,const char * s2)1007 streqci(const char *s1, const char *s2) {
1008 for (;;) {
1009 char c1 = *s1++;
1010 char c2 = *s2++;
1011 if (ASCII_a <= c1 && c1 <= ASCII_z)
1012 c1 += ASCII_A - ASCII_a;
1013 if (ASCII_a <= c2 && c2 <= ASCII_z)
1014 /* The following line will never get executed. streqci() is
1015 * only called from two places, both of which guarantee to put
1016 * upper-case strings into s2.
1017 */
1018 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1019 if (c1 != c2)
1020 return 0;
1021 if (! c1)
1022 break;
1023 }
1024 return 1;
1025 }
1026
1027 static void PTRCALL
initUpdatePosition(const ENCODING * enc,const char * ptr,const char * end,POSITION * pos)1028 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1029 POSITION *pos) {
1030 UNUSED_P(enc);
1031 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1032 }
1033
1034 static int
toAscii(const ENCODING * enc,const char * ptr,const char * end)1035 toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1036 char buf[1];
1037 char *p = buf;
1038 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1039 if (p == buf)
1040 return -1;
1041 else
1042 return buf[0];
1043 }
1044
1045 static int FASTCALL
isSpace(int c)1046 isSpace(int c) {
1047 switch (c) {
1048 case 0x20:
1049 case 0xD:
1050 case 0xA:
1051 case 0x9:
1052 return 1;
1053 }
1054 return 0;
1055 }
1056
1057 /* Return 1 if there's just optional white space or there's an S
1058 followed by name=val.
1059 */
1060 static int
parsePseudoAttribute(const ENCODING * enc,const char * ptr,const char * end,const char ** namePtr,const char ** nameEndPtr,const char ** valPtr,const char ** nextTokPtr)1061 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1062 const char **namePtr, const char **nameEndPtr,
1063 const char **valPtr, const char **nextTokPtr) {
1064 int c;
1065 char open;
1066 if (ptr == end) {
1067 *namePtr = NULL;
1068 return 1;
1069 }
1070 if (! isSpace(toAscii(enc, ptr, end))) {
1071 *nextTokPtr = ptr;
1072 return 0;
1073 }
1074 do {
1075 ptr += enc->minBytesPerChar;
1076 } while (isSpace(toAscii(enc, ptr, end)));
1077 if (ptr == end) {
1078 *namePtr = NULL;
1079 return 1;
1080 }
1081 *namePtr = ptr;
1082 for (;;) {
1083 c = toAscii(enc, ptr, end);
1084 if (c == -1) {
1085 *nextTokPtr = ptr;
1086 return 0;
1087 }
1088 if (c == ASCII_EQUALS) {
1089 *nameEndPtr = ptr;
1090 break;
1091 }
1092 if (isSpace(c)) {
1093 *nameEndPtr = ptr;
1094 do {
1095 ptr += enc->minBytesPerChar;
1096 } while (isSpace(c = toAscii(enc, ptr, end)));
1097 if (c != ASCII_EQUALS) {
1098 *nextTokPtr = ptr;
1099 return 0;
1100 }
1101 break;
1102 }
1103 ptr += enc->minBytesPerChar;
1104 }
1105 if (ptr == *namePtr) {
1106 *nextTokPtr = ptr;
1107 return 0;
1108 }
1109 ptr += enc->minBytesPerChar;
1110 c = toAscii(enc, ptr, end);
1111 while (isSpace(c)) {
1112 ptr += enc->minBytesPerChar;
1113 c = toAscii(enc, ptr, end);
1114 }
1115 if (c != ASCII_QUOT && c != ASCII_APOS) {
1116 *nextTokPtr = ptr;
1117 return 0;
1118 }
1119 open = (char)c;
1120 ptr += enc->minBytesPerChar;
1121 *valPtr = ptr;
1122 for (;; ptr += enc->minBytesPerChar) {
1123 c = toAscii(enc, ptr, end);
1124 if (c == open)
1125 break;
1126 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1127 && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1128 && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1129 *nextTokPtr = ptr;
1130 return 0;
1131 }
1132 }
1133 *nextTokPtr = ptr + enc->minBytesPerChar;
1134 return 1;
1135 }
1136
/* Keyword spellings built from ASCII_* constants instead of string
   literals.  doParseXmlDecl() compares them with XmlNameMatchesAscii():
   KW_version, KW_encoding and KW_standalone against pseudo-attribute
   names, KW_yes and KW_no against the value of "standalone". */
1137 static const char KW_version[]
1138 = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1139
1140 static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1141 ASCII_i, ASCII_n, ASCII_g, '\0'};
1142
1143 static const char KW_standalone[]
1144 = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1145 ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1146
1147 static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1148
1149 static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1150
1151 static int
doParseXmlDecl(const ENCODING * (* encodingFinder)(const ENCODING *,const char *,const char *),int isGeneralTextEntity,const ENCODING * enc,const char * ptr,const char * end,const char ** badPtr,const char ** versionPtr,const char ** versionEndPtr,const char ** encodingName,const ENCODING ** encoding,int * standalone)1152 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1153 const char *),
1154 int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1155 const char *end, const char **badPtr, const char **versionPtr,
1156 const char **versionEndPtr, const char **encodingName,
1157 const ENCODING **encoding, int *standalone) {
1158 const char *val = NULL;
1159 const char *name = NULL;
1160 const char *nameEnd = NULL;
1161 ptr += 5 * enc->minBytesPerChar;
1162 end -= 2 * enc->minBytesPerChar;
1163 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1164 || ! name) {
1165 *badPtr = ptr;
1166 return 0;
1167 }
1168 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1169 if (! isGeneralTextEntity) {
1170 *badPtr = name;
1171 return 0;
1172 }
1173 } else {
1174 if (versionPtr)
1175 *versionPtr = val;
1176 if (versionEndPtr)
1177 *versionEndPtr = ptr;
1178 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1179 *badPtr = ptr;
1180 return 0;
1181 }
1182 if (! name) {
1183 if (isGeneralTextEntity) {
1184 /* a TextDecl must have an EncodingDecl */
1185 *badPtr = ptr;
1186 return 0;
1187 }
1188 return 1;
1189 }
1190 }
1191 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1192 int c = toAscii(enc, val, end);
1193 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1194 *badPtr = val;
1195 return 0;
1196 }
1197 if (encodingName)
1198 *encodingName = val;
1199 if (encoding)
1200 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1201 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1202 *badPtr = ptr;
1203 return 0;
1204 }
1205 if (! name)
1206 return 1;
1207 }
1208 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1209 || isGeneralTextEntity) {
1210 *badPtr = name;
1211 return 0;
1212 }
1213 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1214 if (standalone)
1215 *standalone = 1;
1216 } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1217 if (standalone)
1218 *standalone = 0;
1219 } else {
1220 *badPtr = val;
1221 return 0;
1222 }
1223 while (isSpace(toAscii(enc, ptr, end)))
1224 ptr += enc->minBytesPerChar;
1225 if (ptr != end) {
1226 *badPtr = ptr;
1227 return 0;
1228 }
1229 return 1;
1230 }
1231
1232 static int FASTCALL
checkCharRefNumber(int result)1233 checkCharRefNumber(int result) {
1234 switch (result >> 8) {
1235 case 0xD8:
1236 case 0xD9:
1237 case 0xDA:
1238 case 0xDB:
1239 case 0xDC:
1240 case 0xDD:
1241 case 0xDE:
1242 case 0xDF:
1243 return -1;
1244 case 0:
1245 if (latin1_encoding.type[result] == BT_NONXML)
1246 return -1;
1247 break;
1248 case 0xFF:
1249 if (result == 0xFFFE || result == 0xFFFF)
1250 return -1;
1251 break;
1252 }
1253 return result;
1254 }
1255
1256 int FASTCALL
XmlUtf8Encode(int c,char * buf)1257 XmlUtf8Encode(int c, char *buf) {
1258 enum {
1259 /* minN is minimum legal resulting value for N byte sequence */
1260 min2 = 0x80,
1261 min3 = 0x800,
1262 min4 = 0x10000
1263 };
1264
1265 if (c < 0)
1266 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1267 if (c < min2) {
1268 buf[0] = (char)(c | UTF8_cval1);
1269 return 1;
1270 }
1271 if (c < min3) {
1272 buf[0] = (char)((c >> 6) | UTF8_cval2);
1273 buf[1] = (char)((c & 0x3f) | 0x80);
1274 return 2;
1275 }
1276 if (c < min4) {
1277 buf[0] = (char)((c >> 12) | UTF8_cval3);
1278 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1279 buf[2] = (char)((c & 0x3f) | 0x80);
1280 return 3;
1281 }
1282 if (c < 0x110000) {
1283 buf[0] = (char)((c >> 18) | UTF8_cval4);
1284 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1285 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1286 buf[3] = (char)((c & 0x3f) | 0x80);
1287 return 4;
1288 }
1289 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1290 }
1291
1292 int FASTCALL
XmlUtf16Encode(int charNum,unsigned short * buf)1293 XmlUtf16Encode(int charNum, unsigned short *buf) {
1294 if (charNum < 0)
1295 return 0;
1296 if (charNum < 0x10000) {
1297 buf[0] = (unsigned short)charNum;
1298 return 1;
1299 }
1300 if (charNum < 0x110000) {
1301 charNum -= 0x10000;
1302 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1303 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1304 return 2;
1305 }
1306 return 0;
1307 }
1308
/* Encoding object filled in by XmlInitUnknownEncoding() from a
   caller-supplied 256-entry table and an optional converter callback. */
1309 struct unknown_encoding {
/* Must stay the first member: XmlInitUnknownEncoding() copies a
   normal_encoding over the start of the object and hands out
   &normal.enc, which AS_UNKNOWN_ENCODING() later casts back. */
1310 struct normal_encoding normal;
/* Callback invoked as convert(userData, p) for bytes that have no
   precomputed mapping; its int result is used as a character number. */
1311 CONVERTER convert;
/* Opaque pointer passed as the first argument to convert. */
1312 void *userData;
/* Per input byte: the precomputed UTF-16 code unit.  unknown_toUtf16()
   treats 0 as "call convert instead". */
1313 unsigned short utf16[256];
/* Per input byte: utf8[b][0] is the length of the precomputed UTF-8
   sequence stored from utf8[b][1] on.  unknown_toUtf8() treats a
   length of 0 as "call convert instead". */
1314 char utf8[256][4];
1315 };
1316
/* Reinterprets an encoding pointer as a const struct unknown_encoding
   pointer.  Only meaningful for encodings that were set up by
   XmlInitUnknownEncoding(). */
1317 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1318
1319 int
XmlSizeOfUnknownEncoding(void)1320 XmlSizeOfUnknownEncoding(void) {
1321 return sizeof(struct unknown_encoding);
1322 }
1323
1324 static int PTRFASTCALL
unknown_isName(const ENCODING * enc,const char * p)1325 unknown_isName(const ENCODING *enc, const char *p) {
1326 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1327 int c = uenc->convert(uenc->userData, p);
1328 if (c & ~0xFFFF)
1329 return 0;
1330 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1331 }
1332
1333 static int PTRFASTCALL
unknown_isNmstrt(const ENCODING * enc,const char * p)1334 unknown_isNmstrt(const ENCODING *enc, const char *p) {
1335 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1336 int c = uenc->convert(uenc->userData, p);
1337 if (c & ~0xFFFF)
1338 return 0;
1339 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1340 }
1341
1342 static int PTRFASTCALL
unknown_isInvalid(const ENCODING * enc,const char * p)1343 unknown_isInvalid(const ENCODING *enc, const char *p) {
1344 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1345 int c = uenc->convert(uenc->userData, p);
1346 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1347 }
1348
1349 static enum XML_Convert_Result PTRCALL
unknown_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)1350 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1351 char **toP, const char *toLim) {
1352 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1353 char buf[XML_UTF8_ENCODE_MAX];
1354 for (;;) {
1355 const char *utf8;
1356 int n;
1357 if (*fromP == fromLim)
1358 return XML_CONVERT_COMPLETED;
1359 utf8 = uenc->utf8[(unsigned char)**fromP];
1360 n = *utf8++;
1361 if (n == 0) {
1362 int c = uenc->convert(uenc->userData, *fromP);
1363 n = XmlUtf8Encode(c, buf);
1364 if (n > toLim - *toP)
1365 return XML_CONVERT_OUTPUT_EXHAUSTED;
1366 utf8 = buf;
1367 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1368 - (BT_LEAD2 - 2));
1369 } else {
1370 if (n > toLim - *toP)
1371 return XML_CONVERT_OUTPUT_EXHAUSTED;
1372 (*fromP)++;
1373 }
1374 memcpy(*toP, utf8, n);
1375 *toP += n;
1376 }
1377 }
1378
1379 static enum XML_Convert_Result PTRCALL
unknown_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)1380 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1381 unsigned short **toP, const unsigned short *toLim) {
1382 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1383 while (*fromP < fromLim && *toP < toLim) {
1384 unsigned short c = uenc->utf16[(unsigned char)**fromP];
1385 if (c == 0) {
1386 c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1387 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1388 - (BT_LEAD2 - 2));
1389 } else
1390 (*fromP)++;
1391 *(*toP)++ = c;
1392 }
1393
1394 if ((*toP == toLim) && (*fromP < fromLim))
1395 return XML_CONVERT_OUTPUT_EXHAUSTED;
1396 else
1397 return XML_CONVERT_COMPLETED;
1398 }
1399
1400 ENCODING *
XmlInitUnknownEncoding(void * mem,int * table,CONVERTER convert,void * userData)1401 XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1402 void *userData) {
1403 int i;
1404 struct unknown_encoding *e = (struct unknown_encoding *)mem;
1405 memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1406 for (i = 0; i < 128; i++)
1407 if (latin1_encoding.type[i] != BT_OTHER
1408 && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1409 return 0;
1410 for (i = 0; i < 256; i++) {
1411 int c = table[i];
1412 if (c == -1) {
1413 e->normal.type[i] = BT_MALFORM;
1414 /* This shouldn't really get used. */
1415 e->utf16[i] = 0xFFFF;
1416 e->utf8[i][0] = 1;
1417 e->utf8[i][1] = 0;
1418 } else if (c < 0) {
1419 if (c < -4)
1420 return 0;
1421 /* Multi-byte sequences need a converter function */
1422 if (! convert)
1423 return 0;
1424 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1425 e->utf8[i][0] = 0;
1426 e->utf16[i] = 0;
1427 } else if (c < 0x80) {
1428 if (latin1_encoding.type[c] != BT_OTHER
1429 && latin1_encoding.type[c] != BT_NONXML && c != i)
1430 return 0;
1431 e->normal.type[i] = latin1_encoding.type[c];
1432 e->utf8[i][0] = 1;
1433 e->utf8[i][1] = (char)c;
1434 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1435 } else if (checkCharRefNumber(c) < 0) {
1436 e->normal.type[i] = BT_NONXML;
1437 /* This shouldn't really get used. */
1438 e->utf16[i] = 0xFFFF;
1439 e->utf8[i][0] = 1;
1440 e->utf8[i][1] = 0;
1441 } else {
1442 if (c > 0xFFFF)
1443 return 0;
1444 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1445 e->normal.type[i] = BT_NMSTRT;
1446 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1447 e->normal.type[i] = BT_NAME;
1448 else
1449 e->normal.type[i] = BT_OTHER;
1450 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1451 e->utf16[i] = (unsigned short)c;
1452 }
1453 }
1454 e->userData = userData;
1455 e->convert = convert;
1456 if (convert) {
1457 e->normal.isName2 = unknown_isName;
1458 e->normal.isName3 = unknown_isName;
1459 e->normal.isName4 = unknown_isName;
1460 e->normal.isNmstrt2 = unknown_isNmstrt;
1461 e->normal.isNmstrt3 = unknown_isNmstrt;
1462 e->normal.isNmstrt4 = unknown_isNmstrt;
1463 e->normal.isInvalid2 = unknown_isInvalid;
1464 e->normal.isInvalid3 = unknown_isInvalid;
1465 e->normal.isInvalid4 = unknown_isInvalid;
1466 }
1467 e->normal.enc.utf8Convert = unknown_toUtf8;
1468 e->normal.enc.utf16Convert = unknown_toUtf16;
1469 return &(e->normal.enc);
1470 }
1471
1472 /* If this enumeration is changed, getEncodingIndex and encodings
1473 must also be changed. */
/* Indices of the built-in encodings.  The values from ISO_8859_1_ENC
   through UTF_16LE_ENC double as indices into encodingNames[] in
   getEncodingIndex() and into the encodingTable passed to initScan(). */
1474 enum {
/* returned by getEncodingIndex() when no built-in name matches */
1475 UNKNOWN_ENC = -1,
1476 ISO_8859_1_ENC = 0,
1477 US_ASCII_ENC,
1478 UTF_8_ENC,
1479 UTF_16_ENC,
1480 UTF_16BE_ENC,
1481 UTF_16LE_ENC,
1482 /* must match encodingNames up to here */
/* returned by getEncodingIndex() when the name is NULL */
1483 NO_ENC
1484 };
1485
/* Upper-case names of the built-in encodings, spelled with ASCII_*
   constants instead of string literals.  getEncodingIndex() lists them
   in encodingNames[] and compares them case-insensitively (streqci)
   with the requested name. */
1486 static const char KW_ISO_8859_1[]
1487 = {ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8,
1488 ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'};
1489 static const char KW_US_ASCII[]
1490 = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1491 ASCII_C, ASCII_I, ASCII_I, '\0'};
1492 static const char KW_UTF_8[]
1493 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1494 static const char KW_UTF_16[]
1495 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1496 static const char KW_UTF_16BE[]
1497 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1498 ASCII_6, ASCII_B, ASCII_E, '\0'};
1499 static const char KW_UTF_16LE[]
1500 = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1501 ASCII_6, ASCII_L, ASCII_E, '\0'};
1502
1503 static int FASTCALL
getEncodingIndex(const char * name)1504 getEncodingIndex(const char *name) {
1505 static const char *const encodingNames[] = {
1506 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1507 };
1508 int i;
1509 if (name == NULL)
1510 return NO_ENC;
1511 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1512 if (streqci(name, encodingNames[i]))
1513 return i;
1514 return UNKNOWN_ENC;
1515 }
1516
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i narrowed to char; i is
   parenthesized so that the cast applies to the whole argument
   expression (for example a + b) and not just to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1523
1524 /* This is what detects the encoding. encodingTable maps from
1525 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1526 the external (protocol) specified encoding; state is
1527 XML_CONTENT_STATE if we're parsing an external text entity, and
1528 XML_PROLOG_STATE otherwise.
1529 */
1530
1531 static int
initScan(const ENCODING * const * encodingTable,const INIT_ENCODING * enc,int state,const char * ptr,const char * end,const char ** nextTokPtr)1532 initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1533 int state, const char *ptr, const char *end, const char **nextTokPtr) {
1534 const ENCODING **encPtr;
1535
1536 if (ptr >= end)
1537 return XML_TOK_NONE;
1538 encPtr = enc->encPtr;
1539 if (ptr + 1 == end) {
1540 /* only a single byte available for auto-detection */
1541 #ifndef XML_DTD /* FIXME */
1542 /* a well-formed document entity must have more than one byte */
1543 if (state != XML_CONTENT_STATE)
1544 return XML_TOK_PARTIAL;
1545 #endif
1546 /* so we're parsing an external text entity... */
1547 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1548 switch (INIT_ENC_INDEX(enc)) {
1549 case UTF_16_ENC:
1550 case UTF_16LE_ENC:
1551 case UTF_16BE_ENC:
1552 return XML_TOK_PARTIAL;
1553 }
1554 switch ((unsigned char)*ptr) {
1555 case 0xFE:
1556 case 0xFF:
1557 case 0xEF: /* possibly first byte of UTF-8 BOM */
1558 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1559 break;
1560 /* fall through */
1561 case 0x00:
1562 case 0x3C:
1563 return XML_TOK_PARTIAL;
1564 }
1565 } else {
1566 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1567 case 0xFEFF:
1568 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1569 break;
1570 *nextTokPtr = ptr + 2;
1571 *encPtr = encodingTable[UTF_16BE_ENC];
1572 return XML_TOK_BOM;
1573 /* 00 3C is handled in the default case */
1574 case 0x3C00:
1575 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1576 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1577 && state == XML_CONTENT_STATE)
1578 break;
1579 *encPtr = encodingTable[UTF_16LE_ENC];
1580 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1581 case 0xFFFE:
1582 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1583 break;
1584 *nextTokPtr = ptr + 2;
1585 *encPtr = encodingTable[UTF_16LE_ENC];
1586 return XML_TOK_BOM;
1587 case 0xEFBB:
1588 /* Maybe a UTF-8 BOM (EF BB BF) */
1589 /* If there's an explicitly specified (external) encoding
1590 of ISO-8859-1 or some flavour of UTF-16
1591 and this is an external text entity,
1592 don't look for the BOM,
1593 because it might be a legal data.
1594 */
1595 if (state == XML_CONTENT_STATE) {
1596 int e = INIT_ENC_INDEX(enc);
1597 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1598 || e == UTF_16_ENC)
1599 break;
1600 }
1601 if (ptr + 2 == end)
1602 return XML_TOK_PARTIAL;
1603 if ((unsigned char)ptr[2] == 0xBF) {
1604 *nextTokPtr = ptr + 3;
1605 *encPtr = encodingTable[UTF_8_ENC];
1606 return XML_TOK_BOM;
1607 }
1608 break;
1609 default:
1610 if (ptr[0] == '\0') {
1611 /* 0 isn't a legal data character. Furthermore a document
1612 entity can only start with ASCII characters. So the only
1613 way this can fail to be big-endian UTF-16 if it it's an
1614 external parsed general entity that's labelled as
1615 UTF-16LE.
1616 */
1617 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1618 break;
1619 *encPtr = encodingTable[UTF_16BE_ENC];
1620 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1621 } else if (ptr[1] == '\0') {
1622 /* We could recover here in the case:
1623 - parsing an external entity
1624 - second byte is 0
1625 - no externally specified encoding
1626 - no encoding declaration
1627 by assuming UTF-16LE. But we don't, because this would mean when
1628 presented just with a single byte, we couldn't reliably determine
1629 whether we needed further bytes.
1630 */
1631 if (state == XML_CONTENT_STATE)
1632 break;
1633 *encPtr = encodingTable[UTF_16LE_ENC];
1634 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1635 }
1636 break;
1637 }
1638 }
1639 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1640 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1641 }
1642
1643 #define NS(x) x
1644 #define ns(x) x
1645 #define XML_TOK_NS_C
1646 #include "xmltok_ns.c"
1647 #undef XML_TOK_NS_C
1648 #undef NS
1649 #undef ns
1650
1651 #ifdef XML_NS
1652
1653 # define NS(x) x##NS
1654 # define ns(x) x##_ns
1655
1656 # define XML_TOK_NS_C
1657 # include "xmltok_ns.c"
1658 # undef XML_TOK_NS_C
1659
1660 # undef NS
1661 # undef ns
1662
1663 ENCODING *
XmlInitUnknownEncodingNS(void * mem,int * table,CONVERTER convert,void * userData)1664 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1665 void *userData) {
1666 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1667 if (enc)
1668 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1669 return enc;
1670 }
1671
1672 #endif /* XML_NS */
1673