xref: /freebsd/contrib/expat/lib/xmltok.c (revision 783d3ff6)
1 /*
2                             __  __            _
3                          ___\ \/ /_ __   __ _| |_
4                         / _ \\  /| '_ \ / _` | __|
5                        |  __//  \| |_) | (_| | |_
6                         \___/_/\_\ .__/ \__,_|\__|
7                                  |_| XML parser
8 
9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10    Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11    Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12    Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
13    Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14    Copyright (c) 2005-2009 Steven Solie <steven@solie.ca>
15    Copyright (c) 2016-2024 Sebastian Pipping <sebastian@pipping.org>
16    Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
17    Copyright (c) 2016      Don Lewis <truckman@apache.org>
18    Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
19    Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
20    Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
21    Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
22    Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
23    Copyright (c) 2021      Donghee Na <donghee.na@python.org>
24    Copyright (c) 2022      Martin Ettl <ettl.martin78@googlemail.com>
25    Copyright (c) 2022      Sean McBride <sean@rogue-research.com>
26    Copyright (c) 2023      Hanno Böck <hanno@gentoo.org>
27    Licensed under the MIT license:
28 
29    Permission is  hereby granted,  free of charge,  to any  person obtaining
30    a  copy  of  this  software   and  associated  documentation  files  (the
31    "Software"),  to  deal in  the  Software  without restriction,  including
32    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
33    distribute, sublicense, and/or sell copies of the Software, and to permit
34    persons  to whom  the Software  is  furnished to  do so,  subject to  the
35    following conditions:
36 
37    The above copyright  notice and this permission notice  shall be included
38    in all copies or substantial portions of the Software.
39 
40    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
41    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
42    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
43    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
44    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
45    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
46    USE OR OTHER DEALINGS IN THE SOFTWARE.
47 */
48 
49 #include "expat_config.h"
50 
51 #include <stddef.h>
52 #include <string.h> /* memcpy */
53 #include <stdbool.h>
54 
55 #ifdef _WIN32
56 #  include "winconfig.h"
57 #endif
58 
59 #include "expat_external.h"
60 #include "internal.h"
61 #include "xmltok.h"
62 #include "nametab.h"
63 
64 #ifdef XML_DTD
65 #  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
66 #else
67 #  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
68 #endif
69 
/* Positional initializer fragment listing the PREFIX()-named tokenizer
   functions: one brace group of prolog/content/CDATA (and, with XML_DTD,
   ignore-section) tokenizers, one brace group of literal tokenizers, then
   eight further helpers.  PREFIX must be defined where this is expanded.
   NOTE(review): the order presumably mirrors the member order of ENCODING
   in xmltok.h -- confirm there before reordering anything. */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-named UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
79 
/* Test the naming bit of the 16-bit code unit with high byte `hi` and low
   byte `lo`.  (pages)[hi] selects a run of 8 consecutive words in
   namingBitmap, the top 3 bits of `lo` pick the word inside that run and
   the low 5 bits of `lo` pick the bit.  Yields nonzero when the bit is set.
   Every parameter is parenthesized so expression arguments (for example
   `table + 1`) expand correctly; note that `lo` is evaluated twice. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
82 
/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  The top 3 of those bits index
   into pages (the page number of a 2 byte character never needs more),
   the next 3 bits are added to the shifted page entry to select a word
   of namingBitmap, and the bottom 5 bits generate the mask.
   `byte` must point at unsigned bytes; it is evaluated several times.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))
91 
/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages (4 from the first byte, 4 from the second), 3 bits to add
   to that index (2 from the second byte, 1 from the third) and 5 bits
   (from the third byte) to generate the mask.
   `byte` must point at unsigned bytes; it is evaluated several times.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))
103 
104 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
105    of Unicode 3.2: https://www.unicode.org/unicode/reports/tr28/
106    with the additional restriction of not allowing the Unicode
107    code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
108    Implementation details:
109      (A & 0x80) == 0     means A < 0x80
110    and
111      (A & 0xC0) == 0xC0  means A > 0xBF
112 */
113 
/* Nonzero when the two bytes at p do not form a well-formed 2 byte UTF-8
   sequence: the lead byte must be at least 0xC2 (0xC0/0xC1 would be
   overlong encodings) and the second byte must lie in 0x80..0xBF.
   p must point at unsigned bytes.  The parameter is fully parenthesized,
   so pointer expressions such as `buf + 1` are dereferenced correctly. */
#define UTF8_INVALID2(p)                                                       \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
116 
/* Nonzero when the three bytes at p do not form an acceptable 3 byte
   UTF-8 sequence.  Checks, in order:
     - third byte is in 0x80..0xBF, except that after EF,BF it must not
       exceed 0xBD (rejects the code points 0xFFFE and 0xFFFF);
     - second byte: after lead 0xE0 it must be 0xA0..0xBF (no overlong
       forms), after lead 0xED it must be 0x80..0x9F (no surrogates),
       otherwise 0x80..0xBF.
   p must point at unsigned bytes.  The parameter is fully parenthesized,
   so pointer expressions such as `buf + 1` are dereferenced correctly. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((*(p)) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((*(p)) == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
125 
/* Nonzero when the four bytes at p do not form a well-formed 4 byte UTF-8
   sequence.  The third and fourth bytes must be in 0x80..0xBF.  The second
   byte must be 0x90..0xBF after lead 0xF0 (no overlong forms), 0x80..0x8F
   after lead 0xF4 (nothing above U+10FFFF), and 0x80..0xBF otherwise.
   p must point at unsigned bytes.  The parameter is fully parenthesized,
   so pointer expressions such as `buf + 1` are dereferenced correctly. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((*(p)) == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
133 
134 static int PTRFASTCALL
135 isNever(const ENCODING *enc, const char *p) {
136   UNUSED_P(enc);
137   UNUSED_P(p);
138   return 0;
139 }
140 
141 static int PTRFASTCALL
142 utf8_isName2(const ENCODING *enc, const char *p) {
143   UNUSED_P(enc);
144   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
145 }
146 
147 static int PTRFASTCALL
148 utf8_isName3(const ENCODING *enc, const char *p) {
149   UNUSED_P(enc);
150   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
151 }
152 
153 #define utf8_isName4 isNever
154 
155 static int PTRFASTCALL
156 utf8_isNmstrt2(const ENCODING *enc, const char *p) {
157   UNUSED_P(enc);
158   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
159 }
160 
161 static int PTRFASTCALL
162 utf8_isNmstrt3(const ENCODING *enc, const char *p) {
163   UNUSED_P(enc);
164   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
165 }
166 
167 #define utf8_isNmstrt4 isNever
168 
169 static int PTRFASTCALL
170 utf8_isInvalid2(const ENCODING *enc, const char *p) {
171   UNUSED_P(enc);
172   return UTF8_INVALID2((const unsigned char *)p);
173 }
174 
175 static int PTRFASTCALL
176 utf8_isInvalid3(const ENCODING *enc, const char *p) {
177   UNUSED_P(enc);
178   return UTF8_INVALID3((const unsigned char *)p);
179 }
180 
181 static int PTRFASTCALL
182 utf8_isInvalid4(const ENCODING *enc, const char *p) {
183   UNUSED_P(enc);
184   return UTF8_INVALID4((const unsigned char *)p);
185 }
186 
/* An ENCODING extended with a per-byte classification table and the
   callbacks used for multi-byte characters.  `enc` must remain the first
   member: AS_NORMAL_ENCODING() and SB_BYTE_TYPE() cast an ENCODING pointer
   directly to this struct. */
struct normal_encoding {
  ENCODING enc;
  /* Byte type for each possible byte value; indexed by (unsigned char). */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Present only in XML_MIN_SIZE builds, where the shared tokenizer code
     reaches encoding-specific behaviour through these pointers instead of
     through per-encoding macro definitions. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* isNameN / isNmstrtN / isInvalidN are invoked through IS_NAME_CHAR,
     IS_NMSTRT_CHAR and IS_INVALID_CHAR with N = 2, 3 or 4.  Encodings
     initialized with NULL_VTABLE leave all nine pointers NULL. */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
207 
/* View a generic ENCODING pointer as the struct normal_encoding that
   embeds it as its first member. */
#define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))

/* STANDARD_VTABLE(E) supplies the initializers for the five
   XML_MIN_SIZE-only members of struct normal_encoding, using the functions
   named E##byteType, E##isNameMin, ...  The expansion ends with a comma so
   it can be written directly in front of NORMAL_VTABLE/NULL_VTABLE.  In
   builds without XML_MIN_SIZE those members do not exist and the macro
   expands to nothing. */
#ifdef XML_MIN_SIZE

#  define STANDARD_VTABLE(E)                                                   \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif
220 
/* Initializers for the nine multi-byte callbacks of struct normal_encoding,
   in member order, using the functions named E##isName2 ... E##isInvalid4. */
#define NORMAL_VTABLE(E)                                                       \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* Same nine slots, all NULL: used by the encodings in this file that
   provide no multi-byte callbacks (Latin-1 and ASCII). */
#define NULL_VTABLE                                                            \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
229 
230 static int FASTCALL checkCharRefNumber(int result);
231 
232 #include "xmltok_impl.h"
233 #include "ascii.h"
234 
/* In XML_MIN_SIZE builds the single-byte ("sb_") encodings use isNever for
   the isNameMin/isNmstrtMin slots filled in by STANDARD_VTABLE(sb_). */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

/* MINBPC: minimum bytes per character for the tokenizer code instantiated
   below.  Read from the encoding at run time in XML_MIN_SIZE builds,
   otherwise the constant 1 for this single-byte instantiation. */
#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif
246 
/* Byte type of the single byte at p: a lookup in the encoding's type[]
   table, indexed by the byte converted to unsigned char. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

/* BYTE_TYPE as seen by the tokenizer code: an indirect call through the
   encoding's byteType pointer in XML_MIN_SIZE builds (sb_byteType is the
   function form of SB_BYTE_TYPE), otherwise SB_BYTE_TYPE itself. */
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
259 
/* BYTE_TO_ASCII: value of the character at p for comparison with ASCII
   constants.  For single-byte encodings this is simply *p; XML_MIN_SIZE
   builds reach it through the encoding's byteToAscii pointer, for which
   sb_byteToAscii is the single-byte implementation. */
#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif
270 
/* Multi-byte character classification: dispatch to the encoding's
   isName<n> / isNmstrt<n> callback, where n is the sequence length. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
/* IS_INVALID_CHAR dispatches to isInvalid<n>.  The XML_MIN_SIZE variant
   first checks the pointer and treats a NULL callback as "not invalid";
   the other variant calls the callback unconditionally. */
#ifdef XML_MIN_SIZE
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif

/* Classification of a minimum-size character that is not plain ASCII:
   via the isNameMin/isNmstrtMin pointers in XML_MIN_SIZE builds, constant
   0 for this single-byte instantiation otherwise. */
#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
291 
/* CHAR_MATCHES(enc, p, c): true when the character at p equals the ASCII
   character c.  XML_MIN_SIZE builds call through the encoding's
   charMatches pointer (sb_charMatches is the single-byte implementation);
   otherwise it is a direct comparison of *p with c. */
#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
304 
/* Instantiate the tokenizer template: xmltok_impl.c is included with
   PREFIX(ident) producing normal_<ident> and with the single-byte macro
   definitions above in effect.  XML_TOK_IMPL_C is defined only for the
   duration of the include. */
#define PREFIX(ident) normal_##ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Drop the per-instantiation macros so that a later instantiation can
   define its own versions.  PREFIX itself is left defined here. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
319 
/* UTF8_cvalN is the marker carried in the high bits of the first byte of
   an N byte UTF-8 sequence (the value of that byte once its payload bits
   are masked off). */
enum {
  UTF8_cval1 = 0x00, /* 0xxxxxxx */
  UTF8_cval2 = 0xC0, /* 110xxxxx */
  UTF8_cval3 = 0xE0, /* 1110xxxx */
  UTF8_cval4 = 0xF0  /* 11110xxx */
};
326 
/* Pull *fromLimRef back so that [from, *fromLimRef) does not end in the
   middle of a multi-byte UTF-8 character.

   The buffer is scanned backwards from the limit.  Bytes that are neither
   ASCII nor a 2/3/4 byte lead are stepped over and counted.  The scan ends
   at the first ASCII byte (limit stays just after it), at a lead byte that
   already has enough bytes behind it (limit is placed right after that
   character), or at `from`.  A lead byte with too few followers is dropped
   together with those followers and the scan continues in front of it.

   from        start of the buffer; never read before this address
   fromLimRef  in: one past the last available byte; out: trimmed limit */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *end = *fromLimRef;
  size_t trailing = 0; /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char byte = (unsigned char)end[-1];
    size_t seqLen = 0; /* stays 0 for bytes that are not a lead byte */

    if ((byte & 0x80u) == 0x00u) {
      break; /* 1-byte character, matching 0b0xxxxxxx */
    }

    if ((byte & 0xf8u) == 0xf0u) {
      seqLen = 4; /* lead byte 0b11110xxx */
    } else if ((byte & 0xf0u) == 0xe0u) {
      seqLen = 3; /* lead byte 0b1110xxxx */
    } else if ((byte & 0xe0u) == 0xc0u) {
      seqLen = 2; /* lead byte 0b110xxxxx */
    }

    if (seqLen != 0) {
      if (trailing + 1 >= seqLen) {
        /* Character is complete: keep exactly seqLen bytes of it. */
        end += seqLen - 1;
        break;
      }
      /* Incomplete character: discard it and start counting afresh. */
      trailing = 0;
    }

    end--;
    trailing++;
  }

  *fromLimRef = end;
}
365 
366 static enum XML_Convert_Result PTRCALL
367 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
368             char **toP, const char *toLim) {
369   bool input_incomplete = false;
370   bool output_exhausted = false;
371 
372   /* Avoid copying partial characters (due to limited space). */
373   const ptrdiff_t bytesAvailable = fromLim - *fromP;
374   const ptrdiff_t bytesStorable = toLim - *toP;
375   UNUSED_P(enc);
376   if (bytesAvailable > bytesStorable) {
377     fromLim = *fromP + bytesStorable;
378     output_exhausted = true;
379   }
380 
381   /* Avoid copying partial characters (from incomplete input). */
382   {
383     const char *const fromLimBefore = fromLim;
384     _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
385     if (fromLim < fromLimBefore) {
386       input_incomplete = true;
387     }
388   }
389 
390   {
391     const ptrdiff_t bytesToCopy = fromLim - *fromP;
392     memcpy(*toP, *fromP, bytesToCopy);
393     *fromP += bytesToCopy;
394     *toP += bytesToCopy;
395   }
396 
397   if (output_exhausted) /* needs to go first */
398     return XML_CONVERT_OUTPUT_EXHAUSTED;
399   else if (input_incomplete)
400     return XML_CONVERT_INPUT_INCOMPLETE;
401   else
402     return XML_CONVERT_COMPLETED;
403 }
404 
405 static enum XML_Convert_Result PTRCALL
406 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
407              unsigned short **toP, const unsigned short *toLim) {
408   enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
409   unsigned short *to = *toP;
410   const char *from = *fromP;
411   while (from < fromLim && to < toLim) {
412     switch (SB_BYTE_TYPE(enc, from)) {
413     case BT_LEAD2:
414       if (fromLim - from < 2) {
415         res = XML_CONVERT_INPUT_INCOMPLETE;
416         goto after;
417       }
418       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
419       from += 2;
420       break;
421     case BT_LEAD3:
422       if (fromLim - from < 3) {
423         res = XML_CONVERT_INPUT_INCOMPLETE;
424         goto after;
425       }
426       *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
427                                | (from[2] & 0x3f));
428       from += 3;
429       break;
430     case BT_LEAD4: {
431       unsigned long n;
432       if (toLim - to < 2) {
433         res = XML_CONVERT_OUTPUT_EXHAUSTED;
434         goto after;
435       }
436       if (fromLim - from < 4) {
437         res = XML_CONVERT_INPUT_INCOMPLETE;
438         goto after;
439       }
440       n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
441           | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
442       n -= 0x10000;
443       to[0] = (unsigned short)((n >> 10) | 0xD800);
444       to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
445       to += 2;
446       from += 4;
447     } break;
448     default:
449       *to++ = *from++;
450       break;
451     }
452   }
453   if (from < fromLim)
454     res = XML_CONVERT_OUTPUT_EXHAUSTED;
455 after:
456   *fromP = from;
457   *toP = to;
458   return res;
459 }
460 
/* UTF-8 encoding objects.  Each one combines the normal_ tokenizer
   functions (VTABLE1), the UTF-8 converters, a 256-entry byte type table
   assembled from asciitab.h followed by utf8tab.h, and the utf8_ multi-byte
   callbacks.
   NOTE(review): the three integers after the converters presumably
   initialize minBytesPerChar and two further ENCODING members -- confirm
   against the ENCODING declaration in xmltok.h. */
#ifdef XML_NS
/* Namespace-aware variant: asciitab.h is included unmodified. */
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
#endif

/* Non-namespace variant: BT_COLON is defined as BT_NMSTRT while asciitab.h
   is included, so every table entry spelled BT_COLON there becomes
   BT_NMSTRT in this table. */
static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
480 
/* "Internal" UTF-8 encoding objects: identical to utf8_encoding_ns and
   utf8_encoding except that the first half of the byte type table comes
   from iasciitab.h instead of asciitab.h.
   NOTE(review): how iasciitab.h differs from asciitab.h is not visible
   here -- see that header. */
#ifdef XML_NS

static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};

#endif

/* Non-namespace variant: BT_COLON entries are turned into BT_NMSTRT while
   iasciitab.h is included. */
static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
502 
503 static enum XML_Convert_Result PTRCALL
504 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
505               char **toP, const char *toLim) {
506   UNUSED_P(enc);
507   for (;;) {
508     unsigned char c;
509     if (*fromP == fromLim)
510       return XML_CONVERT_COMPLETED;
511     c = (unsigned char)**fromP;
512     if (c & 0x80) {
513       if (toLim - *toP < 2)
514         return XML_CONVERT_OUTPUT_EXHAUSTED;
515       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
516       *(*toP)++ = (char)((c & 0x3f) | 0x80);
517       (*fromP)++;
518     } else {
519       if (*toP == toLim)
520         return XML_CONVERT_OUTPUT_EXHAUSTED;
521       *(*toP)++ = *(*fromP)++;
522     }
523   }
524 }
525 
526 static enum XML_Convert_Result PTRCALL
527 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
528                unsigned short **toP, const unsigned short *toLim) {
529   UNUSED_P(enc);
530   while (*fromP < fromLim && *toP < toLim)
531     *(*toP)++ = (unsigned char)*(*fromP)++;
532 
533   if ((*toP == toLim) && (*fromP < fromLim))
534     return XML_CONVERT_OUTPUT_EXHAUSTED;
535   else
536     return XML_CONVERT_COMPLETED;
537 }
538 
/* Latin-1 encoding objects: normal_ tokenizer functions, the Latin-1
   converters, a byte type table built from asciitab.h followed by
   latin1tab.h, and no multi-byte callbacks (NULL_VTABLE).
   NOTE(review): the three integers after the converters presumably
   initialize minBytesPerChar and two further ENCODING members -- confirm
   against the ENCODING declaration in xmltok.h. */
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

/* Non-namespace variant: BT_COLON entries are turned into BT_NMSTRT while
   asciitab.h is included. */
static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
560 
561 static enum XML_Convert_Result PTRCALL
562 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
563              char **toP, const char *toLim) {
564   UNUSED_P(enc);
565   while (*fromP < fromLim && *toP < toLim)
566     *(*toP)++ = *(*fromP)++;
567 
568   if ((*toP == toLim) && (*fromP < fromLim))
569     return XML_CONVERT_OUTPUT_EXHAUSTED;
570   else
571     return XML_CONVERT_COMPLETED;
572 }
573 
/* ASCII encoding objects: normal_ tokenizer functions, ascii_toUtf8 for
   UTF-8 output and latin1_toUtf16 (shared with Latin-1) for UTF-16 output,
   and no multi-byte callbacks.  Only asciitab.h contributes to the byte
   type table; the entries it does not cover are left zero-initialized,
   which per the in-table comment equals BT_NONXML. */
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};

#endif

/* Non-namespace variant: BT_COLON entries are turned into BT_NMSTRT while
   asciitab.h is included. */
static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
595 
596 static int PTRFASTCALL
597 unicode_byte_type(char hi, char lo) {
598   switch ((unsigned char)hi) {
599   /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
600   case 0xD8:
601   case 0xD9:
602   case 0xDA:
603   case 0xDB:
604     return BT_LEAD4;
605   /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
606   case 0xDC:
607   case 0xDD:
608   case 0xDE:
609   case 0xDF:
610     return BT_TRAIL;
611   case 0xFF:
612     switch ((unsigned char)lo) {
613     case 0xFF: /* noncharacter-FFFF */
614     case 0xFE: /* noncharacter-FFFE */
615       return BT_NONXML;
616     }
617     break;
618   }
619   return BT_NONASCII;
620 }
621 
/* DEFINE_UTF16_TO_UTF8(E) defines the function E##toUtf8, which converts
   UTF-16 code units from [*fromP, fromLim) to UTF-8 at [*toP, toLim).
   GET_LO and GET_HI must be defined at the point of expansion; they choose
   the byte order.

   - The input length is first rounded down to an even number of bytes, so
     a trailing odd byte is never examined.
   - hi == 0 with lo < 0x80 produces 1 byte; hi 0..7 otherwise produces
     2 bytes; 0xD8..0xDB (high surrogate) consumes two code units and
     produces 4 bytes; every other value produces 3 bytes.  The `default`
     label is written before the surrogate labels; since each arm ends in
     `break` the label order has no effect.
   - Before anything is written for a character the remaining output space
     is checked; on shortage *fromP is set to the start of that character
     and XML_CONVERT_OUTPUT_EXHAUSTED is returned.  A high surrogate whose
     second code unit is missing yields XML_CONVERT_INPUT_INCOMPLETE the
     same way (the output check comes first).
   - *toP is advanced as bytes are written; *fromP is updated only on
     return.
   NOTE(review): after the loop `from` appears unable to be below the
   rounded fromLim, so the final XML_CONVERT_INPUT_INCOMPLETE branch looks
   unreachable -- confirm before relying on it. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
698 
/* DEFINE_UTF16_TO_UTF16(E) defines the function E##toUtf16, which copies
   UTF-16 code units from the byte range [*fromP, fromLim) into the
   unsigned short range [*toP, toLim), assembling each unit as
   (GET_HI << 8) | GET_LO.  GET_LO and GET_HI must be defined at the point
   of expansion.

   - The input length is first rounded down to an even number of bytes.
   - If the input holds more code units than the output has room for and
     the last input code unit has a high byte in 0xD8..0xDF, that unit is
     left out of this call and the provisional result becomes
     XML_CONVERT_INPUT_INCOMPLETE.
   - Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while
     input remains; otherwise the provisional result
     (XML_CONVERT_COMPLETED unless changed as described above).
   Both cursors are advanced in place as units are copied. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
719 
/* Little-endian byte order: the low byte of a code unit comes first.
   Instantiates little2_toUtf8 and little2_toUtf16. */
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef GET_LO
#undef GET_HI

/* Big-endian byte order: the high byte of a code unit comes first.
   Instantiates big2_toUtf8 and big2_toUtf16. */
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef GET_LO
#undef GET_HI
737 
/* Helpers for little-endian 2 byte characters: p[0] is the low byte and
   p[1] the high byte.  The macro parameter is parenthesized everywhere so
   that pointer expressions (for example `s + 2`) expand correctly; note
   that p is evaluated more than once. */

/* Byte type of the character at p: the encoding's single-byte table when
   the high byte is zero, otherwise unicode_byte_type(high, low). */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? SB_BYTE_TYPE(enc, p) : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
/* True when the character at p is exactly the ASCII character c. */
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Naming-bit lookups for name and name-start characters. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
746 
/* Little-endian 2 byte support comes in two forms.

   XML_MIN_SIZE: no second copy of the tokenizer is generated.  The
   LITTLE2_* macros are wrapped in functions (these are the names that
   STANDARD_VTABLE(little2_) refers to) and VTABLE is redefined to list
   the little2_ converters after VTABLE1.

   Otherwise: xmltok_impl.c is included again with PREFIX producing
   little2_<ident>, MINBPC fixed at 2, and the per-character macros mapped
   onto the LITTLE2_* helpers.  IS_NAME_CHAR and IS_NMSTRT_CHAR are
   constant 0 in this instantiation. */
#ifdef XML_MIN_SIZE

/* Function form of LITTLE2_BYTE_TYPE. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p) {
  return LITTLE2_BYTE_TYPE(enc, p);
}

/* Function form of LITTLE2_BYTE_TO_ASCII; enc is unused. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_BYTE_TO_ASCII(p);
}

/* Function form of LITTLE2_CHAR_MATCHES; enc is unused. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return LITTLE2_CHAR_MATCHES(p, c);
}

/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC; enc is unused. */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NAME_CHAR_MINBPC(p);
}

/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC; enc is unused. */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
}

#  undef VTABLE
#  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

#  undef PREFIX
#  define PREFIX(ident) little2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Clean up for the next instantiation.  IS_INVALID_CHAR is not defined in
   this branch; its #undef below has nothing to remove. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
810 
/* Encoding objects for little-endian two-byte input.
   Each initializer is { {VTABLE, 2, 0, flag}, {type table}, vtable tail }.
   NOTE(review): the three scalars after VTABLE are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   declaration.  The last scalar is 1 only when BYTEORDER is 1234. */
#ifdef XML_NS

/* Namespace-aware variant: asciitab.h is included with BT_COLON left as is. */
static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif

/* Non-namespace variant: BT_COLON is mapped to BT_NMSTRT while asciitab.h
   is included, so the colon entry gets the name-start type. */
static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

/* "Internal" little-endian encodings, built from iasciitab.h instead of
   asciitab.h and with the last scalar fixed at 1.  They are omitted when
   BYTEORDER is 4321. */
#if BYTEORDER != 4321

#  ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#  endif

static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif
870 
/* Byte-level helpers for big-endian two-byte units: (p)[0] is the
   high-order byte and (p)[1] the low-order byte of the unit at p.
   Every use of the macro parameter is parenthesized so that an expression
   argument expands with the intended precedence. */

/* Byte type of the unit at p.  A unit whose high byte is zero is classified
   through SB_BYTE_TYPE applied to its low byte; anything else goes through
   unicode_byte_type(hi, lo). */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0 ? SB_BYTE_TYPE(enc, (p) + 1)                                    \
               : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the unit at p has a zero high byte and its low byte is c. */
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Look the unit at p up in the namePages table (high byte, low byte). */
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
/* Look the unit at p up in the nmstrtPages table (high byte, low byte). */
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
879 
/* Two ways of providing the big-endian two-byte tokenizer:
   - with XML_MIN_SIZE, the BIG2_* macros are wrapped in small functions
     and VTABLE is redefined around the big2 converters;
   - otherwise xmltok_impl.c is included once with PREFIX set to big2_
     and the per-encoding macros bound to the BIG2_* helpers. */
#ifdef XML_MIN_SIZE

/* Function form of BIG2_BYTE_TYPE. */
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p) {
  return BIG2_BYTE_TYPE(enc, p);
}

/* Function form of BIG2_BYTE_TO_ASCII; enc is not consulted. */
static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_BYTE_TO_ASCII(p);
}

/* Function form of BIG2_CHAR_MATCHES; enc is not consulted. */
static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return BIG2_CHAR_MATCHES(p, c);
}

/* Function form of BIG2_IS_NAME_CHAR_MINBPC; enc is not consulted. */
static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NAME_CHAR_MINBPC(p);
}

/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC; enc is not consulted. */
static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
}

/* From here on VTABLE expands to VTABLE1 followed by the big2 converters. */
#  undef VTABLE
#  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Parameters consumed by xmltok_impl.c for this instantiation. */
#  undef PREFIX
#  define PREFIX(ident) big2_##ident
#  define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
#  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
/* The n-byte name checks are constant 0 here; only the *_MINBPC variants
   do real lookups. */
#  define IS_NAME_CHAR(enc, p, n) 0
#  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
#  define IS_NMSTRT_CHAR(enc, p, n) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)

#  define XML_TOK_IMPL_C
#  include "xmltok_impl.c"
#  undef XML_TOK_IMPL_C

/* Clean up the per-instantiation macros. */
#  undef MINBPC
#  undef BYTE_TYPE
#  undef BYTE_TO_ASCII
#  undef CHAR_MATCHES
#  undef IS_NAME_CHAR
#  undef IS_NAME_CHAR_MINBPC
#  undef IS_NMSTRT_CHAR
#  undef IS_NMSTRT_CHAR_MINBPC
/* NOTE(review): IS_INVALID_CHAR is not defined in this section; the #undef
   is presumably a harmless leftover or guards a definition elsewhere. */
#  undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
943 
/* Encoding objects for big-endian two-byte input.
   Each initializer is { {VTABLE, 2, 0, flag}, {type table}, vtable tail }.
   NOTE(review): the three scalars after VTABLE are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   declaration.  The last scalar is 1 only when BYTEORDER is 4321. */
#ifdef XML_NS

/* Namespace-aware variant: asciitab.h is included with BT_COLON left as is. */
static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif

/* Non-namespace variant: BT_COLON is mapped to BT_NMSTRT while asciitab.h
   is included, so the colon entry gets the name-start type. */
static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

/* "Internal" big-endian encodings, built from iasciitab.h instead of
   asciitab.h and with the last scalar fixed at 1.  They are omitted when
   BYTEORDER is 1234. */
#if BYTEORDER != 1234

#  ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#  endif

static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif

/* No further xmltok_impl.c instantiations follow in this section. */
#undef PREFIX
1005 
1006 static int FASTCALL
1007 streqci(const char *s1, const char *s2) {
1008   for (;;) {
1009     char c1 = *s1++;
1010     char c2 = *s2++;
1011     if (ASCII_a <= c1 && c1 <= ASCII_z)
1012       c1 += ASCII_A - ASCII_a;
1013     if (ASCII_a <= c2 && c2 <= ASCII_z)
1014       /* The following line will never get executed.  streqci() is
1015        * only called from two places, both of which guarantee to put
1016        * upper-case strings into s2.
1017        */
1018       c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1019     if (c1 != c2)
1020       return 0;
1021     if (! c1)
1022       break;
1023   }
1024   return 1;
1025 }
1026 
/* Position-update callback that ignores the encoding it is handed and
   always delegates to normal_updatePosition() with the UTF-8 encoding.
   ptr, end and pos are passed through unchanged. */
static void PTRCALL
initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
                   POSITION *pos) {
  UNUSED_P(enc);
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}
1033 
1034 static int
1035 toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1036   char buf[1];
1037   char *p = buf;
1038   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1039   if (p == buf)
1040     return -1;
1041   else
1042     return buf[0];
1043 }
1044 
1045 static int FASTCALL
1046 isSpace(int c) {
1047   switch (c) {
1048   case 0x20:
1049   case 0xD:
1050   case 0xA:
1051   case 0x9:
1052     return 1;
1053   }
1054   return 0;
1055 }
1056 
1057 /* Return 1 if there's just optional white space or there's an S
1058    followed by name=val.
1059 */
1060 static int
1061 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1062                      const char **namePtr, const char **nameEndPtr,
1063                      const char **valPtr, const char **nextTokPtr) {
1064   int c;
1065   char open;
1066   if (ptr == end) {
1067     *namePtr = NULL;
1068     return 1;
1069   }
1070   if (! isSpace(toAscii(enc, ptr, end))) {
1071     *nextTokPtr = ptr;
1072     return 0;
1073   }
1074   do {
1075     ptr += enc->minBytesPerChar;
1076   } while (isSpace(toAscii(enc, ptr, end)));
1077   if (ptr == end) {
1078     *namePtr = NULL;
1079     return 1;
1080   }
1081   *namePtr = ptr;
1082   for (;;) {
1083     c = toAscii(enc, ptr, end);
1084     if (c == -1) {
1085       *nextTokPtr = ptr;
1086       return 0;
1087     }
1088     if (c == ASCII_EQUALS) {
1089       *nameEndPtr = ptr;
1090       break;
1091     }
1092     if (isSpace(c)) {
1093       *nameEndPtr = ptr;
1094       do {
1095         ptr += enc->minBytesPerChar;
1096       } while (isSpace(c = toAscii(enc, ptr, end)));
1097       if (c != ASCII_EQUALS) {
1098         *nextTokPtr = ptr;
1099         return 0;
1100       }
1101       break;
1102     }
1103     ptr += enc->minBytesPerChar;
1104   }
1105   if (ptr == *namePtr) {
1106     *nextTokPtr = ptr;
1107     return 0;
1108   }
1109   ptr += enc->minBytesPerChar;
1110   c = toAscii(enc, ptr, end);
1111   while (isSpace(c)) {
1112     ptr += enc->minBytesPerChar;
1113     c = toAscii(enc, ptr, end);
1114   }
1115   if (c != ASCII_QUOT && c != ASCII_APOS) {
1116     *nextTokPtr = ptr;
1117     return 0;
1118   }
1119   open = (char)c;
1120   ptr += enc->minBytesPerChar;
1121   *valPtr = ptr;
1122   for (;; ptr += enc->minBytesPerChar) {
1123     c = toAscii(enc, ptr, end);
1124     if (c == open)
1125       break;
1126     if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1127         && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1128         && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1129       *nextTokPtr = ptr;
1130       return 0;
1131     }
1132   }
1133   *nextTokPtr = ptr + enc->minBytesPerChar;
1134   return 1;
1135 }
1136 
/* Keywords recognised by doParseXmlDecl(), spelled out with the ASCII_*
   constants and NUL-terminated so they can be passed to
   XmlNameMatchesAscii(). */

/* Pseudo-attribute name "version". */
static const char KW_version[]
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};

/* Pseudo-attribute name "encoding". */
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};

/* Pseudo-attribute name "standalone". */
static const char KW_standalone[]
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};

/* Accepted value "yes" of the standalone pseudo-attribute. */
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};

/* Accepted value "no" of the standalone pseudo-attribute. */
static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1150 
/* Parse the pseudo-attributes of an XML or text declaration located in
   [ptr, end).

   encodingFinder       callback used to resolve the encoding name; it is
                        called with (enc, start of value, end of value).
   isGeneralTextEntity  non-zero relaxes the "version first" rule, makes
                        the encoding pseudo-attribute mandatory and forbids
                        standalone.
   enc                  encoding the declaration itself is written in.
   badPtr               receives the error location on failure (written
                        unconditionally, so it must not be NULL).
   versionPtr, versionEndPtr, encodingName, encoding, standalone
                        optional outputs; each is only written when the
                        pointer is non-NULL and the corresponding
                        pseudo-attribute was found.

   Returns 1 on success and 0 on failure.
*/
static int
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
                                                 const char *),
               int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
               const char *end, const char **badPtr, const char **versionPtr,
               const char **versionEndPtr, const char **encodingName,
               const ENCODING **encoding, int *standalone) {
  const char *val = NULL;
  const char *name = NULL;
  const char *nameEnd = NULL;
  /* Skip five characters at the front and drop two at the back --
     presumably the "<?xml" and "?>" delimiters; the caller is not visible
     here. */
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  /* At least one pseudo-attribute is required. */
  if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
      || ! name) {
    *badPtr = ptr;
    return 0;
  }
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
    /* Only a general text entity may omit the version. */
    if (! isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  } else {
    /* ptr is just past the closing quote of the version value here. */
    if (versionPtr)
      *versionPtr = val;
    if (versionEndPtr)
      *versionEndPtr = ptr;
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (! name) {
      if (isGeneralTextEntity) {
        /* a TextDecl must have an EncodingDecl */
        *badPtr = ptr;
        return 0;
      }
      return 1;
    }
  }
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
    /* The encoding name has to start with an ASCII letter. */
    int c = toAscii(enc, val, end);
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* ptr - minBytesPerChar is the closing quote, i.e. the end of the
       encoding name. */
    if (encoding)
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (! name)
      return 1;
  }
  /* Whatever is left must be "standalone", which a general text entity
     may not carry. */
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
      || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
    if (standalone)
      *standalone = 1;
  } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
    if (standalone)
      *standalone = 0;
  } else {
    *badPtr = val;
    return 0;
  }
  /* Only white space may follow the standalone value. */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}
1231 
1232 static int FASTCALL
1233 checkCharRefNumber(int result) {
1234   switch (result >> 8) {
1235   case 0xD8:
1236   case 0xD9:
1237   case 0xDA:
1238   case 0xDB:
1239   case 0xDC:
1240   case 0xDD:
1241   case 0xDE:
1242   case 0xDF:
1243     return -1;
1244   case 0:
1245     if (latin1_encoding.type[result] == BT_NONXML)
1246       return -1;
1247     break;
1248   case 0xFF:
1249     if (result == 0xFFFE || result == 0xFFFF)
1250       return -1;
1251     break;
1252   }
1253   return result;
1254 }
1255 
1256 int FASTCALL
1257 XmlUtf8Encode(int c, char *buf) {
1258   enum {
1259     /* minN is minimum legal resulting value for N byte sequence */
1260     min2 = 0x80,
1261     min3 = 0x800,
1262     min4 = 0x10000
1263   };
1264 
1265   if (c < 0)
1266     return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1267   if (c < min2) {
1268     buf[0] = (char)(c | UTF8_cval1);
1269     return 1;
1270   }
1271   if (c < min3) {
1272     buf[0] = (char)((c >> 6) | UTF8_cval2);
1273     buf[1] = (char)((c & 0x3f) | 0x80);
1274     return 2;
1275   }
1276   if (c < min4) {
1277     buf[0] = (char)((c >> 12) | UTF8_cval3);
1278     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1279     buf[2] = (char)((c & 0x3f) | 0x80);
1280     return 3;
1281   }
1282   if (c < 0x110000) {
1283     buf[0] = (char)((c >> 18) | UTF8_cval4);
1284     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1285     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1286     buf[3] = (char)((c & 0x3f) | 0x80);
1287     return 4;
1288   }
1289   return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1290 }
1291 
1292 int FASTCALL
1293 XmlUtf16Encode(int charNum, unsigned short *buf) {
1294   if (charNum < 0)
1295     return 0;
1296   if (charNum < 0x10000) {
1297     buf[0] = (unsigned short)charNum;
1298     return 1;
1299   }
1300   if (charNum < 0x110000) {
1301     charNum -= 0x10000;
1302     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1303     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1304     return 2;
1305   }
1306   return 0;
1307 }
1308 
/* Encoding object for a user-supplied encoding, set up by
   XmlInitUnknownEncoding().  It starts with a struct normal_encoding so a
   pointer to it can be used wherever a normal encoding is expected. */
struct unknown_encoding {
  struct normal_encoding normal;
  /* Callback used for bytes that start a multi-byte sequence, together
     with the opaque pointer handed to it as first argument. */
  CONVERTER convert;
  void *userData;
  /* Per-byte UTF-16 unit; 0 means "ask convert" (see unknown_toUtf16). */
  unsigned short utf16[256];
  /* Per-byte UTF-8 form: element [0] is the length, the bytes follow.
     A length of 0 means "ask convert" (see unknown_toUtf8). */
  char utf8[256][4];
};

/* View a generic encoding pointer as an unknown_encoding. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1318 
1319 int
1320 XmlSizeOfUnknownEncoding(void) {
1321   return sizeof(struct unknown_encoding);
1322 }
1323 
1324 static int PTRFASTCALL
1325 unknown_isName(const ENCODING *enc, const char *p) {
1326   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1327   int c = uenc->convert(uenc->userData, p);
1328   if (c & ~0xFFFF)
1329     return 0;
1330   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1331 }
1332 
1333 static int PTRFASTCALL
1334 unknown_isNmstrt(const ENCODING *enc, const char *p) {
1335   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1336   int c = uenc->convert(uenc->userData, p);
1337   if (c & ~0xFFFF)
1338     return 0;
1339   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1340 }
1341 
1342 static int PTRFASTCALL
1343 unknown_isInvalid(const ENCODING *enc, const char *p) {
1344   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1345   int c = uenc->convert(uenc->userData, p);
1346   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1347 }
1348 
1349 static enum XML_Convert_Result PTRCALL
1350 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1351                char **toP, const char *toLim) {
1352   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1353   char buf[XML_UTF8_ENCODE_MAX];
1354   for (;;) {
1355     const char *utf8;
1356     int n;
1357     if (*fromP == fromLim)
1358       return XML_CONVERT_COMPLETED;
1359     utf8 = uenc->utf8[(unsigned char)**fromP];
1360     n = *utf8++;
1361     if (n == 0) {
1362       int c = uenc->convert(uenc->userData, *fromP);
1363       n = XmlUtf8Encode(c, buf);
1364       if (n > toLim - *toP)
1365         return XML_CONVERT_OUTPUT_EXHAUSTED;
1366       utf8 = buf;
1367       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1368                  - (BT_LEAD2 - 2));
1369     } else {
1370       if (n > toLim - *toP)
1371         return XML_CONVERT_OUTPUT_EXHAUSTED;
1372       (*fromP)++;
1373     }
1374     memcpy(*toP, utf8, n);
1375     *toP += n;
1376   }
1377 }
1378 
1379 static enum XML_Convert_Result PTRCALL
1380 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1381                 unsigned short **toP, const unsigned short *toLim) {
1382   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1383   while (*fromP < fromLim && *toP < toLim) {
1384     unsigned short c = uenc->utf16[(unsigned char)**fromP];
1385     if (c == 0) {
1386       c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1387       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1388                  - (BT_LEAD2 - 2));
1389     } else
1390       (*fromP)++;
1391     *(*toP)++ = c;
1392   }
1393 
1394   if ((*toP == toLim) && (*fromP < fromLim))
1395     return XML_CONVERT_OUTPUT_EXHAUSTED;
1396   else
1397     return XML_CONVERT_COMPLETED;
1398 }
1399 
/* Build an encoding object for a user-described encoding in mem.

   mem       storage that is filled in as a struct unknown_encoding
             (XmlSizeOfUnknownEncoding() reports its size).
   table     256 entries, one per byte value:
               -1        the byte is malformed;
               -2..-4    the byte starts a multi-byte sequence of 2..4
                         bytes, which requires convert to be non-NULL;
               >= 0      the character number the single byte stands for
                         (at most 0xFFFF).
   convert   callback for multi-byte sequences, may be NULL.
   userData  opaque pointer passed to convert.

   Returns the new encoding, or 0 when the table is rejected.  Note that
   mem is written to before and during validation, so its contents are
   unspecified after a failure.
*/
ENCODING *
XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
                       void *userData) {
  int i;
  struct unknown_encoding *e = (struct unknown_encoding *)mem;
  /* Start from a copy of the Latin-1 encoding and override from there. */
  memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
  /* In the lower half, every byte whose Latin-1 type is neither BT_OTHER
     nor BT_NONXML has to map to itself. */
  for (i = 0; i < 128; i++)
    if (latin1_encoding.type[i] != BT_OTHER
        && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
      return 0;
  for (i = 0; i < 256; i++) {
    int c = table[i];
    if (c == -1) {
      e->normal.type[i] = BT_MALFORM;
      /* This shouldn't really get used. */
      e->utf16[i] = 0xFFFF;
      e->utf8[i][0] = 1;
      e->utf8[i][1] = 0;
    } else if (c < 0) {
      if (c < -4)
        return 0;
      /* Multi-byte sequences need a converter function */
      if (! convert)
        return 0;
      /* -2 maps to BT_LEAD2, -3 and -4 to the two following type values. */
      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
      /* Zero entries send the converters to the convert callback. */
      e->utf8[i][0] = 0;
      e->utf16[i] = 0;
    } else if (c < 0x80) {
      /* A byte may only stand for a significant ASCII character if it is
         that very character. */
      if (latin1_encoding.type[c] != BT_OTHER
          && latin1_encoding.type[c] != BT_NONXML && c != i)
        return 0;
      e->normal.type[i] = latin1_encoding.type[c];
      e->utf8[i][0] = 1;
      e->utf8[i][1] = (char)c;
      /* 0 is reserved as the "use convert" marker in utf16[], so a byte
         that maps to character 0 is stored as 0xFFFF instead. */
      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
    } else if (checkCharRefNumber(c) < 0) {
      e->normal.type[i] = BT_NONXML;
      /* This shouldn't really get used. */
      e->utf16[i] = 0xFFFF;
      e->utf8[i][0] = 1;
      e->utf8[i][1] = 0;
    } else {
      if (c > 0xFFFF)
        return 0;
      /* Classify the character through the naming tables. */
      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
        e->normal.type[i] = BT_NMSTRT;
      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
        e->normal.type[i] = BT_NAME;
      else
        e->normal.type[i] = BT_OTHER;
      /* Length in element [0], encoded bytes from element [1] on. */
      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
      e->utf16[i] = (unsigned short)c;
    }
  }
  e->userData = userData;
  e->convert = convert;
  /* With a converter available, route all multi-byte classification
     hooks through it. */
  if (convert) {
    e->normal.isName2 = unknown_isName;
    e->normal.isName3 = unknown_isName;
    e->normal.isName4 = unknown_isName;
    e->normal.isNmstrt2 = unknown_isNmstrt;
    e->normal.isNmstrt3 = unknown_isNmstrt;
    e->normal.isNmstrt4 = unknown_isNmstrt;
    e->normal.isInvalid2 = unknown_isInvalid;
    e->normal.isInvalid3 = unknown_isInvalid;
    e->normal.isInvalid4 = unknown_isInvalid;
  }
  e->normal.enc.utf8Convert = unknown_toUtf8;
  e->normal.enc.utf16Convert = unknown_toUtf16;
  return &(e->normal.enc);
}
1471 
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed.

   The values from ISO_8859_1_ENC to UTF_16LE_ENC double as indices into
   the encodingNames array of getEncodingIndex().  UNKNOWN_ENC is what
   getEncodingIndex() returns for a name it does not know, NO_ENC what it
   returns when no name was given. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC
};
1485 
/* Upper-case names of the built-in encodings, spelled out with the
   ASCII_* constants and NUL-terminated.  getEncodingIndex() compares
   against them case-insensitively. */

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[]
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
/* "US-ASCII" */
static const char KW_US_ASCII[]
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
/* "UTF-8" */
static const char KW_UTF_8[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
/* "UTF-16" */
static const char KW_UTF_16[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
/* "UTF-16BE" */
static const char KW_UTF_16BE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_B, ASCII_E, '\0'};
/* "UTF-16LE" */
static const char KW_UTF_16LE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1502 
1503 static int FASTCALL
1504 getEncodingIndex(const char *name) {
1505   static const char *const encodingNames[] = {
1506       KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1507   };
1508   int i;
1509   if (name == NULL)
1510     return NO_ENC;
1511   for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1512     if (streqci(name, encodingNames[i]))
1513       return i;
1514   return UNKNOWN_ENC;
1515 }
1516 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX
   stores index i, converted to char.  Both macro parameters are
   parenthesized so expression arguments are converted as a whole.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1523 
/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   The first one or two bytes at ptr are inspected.  Depending on what is
   found the function
     - returns XML_TOK_NONE when there is no input at all,
     - returns XML_TOK_PARTIAL when more bytes are needed to decide,
     - stores the detected encoding through enc->encPtr, sets *nextTokPtr
       past the byte order mark and returns XML_TOK_BOM, or
     - stores the chosen encoding through enc->encPtr and returns the
       result of tokenizing the input with it via XmlTok().
*/

static int
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
         int state, const char *ptr, const char *end, const char **nextTokPtr) {
  const ENCODING **encPtr;

  if (ptr >= end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      /* fall through */
    case 0x00:
    case 0x3C:
      return XML_TOK_PARTIAL;
    }
  } else {
    /* At least two bytes: dispatch on them as one big-endian 16-bit value. */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      /* big-endian UTF-16 byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      /* '<' followed by a zero byte: little-endian UTF-16 without BOM */
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      /* little-endian UTF-16 byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
            || e == UTF_16_ENC)
          break;
      }
      /* The third byte is needed to confirm the BOM. */
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      } else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* Nothing conclusive was detected: use the initially specified
     encoding. */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
1642 
/* Instantiate xmltok_ns.c without namespace support: NS() and ns() leave
   the identifiers they wrap unchanged. */
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns

#ifdef XML_NS

/* Second instantiation for namespace-aware builds: NS() appends "NS" and
   ns() appends "_ns" to the wrapped identifier. */
#  define NS(x) x##NS
#  define ns(x) x##_ns

#  define XML_TOK_NS_C
#  include "xmltok_ns.c"
#  undef XML_TOK_NS_C

#  undef NS
#  undef ns
1662 
1663 ENCODING *
1664 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1665                          void *userData) {
1666   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1667   if (enc)
1668     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1669   return enc;
1670 }
1671 
1672 #endif /* XML_NS */
1673