xref: /freebsd/contrib/expat/lib/xmltok.c (revision 0957b409)
1 /*
2                             __  __            _
3                          ___\ \/ /_ __   __ _| |_
4                         / _ \\  /| '_ \ / _` | __|
5                        |  __//  \| |_) | (_| | |_
6                         \___/_/\_\ .__/ \__,_|\__|
7                                  |_| XML parser
8 
9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10    Copyright (c) 2000-2017 Expat development team
11    Licensed under the MIT license:
12 
13    Permission is  hereby granted,  free of charge,  to any  person obtaining
14    a  copy  of  this  software   and  associated  documentation  files  (the
15    "Software"),  to  deal in  the  Software  without restriction,  including
16    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
17    distribute, sublicense, and/or sell copies of the Software, and to permit
18    persons  to whom  the Software  is  furnished to  do so,  subject to  the
19    following conditions:
20 
21    The above copyright  notice and this permission notice  shall be included
22    in all copies or substantial portions of the Software.
23 
24    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
25    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
26    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
29    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30    USE OR OTHER DEALINGS IN THE SOFTWARE.
31 */
32 
33 #include <stddef.h>
34 #include <string.h>  /* memcpy */
35 
36 #if defined(_MSC_VER) && (_MSC_VER <= 1700)
37   /* for vs2012/11.0/1700 and earlier Visual Studio compilers */
38 # define bool   int
39 # define false  0
40 # define true   1
41 #else
42 # include <stdbool.h>
43 #endif
44 
45 
46 #ifdef _WIN32
47 #include "winconfig.h"
48 #else
49 #ifdef HAVE_EXPAT_CONFIG_H
50 #include <expat_config.h>
51 #endif
52 #endif /* ndef _WIN32 */
53 
54 #include "expat_external.h"
55 #include "internal.h"
56 #include "xmltok.h"
57 #include "nametab.h"
58 
/* Optional extra scanner slot: when XML_DTD is defined the scanner list in
   VTABLE1 gains a trailing ignoreSectionTok entry; otherwise this expands
   to nothing.
*/
#ifdef XML_DTD
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment listing the tokenizer entry points generated for the
   current PREFIX: two brace-enclosed groups of scanner functions followed by
   the individual helper functions.  PREFIX must be defined at the point of
   expansion.
*/
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 plus the PREFIX-specific UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
79 
/* Look up the naming bit of a UCS-2 character given as high byte `hi` and
   low byte `lo`.  pages[hi] selects a group of eight 32-bit words in
   namingBitmap, the top three bits of `lo` pick the word and the bottom five
   bits pick the bit.  Yields non-zero when the bit is set.
   `pages` is parenthesized so that an expression argument (for example
   `table + offset`) is indexed as a whole.
*/
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
82 
/* A 2-byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
   `byte` must point at the two bytes of the sequence; the result is
   non-zero when the corresponding bit in namingBitmap is set.
*/
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))
92 
/* A 3-byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
   `byte` must point at the three bytes of the sequence; the result is
   non-zero when the corresponding bit in namingBitmap is set.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))
105 
/* Dispatch on the length n of the UTF-8 sequence starting at p: sequences
   of 2 and 3 bytes are looked up in the naming bitmap, any other length
   evaluates to 0.
*/
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))
112 
113 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
114    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
115    with the additional restriction of not allowing the Unicode
116    code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
117    Implementation details:
118      (A & 0x80) == 0     means A < 0x80
119    and
120      (A & 0xC0) == 0xC0  means A > 0xBF
121 */
122 
/* Non-zero when the two bytes at p do not form a valid 2-byte UTF-8
   sequence: the lead byte is below 0xC2 (which also rejects overlong
   forms), or the second byte is not a continuation byte (0x80..0xBF).
   p must point to unsigned char.  The argument is fully parenthesized so
   that pointer expressions such as `buf + 1` are dereferenced correctly.
*/
#define UTF8_INVALID2(p) \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
125 
/* Non-zero when the three bytes at p do not form an acceptable 3-byte
   UTF-8 sequence.  Rejected are: a third or second byte that is not a
   continuation byte, overlong forms (lead 0xE0 with second byte below
   0xA0), surrogates (lead 0xED with second byte above 0x9F) and the
   sequences EF,BF,BE and EF,BF,BF (U+FFFE and U+FFFF).
   p must point to unsigned char.  The argument is fully parenthesized so
   that pointer expressions such as `buf + 1` are dereferenced correctly.
*/
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
  || \
  ((*(p)) == 0xEF && (p)[1] == 0xBF \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
  || \
  ((*(p)) == 0xE0 \
    ? \
    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*(p)) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
142 
/* Non-zero when the four bytes at p do not form a valid 4-byte UTF-8
   sequence.  Rejected are: a fourth, third or second byte that is not a
   continuation byte, overlong forms (lead 0xF0 with second byte below
   0x90) and values beyond U+10FFFF (lead 0xF4 with second byte above
   0x8F).
   p must point to unsigned char.  The argument is fully parenthesized so
   that pointer expressions such as `buf + 1` are dereferenced correctly.
*/
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  || \
  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  || \
  ((*(p)) == 0xF0 \
    ? \
    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*(p)) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
155 
/* Predicate stub that answers "no" (0) for every input; both arguments are
   ignored.  Used as a table entry where a character class can never match.
*/
static int PTRFASTCALL
isNever(const ENCODING *UNUSED_P(enc), const char *UNUSED_P(p))
{
  return 0;
}
161 
162 static int PTRFASTCALL
163 utf8_isName2(const ENCODING *UNUSED_P(enc), const char *p)
164 {
165   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
166 }
167 
168 static int PTRFASTCALL
169 utf8_isName3(const ENCODING *UNUSED_P(enc), const char *p)
170 {
171   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
172 }
173 
/* 4-byte sequences are never reported as name characters. */
#define utf8_isName4 isNever
175 
176 static int PTRFASTCALL
177 utf8_isNmstrt2(const ENCODING *UNUSED_P(enc), const char *p)
178 {
179   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
180 }
181 
182 static int PTRFASTCALL
183 utf8_isNmstrt3(const ENCODING *UNUSED_P(enc), const char *p)
184 {
185   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
186 }
187 
/* 4-byte sequences are never reported as name-start characters. */
#define utf8_isNmstrt4 isNever
189 
190 static int PTRFASTCALL
191 utf8_isInvalid2(const ENCODING *UNUSED_P(enc), const char *p)
192 {
193   return UTF8_INVALID2((const unsigned char *)p);
194 }
195 
196 static int PTRFASTCALL
197 utf8_isInvalid3(const ENCODING *UNUSED_P(enc), const char *p)
198 {
199   return UTF8_INVALID3((const unsigned char *)p);
200 }
201 
202 static int PTRFASTCALL
203 utf8_isInvalid4(const ENCODING *UNUSED_P(enc), const char *p)
204 {
205   return UTF8_INVALID4((const unsigned char *)p);
206 }
207 
/* An ENCODING extended with a per-byte classification table and the
   predicates needed for multi-byte sequences.  `enc` must stay the first
   member: AS_NORMAL_ENCODING() casts an ENCODING pointer straight to this
   struct type.
*/
struct normal_encoding {
  ENCODING enc;
  /* Byte type for each possible byte value; indexed by the byte cast to
     unsigned char. */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Only present in XML_MIN_SIZE builds, where the per-character
     operations are dispatched through these pointers instead of being
     expanded as macros. */
  int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name / name-start / invalid-sequence predicates for sequences of 2, 3
     and 4 bytes.  Some encodings in this file fill these with NULL
     (see NULL_VTABLE). */
  int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
228 
/* View an ENCODING pointer as the struct normal_encoding that contains it. */
#define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))

#ifdef XML_MIN_SIZE

/* Initializers for the XML_MIN_SIZE-only function pointers of
   struct normal_encoding, named with prefix E.  Note the trailing comma:
   the expansion is meant to be followed directly by NORMAL_VTABLE or
   NULL_VTABLE. */
#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif

/* Initializers for the nine multi-byte predicate pointers of
   struct normal_encoding, named with prefix E. */
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4

/* Same nine slots, all NULL, for encodings that supply no multi-byte
   predicates. */
#define NULL_VTABLE \
 /* isName2 */ NULL, \
 /* isName3 */ NULL, \
 /* isName4 */ NULL, \
 /* isNmstrt2 */ NULL, \
 /* isNmstrt3 */ NULL, \
 /* isNmstrt4 */ NULL, \
 /* isInvalid2 */ NULL, \
 /* isInvalid3 */ NULL, \
 /* isInvalid4 */ NULL
267 
268 static int FASTCALL checkCharRefNumber(int);
269 
270 #include "xmltok_impl.h"
271 #include "ascii.h"
272 
/* In XML_MIN_SIZE builds the single-byte ("sb_") encodings use isNever for
   the minimum-width name predicates. */
#ifdef XML_MIN_SIZE
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

/* MINBPC: with XML_MIN_SIZE the value is read from the encoding at run
   time; otherwise it is the constant 1 for the instantiation that
   follows. */
#ifdef XML_MIN_SIZE
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif
284 
/* Byte type of the single byte at p, read from the encoding's type table.
   The cast keeps the const qualifier of the incoming ENCODING pointer
   (the table is only read), matching AS_NORMAL_ENCODING.
*/
#define SB_BYTE_TYPE(enc, p) \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE, stored in the byteType slot of
   single-byte encodings. */
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
/* XML_MIN_SIZE: dispatch through the encoding's byteType pointer. */
#define BYTE_TYPE(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
/* Otherwise the table lookup is expanded in place. */
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
299 
/* BYTE_TO_ASCII: value of the character at p for comparison with ASCII
   constants.  XML_MIN_SIZE builds dispatch through the encoding's
   byteToAscii pointer; otherwise the byte is read directly.
*/
#ifdef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
/* Single-byte implementation: the byte itself (enc is not consulted). */
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#else
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif
311 
/* Multi-byte predicates: n is pasted onto the member name, so
   IS_NAME_CHAR(enc, p, 2) calls the encoding's isName2 pointer, and so on
   for isNmstrt<n> and isInvalid<n>.
*/
#define IS_NAME_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))

/* Minimum-width name predicates: dispatched through the encoding in
   XML_MIN_SIZE builds, constant 0 otherwise. */
#ifdef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
328 
/* CHAR_MATCHES: non-zero when the character at p equals the ASCII
   character c.  XML_MIN_SIZE builds dispatch through the encoding's
   charMatches pointer; otherwise the comparison is expanded in place.
*/
#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
/* Single-byte implementation: compare the byte at p with c (enc is not
   consulted). */
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character; it is parenthesized so that an expression
   argument is compared as a whole. */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
341 
/* Instantiate the tokenizer code in xmltok_impl.c with the single-byte
   macro set defined above.  PREFIX gives every generated identifier a
   normal_ prefix (normal_updatePosition, for instance, is called further
   down in this file).
*/
#define PREFIX(ident) normal_ ## ident
#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

/* Drop the per-instantiation macros so they can be redefined for the next
   encoding family. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
356 
/* Marker bits of UTF-8 lead bytes.  The converters in this file OR
   UTF8_cval2/3/4 into the payload bits when emitting the first byte of a
   2-, 3- or 4-byte sequence.
*/
enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
363 
/* Move *fromLimRef backwards, if necessary, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned from its end towards `from`:
     - an ASCII byte (0xxxxxxx) ends the scan at the current limit;
     - a lead byte ends the scan as well when enough bytes have been
       stepped over to complete its character, with the limit placed just
       past that character;
     - a lead byte without enough bytes after it is cut off and the scan
       continues in front of it;
     - every other byte is stepped over.
   If the scan reaches `from`, the limit ends up equal to `from`.
*/
void
_INTERNAL_trim_to_complete_utf8_characters(const char * from, const char ** fromLimRef)
{
  const char * end = *fromLimRef;
  size_t seen = 0;  /* bytes stepped over since the last reset */

  while (end > from) {
    const unsigned char last = (unsigned char)end[-1];
    size_t seqLen = 0;  /* stays 0 unless `last` is a lead byte */

    if ((last & 0x80u) == 0x00u)  /* 1-byte character, 0b0xxxxxxx */
      break;

    if ((last & 0xf8u) == 0xf0u)       /* 0b11110xxx leads 4 bytes */
      seqLen = 4;
    else if ((last & 0xf0u) == 0xe0u)  /* 0b1110xxxx leads 3 bytes */
      seqLen = 3;
    else if ((last & 0xe0u) == 0xc0u)  /* 0b110xxxxx leads 2 bytes */
      seqLen = 2;

    if (seqLen != 0) {
      if (seen + 1 >= seqLen) {
        /* Character is complete: keep all of it and stop. */
        end += seqLen - 1;
        break;
      }
      /* Incomplete character: restart the count before stepping over
         the lead byte below. */
      seen = 0;
    }

    end--;
    seen++;
  }

  *fromLimRef = end;
}
398 
399 static enum XML_Convert_Result PTRCALL
400 utf8_toUtf8(const ENCODING *UNUSED_P(enc),
401             const char **fromP, const char *fromLim,
402             char **toP, const char *toLim)
403 {
404   bool input_incomplete = false;
405   bool output_exhausted = false;
406 
407   /* Avoid copying partial characters (due to limited space). */
408   const ptrdiff_t bytesAvailable = fromLim - *fromP;
409   const ptrdiff_t bytesStorable = toLim - *toP;
410   if (bytesAvailable > bytesStorable) {
411     fromLim = *fromP + bytesStorable;
412     output_exhausted = true;
413   }
414 
415   /* Avoid copying partial characters (from incomplete input). */
416   {
417     const char * const fromLimBefore = fromLim;
418     _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
419     if (fromLim < fromLimBefore) {
420       input_incomplete = true;
421     }
422   }
423 
424   {
425     const ptrdiff_t bytesToCopy = fromLim - *fromP;
426     memcpy(*toP, *fromP, bytesToCopy);
427     *fromP += bytesToCopy;
428     *toP += bytesToCopy;
429   }
430 
431   if (output_exhausted)  /* needs to go first */
432     return XML_CONVERT_OUTPUT_EXHAUSTED;
433   else if (input_incomplete)
434     return XML_CONVERT_INPUT_INCOMPLETE;
435   else
436     return XML_CONVERT_COMPLETED;
437 }
438 
439 static enum XML_Convert_Result PTRCALL
440 utf8_toUtf16(const ENCODING *enc,
441              const char **fromP, const char *fromLim,
442              unsigned short **toP, const unsigned short *toLim)
443 {
444   enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
445   unsigned short *to = *toP;
446   const char *from = *fromP;
447   while (from < fromLim && to < toLim) {
448     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
449     case BT_LEAD2:
450       if (fromLim - from < 2) {
451         res = XML_CONVERT_INPUT_INCOMPLETE;
452         goto after;
453       }
454       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
455       from += 2;
456       break;
457     case BT_LEAD3:
458       if (fromLim - from < 3) {
459         res = XML_CONVERT_INPUT_INCOMPLETE;
460         goto after;
461       }
462       *to++ = (unsigned short)(((from[0] & 0xf) << 12)
463                                | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
464       from += 3;
465       break;
466     case BT_LEAD4:
467       {
468         unsigned long n;
469         if (toLim - to < 2) {
470           res = XML_CONVERT_OUTPUT_EXHAUSTED;
471           goto after;
472         }
473         if (fromLim - from < 4) {
474           res = XML_CONVERT_INPUT_INCOMPLETE;
475           goto after;
476         }
477         n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
478             | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
479         n -= 0x10000;
480         to[0] = (unsigned short)((n >> 10) | 0xD800);
481         to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
482         to += 2;
483         from += 4;
484       }
485       break;
486     default:
487       *to++ = *from++;
488       break;
489     }
490   }
491   if (from < fromLim)
492     res = XML_CONVERT_OUTPUT_EXHAUSTED;
493 after:
494   *fromP = from;
495   *toP = to;
496   return res;
497 }
498 
/* UTF-8 encoding tables.  All four share the UTF-8 converters and the
   utf8_ multi-byte predicates; they differ in which ASCII byte-type header
   is included (asciitab.h or iasciitab.h) and in whether BT_COLON is
   remapped to BT_NMSTRT while that header is included.  The "_ns" variants
   leave BT_COLON alone and are only built when XML_NS is defined.
   NOTE(review): the three scalars after the converters initialize ENCODING
   fields declared in xmltok.h, which is not visible here - confirm their
   meaning there.
*/
#ifdef XML_NS
static const struct normal_encoding utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
#endif

/* Variant with BT_COLON remapped to BT_NMSTRT for the ASCII table. */
static const struct normal_encoding utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#ifdef XML_NS

/* "internal" variants: same layout, built from iasciitab.h instead of
   asciitab.h. */
static const struct normal_encoding internal_utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "iasciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif

static const struct normal_encoding internal_utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
544 
545 static enum XML_Convert_Result PTRCALL
546 latin1_toUtf8(const ENCODING *UNUSED_P(enc),
547               const char **fromP, const char *fromLim,
548               char **toP, const char *toLim)
549 {
550   for (;;) {
551     unsigned char c;
552     if (*fromP == fromLim)
553       return XML_CONVERT_COMPLETED;
554     c = (unsigned char)**fromP;
555     if (c & 0x80) {
556       if (toLim - *toP < 2)
557         return XML_CONVERT_OUTPUT_EXHAUSTED;
558       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
559       *(*toP)++ = (char)((c & 0x3f) | 0x80);
560       (*fromP)++;
561     }
562     else {
563       if (*toP == toLim)
564         return XML_CONVERT_OUTPUT_EXHAUSTED;
565       *(*toP)++ = *(*fromP)++;
566     }
567   }
568 }
569 
570 static enum XML_Convert_Result PTRCALL
571 latin1_toUtf16(const ENCODING *UNUSED_P(enc),
572                const char **fromP, const char *fromLim,
573                unsigned short **toP, const unsigned short *toLim)
574 {
575   while (*fromP < fromLim && *toP < toLim)
576     *(*toP)++ = (unsigned char)*(*fromP)++;
577 
578   if ((*toP == toLim) && (*fromP < fromLim))
579     return XML_CONVERT_OUTPUT_EXHAUSTED;
580   else
581     return XML_CONVERT_COMPLETED;
582 }
583 
/* Latin-1 encoding tables: asciitab.h followed by latin1tab.h, the Latin-1
   converters, and no multi-byte predicates (NULL_VTABLE).  The "_ns"
   variant is only built when XML_NS is defined and leaves BT_COLON
   untouched; the plain variant remaps BT_COLON to BT_NMSTRT while the
   ASCII table is included.
*/
#ifdef XML_NS

static const struct normal_encoding latin1_encoding_ns = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_) NULL_VTABLE
};

#endif

static const struct normal_encoding latin1_encoding = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_) NULL_VTABLE
};
607 
608 static enum XML_Convert_Result PTRCALL
609 ascii_toUtf8(const ENCODING *UNUSED_P(enc),
610              const char **fromP, const char *fromLim,
611              char **toP, const char *toLim)
612 {
613   while (*fromP < fromLim && *toP < toLim)
614     *(*toP)++ = *(*fromP)++;
615 
616   if ((*toP == toLim) && (*fromP < fromLim))
617     return XML_CONVERT_OUTPUT_EXHAUSTED;
618   else
619     return XML_CONVERT_COMPLETED;
620 }
621 
/* ASCII encoding tables.  Only asciitab.h is included, so the remaining
   entries of the type table are left zero-initialized (BT_NONXML, per the
   note inside the initializer).  UTF-16 conversion reuses latin1_toUtf16.
   The "_ns" variant is only built when XML_NS is defined and leaves
   BT_COLON untouched; the plain variant remaps BT_COLON to BT_NMSTRT while
   the ASCII table is included.
*/
#ifdef XML_NS

static const struct normal_encoding ascii_encoding_ns = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_) NULL_VTABLE
};

#endif

static const struct normal_encoding ascii_encoding = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_) NULL_VTABLE
};
645 
646 static int PTRFASTCALL
647 unicode_byte_type(char hi, char lo)
648 {
649   switch ((unsigned char)hi) {
650   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
651     return BT_LEAD4;
652   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
653     return BT_TRAIL;
654   case 0xFF:
655     switch ((unsigned char)lo) {
656     case 0xFF:
657     case 0xFE:
658       return BT_NONXML;
659     }
660     break;
661   }
662   return BT_NONASCII;
663 }
664 
/* Generate E##toUtf8, which converts 16-bit units in [*fromP, fromLim) to
   UTF-8 in [*toP, toLim).  GET_LO / GET_HI must be defined at the point of
   expansion; they select the low and high byte of a unit and thereby fix
   the byte order.

   The input length is first rounded down to a whole number of units.  Each
   unit is then encoded according to its high byte:
     0x00 with lo < 0x80   1 byte
     0x00..0x07            2 bytes
     0xD8..0xDB            4 bytes, combining this unit with the next one
                           (surrogate pair); needs 4 input bytes
     anything else         3 bytes
   Before writing, the required output space is checked; if it is missing,
   *fromP is set to the start of the current unit and
   XML_CONVERT_OUTPUT_EXHAUSTED is returned.  A leading surrogate without a
   following unit in the input yields XML_CONVERT_INPUT_INCOMPLETE.

   NOTE(review): the loop only exits normally once from >= fromLim, so the
   final `from < fromLim` test looks unreachable, and a trailing odd byte
   dropped by the rounding is not reported - confirm this is intended.
*/
#define DEFINE_UTF16_TO_UTF8(E) \
static enum XML_Convert_Result  PTRCALL \
E ## toUtf8(const ENCODING *UNUSED_P(enc), \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *from = *fromP; \
  fromLim = from + (((fromLim - from) >> 1) << 1);  /* shrink to even */ \
  for (; from < fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return XML_CONVERT_OUTPUT_EXHAUSTED; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim -  *toP < 4) { \
        *fromP = from; \
        return XML_CONVERT_OUTPUT_EXHAUSTED; \
      } \
      if (fromLim - from < 4) { \
        *fromP = from; \
        return XML_CONVERT_INPUT_INCOMPLETE; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
  if (from < fromLim) \
    return XML_CONVERT_INPUT_INCOMPLETE; \
  else \
    return XML_CONVERT_COMPLETED; \
}
736 
/* Generate E##toUtf16, which copies 16-bit units from the byte range
   [*fromP, fromLim) into [*toP, toLim), assembling each unit from GET_HI
   and GET_LO (which must be defined at the point of expansion).

   The input length is first rounded down to a whole number of units.  If
   the input is larger than the output space and the last input unit has a
   high byte in 0xD8..0xDF, that unit is left out and the result becomes
   XML_CONVERT_INPUT_INCOMPLETE, so that the first half of a surrogate pair
   is not copied on its own.  XML_CONVERT_OUTPUT_EXHAUSTED is returned when
   the output is exactly full while input remains.
*/
#define DEFINE_UTF16_TO_UTF16(E) \
static enum XML_Convert_Result  PTRCALL \
E ## toUtf16(const ENCODING *UNUSED_P(enc), \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  enum XML_Convert_Result res = XML_CONVERT_COMPLETED; \
  fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1);  /* shrink to even */ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) { \
    fromLim -= 2; \
    res = XML_CONVERT_INPUT_INCOMPLETE; \
  } \
  for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
  if ((*toP == toLim) && (*fromP < fromLim)) \
    return XML_CONVERT_OUTPUT_EXHAUSTED; \
  else \
    return res; \
}
758 
/* Instantiate the UTF-16 converters twice, once per byte order.

   Little-endian: the low byte of a unit is at offset 0, the high byte at
   offset 1.  SET2 stores a unit in that order.
   NOTE(review): SET2 is not referenced by the converter macros in this
   part of the file - check whether it is still needed.
*/
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian: the high byte of a unit is at offset 0, the low byte at
   offset 1. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
/* Per-character primitives for little-endian 2-byte units: (p)[0] is the
   low byte and (p)[1] the high byte.  All macro arguments are
   parenthesized so that expression arguments are evaluated as a whole, and
   the table lookup keeps the const qualifier of the encoding pointer.
*/

/* Byte type: units with a zero high byte use the encoding's type table,
   all others are classified by unicode_byte_type(). */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* Low byte when the high byte is zero, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* Non-zero when the unit equals the ASCII character c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Naming-bitmap lookups for name and name-start characters. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
793 
/* Build the little-endian tokenizer.  With XML_MIN_SIZE, only thin
   function wrappers around the LITTLE2_* macros are emitted (they fill the
   STANDARD_VTABLE slots) and the normal_ scanner functions are reused via
   VTABLE1.  Without XML_MIN_SIZE, xmltok_impl.c is instantiated again with
   a little2_ prefix and 2-byte primitives.
*/
#ifdef XML_MIN_SIZE

/* Function form of LITTLE2_BYTE_TYPE. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TYPE(enc, p);
}

/* Function form of LITTLE2_BYTE_TO_ASCII. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TO_ASCII(enc, p);
}

/* Function form of LITTLE2_CHAR_MATCHES. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return LITTLE2_CHAR_MATCHES(enc, p, c);
}

/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC. */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
}

/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC. */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

/* PREFIX is still normal_ here, so only the converters are swapped. */
#undef VTABLE
#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

#undef PREFIX
#define PREFIX(ident) little2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
/* Multi-byte predicates are constant 0 for this instantiation. */
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
/* IS_INVALID_CHAR is not defined in this branch; the #undef is a no-op. */
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
858 
/* Little-endian 2-byte encoding tables.  The type table (asciitab.h or
   iasciitab.h plus latin1tab.h) is consulted for units whose high byte is
   zero; there are no multi-byte predicates (NULL_VTABLE).  The last scalar
   of the ENCODING initializer is 1 only when BYTEORDER is 1234.
   NOTE(review): the scalars initialize ENCODING fields declared in
   xmltok.h, which is not visible here - confirm their meaning there.
   The "_ns" variants are only built when XML_NS is defined and leave
   BT_COLON untouched.
*/
#ifdef XML_NS

static const struct normal_encoding little2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};

#endif

static const struct normal_encoding little2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};

/* The "internal" little-endian tables are omitted when BYTEORDER is 4321. */
#if BYTEORDER != 4321

#ifdef XML_NS

static const struct normal_encoding internal_little2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};

#endif

static const struct normal_encoding internal_little2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_) NULL_VTABLE
};

#endif
922 
923 
/* Per-character primitives for big-endian 2-byte units: (p)[0] is the
   high byte and (p)[1] the low byte.  All macro arguments are
   parenthesized so that expression arguments are evaluated as a whole, and
   the table lookup keeps the const qualifier of the encoding pointer.
*/

/* Byte type: units with a zero high byte use the encoding's type table,
   all others are classified by unicode_byte_type(). */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* Low byte when the high byte is zero, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the unit equals the ASCII character c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Naming-bitmap lookups for name and name-start characters. */
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
934 
/* Build the big-endian tokenizer.  With XML_MIN_SIZE, only thin function
   wrappers around the BIG2_* macros are emitted (they fill the
   STANDARD_VTABLE slots) and the normal_ scanner functions are reused via
   VTABLE1.  Without XML_MIN_SIZE, xmltok_impl.c is instantiated again with
   a big2_ prefix and 2-byte primitives.
*/
#ifdef XML_MIN_SIZE

/* Function form of BIG2_BYTE_TYPE. */
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TYPE(enc, p);
}

/* Function form of BIG2_BYTE_TO_ASCII. */
static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TO_ASCII(enc, p);
}

/* Function form of BIG2_CHAR_MATCHES. */
static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return BIG2_CHAR_MATCHES(enc, p, c);
}

/* Function form of BIG2_IS_NAME_CHAR_MINBPC. */
static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
}

/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC. */
static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

/* PREFIX is still normal_ here, so only the converters are swapped. */
#undef VTABLE
#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

#undef PREFIX
#define PREFIX(ident) big2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
/* Multi-byte predicates are constant 0 for this instantiation. */
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#define XML_TOK_IMPL_C
#include "xmltok_impl.c"
#undef XML_TOK_IMPL_C

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
/* IS_INVALID_CHAR is not defined in this branch; the #undef is a no-op. */
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
999 
/* Big-endian 2-byte encoding tables.  The type table (asciitab.h or
   iasciitab.h plus latin1tab.h) is consulted for units whose high byte is
   zero; there are no multi-byte predicates (NULL_VTABLE).  The last scalar
   of the ENCODING initializer is 1 only when BYTEORDER is 4321.
   NOTE(review): the scalars initialize ENCODING fields declared in
   xmltok.h, which is not visible here - confirm their meaning there.
   The "_ns" variants are only built when XML_NS is defined and leave
   BT_COLON untouched.
*/
#ifdef XML_NS

static const struct normal_encoding big2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};

#endif

static const struct normal_encoding big2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};

/* The "internal" big-endian tables are omitted when BYTEORDER is 1234. */
#if BYTEORDER != 1234

#ifdef XML_NS

static const struct normal_encoding internal_big2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};

#endif

static const struct normal_encoding internal_big2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_) NULL_VTABLE
};

#endif
1063 
1064 #undef PREFIX
1065 
1066 static int FASTCALL
1067 streqci(const char *s1, const char *s2)
1068 {
1069   for (;;) {
1070     char c1 = *s1++;
1071     char c2 = *s2++;
1072     if (ASCII_a <= c1 && c1 <= ASCII_z)
1073       c1 += ASCII_A - ASCII_a;
1074     if (ASCII_a <= c2 && c2 <= ASCII_z)
1075       /* The following line will never get executed.  streqci() is
1076        * only called from two places, both of which guarantee to put
1077        * upper-case strings into s2.
1078        */
1079       c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1080     if (c1 != c2)
1081       return 0;
1082     if (!c1)
1083       break;
1084   }
1085   return 1;
1086 }
1087 
/* Position-update handler that ignores the encoding it is given (the
   parameter is marked UNUSED_P) and always delegates to
   normal_updatePosition() with utf8_encoding, updating *pos for the bytes
   in [ptr, end). */
static void PTRCALL
initUpdatePosition(const ENCODING *UNUSED_P(enc), const char *ptr,
                   const char *end, POSITION *pos)
{
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}
1094 
1095 static int
1096 toAscii(const ENCODING *enc, const char *ptr, const char *end)
1097 {
1098   char buf[1];
1099   char *p = buf;
1100   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1101   if (p == buf)
1102     return -1;
1103   else
1104     return buf[0];
1105 }
1106 
1107 static int FASTCALL
1108 isSpace(int c)
1109 {
1110   switch (c) {
1111   case 0x20:
1112   case 0xD:
1113   case 0xA:
1114   case 0x9:
1115     return 1;
1116   }
1117   return 0;
1118 }
1119 
/* Return 1 if there's just optional white space or there's an S
   followed by name=val.

   Scans [ptr, end) one character (enc->minBytesPerChar bytes) at a time,
   looking at each character through toAscii().

   Outputs on success with a pseudo-attribute present:
     *namePtr    - first byte of the name
     *nameEndPtr - first byte after the name
     *valPtr     - first byte of the value (just after the opening quote)
     *nextTokPtr - first byte after the closing quote
   Outputs on success with nothing (or only white space) left:
     *namePtr is set to NULL; the other outputs, including *nextTokPtr,
     are left untouched.
   On failure 0 is returned and *nextTokPtr points at the offending
   character.
*/
static int
parsePseudoAttribute(const ENCODING *enc,
                     const char *ptr,
                     const char *end,
                     const char **namePtr,
                     const char **nameEndPtr,
                     const char **valPtr,
                     const char **nextTokPtr)
{
  int c;
  char open;
  /* Nothing left at all: report "no attribute". */
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  /* A pseudo-attribute must be preceded by at least one white space. */
  if (!isSpace(toAscii(enc, ptr, end))) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Skip the run of leading white space. */
  do {
    ptr += enc->minBytesPerChar;
  } while (isSpace(toAscii(enc, ptr, end)));
  /* Only white space remained: also "no attribute". */
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  *namePtr = ptr;
  /* Scan the name up to '=' (optionally preceded by white space). */
  for (;;) {
    c = toAscii(enc, ptr, end);
    /* toAscii() yields -1 when no byte could be produced. */
    if (c == -1) {
      *nextTokPtr = ptr;
      return 0;
    }
    if (c == ASCII_EQUALS) {
      *nameEndPtr = ptr;
      break;
    }
    if (isSpace(c)) {
      *nameEndPtr = ptr;
      /* White space after the name must be followed by '='. */
      do {
        ptr += enc->minBytesPerChar;
      } while (isSpace(c = toAscii(enc, ptr, end)));
      if (c != ASCII_EQUALS) {
        *nextTokPtr = ptr;
        return 0;
      }
      break;
    }
    ptr += enc->minBytesPerChar;
  }
  /* ptr is on the '='; if it never moved, the name is empty. */
  if (ptr == *namePtr) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Step over '=' and any white space that follows it. */
  ptr += enc->minBytesPerChar;
  c = toAscii(enc, ptr, end);
  while (isSpace(c)) {
    ptr += enc->minBytesPerChar;
    c = toAscii(enc, ptr, end);
  }
  /* The value must be quoted with either " or '. */
  if (c != ASCII_QUOT && c != ASCII_APOS) {
    *nextTokPtr = ptr;
    return 0;
  }
  open = (char)c;
  ptr += enc->minBytesPerChar;
  *valPtr = ptr;
  /* Scan the value until the matching quote.  Only ASCII letters,
     digits, '.', '-' and '_' are accepted inside it; anything else
     (including the -1 from toAscii()) is an error. */
  for (;; ptr += enc->minBytesPerChar) {
    c = toAscii(enc, ptr, end);
    if (c == open)
      break;
    if (!(ASCII_a <= c && c <= ASCII_z)
        && !(ASCII_A <= c && c <= ASCII_Z)
        && !(ASCII_0 <= c && c <= ASCII_9)
        && c != ASCII_PERIOD
        && c != ASCII_MINUS
        && c != ASCII_UNDERSCORE) {
      *nextTokPtr = ptr;
      return 0;
    }
  }
  /* Continue after the closing quote. */
  *nextTokPtr = ptr + enc->minBytesPerChar;
  return 1;
}
1207 
/* Keywords matched by doParseXmlDecl(), spelled out with the ASCII_*
   constants and NUL-terminated. */

/* Pseudo-attribute name "version". */
static const char KW_version[] = {
  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
};

/* Pseudo-attribute name "encoding". */
static const char KW_encoding[] = {
  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
};

/* Pseudo-attribute name "standalone". */
static const char KW_standalone[] = {
  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
  ASCII_n, ASCII_e, '\0'
};

/* Value "yes" of the standalone pseudo-attribute. */
static const char KW_yes[] = {
  ASCII_y, ASCII_e, ASCII_s,  '\0'
};

/* Value "no" of the standalone pseudo-attribute. */
static const char KW_no[] = {
  ASCII_n, ASCII_o,  '\0'
};
1228 
/* Parse the pseudo-attributes (version, encoding, standalone) of a
   declaration located in [ptr, end).

   encodingFinder      - callback that maps the encoding name found in the
                         declaration (given as a byte range) to an ENCODING
   isGeneralTextEntity - non-zero for a text declaration: "version" becomes
                         optional, "encoding" becomes mandatory and
                         "standalone" is rejected
   enc                 - encoding used to read the declaration
   badPtr              - on failure, receives the position of the error
   versionPtr, versionEndPtr, encodingName, encoding, standalone
                       - optional outputs; each may be NULL and is only
                         written when the corresponding item is present

   Returns 1 on success, 0 on failure (with *badPtr set).
*/
static int
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
                                                 const char *,
                                                 const char *),
               int isGeneralTextEntity,
               const ENCODING *enc,
               const char *ptr,
               const char *end,
               const char **badPtr,
               const char **versionPtr,
               const char **versionEndPtr,
               const char **encodingName,
               const ENCODING **encoding,
               int *standalone)
{
  const char *val = NULL;
  const char *name = NULL;
  const char *nameEnd = NULL;
  /* Skip five leading and drop two trailing characters -- presumably the
     "<?xml" opener and the "?>" closer; confirm against the callers. */
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  /* At least one pseudo-attribute is required. */
  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
      || !name) {
    *badPtr = ptr;
    return 0;
  }
  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
    /* Only a text declaration may omit the version; in that case the
       attribute just read is re-examined as "encoding" below. */
    if (!isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  }
  else {
    if (versionPtr)
      *versionPtr = val;
    /* ptr is just past the closing quote of the version value. */
    if (versionEndPtr)
      *versionEndPtr = ptr;
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name) {
      if (isGeneralTextEntity) {
        /* a TextDecl must have an EncodingDecl */
        *badPtr = ptr;
        return 0;
      }
      return 1;
    }
  }
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
    /* The encoding name has to start with an ASCII letter. */
    int c = toAscii(enc, val, end);
    if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* ptr - minBytesPerChar is the closing quote, i.e. the end of the
       encoding name. */
    if (encoding)
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name)
      return 1;
  }
  /* Whatever is left must be "standalone", which is not allowed in a
     text declaration. */
  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
      || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
    if (standalone)
      *standalone = 1;
  }
  else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
    if (standalone)
      *standalone = 0;
  }
  else {
    *badPtr = val;
    return 0;
  }
  /* Only white space may follow the standalone pseudo-attribute. */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}
1320 
1321 static int FASTCALL
1322 checkCharRefNumber(int result)
1323 {
1324   switch (result >> 8) {
1325   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1326   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1327     return -1;
1328   case 0:
1329     if (latin1_encoding.type[result] == BT_NONXML)
1330       return -1;
1331     break;
1332   case 0xFF:
1333     if (result == 0xFFFE || result == 0xFFFF)
1334       return -1;
1335     break;
1336   }
1337   return result;
1338 }
1339 
1340 int FASTCALL
1341 XmlUtf8Encode(int c, char *buf)
1342 {
1343   enum {
1344     /* minN is minimum legal resulting value for N byte sequence */
1345     min2 = 0x80,
1346     min3 = 0x800,
1347     min4 = 0x10000
1348   };
1349 
1350   if (c < 0)
1351     return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1352   if (c < min2) {
1353     buf[0] = (char)(c | UTF8_cval1);
1354     return 1;
1355   }
1356   if (c < min3) {
1357     buf[0] = (char)((c >> 6) | UTF8_cval2);
1358     buf[1] = (char)((c & 0x3f) | 0x80);
1359     return 2;
1360   }
1361   if (c < min4) {
1362     buf[0] = (char)((c >> 12) | UTF8_cval3);
1363     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1364     buf[2] = (char)((c & 0x3f) | 0x80);
1365     return 3;
1366   }
1367   if (c < 0x110000) {
1368     buf[0] = (char)((c >> 18) | UTF8_cval4);
1369     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1370     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1371     buf[3] = (char)((c & 0x3f) | 0x80);
1372     return 4;
1373   }
1374   return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1375 }
1376 
1377 int FASTCALL
1378 XmlUtf16Encode(int charNum, unsigned short *buf)
1379 {
1380   if (charNum < 0)
1381     return 0;
1382   if (charNum < 0x10000) {
1383     buf[0] = (unsigned short)charNum;
1384     return 1;
1385   }
1386   if (charNum < 0x110000) {
1387     charNum -= 0x10000;
1388     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1389     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1390     return 2;
1391   }
1392   return 0;
1393 }
1394 
/* Encoding object for a user-supplied single-/multi-byte encoding; it is
   filled in by XmlInitUnknownEncoding(). */
struct unknown_encoding {
  /* Embedded first, so that an ENCODING pointer to it can be cast to
     struct unknown_encoding (see AS_UNKNOWN_ENCODING). */
  struct normal_encoding normal;
  /* Callback that decodes a multi-byte sequence to a character number. */
  CONVERTER convert;
  /* Opaque pointer handed to convert on every call. */
  void *userData;
  /* UTF-16 unit for each possible first byte; 0 means "call convert". */
  unsigned short utf16[256];
  /* UTF-8 form for each possible first byte: element [0] is the length
     and the following elements are the bytes; length 0 means
     "call convert". */
  char utf8[256][4];
};

/* View an ENCODING pointer as the unknown_encoding that contains it. */
#define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
1404 
1405 int
1406 XmlSizeOfUnknownEncoding(void)
1407 {
1408   return sizeof(struct unknown_encoding);
1409 }
1410 
1411 static int PTRFASTCALL
1412 unknown_isName(const ENCODING *enc, const char *p)
1413 {
1414   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1415   int c = uenc->convert(uenc->userData, p);
1416   if (c & ~0xFFFF)
1417     return 0;
1418   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1419 }
1420 
1421 static int PTRFASTCALL
1422 unknown_isNmstrt(const ENCODING *enc, const char *p)
1423 {
1424   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1425   int c = uenc->convert(uenc->userData, p);
1426   if (c & ~0xFFFF)
1427     return 0;
1428   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1429 }
1430 
1431 static int PTRFASTCALL
1432 unknown_isInvalid(const ENCODING *enc, const char *p)
1433 {
1434   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1435   int c = uenc->convert(uenc->userData, p);
1436   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1437 }
1438 
1439 static enum XML_Convert_Result PTRCALL
1440 unknown_toUtf8(const ENCODING *enc,
1441                const char **fromP, const char *fromLim,
1442                char **toP, const char *toLim)
1443 {
1444   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1445   char buf[XML_UTF8_ENCODE_MAX];
1446   for (;;) {
1447     const char *utf8;
1448     int n;
1449     if (*fromP == fromLim)
1450       return XML_CONVERT_COMPLETED;
1451     utf8 = uenc->utf8[(unsigned char)**fromP];
1452     n = *utf8++;
1453     if (n == 0) {
1454       int c = uenc->convert(uenc->userData, *fromP);
1455       n = XmlUtf8Encode(c, buf);
1456       if (n > toLim - *toP)
1457         return XML_CONVERT_OUTPUT_EXHAUSTED;
1458       utf8 = buf;
1459       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1460                  - (BT_LEAD2 - 2));
1461     }
1462     else {
1463       if (n > toLim - *toP)
1464         return XML_CONVERT_OUTPUT_EXHAUSTED;
1465       (*fromP)++;
1466     }
1467     memcpy(*toP, utf8, n);
1468     *toP += n;
1469   }
1470 }
1471 
1472 static enum XML_Convert_Result PTRCALL
1473 unknown_toUtf16(const ENCODING *enc,
1474                 const char **fromP, const char *fromLim,
1475                 unsigned short **toP, const unsigned short *toLim)
1476 {
1477   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1478   while (*fromP < fromLim && *toP < toLim) {
1479     unsigned short c = uenc->utf16[(unsigned char)**fromP];
1480     if (c == 0) {
1481       c = (unsigned short)
1482           uenc->convert(uenc->userData, *fromP);
1483       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1484                  - (BT_LEAD2 - 2));
1485     }
1486     else
1487       (*fromP)++;
1488     *(*toP)++ = c;
1489   }
1490 
1491   if ((*toP == toLim) && (*fromP < fromLim))
1492     return XML_CONVERT_OUTPUT_EXHAUSTED;
1493   else
1494     return XML_CONVERT_COMPLETED;
1495 }
1496 
1497 ENCODING *
1498 XmlInitUnknownEncoding(void *mem,
1499                        int *table,
1500                        CONVERTER convert,
1501                        void *userData)
1502 {
1503   int i;
1504   struct unknown_encoding *e = (struct unknown_encoding *)mem;
1505   for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1506     ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1507   for (i = 0; i < 128; i++)
1508     if (latin1_encoding.type[i] != BT_OTHER
1509         && latin1_encoding.type[i] != BT_NONXML
1510         && table[i] != i)
1511       return 0;
1512   for (i = 0; i < 256; i++) {
1513     int c = table[i];
1514     if (c == -1) {
1515       e->normal.type[i] = BT_MALFORM;
1516       /* This shouldn't really get used. */
1517       e->utf16[i] = 0xFFFF;
1518       e->utf8[i][0] = 1;
1519       e->utf8[i][1] = 0;
1520     }
1521     else if (c < 0) {
1522       if (c < -4)
1523         return 0;
1524       /* Multi-byte sequences need a converter function */
1525       if (!convert)
1526         return 0;
1527       e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1528       e->utf8[i][0] = 0;
1529       e->utf16[i] = 0;
1530     }
1531     else if (c < 0x80) {
1532       if (latin1_encoding.type[c] != BT_OTHER
1533           && latin1_encoding.type[c] != BT_NONXML
1534           && c != i)
1535         return 0;
1536       e->normal.type[i] = latin1_encoding.type[c];
1537       e->utf8[i][0] = 1;
1538       e->utf8[i][1] = (char)c;
1539       e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1540     }
1541     else if (checkCharRefNumber(c) < 0) {
1542       e->normal.type[i] = BT_NONXML;
1543       /* This shouldn't really get used. */
1544       e->utf16[i] = 0xFFFF;
1545       e->utf8[i][0] = 1;
1546       e->utf8[i][1] = 0;
1547     }
1548     else {
1549       if (c > 0xFFFF)
1550         return 0;
1551       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1552         e->normal.type[i] = BT_NMSTRT;
1553       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1554         e->normal.type[i] = BT_NAME;
1555       else
1556         e->normal.type[i] = BT_OTHER;
1557       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1558       e->utf16[i] = (unsigned short)c;
1559     }
1560   }
1561   e->userData = userData;
1562   e->convert = convert;
1563   if (convert) {
1564     e->normal.isName2 = unknown_isName;
1565     e->normal.isName3 = unknown_isName;
1566     e->normal.isName4 = unknown_isName;
1567     e->normal.isNmstrt2 = unknown_isNmstrt;
1568     e->normal.isNmstrt3 = unknown_isNmstrt;
1569     e->normal.isNmstrt4 = unknown_isNmstrt;
1570     e->normal.isInvalid2 = unknown_isInvalid;
1571     e->normal.isInvalid3 = unknown_isInvalid;
1572     e->normal.isInvalid4 = unknown_isInvalid;
1573   }
1574   e->normal.enc.utf8Convert = unknown_toUtf8;
1575   e->normal.enc.utf16Convert = unknown_toUtf16;
1576   return &(e->normal.enc);
1577 }
1578 
/* Indices of the built-in encodings.
   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed. */
enum {
  /* returned by getEncodingIndex() for a name it does not recognise */
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  /* returned by getEncodingIndex() when no name (NULL) is given */
  NO_ENC
};
1592 
/* Upper-case names of the built-in encodings, spelled out with the
   ASCII_* constants and NUL-terminated; getEncodingIndex() compares
   candidate names against them with streqci(). */

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[] = {
  ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
  ASCII_MINUS, ASCII_1, '\0'
};
/* "US-ASCII" */
static const char KW_US_ASCII[] = {
  ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
  '\0'
};
/* "UTF-8" */
static const char KW_UTF_8[] =  {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
};
/* "UTF-16" */
static const char KW_UTF_16[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
};
/* "UTF-16BE" */
static const char KW_UTF_16BE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
  '\0'
};
/* "UTF-16LE" */
static const char KW_UTF_16LE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
  '\0'
};
1615 
1616 static int FASTCALL
1617 getEncodingIndex(const char *name)
1618 {
1619   static const char * const encodingNames[] = {
1620     KW_ISO_8859_1,
1621     KW_US_ASCII,
1622     KW_UTF_8,
1623     KW_UTF_16,
1624     KW_UTF_16BE,
1625     KW_UTF_16LE,
1626   };
1627   int i;
1628   if (name == NULL)
1629     return NO_ENC;
1630   for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1631     if (streqci(name, encodingNames[i]))
1632       return i;
1633   return UNKNOWN_ENC;
1634 }
1635 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int;
   SET_INIT_ENC_INDEX(enc, i) stores i, converted to char.  The argument
   i is parenthesized so that the conversion applies to the whole
   argument expression rather than to its first operand only.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1642 
/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   The first one or two bytes at ptr are inspected.  When an encoding can
   be chosen it is stored through enc->encPtr and, except for a byte order
   mark, tokenizing is handed over to it via XmlTok().  Returns
   XML_TOK_NONE for empty input, XML_TOK_PARTIAL when more bytes are
   needed to decide, XML_TOK_BOM (with *nextTokPtr just past the mark)
   when a byte order mark was consumed, and otherwise whatever XmlTok()
   returns for the chosen encoding.
*/


static int
initScan(const ENCODING * const *encodingTable,
         const INIT_ENCODING *enc,
         int state,
         const char *ptr,
         const char *end,
         const char **nextTokPtr)
{
  const ENCODING **encPtr;

  if (ptr >= end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      /* With ISO-8859-1 given externally for an external entity, these
         bytes are treated as data rather than as the start of a BOM. */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      /* fall through */
    case 0x00:
    case 0x3C:
      return XML_TOK_PARTIAL;
    }
  }
  else {
    /* Examine the first two bytes as one 16-bit value, first byte high. */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      /* FE FF: selects the UTF-16BE table entry and is consumed as a BOM */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      /* 3C 00: selects the UTF-16LE table entry, unless UTF-16BE or
         UTF-16 was specified externally for an external entity */
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      /* FF FE: selects the UTF-16LE table entry and is consumed as a BOM */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
            || e == UTF_16LE_ENC || e == UTF_16_ENC)
          break;
      }
      /* The third byte is needed to confirm the BOM. */
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* Nothing conclusive was detected: use the externally specified
     encoding. */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
1772 
1773 
1774 #define NS(x) x
1775 #define ns(x) x
1776 #define XML_TOK_NS_C
1777 #include "xmltok_ns.c"
1778 #undef XML_TOK_NS_C
1779 #undef NS
1780 #undef ns
1781 
1782 #ifdef XML_NS
1783 
1784 #define NS(x) x ## NS
1785 #define ns(x) x ## _ns
1786 
1787 #define XML_TOK_NS_C
1788 #include "xmltok_ns.c"
1789 #undef XML_TOK_NS_C
1790 
1791 #undef NS
1792 #undef ns
1793 
1794 ENCODING *
1795 XmlInitUnknownEncodingNS(void *mem,
1796                          int *table,
1797                          CONVERTER convert,
1798                          void *userData)
1799 {
1800   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1801   if (enc)
1802     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1803   return enc;
1804 }
1805 
1806 #endif /* XML_NS */
1807