xref: /freebsd/contrib/expat/lib/xmltok_impl.c (revision e17f5b1d)
1 /* This file is included!
2                             __  __            _
3                          ___\ \/ /_ __   __ _| |_
4                         / _ \\  /| '_ \ / _` | __|
5                        |  __//  \| |_) | (_| | |_
6                         \___/_/\_\ .__/ \__,_|\__|
7                                  |_| XML parser
8 
9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10    Copyright (c) 2000-2017 Expat development team
11    Licensed under the MIT license:
12 
13    Permission is  hereby granted,  free of charge,  to any  person obtaining
14    a  copy  of  this  software   and  associated  documentation  files  (the
15    "Software"),  to  deal in  the  Software  without restriction,  including
16    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
17    distribute, sublicense, and/or sell copies of the Software, and to permit
18    persons  to whom  the Software  is  furnished to  do so,  subject to  the
19    following conditions:
20 
21    The above copyright  notice and this permission notice  shall be included
22    in all copies or substantial portions of the Software.
23 
24    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
25    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
26    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
29    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30    USE OR OTHER DEALINGS IN THE SOFTWARE.
31 */
32 
33 #ifdef XML_TOK_IMPL_C
34 
/* Fallback used when the including file has not supplied IS_INVALID_CHAR:
   with this definition no multi-byte sequence is ever reported as invalid. */
#  ifndef IS_INVALID_CHAR
#    define IS_INVALID_CHAR(enc, ptr, n) (0)
#  endif

/* Switch case for a lead byte that starts an n-byte sequence.
   Returns XML_TOK_PARTIAL_CHAR from the enclosing function when fewer than
   n bytes remain, and XML_TOK_INVALID (with *nextTokPtr set to ptr) when
   IS_INVALID_CHAR rejects the sequence; otherwise advances ptr by n bytes.
   Relies on `enc` and `end` being in scope at the expansion site. */
#  define INVALID_LEAD_CASE(n, ptr, nextTokPtr)                                \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (! IS_INVALID_CHAR(enc, ptr, n)) {                                      \
      ptr += n;                                                                \
      break;                                                                   \
    }                                                                          \
    *(nextTokPtr) = (ptr);                                                     \
    return XML_TOK_INVALID;

/* Switch cases shared by scanners that accept arbitrary character data:
   the 2-, 3- and 4-byte lead cases above, followed by the byte types that
   are always rejected at the current position. */
#  define INVALID_CASES(ptr, nextTokPtr)                                       \
    INVALID_LEAD_CASE(2, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(3, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(4, ptr, nextTokPtr)                                      \
  case BT_TRAIL:                                                               \
  case BT_MALFORM:                                                             \
  case BT_NONXML:                                                              \
    *(nextTokPtr) = (ptr);                                                     \
    return XML_TOK_INVALID;
59 
/* Switch case for a lead byte that starts an n-byte sequence inside a name.
   Returns XML_TOK_PARTIAL_CHAR from the enclosing function when fewer than
   n bytes remain.  The sequence is rejected with XML_TOK_INVALID (and
   *nextTokPtr set to ptr) when it is either malformed for the encoding
   (IS_INVALID_CHAR) or not a name character (IS_NAME_CHAR); the validity
   check comes first so that malformed sequences are never handed to the
   name-character lookup.  On success ptr is advanced by n bytes. */
#  define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr)                        \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) {         \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;

/* Switch cases that consume one name character at ptr.
   BT_NONASCII characters are checked with IS_NAME_CHAR_MINBPC and then
   handled like the byte types that are always acceptable in a name; those
   advance ptr by MINBPC(enc).  Multi-byte sequences are handled by
   CHECK_NAME_CASE for 2, 3 and 4 bytes. */
#  define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)                          \
  case BT_NONASCII:                                                            \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) {                                     \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
  case BT_DIGIT:                                                               \
  case BT_NAME:                                                                \
  case BT_MINUS:                                                               \
    ptr += MINBPC(enc);                                                        \
    break;                                                                     \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
88 
/* Switch case for a lead byte that starts an n-byte sequence at the start
   of a name.  Returns XML_TOK_PARTIAL_CHAR from the enclosing function when
   fewer than n bytes remain.  The sequence is rejected with XML_TOK_INVALID
   (and *nextTokPtr set to ptr) when it is either malformed for the encoding
   (IS_INVALID_CHAR) or not a name-start character (IS_NMSTRT_CHAR); the
   validity check comes first so that malformed sequences are never handed
   to the name-start lookup.  On success ptr is advanced by n bytes. */
#  define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr)                      \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) {       \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;

/* Switch cases that consume one name-start character at ptr.
   BT_NONASCII characters are checked with IS_NMSTRT_CHAR_MINBPC and then
   handled like BT_NMSTRT / BT_HEX, which advance ptr by MINBPC(enc).
   Multi-byte sequences are handled by CHECK_NMSTRT_CASE for 2, 3 and 4
   bytes. */
#  define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)                        \
  case BT_NONASCII:                                                            \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {                                   \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
    ptr += MINBPC(enc);                                                        \
    break;                                                                     \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
114 
/* PREFIX decorates the names of the functions defined in this file; when
   the including file has not defined it, names are used unchanged. */
#  ifndef PREFIX
#    define PREFIX(ident) ident
#  endif

/* True when at least `count` minimum-size characters (count * MINBPC(enc)
   bytes) lie between ptr and end.  Every argument is parenthesized so that
   compound expressions (e.g. a `count` of `a + b`) are evaluated as a
   unit. */
#  define HAS_CHARS(enc, ptr, end, count)                                      \
    ((end) - (ptr) >= (count) * MINBPC(enc))

/* True when at least one minimum-size character is available at ptr. */
#  define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)

/* Returns XML_TOK_PARTIAL from the enclosing function unless at least
   `count` minimum-size characters are available at ptr. */
#  define REQUIRE_CHARS(enc, ptr, end, count)                                  \
    {                                                                          \
      if (! HAS_CHARS(enc, ptr, end, count)) {                                 \
        return XML_TOK_PARTIAL;                                                \
      }                                                                        \
    }

/* Single-character form of REQUIRE_CHARS. */
#  define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
131 
132 /* ptr points to character following "<!-" */
133 
134 static int PTRCALL
135 PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
136                     const char **nextTokPtr) {
137   if (HAS_CHAR(enc, ptr, end)) {
138     if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
139       *nextTokPtr = ptr;
140       return XML_TOK_INVALID;
141     }
142     ptr += MINBPC(enc);
143     while (HAS_CHAR(enc, ptr, end)) {
144       switch (BYTE_TYPE(enc, ptr)) {
145         INVALID_CASES(ptr, nextTokPtr)
146       case BT_MINUS:
147         ptr += MINBPC(enc);
148         REQUIRE_CHAR(enc, ptr, end);
149         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
150           ptr += MINBPC(enc);
151           REQUIRE_CHAR(enc, ptr, end);
152           if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
153             *nextTokPtr = ptr;
154             return XML_TOK_INVALID;
155           }
156           *nextTokPtr = ptr + MINBPC(enc);
157           return XML_TOK_COMMENT;
158         }
159         break;
160       default:
161         ptr += MINBPC(enc);
162         break;
163       }
164     }
165   }
166   return XML_TOK_PARTIAL;
167 }
168 
169 /* ptr points to character following "<!" */
170 
171 static int PTRCALL
172 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
173                  const char **nextTokPtr) {
174   REQUIRE_CHAR(enc, ptr, end);
175   switch (BYTE_TYPE(enc, ptr)) {
176   case BT_MINUS:
177     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
178   case BT_LSQB:
179     *nextTokPtr = ptr + MINBPC(enc);
180     return XML_TOK_COND_SECT_OPEN;
181   case BT_NMSTRT:
182   case BT_HEX:
183     ptr += MINBPC(enc);
184     break;
185   default:
186     *nextTokPtr = ptr;
187     return XML_TOK_INVALID;
188   }
189   while (HAS_CHAR(enc, ptr, end)) {
190     switch (BYTE_TYPE(enc, ptr)) {
191     case BT_PERCNT:
192       REQUIRE_CHARS(enc, ptr, end, 2);
193       /* don't allow <!ENTITY% foo "whatever"> */
194       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
195       case BT_S:
196       case BT_CR:
197       case BT_LF:
198       case BT_PERCNT:
199         *nextTokPtr = ptr;
200         return XML_TOK_INVALID;
201       }
202       /* fall through */
203     case BT_S:
204     case BT_CR:
205     case BT_LF:
206       *nextTokPtr = ptr;
207       return XML_TOK_DECL_OPEN;
208     case BT_NMSTRT:
209     case BT_HEX:
210       ptr += MINBPC(enc);
211       break;
212     default:
213       *nextTokPtr = ptr;
214       return XML_TOK_INVALID;
215     }
216   }
217   return XML_TOK_PARTIAL;
218 }
219 
220 static int PTRCALL
221 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
222                       int *tokPtr) {
223   int upper = 0;
224   UNUSED_P(enc);
225   *tokPtr = XML_TOK_PI;
226   if (end - ptr != MINBPC(enc) * 3)
227     return 1;
228   switch (BYTE_TO_ASCII(enc, ptr)) {
229   case ASCII_x:
230     break;
231   case ASCII_X:
232     upper = 1;
233     break;
234   default:
235     return 1;
236   }
237   ptr += MINBPC(enc);
238   switch (BYTE_TO_ASCII(enc, ptr)) {
239   case ASCII_m:
240     break;
241   case ASCII_M:
242     upper = 1;
243     break;
244   default:
245     return 1;
246   }
247   ptr += MINBPC(enc);
248   switch (BYTE_TO_ASCII(enc, ptr)) {
249   case ASCII_l:
250     break;
251   case ASCII_L:
252     upper = 1;
253     break;
254   default:
255     return 1;
256   }
257   if (upper)
258     return 0;
259   *tokPtr = XML_TOK_XML_DECL;
260   return 1;
261 }
262 
263 /* ptr points to character following "<?" */
264 
265 static int PTRCALL
266 PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
267                const char **nextTokPtr) {
268   int tok;
269   const char *target = ptr;
270   REQUIRE_CHAR(enc, ptr, end);
271   switch (BYTE_TYPE(enc, ptr)) {
272     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
273   default:
274     *nextTokPtr = ptr;
275     return XML_TOK_INVALID;
276   }
277   while (HAS_CHAR(enc, ptr, end)) {
278     switch (BYTE_TYPE(enc, ptr)) {
279       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
280     case BT_S:
281     case BT_CR:
282     case BT_LF:
283       if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
284         *nextTokPtr = ptr;
285         return XML_TOK_INVALID;
286       }
287       ptr += MINBPC(enc);
288       while (HAS_CHAR(enc, ptr, end)) {
289         switch (BYTE_TYPE(enc, ptr)) {
290           INVALID_CASES(ptr, nextTokPtr)
291         case BT_QUEST:
292           ptr += MINBPC(enc);
293           REQUIRE_CHAR(enc, ptr, end);
294           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
295             *nextTokPtr = ptr + MINBPC(enc);
296             return tok;
297           }
298           break;
299         default:
300           ptr += MINBPC(enc);
301           break;
302         }
303       }
304       return XML_TOK_PARTIAL;
305     case BT_QUEST:
306       if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
307         *nextTokPtr = ptr;
308         return XML_TOK_INVALID;
309       }
310       ptr += MINBPC(enc);
311       REQUIRE_CHAR(enc, ptr, end);
312       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
313         *nextTokPtr = ptr + MINBPC(enc);
314         return tok;
315       }
316       /* fall through */
317     default:
318       *nextTokPtr = ptr;
319       return XML_TOK_INVALID;
320     }
321   }
322   return XML_TOK_PARTIAL;
323 }
324 
325 static int PTRCALL
326 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
327                          const char **nextTokPtr) {
328   static const char CDATA_LSQB[]
329       = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
330   int i;
331   UNUSED_P(enc);
332   /* CDATA[ */
333   REQUIRE_CHARS(enc, ptr, end, 6);
334   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
335     if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
336       *nextTokPtr = ptr;
337       return XML_TOK_INVALID;
338     }
339   }
340   *nextTokPtr = ptr;
341   return XML_TOK_CDATA_SECT_OPEN;
342 }
343 
344 static int PTRCALL
345 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
346                         const char **nextTokPtr) {
347   if (ptr >= end)
348     return XML_TOK_NONE;
349   if (MINBPC(enc) > 1) {
350     size_t n = end - ptr;
351     if (n & (MINBPC(enc) - 1)) {
352       n &= ~(MINBPC(enc) - 1);
353       if (n == 0)
354         return XML_TOK_PARTIAL;
355       end = ptr + n;
356     }
357   }
358   switch (BYTE_TYPE(enc, ptr)) {
359   case BT_RSQB:
360     ptr += MINBPC(enc);
361     REQUIRE_CHAR(enc, ptr, end);
362     if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
363       break;
364     ptr += MINBPC(enc);
365     REQUIRE_CHAR(enc, ptr, end);
366     if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
367       ptr -= MINBPC(enc);
368       break;
369     }
370     *nextTokPtr = ptr + MINBPC(enc);
371     return XML_TOK_CDATA_SECT_CLOSE;
372   case BT_CR:
373     ptr += MINBPC(enc);
374     REQUIRE_CHAR(enc, ptr, end);
375     if (BYTE_TYPE(enc, ptr) == BT_LF)
376       ptr += MINBPC(enc);
377     *nextTokPtr = ptr;
378     return XML_TOK_DATA_NEWLINE;
379   case BT_LF:
380     *nextTokPtr = ptr + MINBPC(enc);
381     return XML_TOK_DATA_NEWLINE;
382     INVALID_CASES(ptr, nextTokPtr)
383   default:
384     ptr += MINBPC(enc);
385     break;
386   }
387   while (HAS_CHAR(enc, ptr, end)) {
388     switch (BYTE_TYPE(enc, ptr)) {
389 #  define LEAD_CASE(n)                                                         \
390   case BT_LEAD##n:                                                             \
391     if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) {                       \
392       *nextTokPtr = ptr;                                                       \
393       return XML_TOK_DATA_CHARS;                                               \
394     }                                                                          \
395     ptr += n;                                                                  \
396     break;
397       LEAD_CASE(2)
398       LEAD_CASE(3)
399       LEAD_CASE(4)
400 #  undef LEAD_CASE
401     case BT_NONXML:
402     case BT_MALFORM:
403     case BT_TRAIL:
404     case BT_CR:
405     case BT_LF:
406     case BT_RSQB:
407       *nextTokPtr = ptr;
408       return XML_TOK_DATA_CHARS;
409     default:
410       ptr += MINBPC(enc);
411       break;
412     }
413   }
414   *nextTokPtr = ptr;
415   return XML_TOK_DATA_CHARS;
416 }
417 
418 /* ptr points to character following "</" */
419 
420 static int PTRCALL
421 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
422                    const char **nextTokPtr) {
423   REQUIRE_CHAR(enc, ptr, end);
424   switch (BYTE_TYPE(enc, ptr)) {
425     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
426   default:
427     *nextTokPtr = ptr;
428     return XML_TOK_INVALID;
429   }
430   while (HAS_CHAR(enc, ptr, end)) {
431     switch (BYTE_TYPE(enc, ptr)) {
432       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
433     case BT_S:
434     case BT_CR:
435     case BT_LF:
436       for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
437         switch (BYTE_TYPE(enc, ptr)) {
438         case BT_S:
439         case BT_CR:
440         case BT_LF:
441           break;
442         case BT_GT:
443           *nextTokPtr = ptr + MINBPC(enc);
444           return XML_TOK_END_TAG;
445         default:
446           *nextTokPtr = ptr;
447           return XML_TOK_INVALID;
448         }
449       }
450       return XML_TOK_PARTIAL;
451 #  ifdef XML_NS
452     case BT_COLON:
453       /* no need to check qname syntax here,
454          since end-tag must match exactly */
455       ptr += MINBPC(enc);
456       break;
457 #  endif
458     case BT_GT:
459       *nextTokPtr = ptr + MINBPC(enc);
460       return XML_TOK_END_TAG;
461     default:
462       *nextTokPtr = ptr;
463       return XML_TOK_INVALID;
464     }
465   }
466   return XML_TOK_PARTIAL;
467 }
468 
469 /* ptr points to character following "&#X" */
470 
471 static int PTRCALL
472 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
473                        const char **nextTokPtr) {
474   if (HAS_CHAR(enc, ptr, end)) {
475     switch (BYTE_TYPE(enc, ptr)) {
476     case BT_DIGIT:
477     case BT_HEX:
478       break;
479     default:
480       *nextTokPtr = ptr;
481       return XML_TOK_INVALID;
482     }
483     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
484       switch (BYTE_TYPE(enc, ptr)) {
485       case BT_DIGIT:
486       case BT_HEX:
487         break;
488       case BT_SEMI:
489         *nextTokPtr = ptr + MINBPC(enc);
490         return XML_TOK_CHAR_REF;
491       default:
492         *nextTokPtr = ptr;
493         return XML_TOK_INVALID;
494       }
495     }
496   }
497   return XML_TOK_PARTIAL;
498 }
499 
500 /* ptr points to character following "&#" */
501 
502 static int PTRCALL
503 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
504                     const char **nextTokPtr) {
505   if (HAS_CHAR(enc, ptr, end)) {
506     if (CHAR_MATCHES(enc, ptr, ASCII_x))
507       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
508     switch (BYTE_TYPE(enc, ptr)) {
509     case BT_DIGIT:
510       break;
511     default:
512       *nextTokPtr = ptr;
513       return XML_TOK_INVALID;
514     }
515     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
516       switch (BYTE_TYPE(enc, ptr)) {
517       case BT_DIGIT:
518         break;
519       case BT_SEMI:
520         *nextTokPtr = ptr + MINBPC(enc);
521         return XML_TOK_CHAR_REF;
522       default:
523         *nextTokPtr = ptr;
524         return XML_TOK_INVALID;
525       }
526     }
527   }
528   return XML_TOK_PARTIAL;
529 }
530 
531 /* ptr points to character following "&" */
532 
533 static int PTRCALL
534 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
535                 const char **nextTokPtr) {
536   REQUIRE_CHAR(enc, ptr, end);
537   switch (BYTE_TYPE(enc, ptr)) {
538     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
539   case BT_NUM:
540     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
541   default:
542     *nextTokPtr = ptr;
543     return XML_TOK_INVALID;
544   }
545   while (HAS_CHAR(enc, ptr, end)) {
546     switch (BYTE_TYPE(enc, ptr)) {
547       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
548     case BT_SEMI:
549       *nextTokPtr = ptr + MINBPC(enc);
550       return XML_TOK_ENTITY_REF;
551     default:
552       *nextTokPtr = ptr;
553       return XML_TOK_INVALID;
554     }
555   }
556   return XML_TOK_PARTIAL;
557 }
558 
559 /* ptr points to character following first character of attribute name */
560 
561 static int PTRCALL
562 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
563                  const char **nextTokPtr) {
564 #  ifdef XML_NS
565   int hadColon = 0;
566 #  endif
567   while (HAS_CHAR(enc, ptr, end)) {
568     switch (BYTE_TYPE(enc, ptr)) {
569       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
570 #  ifdef XML_NS
571     case BT_COLON:
572       if (hadColon) {
573         *nextTokPtr = ptr;
574         return XML_TOK_INVALID;
575       }
576       hadColon = 1;
577       ptr += MINBPC(enc);
578       REQUIRE_CHAR(enc, ptr, end);
579       switch (BYTE_TYPE(enc, ptr)) {
580         CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
581       default:
582         *nextTokPtr = ptr;
583         return XML_TOK_INVALID;
584       }
585       break;
586 #  endif
587     case BT_S:
588     case BT_CR:
589     case BT_LF:
590       for (;;) {
591         int t;
592 
593         ptr += MINBPC(enc);
594         REQUIRE_CHAR(enc, ptr, end);
595         t = BYTE_TYPE(enc, ptr);
596         if (t == BT_EQUALS)
597           break;
598         switch (t) {
599         case BT_S:
600         case BT_LF:
601         case BT_CR:
602           break;
603         default:
604           *nextTokPtr = ptr;
605           return XML_TOK_INVALID;
606         }
607       }
608       /* fall through */
609     case BT_EQUALS: {
610       int open;
611 #  ifdef XML_NS
612       hadColon = 0;
613 #  endif
614       for (;;) {
615         ptr += MINBPC(enc);
616         REQUIRE_CHAR(enc, ptr, end);
617         open = BYTE_TYPE(enc, ptr);
618         if (open == BT_QUOT || open == BT_APOS)
619           break;
620         switch (open) {
621         case BT_S:
622         case BT_LF:
623         case BT_CR:
624           break;
625         default:
626           *nextTokPtr = ptr;
627           return XML_TOK_INVALID;
628         }
629       }
630       ptr += MINBPC(enc);
631       /* in attribute value */
632       for (;;) {
633         int t;
634         REQUIRE_CHAR(enc, ptr, end);
635         t = BYTE_TYPE(enc, ptr);
636         if (t == open)
637           break;
638         switch (t) {
639           INVALID_CASES(ptr, nextTokPtr)
640         case BT_AMP: {
641           int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
642           if (tok <= 0) {
643             if (tok == XML_TOK_INVALID)
644               *nextTokPtr = ptr;
645             return tok;
646           }
647           break;
648         }
649         case BT_LT:
650           *nextTokPtr = ptr;
651           return XML_TOK_INVALID;
652         default:
653           ptr += MINBPC(enc);
654           break;
655         }
656       }
657       ptr += MINBPC(enc);
658       REQUIRE_CHAR(enc, ptr, end);
659       switch (BYTE_TYPE(enc, ptr)) {
660       case BT_S:
661       case BT_CR:
662       case BT_LF:
663         break;
664       case BT_SOL:
665         goto sol;
666       case BT_GT:
667         goto gt;
668       default:
669         *nextTokPtr = ptr;
670         return XML_TOK_INVALID;
671       }
672       /* ptr points to closing quote */
673       for (;;) {
674         ptr += MINBPC(enc);
675         REQUIRE_CHAR(enc, ptr, end);
676         switch (BYTE_TYPE(enc, ptr)) {
677           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
678         case BT_S:
679         case BT_CR:
680         case BT_LF:
681           continue;
682         case BT_GT:
683         gt:
684           *nextTokPtr = ptr + MINBPC(enc);
685           return XML_TOK_START_TAG_WITH_ATTS;
686         case BT_SOL:
687         sol:
688           ptr += MINBPC(enc);
689           REQUIRE_CHAR(enc, ptr, end);
690           if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
691             *nextTokPtr = ptr;
692             return XML_TOK_INVALID;
693           }
694           *nextTokPtr = ptr + MINBPC(enc);
695           return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
696         default:
697           *nextTokPtr = ptr;
698           return XML_TOK_INVALID;
699         }
700         break;
701       }
702       break;
703     }
704     default:
705       *nextTokPtr = ptr;
706       return XML_TOK_INVALID;
707     }
708   }
709   return XML_TOK_PARTIAL;
710 }
711 
712 /* ptr points to character following "<" */
713 
714 static int PTRCALL
715 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
716                const char **nextTokPtr) {
717 #  ifdef XML_NS
718   int hadColon;
719 #  endif
720   REQUIRE_CHAR(enc, ptr, end);
721   switch (BYTE_TYPE(enc, ptr)) {
722     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
723   case BT_EXCL:
724     ptr += MINBPC(enc);
725     REQUIRE_CHAR(enc, ptr, end);
726     switch (BYTE_TYPE(enc, ptr)) {
727     case BT_MINUS:
728       return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
729     case BT_LSQB:
730       return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
731     }
732     *nextTokPtr = ptr;
733     return XML_TOK_INVALID;
734   case BT_QUEST:
735     return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
736   case BT_SOL:
737     return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
738   default:
739     *nextTokPtr = ptr;
740     return XML_TOK_INVALID;
741   }
742 #  ifdef XML_NS
743   hadColon = 0;
744 #  endif
745   /* we have a start-tag */
746   while (HAS_CHAR(enc, ptr, end)) {
747     switch (BYTE_TYPE(enc, ptr)) {
748       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
749 #  ifdef XML_NS
750     case BT_COLON:
751       if (hadColon) {
752         *nextTokPtr = ptr;
753         return XML_TOK_INVALID;
754       }
755       hadColon = 1;
756       ptr += MINBPC(enc);
757       REQUIRE_CHAR(enc, ptr, end);
758       switch (BYTE_TYPE(enc, ptr)) {
759         CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
760       default:
761         *nextTokPtr = ptr;
762         return XML_TOK_INVALID;
763       }
764       break;
765 #  endif
766     case BT_S:
767     case BT_CR:
768     case BT_LF: {
769       ptr += MINBPC(enc);
770       while (HAS_CHAR(enc, ptr, end)) {
771         switch (BYTE_TYPE(enc, ptr)) {
772           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
773         case BT_GT:
774           goto gt;
775         case BT_SOL:
776           goto sol;
777         case BT_S:
778         case BT_CR:
779         case BT_LF:
780           ptr += MINBPC(enc);
781           continue;
782         default:
783           *nextTokPtr = ptr;
784           return XML_TOK_INVALID;
785         }
786         return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
787       }
788       return XML_TOK_PARTIAL;
789     }
790     case BT_GT:
791     gt:
792       *nextTokPtr = ptr + MINBPC(enc);
793       return XML_TOK_START_TAG_NO_ATTS;
794     case BT_SOL:
795     sol:
796       ptr += MINBPC(enc);
797       REQUIRE_CHAR(enc, ptr, end);
798       if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
799         *nextTokPtr = ptr;
800         return XML_TOK_INVALID;
801       }
802       *nextTokPtr = ptr + MINBPC(enc);
803       return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
804     default:
805       *nextTokPtr = ptr;
806       return XML_TOK_INVALID;
807     }
808   }
809   return XML_TOK_PARTIAL;
810 }
811 
812 static int PTRCALL
813 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
814                    const char **nextTokPtr) {
815   if (ptr >= end)
816     return XML_TOK_NONE;
817   if (MINBPC(enc) > 1) {
818     size_t n = end - ptr;
819     if (n & (MINBPC(enc) - 1)) {
820       n &= ~(MINBPC(enc) - 1);
821       if (n == 0)
822         return XML_TOK_PARTIAL;
823       end = ptr + n;
824     }
825   }
826   switch (BYTE_TYPE(enc, ptr)) {
827   case BT_LT:
828     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
829   case BT_AMP:
830     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
831   case BT_CR:
832     ptr += MINBPC(enc);
833     if (! HAS_CHAR(enc, ptr, end))
834       return XML_TOK_TRAILING_CR;
835     if (BYTE_TYPE(enc, ptr) == BT_LF)
836       ptr += MINBPC(enc);
837     *nextTokPtr = ptr;
838     return XML_TOK_DATA_NEWLINE;
839   case BT_LF:
840     *nextTokPtr = ptr + MINBPC(enc);
841     return XML_TOK_DATA_NEWLINE;
842   case BT_RSQB:
843     ptr += MINBPC(enc);
844     if (! HAS_CHAR(enc, ptr, end))
845       return XML_TOK_TRAILING_RSQB;
846     if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
847       break;
848     ptr += MINBPC(enc);
849     if (! HAS_CHAR(enc, ptr, end))
850       return XML_TOK_TRAILING_RSQB;
851     if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
852       ptr -= MINBPC(enc);
853       break;
854     }
855     *nextTokPtr = ptr;
856     return XML_TOK_INVALID;
857     INVALID_CASES(ptr, nextTokPtr)
858   default:
859     ptr += MINBPC(enc);
860     break;
861   }
862   while (HAS_CHAR(enc, ptr, end)) {
863     switch (BYTE_TYPE(enc, ptr)) {
864 #  define LEAD_CASE(n)                                                         \
865   case BT_LEAD##n:                                                             \
866     if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) {                       \
867       *nextTokPtr = ptr;                                                       \
868       return XML_TOK_DATA_CHARS;                                               \
869     }                                                                          \
870     ptr += n;                                                                  \
871     break;
872       LEAD_CASE(2)
873       LEAD_CASE(3)
874       LEAD_CASE(4)
875 #  undef LEAD_CASE
876     case BT_RSQB:
877       if (HAS_CHARS(enc, ptr, end, 2)) {
878         if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
879           ptr += MINBPC(enc);
880           break;
881         }
882         if (HAS_CHARS(enc, ptr, end, 3)) {
883           if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
884             ptr += MINBPC(enc);
885             break;
886           }
887           *nextTokPtr = ptr + 2 * MINBPC(enc);
888           return XML_TOK_INVALID;
889         }
890       }
891       /* fall through */
892     case BT_AMP:
893     case BT_LT:
894     case BT_NONXML:
895     case BT_MALFORM:
896     case BT_TRAIL:
897     case BT_CR:
898     case BT_LF:
899       *nextTokPtr = ptr;
900       return XML_TOK_DATA_CHARS;
901     default:
902       ptr += MINBPC(enc);
903       break;
904     }
905   }
906   *nextTokPtr = ptr;
907   return XML_TOK_DATA_CHARS;
908 }
909 
910 /* ptr points to character following "%" */
911 
912 static int PTRCALL
913 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
914                     const char **nextTokPtr) {
915   REQUIRE_CHAR(enc, ptr, end);
916   switch (BYTE_TYPE(enc, ptr)) {
917     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
918   case BT_S:
919   case BT_LF:
920   case BT_CR:
921   case BT_PERCNT:
922     *nextTokPtr = ptr;
923     return XML_TOK_PERCENT;
924   default:
925     *nextTokPtr = ptr;
926     return XML_TOK_INVALID;
927   }
928   while (HAS_CHAR(enc, ptr, end)) {
929     switch (BYTE_TYPE(enc, ptr)) {
930       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
931     case BT_SEMI:
932       *nextTokPtr = ptr + MINBPC(enc);
933       return XML_TOK_PARAM_ENTITY_REF;
934     default:
935       *nextTokPtr = ptr;
936       return XML_TOK_INVALID;
937     }
938   }
939   return XML_TOK_PARTIAL;
940 }
941 
942 static int PTRCALL
943 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
944                       const char **nextTokPtr) {
945   REQUIRE_CHAR(enc, ptr, end);
946   switch (BYTE_TYPE(enc, ptr)) {
947     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
948   default:
949     *nextTokPtr = ptr;
950     return XML_TOK_INVALID;
951   }
952   while (HAS_CHAR(enc, ptr, end)) {
953     switch (BYTE_TYPE(enc, ptr)) {
954       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
955     case BT_CR:
956     case BT_LF:
957     case BT_S:
958     case BT_RPAR:
959     case BT_GT:
960     case BT_PERCNT:
961     case BT_VERBAR:
962       *nextTokPtr = ptr;
963       return XML_TOK_POUND_NAME;
964     default:
965       *nextTokPtr = ptr;
966       return XML_TOK_INVALID;
967     }
968   }
969   return -XML_TOK_POUND_NAME;
970 }
971 
972 static int PTRCALL
973 PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
974                 const char **nextTokPtr) {
975   while (HAS_CHAR(enc, ptr, end)) {
976     int t = BYTE_TYPE(enc, ptr);
977     switch (t) {
978       INVALID_CASES(ptr, nextTokPtr)
979     case BT_QUOT:
980     case BT_APOS:
981       ptr += MINBPC(enc);
982       if (t != open)
983         break;
984       if (! HAS_CHAR(enc, ptr, end))
985         return -XML_TOK_LITERAL;
986       *nextTokPtr = ptr;
987       switch (BYTE_TYPE(enc, ptr)) {
988       case BT_S:
989       case BT_CR:
990       case BT_LF:
991       case BT_GT:
992       case BT_PERCNT:
993       case BT_LSQB:
994         return XML_TOK_LITERAL;
995       default:
996         return XML_TOK_INVALID;
997       }
998     default:
999       ptr += MINBPC(enc);
1000       break;
1001     }
1002   }
1003   return XML_TOK_PARTIAL;
1004 }
1005 
1006 static int PTRCALL
1007 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
1008                   const char **nextTokPtr) {
1009   int tok;
1010   if (ptr >= end)
1011     return XML_TOK_NONE;
1012   if (MINBPC(enc) > 1) {
1013     size_t n = end - ptr;
1014     if (n & (MINBPC(enc) - 1)) {
1015       n &= ~(MINBPC(enc) - 1);
1016       if (n == 0)
1017         return XML_TOK_PARTIAL;
1018       end = ptr + n;
1019     }
1020   }
1021   switch (BYTE_TYPE(enc, ptr)) {
1022   case BT_QUOT:
1023     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1024   case BT_APOS:
1025     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1026   case BT_LT: {
1027     ptr += MINBPC(enc);
1028     REQUIRE_CHAR(enc, ptr, end);
1029     switch (BYTE_TYPE(enc, ptr)) {
1030     case BT_EXCL:
1031       return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1032     case BT_QUEST:
1033       return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1034     case BT_NMSTRT:
1035     case BT_HEX:
1036     case BT_NONASCII:
1037     case BT_LEAD2:
1038     case BT_LEAD3:
1039     case BT_LEAD4:
1040       *nextTokPtr = ptr - MINBPC(enc);
1041       return XML_TOK_INSTANCE_START;
1042     }
1043     *nextTokPtr = ptr;
1044     return XML_TOK_INVALID;
1045   }
1046   case BT_CR:
1047     if (ptr + MINBPC(enc) == end) {
1048       *nextTokPtr = end;
1049       /* indicate that this might be part of a CR/LF pair */
1050       return -XML_TOK_PROLOG_S;
1051     }
1052     /* fall through */
1053   case BT_S:
1054   case BT_LF:
1055     for (;;) {
1056       ptr += MINBPC(enc);
1057       if (! HAS_CHAR(enc, ptr, end))
1058         break;
1059       switch (BYTE_TYPE(enc, ptr)) {
1060       case BT_S:
1061       case BT_LF:
1062         break;
1063       case BT_CR:
1064         /* don't split CR/LF pair */
1065         if (ptr + MINBPC(enc) != end)
1066           break;
1067         /* fall through */
1068       default:
1069         *nextTokPtr = ptr;
1070         return XML_TOK_PROLOG_S;
1071       }
1072     }
1073     *nextTokPtr = ptr;
1074     return XML_TOK_PROLOG_S;
1075   case BT_PERCNT:
1076     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1077   case BT_COMMA:
1078     *nextTokPtr = ptr + MINBPC(enc);
1079     return XML_TOK_COMMA;
1080   case BT_LSQB:
1081     *nextTokPtr = ptr + MINBPC(enc);
1082     return XML_TOK_OPEN_BRACKET;
1083   case BT_RSQB:
1084     ptr += MINBPC(enc);
1085     if (! HAS_CHAR(enc, ptr, end))
1086       return -XML_TOK_CLOSE_BRACKET;
1087     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1088       REQUIRE_CHARS(enc, ptr, end, 2);
1089       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1090         *nextTokPtr = ptr + 2 * MINBPC(enc);
1091         return XML_TOK_COND_SECT_CLOSE;
1092       }
1093     }
1094     *nextTokPtr = ptr;
1095     return XML_TOK_CLOSE_BRACKET;
1096   case BT_LPAR:
1097     *nextTokPtr = ptr + MINBPC(enc);
1098     return XML_TOK_OPEN_PAREN;
1099   case BT_RPAR:
1100     ptr += MINBPC(enc);
1101     if (! HAS_CHAR(enc, ptr, end))
1102       return -XML_TOK_CLOSE_PAREN;
1103     switch (BYTE_TYPE(enc, ptr)) {
1104     case BT_AST:
1105       *nextTokPtr = ptr + MINBPC(enc);
1106       return XML_TOK_CLOSE_PAREN_ASTERISK;
1107     case BT_QUEST:
1108       *nextTokPtr = ptr + MINBPC(enc);
1109       return XML_TOK_CLOSE_PAREN_QUESTION;
1110     case BT_PLUS:
1111       *nextTokPtr = ptr + MINBPC(enc);
1112       return XML_TOK_CLOSE_PAREN_PLUS;
1113     case BT_CR:
1114     case BT_LF:
1115     case BT_S:
1116     case BT_GT:
1117     case BT_COMMA:
1118     case BT_VERBAR:
1119     case BT_RPAR:
1120       *nextTokPtr = ptr;
1121       return XML_TOK_CLOSE_PAREN;
1122     }
1123     *nextTokPtr = ptr;
1124     return XML_TOK_INVALID;
1125   case BT_VERBAR:
1126     *nextTokPtr = ptr + MINBPC(enc);
1127     return XML_TOK_OR;
1128   case BT_GT:
1129     *nextTokPtr = ptr + MINBPC(enc);
1130     return XML_TOK_DECL_CLOSE;
1131   case BT_NUM:
1132     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1133 #  define LEAD_CASE(n)                                                         \
1134   case BT_LEAD##n:                                                             \
1135     if (end - ptr < n)                                                         \
1136       return XML_TOK_PARTIAL_CHAR;                                             \
1137     if (IS_NMSTRT_CHAR(enc, ptr, n)) {                                         \
1138       ptr += n;                                                                \
1139       tok = XML_TOK_NAME;                                                      \
1140       break;                                                                   \
1141     }                                                                          \
1142     if (IS_NAME_CHAR(enc, ptr, n)) {                                           \
1143       ptr += n;                                                                \
1144       tok = XML_TOK_NMTOKEN;                                                   \
1145       break;                                                                   \
1146     }                                                                          \
1147     *nextTokPtr = ptr;                                                         \
1148     return XML_TOK_INVALID;
1149     LEAD_CASE(2)
1150     LEAD_CASE(3)
1151     LEAD_CASE(4)
1152 #  undef LEAD_CASE
1153   case BT_NMSTRT:
1154   case BT_HEX:
1155     tok = XML_TOK_NAME;
1156     ptr += MINBPC(enc);
1157     break;
1158   case BT_DIGIT:
1159   case BT_NAME:
1160   case BT_MINUS:
1161 #  ifdef XML_NS
1162   case BT_COLON:
1163 #  endif
1164     tok = XML_TOK_NMTOKEN;
1165     ptr += MINBPC(enc);
1166     break;
1167   case BT_NONASCII:
1168     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1169       ptr += MINBPC(enc);
1170       tok = XML_TOK_NAME;
1171       break;
1172     }
1173     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1174       ptr += MINBPC(enc);
1175       tok = XML_TOK_NMTOKEN;
1176       break;
1177     }
1178     /* fall through */
1179   default:
1180     *nextTokPtr = ptr;
1181     return XML_TOK_INVALID;
1182   }
1183   while (HAS_CHAR(enc, ptr, end)) {
1184     switch (BYTE_TYPE(enc, ptr)) {
1185       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1186     case BT_GT:
1187     case BT_RPAR:
1188     case BT_COMMA:
1189     case BT_VERBAR:
1190     case BT_LSQB:
1191     case BT_PERCNT:
1192     case BT_S:
1193     case BT_CR:
1194     case BT_LF:
1195       *nextTokPtr = ptr;
1196       return tok;
1197 #  ifdef XML_NS
1198     case BT_COLON:
1199       ptr += MINBPC(enc);
1200       switch (tok) {
1201       case XML_TOK_NAME:
1202         REQUIRE_CHAR(enc, ptr, end);
1203         tok = XML_TOK_PREFIXED_NAME;
1204         switch (BYTE_TYPE(enc, ptr)) {
1205           CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1206         default:
1207           tok = XML_TOK_NMTOKEN;
1208           break;
1209         }
1210         break;
1211       case XML_TOK_PREFIXED_NAME:
1212         tok = XML_TOK_NMTOKEN;
1213         break;
1214       }
1215       break;
1216 #  endif
1217     case BT_PLUS:
1218       if (tok == XML_TOK_NMTOKEN) {
1219         *nextTokPtr = ptr;
1220         return XML_TOK_INVALID;
1221       }
1222       *nextTokPtr = ptr + MINBPC(enc);
1223       return XML_TOK_NAME_PLUS;
1224     case BT_AST:
1225       if (tok == XML_TOK_NMTOKEN) {
1226         *nextTokPtr = ptr;
1227         return XML_TOK_INVALID;
1228       }
1229       *nextTokPtr = ptr + MINBPC(enc);
1230       return XML_TOK_NAME_ASTERISK;
1231     case BT_QUEST:
1232       if (tok == XML_TOK_NMTOKEN) {
1233         *nextTokPtr = ptr;
1234         return XML_TOK_INVALID;
1235       }
1236       *nextTokPtr = ptr + MINBPC(enc);
1237       return XML_TOK_NAME_QUESTION;
1238     default:
1239       *nextTokPtr = ptr;
1240       return XML_TOK_INVALID;
1241     }
1242   }
1243   return -tok;
1244 }
1245 
1246 static int PTRCALL
1247 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1248                           const char **nextTokPtr) {
1249   const char *start;
1250   if (ptr >= end)
1251     return XML_TOK_NONE;
1252   else if (! HAS_CHAR(enc, ptr, end)) {
1253     /* This line cannot be executed.  The incoming data has already
1254      * been tokenized once, so incomplete characters like this have
1255      * already been eliminated from the input.  Retaining the paranoia
1256      * check is still valuable, however.
1257      */
1258     return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1259   }
1260   start = ptr;
1261   while (HAS_CHAR(enc, ptr, end)) {
1262     switch (BYTE_TYPE(enc, ptr)) {
1263 #  define LEAD_CASE(n)                                                         \
1264   case BT_LEAD##n:                                                             \
1265     ptr += n;                                                                  \
1266     break;
1267       LEAD_CASE(2)
1268       LEAD_CASE(3)
1269       LEAD_CASE(4)
1270 #  undef LEAD_CASE
1271     case BT_AMP:
1272       if (ptr == start)
1273         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1274       *nextTokPtr = ptr;
1275       return XML_TOK_DATA_CHARS;
1276     case BT_LT:
1277       /* this is for inside entity references */
1278       *nextTokPtr = ptr;
1279       return XML_TOK_INVALID;
1280     case BT_LF:
1281       if (ptr == start) {
1282         *nextTokPtr = ptr + MINBPC(enc);
1283         return XML_TOK_DATA_NEWLINE;
1284       }
1285       *nextTokPtr = ptr;
1286       return XML_TOK_DATA_CHARS;
1287     case BT_CR:
1288       if (ptr == start) {
1289         ptr += MINBPC(enc);
1290         if (! HAS_CHAR(enc, ptr, end))
1291           return XML_TOK_TRAILING_CR;
1292         if (BYTE_TYPE(enc, ptr) == BT_LF)
1293           ptr += MINBPC(enc);
1294         *nextTokPtr = ptr;
1295         return XML_TOK_DATA_NEWLINE;
1296       }
1297       *nextTokPtr = ptr;
1298       return XML_TOK_DATA_CHARS;
1299     case BT_S:
1300       if (ptr == start) {
1301         *nextTokPtr = ptr + MINBPC(enc);
1302         return XML_TOK_ATTRIBUTE_VALUE_S;
1303       }
1304       *nextTokPtr = ptr;
1305       return XML_TOK_DATA_CHARS;
1306     default:
1307       ptr += MINBPC(enc);
1308       break;
1309     }
1310   }
1311   *nextTokPtr = ptr;
1312   return XML_TOK_DATA_CHARS;
1313 }
1314 
1315 static int PTRCALL
1316 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1317                        const char **nextTokPtr) {
1318   const char *start;
1319   if (ptr >= end)
1320     return XML_TOK_NONE;
1321   else if (! HAS_CHAR(enc, ptr, end)) {
1322     /* This line cannot be executed.  The incoming data has already
1323      * been tokenized once, so incomplete characters like this have
1324      * already been eliminated from the input.  Retaining the paranoia
1325      * check is still valuable, however.
1326      */
1327     return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1328   }
1329   start = ptr;
1330   while (HAS_CHAR(enc, ptr, end)) {
1331     switch (BYTE_TYPE(enc, ptr)) {
1332 #  define LEAD_CASE(n)                                                         \
1333   case BT_LEAD##n:                                                             \
1334     ptr += n;                                                                  \
1335     break;
1336       LEAD_CASE(2)
1337       LEAD_CASE(3)
1338       LEAD_CASE(4)
1339 #  undef LEAD_CASE
1340     case BT_AMP:
1341       if (ptr == start)
1342         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1343       *nextTokPtr = ptr;
1344       return XML_TOK_DATA_CHARS;
1345     case BT_PERCNT:
1346       if (ptr == start) {
1347         int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1348         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1349       }
1350       *nextTokPtr = ptr;
1351       return XML_TOK_DATA_CHARS;
1352     case BT_LF:
1353       if (ptr == start) {
1354         *nextTokPtr = ptr + MINBPC(enc);
1355         return XML_TOK_DATA_NEWLINE;
1356       }
1357       *nextTokPtr = ptr;
1358       return XML_TOK_DATA_CHARS;
1359     case BT_CR:
1360       if (ptr == start) {
1361         ptr += MINBPC(enc);
1362         if (! HAS_CHAR(enc, ptr, end))
1363           return XML_TOK_TRAILING_CR;
1364         if (BYTE_TYPE(enc, ptr) == BT_LF)
1365           ptr += MINBPC(enc);
1366         *nextTokPtr = ptr;
1367         return XML_TOK_DATA_NEWLINE;
1368       }
1369       *nextTokPtr = ptr;
1370       return XML_TOK_DATA_CHARS;
1371     default:
1372       ptr += MINBPC(enc);
1373       break;
1374     }
1375   }
1376   *nextTokPtr = ptr;
1377   return XML_TOK_DATA_CHARS;
1378 }
1379 
1380 #  ifdef XML_DTD
1381 
1382 static int PTRCALL
1383 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1384                          const char **nextTokPtr) {
1385   int level = 0;
1386   if (MINBPC(enc) > 1) {
1387     size_t n = end - ptr;
1388     if (n & (MINBPC(enc) - 1)) {
1389       n &= ~(MINBPC(enc) - 1);
1390       end = ptr + n;
1391     }
1392   }
1393   while (HAS_CHAR(enc, ptr, end)) {
1394     switch (BYTE_TYPE(enc, ptr)) {
1395       INVALID_CASES(ptr, nextTokPtr)
1396     case BT_LT:
1397       ptr += MINBPC(enc);
1398       REQUIRE_CHAR(enc, ptr, end);
1399       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1400         ptr += MINBPC(enc);
1401         REQUIRE_CHAR(enc, ptr, end);
1402         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1403           ++level;
1404           ptr += MINBPC(enc);
1405         }
1406       }
1407       break;
1408     case BT_RSQB:
1409       ptr += MINBPC(enc);
1410       REQUIRE_CHAR(enc, ptr, end);
1411       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1412         ptr += MINBPC(enc);
1413         REQUIRE_CHAR(enc, ptr, end);
1414         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1415           ptr += MINBPC(enc);
1416           if (level == 0) {
1417             *nextTokPtr = ptr;
1418             return XML_TOK_IGNORE_SECT;
1419           }
1420           --level;
1421         }
1422       }
1423       break;
1424     default:
1425       ptr += MINBPC(enc);
1426       break;
1427     }
1428   }
1429   return XML_TOK_PARTIAL;
1430 }
1431 
1432 #  endif /* XML_DTD */
1433 
1434 static int PTRCALL
1435 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1436                    const char **badPtr) {
1437   ptr += MINBPC(enc);
1438   end -= MINBPC(enc);
1439   for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1440     switch (BYTE_TYPE(enc, ptr)) {
1441     case BT_DIGIT:
1442     case BT_HEX:
1443     case BT_MINUS:
1444     case BT_APOS:
1445     case BT_LPAR:
1446     case BT_RPAR:
1447     case BT_PLUS:
1448     case BT_COMMA:
1449     case BT_SOL:
1450     case BT_EQUALS:
1451     case BT_QUEST:
1452     case BT_CR:
1453     case BT_LF:
1454     case BT_SEMI:
1455     case BT_EXCL:
1456     case BT_AST:
1457     case BT_PERCNT:
1458     case BT_NUM:
1459 #  ifdef XML_NS
1460     case BT_COLON:
1461 #  endif
1462       break;
1463     case BT_S:
1464       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1465         *badPtr = ptr;
1466         return 0;
1467       }
1468       break;
1469     case BT_NAME:
1470     case BT_NMSTRT:
1471       if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1472         break;
1473       /* fall through */
1474     default:
1475       switch (BYTE_TO_ASCII(enc, ptr)) {
1476       case 0x24: /* $ */
1477       case 0x40: /* @ */
1478         break;
1479       default:
1480         *badPtr = ptr;
1481         return 0;
1482       }
1483       break;
1484     }
1485   }
1486   return 1;
1487 }
1488 
1489 /* This must only be called for a well-formed start-tag or empty
1490    element tag.  Returns the number of attributes.  Pointers to the
1491    first attsMax attributes are stored in atts.
1492 */
1493 
1494 static int PTRCALL
1495 PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
1496                 ATTRIBUTE *atts) {
1497   enum { other, inName, inValue } state = inName;
1498   int nAtts = 0;
1499   int open = 0; /* defined when state == inValue;
1500                    initialization just to shut up compilers */
1501 
1502   for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1503     switch (BYTE_TYPE(enc, ptr)) {
1504 #  define START_NAME                                                           \
1505     if (state == other) {                                                      \
1506       if (nAtts < attsMax) {                                                   \
1507         atts[nAtts].name = ptr;                                                \
1508         atts[nAtts].normalized = 1;                                            \
1509       }                                                                        \
1510       state = inName;                                                          \
1511     }
1512 #  define LEAD_CASE(n)                                                         \
1513   case BT_LEAD##n:                                                             \
1514     START_NAME ptr += (n - MINBPC(enc));                                       \
1515     break;
1516       LEAD_CASE(2)
1517       LEAD_CASE(3)
1518       LEAD_CASE(4)
1519 #  undef LEAD_CASE
1520     case BT_NONASCII:
1521     case BT_NMSTRT:
1522     case BT_HEX:
1523       START_NAME
1524       break;
1525 #  undef START_NAME
1526     case BT_QUOT:
1527       if (state != inValue) {
1528         if (nAtts < attsMax)
1529           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1530         state = inValue;
1531         open = BT_QUOT;
1532       } else if (open == BT_QUOT) {
1533         state = other;
1534         if (nAtts < attsMax)
1535           atts[nAtts].valueEnd = ptr;
1536         nAtts++;
1537       }
1538       break;
1539     case BT_APOS:
1540       if (state != inValue) {
1541         if (nAtts < attsMax)
1542           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1543         state = inValue;
1544         open = BT_APOS;
1545       } else if (open == BT_APOS) {
1546         state = other;
1547         if (nAtts < attsMax)
1548           atts[nAtts].valueEnd = ptr;
1549         nAtts++;
1550       }
1551       break;
1552     case BT_AMP:
1553       if (nAtts < attsMax)
1554         atts[nAtts].normalized = 0;
1555       break;
1556     case BT_S:
1557       if (state == inName)
1558         state = other;
1559       else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
1560                && (ptr == atts[nAtts].valuePtr
1561                    || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1562                    || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1563                    || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1564         atts[nAtts].normalized = 0;
1565       break;
1566     case BT_CR:
1567     case BT_LF:
1568       /* This case ensures that the first attribute name is counted
1569          Apart from that we could just change state on the quote. */
1570       if (state == inName)
1571         state = other;
1572       else if (state == inValue && nAtts < attsMax)
1573         atts[nAtts].normalized = 0;
1574       break;
1575     case BT_GT:
1576     case BT_SOL:
1577       if (state != inValue)
1578         return nAtts;
1579       break;
1580     default:
1581       break;
1582     }
1583   }
1584   /* not reached */
1585 }
1586 
1587 static int PTRFASTCALL
1588 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
1589   int result = 0;
1590   /* skip &# */
1591   UNUSED_P(enc);
1592   ptr += 2 * MINBPC(enc);
1593   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1594     for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1595          ptr += MINBPC(enc)) {
1596       int c = BYTE_TO_ASCII(enc, ptr);
1597       switch (c) {
1598       case ASCII_0:
1599       case ASCII_1:
1600       case ASCII_2:
1601       case ASCII_3:
1602       case ASCII_4:
1603       case ASCII_5:
1604       case ASCII_6:
1605       case ASCII_7:
1606       case ASCII_8:
1607       case ASCII_9:
1608         result <<= 4;
1609         result |= (c - ASCII_0);
1610         break;
1611       case ASCII_A:
1612       case ASCII_B:
1613       case ASCII_C:
1614       case ASCII_D:
1615       case ASCII_E:
1616       case ASCII_F:
1617         result <<= 4;
1618         result += 10 + (c - ASCII_A);
1619         break;
1620       case ASCII_a:
1621       case ASCII_b:
1622       case ASCII_c:
1623       case ASCII_d:
1624       case ASCII_e:
1625       case ASCII_f:
1626         result <<= 4;
1627         result += 10 + (c - ASCII_a);
1628         break;
1629       }
1630       if (result >= 0x110000)
1631         return -1;
1632     }
1633   } else {
1634     for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1635       int c = BYTE_TO_ASCII(enc, ptr);
1636       result *= 10;
1637       result += (c - ASCII_0);
1638       if (result >= 0x110000)
1639         return -1;
1640     }
1641   }
1642   return checkCharRefNumber(result);
1643 }
1644 
1645 static int PTRCALL
1646 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1647                              const char *end) {
1648   UNUSED_P(enc);
1649   switch ((end - ptr) / MINBPC(enc)) {
1650   case 2:
1651     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1652       switch (BYTE_TO_ASCII(enc, ptr)) {
1653       case ASCII_l:
1654         return ASCII_LT;
1655       case ASCII_g:
1656         return ASCII_GT;
1657       }
1658     }
1659     break;
1660   case 3:
1661     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1662       ptr += MINBPC(enc);
1663       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1664         ptr += MINBPC(enc);
1665         if (CHAR_MATCHES(enc, ptr, ASCII_p))
1666           return ASCII_AMP;
1667       }
1668     }
1669     break;
1670   case 4:
1671     switch (BYTE_TO_ASCII(enc, ptr)) {
1672     case ASCII_q:
1673       ptr += MINBPC(enc);
1674       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1675         ptr += MINBPC(enc);
1676         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1677           ptr += MINBPC(enc);
1678           if (CHAR_MATCHES(enc, ptr, ASCII_t))
1679             return ASCII_QUOT;
1680         }
1681       }
1682       break;
1683     case ASCII_a:
1684       ptr += MINBPC(enc);
1685       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1686         ptr += MINBPC(enc);
1687         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1688           ptr += MINBPC(enc);
1689           if (CHAR_MATCHES(enc, ptr, ASCII_s))
1690             return ASCII_APOS;
1691         }
1692       }
1693       break;
1694     }
1695   }
1696   return 0;
1697 }
1698 
1699 static int PTRCALL
1700 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1701                          const char *end1, const char *ptr2) {
1702   UNUSED_P(enc);
1703   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1704     if (end1 - ptr1 < MINBPC(enc)) {
1705       /* This line cannot be executed.  The incoming data has already
1706        * been tokenized once, so incomplete characters like this have
1707        * already been eliminated from the input.  Retaining the
1708        * paranoia check is still valuable, however.
1709        */
1710       return 0; /* LCOV_EXCL_LINE */
1711     }
1712     if (! CHAR_MATCHES(enc, ptr1, *ptr2))
1713       return 0;
1714   }
1715   return ptr1 == end1;
1716 }
1717 
1718 static int PTRFASTCALL
1719 PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
1720   const char *start = ptr;
1721   for (;;) {
1722     switch (BYTE_TYPE(enc, ptr)) {
1723 #  define LEAD_CASE(n)                                                         \
1724   case BT_LEAD##n:                                                             \
1725     ptr += n;                                                                  \
1726     break;
1727       LEAD_CASE(2)
1728       LEAD_CASE(3)
1729       LEAD_CASE(4)
1730 #  undef LEAD_CASE
1731     case BT_NONASCII:
1732     case BT_NMSTRT:
1733 #  ifdef XML_NS
1734     case BT_COLON:
1735 #  endif
1736     case BT_HEX:
1737     case BT_DIGIT:
1738     case BT_NAME:
1739     case BT_MINUS:
1740       ptr += MINBPC(enc);
1741       break;
1742     default:
1743       return (int)(ptr - start);
1744     }
1745   }
1746 }
1747 
1748 static const char *PTRFASTCALL
1749 PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
1750   for (;;) {
1751     switch (BYTE_TYPE(enc, ptr)) {
1752     case BT_LF:
1753     case BT_CR:
1754     case BT_S:
1755       ptr += MINBPC(enc);
1756       break;
1757     default:
1758       return ptr;
1759     }
1760   }
1761 }
1762 
1763 static void PTRCALL
1764 PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
1765                        POSITION *pos) {
1766   while (HAS_CHAR(enc, ptr, end)) {
1767     switch (BYTE_TYPE(enc, ptr)) {
1768 #  define LEAD_CASE(n)                                                         \
1769   case BT_LEAD##n:                                                             \
1770     ptr += n;                                                                  \
1771     break;
1772       LEAD_CASE(2)
1773       LEAD_CASE(3)
1774       LEAD_CASE(4)
1775 #  undef LEAD_CASE
1776     case BT_LF:
1777       pos->columnNumber = (XML_Size)-1;
1778       pos->lineNumber++;
1779       ptr += MINBPC(enc);
1780       break;
1781     case BT_CR:
1782       pos->lineNumber++;
1783       ptr += MINBPC(enc);
1784       if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1785         ptr += MINBPC(enc);
1786       pos->columnNumber = (XML_Size)-1;
1787       break;
1788     default:
1789       ptr += MINBPC(enc);
1790       break;
1791     }
1792     pos->columnNumber++;
1793   }
1794 }
1795 
1796 #  undef DO_LEAD_CASE
1797 #  undef MULTIBYTE_CASES
1798 #  undef INVALID_CASES
1799 #  undef CHECK_NAME_CASE
1800 #  undef CHECK_NAME_CASES
1801 #  undef CHECK_NMSTRT_CASE
1802 #  undef CHECK_NMSTRT_CASES
1803 
1804 #endif /* XML_TOK_IMPL_C */
1805