1 /* This file is included!
2                             __  __            _
3                          ___\ \/ /_ __   __ _| |_
4                         / _ \\  /| '_ \ / _` | __|
5                        |  __//  \| |_) | (_| | |_
6                         \___/_/\_\ .__/ \__,_|\__|
7                                  |_| XML parser
8 
9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10    Copyright (c) 2000-2017 Expat development team
11    Licensed under the MIT license:
12 
13    Permission is  hereby granted,  free of charge,  to any  person obtaining
14    a  copy  of  this  software   and  associated  documentation  files  (the
15    "Software"),  to  deal in  the  Software  without restriction,  including
16    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
17    distribute, sublicense, and/or sell copies of the Software, and to permit
18    persons  to whom  the Software  is  furnished to  do so,  subject to  the
19    following conditions:
20 
21    The above copyright  notice and this permission notice  shall be included
22    in all copies or substantial portions of the Software.
23 
24    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
25    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
26    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
29    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30    USE OR OTHER DEALINGS IN THE SOFTWARE.
31 */
32 
33 #ifdef XML_TOK_IMPL_C
34 
/* Well-formedness test for an n-byte sequence starting at ptr.  An
   includer may provide its own definition before this point; without
   one, every sequence is accepted. */
#ifndef IS_INVALID_CHAR
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif

/* Expands to the `case BT_LEAD<n>:` arm of a byte-type switch.
   `enc` and `end` are not parameters: they must be in scope where the
   macro is expanded.
     - fewer than n bytes left      -> return XML_TOK_PARTIAL_CHAR
     - IS_INVALID_CHAR rejects it   -> *nextTokPtr = ptr,
                                       return XML_TOK_INVALID
     - otherwise                    -> ptr advances by n bytes and the
                                       enclosing switch is left */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
    case BT_LEAD ## n: \
      if (end - ptr < n) \
        return XML_TOK_PARTIAL_CHAR; \
      if (!IS_INVALID_CHAR(enc, ptr, n)) { \
        ptr += n; \
        break; \
      } \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID;

/* Switch arms shared by the scanners: the three BT_LEAD<n> arms above,
   plus BT_NONXML, BT_MALFORM and BT_TRAIL, which are always reported as
   XML_TOK_INVALID with *nextTokPtr left on the offending byte. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_TRAIL: \
  case BT_MALFORM: \
  case BT_NONXML: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
59 
/* Expands to the `case BT_LEAD<n>:` arm used while scanning the inside
   of a name.
     - fewer than n bytes left                 -> XML_TOK_PARTIAL_CHAR
     - sequence ill-formed or not a name char  -> *nextTokPtr = ptr,
                                                  XML_TOK_INVALID
     - otherwise ptr advances by n and the enclosing switch is left.
   Ill-formed sequences are rejected first (as INVALID_LEAD_CASE does),
   so IS_NAME_CHAR is only consulted for sequences that IS_INVALID_CHAR
   accepts. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NAME_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;

/* Switch arms that consume one name character of any width.
   BT_NONASCII is checked with IS_NAME_CHAR_MINBPC and then shares the
   MINBPC-wide advance with the single-unit name byte types. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)

/* Same contract as CHECK_NAME_CASE, but for the first character of a
   name: the sequence must be well-formed and satisfy IS_NMSTRT_CHAR. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (IS_INVALID_CHAR(enc, ptr, n) || !IS_NMSTRT_CHAR(enc, ptr, n)) { \
       *nextTokPtr = ptr; \
       return XML_TOK_INVALID; \
     } \
     ptr += n; \
     break;

/* Switch arms that consume one name-start character of any width.
   BT_NONASCII is checked with IS_NMSTRT_CHAR_MINBPC and then shares the
   MINBPC-wide advance with BT_NMSTRT and BT_HEX. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
112 
/* PREFIX decorates the names of the functions defined below.  An
   includer may define it first; the fallback leaves names unchanged. */
#ifndef PREFIX
#define PREFIX(ident) ident
#endif


/* True when at least `count` characters of MINBPC(enc) bytes each lie
   between ptr and end.  Every argument is parenthesized so that
   expression arguments such as `a + b` are evaluated as a unit. */
#define HAS_CHARS(enc, ptr, end, count) \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one character lies between ptr and end. */
#define HAS_CHAR(enc, ptr, end) \
    HAS_CHARS(enc, ptr, end, 1)

/* Returns XML_TOK_PARTIAL from the enclosing function unless `count`
   characters are available.  The brace form is kept as-is because
   expansion sites outside this view cannot be checked for a trailing
   semicolon. */
#define REQUIRE_CHARS(enc, ptr, end, count) \
    { \
      if (! HAS_CHARS(enc, ptr, end, count)) { \
        return XML_TOK_PARTIAL; \
      } \
    }

/* Returns XML_TOK_PARTIAL from the enclosing function unless one
   character is available. */
#define REQUIRE_CHAR(enc, ptr, end) \
    REQUIRE_CHARS(enc, ptr, end, 1)
133 
134 
135 /* ptr points to character following "<!-" */
136 
137 static int PTRCALL
PREFIX(scanComment)138 PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
139                     const char *end, const char **nextTokPtr)
140 {
141   if (HAS_CHAR(enc, ptr, end)) {
142     if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
143       *nextTokPtr = ptr;
144       return XML_TOK_INVALID;
145     }
146     ptr += MINBPC(enc);
147     while (HAS_CHAR(enc, ptr, end)) {
148       switch (BYTE_TYPE(enc, ptr)) {
149       INVALID_CASES(ptr, nextTokPtr)
150       case BT_MINUS:
151         ptr += MINBPC(enc);
152         REQUIRE_CHAR(enc, ptr, end);
153         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
154           ptr += MINBPC(enc);
155           REQUIRE_CHAR(enc, ptr, end);
156           if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
157             *nextTokPtr = ptr;
158             return XML_TOK_INVALID;
159           }
160           *nextTokPtr = ptr + MINBPC(enc);
161           return XML_TOK_COMMENT;
162         }
163         break;
164       default:
165         ptr += MINBPC(enc);
166         break;
167       }
168     }
169   }
170   return XML_TOK_PARTIAL;
171 }
172 
173 /* ptr points to character following "<!" */
174 
175 static int PTRCALL
PREFIX(scanDecl)176 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
177                  const char *end, const char **nextTokPtr)
178 {
179   REQUIRE_CHAR(enc, ptr, end);
180   switch (BYTE_TYPE(enc, ptr)) {
181   case BT_MINUS:
182     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
183   case BT_LSQB:
184     *nextTokPtr = ptr + MINBPC(enc);
185     return XML_TOK_COND_SECT_OPEN;
186   case BT_NMSTRT:
187   case BT_HEX:
188     ptr += MINBPC(enc);
189     break;
190   default:
191     *nextTokPtr = ptr;
192     return XML_TOK_INVALID;
193   }
194   while (HAS_CHAR(enc, ptr, end)) {
195     switch (BYTE_TYPE(enc, ptr)) {
196     case BT_PERCNT:
197       REQUIRE_CHARS(enc, ptr, end, 2);
198       /* don't allow <!ENTITY% foo "whatever"> */
199       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
200       case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
201         *nextTokPtr = ptr;
202         return XML_TOK_INVALID;
203       }
204       /* fall through */
205     case BT_S: case BT_CR: case BT_LF:
206       *nextTokPtr = ptr;
207       return XML_TOK_DECL_OPEN;
208     case BT_NMSTRT:
209     case BT_HEX:
210       ptr += MINBPC(enc);
211       break;
212     default:
213       *nextTokPtr = ptr;
214       return XML_TOK_INVALID;
215     }
216   }
217   return XML_TOK_PARTIAL;
218 }
219 
220 static int PTRCALL
PREFIX(checkPiTarget)221 PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr,
222                       const char *end, int *tokPtr)
223 {
224   int upper = 0;
225   *tokPtr = XML_TOK_PI;
226   if (end - ptr != MINBPC(enc)*3)
227     return 1;
228   switch (BYTE_TO_ASCII(enc, ptr)) {
229   case ASCII_x:
230     break;
231   case ASCII_X:
232     upper = 1;
233     break;
234   default:
235     return 1;
236   }
237   ptr += MINBPC(enc);
238   switch (BYTE_TO_ASCII(enc, ptr)) {
239   case ASCII_m:
240     break;
241   case ASCII_M:
242     upper = 1;
243     break;
244   default:
245     return 1;
246   }
247   ptr += MINBPC(enc);
248   switch (BYTE_TO_ASCII(enc, ptr)) {
249   case ASCII_l:
250     break;
251   case ASCII_L:
252     upper = 1;
253     break;
254   default:
255     return 1;
256   }
257   if (upper)
258     return 0;
259   *tokPtr = XML_TOK_XML_DECL;
260   return 1;
261 }
262 
263 /* ptr points to character following "<?" */
264 
265 static int PTRCALL
PREFIX(scanPi)266 PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
267                const char *end, const char **nextTokPtr)
268 {
269   int tok;
270   const char *target = ptr;
271   REQUIRE_CHAR(enc, ptr, end);
272   switch (BYTE_TYPE(enc, ptr)) {
273   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
274   default:
275     *nextTokPtr = ptr;
276     return XML_TOK_INVALID;
277   }
278   while (HAS_CHAR(enc, ptr, end)) {
279     switch (BYTE_TYPE(enc, ptr)) {
280     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
281     case BT_S: case BT_CR: case BT_LF:
282       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
283         *nextTokPtr = ptr;
284         return XML_TOK_INVALID;
285       }
286       ptr += MINBPC(enc);
287       while (HAS_CHAR(enc, ptr, end)) {
288         switch (BYTE_TYPE(enc, ptr)) {
289         INVALID_CASES(ptr, nextTokPtr)
290         case BT_QUEST:
291           ptr += MINBPC(enc);
292           REQUIRE_CHAR(enc, ptr, end);
293           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
294             *nextTokPtr = ptr + MINBPC(enc);
295             return tok;
296           }
297           break;
298         default:
299           ptr += MINBPC(enc);
300           break;
301         }
302       }
303       return XML_TOK_PARTIAL;
304     case BT_QUEST:
305       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
306         *nextTokPtr = ptr;
307         return XML_TOK_INVALID;
308       }
309       ptr += MINBPC(enc);
310       REQUIRE_CHAR(enc, ptr, end);
311       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
312         *nextTokPtr = ptr + MINBPC(enc);
313         return tok;
314       }
315       /* fall through */
316     default:
317       *nextTokPtr = ptr;
318       return XML_TOK_INVALID;
319     }
320   }
321   return XML_TOK_PARTIAL;
322 }
323 
324 static int PTRCALL
PREFIX(scanCdataSection)325 PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr,
326                          const char *end, const char **nextTokPtr)
327 {
328   static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
329                                      ASCII_T, ASCII_A, ASCII_LSQB };
330   int i;
331   /* CDATA[ */
332   REQUIRE_CHARS(enc, ptr, end, 6);
333   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
334     if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
335       *nextTokPtr = ptr;
336       return XML_TOK_INVALID;
337     }
338   }
339   *nextTokPtr = ptr;
340   return XML_TOK_CDATA_SECT_OPEN;
341 }
342 
343 static int PTRCALL
PREFIX(cdataSectionTok)344 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
345                         const char *end, const char **nextTokPtr)
346 {
347   if (ptr >= end)
348     return XML_TOK_NONE;
349   if (MINBPC(enc) > 1) {
350     size_t n = end - ptr;
351     if (n & (MINBPC(enc) - 1)) {
352       n &= ~(MINBPC(enc) - 1);
353       if (n == 0)
354         return XML_TOK_PARTIAL;
355       end = ptr + n;
356     }
357   }
358   switch (BYTE_TYPE(enc, ptr)) {
359   case BT_RSQB:
360     ptr += MINBPC(enc);
361     REQUIRE_CHAR(enc, ptr, end);
362     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
363       break;
364     ptr += MINBPC(enc);
365     REQUIRE_CHAR(enc, ptr, end);
366     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
367       ptr -= MINBPC(enc);
368       break;
369     }
370     *nextTokPtr = ptr + MINBPC(enc);
371     return XML_TOK_CDATA_SECT_CLOSE;
372   case BT_CR:
373     ptr += MINBPC(enc);
374     REQUIRE_CHAR(enc, ptr, end);
375     if (BYTE_TYPE(enc, ptr) == BT_LF)
376       ptr += MINBPC(enc);
377     *nextTokPtr = ptr;
378     return XML_TOK_DATA_NEWLINE;
379   case BT_LF:
380     *nextTokPtr = ptr + MINBPC(enc);
381     return XML_TOK_DATA_NEWLINE;
382   INVALID_CASES(ptr, nextTokPtr)
383   default:
384     ptr += MINBPC(enc);
385     break;
386   }
387   while (HAS_CHAR(enc, ptr, end)) {
388     switch (BYTE_TYPE(enc, ptr)) {
389 #define LEAD_CASE(n) \
390     case BT_LEAD ## n: \
391       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
392         *nextTokPtr = ptr; \
393         return XML_TOK_DATA_CHARS; \
394       } \
395       ptr += n; \
396       break;
397     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
398 #undef LEAD_CASE
399     case BT_NONXML:
400     case BT_MALFORM:
401     case BT_TRAIL:
402     case BT_CR:
403     case BT_LF:
404     case BT_RSQB:
405       *nextTokPtr = ptr;
406       return XML_TOK_DATA_CHARS;
407     default:
408       ptr += MINBPC(enc);
409       break;
410     }
411   }
412   *nextTokPtr = ptr;
413   return XML_TOK_DATA_CHARS;
414 }
415 
416 /* ptr points to character following "</" */
417 
418 static int PTRCALL
PREFIX(scanEndTag)419 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
420                    const char *end, const char **nextTokPtr)
421 {
422   REQUIRE_CHAR(enc, ptr, end);
423   switch (BYTE_TYPE(enc, ptr)) {
424   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
425   default:
426     *nextTokPtr = ptr;
427     return XML_TOK_INVALID;
428   }
429   while (HAS_CHAR(enc, ptr, end)) {
430     switch (BYTE_TYPE(enc, ptr)) {
431     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
432     case BT_S: case BT_CR: case BT_LF:
433       for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
434         switch (BYTE_TYPE(enc, ptr)) {
435         case BT_S: case BT_CR: case BT_LF:
436           break;
437         case BT_GT:
438           *nextTokPtr = ptr + MINBPC(enc);
439           return XML_TOK_END_TAG;
440         default:
441           *nextTokPtr = ptr;
442           return XML_TOK_INVALID;
443         }
444       }
445       return XML_TOK_PARTIAL;
446 #ifdef XML_NS
447     case BT_COLON:
448       /* no need to check qname syntax here,
449          since end-tag must match exactly */
450       ptr += MINBPC(enc);
451       break;
452 #endif
453     case BT_GT:
454       *nextTokPtr = ptr + MINBPC(enc);
455       return XML_TOK_END_TAG;
456     default:
457       *nextTokPtr = ptr;
458       return XML_TOK_INVALID;
459     }
460   }
461   return XML_TOK_PARTIAL;
462 }
463 
464 /* ptr points to character following "&#X" */
465 
466 static int PTRCALL
PREFIX(scanHexCharRef)467 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
468                        const char *end, const char **nextTokPtr)
469 {
470   if (HAS_CHAR(enc, ptr, end)) {
471     switch (BYTE_TYPE(enc, ptr)) {
472     case BT_DIGIT:
473     case BT_HEX:
474       break;
475     default:
476       *nextTokPtr = ptr;
477       return XML_TOK_INVALID;
478     }
479     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
480       switch (BYTE_TYPE(enc, ptr)) {
481       case BT_DIGIT:
482       case BT_HEX:
483         break;
484       case BT_SEMI:
485         *nextTokPtr = ptr + MINBPC(enc);
486         return XML_TOK_CHAR_REF;
487       default:
488         *nextTokPtr = ptr;
489         return XML_TOK_INVALID;
490       }
491     }
492   }
493   return XML_TOK_PARTIAL;
494 }
495 
496 /* ptr points to character following "&#" */
497 
498 static int PTRCALL
PREFIX(scanCharRef)499 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
500                     const char *end, const char **nextTokPtr)
501 {
502   if (HAS_CHAR(enc, ptr, end)) {
503     if (CHAR_MATCHES(enc, ptr, ASCII_x))
504       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
505     switch (BYTE_TYPE(enc, ptr)) {
506     case BT_DIGIT:
507       break;
508     default:
509       *nextTokPtr = ptr;
510       return XML_TOK_INVALID;
511     }
512     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
513       switch (BYTE_TYPE(enc, ptr)) {
514       case BT_DIGIT:
515         break;
516       case BT_SEMI:
517         *nextTokPtr = ptr + MINBPC(enc);
518         return XML_TOK_CHAR_REF;
519       default:
520         *nextTokPtr = ptr;
521         return XML_TOK_INVALID;
522       }
523     }
524   }
525   return XML_TOK_PARTIAL;
526 }
527 
528 /* ptr points to character following "&" */
529 
530 static int PTRCALL
PREFIX(scanRef)531 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
532                 const char **nextTokPtr)
533 {
534   REQUIRE_CHAR(enc, ptr, end);
535   switch (BYTE_TYPE(enc, ptr)) {
536   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
537   case BT_NUM:
538     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
539   default:
540     *nextTokPtr = ptr;
541     return XML_TOK_INVALID;
542   }
543   while (HAS_CHAR(enc, ptr, end)) {
544     switch (BYTE_TYPE(enc, ptr)) {
545     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
546     case BT_SEMI:
547       *nextTokPtr = ptr + MINBPC(enc);
548       return XML_TOK_ENTITY_REF;
549     default:
550       *nextTokPtr = ptr;
551       return XML_TOK_INVALID;
552     }
553   }
554   return XML_TOK_PARTIAL;
555 }
556 
557 /* ptr points to character following first character of attribute name */
558 
559 static int PTRCALL
PREFIX(scanAtts)560 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
561                  const char **nextTokPtr)
562 {
563 #ifdef XML_NS
564   int hadColon = 0;
565 #endif
566   while (HAS_CHAR(enc, ptr, end)) {
567     switch (BYTE_TYPE(enc, ptr)) {
568     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
569 #ifdef XML_NS
570     case BT_COLON:
571       if (hadColon) {
572         *nextTokPtr = ptr;
573         return XML_TOK_INVALID;
574       }
575       hadColon = 1;
576       ptr += MINBPC(enc);
577       REQUIRE_CHAR(enc, ptr, end);
578       switch (BYTE_TYPE(enc, ptr)) {
579       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
580       default:
581         *nextTokPtr = ptr;
582         return XML_TOK_INVALID;
583       }
584       break;
585 #endif
586     case BT_S: case BT_CR: case BT_LF:
587       for (;;) {
588         int t;
589 
590         ptr += MINBPC(enc);
591         REQUIRE_CHAR(enc, ptr, end);
592         t = BYTE_TYPE(enc, ptr);
593         if (t == BT_EQUALS)
594           break;
595         switch (t) {
596         case BT_S:
597         case BT_LF:
598         case BT_CR:
599           break;
600         default:
601           *nextTokPtr = ptr;
602           return XML_TOK_INVALID;
603         }
604       }
605     /* fall through */
606     case BT_EQUALS:
607       {
608         int open;
609 #ifdef XML_NS
610         hadColon = 0;
611 #endif
612         for (;;) {
613           ptr += MINBPC(enc);
614           REQUIRE_CHAR(enc, ptr, end);
615           open = BYTE_TYPE(enc, ptr);
616           if (open == BT_QUOT || open == BT_APOS)
617             break;
618           switch (open) {
619           case BT_S:
620           case BT_LF:
621           case BT_CR:
622             break;
623           default:
624             *nextTokPtr = ptr;
625             return XML_TOK_INVALID;
626           }
627         }
628         ptr += MINBPC(enc);
629         /* in attribute value */
630         for (;;) {
631           int t;
632           REQUIRE_CHAR(enc, ptr, end);
633           t = BYTE_TYPE(enc, ptr);
634           if (t == open)
635             break;
636           switch (t) {
637           INVALID_CASES(ptr, nextTokPtr)
638           case BT_AMP:
639             {
640               int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
641               if (tok <= 0) {
642                 if (tok == XML_TOK_INVALID)
643                   *nextTokPtr = ptr;
644                 return tok;
645               }
646               break;
647             }
648           case BT_LT:
649             *nextTokPtr = ptr;
650             return XML_TOK_INVALID;
651           default:
652             ptr += MINBPC(enc);
653             break;
654           }
655         }
656         ptr += MINBPC(enc);
657         REQUIRE_CHAR(enc, ptr, end);
658         switch (BYTE_TYPE(enc, ptr)) {
659         case BT_S:
660         case BT_CR:
661         case BT_LF:
662           break;
663         case BT_SOL:
664           goto sol;
665         case BT_GT:
666           goto gt;
667         default:
668           *nextTokPtr = ptr;
669           return XML_TOK_INVALID;
670         }
671         /* ptr points to closing quote */
672         for (;;) {
673           ptr += MINBPC(enc);
674           REQUIRE_CHAR(enc, ptr, end);
675           switch (BYTE_TYPE(enc, ptr)) {
676           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
677           case BT_S: case BT_CR: case BT_LF:
678             continue;
679           case BT_GT:
680           gt:
681             *nextTokPtr = ptr + MINBPC(enc);
682             return XML_TOK_START_TAG_WITH_ATTS;
683           case BT_SOL:
684           sol:
685             ptr += MINBPC(enc);
686             REQUIRE_CHAR(enc, ptr, end);
687             if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
688               *nextTokPtr = ptr;
689               return XML_TOK_INVALID;
690             }
691             *nextTokPtr = ptr + MINBPC(enc);
692             return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
693           default:
694             *nextTokPtr = ptr;
695             return XML_TOK_INVALID;
696           }
697           break;
698         }
699         break;
700       }
701     default:
702       *nextTokPtr = ptr;
703       return XML_TOK_INVALID;
704     }
705   }
706   return XML_TOK_PARTIAL;
707 }
708 
709 /* ptr points to character following "<" */
710 
711 static int PTRCALL
PREFIX(scanLt)712 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
713                const char **nextTokPtr)
714 {
715 #ifdef XML_NS
716   int hadColon;
717 #endif
718   REQUIRE_CHAR(enc, ptr, end);
719   switch (BYTE_TYPE(enc, ptr)) {
720   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
721   case BT_EXCL:
722     ptr += MINBPC(enc);
723     REQUIRE_CHAR(enc, ptr, end);
724     switch (BYTE_TYPE(enc, ptr)) {
725     case BT_MINUS:
726       return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
727     case BT_LSQB:
728       return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
729                                       end, nextTokPtr);
730     }
731     *nextTokPtr = ptr;
732     return XML_TOK_INVALID;
733   case BT_QUEST:
734     return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
735   case BT_SOL:
736     return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
737   default:
738     *nextTokPtr = ptr;
739     return XML_TOK_INVALID;
740   }
741 #ifdef XML_NS
742   hadColon = 0;
743 #endif
744   /* we have a start-tag */
745   while (HAS_CHAR(enc, ptr, end)) {
746     switch (BYTE_TYPE(enc, ptr)) {
747     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
748 #ifdef XML_NS
749     case BT_COLON:
750       if (hadColon) {
751         *nextTokPtr = ptr;
752         return XML_TOK_INVALID;
753       }
754       hadColon = 1;
755       ptr += MINBPC(enc);
756       REQUIRE_CHAR(enc, ptr, end);
757       switch (BYTE_TYPE(enc, ptr)) {
758       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
759       default:
760         *nextTokPtr = ptr;
761         return XML_TOK_INVALID;
762       }
763       break;
764 #endif
765     case BT_S: case BT_CR: case BT_LF:
766       {
767         ptr += MINBPC(enc);
768         while (HAS_CHAR(enc, ptr, end)) {
769           switch (BYTE_TYPE(enc, ptr)) {
770           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
771           case BT_GT:
772             goto gt;
773           case BT_SOL:
774             goto sol;
775           case BT_S: case BT_CR: case BT_LF:
776             ptr += MINBPC(enc);
777             continue;
778           default:
779             *nextTokPtr = ptr;
780             return XML_TOK_INVALID;
781           }
782           return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
783         }
784         return XML_TOK_PARTIAL;
785       }
786     case BT_GT:
787     gt:
788       *nextTokPtr = ptr + MINBPC(enc);
789       return XML_TOK_START_TAG_NO_ATTS;
790     case BT_SOL:
791     sol:
792       ptr += MINBPC(enc);
793       REQUIRE_CHAR(enc, ptr, end);
794       if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
795         *nextTokPtr = ptr;
796         return XML_TOK_INVALID;
797       }
798       *nextTokPtr = ptr + MINBPC(enc);
799       return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
800     default:
801       *nextTokPtr = ptr;
802       return XML_TOK_INVALID;
803     }
804   }
805   return XML_TOK_PARTIAL;
806 }
807 
808 static int PTRCALL
PREFIX(contentTok)809 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
810                    const char **nextTokPtr)
811 {
812   if (ptr >= end)
813     return XML_TOK_NONE;
814   if (MINBPC(enc) > 1) {
815     size_t n = end - ptr;
816     if (n & (MINBPC(enc) - 1)) {
817       n &= ~(MINBPC(enc) - 1);
818       if (n == 0)
819         return XML_TOK_PARTIAL;
820       end = ptr + n;
821     }
822   }
823   switch (BYTE_TYPE(enc, ptr)) {
824   case BT_LT:
825     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
826   case BT_AMP:
827     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
828   case BT_CR:
829     ptr += MINBPC(enc);
830     if (! HAS_CHAR(enc, ptr, end))
831       return XML_TOK_TRAILING_CR;
832     if (BYTE_TYPE(enc, ptr) == BT_LF)
833       ptr += MINBPC(enc);
834     *nextTokPtr = ptr;
835     return XML_TOK_DATA_NEWLINE;
836   case BT_LF:
837     *nextTokPtr = ptr + MINBPC(enc);
838     return XML_TOK_DATA_NEWLINE;
839   case BT_RSQB:
840     ptr += MINBPC(enc);
841     if (! HAS_CHAR(enc, ptr, end))
842       return XML_TOK_TRAILING_RSQB;
843     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
844       break;
845     ptr += MINBPC(enc);
846     if (! HAS_CHAR(enc, ptr, end))
847       return XML_TOK_TRAILING_RSQB;
848     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
849       ptr -= MINBPC(enc);
850       break;
851     }
852     *nextTokPtr = ptr;
853     return XML_TOK_INVALID;
854   INVALID_CASES(ptr, nextTokPtr)
855   default:
856     ptr += MINBPC(enc);
857     break;
858   }
859   while (HAS_CHAR(enc, ptr, end)) {
860     switch (BYTE_TYPE(enc, ptr)) {
861 #define LEAD_CASE(n) \
862     case BT_LEAD ## n: \
863       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
864         *nextTokPtr = ptr; \
865         return XML_TOK_DATA_CHARS; \
866       } \
867       ptr += n; \
868       break;
869     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
870 #undef LEAD_CASE
871     case BT_RSQB:
872       if (HAS_CHARS(enc, ptr, end, 2)) {
873          if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
874            ptr += MINBPC(enc);
875            break;
876          }
877          if (HAS_CHARS(enc, ptr, end, 3)) {
878            if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
879              ptr += MINBPC(enc);
880              break;
881            }
882            *nextTokPtr = ptr + 2*MINBPC(enc);
883            return XML_TOK_INVALID;
884          }
885       }
886       /* fall through */
887     case BT_AMP:
888     case BT_LT:
889     case BT_NONXML:
890     case BT_MALFORM:
891     case BT_TRAIL:
892     case BT_CR:
893     case BT_LF:
894       *nextTokPtr = ptr;
895       return XML_TOK_DATA_CHARS;
896     default:
897       ptr += MINBPC(enc);
898       break;
899     }
900   }
901   *nextTokPtr = ptr;
902   return XML_TOK_DATA_CHARS;
903 }
904 
905 /* ptr points to character following "%" */
906 
907 static int PTRCALL
PREFIX(scanPercent)908 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
909                     const char **nextTokPtr)
910 {
911   REQUIRE_CHAR(enc, ptr, end);
912   switch (BYTE_TYPE(enc, ptr)) {
913   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
914   case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
915     *nextTokPtr = ptr;
916     return XML_TOK_PERCENT;
917   default:
918     *nextTokPtr = ptr;
919     return XML_TOK_INVALID;
920   }
921   while (HAS_CHAR(enc, ptr, end)) {
922     switch (BYTE_TYPE(enc, ptr)) {
923     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
924     case BT_SEMI:
925       *nextTokPtr = ptr + MINBPC(enc);
926       return XML_TOK_PARAM_ENTITY_REF;
927     default:
928       *nextTokPtr = ptr;
929       return XML_TOK_INVALID;
930     }
931   }
932   return XML_TOK_PARTIAL;
933 }
934 
935 static int PTRCALL
PREFIX(scanPoundName)936 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
937                       const char **nextTokPtr)
938 {
939   REQUIRE_CHAR(enc, ptr, end);
940   switch (BYTE_TYPE(enc, ptr)) {
941   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
942   default:
943     *nextTokPtr = ptr;
944     return XML_TOK_INVALID;
945   }
946   while (HAS_CHAR(enc, ptr, end)) {
947     switch (BYTE_TYPE(enc, ptr)) {
948     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
949     case BT_CR: case BT_LF: case BT_S:
950     case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
951       *nextTokPtr = ptr;
952       return XML_TOK_POUND_NAME;
953     default:
954       *nextTokPtr = ptr;
955       return XML_TOK_INVALID;
956     }
957   }
958   return -XML_TOK_POUND_NAME;
959 }
960 
961 static int PTRCALL
PREFIX(scanLit)962 PREFIX(scanLit)(int open, const ENCODING *enc,
963                 const char *ptr, const char *end,
964                 const char **nextTokPtr)
965 {
966   while (HAS_CHAR(enc, ptr, end)) {
967     int t = BYTE_TYPE(enc, ptr);
968     switch (t) {
969     INVALID_CASES(ptr, nextTokPtr)
970     case BT_QUOT:
971     case BT_APOS:
972       ptr += MINBPC(enc);
973       if (t != open)
974         break;
975       if (! HAS_CHAR(enc, ptr, end))
976         return -XML_TOK_LITERAL;
977       *nextTokPtr = ptr;
978       switch (BYTE_TYPE(enc, ptr)) {
979       case BT_S: case BT_CR: case BT_LF:
980       case BT_GT: case BT_PERCNT: case BT_LSQB:
981         return XML_TOK_LITERAL;
982       default:
983         return XML_TOK_INVALID;
984       }
985     default:
986       ptr += MINBPC(enc);
987       break;
988     }
989   }
990   return XML_TOK_PARTIAL;
991 }
992 
/* Scan a single token of the prolog / DTD, starting at ptr.
 *
 *   enc        - encoding used to classify the input (through BYTE_TYPE,
 *                MINBPC and the other per-encoding macros)
 *   ptr, end   - the input to scan is [ptr, end)
 *   nextTokPtr - on the returns made directly by this function it usually
 *                receives the position where scanning should resume; for
 *                XML_TOK_INVALID it receives the offending position.  It
 *                is not written before the XML_TOK_NONE / XML_TOK_PARTIAL
 *                returns at the top, nor before the -XML_TOK_CLOSE_BRACKET,
 *                -XML_TOK_CLOSE_PAREN and final -tok returns.
 *
 * Returns an XML_TOK_* code.  A negated code is returned when the input
 * ended directly after something that looks like that token, so that more
 * data could still extend or change it (a lone CR that may be half of a
 * CR/LF pair, "]" that may become "]]>", ")" that may be followed by an
 * occurrence indicator, a name that may simply continue).
 *
 * Literals, declarations, processing instructions, parameter-entity
 * references and "#name" tokens are delegated to the scan* helpers, whose
 * results are returned unchanged.
 */
static int PTRCALL
PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
                  const char **nextTokPtr)
{
  int tok;
  if (ptr >= end)
    return XML_TOK_NONE;
  /* With multi-byte code units, ignore a trailing fragment that is shorter
     than one unit.  The mask arithmetic only yields the remainder when
     MINBPC(enc) is a power of two. */
  if (MINBPC(enc) > 1) {
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  /* The first character selects the token class. */
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_QUOT:
    return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_APOS:
    return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_LT:
    {
      ptr += MINBPC(enc);
      /* NOTE(review): REQUIRE_CHAR is defined outside this excerpt;
         presumably it returns early when no complete character is
         left -- confirm. */
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_EXCL:
        return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      case BT_QUEST:
        return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      case BT_NMSTRT:
      case BT_HEX:
      case BT_NONASCII:
      case BT_LEAD2:
      case BT_LEAD3:
      case BT_LEAD4:
        /* '<' followed by what may start a name: report the start of the
           instance and hand the '<' itself back to the caller. */
        *nextTokPtr = ptr - MINBPC(enc);
        return XML_TOK_INSTANCE_START;
      }
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  case BT_CR:
    if (ptr + MINBPC(enc) == end) {
      *nextTokPtr = end;
      /* indicate that this might be part of a CR/LF pair */
      return -XML_TOK_PROLOG_S;
    }
    /* fall through */
  case BT_S: case BT_LF:
    /* Swallow the whole run of BT_S / LF / CR characters as one
       XML_TOK_PROLOG_S token. */
    for (;;) {
      ptr += MINBPC(enc);
      if (! HAS_CHAR(enc, ptr, end))
        break;
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_S: case BT_LF:
        break;
      case BT_CR:
        /* don't split CR/LF pair */
        if (ptr + MINBPC(enc) != end)
          break;
        /* fall through */
      default:
        *nextTokPtr = ptr;
        return XML_TOK_PROLOG_S;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_PROLOG_S;
  case BT_PERCNT:
    return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_COMMA:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_COMMA;
  case BT_LSQB:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_BRACKET;
  case BT_RSQB:
    /* "]" alone closes a bracket; "]]>" closes a conditional section. */
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return -XML_TOK_CLOSE_BRACKET;
    if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
      /* NOTE(review): REQUIRE_CHARS is defined outside this excerpt;
         presumably it returns early unless two more characters are
         available -- confirm. */
      REQUIRE_CHARS(enc, ptr, end, 2);
      if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
        *nextTokPtr = ptr + 2*MINBPC(enc);
        return XML_TOK_COND_SECT_CLOSE;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_CLOSE_BRACKET;
  case BT_LPAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_PAREN;
  case BT_RPAR:
    /* ")" may carry an occurrence indicator ("*", "?", "+"), which is
       folded into the token.  Without one it must be followed by one of
       the characters listed below, which is left unconsumed. */
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return -XML_TOK_CLOSE_PAREN;
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_AST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_ASTERISK;
    case BT_QUEST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_QUESTION;
    case BT_PLUS:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_PLUS;
    case BT_CR: case BT_LF: case BT_S:
    case BT_GT: case BT_COMMA: case BT_VERBAR:
    case BT_RPAR:
      *nextTokPtr = ptr;
      return XML_TOK_CLOSE_PAREN;
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_VERBAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OR;
  case BT_GT:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DECL_CLOSE;
  case BT_NUM:
    return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
/* Multi-byte lead bytes: the complete n-byte character must be present;
   it then starts either a name or a name token, or is invalid. */
#define LEAD_CASE(n) \
  case BT_LEAD ## n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
      ptr += n; \
      tok = XML_TOK_NAME; \
      break; \
    } \
    if (IS_NAME_CHAR(enc, ptr, n)) { \
      ptr += n; \
      tok = XML_TOK_NMTOKEN; \
      break; \
    } \
    *nextTokPtr = ptr; \
    return XML_TOK_INVALID;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
  case BT_NMSTRT:
  case BT_HEX:
    tok = XML_TOK_NAME;
    ptr += MINBPC(enc);
    break;
  case BT_DIGIT:
  case BT_NAME:
  case BT_MINUS:
#ifdef XML_NS
  case BT_COLON:
#endif
    tok = XML_TOK_NMTOKEN;
    ptr += MINBPC(enc);
    break;
  case BT_NONASCII:
    if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NAME;
      break;
    }
    if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NMTOKEN;
      break;
    }
    /* fall through */
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  /* Only the name / name-token cases reach this point, with tok set and
     ptr just past the first character.  Scan the rest of the token. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
    /* NOTE(review): CHECK_NAME_CASES is defined outside this excerpt;
       presumably it supplies the cases for name characters -- confirm. */
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_GT: case BT_RPAR: case BT_COMMA:
    case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
    case BT_S: case BT_CR: case BT_LF:
      /* A delimiter ends the token and is left for the next call. */
      *nextTokPtr = ptr;
      return tok;
#ifdef XML_NS
    case BT_COLON:
      /* A colon after a plain name provisionally makes it a prefixed
         name; the inner default case downgrades it to a name token
         depending on the next character.  A colon inside an already
         prefixed name also downgrades it to a name token.  For a name
         token the colon is simply consumed. */
      ptr += MINBPC(enc);
      switch (tok) {
      case XML_TOK_NAME:
        REQUIRE_CHAR(enc, ptr, end);
        tok = XML_TOK_PREFIXED_NAME;
        switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
        default:
          tok = XML_TOK_NMTOKEN;
          break;
        }
        break;
      case XML_TOK_PREFIXED_NAME:
        tok = XML_TOK_NMTOKEN;
        break;
      }
      break;
#endif
    /* An occurrence indicator directly after the token is consumed and
       folded into it, but is rejected after a name token. */
    case BT_PLUS:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_PLUS;
    case BT_AST:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_ASTERISK;
    case BT_QUEST:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_QUESTION;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  /* Input exhausted in the middle of the token: it may still continue. */
  return -tok;
}
1220 
1221 static int PTRCALL
PREFIX(attributeValueTok)1222 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1223                           const char *end, const char **nextTokPtr)
1224 {
1225   const char *start;
1226   if (ptr >= end)
1227     return XML_TOK_NONE;
1228   else if (! HAS_CHAR(enc, ptr, end)) {
1229     /* This line cannot be executed.  The incoming data has already
1230      * been tokenized once, so incomplete characters like this have
1231      * already been eliminated from the input.  Retaining the paranoia
1232      * check is still valuable, however.
1233      */
1234     return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1235   }
1236   start = ptr;
1237   while (HAS_CHAR(enc, ptr, end)) {
1238     switch (BYTE_TYPE(enc, ptr)) {
1239 #define LEAD_CASE(n) \
1240     case BT_LEAD ## n: ptr += n; break;
1241     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1242 #undef LEAD_CASE
1243     case BT_AMP:
1244       if (ptr == start)
1245         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1246       *nextTokPtr = ptr;
1247       return XML_TOK_DATA_CHARS;
1248     case BT_LT:
1249       /* this is for inside entity references */
1250       *nextTokPtr = ptr;
1251       return XML_TOK_INVALID;
1252     case BT_LF:
1253       if (ptr == start) {
1254         *nextTokPtr = ptr + MINBPC(enc);
1255         return XML_TOK_DATA_NEWLINE;
1256       }
1257       *nextTokPtr = ptr;
1258       return XML_TOK_DATA_CHARS;
1259     case BT_CR:
1260       if (ptr == start) {
1261         ptr += MINBPC(enc);
1262         if (! HAS_CHAR(enc, ptr, end))
1263           return XML_TOK_TRAILING_CR;
1264         if (BYTE_TYPE(enc, ptr) == BT_LF)
1265           ptr += MINBPC(enc);
1266         *nextTokPtr = ptr;
1267         return XML_TOK_DATA_NEWLINE;
1268       }
1269       *nextTokPtr = ptr;
1270       return XML_TOK_DATA_CHARS;
1271     case BT_S:
1272       if (ptr == start) {
1273         *nextTokPtr = ptr + MINBPC(enc);
1274         return XML_TOK_ATTRIBUTE_VALUE_S;
1275       }
1276       *nextTokPtr = ptr;
1277       return XML_TOK_DATA_CHARS;
1278     default:
1279       ptr += MINBPC(enc);
1280       break;
1281     }
1282   }
1283   *nextTokPtr = ptr;
1284   return XML_TOK_DATA_CHARS;
1285 }
1286 
1287 static int PTRCALL
PREFIX(entityValueTok)1288 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1289                        const char *end, const char **nextTokPtr)
1290 {
1291   const char *start;
1292   if (ptr >= end)
1293     return XML_TOK_NONE;
1294   else if (! HAS_CHAR(enc, ptr, end)) {
1295     /* This line cannot be executed.  The incoming data has already
1296      * been tokenized once, so incomplete characters like this have
1297      * already been eliminated from the input.  Retaining the paranoia
1298      * check is still valuable, however.
1299      */
1300     return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1301   }
1302   start = ptr;
1303   while (HAS_CHAR(enc, ptr, end)) {
1304     switch (BYTE_TYPE(enc, ptr)) {
1305 #define LEAD_CASE(n) \
1306     case BT_LEAD ## n: ptr += n; break;
1307     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1308 #undef LEAD_CASE
1309     case BT_AMP:
1310       if (ptr == start)
1311         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1312       *nextTokPtr = ptr;
1313       return XML_TOK_DATA_CHARS;
1314     case BT_PERCNT:
1315       if (ptr == start) {
1316         int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1317                                        end, nextTokPtr);
1318         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1319       }
1320       *nextTokPtr = ptr;
1321       return XML_TOK_DATA_CHARS;
1322     case BT_LF:
1323       if (ptr == start) {
1324         *nextTokPtr = ptr + MINBPC(enc);
1325         return XML_TOK_DATA_NEWLINE;
1326       }
1327       *nextTokPtr = ptr;
1328       return XML_TOK_DATA_CHARS;
1329     case BT_CR:
1330       if (ptr == start) {
1331         ptr += MINBPC(enc);
1332         if (! HAS_CHAR(enc, ptr, end))
1333           return XML_TOK_TRAILING_CR;
1334         if (BYTE_TYPE(enc, ptr) == BT_LF)
1335           ptr += MINBPC(enc);
1336         *nextTokPtr = ptr;
1337         return XML_TOK_DATA_NEWLINE;
1338       }
1339       *nextTokPtr = ptr;
1340       return XML_TOK_DATA_CHARS;
1341     default:
1342       ptr += MINBPC(enc);
1343       break;
1344     }
1345   }
1346   *nextTokPtr = ptr;
1347   return XML_TOK_DATA_CHARS;
1348 }
1349 
1350 #ifdef XML_DTD
1351 
1352 static int PTRCALL
PREFIX(ignoreSectionTok)1353 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1354                          const char *end, const char **nextTokPtr)
1355 {
1356   int level = 0;
1357   if (MINBPC(enc) > 1) {
1358     size_t n = end - ptr;
1359     if (n & (MINBPC(enc) - 1)) {
1360       n &= ~(MINBPC(enc) - 1);
1361       end = ptr + n;
1362     }
1363   }
1364   while (HAS_CHAR(enc, ptr, end)) {
1365     switch (BYTE_TYPE(enc, ptr)) {
1366     INVALID_CASES(ptr, nextTokPtr)
1367     case BT_LT:
1368       ptr += MINBPC(enc);
1369       REQUIRE_CHAR(enc, ptr, end);
1370       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1371         ptr += MINBPC(enc);
1372         REQUIRE_CHAR(enc, ptr, end);
1373         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1374           ++level;
1375           ptr += MINBPC(enc);
1376         }
1377       }
1378       break;
1379     case BT_RSQB:
1380       ptr += MINBPC(enc);
1381       REQUIRE_CHAR(enc, ptr, end);
1382       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1383         ptr += MINBPC(enc);
1384         REQUIRE_CHAR(enc, ptr, end);
1385         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1386           ptr += MINBPC(enc);
1387           if (level == 0) {
1388             *nextTokPtr = ptr;
1389             return XML_TOK_IGNORE_SECT;
1390           }
1391           --level;
1392         }
1393       }
1394       break;
1395     default:
1396       ptr += MINBPC(enc);
1397       break;
1398     }
1399   }
1400   return XML_TOK_PARTIAL;
1401 }
1402 
1403 #endif /* XML_DTD */
1404 
1405 static int PTRCALL
PREFIX(isPublicId)1406 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1407                    const char **badPtr)
1408 {
1409   ptr += MINBPC(enc);
1410   end -= MINBPC(enc);
1411   for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1412     switch (BYTE_TYPE(enc, ptr)) {
1413     case BT_DIGIT:
1414     case BT_HEX:
1415     case BT_MINUS:
1416     case BT_APOS:
1417     case BT_LPAR:
1418     case BT_RPAR:
1419     case BT_PLUS:
1420     case BT_COMMA:
1421     case BT_SOL:
1422     case BT_EQUALS:
1423     case BT_QUEST:
1424     case BT_CR:
1425     case BT_LF:
1426     case BT_SEMI:
1427     case BT_EXCL:
1428     case BT_AST:
1429     case BT_PERCNT:
1430     case BT_NUM:
1431 #ifdef XML_NS
1432     case BT_COLON:
1433 #endif
1434       break;
1435     case BT_S:
1436       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1437         *badPtr = ptr;
1438         return 0;
1439       }
1440       break;
1441     case BT_NAME:
1442     case BT_NMSTRT:
1443       if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1444         break;
1445     default:
1446       switch (BYTE_TO_ASCII(enc, ptr)) {
1447       case 0x24: /* $ */
1448       case 0x40: /* @ */
1449         break;
1450       default:
1451         *badPtr = ptr;
1452         return 0;
1453       }
1454       break;
1455     }
1456   }
1457   return 1;
1458 }
1459 
/* This must only be called for a well-formed start-tag or empty
   element tag.  Returns the number of attributes.  Pointers to the
   first attsMax attributes are stored in atts.

   The scan starts one code unit past ptr and has no end pointer: it
   stops only at a '>' or '/' found outside an attribute value, which is
   why the tag has to be well-formed.

   For each of the first attsMax attributes the fields name, valuePtr
   (first character after the opening quote), valueEnd (the closing
   quote) and normalized are filled in.  The return value counts every
   attribute found, so it can be larger than attsMax.

   normalized stays 1 only if the value contains no '&', no CR or LF,
   and no BT_S character other than single spaces that are neither the
   first nor the last character of the value.
*/

static int PTRCALL
PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
                int attsMax, ATTRIBUTE *atts)
{
  /* The scan begins in inName; the first BT_S / CR / LF character moves
     it to other, and only a name started from other is recorded as an
     attribute name (see the comment in the CR/LF case below). */
  enum { other, inName, inValue } state = inName;
  int nAtts = 0;
  int open = 0; /* defined when state == inValue;
                   initialization just to shut up compilers */

  for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* START_NAME: a name character seen in state other begins a new attribute
   name.  In the other states it changes nothing. */
#define START_NAME \
      if (state == other) { \
        if (nAtts < attsMax) { \
          atts[nAtts].name = ptr; \
          atts[nAtts].normalized = 1; \
        } \
        state = inName; \
      }
/* For an n-byte character, skip the bytes that the loop increment does
   not already account for. */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_NONASCII:
    case BT_NMSTRT:
    case BT_HEX:
      START_NAME
      break;
#undef START_NAME
    case BT_QUOT:
      if (state != inValue) {
        /* opening quote: the value starts right after it */
        if (nAtts < attsMax)
          atts[nAtts].valuePtr = ptr + MINBPC(enc);
        state = inValue;
        open = BT_QUOT;
      }
      else if (open == BT_QUOT) {
        /* matching closing quote: the attribute is complete */
        state = other;
        if (nAtts < attsMax)
          atts[nAtts].valueEnd = ptr;
        nAtts++;
      }
      /* a '"' inside an apostrophe-quoted value is ordinary data */
      break;
    case BT_APOS:
      if (state != inValue) {
        /* opening apostrophe: the value starts right after it */
        if (nAtts < attsMax)
          atts[nAtts].valuePtr = ptr + MINBPC(enc);
        state = inValue;
        open = BT_APOS;
      }
      else if (open == BT_APOS) {
        /* matching closing apostrophe: the attribute is complete */
        state = other;
        if (nAtts < attsMax)
          atts[nAtts].valueEnd = ptr;
        nAtts++;
      }
      /* a "'" inside a double-quoted value is ordinary data */
      break;
    case BT_AMP:
      /* a reference in the value: clear the normalized flag */
      if (nAtts < attsMax)
        atts[nAtts].normalized = 0;
      break;
    case BT_S:
      if (state == inName)
        state = other;
      /* Inside a value, clear the normalized flag if this character is
         the first one of the value, is not a plain space, is followed
         by another space, or is followed by the closing quote. */
      else if (state == inValue
               && nAtts < attsMax
               && atts[nAtts].normalized
               && (ptr == atts[nAtts].valuePtr
                   || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
                   || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
                   || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
        atts[nAtts].normalized = 0;
      break;
    case BT_CR: case BT_LF:
      /* This case ensures that the first attribute name is counted
         Apart from that we could just change state on the quote. */
      if (state == inName)
        state = other;
      else if (state == inValue && nAtts < attsMax)
        atts[nAtts].normalized = 0;
      break;
    case BT_GT:
    case BT_SOL:
      /* end of the tag, unless the character is part of a value */
      if (state != inValue)
        return nAtts;
      break;
    default:
      break;
    }
  }
  /* not reached */
}
1557 
1558 static int PTRFASTCALL
PREFIX(charRefNumber)1559 PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr)
1560 {
1561   int result = 0;
1562   /* skip &# */
1563   ptr += 2*MINBPC(enc);
1564   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1565     for (ptr += MINBPC(enc);
1566          !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1567          ptr += MINBPC(enc)) {
1568       int c = BYTE_TO_ASCII(enc, ptr);
1569       switch (c) {
1570       case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1571       case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1572         result <<= 4;
1573         result |= (c - ASCII_0);
1574         break;
1575       case ASCII_A: case ASCII_B: case ASCII_C:
1576       case ASCII_D: case ASCII_E: case ASCII_F:
1577         result <<= 4;
1578         result += 10 + (c - ASCII_A);
1579         break;
1580       case ASCII_a: case ASCII_b: case ASCII_c:
1581       case ASCII_d: case ASCII_e: case ASCII_f:
1582         result <<= 4;
1583         result += 10 + (c - ASCII_a);
1584         break;
1585       }
1586       if (result >= 0x110000)
1587         return -1;
1588     }
1589   }
1590   else {
1591     for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1592       int c = BYTE_TO_ASCII(enc, ptr);
1593       result *= 10;
1594       result += (c - ASCII_0);
1595       if (result >= 0x110000)
1596         return -1;
1597     }
1598   }
1599   return checkCharRefNumber(result);
1600 }
1601 
1602 static int PTRCALL
PREFIX(predefinedEntityName)1603 PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr,
1604                              const char *end)
1605 {
1606   switch ((end - ptr)/MINBPC(enc)) {
1607   case 2:
1608     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1609       switch (BYTE_TO_ASCII(enc, ptr)) {
1610       case ASCII_l:
1611         return ASCII_LT;
1612       case ASCII_g:
1613         return ASCII_GT;
1614       }
1615     }
1616     break;
1617   case 3:
1618     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1619       ptr += MINBPC(enc);
1620       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1621         ptr += MINBPC(enc);
1622         if (CHAR_MATCHES(enc, ptr, ASCII_p))
1623           return ASCII_AMP;
1624       }
1625     }
1626     break;
1627   case 4:
1628     switch (BYTE_TO_ASCII(enc, ptr)) {
1629     case ASCII_q:
1630       ptr += MINBPC(enc);
1631       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1632         ptr += MINBPC(enc);
1633         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1634           ptr += MINBPC(enc);
1635           if (CHAR_MATCHES(enc, ptr, ASCII_t))
1636             return ASCII_QUOT;
1637         }
1638       }
1639       break;
1640     case ASCII_a:
1641       ptr += MINBPC(enc);
1642       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1643         ptr += MINBPC(enc);
1644         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1645           ptr += MINBPC(enc);
1646           if (CHAR_MATCHES(enc, ptr, ASCII_s))
1647             return ASCII_APOS;
1648         }
1649       }
1650       break;
1651     }
1652   }
1653   return 0;
1654 }
1655 
1656 static int PTRCALL
PREFIX(nameMatchesAscii)1657 PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1,
1658                          const char *end1, const char *ptr2)
1659 {
1660   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1661     if (end1 - ptr1 < MINBPC(enc)) {
1662       /* This line cannot be executed.  THe incoming data has already
1663        * been tokenized once, so imcomplete characters like this have
1664        * already been eliminated from the input.  Retaining the
1665        * paranoia check is still valuable, however.
1666        */
1667       return 0; /* LCOV_EXCL_LINE */
1668     }
1669     if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1670       return 0;
1671   }
1672   return ptr1 == end1;
1673 }
1674 
1675 static int PTRFASTCALL
PREFIX(nameLength)1676 PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1677 {
1678   const char *start = ptr;
1679   for (;;) {
1680     switch (BYTE_TYPE(enc, ptr)) {
1681 #define LEAD_CASE(n) \
1682     case BT_LEAD ## n: ptr += n; break;
1683     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1684 #undef LEAD_CASE
1685     case BT_NONASCII:
1686     case BT_NMSTRT:
1687 #ifdef XML_NS
1688     case BT_COLON:
1689 #endif
1690     case BT_HEX:
1691     case BT_DIGIT:
1692     case BT_NAME:
1693     case BT_MINUS:
1694       ptr += MINBPC(enc);
1695       break;
1696     default:
1697       return (int)(ptr - start);
1698     }
1699   }
1700 }
1701 
1702 static const char * PTRFASTCALL
PREFIX(skipS)1703 PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1704 {
1705   for (;;) {
1706     switch (BYTE_TYPE(enc, ptr)) {
1707     case BT_LF:
1708     case BT_CR:
1709     case BT_S:
1710       ptr += MINBPC(enc);
1711       break;
1712     default:
1713       return ptr;
1714     }
1715   }
1716 }
1717 
1718 static void PTRCALL
PREFIX(updatePosition)1719 PREFIX(updatePosition)(const ENCODING *enc,
1720                        const char *ptr,
1721                        const char *end,
1722                        POSITION *pos)
1723 {
1724   while (HAS_CHAR(enc, ptr, end)) {
1725     switch (BYTE_TYPE(enc, ptr)) {
1726 #define LEAD_CASE(n) \
1727     case BT_LEAD ## n: \
1728       ptr += n; \
1729       break;
1730     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1731 #undef LEAD_CASE
1732     case BT_LF:
1733       pos->columnNumber = (XML_Size)-1;
1734       pos->lineNumber++;
1735       ptr += MINBPC(enc);
1736       break;
1737     case BT_CR:
1738       pos->lineNumber++;
1739       ptr += MINBPC(enc);
1740       if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1741         ptr += MINBPC(enc);
1742       pos->columnNumber = (XML_Size)-1;
1743       break;
1744     default:
1745       ptr += MINBPC(enc);
1746       break;
1747     }
1748     pos->columnNumber++;
1749   }
1750 }
1751 
/* Discard the case-helper macros used by the functions above.
   NOTE(review): the matching #defines are outside this excerpt;
   presumably this keeps them from leaking into the including file and
   lets this file be included again -- confirm.  An #undef of a name
   that is not currently defined is harmless. */
#undef DO_LEAD_CASE
#undef MULTIBYTE_CASES
#undef INVALID_CASES
#undef CHECK_NAME_CASE
#undef CHECK_NAME_CASES
#undef CHECK_NMSTRT_CASE
#undef CHECK_NMSTRT_CASES
1759 
1760 #endif /* XML_TOK_IMPL_C */
1761