1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2    See the file COPYING for copying permission.
3 */
4 
/* Fallback used when no IS_INVALID_CHAR was defined before this point:
   it reports every multi-byte sequence as acceptable. */
#if !defined(IS_INVALID_CHAR)
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif
8 
/* Expands to the switch case for the lead byte of an n-byte character
   (BT_LEAD<n>).  The expansion site must have `enc` and `end` in scope.
     - fewer than n bytes remain: returns XML_TOK_PARTIAL_CHAR;
     - IS_INVALID_CHAR rejects the sequence: stores ptr through nextTokPtr
       and returns XML_TOK_INVALID;
     - otherwise: advances ptr by n and leaves the switch.
   n is passed through unparenthesized because it is token-pasted. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
    case BT_LEAD ## n: \
      if (end - (ptr) < n) \
        return XML_TOK_PARTIAL_CHAR; \
      if (IS_INVALID_CHAR(enc, ptr, n)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      (ptr) += n; \
      break;
19 
/* Expands to every switch case that can flag an invalid character:
   the 2-, 3- and 4-byte lead cases (see INVALID_LEAD_CASE), followed by
   the byte types that are never acceptable here.  For those, ptr is
   stored through nextTokPtr and XML_TOK_INVALID is returned. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_TRAIL: \
  case BT_MALFORM: \
  case BT_NONXML: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
29 
/* Expands to the switch case for an n-byte lead byte inside a name.
     - fewer than n bytes remain: returns XML_TOK_PARTIAL_CHAR;
     - IS_NAME_CHAR rejects the character: stores ptr through nextTokPtr
       and returns XML_TOK_INVALID;
     - otherwise: advances ptr by n and leaves the switch.
   n is passed through unparenthesized because it is token-pasted. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if ((end) - (ptr) < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NAME_CHAR(enc, ptr, n)) { \
       *(nextTokPtr) = (ptr); \
       return XML_TOK_INVALID; \
     } \
     (ptr) += n; \
     break;
40 
/* Expands to all switch cases that accept one name character.
   A BT_NONASCII unit must pass IS_NAME_CHAR_MINBPC (otherwise ptr is
   stored through nextTokPtr and XML_TOK_INVALID is returned); it then
   shares the single-unit advance with the plain name byte types.
   Multi-byte characters are handled by CHECK_NAME_CASE for n = 2, 3, 4. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
57 
/* Expands to the switch case for an n-byte lead byte at the start of a
   name.
     - fewer than n bytes remain: returns XML_TOK_PARTIAL_CHAR;
     - IS_NMSTRT_CHAR rejects the character: stores ptr through nextTokPtr
       and returns XML_TOK_INVALID;
     - otherwise: advances ptr by n and leaves the switch.
   n is passed through unparenthesized because it is token-pasted. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if ((end) - (ptr) < n) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
       *(nextTokPtr) = (ptr); \
       return XML_TOK_INVALID; \
     } \
     (ptr) += n; \
     break;
68 
/* Expands to all switch cases that accept one name-start character.
   A BT_NONASCII unit must pass IS_NMSTRT_CHAR_MINBPC (otherwise ptr is
   stored through nextTokPtr and XML_TOK_INVALID is returned); it then
   shares the single-unit advance with BT_NMSTRT and BT_HEX.
   Multi-byte characters are handled by CHECK_NMSTRT_CASE for n = 2, 3, 4. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
82 
/* Fallback used when no PREFIX was defined before this point:
   function names are emitted unchanged. */
#if !defined(PREFIX)
#define PREFIX(ident) ident
#endif
86 
87 /* ptr points to character following "<!-" */
88 
89 static int PTRCALL
PREFIX(scanComment)90 PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
91                     const char *end, const char **nextTokPtr)
92 {
93   if (ptr != end) {
94     if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
95       *nextTokPtr = ptr;
96       return XML_TOK_INVALID;
97     }
98     ptr += MINBPC(enc);
99     while (ptr != end) {
100       switch (BYTE_TYPE(enc, ptr)) {
101       INVALID_CASES(ptr, nextTokPtr)
102       case BT_MINUS:
103         if ((ptr += MINBPC(enc)) == end)
104           return XML_TOK_PARTIAL;
105         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
106           if ((ptr += MINBPC(enc)) == end)
107             return XML_TOK_PARTIAL;
108           if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
109             *nextTokPtr = ptr;
110             return XML_TOK_INVALID;
111           }
112           *nextTokPtr = ptr + MINBPC(enc);
113           return XML_TOK_COMMENT;
114         }
115         break;
116       default:
117         ptr += MINBPC(enc);
118         break;
119       }
120     }
121   }
122   return XML_TOK_PARTIAL;
123 }
124 
125 /* ptr points to character following "<!" */
126 
127 static int PTRCALL
PREFIX(scanDecl)128 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
129                  const char *end, const char **nextTokPtr)
130 {
131   if (ptr == end)
132     return XML_TOK_PARTIAL;
133   switch (BYTE_TYPE(enc, ptr)) {
134   case BT_MINUS:
135     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
136   case BT_LSQB:
137     *nextTokPtr = ptr + MINBPC(enc);
138     return XML_TOK_COND_SECT_OPEN;
139   case BT_NMSTRT:
140   case BT_HEX:
141     ptr += MINBPC(enc);
142     break;
143   default:
144     *nextTokPtr = ptr;
145     return XML_TOK_INVALID;
146   }
147   while (ptr != end) {
148     switch (BYTE_TYPE(enc, ptr)) {
149     case BT_PERCNT:
150       if (ptr + MINBPC(enc) == end)
151         return XML_TOK_PARTIAL;
152       /* don't allow <!ENTITY% foo "whatever"> */
153       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
154       case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
155         *nextTokPtr = ptr;
156         return XML_TOK_INVALID;
157       }
158       /* fall through */
159     case BT_S: case BT_CR: case BT_LF:
160       *nextTokPtr = ptr;
161       return XML_TOK_DECL_OPEN;
162     case BT_NMSTRT:
163     case BT_HEX:
164       ptr += MINBPC(enc);
165       break;
166     default:
167       *nextTokPtr = ptr;
168       return XML_TOK_INVALID;
169     }
170   }
171   return XML_TOK_PARTIAL;
172 }
173 
174 static int PTRCALL
PREFIX(checkPiTarget)175 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr,
176                       const char *end, int *tokPtr)
177 {
178   int upper = 0;
179   *tokPtr = XML_TOK_PI;
180   if (end - ptr != MINBPC(enc)*3)
181     return 1;
182   switch (BYTE_TO_ASCII(enc, ptr)) {
183   case ASCII_x:
184     break;
185   case ASCII_X:
186     upper = 1;
187     break;
188   default:
189     return 1;
190   }
191   ptr += MINBPC(enc);
192   switch (BYTE_TO_ASCII(enc, ptr)) {
193   case ASCII_m:
194     break;
195   case ASCII_M:
196     upper = 1;
197     break;
198   default:
199     return 1;
200   }
201   ptr += MINBPC(enc);
202   switch (BYTE_TO_ASCII(enc, ptr)) {
203   case ASCII_l:
204     break;
205   case ASCII_L:
206     upper = 1;
207     break;
208   default:
209     return 1;
210   }
211   if (upper)
212     return 0;
213   *tokPtr = XML_TOK_XML_DECL;
214   return 1;
215 }
216 
217 /* ptr points to character following "<?" */
218 
219 static int PTRCALL
PREFIX(scanPi)220 PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
221                const char *end, const char **nextTokPtr)
222 {
223   int tok;
224   const char *target = ptr;
225   if (ptr == end)
226     return XML_TOK_PARTIAL;
227   switch (BYTE_TYPE(enc, ptr)) {
228   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
229   default:
230     *nextTokPtr = ptr;
231     return XML_TOK_INVALID;
232   }
233   while (ptr != end) {
234     switch (BYTE_TYPE(enc, ptr)) {
235     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
236     case BT_S: case BT_CR: case BT_LF:
237       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
238         *nextTokPtr = ptr;
239         return XML_TOK_INVALID;
240       }
241       ptr += MINBPC(enc);
242       while (ptr != end) {
243         switch (BYTE_TYPE(enc, ptr)) {
244         INVALID_CASES(ptr, nextTokPtr)
245         case BT_QUEST:
246           ptr += MINBPC(enc);
247           if (ptr == end)
248             return XML_TOK_PARTIAL;
249           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
250             *nextTokPtr = ptr + MINBPC(enc);
251             return tok;
252           }
253           break;
254         default:
255           ptr += MINBPC(enc);
256           break;
257         }
258       }
259       return XML_TOK_PARTIAL;
260     case BT_QUEST:
261       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
262         *nextTokPtr = ptr;
263         return XML_TOK_INVALID;
264       }
265       ptr += MINBPC(enc);
266       if (ptr == end)
267         return XML_TOK_PARTIAL;
268       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
269         *nextTokPtr = ptr + MINBPC(enc);
270         return tok;
271       }
272       /* fall through */
273     default:
274       *nextTokPtr = ptr;
275       return XML_TOK_INVALID;
276     }
277   }
278   return XML_TOK_PARTIAL;
279 }
280 
281 static int PTRCALL
PREFIX(scanCdataSection)282 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
283                          const char *end, const char **nextTokPtr)
284 {
285   static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
286                                      ASCII_T, ASCII_A, ASCII_LSQB };
287   int i;
288   /* CDATA[ */
289   if (end - ptr < 6 * MINBPC(enc))
290     return XML_TOK_PARTIAL;
291   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
292     if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
293       *nextTokPtr = ptr;
294       return XML_TOK_INVALID;
295     }
296   }
297   *nextTokPtr = ptr;
298   return XML_TOK_CDATA_SECT_OPEN;
299 }
300 
301 static int PTRCALL
PREFIX(cdataSectionTok)302 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
303                         const char *end, const char **nextTokPtr)
304 {
305   if (ptr == end)
306     return XML_TOK_NONE;
307   if (MINBPC(enc) > 1) {
308     size_t n = end - ptr;
309     if (n & (MINBPC(enc) - 1)) {
310       n &= ~(MINBPC(enc) - 1);
311       if (n == 0)
312         return XML_TOK_PARTIAL;
313       end = ptr + n;
314     }
315   }
316   switch (BYTE_TYPE(enc, ptr)) {
317   case BT_RSQB:
318     ptr += MINBPC(enc);
319     if (ptr == end)
320       return XML_TOK_PARTIAL;
321     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
322       break;
323     ptr += MINBPC(enc);
324     if (ptr == end)
325       return XML_TOK_PARTIAL;
326     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
327       ptr -= MINBPC(enc);
328       break;
329     }
330     *nextTokPtr = ptr + MINBPC(enc);
331     return XML_TOK_CDATA_SECT_CLOSE;
332   case BT_CR:
333     ptr += MINBPC(enc);
334     if (ptr == end)
335       return XML_TOK_PARTIAL;
336     if (BYTE_TYPE(enc, ptr) == BT_LF)
337       ptr += MINBPC(enc);
338     *nextTokPtr = ptr;
339     return XML_TOK_DATA_NEWLINE;
340   case BT_LF:
341     *nextTokPtr = ptr + MINBPC(enc);
342     return XML_TOK_DATA_NEWLINE;
343   INVALID_CASES(ptr, nextTokPtr)
344   default:
345     ptr += MINBPC(enc);
346     break;
347   }
348   while (ptr != end) {
349     switch (BYTE_TYPE(enc, ptr)) {
350 #define LEAD_CASE(n) \
351     case BT_LEAD ## n: \
352       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
353         *nextTokPtr = ptr; \
354         return XML_TOK_DATA_CHARS; \
355       } \
356       ptr += n; \
357       break;
358     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
359 #undef LEAD_CASE
360     case BT_NONXML:
361     case BT_MALFORM:
362     case BT_TRAIL:
363     case BT_CR:
364     case BT_LF:
365     case BT_RSQB:
366       *nextTokPtr = ptr;
367       return XML_TOK_DATA_CHARS;
368     default:
369       ptr += MINBPC(enc);
370       break;
371     }
372   }
373   *nextTokPtr = ptr;
374   return XML_TOK_DATA_CHARS;
375 }
376 
377 /* ptr points to character following "</" */
378 
379 static int PTRCALL
PREFIX(scanEndTag)380 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
381                    const char *end, const char **nextTokPtr)
382 {
383   if (ptr == end)
384     return XML_TOK_PARTIAL;
385   switch (BYTE_TYPE(enc, ptr)) {
386   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
387   default:
388     *nextTokPtr = ptr;
389     return XML_TOK_INVALID;
390   }
391   while (ptr != end) {
392     switch (BYTE_TYPE(enc, ptr)) {
393     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
394     case BT_S: case BT_CR: case BT_LF:
395       for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
396         switch (BYTE_TYPE(enc, ptr)) {
397         case BT_S: case BT_CR: case BT_LF:
398           break;
399         case BT_GT:
400           *nextTokPtr = ptr + MINBPC(enc);
401           return XML_TOK_END_TAG;
402         default:
403           *nextTokPtr = ptr;
404           return XML_TOK_INVALID;
405         }
406       }
407       return XML_TOK_PARTIAL;
408 #ifdef XML_NS
409     case BT_COLON:
410       /* no need to check qname syntax here,
411          since end-tag must match exactly */
412       ptr += MINBPC(enc);
413       break;
414 #endif
415     case BT_GT:
416       *nextTokPtr = ptr + MINBPC(enc);
417       return XML_TOK_END_TAG;
418     default:
419       *nextTokPtr = ptr;
420       return XML_TOK_INVALID;
421     }
422   }
423   return XML_TOK_PARTIAL;
424 }
425 
426 /* ptr points to character following "&#X" */
427 
428 static int PTRCALL
PREFIX(scanHexCharRef)429 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
430                        const char *end, const char **nextTokPtr)
431 {
432   if (ptr != end) {
433     switch (BYTE_TYPE(enc, ptr)) {
434     case BT_DIGIT:
435     case BT_HEX:
436       break;
437     default:
438       *nextTokPtr = ptr;
439       return XML_TOK_INVALID;
440     }
441     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
442       switch (BYTE_TYPE(enc, ptr)) {
443       case BT_DIGIT:
444       case BT_HEX:
445         break;
446       case BT_SEMI:
447         *nextTokPtr = ptr + MINBPC(enc);
448         return XML_TOK_CHAR_REF;
449       default:
450         *nextTokPtr = ptr;
451         return XML_TOK_INVALID;
452       }
453     }
454   }
455   return XML_TOK_PARTIAL;
456 }
457 
458 /* ptr points to character following "&#" */
459 
460 static int PTRCALL
PREFIX(scanCharRef)461 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
462                     const char *end, const char **nextTokPtr)
463 {
464   if (ptr != end) {
465     if (CHAR_MATCHES(enc, ptr, ASCII_x))
466       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
467     switch (BYTE_TYPE(enc, ptr)) {
468     case BT_DIGIT:
469       break;
470     default:
471       *nextTokPtr = ptr;
472       return XML_TOK_INVALID;
473     }
474     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
475       switch (BYTE_TYPE(enc, ptr)) {
476       case BT_DIGIT:
477         break;
478       case BT_SEMI:
479         *nextTokPtr = ptr + MINBPC(enc);
480         return XML_TOK_CHAR_REF;
481       default:
482         *nextTokPtr = ptr;
483         return XML_TOK_INVALID;
484       }
485     }
486   }
487   return XML_TOK_PARTIAL;
488 }
489 
490 /* ptr points to character following "&" */
491 
492 static int PTRCALL
PREFIX(scanRef)493 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
494                 const char **nextTokPtr)
495 {
496   if (ptr == end)
497     return XML_TOK_PARTIAL;
498   switch (BYTE_TYPE(enc, ptr)) {
499   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
500   case BT_NUM:
501     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
502   default:
503     *nextTokPtr = ptr;
504     return XML_TOK_INVALID;
505   }
506   while (ptr != end) {
507     switch (BYTE_TYPE(enc, ptr)) {
508     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
509     case BT_SEMI:
510       *nextTokPtr = ptr + MINBPC(enc);
511       return XML_TOK_ENTITY_REF;
512     default:
513       *nextTokPtr = ptr;
514       return XML_TOK_INVALID;
515     }
516   }
517   return XML_TOK_PARTIAL;
518 }
519 
520 /* ptr points to character following first character of attribute name */
521 
522 static int PTRCALL
PREFIX(scanAtts)523 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
524                  const char **nextTokPtr)
525 {
526 #ifdef XML_NS
527   int hadColon = 0;
528 #endif
529   while (ptr != end) {
530     switch (BYTE_TYPE(enc, ptr)) {
531     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
532 #ifdef XML_NS
533     case BT_COLON:
534       if (hadColon) {
535         *nextTokPtr = ptr;
536         return XML_TOK_INVALID;
537       }
538       hadColon = 1;
539       ptr += MINBPC(enc);
540       if (ptr == end)
541         return XML_TOK_PARTIAL;
542       switch (BYTE_TYPE(enc, ptr)) {
543       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
544       default:
545         *nextTokPtr = ptr;
546         return XML_TOK_INVALID;
547       }
548       break;
549 #endif
550     case BT_S: case BT_CR: case BT_LF:
551       for (;;) {
552         int t;
553 
554         ptr += MINBPC(enc);
555         if (ptr == end)
556           return XML_TOK_PARTIAL;
557         t = BYTE_TYPE(enc, ptr);
558         if (t == BT_EQUALS)
559           break;
560         switch (t) {
561         case BT_S:
562         case BT_LF:
563         case BT_CR:
564           break;
565         default:
566           *nextTokPtr = ptr;
567           return XML_TOK_INVALID;
568         }
569       }
570     /* fall through */
571     case BT_EQUALS:
572       {
573         int open;
574 #ifdef XML_NS
575         hadColon = 0;
576 #endif
577         for (;;) {
578           ptr += MINBPC(enc);
579           if (ptr == end)
580             return XML_TOK_PARTIAL;
581           open = BYTE_TYPE(enc, ptr);
582           if (open == BT_QUOT || open == BT_APOS)
583             break;
584           switch (open) {
585           case BT_S:
586           case BT_LF:
587           case BT_CR:
588             break;
589           default:
590             *nextTokPtr = ptr;
591             return XML_TOK_INVALID;
592           }
593         }
594         ptr += MINBPC(enc);
595         /* in attribute value */
596         for (;;) {
597           int t;
598           if (ptr == end)
599             return XML_TOK_PARTIAL;
600           t = BYTE_TYPE(enc, ptr);
601           if (t == open)
602             break;
603           switch (t) {
604           INVALID_CASES(ptr, nextTokPtr)
605           case BT_AMP:
606             {
607               int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
608               if (tok <= 0) {
609                 if (tok == XML_TOK_INVALID)
610                   *nextTokPtr = ptr;
611                 return tok;
612               }
613               break;
614             }
615           case BT_LT:
616             *nextTokPtr = ptr;
617             return XML_TOK_INVALID;
618           default:
619             ptr += MINBPC(enc);
620             break;
621           }
622         }
623         ptr += MINBPC(enc);
624         if (ptr == end)
625           return XML_TOK_PARTIAL;
626         switch (BYTE_TYPE(enc, ptr)) {
627         case BT_S:
628         case BT_CR:
629         case BT_LF:
630           break;
631         case BT_SOL:
632           goto sol;
633         case BT_GT:
634           goto gt;
635         default:
636           *nextTokPtr = ptr;
637           return XML_TOK_INVALID;
638         }
639         /* ptr points to closing quote */
640         for (;;) {
641           ptr += MINBPC(enc);
642           if (ptr == end)
643             return XML_TOK_PARTIAL;
644           switch (BYTE_TYPE(enc, ptr)) {
645           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
646           case BT_S: case BT_CR: case BT_LF:
647             continue;
648           case BT_GT:
649           gt:
650             *nextTokPtr = ptr + MINBPC(enc);
651             return XML_TOK_START_TAG_WITH_ATTS;
652           case BT_SOL:
653           sol:
654             ptr += MINBPC(enc);
655             if (ptr == end)
656               return XML_TOK_PARTIAL;
657             if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
658               *nextTokPtr = ptr;
659               return XML_TOK_INVALID;
660             }
661             *nextTokPtr = ptr + MINBPC(enc);
662             return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
663           default:
664             *nextTokPtr = ptr;
665             return XML_TOK_INVALID;
666           }
667           break;
668         }
669         break;
670       }
671     default:
672       *nextTokPtr = ptr;
673       return XML_TOK_INVALID;
674     }
675   }
676   return XML_TOK_PARTIAL;
677 }
678 
679 /* ptr points to character following "<" */
680 
681 static int PTRCALL
PREFIX(scanLt)682 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
683                const char **nextTokPtr)
684 {
685 #ifdef XML_NS
686   int hadColon;
687 #endif
688   if (ptr == end)
689     return XML_TOK_PARTIAL;
690   switch (BYTE_TYPE(enc, ptr)) {
691   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
692   case BT_EXCL:
693     if ((ptr += MINBPC(enc)) == end)
694       return XML_TOK_PARTIAL;
695     switch (BYTE_TYPE(enc, ptr)) {
696     case BT_MINUS:
697       return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
698     case BT_LSQB:
699       return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
700                                       end, nextTokPtr);
701     }
702     *nextTokPtr = ptr;
703     return XML_TOK_INVALID;
704   case BT_QUEST:
705     return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
706   case BT_SOL:
707     return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
708   default:
709     *nextTokPtr = ptr;
710     return XML_TOK_INVALID;
711   }
712 #ifdef XML_NS
713   hadColon = 0;
714 #endif
715   /* we have a start-tag */
716   while (ptr != end) {
717     switch (BYTE_TYPE(enc, ptr)) {
718     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
719 #ifdef XML_NS
720     case BT_COLON:
721       if (hadColon) {
722         *nextTokPtr = ptr;
723         return XML_TOK_INVALID;
724       }
725       hadColon = 1;
726       ptr += MINBPC(enc);
727       if (ptr == end)
728         return XML_TOK_PARTIAL;
729       switch (BYTE_TYPE(enc, ptr)) {
730       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
731       default:
732         *nextTokPtr = ptr;
733         return XML_TOK_INVALID;
734       }
735       break;
736 #endif
737     case BT_S: case BT_CR: case BT_LF:
738       {
739         ptr += MINBPC(enc);
740         while (ptr != end) {
741           switch (BYTE_TYPE(enc, ptr)) {
742           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
743           case BT_GT:
744             goto gt;
745           case BT_SOL:
746             goto sol;
747           case BT_S: case BT_CR: case BT_LF:
748             ptr += MINBPC(enc);
749             continue;
750           default:
751             *nextTokPtr = ptr;
752             return XML_TOK_INVALID;
753           }
754           return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
755         }
756         return XML_TOK_PARTIAL;
757       }
758     case BT_GT:
759     gt:
760       *nextTokPtr = ptr + MINBPC(enc);
761       return XML_TOK_START_TAG_NO_ATTS;
762     case BT_SOL:
763     sol:
764       ptr += MINBPC(enc);
765       if (ptr == end)
766         return XML_TOK_PARTIAL;
767       if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
768         *nextTokPtr = ptr;
769         return XML_TOK_INVALID;
770       }
771       *nextTokPtr = ptr + MINBPC(enc);
772       return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
773     default:
774       *nextTokPtr = ptr;
775       return XML_TOK_INVALID;
776     }
777   }
778   return XML_TOK_PARTIAL;
779 }
780 
781 static int PTRCALL
PREFIX(contentTok)782 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
783                    const char **nextTokPtr)
784 {
785   if (ptr == end)
786     return XML_TOK_NONE;
787   if (MINBPC(enc) > 1) {
788     size_t n = end - ptr;
789     if (n & (MINBPC(enc) - 1)) {
790       n &= ~(MINBPC(enc) - 1);
791       if (n == 0)
792         return XML_TOK_PARTIAL;
793       end = ptr + n;
794     }
795   }
796   switch (BYTE_TYPE(enc, ptr)) {
797   case BT_LT:
798     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
799   case BT_AMP:
800     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
801   case BT_CR:
802     ptr += MINBPC(enc);
803     if (ptr == end)
804       return XML_TOK_TRAILING_CR;
805     if (BYTE_TYPE(enc, ptr) == BT_LF)
806       ptr += MINBPC(enc);
807     *nextTokPtr = ptr;
808     return XML_TOK_DATA_NEWLINE;
809   case BT_LF:
810     *nextTokPtr = ptr + MINBPC(enc);
811     return XML_TOK_DATA_NEWLINE;
812   case BT_RSQB:
813     ptr += MINBPC(enc);
814     if (ptr == end)
815       return XML_TOK_TRAILING_RSQB;
816     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
817       break;
818     ptr += MINBPC(enc);
819     if (ptr == end)
820       return XML_TOK_TRAILING_RSQB;
821     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
822       ptr -= MINBPC(enc);
823       break;
824     }
825     *nextTokPtr = ptr;
826     return XML_TOK_INVALID;
827   INVALID_CASES(ptr, nextTokPtr)
828   default:
829     ptr += MINBPC(enc);
830     break;
831   }
832   while (ptr != end) {
833     switch (BYTE_TYPE(enc, ptr)) {
834 #define LEAD_CASE(n) \
835     case BT_LEAD ## n: \
836       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
837         *nextTokPtr = ptr; \
838         return XML_TOK_DATA_CHARS; \
839       } \
840       ptr += n; \
841       break;
842     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
843 #undef LEAD_CASE
844     case BT_RSQB:
845       if (ptr + MINBPC(enc) != end) {
846          if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
847            ptr += MINBPC(enc);
848            break;
849          }
850          if (ptr + 2*MINBPC(enc) != end) {
851            if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
852              ptr += MINBPC(enc);
853              break;
854            }
855            *nextTokPtr = ptr + 2*MINBPC(enc);
856            return XML_TOK_INVALID;
857          }
858       }
859       /* fall through */
860     case BT_AMP:
861     case BT_LT:
862     case BT_NONXML:
863     case BT_MALFORM:
864     case BT_TRAIL:
865     case BT_CR:
866     case BT_LF:
867       *nextTokPtr = ptr;
868       return XML_TOK_DATA_CHARS;
869     default:
870       ptr += MINBPC(enc);
871       break;
872     }
873   }
874   *nextTokPtr = ptr;
875   return XML_TOK_DATA_CHARS;
876 }
877 
878 /* ptr points to character following "%" */
879 
880 static int PTRCALL
PREFIX(scanPercent)881 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
882                     const char **nextTokPtr)
883 {
884   if (ptr == end)
885     return XML_TOK_PARTIAL;
886   switch (BYTE_TYPE(enc, ptr)) {
887   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
888   case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
889     *nextTokPtr = ptr;
890     return XML_TOK_PERCENT;
891   default:
892     *nextTokPtr = ptr;
893     return XML_TOK_INVALID;
894   }
895   while (ptr != end) {
896     switch (BYTE_TYPE(enc, ptr)) {
897     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
898     case BT_SEMI:
899       *nextTokPtr = ptr + MINBPC(enc);
900       return XML_TOK_PARAM_ENTITY_REF;
901     default:
902       *nextTokPtr = ptr;
903       return XML_TOK_INVALID;
904     }
905   }
906   return XML_TOK_PARTIAL;
907 }
908 
909 static int PTRCALL
PREFIX(scanPoundName)910 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
911                       const char **nextTokPtr)
912 {
913   if (ptr == end)
914     return XML_TOK_PARTIAL;
915   switch (BYTE_TYPE(enc, ptr)) {
916   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
917   default:
918     *nextTokPtr = ptr;
919     return XML_TOK_INVALID;
920   }
921   while (ptr != end) {
922     switch (BYTE_TYPE(enc, ptr)) {
923     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
924     case BT_CR: case BT_LF: case BT_S:
925     case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
926       *nextTokPtr = ptr;
927       return XML_TOK_POUND_NAME;
928     default:
929       *nextTokPtr = ptr;
930       return XML_TOK_INVALID;
931     }
932   }
933   return -XML_TOK_POUND_NAME;
934 }
935 
936 static int PTRCALL
PREFIX(scanLit)937 PREFIX(scanLit)(int open, const ENCODING *enc,
938                 const char *ptr, const char *end,
939                 const char **nextTokPtr)
940 {
941   while (ptr != end) {
942     int t = BYTE_TYPE(enc, ptr);
943     switch (t) {
944     INVALID_CASES(ptr, nextTokPtr)
945     case BT_QUOT:
946     case BT_APOS:
947       ptr += MINBPC(enc);
948       if (t != open)
949         break;
950       if (ptr == end)
951         return -XML_TOK_LITERAL;
952       *nextTokPtr = ptr;
953       switch (BYTE_TYPE(enc, ptr)) {
954       case BT_S: case BT_CR: case BT_LF:
955       case BT_GT: case BT_PERCNT: case BT_LSQB:
956         return XML_TOK_LITERAL;
957       default:
958         return XML_TOK_INVALID;
959       }
960     default:
961       ptr += MINBPC(enc);
962       break;
963     }
964   }
965   return XML_TOK_PARTIAL;
966 }
967 
968 static int PTRCALL
PREFIX(prologTok)969 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
970                   const char **nextTokPtr)
971 {
972   int tok;
973   if (ptr == end)
974     return XML_TOK_NONE;
975   if (MINBPC(enc) > 1) {
976     size_t n = end - ptr;
977     if (n & (MINBPC(enc) - 1)) {
978       n &= ~(MINBPC(enc) - 1);
979       if (n == 0)
980         return XML_TOK_PARTIAL;
981       end = ptr + n;
982     }
983   }
984   switch (BYTE_TYPE(enc, ptr)) {
985   case BT_QUOT:
986     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
987   case BT_APOS:
988     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
989   case BT_LT:
990     {
991       ptr += MINBPC(enc);
992       if (ptr == end)
993         return XML_TOK_PARTIAL;
994       switch (BYTE_TYPE(enc, ptr)) {
995       case BT_EXCL:
996         return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
997       case BT_QUEST:
998         return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
999       case BT_NMSTRT:
1000       case BT_HEX:
1001       case BT_NONASCII:
1002       case BT_LEAD2:
1003       case BT_LEAD3:
1004       case BT_LEAD4:
1005         *nextTokPtr = ptr - MINBPC(enc);
1006         return XML_TOK_INSTANCE_START;
1007       }
1008       *nextTokPtr = ptr;
1009       return XML_TOK_INVALID;
1010     }
1011   case BT_CR:
1012     if (ptr + MINBPC(enc) == end) {
1013       *nextTokPtr = end;
1014       /* indicate that this might be part of a CR/LF pair */
1015       return -XML_TOK_PROLOG_S;
1016     }
1017     /* fall through */
1018   case BT_S: case BT_LF:
1019     for (;;) {
1020       ptr += MINBPC(enc);
1021       if (ptr == end)
1022         break;
1023       switch (BYTE_TYPE(enc, ptr)) {
1024       case BT_S: case BT_LF:
1025         break;
1026       case BT_CR:
1027         /* don't split CR/LF pair */
1028         if (ptr + MINBPC(enc) != end)
1029           break;
1030         /* fall through */
1031       default:
1032         *nextTokPtr = ptr;
1033         return XML_TOK_PROLOG_S;
1034       }
1035     }
1036     *nextTokPtr = ptr;
1037     return XML_TOK_PROLOG_S;
1038   case BT_PERCNT:
1039     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1040   case BT_COMMA:
1041     *nextTokPtr = ptr + MINBPC(enc);
1042     return XML_TOK_COMMA;
1043   case BT_LSQB:
1044     *nextTokPtr = ptr + MINBPC(enc);
1045     return XML_TOK_OPEN_BRACKET;
1046   case BT_RSQB:
1047     ptr += MINBPC(enc);
1048     if (ptr == end)
1049       return -XML_TOK_CLOSE_BRACKET;
1050     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1051       if (ptr + MINBPC(enc) == end)
1052         return XML_TOK_PARTIAL;
1053       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1054         *nextTokPtr = ptr + 2*MINBPC(enc);
1055         return XML_TOK_COND_SECT_CLOSE;
1056       }
1057     }
1058     *nextTokPtr = ptr;
1059     return XML_TOK_CLOSE_BRACKET;
1060   case BT_LPAR:
1061     *nextTokPtr = ptr + MINBPC(enc);
1062     return XML_TOK_OPEN_PAREN;
1063   case BT_RPAR:
1064     ptr += MINBPC(enc);
1065     if (ptr == end)
1066       return -XML_TOK_CLOSE_PAREN;
1067     switch (BYTE_TYPE(enc, ptr)) {
1068     case BT_AST:
1069       *nextTokPtr = ptr + MINBPC(enc);
1070       return XML_TOK_CLOSE_PAREN_ASTERISK;
1071     case BT_QUEST:
1072       *nextTokPtr = ptr + MINBPC(enc);
1073       return XML_TOK_CLOSE_PAREN_QUESTION;
1074     case BT_PLUS:
1075       *nextTokPtr = ptr + MINBPC(enc);
1076       return XML_TOK_CLOSE_PAREN_PLUS;
1077     case BT_CR: case BT_LF: case BT_S:
1078     case BT_GT: case BT_COMMA: case BT_VERBAR:
1079     case BT_RPAR:
1080       *nextTokPtr = ptr;
1081       return XML_TOK_CLOSE_PAREN;
1082     }
1083     *nextTokPtr = ptr;
1084     return XML_TOK_INVALID;
1085   case BT_VERBAR:
1086     *nextTokPtr = ptr + MINBPC(enc);
1087     return XML_TOK_OR;
1088   case BT_GT:
1089     *nextTokPtr = ptr + MINBPC(enc);
1090     return XML_TOK_DECL_CLOSE;
1091   case BT_NUM:
1092     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1093 #define LEAD_CASE(n) \
1094   case BT_LEAD ## n: \
1095     if (end - ptr < n) \
1096       return XML_TOK_PARTIAL_CHAR; \
1097     if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1098       ptr += n; \
1099       tok = XML_TOK_NAME; \
1100       break; \
1101     } \
1102     if (IS_NAME_CHAR(enc, ptr, n)) { \
1103       ptr += n; \
1104       tok = XML_TOK_NMTOKEN; \
1105       break; \
1106     } \
1107     *nextTokPtr = ptr; \
1108     return XML_TOK_INVALID;
1109     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1110 #undef LEAD_CASE
1111   case BT_NMSTRT:
1112   case BT_HEX:
1113     tok = XML_TOK_NAME;
1114     ptr += MINBPC(enc);
1115     break;
1116   case BT_DIGIT:
1117   case BT_NAME:
1118   case BT_MINUS:
1119 #ifdef XML_NS
1120   case BT_COLON:
1121 #endif
1122     tok = XML_TOK_NMTOKEN;
1123     ptr += MINBPC(enc);
1124     break;
1125   case BT_NONASCII:
1126     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1127       ptr += MINBPC(enc);
1128       tok = XML_TOK_NAME;
1129       break;
1130     }
1131     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1132       ptr += MINBPC(enc);
1133       tok = XML_TOK_NMTOKEN;
1134       break;
1135     }
1136     /* fall through */
1137   default:
1138     *nextTokPtr = ptr;
1139     return XML_TOK_INVALID;
1140   }
1141   while (ptr != end) {
1142     switch (BYTE_TYPE(enc, ptr)) {
1143     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1144     case BT_GT: case BT_RPAR: case BT_COMMA:
1145     case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1146     case BT_S: case BT_CR: case BT_LF:
1147       *nextTokPtr = ptr;
1148       return tok;
1149 #ifdef XML_NS
1150     case BT_COLON:
1151       ptr += MINBPC(enc);
1152       switch (tok) {
1153       case XML_TOK_NAME:
1154         if (ptr == end)
1155           return XML_TOK_PARTIAL;
1156         tok = XML_TOK_PREFIXED_NAME;
1157         switch (BYTE_TYPE(enc, ptr)) {
1158         CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1159         default:
1160           tok = XML_TOK_NMTOKEN;
1161           break;
1162         }
1163         break;
1164       case XML_TOK_PREFIXED_NAME:
1165         tok = XML_TOK_NMTOKEN;
1166         break;
1167       }
1168       break;
1169 #endif
1170     case BT_PLUS:
1171       if (tok == XML_TOK_NMTOKEN)  {
1172         *nextTokPtr = ptr;
1173         return XML_TOK_INVALID;
1174       }
1175       *nextTokPtr = ptr + MINBPC(enc);
1176       return XML_TOK_NAME_PLUS;
1177     case BT_AST:
1178       if (tok == XML_TOK_NMTOKEN)  {
1179         *nextTokPtr = ptr;
1180         return XML_TOK_INVALID;
1181       }
1182       *nextTokPtr = ptr + MINBPC(enc);
1183       return XML_TOK_NAME_ASTERISK;
1184     case BT_QUEST:
1185       if (tok == XML_TOK_NMTOKEN)  {
1186         *nextTokPtr = ptr;
1187         return XML_TOK_INVALID;
1188       }
1189       *nextTokPtr = ptr + MINBPC(enc);
1190       return XML_TOK_NAME_QUESTION;
1191     default:
1192       *nextTokPtr = ptr;
1193       return XML_TOK_INVALID;
1194     }
1195   }
1196   return -tok;
1197 }
1198 
1199 static int PTRCALL
PREFIX(attributeValueTok)1200 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1201                           const char *end, const char **nextTokPtr)
1202 {
1203   const char *start;
1204   if (ptr == end)
1205     return XML_TOK_NONE;
1206   start = ptr;
1207   while (ptr != end) {
1208     switch (BYTE_TYPE(enc, ptr)) {
1209 #define LEAD_CASE(n) \
1210     case BT_LEAD ## n: ptr += n; break;
1211     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1212 #undef LEAD_CASE
1213     case BT_AMP:
1214       if (ptr == start)
1215         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1216       *nextTokPtr = ptr;
1217       return XML_TOK_DATA_CHARS;
1218     case BT_LT:
1219       /* this is for inside entity references */
1220       *nextTokPtr = ptr;
1221       return XML_TOK_INVALID;
1222     case BT_LF:
1223       if (ptr == start) {
1224         *nextTokPtr = ptr + MINBPC(enc);
1225         return XML_TOK_DATA_NEWLINE;
1226       }
1227       *nextTokPtr = ptr;
1228       return XML_TOK_DATA_CHARS;
1229     case BT_CR:
1230       if (ptr == start) {
1231         ptr += MINBPC(enc);
1232         if (ptr == end)
1233           return XML_TOK_TRAILING_CR;
1234         if (BYTE_TYPE(enc, ptr) == BT_LF)
1235           ptr += MINBPC(enc);
1236         *nextTokPtr = ptr;
1237         return XML_TOK_DATA_NEWLINE;
1238       }
1239       *nextTokPtr = ptr;
1240       return XML_TOK_DATA_CHARS;
1241     case BT_S:
1242       if (ptr == start) {
1243         *nextTokPtr = ptr + MINBPC(enc);
1244         return XML_TOK_ATTRIBUTE_VALUE_S;
1245       }
1246       *nextTokPtr = ptr;
1247       return XML_TOK_DATA_CHARS;
1248     default:
1249       ptr += MINBPC(enc);
1250       break;
1251     }
1252   }
1253   *nextTokPtr = ptr;
1254   return XML_TOK_DATA_CHARS;
1255 }
1256 
1257 static int PTRCALL
PREFIX(entityValueTok)1258 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1259                        const char *end, const char **nextTokPtr)
1260 {
1261   const char *start;
1262   if (ptr == end)
1263     return XML_TOK_NONE;
1264   start = ptr;
1265   while (ptr != end) {
1266     switch (BYTE_TYPE(enc, ptr)) {
1267 #define LEAD_CASE(n) \
1268     case BT_LEAD ## n: ptr += n; break;
1269     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1270 #undef LEAD_CASE
1271     case BT_AMP:
1272       if (ptr == start)
1273         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1274       *nextTokPtr = ptr;
1275       return XML_TOK_DATA_CHARS;
1276     case BT_PERCNT:
1277       if (ptr == start) {
1278         int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1279                                        end, nextTokPtr);
1280         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1281       }
1282       *nextTokPtr = ptr;
1283       return XML_TOK_DATA_CHARS;
1284     case BT_LF:
1285       if (ptr == start) {
1286         *nextTokPtr = ptr + MINBPC(enc);
1287         return XML_TOK_DATA_NEWLINE;
1288       }
1289       *nextTokPtr = ptr;
1290       return XML_TOK_DATA_CHARS;
1291     case BT_CR:
1292       if (ptr == start) {
1293         ptr += MINBPC(enc);
1294         if (ptr == end)
1295           return XML_TOK_TRAILING_CR;
1296         if (BYTE_TYPE(enc, ptr) == BT_LF)
1297           ptr += MINBPC(enc);
1298         *nextTokPtr = ptr;
1299         return XML_TOK_DATA_NEWLINE;
1300       }
1301       *nextTokPtr = ptr;
1302       return XML_TOK_DATA_CHARS;
1303     default:
1304       ptr += MINBPC(enc);
1305       break;
1306     }
1307   }
1308   *nextTokPtr = ptr;
1309   return XML_TOK_DATA_CHARS;
1310 }
1311 
1312 #ifdef XML_DTD
1313 
1314 static int PTRCALL
PREFIX(ignoreSectionTok)1315 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1316                          const char *end, const char **nextTokPtr)
1317 {
1318   int level = 0;
1319   if (MINBPC(enc) > 1) {
1320     size_t n = end - ptr;
1321     if (n & (MINBPC(enc) - 1)) {
1322       n &= ~(MINBPC(enc) - 1);
1323       end = ptr + n;
1324     }
1325   }
1326   while (ptr != end) {
1327     switch (BYTE_TYPE(enc, ptr)) {
1328     INVALID_CASES(ptr, nextTokPtr)
1329     case BT_LT:
1330       if ((ptr += MINBPC(enc)) == end)
1331         return XML_TOK_PARTIAL;
1332       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1333         if ((ptr += MINBPC(enc)) == end)
1334           return XML_TOK_PARTIAL;
1335         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1336           ++level;
1337           ptr += MINBPC(enc);
1338         }
1339       }
1340       break;
1341     case BT_RSQB:
1342       if ((ptr += MINBPC(enc)) == end)
1343         return XML_TOK_PARTIAL;
1344       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1345         if ((ptr += MINBPC(enc)) == end)
1346           return XML_TOK_PARTIAL;
1347         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1348           ptr += MINBPC(enc);
1349           if (level == 0) {
1350             *nextTokPtr = ptr;
1351             return XML_TOK_IGNORE_SECT;
1352           }
1353           --level;
1354         }
1355       }
1356       break;
1357     default:
1358       ptr += MINBPC(enc);
1359       break;
1360     }
1361   }
1362   return XML_TOK_PARTIAL;
1363 }
1364 
1365 #endif /* XML_DTD */
1366 
1367 static int PTRCALL
PREFIX(isPublicId)1368 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1369                    const char **badPtr)
1370 {
1371   ptr += MINBPC(enc);
1372   end -= MINBPC(enc);
1373   for (; ptr != end; ptr += MINBPC(enc)) {
1374     switch (BYTE_TYPE(enc, ptr)) {
1375     case BT_DIGIT:
1376     case BT_HEX:
1377     case BT_MINUS:
1378     case BT_APOS:
1379     case BT_LPAR:
1380     case BT_RPAR:
1381     case BT_PLUS:
1382     case BT_COMMA:
1383     case BT_SOL:
1384     case BT_EQUALS:
1385     case BT_QUEST:
1386     case BT_CR:
1387     case BT_LF:
1388     case BT_SEMI:
1389     case BT_EXCL:
1390     case BT_AST:
1391     case BT_PERCNT:
1392     case BT_NUM:
1393 #ifdef XML_NS
1394     case BT_COLON:
1395 #endif
1396       break;
1397     case BT_S:
1398       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1399         *badPtr = ptr;
1400         return 0;
1401       }
1402       break;
1403     case BT_NAME:
1404     case BT_NMSTRT:
1405       if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1406         break;
1407     default:
1408       switch (BYTE_TO_ASCII(enc, ptr)) {
1409       case 0x24: /* $ */
1410       case 0x40: /* @ */
1411         break;
1412       default:
1413         *badPtr = ptr;
1414         return 0;
1415       }
1416       break;
1417     }
1418   }
1419   return 1;
1420 }
1421 
1422 /* This must only be called for a well-formed start-tag or empty
1423    element tag.  Returns the number of attributes.  Pointers to the
1424    first attsMax attributes are stored in atts.
1425 */
1426 
1427 static int PTRCALL
PREFIX(getAtts)1428 PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1429                 int attsMax, ATTRIBUTE *atts)
1430 {
1431   enum { other, inName, inValue } state = inName;
1432   int nAtts = 0;
1433   int open = 0; /* defined when state == inValue;
1434                    initialization just to shut up compilers */
1435 
1436   for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1437     switch (BYTE_TYPE(enc, ptr)) {
1438 #define START_NAME \
1439       if (state == other) { \
1440         if (nAtts < attsMax) { \
1441           atts[nAtts].name = ptr; \
1442           atts[nAtts].normalized = 1; \
1443         } \
1444         state = inName; \
1445       }
1446 #define LEAD_CASE(n) \
1447     case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1448     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1449 #undef LEAD_CASE
1450     case BT_NONASCII:
1451     case BT_NMSTRT:
1452     case BT_HEX:
1453       START_NAME
1454       break;
1455 #undef START_NAME
1456     case BT_QUOT:
1457       if (state != inValue) {
1458         if (nAtts < attsMax)
1459           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1460         state = inValue;
1461         open = BT_QUOT;
1462       }
1463       else if (open == BT_QUOT) {
1464         state = other;
1465         if (nAtts < attsMax)
1466           atts[nAtts].valueEnd = ptr;
1467         nAtts++;
1468       }
1469       break;
1470     case BT_APOS:
1471       if (state != inValue) {
1472         if (nAtts < attsMax)
1473           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1474         state = inValue;
1475         open = BT_APOS;
1476       }
1477       else if (open == BT_APOS) {
1478         state = other;
1479         if (nAtts < attsMax)
1480           atts[nAtts].valueEnd = ptr;
1481         nAtts++;
1482       }
1483       break;
1484     case BT_AMP:
1485       if (nAtts < attsMax)
1486         atts[nAtts].normalized = 0;
1487       break;
1488     case BT_S:
1489       if (state == inName)
1490         state = other;
1491       else if (state == inValue
1492                && nAtts < attsMax
1493                && atts[nAtts].normalized
1494                && (ptr == atts[nAtts].valuePtr
1495                    || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1496                    || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1497                    || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1498         atts[nAtts].normalized = 0;
1499       break;
1500     case BT_CR: case BT_LF:
1501       /* This case ensures that the first attribute name is counted
1502          Apart from that we could just change state on the quote. */
1503       if (state == inName)
1504         state = other;
1505       else if (state == inValue && nAtts < attsMax)
1506         atts[nAtts].normalized = 0;
1507       break;
1508     case BT_GT:
1509     case BT_SOL:
1510       if (state != inValue)
1511         return nAtts;
1512       break;
1513     default:
1514       break;
1515     }
1516   }
1517   /* not reached */
1518 }
1519 
1520 static int PTRFASTCALL
PREFIX(charRefNumber)1521 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
1522 {
1523   int result = 0;
1524   /* skip &# */
1525   ptr += 2*MINBPC(enc);
1526   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1527     for (ptr += MINBPC(enc);
1528          !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1529          ptr += MINBPC(enc)) {
1530       int c = BYTE_TO_ASCII(enc, ptr);
1531       switch (c) {
1532       case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1533       case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1534         result <<= 4;
1535         result |= (c - ASCII_0);
1536         break;
1537       case ASCII_A: case ASCII_B: case ASCII_C:
1538       case ASCII_D: case ASCII_E: case ASCII_F:
1539         result <<= 4;
1540         result += 10 + (c - ASCII_A);
1541         break;
1542       case ASCII_a: case ASCII_b: case ASCII_c:
1543       case ASCII_d: case ASCII_e: case ASCII_f:
1544         result <<= 4;
1545         result += 10 + (c - ASCII_a);
1546         break;
1547       }
1548       if (result >= 0x110000)
1549         return -1;
1550     }
1551   }
1552   else {
1553     for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1554       int c = BYTE_TO_ASCII(enc, ptr);
1555       result *= 10;
1556       result += (c - ASCII_0);
1557       if (result >= 0x110000)
1558         return -1;
1559     }
1560   }
1561   return checkCharRefNumber(result);
1562 }
1563 
1564 static int PTRCALL
PREFIX(predefinedEntityName)1565 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1566                              const char *end)
1567 {
1568   switch ((end - ptr)/MINBPC(enc)) {
1569   case 2:
1570     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1571       switch (BYTE_TO_ASCII(enc, ptr)) {
1572       case ASCII_l:
1573         return ASCII_LT;
1574       case ASCII_g:
1575         return ASCII_GT;
1576       }
1577     }
1578     break;
1579   case 3:
1580     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1581       ptr += MINBPC(enc);
1582       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1583         ptr += MINBPC(enc);
1584         if (CHAR_MATCHES(enc, ptr, ASCII_p))
1585           return ASCII_AMP;
1586       }
1587     }
1588     break;
1589   case 4:
1590     switch (BYTE_TO_ASCII(enc, ptr)) {
1591     case ASCII_q:
1592       ptr += MINBPC(enc);
1593       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1594         ptr += MINBPC(enc);
1595         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1596           ptr += MINBPC(enc);
1597           if (CHAR_MATCHES(enc, ptr, ASCII_t))
1598             return ASCII_QUOT;
1599         }
1600       }
1601       break;
1602     case ASCII_a:
1603       ptr += MINBPC(enc);
1604       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1605         ptr += MINBPC(enc);
1606         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1607           ptr += MINBPC(enc);
1608           if (CHAR_MATCHES(enc, ptr, ASCII_s))
1609             return ASCII_APOS;
1610         }
1611       }
1612       break;
1613     }
1614   }
1615   return 0;
1616 }
1617 
1618 static int PTRCALL
PREFIX(sameName)1619 PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1620 {
1621   for (;;) {
1622     switch (BYTE_TYPE(enc, ptr1)) {
1623 #define LEAD_CASE(n) \
1624     case BT_LEAD ## n: \
1625       if (*ptr1++ != *ptr2++) \
1626         return 0;
1627     LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1628 #undef LEAD_CASE
1629       /* fall through */
1630       if (*ptr1++ != *ptr2++)
1631         return 0;
1632       break;
1633     case BT_NONASCII:
1634     case BT_NMSTRT:
1635 #ifdef XML_NS
1636     case BT_COLON:
1637 #endif
1638     case BT_HEX:
1639     case BT_DIGIT:
1640     case BT_NAME:
1641     case BT_MINUS:
1642       if (*ptr2++ != *ptr1++)
1643         return 0;
1644       if (MINBPC(enc) > 1) {
1645         if (*ptr2++ != *ptr1++)
1646           return 0;
1647         if (MINBPC(enc) > 2) {
1648           if (*ptr2++ != *ptr1++)
1649             return 0;
1650           if (MINBPC(enc) > 3) {
1651             if (*ptr2++ != *ptr1++)
1652               return 0;
1653           }
1654         }
1655       }
1656       break;
1657     default:
1658       if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1659         return 1;
1660       switch (BYTE_TYPE(enc, ptr2)) {
1661       case BT_LEAD2:
1662       case BT_LEAD3:
1663       case BT_LEAD4:
1664       case BT_NONASCII:
1665       case BT_NMSTRT:
1666 #ifdef XML_NS
1667       case BT_COLON:
1668 #endif
1669       case BT_HEX:
1670       case BT_DIGIT:
1671       case BT_NAME:
1672       case BT_MINUS:
1673         return 0;
1674       default:
1675         return 1;
1676       }
1677     }
1678   }
1679   /* not reached */
1680 }
1681 
1682 static int PTRCALL
PREFIX(nameMatchesAscii)1683 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1684                          const char *end1, const char *ptr2)
1685 {
1686   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1687     if (ptr1 == end1)
1688       return 0;
1689     if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1690       return 0;
1691   }
1692   return ptr1 == end1;
1693 }
1694 
1695 static int PTRFASTCALL
PREFIX(nameLength)1696 PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1697 {
1698   const char *start = ptr;
1699   for (;;) {
1700     switch (BYTE_TYPE(enc, ptr)) {
1701 #define LEAD_CASE(n) \
1702     case BT_LEAD ## n: ptr += n; break;
1703     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1704 #undef LEAD_CASE
1705     case BT_NONASCII:
1706     case BT_NMSTRT:
1707 #ifdef XML_NS
1708     case BT_COLON:
1709 #endif
1710     case BT_HEX:
1711     case BT_DIGIT:
1712     case BT_NAME:
1713     case BT_MINUS:
1714       ptr += MINBPC(enc);
1715       break;
1716     default:
1717       return ptr - start;
1718     }
1719   }
1720 }
1721 
1722 static const char * PTRFASTCALL
PREFIX(skipS)1723 PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1724 {
1725   for (;;) {
1726     switch (BYTE_TYPE(enc, ptr)) {
1727     case BT_LF:
1728     case BT_CR:
1729     case BT_S:
1730       ptr += MINBPC(enc);
1731       break;
1732     default:
1733       return ptr;
1734     }
1735   }
1736 }
1737 
1738 static void PTRCALL
PREFIX(updatePosition)1739 PREFIX(updatePosition)(const ENCODING *enc,
1740                        const char *ptr,
1741                        const char *end,
1742                        POSITION *pos)
1743 {
1744   while (ptr != end) {
1745     switch (BYTE_TYPE(enc, ptr)) {
1746 #define LEAD_CASE(n) \
1747     case BT_LEAD ## n: \
1748       ptr += n; \
1749       break;
1750     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1751 #undef LEAD_CASE
1752     case BT_LF:
1753       pos->columnNumber = (unsigned)-1;
1754       pos->lineNumber++;
1755       ptr += MINBPC(enc);
1756       break;
1757     case BT_CR:
1758       pos->lineNumber++;
1759       ptr += MINBPC(enc);
1760       if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
1761         ptr += MINBPC(enc);
1762       pos->columnNumber = (unsigned)-1;
1763       break;
1764     default:
1765       ptr += MINBPC(enc);
1766       break;
1767     }
1768     pos->columnNumber++;
1769   }
1770 }
1771 
1772 #undef DO_LEAD_CASE
1773 #undef MULTIBYTE_CASES
1774 #undef INVALID_CASES
1775 #undef CHECK_NAME_CASE
1776 #undef CHECK_NAME_CASES
1777 #undef CHECK_NMSTRT_CASE
1778 #undef CHECK_NMSTRT_CASES
1779 
1780