1 /*
2 Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
3 See the file COPYING for copying permission.
4 */
5 
/* Fallback validity test used by the BT_LEADn cases below: unless the
   including file has already defined IS_INVALID_CHAR, no n-byte sequence
   is ever reported as invalid. */
#ifndef IS_INVALID_CHAR
#define IS_INVALID_CHAR(encoding, p, nbytes) (0)
#endif
9 
/* Expands to the `case BT_LEADn:` arm of a byte-type switch for scanners
   that only need to reject invalid characters.
   - n must be a literal (it is pasted onto BT_LEAD).
   - cur is the scan pointer (an lvalue); it is advanced by n on success.
   - tokOut receives the offending position when the character is invalid.
   The expansion also uses `enc` and `end` from the enclosing scope.
   Returns XML_TOK_PARTIAL_CHAR from the enclosing function when fewer than
   n bytes remain, XML_TOK_INVALID when IS_INVALID_CHAR rejects the bytes. */
#ifndef INVALID_LEAD_CASE
#define INVALID_LEAD_CASE(n, cur, tokOut) \
    case BT_LEAD ## n: \
      if ((end) - (cur) < (n)) \
        return XML_TOK_PARTIAL_CHAR; \
      if (IS_INVALID_CHAR(enc, cur, n)) { \
        *(tokOut) = (cur); \
        return XML_TOK_INVALID; \
      } \
      (cur) += (n); \
      break;
#endif
22 
/* Expands to every switch arm that deals with bytes which cannot simply be
   skipped: the three multi-byte lead cases (via INVALID_LEAD_CASE) plus the
   byte types that are always an error.  For the latter, *tokOut is set to
   the current position and the enclosing function returns XML_TOK_INVALID. */
#define INVALID_CASES(cur, tokOut) \
  INVALID_LEAD_CASE(2, cur, tokOut) \
  INVALID_LEAD_CASE(3, cur, tokOut) \
  INVALID_LEAD_CASE(4, cur, tokOut) \
  case BT_TRAIL: \
  case BT_MALFORM: \
  case BT_NONXML: \
    *(tokOut) = (cur); \
    return XML_TOK_INVALID;
32 
/* Expands to the `case BT_LEADn:` arm used while scanning the inside of a
   name.  n must be a literal (it is pasted onto BT_LEAD); cur must be an
   lvalue.  Returns XML_TOK_PARTIAL_CHAR from the enclosing function when
   fewer than n bytes remain before limit, and XML_TOK_INVALID (with *tokOut
   set to cur) when IS_NAME_CHAR rejects the character; otherwise advances
   cur past the character. */
#define CHECK_NAME_CASE(n, enc, cur, limit, tokOut) \
   case BT_LEAD ## n: \
     if ((limit) - (cur) < (n)) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NAME_CHAR(enc, cur, n)) { \
       *(tokOut) = (cur); \
       return XML_TOK_INVALID; \
     } \
     (cur) += (n); \
     break;
43 
/* Expands to all switch arms that accept one name character and advance
   cur past it.  BT_NONASCII characters are first vetted with
   IS_NAME_CHAR_MINBPC (XML_TOK_INVALID is returned from the enclosing
   function on failure) and then share the single-unit advance with the
   ASCII name byte types.  Multi-byte characters are handled by
   CHECK_NAME_CASE. */
#define CHECK_NAME_CASES(enc, cur, limit, tokOut) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, cur)) { \
      *(tokOut) = (cur); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    (cur) += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, cur, limit, tokOut) \
  CHECK_NAME_CASE(3, enc, cur, limit, tokOut) \
  CHECK_NAME_CASE(4, enc, cur, limit, tokOut)
60 
/* Expands to the `case BT_LEADn:` arm used for the first character of a
   name.  n must be a literal (it is pasted onto BT_LEAD); cur must be an
   lvalue.  Returns XML_TOK_PARTIAL_CHAR from the enclosing function when
   fewer than n bytes remain before limit, and XML_TOK_INVALID (with *tokOut
   set to cur) when IS_NMSTRT_CHAR rejects the character; otherwise advances
   cur past the character. */
#define CHECK_NMSTRT_CASE(n, enc, cur, limit, tokOut) \
   case BT_LEAD ## n: \
     if ((limit) - (cur) < (n)) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NMSTRT_CHAR(enc, cur, n)) { \
       *(tokOut) = (cur); \
       return XML_TOK_INVALID; \
     } \
     (cur) += (n); \
     break;
71 
/* Expands to all switch arms that accept one name-start character and
   advance cur past it.  BT_NONASCII characters are first vetted with
   IS_NMSTRT_CHAR_MINBPC (XML_TOK_INVALID is returned from the enclosing
   function on failure) and then share the single-unit advance with
   BT_NMSTRT and BT_HEX.  Multi-byte characters are handled by
   CHECK_NMSTRT_CASE. */
#define CHECK_NMSTRT_CASES(enc, cur, limit, tokOut) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, cur)) { \
      *(tokOut) = (cur); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    (cur) += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, cur, limit, tokOut) \
  CHECK_NMSTRT_CASE(3, enc, cur, limit, tokOut) \
  CHECK_NMSTRT_CASE(4, enc, cur, limit, tokOut)
85 
/* Name-decoration hook for the functions in this file: an including file
   may define PREFIX beforehand; by default names are used unchanged. */
#ifndef PREFIX
#define PREFIX(name) name
#endif
89 
90 /* ptr points to character following "<!-" */
91 
92 static
PREFIX(scanComment)93 int PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
94                         const char **nextTokPtr)
95 {
96   if (ptr != end) {
97     if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
98       *nextTokPtr = ptr;
99       return XML_TOK_INVALID;
100     }
101     ptr += MINBPC(enc);
102     while (ptr != end) {
103       switch (BYTE_TYPE(enc, ptr)) {
104       INVALID_CASES(ptr, nextTokPtr)
105       case BT_MINUS:
106         if ((ptr += MINBPC(enc)) == end)
107           return XML_TOK_PARTIAL;
108         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
109           if ((ptr += MINBPC(enc)) == end)
110             return XML_TOK_PARTIAL;
111           if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
112             *nextTokPtr = ptr;
113             return XML_TOK_INVALID;
114           }
115           *nextTokPtr = ptr + MINBPC(enc);
116           return XML_TOK_COMMENT;
117         }
118         break;
119       default:
120         ptr += MINBPC(enc);
121         break;
122       }
123     }
124   }
125   return XML_TOK_PARTIAL;
126 }
127 
128 /* ptr points to character following "<!" */
129 
130 static
PREFIX(scanDecl)131 int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
132                      const char **nextTokPtr)
133 {
134   if (ptr == end)
135     return XML_TOK_PARTIAL;
136   switch (BYTE_TYPE(enc, ptr)) {
137   case BT_MINUS:
138     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
139   case BT_LSQB:
140     *nextTokPtr = ptr + MINBPC(enc);
141     return XML_TOK_COND_SECT_OPEN;
142   case BT_NMSTRT:
143   case BT_HEX:
144     ptr += MINBPC(enc);
145     break;
146   default:
147     *nextTokPtr = ptr;
148     return XML_TOK_INVALID;
149   }
150   while (ptr != end) {
151     switch (BYTE_TYPE(enc, ptr)) {
152     case BT_PERCNT:
153       if (ptr + MINBPC(enc) == end)
154         return XML_TOK_PARTIAL;
155       /* don't allow <!ENTITY% foo "whatever"> */
156       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
157       case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
158         *nextTokPtr = ptr;
159         return XML_TOK_INVALID;
160       }
161       /* fall through */
162     case BT_S: case BT_CR: case BT_LF:
163       *nextTokPtr = ptr;
164       return XML_TOK_DECL_OPEN;
165     case BT_NMSTRT:
166     case BT_HEX:
167       ptr += MINBPC(enc);
168       break;
169     default:
170       *nextTokPtr = ptr;
171       return XML_TOK_INVALID;
172     }
173   }
174   return XML_TOK_PARTIAL;
175 }
176 
177 static
PREFIX(checkPiTarget)178 int PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end, int *tokPtr)
179 {
180   int upper = 0;
181   itkExpatUnused(enc);
182   *tokPtr = XML_TOK_PI;
183   if (end - ptr != MINBPC(enc)*3)
184     return 1;
185   switch (BYTE_TO_ASCII(enc, ptr)) {
186   case ASCII_x:
187     break;
188   case ASCII_X:
189     upper = 1;
190     break;
191   default:
192     return 1;
193   }
194   ptr += MINBPC(enc);
195   switch (BYTE_TO_ASCII(enc, ptr)) {
196   case ASCII_m:
197     break;
198   case ASCII_M:
199     upper = 1;
200     break;
201   default:
202     return 1;
203   }
204   ptr += MINBPC(enc);
205   switch (BYTE_TO_ASCII(enc, ptr)) {
206   case ASCII_l:
207     break;
208   case ASCII_L:
209     upper = 1;
210     break;
211   default:
212     return 1;
213   }
214   if (upper)
215     return 0;
216   *tokPtr = XML_TOK_XML_DECL;
217   return 1;
218 }
219 
220 /* ptr points to character following "<?" */
221 
222 static
PREFIX(scanPi)223 int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
224                    const char **nextTokPtr)
225 {
226   int tok;
227   const char *target = ptr;
228   if (ptr == end)
229     return XML_TOK_PARTIAL;
230   switch (BYTE_TYPE(enc, ptr)) {
231   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
232   default:
233     *nextTokPtr = ptr;
234     return XML_TOK_INVALID;
235   }
236   while (ptr != end) {
237     switch (BYTE_TYPE(enc, ptr)) {
238     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
239     case BT_S: case BT_CR: case BT_LF:
240       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
241         *nextTokPtr = ptr;
242         return XML_TOK_INVALID;
243       }
244       ptr += MINBPC(enc);
245       while (ptr != end) {
246         switch (BYTE_TYPE(enc, ptr)) {
247         INVALID_CASES(ptr, nextTokPtr)
248         case BT_QUEST:
249           ptr += MINBPC(enc);
250           if (ptr == end)
251             return XML_TOK_PARTIAL;
252           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
253             *nextTokPtr = ptr + MINBPC(enc);
254             return tok;
255           }
256           break;
257         default:
258           ptr += MINBPC(enc);
259           break;
260         }
261       }
262       return XML_TOK_PARTIAL;
263     case BT_QUEST:
264       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
265         *nextTokPtr = ptr;
266         return XML_TOK_INVALID;
267       }
268       ptr += MINBPC(enc);
269       if (ptr == end)
270         return XML_TOK_PARTIAL;
271       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
272         *nextTokPtr = ptr + MINBPC(enc);
273         return tok;
274       }
275       /* fall through */
276     default:
277       *nextTokPtr = ptr;
278       return XML_TOK_INVALID;
279     }
280   }
281   return XML_TOK_PARTIAL;
282 }
283 
284 
285 static
PREFIX(scanCdataSection)286 int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
287                              const char **nextTokPtr)
288 {
289   static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB };
290   int i;
291   itkExpatUnused(enc);
292   /* CDATA[ */
293   if (end - ptr < 6 * MINBPC(enc))
294     return XML_TOK_PARTIAL;
295   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
296     if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
297       *nextTokPtr = ptr;
298       return XML_TOK_INVALID;
299     }
300   }
301   *nextTokPtr = ptr;
302   return XML_TOK_CDATA_SECT_OPEN;
303 }
304 
305 static
PREFIX(cdataSectionTok)306 int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
307                             const char **nextTokPtr)
308 {
309   if (ptr == end)
310     return XML_TOK_NONE;
311   if (MINBPC(enc) > 1) {
312     size_t n = end - ptr;
313     if (n & (MINBPC(enc) - 1)) {
314       n &= ~(MINBPC(enc) - 1);
315       if (n == 0)
316         return XML_TOK_PARTIAL;
317       end = ptr + n;
318     }
319   }
320   switch (BYTE_TYPE(enc, ptr)) {
321   case BT_RSQB:
322     ptr += MINBPC(enc);
323     if (ptr == end)
324       return XML_TOK_PARTIAL;
325     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
326       break;
327     ptr += MINBPC(enc);
328     if (ptr == end)
329       return XML_TOK_PARTIAL;
330     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
331       ptr -= MINBPC(enc);
332       break;
333     }
334     *nextTokPtr = ptr + MINBPC(enc);
335     return XML_TOK_CDATA_SECT_CLOSE;
336   case BT_CR:
337     ptr += MINBPC(enc);
338     if (ptr == end)
339       return XML_TOK_PARTIAL;
340     if (BYTE_TYPE(enc, ptr) == BT_LF)
341       ptr += MINBPC(enc);
342     *nextTokPtr = ptr;
343     return XML_TOK_DATA_NEWLINE;
344   case BT_LF:
345     *nextTokPtr = ptr + MINBPC(enc);
346     return XML_TOK_DATA_NEWLINE;
347   INVALID_CASES(ptr, nextTokPtr)
348   default:
349     ptr += MINBPC(enc);
350     break;
351   }
352   while (ptr != end) {
353     switch (BYTE_TYPE(enc, ptr)) {
354 #define LEAD_CASE(n) \
355     case BT_LEAD ## n: \
356       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
357         *nextTokPtr = ptr; \
358         return XML_TOK_DATA_CHARS; \
359       } \
360       ptr += n; \
361       break;
362     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
363 #undef LEAD_CASE
364     case BT_NONXML:
365     case BT_MALFORM:
366     case BT_TRAIL:
367     case BT_CR:
368     case BT_LF:
369     case BT_RSQB:
370       *nextTokPtr = ptr;
371       return XML_TOK_DATA_CHARS;
372     default:
373       ptr += MINBPC(enc);
374       break;
375     }
376   }
377   *nextTokPtr = ptr;
378   return XML_TOK_DATA_CHARS;
379 }
380 
381 /* ptr points to character following "</" */
382 
383 static
PREFIX(scanEndTag)384 int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
385                        const char **nextTokPtr)
386 {
387   if (ptr == end)
388     return XML_TOK_PARTIAL;
389   switch (BYTE_TYPE(enc, ptr)) {
390   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
391   default:
392     *nextTokPtr = ptr;
393     return XML_TOK_INVALID;
394   }
395   while (ptr != end) {
396     switch (BYTE_TYPE(enc, ptr)) {
397     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
398     case BT_S: case BT_CR: case BT_LF:
399       for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
400         switch (BYTE_TYPE(enc, ptr)) {
401         case BT_S: case BT_CR: case BT_LF:
402           break;
403         case BT_GT:
404           *nextTokPtr = ptr + MINBPC(enc);
405           return XML_TOK_END_TAG;
406         default:
407           *nextTokPtr = ptr;
408           return XML_TOK_INVALID;
409         }
410       }
411       return XML_TOK_PARTIAL;
412 #ifdef XML_NS
413     case BT_COLON:
414       /* no need to check qname syntax here, since end-tag must match exactly */
415       ptr += MINBPC(enc);
416       break;
417 #endif
418     case BT_GT:
419       *nextTokPtr = ptr + MINBPC(enc);
420       return XML_TOK_END_TAG;
421     default:
422       *nextTokPtr = ptr;
423       return XML_TOK_INVALID;
424     }
425   }
426   return XML_TOK_PARTIAL;
427 }
428 
429 /* ptr points to character following "&#X" */
430 
431 static
PREFIX(scanHexCharRef)432 int PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
433                            const char **nextTokPtr)
434 {
435   if (ptr != end) {
436     switch (BYTE_TYPE(enc, ptr)) {
437     case BT_DIGIT:
438     case BT_HEX:
439       break;
440     default:
441       *nextTokPtr = ptr;
442       return XML_TOK_INVALID;
443     }
444     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
445       switch (BYTE_TYPE(enc, ptr)) {
446       case BT_DIGIT:
447       case BT_HEX:
448         break;
449       case BT_SEMI:
450         *nextTokPtr = ptr + MINBPC(enc);
451         return XML_TOK_CHAR_REF;
452       default:
453         *nextTokPtr = ptr;
454         return XML_TOK_INVALID;
455       }
456     }
457   }
458   return XML_TOK_PARTIAL;
459 }
460 
461 /* ptr points to character following "&#" */
462 
463 static
PREFIX(scanCharRef)464 int PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
465                         const char **nextTokPtr)
466 {
467   if (ptr != end) {
468     if (CHAR_MATCHES(enc, ptr, ASCII_x))
469       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
470     switch (BYTE_TYPE(enc, ptr)) {
471     case BT_DIGIT:
472       break;
473     default:
474       *nextTokPtr = ptr;
475       return XML_TOK_INVALID;
476     }
477     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
478       switch (BYTE_TYPE(enc, ptr)) {
479       case BT_DIGIT:
480         break;
481       case BT_SEMI:
482         *nextTokPtr = ptr + MINBPC(enc);
483         return XML_TOK_CHAR_REF;
484       default:
485         *nextTokPtr = ptr;
486         return XML_TOK_INVALID;
487       }
488     }
489   }
490   return XML_TOK_PARTIAL;
491 }
492 
493 /* ptr points to character following "&" */
494 
495 static
PREFIX(scanRef)496 int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
497                     const char **nextTokPtr)
498 {
499   if (ptr == end)
500     return XML_TOK_PARTIAL;
501   switch (BYTE_TYPE(enc, ptr)) {
502   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
503   case BT_NUM:
504     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
505   default:
506     *nextTokPtr = ptr;
507     return XML_TOK_INVALID;
508   }
509   while (ptr != end) {
510     switch (BYTE_TYPE(enc, ptr)) {
511     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
512     case BT_SEMI:
513       *nextTokPtr = ptr + MINBPC(enc);
514       return XML_TOK_ENTITY_REF;
515     default:
516       *nextTokPtr = ptr;
517       return XML_TOK_INVALID;
518     }
519   }
520   return XML_TOK_PARTIAL;
521 }
522 
523 /* ptr points to character following first character of attribute name */
524 
525 static
PREFIX(scanAtts)526 int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
527                      const char **nextTokPtr)
528 {
529 #ifdef XML_NS
530   int hadColon = 0;
531 #endif
532   while (ptr != end) {
533     switch (BYTE_TYPE(enc, ptr)) {
534     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
535 #ifdef XML_NS
536     case BT_COLON:
537       if (hadColon) {
538         *nextTokPtr = ptr;
539         return XML_TOK_INVALID;
540       }
541       hadColon = 1;
542       ptr += MINBPC(enc);
543       if (ptr == end)
544         return XML_TOK_PARTIAL;
545       switch (BYTE_TYPE(enc, ptr)) {
546       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
547       default:
548         *nextTokPtr = ptr;
549         return XML_TOK_INVALID;
550       }
551       break;
552 #endif
553     case BT_S: case BT_CR: case BT_LF:
554       for (;;) {
555         int t;
556 
557         ptr += MINBPC(enc);
558         if (ptr == end)
559           return XML_TOK_PARTIAL;
560         t = BYTE_TYPE(enc, ptr);
561         if (t == BT_EQUALS)
562           break;
563         switch (t) {
564         case BT_S:
565         case BT_LF:
566         case BT_CR:
567           break;
568         default:
569           *nextTokPtr = ptr;
570           return XML_TOK_INVALID;
571         }
572       }
573     /* fall through */
574     case BT_EQUALS:
575       {
576         int open;
577 #ifdef XML_NS
578         hadColon = 0;
579 #endif
580         for (;;) {
581 
582           ptr += MINBPC(enc);
583           if (ptr == end)
584             return XML_TOK_PARTIAL;
585           open = BYTE_TYPE(enc, ptr);
586           if (open == BT_QUOT || open == BT_APOS)
587             break;
588           switch (open) {
589           case BT_S:
590           case BT_LF:
591           case BT_CR:
592             break;
593           default:
594             *nextTokPtr = ptr;
595             return XML_TOK_INVALID;
596           }
597         }
598         ptr += MINBPC(enc);
599         /* in attribute value */
600         for (;;) {
601           int t;
602           if (ptr == end)
603             return XML_TOK_PARTIAL;
604           t = BYTE_TYPE(enc, ptr);
605           if (t == open)
606             break;
607           switch (t) {
608           INVALID_CASES(ptr, nextTokPtr)
609           case BT_AMP:
610             {
611               int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
612               if (tok <= 0) {
613                 if (tok == XML_TOK_INVALID)
614                   *nextTokPtr = ptr;
615                 return tok;
616               }
617               break;
618             }
619           case BT_LT:
620             *nextTokPtr = ptr;
621             return XML_TOK_INVALID;
622           default:
623             ptr += MINBPC(enc);
624             break;
625           }
626         }
627         ptr += MINBPC(enc);
628         if (ptr == end)
629           return XML_TOK_PARTIAL;
630         switch (BYTE_TYPE(enc, ptr)) {
631         case BT_S:
632         case BT_CR:
633         case BT_LF:
634           break;
635         case BT_SOL:
636           goto sol;
637         case BT_GT:
638           goto gt;
639         default:
640           *nextTokPtr = ptr;
641           return XML_TOK_INVALID;
642         }
643         /* ptr points to closing quote */
644         for (;;) {
645           ptr += MINBPC(enc);
646           if (ptr == end)
647             return XML_TOK_PARTIAL;
648           switch (BYTE_TYPE(enc, ptr)) {
649           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
650           case BT_S: case BT_CR: case BT_LF:
651             continue;
652           case BT_GT:
653           gt:
654             *nextTokPtr = ptr + MINBPC(enc);
655             return XML_TOK_START_TAG_WITH_ATTS;
656           case BT_SOL:
657           sol:
658             ptr += MINBPC(enc);
659             if (ptr == end)
660               return XML_TOK_PARTIAL;
661             if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
662               *nextTokPtr = ptr;
663               return XML_TOK_INVALID;
664             }
665             *nextTokPtr = ptr + MINBPC(enc);
666             return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
667           default:
668             *nextTokPtr = ptr;
669             return XML_TOK_INVALID;
670           }
671           break;
672         }
673         break;
674       }
675     default:
676       *nextTokPtr = ptr;
677       return XML_TOK_INVALID;
678     }
679   }
680   return XML_TOK_PARTIAL;
681 }
682 
683 /* ptr points to character following "<" */
684 
685 static
PREFIX(scanLt)686 int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
687                    const char **nextTokPtr)
688 {
689 #ifdef XML_NS
690   int hadColon;
691 #endif
692   if (ptr == end)
693     return XML_TOK_PARTIAL;
694   switch (BYTE_TYPE(enc, ptr)) {
695   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
696   case BT_EXCL:
697     if ((ptr += MINBPC(enc)) == end)
698       return XML_TOK_PARTIAL;
699     switch (BYTE_TYPE(enc, ptr)) {
700     case BT_MINUS:
701       return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
702     case BT_LSQB:
703       return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
704     }
705     *nextTokPtr = ptr;
706     return XML_TOK_INVALID;
707   case BT_QUEST:
708     return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
709   case BT_SOL:
710     return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
711   default:
712     *nextTokPtr = ptr;
713     return XML_TOK_INVALID;
714   }
715 #ifdef XML_NS
716   hadColon = 0;
717 #endif
718   /* we have a start-tag */
719   while (ptr != end) {
720     switch (BYTE_TYPE(enc, ptr)) {
721     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
722 #ifdef XML_NS
723     case BT_COLON:
724       if (hadColon) {
725         *nextTokPtr = ptr;
726         return XML_TOK_INVALID;
727       }
728       hadColon = 1;
729       ptr += MINBPC(enc);
730       if (ptr == end)
731         return XML_TOK_PARTIAL;
732       switch (BYTE_TYPE(enc, ptr)) {
733       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
734       default:
735         *nextTokPtr = ptr;
736         return XML_TOK_INVALID;
737       }
738       break;
739 #endif
740     case BT_S: case BT_CR: case BT_LF:
741       {
742         ptr += MINBPC(enc);
743         while (ptr != end) {
744           switch (BYTE_TYPE(enc, ptr)) {
745           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
746           case BT_GT:
747             goto gt;
748           case BT_SOL:
749             goto sol;
750           case BT_S: case BT_CR: case BT_LF:
751             ptr += MINBPC(enc);
752             continue;
753           default:
754             *nextTokPtr = ptr;
755             return XML_TOK_INVALID;
756           }
757           return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
758         }
759         return XML_TOK_PARTIAL;
760       }
761     case BT_GT:
762     gt:
763       *nextTokPtr = ptr + MINBPC(enc);
764       return XML_TOK_START_TAG_NO_ATTS;
765     case BT_SOL:
766     sol:
767       ptr += MINBPC(enc);
768       if (ptr == end)
769         return XML_TOK_PARTIAL;
770       if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
771         *nextTokPtr = ptr;
772         return XML_TOK_INVALID;
773       }
774       *nextTokPtr = ptr + MINBPC(enc);
775       return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
776     default:
777       *nextTokPtr = ptr;
778       return XML_TOK_INVALID;
779     }
780   }
781   return XML_TOK_PARTIAL;
782 }
783 
784 static
PREFIX(contentTok)785 int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
786                        const char **nextTokPtr)
787 {
788   if (ptr == end)
789     return XML_TOK_NONE;
790   if (MINBPC(enc) > 1) {
791     size_t n = end - ptr;
792     if (n & (MINBPC(enc) - 1)) {
793       n &= ~(MINBPC(enc) - 1);
794       if (n == 0)
795         return XML_TOK_PARTIAL;
796       end = ptr + n;
797     }
798   }
799   switch (BYTE_TYPE(enc, ptr)) {
800   case BT_LT:
801     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
802   case BT_AMP:
803     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
804   case BT_CR:
805     ptr += MINBPC(enc);
806     if (ptr == end)
807       return XML_TOK_TRAILING_CR;
808     if (BYTE_TYPE(enc, ptr) == BT_LF)
809       ptr += MINBPC(enc);
810     *nextTokPtr = ptr;
811     return XML_TOK_DATA_NEWLINE;
812   case BT_LF:
813     *nextTokPtr = ptr + MINBPC(enc);
814     return XML_TOK_DATA_NEWLINE;
815   case BT_RSQB:
816     ptr += MINBPC(enc);
817     if (ptr == end)
818       return XML_TOK_TRAILING_RSQB;
819     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
820       break;
821     ptr += MINBPC(enc);
822     if (ptr == end)
823       return XML_TOK_TRAILING_RSQB;
824     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
825       ptr -= MINBPC(enc);
826       break;
827     }
828     *nextTokPtr = ptr;
829     return XML_TOK_INVALID;
830   INVALID_CASES(ptr, nextTokPtr)
831   default:
832     ptr += MINBPC(enc);
833     break;
834   }
835   while (ptr != end) {
836     switch (BYTE_TYPE(enc, ptr)) {
837 #define LEAD_CASE(n) \
838     case BT_LEAD ## n: \
839       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
840         *nextTokPtr = ptr; \
841         return XML_TOK_DATA_CHARS; \
842       } \
843       ptr += n; \
844       break;
845     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
846 #undef LEAD_CASE
847     case BT_RSQB:
848       if (ptr + MINBPC(enc) != end) {
849          if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
850            ptr += MINBPC(enc);
851            break;
852          }
853          if (ptr + 2*MINBPC(enc) != end) {
854            if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
855              ptr += MINBPC(enc);
856              break;
857            }
858            *nextTokPtr = ptr + 2*MINBPC(enc);
859            return XML_TOK_INVALID;
860          }
861       }
862       /* fall through */
863     case BT_AMP:
864     case BT_LT:
865     case BT_NONXML:
866     case BT_MALFORM:
867     case BT_TRAIL:
868     case BT_CR:
869     case BT_LF:
870       *nextTokPtr = ptr;
871       return XML_TOK_DATA_CHARS;
872     default:
873       ptr += MINBPC(enc);
874       break;
875     }
876   }
877   *nextTokPtr = ptr;
878   return XML_TOK_DATA_CHARS;
879 }
880 
881 /* ptr points to character following "%" */
882 
883 static
PREFIX(scanPercent)884 int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
885                         const char **nextTokPtr)
886 {
887   if (ptr == end)
888     return XML_TOK_PARTIAL;
889   switch (BYTE_TYPE(enc, ptr)) {
890   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
891   case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
892     *nextTokPtr = ptr;
893     return XML_TOK_PERCENT;
894   default:
895     *nextTokPtr = ptr;
896     return XML_TOK_INVALID;
897   }
898   while (ptr != end) {
899     switch (BYTE_TYPE(enc, ptr)) {
900     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
901     case BT_SEMI:
902       *nextTokPtr = ptr + MINBPC(enc);
903       return XML_TOK_PARAM_ENTITY_REF;
904     default:
905       *nextTokPtr = ptr;
906       return XML_TOK_INVALID;
907     }
908   }
909   return XML_TOK_PARTIAL;
910 }
911 
912 static
PREFIX(scanPoundName)913 int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
914                           const char **nextTokPtr)
915 {
916   if (ptr == end)
917     return XML_TOK_PARTIAL;
918   switch (BYTE_TYPE(enc, ptr)) {
919   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
920   default:
921     *nextTokPtr = ptr;
922     return XML_TOK_INVALID;
923   }
924   while (ptr != end) {
925     switch (BYTE_TYPE(enc, ptr)) {
926     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
927     case BT_CR: case BT_LF: case BT_S:
928     case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
929       *nextTokPtr = ptr;
930       return XML_TOK_POUND_NAME;
931     default:
932       *nextTokPtr = ptr;
933       return XML_TOK_INVALID;
934     }
935   }
936   return -XML_TOK_POUND_NAME;
937 }
938 
939 static
PREFIX(scanLit)940 int PREFIX(scanLit)(int open, const ENCODING *enc,
941                     const char *ptr, const char *end,
942                     const char **nextTokPtr)
943 {
944   while (ptr != end) {
945     int t = BYTE_TYPE(enc, ptr);
946     switch (t) {
947     INVALID_CASES(ptr, nextTokPtr)
948     case BT_QUOT:
949     case BT_APOS:
950       ptr += MINBPC(enc);
951       if (t != open)
952         break;
953       if (ptr == end)
954         return -XML_TOK_LITERAL;
955       *nextTokPtr = ptr;
956       switch (BYTE_TYPE(enc, ptr)) {
957       case BT_S: case BT_CR: case BT_LF:
958       case BT_GT: case BT_PERCNT: case BT_LSQB:
959         return XML_TOK_LITERAL;
960       default:
961         return XML_TOK_INVALID;
962       }
963     default:
964       ptr += MINBPC(enc);
965       break;
966     }
967   }
968   return XML_TOK_PARTIAL;
969 }
970 
971 static
PREFIX(prologTok)972 int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
973                       const char **nextTokPtr)
974 {
975   int tok;
976   if (ptr == end)
977     return XML_TOK_NONE;
978   if (MINBPC(enc) > 1) {
979     size_t n = end - ptr;
980     if (n & (MINBPC(enc) - 1)) {
981       n &= ~(MINBPC(enc) - 1);
982       if (n == 0)
983         return XML_TOK_PARTIAL;
984       end = ptr + n;
985     }
986   }
987   switch (BYTE_TYPE(enc, ptr)) {
988   case BT_QUOT:
989     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
990   case BT_APOS:
991     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
992   case BT_LT:
993     {
994       ptr += MINBPC(enc);
995       if (ptr == end)
996         return XML_TOK_PARTIAL;
997       switch (BYTE_TYPE(enc, ptr)) {
998       case BT_EXCL:
999         return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1000       case BT_QUEST:
1001         return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1002       case BT_NMSTRT:
1003       case BT_HEX:
1004       case BT_NONASCII:
1005       case BT_LEAD2:
1006       case BT_LEAD3:
1007       case BT_LEAD4:
1008         *nextTokPtr = ptr - MINBPC(enc);
1009         return XML_TOK_INSTANCE_START;
1010       }
1011       *nextTokPtr = ptr;
1012       return XML_TOK_INVALID;
1013     }
1014   case BT_CR:
1015     if (ptr + MINBPC(enc) == end)
1016       return -XML_TOK_PROLOG_S;
1017     /* fall through */
1018   case BT_S: case BT_LF:
1019     for (;;) {
1020       ptr += MINBPC(enc);
1021       if (ptr == end)
1022         break;
1023       switch (BYTE_TYPE(enc, ptr)) {
1024       case BT_S: case BT_LF:
1025         break;
1026       case BT_CR:
1027         /* don't split CR/LF pair */
1028         if (ptr + MINBPC(enc) != end)
1029           break;
1030         /* fall through */
1031       default:
1032         *nextTokPtr = ptr;
1033         return XML_TOK_PROLOG_S;
1034       }
1035     }
1036     *nextTokPtr = ptr;
1037     return XML_TOK_PROLOG_S;
1038   case BT_PERCNT:
1039     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1040   case BT_COMMA:
1041     *nextTokPtr = ptr + MINBPC(enc);
1042     return XML_TOK_COMMA;
1043   case BT_LSQB:
1044     *nextTokPtr = ptr + MINBPC(enc);
1045     return XML_TOK_OPEN_BRACKET;
1046   case BT_RSQB:
1047     ptr += MINBPC(enc);
1048     if (ptr == end)
1049       return -XML_TOK_CLOSE_BRACKET;
1050     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1051       if (ptr + MINBPC(enc) == end)
1052         return XML_TOK_PARTIAL;
1053       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1054         *nextTokPtr = ptr + 2*MINBPC(enc);
1055         return XML_TOK_COND_SECT_CLOSE;
1056       }
1057     }
1058     *nextTokPtr = ptr;
1059     return XML_TOK_CLOSE_BRACKET;
1060   case BT_LPAR:
1061     *nextTokPtr = ptr + MINBPC(enc);
1062     return XML_TOK_OPEN_PAREN;
1063   case BT_RPAR:
1064     ptr += MINBPC(enc);
1065     if (ptr == end)
1066       return -XML_TOK_CLOSE_PAREN;
1067     switch (BYTE_TYPE(enc, ptr)) {
1068     case BT_AST:
1069       *nextTokPtr = ptr + MINBPC(enc);
1070       return XML_TOK_CLOSE_PAREN_ASTERISK;
1071     case BT_QUEST:
1072       *nextTokPtr = ptr + MINBPC(enc);
1073       return XML_TOK_CLOSE_PAREN_QUESTION;
1074     case BT_PLUS:
1075       *nextTokPtr = ptr + MINBPC(enc);
1076       return XML_TOK_CLOSE_PAREN_PLUS;
1077     case BT_CR: case BT_LF: case BT_S:
1078     case BT_GT: case BT_COMMA: case BT_VERBAR:
1079     case BT_RPAR:
1080       *nextTokPtr = ptr;
1081       return XML_TOK_CLOSE_PAREN;
1082     }
1083     *nextTokPtr = ptr;
1084     return XML_TOK_INVALID;
1085   case BT_VERBAR:
1086     *nextTokPtr = ptr + MINBPC(enc);
1087     return XML_TOK_OR;
1088   case BT_GT:
1089     *nextTokPtr = ptr + MINBPC(enc);
1090     return XML_TOK_DECL_CLOSE;
1091   case BT_NUM:
1092     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1093 #define LEAD_CASE(n) \
1094   case BT_LEAD ## n: \
1095     if (end - ptr < n) \
1096       return XML_TOK_PARTIAL_CHAR; \
1097     if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1098       ptr += n; \
1099       tok = XML_TOK_NAME; \
1100       break; \
1101     } \
1102     if (IS_NAME_CHAR(enc, ptr, n)) { \
1103       ptr += n; \
1104       tok = XML_TOK_NMTOKEN; \
1105       break; \
1106     } \
1107     *nextTokPtr = ptr; \
1108     return XML_TOK_INVALID;
1109     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1110 #undef LEAD_CASE
1111   case BT_NMSTRT:
1112   case BT_HEX:
1113     tok = XML_TOK_NAME;
1114     ptr += MINBPC(enc);
1115     break;
1116   case BT_DIGIT:
1117   case BT_NAME:
1118   case BT_MINUS:
1119 #ifdef XML_NS
1120   case BT_COLON:
1121 #endif
1122     tok = XML_TOK_NMTOKEN;
1123     ptr += MINBPC(enc);
1124     break;
1125   case BT_NONASCII:
1126     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1127       ptr += MINBPC(enc);
1128       tok = XML_TOK_NAME;
1129       break;
1130     }
1131     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1132       ptr += MINBPC(enc);
1133       tok = XML_TOK_NMTOKEN;
1134       break;
1135     }
1136     /* fall through */
1137   default:
1138     *nextTokPtr = ptr;
1139     return XML_TOK_INVALID;
1140   }
1141   while (ptr != end) {
1142     switch (BYTE_TYPE(enc, ptr)) {
1143     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1144     case BT_GT: case BT_RPAR: case BT_COMMA:
1145     case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1146     case BT_S: case BT_CR: case BT_LF:
1147       *nextTokPtr = ptr;
1148       return tok;
1149 #ifdef XML_NS
1150     case BT_COLON:
1151       ptr += MINBPC(enc);
1152       switch (tok) {
1153       case XML_TOK_NAME:
1154         if (ptr == end)
1155           return XML_TOK_PARTIAL;
1156         tok = XML_TOK_PREFIXED_NAME;
1157         switch (BYTE_TYPE(enc, ptr)) {
1158         CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1159         default:
1160           tok = XML_TOK_NMTOKEN;
1161           break;
1162         }
1163         break;
1164       case XML_TOK_PREFIXED_NAME:
1165         tok = XML_TOK_NMTOKEN;
1166         break;
1167       }
1168       break;
1169 #endif
1170     case BT_PLUS:
1171       if (tok == XML_TOK_NMTOKEN)  {
1172         *nextTokPtr = ptr;
1173         return XML_TOK_INVALID;
1174       }
1175       *nextTokPtr = ptr + MINBPC(enc);
1176       return XML_TOK_NAME_PLUS;
1177     case BT_AST:
1178       if (tok == XML_TOK_NMTOKEN)  {
1179         *nextTokPtr = ptr;
1180         return XML_TOK_INVALID;
1181       }
1182       *nextTokPtr = ptr + MINBPC(enc);
1183       return XML_TOK_NAME_ASTERISK;
1184     case BT_QUEST:
1185       if (tok == XML_TOK_NMTOKEN)  {
1186         *nextTokPtr = ptr;
1187         return XML_TOK_INVALID;
1188       }
1189       *nextTokPtr = ptr + MINBPC(enc);
1190       return XML_TOK_NAME_QUESTION;
1191     default:
1192       *nextTokPtr = ptr;
1193       return XML_TOK_INVALID;
1194     }
1195   }
1196   return -tok;
1197 }
1198 
1199 static
PREFIX(attributeValueTok)1200 int PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1201                               const char **nextTokPtr)
1202 {
1203   const char *start;
1204   if (ptr == end)
1205     return XML_TOK_NONE;
1206   start = ptr;
1207   while (ptr != end) {
1208     switch (BYTE_TYPE(enc, ptr)) {
1209 #define LEAD_CASE(n) \
1210     case BT_LEAD ## n: ptr += n; break;
1211     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1212 #undef LEAD_CASE
1213     case BT_AMP:
1214       if (ptr == start)
1215         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1216       *nextTokPtr = ptr;
1217       return XML_TOK_DATA_CHARS;
1218     case BT_LT:
1219       /* this is for inside entity references */
1220       *nextTokPtr = ptr;
1221       return XML_TOK_INVALID;
1222     case BT_LF:
1223       if (ptr == start) {
1224         *nextTokPtr = ptr + MINBPC(enc);
1225         return XML_TOK_DATA_NEWLINE;
1226       }
1227       *nextTokPtr = ptr;
1228       return XML_TOK_DATA_CHARS;
1229     case BT_CR:
1230       if (ptr == start) {
1231         ptr += MINBPC(enc);
1232         if (ptr == end)
1233           return XML_TOK_TRAILING_CR;
1234         if (BYTE_TYPE(enc, ptr) == BT_LF)
1235           ptr += MINBPC(enc);
1236         *nextTokPtr = ptr;
1237         return XML_TOK_DATA_NEWLINE;
1238       }
1239       *nextTokPtr = ptr;
1240       return XML_TOK_DATA_CHARS;
1241     case BT_S:
1242       if (ptr == start) {
1243         *nextTokPtr = ptr + MINBPC(enc);
1244         return XML_TOK_ATTRIBUTE_VALUE_S;
1245       }
1246       *nextTokPtr = ptr;
1247       return XML_TOK_DATA_CHARS;
1248     default:
1249       ptr += MINBPC(enc);
1250       break;
1251     }
1252   }
1253   *nextTokPtr = ptr;
1254   return XML_TOK_DATA_CHARS;
1255 }
1256 
1257 static
PREFIX(entityValueTok)1258 int PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1259                            const char **nextTokPtr)
1260 {
1261   const char *start;
1262   if (ptr == end)
1263     return XML_TOK_NONE;
1264   start = ptr;
1265   while (ptr != end) {
1266     switch (BYTE_TYPE(enc, ptr)) {
1267 #define LEAD_CASE(n) \
1268     case BT_LEAD ## n: ptr += n; break;
1269     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1270 #undef LEAD_CASE
1271     case BT_AMP:
1272       if (ptr == start)
1273         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1274       *nextTokPtr = ptr;
1275       return XML_TOK_DATA_CHARS;
1276     case BT_PERCNT:
1277       if (ptr == start) {
1278         int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1279                                        end, nextTokPtr);
1280         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1281       }
1282       *nextTokPtr = ptr;
1283       return XML_TOK_DATA_CHARS;
1284     case BT_LF:
1285       if (ptr == start) {
1286         *nextTokPtr = ptr + MINBPC(enc);
1287         return XML_TOK_DATA_NEWLINE;
1288       }
1289       *nextTokPtr = ptr;
1290       return XML_TOK_DATA_CHARS;
1291     case BT_CR:
1292       if (ptr == start) {
1293         ptr += MINBPC(enc);
1294         if (ptr == end)
1295           return XML_TOK_TRAILING_CR;
1296         if (BYTE_TYPE(enc, ptr) == BT_LF)
1297           ptr += MINBPC(enc);
1298         *nextTokPtr = ptr;
1299         return XML_TOK_DATA_NEWLINE;
1300       }
1301       *nextTokPtr = ptr;
1302       return XML_TOK_DATA_CHARS;
1303     default:
1304       ptr += MINBPC(enc);
1305       break;
1306     }
1307   }
1308   *nextTokPtr = ptr;
1309   return XML_TOK_DATA_CHARS;
1310 }
1311 
1312 #ifdef XML_DTD
1313 
1314 static
PREFIX(ignoreSectionTok)1315 int PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1316                              const char **nextTokPtr)
1317 {
1318   int level = 0;
1319   if (MINBPC(enc) > 1) {
1320     size_t n = end - ptr;
1321     if (n & (MINBPC(enc) - 1)) {
1322       n &= ~(MINBPC(enc) - 1);
1323       end = ptr + n;
1324     }
1325   }
1326   while (ptr != end) {
1327     switch (BYTE_TYPE(enc, ptr)) {
1328     INVALID_CASES(ptr, nextTokPtr)
1329     case BT_LT:
1330       if ((ptr += MINBPC(enc)) == end)
1331         return XML_TOK_PARTIAL;
1332       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1333         if ((ptr += MINBPC(enc)) == end)
1334           return XML_TOK_PARTIAL;
1335         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1336           ++level;
1337           ptr += MINBPC(enc);
1338         }
1339       }
1340       break;
1341     case BT_RSQB:
1342       if ((ptr += MINBPC(enc)) == end)
1343         return XML_TOK_PARTIAL;
1344       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1345         if ((ptr += MINBPC(enc)) == end)
1346           return XML_TOK_PARTIAL;
1347         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1348           ptr += MINBPC(enc);
1349           if (level == 0) {
1350             *nextTokPtr = ptr;
1351             return XML_TOK_IGNORE_SECT;
1352           }
1353           --level;
1354         }
1355       }
1356       break;
1357     default:
1358       ptr += MINBPC(enc);
1359       break;
1360     }
1361   }
1362   return XML_TOK_PARTIAL;
1363 }
1364 
1365 #endif /* XML_DTD */
1366 
1367 static
PREFIX(isPublicId)1368 int PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1369                        const char **badPtr)
1370 {
1371   ptr += MINBPC(enc);
1372   end -= MINBPC(enc);
1373   for (; ptr != end; ptr += MINBPC(enc)) {
1374     switch (BYTE_TYPE(enc, ptr)) {
1375     case BT_DIGIT:
1376     case BT_HEX:
1377     case BT_MINUS:
1378     case BT_APOS:
1379     case BT_LPAR:
1380     case BT_RPAR:
1381     case BT_PLUS:
1382     case BT_COMMA:
1383     case BT_SOL:
1384     case BT_EQUALS:
1385     case BT_QUEST:
1386     case BT_CR:
1387     case BT_LF:
1388     case BT_SEMI:
1389     case BT_EXCL:
1390     case BT_AST:
1391     case BT_PERCNT:
1392     case BT_NUM:
1393 #ifdef XML_NS
1394     case BT_COLON:
1395 #endif
1396       break;
1397     case BT_S:
1398       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1399         *badPtr = ptr;
1400         return 0;
1401       }
1402       break;
1403     case BT_NAME:
1404     case BT_NMSTRT:
1405       if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1406         break;
1407     default:
1408       switch (BYTE_TO_ASCII(enc, ptr)) {
1409       case 0x24: /* $ */
1410       case 0x40: /* @ */
1411         break;
1412       default:
1413         *badPtr = ptr;
1414         return 0;
1415       }
1416       break;
1417     }
1418   }
1419   return 1;
1420 }
1421 
1422 /* This must only be called for a well-formed start-tag or empty element tag.
1423 Returns the number of attributes.  Pointers to the first attsMax attributes
1424 are stored in atts. */
1425 
1426 static
PREFIX(getAtts)1427 int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1428                     int attsMax, ATTRIBUTE *atts)
1429 {
1430   enum { other, inName, inValue } state = inName;
1431   int nAtts = 0;
1432   int open = 0; /* defined when state == inValue;
1433                    initialization just to shut up compilers */
1434 
1435   for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1436     switch (BYTE_TYPE(enc, ptr)) {
1437 #define START_NAME \
1438       if (state == other) { \
1439         if (nAtts < attsMax) { \
1440           atts[nAtts].name = ptr; \
1441           atts[nAtts].normalized = 1; \
1442         } \
1443         state = inName; \
1444       }
1445 #define LEAD_CASE(n) \
1446     case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1447     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1448 #undef LEAD_CASE
1449     case BT_NONASCII:
1450     case BT_NMSTRT:
1451     case BT_HEX:
1452       START_NAME
1453       break;
1454 #undef START_NAME
1455     case BT_QUOT:
1456       if (state != inValue) {
1457         if (nAtts < attsMax)
1458           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1459         state = inValue;
1460         open = BT_QUOT;
1461       }
1462       else if (open == BT_QUOT) {
1463         state = other;
1464         if (nAtts < attsMax)
1465           atts[nAtts].valueEnd = ptr;
1466         nAtts++;
1467       }
1468       break;
1469     case BT_APOS:
1470       if (state != inValue) {
1471         if (nAtts < attsMax)
1472           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1473         state = inValue;
1474         open = BT_APOS;
1475       }
1476       else if (open == BT_APOS) {
1477         state = other;
1478         if (nAtts < attsMax)
1479           atts[nAtts].valueEnd = ptr;
1480         nAtts++;
1481       }
1482       break;
1483     case BT_AMP:
1484       if (nAtts < attsMax)
1485         atts[nAtts].normalized = 0;
1486       break;
1487     case BT_S:
1488       if (state == inName)
1489         state = other;
1490       else if (state == inValue
1491                && nAtts < attsMax
1492                && atts[nAtts].normalized
1493                && (ptr == atts[nAtts].valuePtr
1494                    || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1495                    || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1496                    || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1497         atts[nAtts].normalized = 0;
1498       break;
1499     case BT_CR: case BT_LF:
1500       /* This case ensures that the first attribute name is counted
1501          Apart from that we could just change state on the quote. */
1502       if (state == inName)
1503         state = other;
1504       else if (state == inValue && nAtts < attsMax)
1505         atts[nAtts].normalized = 0;
1506       break;
1507     case BT_GT:
1508     case BT_SOL:
1509       if (state != inValue)
1510         return nAtts;
1511       break;
1512     default:
1513       break;
1514     }
1515   }
1516   /* not reached */
1517 }
1518 
1519 static
PREFIX(charRefNumber)1520 int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
1521 {
1522   int result = 0;
1523   itkExpatUnused(enc);
1524   /* skip &# */
1525   ptr += 2*MINBPC(enc);
1526   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1527     for (ptr += MINBPC(enc); !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1528       int c = BYTE_TO_ASCII(enc, ptr);
1529       switch (c) {
1530       case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1531       case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1532         result <<= 4;
1533         result |= (c - ASCII_0);
1534         break;
1535       case ASCII_A: case ASCII_B: case ASCII_C: case ASCII_D: case ASCII_E: case ASCII_F:
1536         result <<= 4;
1537         result += 10 + (c - ASCII_A);
1538         break;
1539       case ASCII_a: case ASCII_b: case ASCII_c: case ASCII_d: case ASCII_e: case ASCII_f:
1540         result <<= 4;
1541         result += 10 + (c - ASCII_a);
1542         break;
1543       }
1544       if (result >= 0x110000)
1545         return -1;
1546     }
1547   }
1548   else {
1549     for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1550       int c = BYTE_TO_ASCII(enc, ptr);
1551       result *= 10;
1552       result += (c - ASCII_0);
1553       if (result >= 0x110000)
1554         return -1;
1555     }
1556   }
1557   return checkCharRefNumber(result);
1558 }
1559 
1560 static
PREFIX(predefinedEntityName)1561 int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end)
1562 {
1563   itkExpatUnused(enc);
1564   switch ((end - ptr)/MINBPC(enc)) {
1565   case 2:
1566     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1567       switch (BYTE_TO_ASCII(enc, ptr)) {
1568       case ASCII_l:
1569         return ASCII_LT;
1570       case ASCII_g:
1571         return ASCII_GT;
1572       }
1573     }
1574     break;
1575   case 3:
1576     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1577       ptr += MINBPC(enc);
1578       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1579         ptr += MINBPC(enc);
1580         if (CHAR_MATCHES(enc, ptr, ASCII_p))
1581           return ASCII_AMP;
1582       }
1583     }
1584     break;
1585   case 4:
1586     switch (BYTE_TO_ASCII(enc, ptr)) {
1587     case ASCII_q:
1588       ptr += MINBPC(enc);
1589       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1590         ptr += MINBPC(enc);
1591         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1592           ptr += MINBPC(enc);
1593           if (CHAR_MATCHES(enc, ptr, ASCII_t))
1594             return ASCII_QUOT;
1595         }
1596       }
1597       break;
1598     case ASCII_a:
1599       ptr += MINBPC(enc);
1600       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1601         ptr += MINBPC(enc);
1602         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1603           ptr += MINBPC(enc);
1604           if (CHAR_MATCHES(enc, ptr, ASCII_s))
1605             return ASCII_APOS;
1606         }
1607       }
1608       break;
1609     }
1610   }
1611   return 0;
1612 }
1613 
1614 static
PREFIX(sameName)1615 int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1616 {
1617   for (;;) {
1618     switch (BYTE_TYPE(enc, ptr1)) {
1619 #define LEAD_CASE(n) \
1620     case BT_LEAD ## n: \
1621       if (*ptr1++ != *ptr2++) \
1622         return 0;
1623     LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1624 #undef LEAD_CASE
1625       /* fall through */
1626       if (*ptr1++ != *ptr2++)
1627         return 0;
1628       break;
1629     case BT_NONASCII:
1630     case BT_NMSTRT:
1631 #ifdef XML_NS
1632     case BT_COLON:
1633 #endif
1634     case BT_HEX:
1635     case BT_DIGIT:
1636     case BT_NAME:
1637     case BT_MINUS:
1638       if (*ptr2++ != *ptr1++)
1639         return 0;
1640       if (MINBPC(enc) > 1) {
1641         if (*ptr2++ != *ptr1++)
1642           return 0;
1643         if (MINBPC(enc) > 2) {
1644           if (*ptr2++ != *ptr1++)
1645             return 0;
1646           if (MINBPC(enc) > 3) {
1647             if (*ptr2++ != *ptr1++)
1648               return 0;
1649           }
1650         }
1651       }
1652       break;
1653     default:
1654       if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1655         return 1;
1656       switch (BYTE_TYPE(enc, ptr2)) {
1657       case BT_LEAD2:
1658       case BT_LEAD3:
1659       case BT_LEAD4:
1660       case BT_NONASCII:
1661       case BT_NMSTRT:
1662 #ifdef XML_NS
1663       case BT_COLON:
1664 #endif
1665       case BT_HEX:
1666       case BT_DIGIT:
1667       case BT_NAME:
1668       case BT_MINUS:
1669         return 0;
1670       default:
1671         return 1;
1672       }
1673     }
1674   }
1675   /* not reached */
1676 }
1677 
1678 static
PREFIX(nameMatchesAscii)1679 int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1680                              const char *end1, const char *ptr2)
1681 {
1682   itkExpatUnused(enc);
1683   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1684     if (ptr1 == end1)
1685       return 0;
1686     if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1687       return 0;
1688   }
1689   return ptr1 == end1;
1690 }
1691 
1692 static
PREFIX(nameLength)1693 int PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1694 {
1695   const char *start = ptr;
1696   for (;;) {
1697     switch (BYTE_TYPE(enc, ptr)) {
1698 #define LEAD_CASE(n) \
1699     case BT_LEAD ## n: ptr += n; break;
1700     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1701 #undef LEAD_CASE
1702     case BT_NONASCII:
1703     case BT_NMSTRT:
1704 #ifdef XML_NS
1705     case BT_COLON:
1706 #endif
1707     case BT_HEX:
1708     case BT_DIGIT:
1709     case BT_NAME:
1710     case BT_MINUS:
1711       ptr += MINBPC(enc);
1712       break;
1713     default:
1714       return ptr - start;
1715     }
1716   }
1717 }
1718 
1719 static
PREFIX(skipS)1720 const char *PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1721 {
1722   for (;;) {
1723     switch (BYTE_TYPE(enc, ptr)) {
1724     case BT_LF:
1725     case BT_CR:
1726     case BT_S:
1727       ptr += MINBPC(enc);
1728       break;
1729     default:
1730       return ptr;
1731     }
1732   }
1733 }
1734 
1735 static
PREFIX(updatePosition)1736 void PREFIX(updatePosition)(const ENCODING *enc,
1737                             const char *ptr,
1738                             const char *end,
1739                             POSITION *pos)
1740 {
1741   while (ptr != end) {
1742     switch (BYTE_TYPE(enc, ptr)) {
1743 #define LEAD_CASE(n) \
1744     case BT_LEAD ## n: \
1745       ptr += n; \
1746       break;
1747     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1748 #undef LEAD_CASE
1749     case BT_LF:
1750       pos->columnNumber = (unsigned)-1;
1751       pos->lineNumber++;
1752       ptr += MINBPC(enc);
1753       break;
1754     case BT_CR:
1755       pos->lineNumber++;
1756       ptr += MINBPC(enc);
1757       if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
1758         ptr += MINBPC(enc);
1759       pos->columnNumber = (unsigned)-1;
1760       break;
1761     default:
1762       ptr += MINBPC(enc);
1763       break;
1764     }
1765     pos->columnNumber++;
1766   }
1767 }
1768 
1769 #undef DO_LEAD_CASE
1770 #undef MULTIBYTE_CASES
1771 #undef INVALID_CASES
1772 #undef CHECK_NAME_CASE
1773 #undef CHECK_NAME_CASES
1774 #undef CHECK_NMSTRT_CASE
1775 #undef CHECK_NMSTRT_CASES
1776