/*
Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
See the file COPYING for copying permission.
*/

/* Default validity check for multi-byte sequences: when the including file
   has not supplied its own IS_INVALID_CHAR, every sequence is accepted. */
#ifndef IS_INVALID_CHAR
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif

/* Switch case for an n-byte lead byte.  Uses `enc` and `end` from the
   expansion site.  Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes
   remain; stores ptr in *nextTokPtr and returns XML_TOK_INVALID when
   IS_INVALID_CHAR rejects the sequence; otherwise advances ptr by n. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
    case BT_LEAD ## n: \
      if (end - (ptr) < (n)) \
        return XML_TOK_PARTIAL_CHAR; \
      if (IS_INVALID_CHAR(enc, ptr, n)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      (ptr) += (n); \
      break;

/* Switch cases shared by scanners of free text: the three multi-byte lead
   cases above, plus BT_NONXML, BT_MALFORM and BT_TRAIL, which are reported
   as XML_TOK_INVALID at ptr. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;

/* Switch case for an n-byte character inside a name.  Returns
   XML_TOK_PARTIAL_CHAR when fewer than n bytes remain; stores ptr in
   *nextTokPtr and returns XML_TOK_INVALID when IS_NAME_CHAR rejects the
   character; otherwise advances ptr by n. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if ((end) - (ptr) < (n)) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NAME_CHAR(enc, ptr, n)) { \
       *(nextTokPtr) = (ptr); \
       return XML_TOK_INVALID; \
     } \
     (ptr) += (n); \
     break;

/* Switch cases that consume one name character.  BT_NONASCII is first
   validated with IS_NAME_CHAR_MINBPC and then deliberately falls through
   to the single-unit advance shared with the other name byte types. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)

/* Switch case for an n-byte character at the start of a name.  Returns
   XML_TOK_PARTIAL_CHAR when fewer than n bytes remain; stores ptr in
   *nextTokPtr and returns XML_TOK_INVALID when IS_NMSTRT_CHAR rejects the
   character; otherwise advances ptr by n. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
   case BT_LEAD ## n: \
     if ((end) - (ptr) < (n)) \
       return XML_TOK_PARTIAL_CHAR; \
     if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
       *(nextTokPtr) = (ptr); \
       return XML_TOK_INVALID; \
     } \
     (ptr) += (n); \
     break;

/* Switch cases that consume one name-start character.  BT_NONASCII is
   first validated with IS_NMSTRT_CHAR_MINBPC and then deliberately falls
   through to the single-unit advance shared with BT_NMSTRT and BT_HEX. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)

/* Name-decoration hook: expands to the identifier unchanged unless the
   including file has already defined PREFIX. */
#ifndef PREFIX
#define PREFIX(ident) ident
#endif

88 /* ptr points to character following "<!-" */
89 
90 static
PREFIX(scanComment)91 int PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
92 			const char **nextTokPtr)
93 {
94   if (ptr != end) {
95     if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
96       *nextTokPtr = ptr;
97       return XML_TOK_INVALID;
98     }
99     ptr += MINBPC(enc);
100     while (ptr != end) {
101       switch (BYTE_TYPE(enc, ptr)) {
102       INVALID_CASES(ptr, nextTokPtr)
103       case BT_MINUS:
104 	if ((ptr += MINBPC(enc)) == end)
105 	  return XML_TOK_PARTIAL;
106 	if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
107 	  if ((ptr += MINBPC(enc)) == end)
108 	    return XML_TOK_PARTIAL;
109 	  if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
110 	    *nextTokPtr = ptr;
111 	    return XML_TOK_INVALID;
112 	  }
113 	  *nextTokPtr = ptr + MINBPC(enc);
114 	  return XML_TOK_COMMENT;
115 	}
116 	break;
117       default:
118 	ptr += MINBPC(enc);
119 	break;
120       }
121     }
122   }
123   return XML_TOK_PARTIAL;
124 }
125 
126 /* ptr points to character following "<!" */
127 
128 static
PREFIX(scanDecl)129 int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
130 		     const char **nextTokPtr)
131 {
132   if (ptr == end)
133     return XML_TOK_PARTIAL;
134   switch (BYTE_TYPE(enc, ptr)) {
135   case BT_MINUS:
136     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
137   case BT_LSQB:
138     *nextTokPtr = ptr + MINBPC(enc);
139     return XML_TOK_COND_SECT_OPEN;
140   case BT_NMSTRT:
141   case BT_HEX:
142     ptr += MINBPC(enc);
143     break;
144   default:
145     *nextTokPtr = ptr;
146     return XML_TOK_INVALID;
147   }
148   while (ptr != end) {
149     switch (BYTE_TYPE(enc, ptr)) {
150     case BT_PERCNT:
151       if (ptr + MINBPC(enc) == end)
152 	return XML_TOK_PARTIAL;
153       /* don't allow <!ENTITY% foo "whatever"> */
154       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
155       case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
156 	*nextTokPtr = ptr;
157 	return XML_TOK_INVALID;
158       }
159       /* fall through */
160     case BT_S: case BT_CR: case BT_LF:
161       *nextTokPtr = ptr;
162       return XML_TOK_DECL_OPEN;
163     case BT_NMSTRT:
164     case BT_HEX:
165       ptr += MINBPC(enc);
166       break;
167     default:
168       *nextTokPtr = ptr;
169       return XML_TOK_INVALID;
170     }
171   }
172   return XML_TOK_PARTIAL;
173 }
174 
175 static
PREFIX(checkPiTarget)176 int PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end, int *tokPtr)
177 {
178   int upper = 0;
179   *tokPtr = XML_TOK_PI;
180   if (end - ptr != MINBPC(enc)*3)
181     return 1;
182   switch (BYTE_TO_ASCII(enc, ptr)) {
183   case ASCII_x:
184     break;
185   case ASCII_X:
186     upper = 1;
187     break;
188   default:
189     return 1;
190   }
191   ptr += MINBPC(enc);
192   switch (BYTE_TO_ASCII(enc, ptr)) {
193   case ASCII_m:
194     break;
195   case ASCII_M:
196     upper = 1;
197     break;
198   default:
199     return 1;
200   }
201   ptr += MINBPC(enc);
202   switch (BYTE_TO_ASCII(enc, ptr)) {
203   case ASCII_l:
204     break;
205   case ASCII_L:
206     upper = 1;
207     break;
208   default:
209     return 1;
210   }
211   if (upper)
212     return 0;
213   *tokPtr = XML_TOK_XML_DECL;
214   return 1;
215 }
216 
217 /* ptr points to character following "<?" */
218 
219 static
PREFIX(scanPi)220 int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
221 		   const char **nextTokPtr)
222 {
223   int tok;
224   const char *target = ptr;
225   if (ptr == end)
226     return XML_TOK_PARTIAL;
227   switch (BYTE_TYPE(enc, ptr)) {
228   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
229   default:
230     *nextTokPtr = ptr;
231     return XML_TOK_INVALID;
232   }
233   while (ptr != end) {
234     switch (BYTE_TYPE(enc, ptr)) {
235     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
236     case BT_S: case BT_CR: case BT_LF:
237       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
238 	*nextTokPtr = ptr;
239 	return XML_TOK_INVALID;
240       }
241       ptr += MINBPC(enc);
242       while (ptr != end) {
243         switch (BYTE_TYPE(enc, ptr)) {
244         INVALID_CASES(ptr, nextTokPtr)
245 	case BT_QUEST:
246 	  ptr += MINBPC(enc);
247 	  if (ptr == end)
248 	    return XML_TOK_PARTIAL;
249 	  if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
250 	    *nextTokPtr = ptr + MINBPC(enc);
251 	    return tok;
252 	  }
253 	  break;
254 	default:
255 	  ptr += MINBPC(enc);
256 	  break;
257 	}
258       }
259       return XML_TOK_PARTIAL;
260     case BT_QUEST:
261       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
262 	*nextTokPtr = ptr;
263 	return XML_TOK_INVALID;
264       }
265       ptr += MINBPC(enc);
266       if (ptr == end)
267 	return XML_TOK_PARTIAL;
268       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
269 	*nextTokPtr = ptr + MINBPC(enc);
270 	return tok;
271       }
272       /* fall through */
273     default:
274       *nextTokPtr = ptr;
275       return XML_TOK_INVALID;
276     }
277   }
278   return XML_TOK_PARTIAL;
279 }
280 
281 
282 static
PREFIX(scanCdataSection)283 int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
284 			     const char **nextTokPtr)
285 {
286   static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB };
287   int i;
288   /* CDATA[ */
289   if (end - ptr < 6 * MINBPC(enc))
290     return XML_TOK_PARTIAL;
291   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
292     if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
293       *nextTokPtr = ptr;
294       return XML_TOK_INVALID;
295     }
296   }
297   *nextTokPtr = ptr;
298   return XML_TOK_CDATA_SECT_OPEN;
299 }
300 
301 static
PREFIX(cdataSectionTok)302 int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
303 			    const char **nextTokPtr)
304 {
305   if (ptr == end)
306     return XML_TOK_NONE;
307   if (MINBPC(enc) > 1) {
308     size_t n = end - ptr;
309     if (n & (MINBPC(enc) - 1)) {
310       n &= ~(MINBPC(enc) - 1);
311       if (n == 0)
312 	return XML_TOK_PARTIAL;
313       end = ptr + n;
314     }
315   }
316   switch (BYTE_TYPE(enc, ptr)) {
317   case BT_RSQB:
318     ptr += MINBPC(enc);
319     if (ptr == end)
320       return XML_TOK_PARTIAL;
321     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
322       break;
323     ptr += MINBPC(enc);
324     if (ptr == end)
325       return XML_TOK_PARTIAL;
326     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
327       ptr -= MINBPC(enc);
328       break;
329     }
330     *nextTokPtr = ptr + MINBPC(enc);
331     return XML_TOK_CDATA_SECT_CLOSE;
332   case BT_CR:
333     ptr += MINBPC(enc);
334     if (ptr == end)
335       return XML_TOK_PARTIAL;
336     if (BYTE_TYPE(enc, ptr) == BT_LF)
337       ptr += MINBPC(enc);
338     *nextTokPtr = ptr;
339     return XML_TOK_DATA_NEWLINE;
340   case BT_LF:
341     *nextTokPtr = ptr + MINBPC(enc);
342     return XML_TOK_DATA_NEWLINE;
343   INVALID_CASES(ptr, nextTokPtr)
344   default:
345     ptr += MINBPC(enc);
346     break;
347   }
348   while (ptr != end) {
349     switch (BYTE_TYPE(enc, ptr)) {
350 #define LEAD_CASE(n) \
351     case BT_LEAD ## n: \
352       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
353 	*nextTokPtr = ptr; \
354 	return XML_TOK_DATA_CHARS; \
355       } \
356       ptr += n; \
357       break;
358     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
359 #undef LEAD_CASE
360     case BT_NONXML:
361     case BT_MALFORM:
362     case BT_TRAIL:
363     case BT_CR:
364     case BT_LF:
365     case BT_RSQB:
366       *nextTokPtr = ptr;
367       return XML_TOK_DATA_CHARS;
368     default:
369       ptr += MINBPC(enc);
370       break;
371     }
372   }
373   *nextTokPtr = ptr;
374   return XML_TOK_DATA_CHARS;
375 }
376 
377 /* ptr points to character following "</" */
378 
379 static
PREFIX(scanEndTag)380 int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
381 		       const char **nextTokPtr)
382 {
383   if (ptr == end)
384     return XML_TOK_PARTIAL;
385   switch (BYTE_TYPE(enc, ptr)) {
386   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
387   default:
388     *nextTokPtr = ptr;
389     return XML_TOK_INVALID;
390   }
391   while (ptr != end) {
392     switch (BYTE_TYPE(enc, ptr)) {
393     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
394     case BT_S: case BT_CR: case BT_LF:
395       for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
396 	switch (BYTE_TYPE(enc, ptr)) {
397 	case BT_S: case BT_CR: case BT_LF:
398 	  break;
399 	case BT_GT:
400 	  *nextTokPtr = ptr + MINBPC(enc);
401           return XML_TOK_END_TAG;
402 	default:
403 	  *nextTokPtr = ptr;
404 	  return XML_TOK_INVALID;
405 	}
406       }
407       return XML_TOK_PARTIAL;
408 #ifdef XML_NS
409     case BT_COLON:
410       /* no need to check qname syntax here, since end-tag must match exactly */
411       ptr += MINBPC(enc);
412       break;
413 #endif
414     case BT_GT:
415       *nextTokPtr = ptr + MINBPC(enc);
416       return XML_TOK_END_TAG;
417     default:
418       *nextTokPtr = ptr;
419       return XML_TOK_INVALID;
420     }
421   }
422   return XML_TOK_PARTIAL;
423 }
424 
425 /* ptr points to character following "&#X" */
426 
427 static
PREFIX(scanHexCharRef)428 int PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
429 			   const char **nextTokPtr)
430 {
431   if (ptr != end) {
432     switch (BYTE_TYPE(enc, ptr)) {
433     case BT_DIGIT:
434     case BT_HEX:
435       break;
436     default:
437       *nextTokPtr = ptr;
438       return XML_TOK_INVALID;
439     }
440     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
441       switch (BYTE_TYPE(enc, ptr)) {
442       case BT_DIGIT:
443       case BT_HEX:
444 	break;
445       case BT_SEMI:
446 	*nextTokPtr = ptr + MINBPC(enc);
447 	return XML_TOK_CHAR_REF;
448       default:
449 	*nextTokPtr = ptr;
450 	return XML_TOK_INVALID;
451       }
452     }
453   }
454   return XML_TOK_PARTIAL;
455 }
456 
457 /* ptr points to character following "&#" */
458 
459 static
PREFIX(scanCharRef)460 int PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
461 			const char **nextTokPtr)
462 {
463   if (ptr != end) {
464     if (CHAR_MATCHES(enc, ptr, ASCII_x))
465       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
466     switch (BYTE_TYPE(enc, ptr)) {
467     case BT_DIGIT:
468       break;
469     default:
470       *nextTokPtr = ptr;
471       return XML_TOK_INVALID;
472     }
473     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
474       switch (BYTE_TYPE(enc, ptr)) {
475       case BT_DIGIT:
476 	break;
477       case BT_SEMI:
478 	*nextTokPtr = ptr + MINBPC(enc);
479 	return XML_TOK_CHAR_REF;
480       default:
481 	*nextTokPtr = ptr;
482 	return XML_TOK_INVALID;
483       }
484     }
485   }
486   return XML_TOK_PARTIAL;
487 }
488 
489 /* ptr points to character following "&" */
490 
491 static
PREFIX(scanRef)492 int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
493 		    const char **nextTokPtr)
494 {
495   if (ptr == end)
496     return XML_TOK_PARTIAL;
497   switch (BYTE_TYPE(enc, ptr)) {
498   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
499   case BT_NUM:
500     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
501   default:
502     *nextTokPtr = ptr;
503     return XML_TOK_INVALID;
504   }
505   while (ptr != end) {
506     switch (BYTE_TYPE(enc, ptr)) {
507     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
508     case BT_SEMI:
509       *nextTokPtr = ptr + MINBPC(enc);
510       return XML_TOK_ENTITY_REF;
511     default:
512       *nextTokPtr = ptr;
513       return XML_TOK_INVALID;
514     }
515   }
516   return XML_TOK_PARTIAL;
517 }
518 
519 /* ptr points to character following first character of attribute name */
520 
521 static
PREFIX(scanAtts)522 int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
523 		     const char **nextTokPtr)
524 {
525 #ifdef XML_NS
526   int hadColon = 0;
527 #endif
528   while (ptr != end) {
529     switch (BYTE_TYPE(enc, ptr)) {
530     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
531 #ifdef XML_NS
532     case BT_COLON:
533       if (hadColon) {
534 	*nextTokPtr = ptr;
535 	return XML_TOK_INVALID;
536       }
537       hadColon = 1;
538       ptr += MINBPC(enc);
539       if (ptr == end)
540 	return XML_TOK_PARTIAL;
541       switch (BYTE_TYPE(enc, ptr)) {
542       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
543       default:
544 	*nextTokPtr = ptr;
545 	return XML_TOK_INVALID;
546       }
547       break;
548 #endif
549     case BT_S: case BT_CR: case BT_LF:
550       for (;;) {
551 	int t;
552 
553 	ptr += MINBPC(enc);
554 	if (ptr == end)
555 	  return XML_TOK_PARTIAL;
556 	t = BYTE_TYPE(enc, ptr);
557 	if (t == BT_EQUALS)
558 	  break;
559 	switch (t) {
560 	case BT_S:
561 	case BT_LF:
562 	case BT_CR:
563 	  break;
564 	default:
565 	  *nextTokPtr = ptr;
566 	  return XML_TOK_INVALID;
567 	}
568       }
569     /* fall through */
570     case BT_EQUALS:
571       {
572 	int open;
573 #ifdef XML_NS
574 	hadColon = 0;
575 #endif
576 	for (;;) {
577 
578 	  ptr += MINBPC(enc);
579 	  if (ptr == end)
580 	    return XML_TOK_PARTIAL;
581 	  open = BYTE_TYPE(enc, ptr);
582 	  if (open == BT_QUOT || open == BT_APOS)
583 	    break;
584 	  switch (open) {
585 	  case BT_S:
586 	  case BT_LF:
587 	  case BT_CR:
588 	    break;
589 	  default:
590 	    *nextTokPtr = ptr;
591 	    return XML_TOK_INVALID;
592 	  }
593 	}
594 	ptr += MINBPC(enc);
595 	/* in attribute value */
596 	for (;;) {
597 	  int t;
598 	  if (ptr == end)
599 	    return XML_TOK_PARTIAL;
600 	  t = BYTE_TYPE(enc, ptr);
601 	  if (t == open)
602 	    break;
603 	  switch (t) {
604 	  INVALID_CASES(ptr, nextTokPtr)
605 	  case BT_AMP:
606 	    {
607 	      int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
608 	      if (tok <= 0) {
609 		if (tok == XML_TOK_INVALID)
610 		  *nextTokPtr = ptr;
611 		return tok;
612 	      }
613 	      break;
614 	    }
615 	  case BT_LT:
616 	    *nextTokPtr = ptr;
617 	    return XML_TOK_INVALID;
618 	  default:
619 	    ptr += MINBPC(enc);
620 	    break;
621 	  }
622 	}
623 	ptr += MINBPC(enc);
624 	if (ptr == end)
625 	  return XML_TOK_PARTIAL;
626 	switch (BYTE_TYPE(enc, ptr)) {
627 	case BT_S:
628 	case BT_CR:
629 	case BT_LF:
630 	  break;
631 	case BT_SOL:
632 	  goto sol;
633 	case BT_GT:
634 	  goto gt;
635 	default:
636 	  *nextTokPtr = ptr;
637 	  return XML_TOK_INVALID;
638 	}
639 	/* ptr points to closing quote */
640 	for (;;) {
641 	  ptr += MINBPC(enc);
642 	  if (ptr == end)
643 	    return XML_TOK_PARTIAL;
644 	  switch (BYTE_TYPE(enc, ptr)) {
645 	  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
646 	  case BT_S: case BT_CR: case BT_LF:
647 	    continue;
648 	  case BT_GT:
649           gt:
650 	    *nextTokPtr = ptr + MINBPC(enc);
651 	    return XML_TOK_START_TAG_WITH_ATTS;
652 	  case BT_SOL:
653           sol:
654 	    ptr += MINBPC(enc);
655 	    if (ptr == end)
656 	      return XML_TOK_PARTIAL;
657 	    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
658 	      *nextTokPtr = ptr;
659 	      return XML_TOK_INVALID;
660 	    }
661 	    *nextTokPtr = ptr + MINBPC(enc);
662 	    return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
663 	  default:
664 	    *nextTokPtr = ptr;
665 	    return XML_TOK_INVALID;
666 	  }
667 	  break;
668 	}
669 	break;
670       }
671     default:
672       *nextTokPtr = ptr;
673       return XML_TOK_INVALID;
674     }
675   }
676   return XML_TOK_PARTIAL;
677 }
678 
679 /* ptr points to character following "<" */
680 
681 static
PREFIX(scanLt)682 int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
683 		   const char **nextTokPtr)
684 {
685 #ifdef XML_NS
686   int hadColon;
687 #endif
688   if (ptr == end)
689     return XML_TOK_PARTIAL;
690   switch (BYTE_TYPE(enc, ptr)) {
691   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
692   case BT_EXCL:
693     if ((ptr += MINBPC(enc)) == end)
694       return XML_TOK_PARTIAL;
695     switch (BYTE_TYPE(enc, ptr)) {
696     case BT_MINUS:
697       return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
698     case BT_LSQB:
699       return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
700     }
701     *nextTokPtr = ptr;
702     return XML_TOK_INVALID;
703   case BT_QUEST:
704     return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
705   case BT_SOL:
706     return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
707   default:
708     *nextTokPtr = ptr;
709     return XML_TOK_INVALID;
710   }
711 #ifdef XML_NS
712   hadColon = 0;
713 #endif
714   /* we have a start-tag */
715   while (ptr != end) {
716     switch (BYTE_TYPE(enc, ptr)) {
717     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
718 #ifdef XML_NS
719     case BT_COLON:
720       if (hadColon) {
721 	*nextTokPtr = ptr;
722 	return XML_TOK_INVALID;
723       }
724       hadColon = 1;
725       ptr += MINBPC(enc);
726       if (ptr == end)
727 	return XML_TOK_PARTIAL;
728       switch (BYTE_TYPE(enc, ptr)) {
729       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
730       default:
731         *nextTokPtr = ptr;
732         return XML_TOK_INVALID;
733       }
734       break;
735 #endif
736     case BT_S: case BT_CR: case BT_LF:
737       {
738         ptr += MINBPC(enc);
739 	while (ptr != end) {
740 	  switch (BYTE_TYPE(enc, ptr)) {
741 	  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
742 	  case BT_GT:
743 	    goto gt;
744 	  case BT_SOL:
745 	    goto sol;
746 	  case BT_S: case BT_CR: case BT_LF:
747 	    ptr += MINBPC(enc);
748 	    continue;
749 	  default:
750 	    *nextTokPtr = ptr;
751 	    return XML_TOK_INVALID;
752 	  }
753 	  return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
754 	}
755 	return XML_TOK_PARTIAL;
756       }
757     case BT_GT:
758     gt:
759       *nextTokPtr = ptr + MINBPC(enc);
760       return XML_TOK_START_TAG_NO_ATTS;
761     case BT_SOL:
762     sol:
763       ptr += MINBPC(enc);
764       if (ptr == end)
765 	return XML_TOK_PARTIAL;
766       if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
767 	*nextTokPtr = ptr;
768 	return XML_TOK_INVALID;
769       }
770       *nextTokPtr = ptr + MINBPC(enc);
771       return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
772     default:
773       *nextTokPtr = ptr;
774       return XML_TOK_INVALID;
775     }
776   }
777   return XML_TOK_PARTIAL;
778 }
779 
780 static
PREFIX(contentTok)781 int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
782 		       const char **nextTokPtr)
783 {
784   if (ptr == end)
785     return XML_TOK_NONE;
786   if (MINBPC(enc) > 1) {
787     size_t n = end - ptr;
788     if (n & (MINBPC(enc) - 1)) {
789       n &= ~(MINBPC(enc) - 1);
790       if (n == 0)
791 	return XML_TOK_PARTIAL;
792       end = ptr + n;
793     }
794   }
795   switch (BYTE_TYPE(enc, ptr)) {
796   case BT_LT:
797     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
798   case BT_AMP:
799     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
800   case BT_CR:
801     ptr += MINBPC(enc);
802     if (ptr == end)
803       return XML_TOK_TRAILING_CR;
804     if (BYTE_TYPE(enc, ptr) == BT_LF)
805       ptr += MINBPC(enc);
806     *nextTokPtr = ptr;
807     return XML_TOK_DATA_NEWLINE;
808   case BT_LF:
809     *nextTokPtr = ptr + MINBPC(enc);
810     return XML_TOK_DATA_NEWLINE;
811   case BT_RSQB:
812     ptr += MINBPC(enc);
813     if (ptr == end)
814       return XML_TOK_TRAILING_RSQB;
815     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
816       break;
817     ptr += MINBPC(enc);
818     if (ptr == end)
819       return XML_TOK_TRAILING_RSQB;
820     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
821       ptr -= MINBPC(enc);
822       break;
823     }
824     *nextTokPtr = ptr;
825     return XML_TOK_INVALID;
826   INVALID_CASES(ptr, nextTokPtr)
827   default:
828     ptr += MINBPC(enc);
829     break;
830   }
831   while (ptr != end) {
832     switch (BYTE_TYPE(enc, ptr)) {
833 #define LEAD_CASE(n) \
834     case BT_LEAD ## n: \
835       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
836 	*nextTokPtr = ptr; \
837 	return XML_TOK_DATA_CHARS; \
838       } \
839       ptr += n; \
840       break;
841     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
842 #undef LEAD_CASE
843     case BT_RSQB:
844       if (ptr + MINBPC(enc) != end) {
845 	 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
846 	   ptr += MINBPC(enc);
847 	   break;
848 	 }
849 	 if (ptr + 2*MINBPC(enc) != end) {
850 	   if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
851 	     ptr += MINBPC(enc);
852 	     break;
853 	   }
854 	   *nextTokPtr = ptr + 2*MINBPC(enc);
855 	   return XML_TOK_INVALID;
856 	 }
857       }
858       /* fall through */
859     case BT_AMP:
860     case BT_LT:
861     case BT_NONXML:
862     case BT_MALFORM:
863     case BT_TRAIL:
864     case BT_CR:
865     case BT_LF:
866       *nextTokPtr = ptr;
867       return XML_TOK_DATA_CHARS;
868     default:
869       ptr += MINBPC(enc);
870       break;
871     }
872   }
873   *nextTokPtr = ptr;
874   return XML_TOK_DATA_CHARS;
875 }
876 
877 /* ptr points to character following "%" */
878 
879 static
PREFIX(scanPercent)880 int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
881 			const char **nextTokPtr)
882 {
883   if (ptr == end)
884     return XML_TOK_PARTIAL;
885   switch (BYTE_TYPE(enc, ptr)) {
886   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
887   case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
888     *nextTokPtr = ptr;
889     return XML_TOK_PERCENT;
890   default:
891     *nextTokPtr = ptr;
892     return XML_TOK_INVALID;
893   }
894   while (ptr != end) {
895     switch (BYTE_TYPE(enc, ptr)) {
896     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
897     case BT_SEMI:
898       *nextTokPtr = ptr + MINBPC(enc);
899       return XML_TOK_PARAM_ENTITY_REF;
900     default:
901       *nextTokPtr = ptr;
902       return XML_TOK_INVALID;
903     }
904   }
905   return XML_TOK_PARTIAL;
906 }
907 
908 static
PREFIX(scanPoundName)909 int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
910 			  const char **nextTokPtr)
911 {
912   if (ptr == end)
913     return XML_TOK_PARTIAL;
914   switch (BYTE_TYPE(enc, ptr)) {
915   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
916   default:
917     *nextTokPtr = ptr;
918     return XML_TOK_INVALID;
919   }
920   while (ptr != end) {
921     switch (BYTE_TYPE(enc, ptr)) {
922     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
923     case BT_CR: case BT_LF: case BT_S:
924     case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
925       *nextTokPtr = ptr;
926       return XML_TOK_POUND_NAME;
927     default:
928       *nextTokPtr = ptr;
929       return XML_TOK_INVALID;
930     }
931   }
932   return -XML_TOK_POUND_NAME;
933 }
934 
935 static
PREFIX(scanLit)936 int PREFIX(scanLit)(int open, const ENCODING *enc,
937 		    const char *ptr, const char *end,
938 		    const char **nextTokPtr)
939 {
940   while (ptr != end) {
941     int t = BYTE_TYPE(enc, ptr);
942     switch (t) {
943     INVALID_CASES(ptr, nextTokPtr)
944     case BT_QUOT:
945     case BT_APOS:
946       ptr += MINBPC(enc);
947       if (t != open)
948 	break;
949       if (ptr == end)
950 	return -XML_TOK_LITERAL;
951       *nextTokPtr = ptr;
952       switch (BYTE_TYPE(enc, ptr)) {
953       case BT_S: case BT_CR: case BT_LF:
954       case BT_GT: case BT_PERCNT: case BT_LSQB:
955 	return XML_TOK_LITERAL;
956       default:
957 	return XML_TOK_INVALID;
958       }
959     default:
960       ptr += MINBPC(enc);
961       break;
962     }
963   }
964   return XML_TOK_PARTIAL;
965 }
966 
967 static
PREFIX(prologTok)968 int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
969 		      const char **nextTokPtr)
970 {
971   int tok;
972   if (ptr == end)
973     return XML_TOK_NONE;
974   if (MINBPC(enc) > 1) {
975     size_t n = end - ptr;
976     if (n & (MINBPC(enc) - 1)) {
977       n &= ~(MINBPC(enc) - 1);
978       if (n == 0)
979 	return XML_TOK_PARTIAL;
980       end = ptr + n;
981     }
982   }
983   switch (BYTE_TYPE(enc, ptr)) {
984   case BT_QUOT:
985     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
986   case BT_APOS:
987     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
988   case BT_LT:
989     {
990       ptr += MINBPC(enc);
991       if (ptr == end)
992 	return XML_TOK_PARTIAL;
993       switch (BYTE_TYPE(enc, ptr)) {
994       case BT_EXCL:
995 	return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
996       case BT_QUEST:
997 	return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
998       case BT_NMSTRT:
999       case BT_HEX:
1000       case BT_NONASCII:
1001       case BT_LEAD2:
1002       case BT_LEAD3:
1003       case BT_LEAD4:
1004 	*nextTokPtr = ptr - MINBPC(enc);
1005 	return XML_TOK_INSTANCE_START;
1006       }
1007       *nextTokPtr = ptr;
1008       return XML_TOK_INVALID;
1009     }
1010   case BT_CR:
1011     if (ptr + MINBPC(enc) == end)
1012       return -XML_TOK_PROLOG_S;
1013     /* fall through */
1014   case BT_S: case BT_LF:
1015     for (;;) {
1016       ptr += MINBPC(enc);
1017       if (ptr == end)
1018 	break;
1019       switch (BYTE_TYPE(enc, ptr)) {
1020       case BT_S: case BT_LF:
1021 	break;
1022       case BT_CR:
1023 	/* don't split CR/LF pair */
1024 	if (ptr + MINBPC(enc) != end)
1025 	  break;
1026 	/* fall through */
1027       default:
1028 	*nextTokPtr = ptr;
1029 	return XML_TOK_PROLOG_S;
1030       }
1031     }
1032     *nextTokPtr = ptr;
1033     return XML_TOK_PROLOG_S;
1034   case BT_PERCNT:
1035     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1036   case BT_COMMA:
1037     *nextTokPtr = ptr + MINBPC(enc);
1038     return XML_TOK_COMMA;
1039   case BT_LSQB:
1040     *nextTokPtr = ptr + MINBPC(enc);
1041     return XML_TOK_OPEN_BRACKET;
1042   case BT_RSQB:
1043     ptr += MINBPC(enc);
1044     if (ptr == end)
1045       return -XML_TOK_CLOSE_BRACKET;
1046     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1047       if (ptr + MINBPC(enc) == end)
1048 	return XML_TOK_PARTIAL;
1049       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1050 	*nextTokPtr = ptr + 2*MINBPC(enc);
1051 	return XML_TOK_COND_SECT_CLOSE;
1052       }
1053     }
1054     *nextTokPtr = ptr;
1055     return XML_TOK_CLOSE_BRACKET;
1056   case BT_LPAR:
1057     *nextTokPtr = ptr + MINBPC(enc);
1058     return XML_TOK_OPEN_PAREN;
1059   case BT_RPAR:
1060     ptr += MINBPC(enc);
1061     if (ptr == end)
1062       return -XML_TOK_CLOSE_PAREN;
1063     switch (BYTE_TYPE(enc, ptr)) {
1064     case BT_AST:
1065       *nextTokPtr = ptr + MINBPC(enc);
1066       return XML_TOK_CLOSE_PAREN_ASTERISK;
1067     case BT_QUEST:
1068       *nextTokPtr = ptr + MINBPC(enc);
1069       return XML_TOK_CLOSE_PAREN_QUESTION;
1070     case BT_PLUS:
1071       *nextTokPtr = ptr + MINBPC(enc);
1072       return XML_TOK_CLOSE_PAREN_PLUS;
1073     case BT_CR: case BT_LF: case BT_S:
1074     case BT_GT: case BT_COMMA: case BT_VERBAR:
1075     case BT_RPAR:
1076       *nextTokPtr = ptr;
1077       return XML_TOK_CLOSE_PAREN;
1078     }
1079     *nextTokPtr = ptr;
1080     return XML_TOK_INVALID;
1081   case BT_VERBAR:
1082     *nextTokPtr = ptr + MINBPC(enc);
1083     return XML_TOK_OR;
1084   case BT_GT:
1085     *nextTokPtr = ptr + MINBPC(enc);
1086     return XML_TOK_DECL_CLOSE;
1087   case BT_NUM:
1088     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1089 #define LEAD_CASE(n) \
1090   case BT_LEAD ## n: \
1091     if (end - ptr < n) \
1092       return XML_TOK_PARTIAL_CHAR; \
1093     if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1094       ptr += n; \
1095       tok = XML_TOK_NAME; \
1096       break; \
1097     } \
1098     if (IS_NAME_CHAR(enc, ptr, n)) { \
1099       ptr += n; \
1100       tok = XML_TOK_NMTOKEN; \
1101       break; \
1102     } \
1103     *nextTokPtr = ptr; \
1104     return XML_TOK_INVALID;
1105     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1106 #undef LEAD_CASE
1107   case BT_NMSTRT:
1108   case BT_HEX:
1109     tok = XML_TOK_NAME;
1110     ptr += MINBPC(enc);
1111     break;
1112   case BT_DIGIT:
1113   case BT_NAME:
1114   case BT_MINUS:
1115 #ifdef XML_NS
1116   case BT_COLON:
1117 #endif
1118     tok = XML_TOK_NMTOKEN;
1119     ptr += MINBPC(enc);
1120     break;
1121   case BT_NONASCII:
1122     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1123       ptr += MINBPC(enc);
1124       tok = XML_TOK_NAME;
1125       break;
1126     }
1127     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1128       ptr += MINBPC(enc);
1129       tok = XML_TOK_NMTOKEN;
1130       break;
1131     }
1132     /* fall through */
1133   default:
1134     *nextTokPtr = ptr;
1135     return XML_TOK_INVALID;
1136   }
1137   while (ptr != end) {
1138     switch (BYTE_TYPE(enc, ptr)) {
1139     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1140     case BT_GT: case BT_RPAR: case BT_COMMA:
1141     case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1142     case BT_S: case BT_CR: case BT_LF:
1143       *nextTokPtr = ptr;
1144       return tok;
1145 #ifdef XML_NS
1146     case BT_COLON:
1147       ptr += MINBPC(enc);
1148       switch (tok) {
1149       case XML_TOK_NAME:
1150 	if (ptr == end)
1151 	  return XML_TOK_PARTIAL;
1152 	tok = XML_TOK_PREFIXED_NAME;
1153 	switch (BYTE_TYPE(enc, ptr)) {
1154 	CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1155 	default:
1156 	  tok = XML_TOK_NMTOKEN;
1157 	  break;
1158 	}
1159 	break;
1160       case XML_TOK_PREFIXED_NAME:
1161 	tok = XML_TOK_NMTOKEN;
1162 	break;
1163       }
1164       break;
1165 #endif
1166     case BT_PLUS:
1167       if (tok == XML_TOK_NMTOKEN)  {
1168 	*nextTokPtr = ptr;
1169 	return XML_TOK_INVALID;
1170       }
1171       *nextTokPtr = ptr + MINBPC(enc);
1172       return XML_TOK_NAME_PLUS;
1173     case BT_AST:
1174       if (tok == XML_TOK_NMTOKEN)  {
1175 	*nextTokPtr = ptr;
1176 	return XML_TOK_INVALID;
1177       }
1178       *nextTokPtr = ptr + MINBPC(enc);
1179       return XML_TOK_NAME_ASTERISK;
1180     case BT_QUEST:
1181       if (tok == XML_TOK_NMTOKEN)  {
1182 	*nextTokPtr = ptr;
1183 	return XML_TOK_INVALID;
1184       }
1185       *nextTokPtr = ptr + MINBPC(enc);
1186       return XML_TOK_NAME_QUESTION;
1187     default:
1188       *nextTokPtr = ptr;
1189       return XML_TOK_INVALID;
1190     }
1191   }
1192   return -tok;
1193 }
1194 
1195 static
PREFIX(attributeValueTok)1196 int PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1197 			      const char **nextTokPtr)
1198 {
1199   const char *start;
1200   if (ptr == end)
1201     return XML_TOK_NONE;
1202   start = ptr;
1203   while (ptr != end) {
1204     switch (BYTE_TYPE(enc, ptr)) {
1205 #define LEAD_CASE(n) \
1206     case BT_LEAD ## n: ptr += n; break;
1207     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1208 #undef LEAD_CASE
1209     case BT_AMP:
1210       if (ptr == start)
1211 	return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1212       *nextTokPtr = ptr;
1213       return XML_TOK_DATA_CHARS;
1214     case BT_LT:
1215       /* this is for inside entity references */
1216       *nextTokPtr = ptr;
1217       return XML_TOK_INVALID;
1218     case BT_LF:
1219       if (ptr == start) {
1220 	*nextTokPtr = ptr + MINBPC(enc);
1221 	return XML_TOK_DATA_NEWLINE;
1222       }
1223       *nextTokPtr = ptr;
1224       return XML_TOK_DATA_CHARS;
1225     case BT_CR:
1226       if (ptr == start) {
1227 	ptr += MINBPC(enc);
1228 	if (ptr == end)
1229 	  return XML_TOK_TRAILING_CR;
1230 	if (BYTE_TYPE(enc, ptr) == BT_LF)
1231 	  ptr += MINBPC(enc);
1232 	*nextTokPtr = ptr;
1233 	return XML_TOK_DATA_NEWLINE;
1234       }
1235       *nextTokPtr = ptr;
1236       return XML_TOK_DATA_CHARS;
1237     case BT_S:
1238       if (ptr == start) {
1239 	*nextTokPtr = ptr + MINBPC(enc);
1240 	return XML_TOK_ATTRIBUTE_VALUE_S;
1241       }
1242       *nextTokPtr = ptr;
1243       return XML_TOK_DATA_CHARS;
1244     default:
1245       ptr += MINBPC(enc);
1246       break;
1247     }
1248   }
1249   *nextTokPtr = ptr;
1250   return XML_TOK_DATA_CHARS;
1251 }
1252 
1253 static
PREFIX(entityValueTok)1254 int PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1255 			   const char **nextTokPtr)
1256 {
1257   const char *start;
1258   if (ptr == end)
1259     return XML_TOK_NONE;
1260   start = ptr;
1261   while (ptr != end) {
1262     switch (BYTE_TYPE(enc, ptr)) {
1263 #define LEAD_CASE(n) \
1264     case BT_LEAD ## n: ptr += n; break;
1265     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1266 #undef LEAD_CASE
1267     case BT_AMP:
1268       if (ptr == start)
1269 	return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1270       *nextTokPtr = ptr;
1271       return XML_TOK_DATA_CHARS;
1272     case BT_PERCNT:
1273       if (ptr == start) {
1274 	int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1275 				       end, nextTokPtr);
1276 	return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1277       }
1278       *nextTokPtr = ptr;
1279       return XML_TOK_DATA_CHARS;
1280     case BT_LF:
1281       if (ptr == start) {
1282 	*nextTokPtr = ptr + MINBPC(enc);
1283 	return XML_TOK_DATA_NEWLINE;
1284       }
1285       *nextTokPtr = ptr;
1286       return XML_TOK_DATA_CHARS;
1287     case BT_CR:
1288       if (ptr == start) {
1289 	ptr += MINBPC(enc);
1290 	if (ptr == end)
1291 	  return XML_TOK_TRAILING_CR;
1292 	if (BYTE_TYPE(enc, ptr) == BT_LF)
1293 	  ptr += MINBPC(enc);
1294 	*nextTokPtr = ptr;
1295 	return XML_TOK_DATA_NEWLINE;
1296       }
1297       *nextTokPtr = ptr;
1298       return XML_TOK_DATA_CHARS;
1299     default:
1300       ptr += MINBPC(enc);
1301       break;
1302     }
1303   }
1304   *nextTokPtr = ptr;
1305   return XML_TOK_DATA_CHARS;
1306 }
1307 
1308 #ifdef XML_DTD
1309 
1310 static
PREFIX(ignoreSectionTok)1311 int PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1312 			     const char **nextTokPtr)
1313 {
1314   int level = 0;
1315   if (MINBPC(enc) > 1) {
1316     size_t n = end - ptr;
1317     if (n & (MINBPC(enc) - 1)) {
1318       n &= ~(MINBPC(enc) - 1);
1319       end = ptr + n;
1320     }
1321   }
1322   while (ptr != end) {
1323     switch (BYTE_TYPE(enc, ptr)) {
1324     INVALID_CASES(ptr, nextTokPtr)
1325     case BT_LT:
1326       if ((ptr += MINBPC(enc)) == end)
1327 	return XML_TOK_PARTIAL;
1328       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1329 	if ((ptr += MINBPC(enc)) == end)
1330 	  return XML_TOK_PARTIAL;
1331 	if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1332 	  ++level;
1333 	  ptr += MINBPC(enc);
1334 	}
1335       }
1336       break;
1337     case BT_RSQB:
1338       if ((ptr += MINBPC(enc)) == end)
1339 	return XML_TOK_PARTIAL;
1340       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1341 	if ((ptr += MINBPC(enc)) == end)
1342 	  return XML_TOK_PARTIAL;
1343 	if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1344 	  ptr += MINBPC(enc);
1345 	  if (level == 0) {
1346 	    *nextTokPtr = ptr;
1347 	    return XML_TOK_IGNORE_SECT;
1348 	  }
1349 	  --level;
1350 	}
1351       }
1352       break;
1353     default:
1354       ptr += MINBPC(enc);
1355       break;
1356     }
1357   }
1358   return XML_TOK_PARTIAL;
1359 }
1360 
1361 #endif /* XML_DTD */
1362 
1363 static
PREFIX(isPublicId)1364 int PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1365 		       const char **badPtr)
1366 {
1367   ptr += MINBPC(enc);
1368   end -= MINBPC(enc);
1369   for (; ptr != end; ptr += MINBPC(enc)) {
1370     switch (BYTE_TYPE(enc, ptr)) {
1371     case BT_DIGIT:
1372     case BT_HEX:
1373     case BT_MINUS:
1374     case BT_APOS:
1375     case BT_LPAR:
1376     case BT_RPAR:
1377     case BT_PLUS:
1378     case BT_COMMA:
1379     case BT_SOL:
1380     case BT_EQUALS:
1381     case BT_QUEST:
1382     case BT_CR:
1383     case BT_LF:
1384     case BT_SEMI:
1385     case BT_EXCL:
1386     case BT_AST:
1387     case BT_PERCNT:
1388     case BT_NUM:
1389 #ifdef XML_NS
1390     case BT_COLON:
1391 #endif
1392       break;
1393     case BT_S:
1394       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1395 	*badPtr = ptr;
1396 	return 0;
1397       }
1398       break;
1399     case BT_NAME:
1400     case BT_NMSTRT:
1401       if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1402 	break;
1403     default:
1404       switch (BYTE_TO_ASCII(enc, ptr)) {
1405       case 0x24: /* $ */
1406       case 0x40: /* @ */
1407 	break;
1408       default:
1409 	*badPtr = ptr;
1410 	return 0;
1411       }
1412       break;
1413     }
1414   }
1415   return 1;
1416 }
1417 
/* This must only be called for a well-formed start-tag or empty element tag.
Returns the number of attributes.  Pointers to the first attsMax attributes
are stored in atts.

For each stored attribute the name pointer, the value start (just after the
opening quote) and the value end (at the closing quote) are recorded, together
with a `normalized' flag that is cleared when the value contains an '&', a CR
or LF, or whitespace in one of the positions tested in the BT_S case below.
Attributes beyond attsMax are still counted but nothing is stored for them.
The scan has no end pointer: it stops at the first '>' or '/' found outside a
quoted value, which is why the input must be well-formed. */

static
int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
		    int attsMax, ATTRIBUTE *atts)
{
  /* Starts in inName, so the first name met (before any whitespace) is not
     recorded as an attribute name -- presumably that is the element name. */
  enum { other, inName, inValue } state = inName;
  int nAtts = 0;
  int open = 0; /* defined when state == inValue;
		   initialization just to shut up compilers */

  /* The first character unit is skipped before scanning starts. */
  for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* START_NAME: on the first name character seen in state `other', record the
   attribute name position and start out assuming the value is normalized. */
#define START_NAME \
      if (state == other) { \
	if (nAtts < attsMax) { \
	  atts[nAtts].name = ptr; \
	  atts[nAtts].normalized = 1; \
	} \
	state = inName; \
      }
/* A lead byte advances by n - MINBPC(enc) here; the loop increment supplies
   the remaining MINBPC(enc). */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_NONASCII:
    case BT_NMSTRT:
    case BT_HEX:
      START_NAME
      break;
#undef START_NAME
    case BT_QUOT:
      if (state != inValue) {
	/* Opening double quote: the value begins after it. */
	if (nAtts < attsMax)
	  atts[nAtts].valuePtr = ptr + MINBPC(enc);
        state = inValue;
        open = BT_QUOT;
      }
      else if (open == BT_QUOT) {
	/* Matching closing quote: the attribute is complete. */
        state = other;
	if (nAtts < attsMax)
	  atts[nAtts].valueEnd = ptr;
	nAtts++;
      }
      break;
    case BT_APOS:
      if (state != inValue) {
	/* Opening single quote: the value begins after it. */
	if (nAtts < attsMax)
	  atts[nAtts].valuePtr = ptr + MINBPC(enc);
        state = inValue;
        open = BT_APOS;
      }
      else if (open == BT_APOS) {
	/* Matching closing quote: the attribute is complete. */
        state = other;
	if (nAtts < attsMax)
	  atts[nAtts].valueEnd = ptr;
	nAtts++;
      }
      break;
    case BT_AMP:
      /* A reference inside the value means it is not already normalized. */
      if (nAtts < attsMax)
	atts[nAtts].normalized = 0;
      break;
    case BT_S:
      if (state == inName)
        state = other;
      /* Inside a value still flagged normalized, clear the flag when this
         whitespace is the first character of the value, is not a plain
         ASCII space, is followed by another space, or is followed by the
         closing quote. */
      else if (state == inValue
	       && nAtts < attsMax
	       && atts[nAtts].normalized
	       && (ptr == atts[nAtts].valuePtr
		   || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
		   || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
	           || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
	atts[nAtts].normalized = 0;
      break;
    case BT_CR: case BT_LF:
      /* This case ensures that the first attribute name is counted
         Apart from that we could just change state on the quote. */
      if (state == inName)
        state = other;
      else if (state == inValue && nAtts < attsMax)
	atts[nAtts].normalized = 0;
      break;
    case BT_GT:
    case BT_SOL:
      /* Outside a quoted value, '>' or '/' ends the scan. */
      if (state != inValue)
	return nAtts;
      break;
    default:
      break;
    }
  }
  /* not reached */
}
1514 
1515 static
PREFIX(charRefNumber)1516 int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
1517 {
1518   int result = 0;
1519   /* skip &# */
1520   ptr += 2*MINBPC(enc);
1521   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1522     for (ptr += MINBPC(enc); !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1523       int c = BYTE_TO_ASCII(enc, ptr);
1524       switch (c) {
1525       case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1526       case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1527 	result <<= 4;
1528 	result |= (c - ASCII_0);
1529 	break;
1530       case ASCII_A: case ASCII_B: case ASCII_C: case ASCII_D: case ASCII_E: case ASCII_F:
1531 	result <<= 4;
1532 	result += 10 + (c - ASCII_A);
1533 	break;
1534       case ASCII_a: case ASCII_b: case ASCII_c: case ASCII_d: case ASCII_e: case ASCII_f:
1535 	result <<= 4;
1536 	result += 10 + (c - ASCII_a);
1537 	break;
1538       }
1539       if (result >= 0x110000)
1540 	return -1;
1541     }
1542   }
1543   else {
1544     for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1545       int c = BYTE_TO_ASCII(enc, ptr);
1546       result *= 10;
1547       result += (c - ASCII_0);
1548       if (result >= 0x110000)
1549 	return -1;
1550     }
1551   }
1552   return checkCharRefNumber(result);
1553 }
1554 
1555 static
PREFIX(predefinedEntityName)1556 int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end)
1557 {
1558   switch ((end - ptr)/MINBPC(enc)) {
1559   case 2:
1560     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1561       switch (BYTE_TO_ASCII(enc, ptr)) {
1562       case ASCII_l:
1563 	return ASCII_LT;
1564       case ASCII_g:
1565 	return ASCII_GT;
1566       }
1567     }
1568     break;
1569   case 3:
1570     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1571       ptr += MINBPC(enc);
1572       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1573 	ptr += MINBPC(enc);
1574 	if (CHAR_MATCHES(enc, ptr, ASCII_p))
1575 	  return ASCII_AMP;
1576       }
1577     }
1578     break;
1579   case 4:
1580     switch (BYTE_TO_ASCII(enc, ptr)) {
1581     case ASCII_q:
1582       ptr += MINBPC(enc);
1583       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1584 	ptr += MINBPC(enc);
1585 	if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1586 	  ptr += MINBPC(enc);
1587   	  if (CHAR_MATCHES(enc, ptr, ASCII_t))
1588 	    return ASCII_QUOT;
1589 	}
1590       }
1591       break;
1592     case ASCII_a:
1593       ptr += MINBPC(enc);
1594       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1595 	ptr += MINBPC(enc);
1596 	if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1597 	  ptr += MINBPC(enc);
1598   	  if (CHAR_MATCHES(enc, ptr, ASCII_s))
1599 	    return ASCII_APOS;
1600 	}
1601       }
1602       break;
1603     }
1604   }
1605   return 0;
1606 }
1607 
1608 static
PREFIX(sameName)1609 int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1610 {
1611   for (;;) {
1612     switch (BYTE_TYPE(enc, ptr1)) {
1613 #define LEAD_CASE(n) \
1614     case BT_LEAD ## n: \
1615       if (*ptr1++ != *ptr2++) \
1616 	return 0;
1617     LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1618 #undef LEAD_CASE
1619       /* fall through */
1620       if (*ptr1++ != *ptr2++)
1621 	return 0;
1622       break;
1623     case BT_NONASCII:
1624     case BT_NMSTRT:
1625 #ifdef XML_NS
1626     case BT_COLON:
1627 #endif
1628     case BT_HEX:
1629     case BT_DIGIT:
1630     case BT_NAME:
1631     case BT_MINUS:
1632       if (*ptr2++ != *ptr1++)
1633 	return 0;
1634       if (MINBPC(enc) > 1) {
1635 	if (*ptr2++ != *ptr1++)
1636 	  return 0;
1637 	if (MINBPC(enc) > 2) {
1638 	  if (*ptr2++ != *ptr1++)
1639 	    return 0;
1640           if (MINBPC(enc) > 3) {
1641 	    if (*ptr2++ != *ptr1++)
1642       	      return 0;
1643 	  }
1644 	}
1645       }
1646       break;
1647     default:
1648       if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1649 	return 1;
1650       switch (BYTE_TYPE(enc, ptr2)) {
1651       case BT_LEAD2:
1652       case BT_LEAD3:
1653       case BT_LEAD4:
1654       case BT_NONASCII:
1655       case BT_NMSTRT:
1656 #ifdef XML_NS
1657       case BT_COLON:
1658 #endif
1659       case BT_HEX:
1660       case BT_DIGIT:
1661       case BT_NAME:
1662       case BT_MINUS:
1663 	return 0;
1664       default:
1665 	return 1;
1666       }
1667     }
1668   }
1669   /* not reached */
1670 }
1671 
1672 static
PREFIX(nameMatchesAscii)1673 int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1674 			     const char *end1, const char *ptr2)
1675 {
1676   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1677     if (ptr1 == end1)
1678       return 0;
1679     if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1680       return 0;
1681   }
1682   return ptr1 == end1;
1683 }
1684 
1685 static
PREFIX(nameLength)1686 int PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1687 {
1688   const char *start = ptr;
1689   for (;;) {
1690     switch (BYTE_TYPE(enc, ptr)) {
1691 #define LEAD_CASE(n) \
1692     case BT_LEAD ## n: ptr += n; break;
1693     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1694 #undef LEAD_CASE
1695     case BT_NONASCII:
1696     case BT_NMSTRT:
1697 #ifdef XML_NS
1698     case BT_COLON:
1699 #endif
1700     case BT_HEX:
1701     case BT_DIGIT:
1702     case BT_NAME:
1703     case BT_MINUS:
1704       ptr += MINBPC(enc);
1705       break;
1706     default:
1707       return ptr - start;
1708     }
1709   }
1710 }
1711 
1712 static
PREFIX(skipS)1713 const char *PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1714 {
1715   for (;;) {
1716     switch (BYTE_TYPE(enc, ptr)) {
1717     case BT_LF:
1718     case BT_CR:
1719     case BT_S:
1720       ptr += MINBPC(enc);
1721       break;
1722     default:
1723       return ptr;
1724     }
1725   }
1726 }
1727 
1728 static
PREFIX(updatePosition)1729 void PREFIX(updatePosition)(const ENCODING *enc,
1730 			    const char *ptr,
1731 			    const char *end,
1732 			    POSITION *pos)
1733 {
1734   while (ptr != end) {
1735     switch (BYTE_TYPE(enc, ptr)) {
1736 #define LEAD_CASE(n) \
1737     case BT_LEAD ## n: \
1738       ptr += n; \
1739       break;
1740     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1741 #undef LEAD_CASE
1742     case BT_LF:
1743       pos->columnNumber = (unsigned)-1;
1744       pos->lineNumber++;
1745       ptr += MINBPC(enc);
1746       break;
1747     case BT_CR:
1748       pos->lineNumber++;
1749       ptr += MINBPC(enc);
1750       if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
1751 	ptr += MINBPC(enc);
1752       pos->columnNumber = (unsigned)-1;
1753       break;
1754     default:
1755       ptr += MINBPC(enc);
1756       break;
1757     }
1758     pos->columnNumber++;
1759   }
1760 }
1761 
/* Remove the helper macros used by the scanners in this file, presumably so
   that the file can be included again with a different PREFIX -- confirm
   against the including file.  INVALID_CASES, CHECK_NAME_CASE(S) and
   CHECK_NMSTRT_CASE(S) are defined near the top of the file. */
#undef DO_LEAD_CASE
#undef MULTIBYTE_CASES
#undef INVALID_CASES
#undef CHECK_NAME_CASE
#undef CHECK_NAME_CASES
#undef CHECK_NMSTRT_CASE
#undef CHECK_NMSTRT_CASES
1769