1 /* This file is included (from xmltok.c, 1-3 times depending on XML_MIN_SIZE)!
2                             __  __            _
3                          ___\ \/ /_ __   __ _| |_
4                         / _ \\  /| '_ \ / _` | __|
5                        |  __//  \| |_) | (_| | |_
6                         \___/_/\_\ .__/ \__,_|\__|
7                                  |_| XML parser
8 
9    Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10    Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
11    Copyright (c) 2002      Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12    Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
13    Copyright (c) 2016-2021 Sebastian Pipping <sebastian@pipping.org>
14    Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
15    Copyright (c) 2018      Benjamin Peterson <benjamin@python.org>
16    Copyright (c) 2018      Anton Maklakov <antmak.pub@gmail.com>
17    Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
18    Copyright (c) 2020      Boris Kolpackov <boris@codesynthesis.com>
19    Licensed under the MIT license:
20 
21    Permission is  hereby granted,  free of charge,  to any  person obtaining
22    a  copy  of  this  software   and  associated  documentation  files  (the
23    "Software"),  to  deal in  the  Software  without restriction,  including
24    without  limitation the  rights  to use,  copy,  modify, merge,  publish,
25    distribute, sublicense, and/or sell copies of the Software, and to permit
26    persons  to whom  the Software  is  furnished to  do so,  subject to  the
27    following conditions:
28 
29    The above copyright  notice and this permission notice  shall be included
30    in all copies or substantial portions of the Software.
31 
32    THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
33    EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
34    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
35    NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
36    DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
37    OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
38    USE OR OTHER DEALINGS IN THE SOFTWARE.
39 */
40 
41 #ifdef XML_TOK_IMPL_C
42 
/* Fallback: when the including file has not supplied IS_INVALID_CHAR, define
   it as the constant 0 so that the validity checks in the macros and
   functions below never reject a character on that ground. */
#  ifndef IS_INVALID_CHAR // i.e. for UTF-16 and XML_MIN_SIZE not defined
#    define IS_INVALID_CHAR(enc, ptr, n) (0)
#  endif
46 
/* Expands to the `case BT_LEAD<n>:` arm of a switch on the byte type at ptr.
   - Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain before end.
   - Stores ptr in *nextTokPtr and returns XML_TOK_INVALID when
     IS_INVALID_CHAR(enc, ptr, n) is true.
   - Otherwise advances ptr by n and leaves the switch.
   Note: `enc` and `end` are not macro parameters; they are picked up from
   the scope in which the macro is expanded. */
#  define INVALID_LEAD_CASE(n, ptr, nextTokPtr)                                \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n)) {                                        \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;
57 
/* Expands to the switch arms shared by scanners that accept arbitrary
   character data: the 2-, 3- and 4-byte lead cases (validated via
   INVALID_LEAD_CASE), followed by BT_NONXML, BT_MALFORM and BT_TRAIL, which
   always store ptr in *nextTokPtr and return XML_TOK_INVALID. */
#  define INVALID_CASES(ptr, nextTokPtr)                                       \
    INVALID_LEAD_CASE(2, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(3, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(4, ptr, nextTokPtr)                                      \
  case BT_NONXML:                                                              \
  case BT_MALFORM:                                                             \
  case BT_TRAIL:                                                               \
    *(nextTokPtr) = (ptr);                                                     \
    return XML_TOK_INVALID;
67 
/* Expands to the `case BT_LEAD<n>:` arm used while scanning the interior of
   a name.
   - Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain before end.
   - Stores ptr in *nextTokPtr and returns XML_TOK_INVALID when the n-byte
     sequence is malformed for the encoding (IS_INVALID_CHAR) or is not a
     name character (IS_NAME_CHAR).
   - Otherwise advances ptr by n and leaves the switch.
   The IS_INVALID_CHAR test must come first: the name-character lookup does
   not by itself validate the byte sequence, so without it a malformed
   multi-byte sequence could be accepted as part of a name. */
#  define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr)                        \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) {         \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;
78 
/* Expands to every switch arm that accepts one name character at ptr:
   - BT_NONASCII: rejected with XML_TOK_INVALID unless IS_NAME_CHAR_MINBPC
     accepts it; otherwise handled like the single-unit cases below.
   - BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME, BT_MINUS: advance ptr by
     MINBPC(enc) and leave the switch.
   - BT_LEAD2/3/4: delegated to CHECK_NAME_CASE. */
#  define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)                          \
  case BT_NONASCII:                                                            \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) {                                     \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
  case BT_DIGIT:                                                               \
  case BT_NAME:                                                                \
  case BT_MINUS:                                                               \
    ptr += MINBPC(enc);                                                        \
    break;                                                                     \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
96 
/* Expands to the `case BT_LEAD<n>:` arm used for the first character of a
   name.
   - Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain before end.
   - Stores ptr in *nextTokPtr and returns XML_TOK_INVALID when the n-byte
     sequence is malformed for the encoding (IS_INVALID_CHAR) or is not a
     name-start character (IS_NMSTRT_CHAR).
   - Otherwise advances ptr by n and leaves the switch.
   The IS_INVALID_CHAR test must come first: the name-start lookup does not
   by itself validate the byte sequence, so without it a malformed
   multi-byte sequence could be accepted as the start of a name. */
#  define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr)                      \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) {       \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;
107 
/* Expands to every switch arm that accepts one name-start character at ptr:
   - BT_NONASCII: rejected with XML_TOK_INVALID unless IS_NMSTRT_CHAR_MINBPC
     accepts it; otherwise handled like the single-unit cases below.
   - BT_NMSTRT, BT_HEX: advance ptr by MINBPC(enc) and leave the switch.
   - BT_LEAD2/3/4: delegated to CHECK_NMSTRT_CASE. */
#  define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)                        \
  case BT_NONASCII:                                                            \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {                                   \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
    ptr += MINBPC(enc);                                                        \
    break;                                                                     \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
122 
/* PREFIX decorates the name of every function defined below.  When the
   including file has not defined it, names are emitted unchanged. */
#  ifndef PREFIX
#    define PREFIX(ident) ident
#  endif
126 
/* True when the byte range [ptr, end) is at least `count` times MINBPC(enc)
   bytes long.  Every argument is parenthesized so that an expression such
   as `a + b` passed as `count` (or pointer arithmetic passed as ptr/end) is
   evaluated as a unit. */
#  define HAS_CHARS(enc, ptr, end, count)                                      \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least MINBPC(enc) bytes remain in [ptr, end). */
#  define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)
130 
/* Makes the enclosing function return XML_TOK_PARTIAL unless
   HAS_CHARS(enc, ptr, end, count) holds.  Expands to a braced block, so it
   is intended to be used as a statement of its own. */
#  define REQUIRE_CHARS(enc, ptr, end, count)                                  \
    {                                                                          \
      if (! HAS_CHARS(enc, ptr, end, count)) {                                 \
        return XML_TOK_PARTIAL;                                                \
      }                                                                        \
    }

/* Single-character form of REQUIRE_CHARS. */
#  define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
139 
140 /* ptr points to character following "<!-" */
141 
142 static int PTRCALL
PREFIX(scanComment)143 PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
144                     const char **nextTokPtr) {
145   if (HAS_CHAR(enc, ptr, end)) {
146     if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
147       *nextTokPtr = ptr;
148       return XML_TOK_INVALID;
149     }
150     ptr += MINBPC(enc);
151     while (HAS_CHAR(enc, ptr, end)) {
152       switch (BYTE_TYPE(enc, ptr)) {
153         INVALID_CASES(ptr, nextTokPtr)
154       case BT_MINUS:
155         ptr += MINBPC(enc);
156         REQUIRE_CHAR(enc, ptr, end);
157         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
158           ptr += MINBPC(enc);
159           REQUIRE_CHAR(enc, ptr, end);
160           if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
161             *nextTokPtr = ptr;
162             return XML_TOK_INVALID;
163           }
164           *nextTokPtr = ptr + MINBPC(enc);
165           return XML_TOK_COMMENT;
166         }
167         break;
168       default:
169         ptr += MINBPC(enc);
170         break;
171       }
172     }
173   }
174   return XML_TOK_PARTIAL;
175 }
176 
177 /* ptr points to character following "<!" */
178 
179 static int PTRCALL
PREFIX(scanDecl)180 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
181                  const char **nextTokPtr) {
182   REQUIRE_CHAR(enc, ptr, end);
183   switch (BYTE_TYPE(enc, ptr)) {
184   case BT_MINUS:
185     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
186   case BT_LSQB:
187     *nextTokPtr = ptr + MINBPC(enc);
188     return XML_TOK_COND_SECT_OPEN;
189   case BT_NMSTRT:
190   case BT_HEX:
191     ptr += MINBPC(enc);
192     break;
193   default:
194     *nextTokPtr = ptr;
195     return XML_TOK_INVALID;
196   }
197   while (HAS_CHAR(enc, ptr, end)) {
198     switch (BYTE_TYPE(enc, ptr)) {
199     case BT_PERCNT:
200       REQUIRE_CHARS(enc, ptr, end, 2);
201       /* don't allow <!ENTITY% foo "whatever"> */
202       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
203       case BT_S:
204       case BT_CR:
205       case BT_LF:
206       case BT_PERCNT:
207         *nextTokPtr = ptr;
208         return XML_TOK_INVALID;
209       }
210       /* fall through */
211     case BT_S:
212     case BT_CR:
213     case BT_LF:
214       *nextTokPtr = ptr;
215       return XML_TOK_DECL_OPEN;
216     case BT_NMSTRT:
217     case BT_HEX:
218       ptr += MINBPC(enc);
219       break;
220     default:
221       *nextTokPtr = ptr;
222       return XML_TOK_INVALID;
223     }
224   }
225   return XML_TOK_PARTIAL;
226 }
227 
228 static int PTRCALL
PREFIX(checkPiTarget)229 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
230                       int *tokPtr) {
231   int upper = 0;
232   UNUSED_P(enc);
233   *tokPtr = XML_TOK_PI;
234   if (end - ptr != MINBPC(enc) * 3)
235     return 1;
236   switch (BYTE_TO_ASCII(enc, ptr)) {
237   case ASCII_x:
238     break;
239   case ASCII_X:
240     upper = 1;
241     break;
242   default:
243     return 1;
244   }
245   ptr += MINBPC(enc);
246   switch (BYTE_TO_ASCII(enc, ptr)) {
247   case ASCII_m:
248     break;
249   case ASCII_M:
250     upper = 1;
251     break;
252   default:
253     return 1;
254   }
255   ptr += MINBPC(enc);
256   switch (BYTE_TO_ASCII(enc, ptr)) {
257   case ASCII_l:
258     break;
259   case ASCII_L:
260     upper = 1;
261     break;
262   default:
263     return 1;
264   }
265   if (upper)
266     return 0;
267   *tokPtr = XML_TOK_XML_DECL;
268   return 1;
269 }
270 
/* ptr points to character following "<?"

   Scans a processing instruction.  The target name is scanned first and
   classified by checkPiTarget; the rest of the instruction is then consumed
   up to the closing "?>".

   Returns the token chosen by checkPiTarget (XML_TOK_PI or
   XML_TOK_XML_DECL) with *nextTokPtr just past the '>', XML_TOK_INVALID
   with *nextTokPtr at the offending character, XML_TOK_PARTIAL when the
   input ends first, or XML_TOK_PARTIAL_CHAR when a multi-byte character is
   cut off by end. */

static int PTRCALL
PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
               const char **nextTokPtr) {
  int tok;
  /* start of the target name, kept for checkPiTarget */
  const char *target = ptr;
  REQUIRE_CHAR(enc, ptr, end);
  /* the target must begin with a name-start character */
  switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_S:
    case BT_CR:
    case BT_LF:
      /* whitespace ends the target; [target, ptr) is the name */
      if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      ptr += MINBPC(enc);
      /* consume the instruction body up to "?>" */
      while (HAS_CHAR(enc, ptr, end)) {
        switch (BYTE_TYPE(enc, ptr)) {
          INVALID_CASES(ptr, nextTokPtr)
        case BT_QUEST:
          ptr += MINBPC(enc);
          REQUIRE_CHAR(enc, ptr, end);
          if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
            *nextTokPtr = ptr + MINBPC(enc);
            return tok;
          }
          /* '?' not followed by '>': re-examine the character after it */
          break;
        default:
          ptr += MINBPC(enc);
          break;
        }
      }
      return XML_TOK_PARTIAL;
    case BT_QUEST:
      /* '?' directly after the target: only "?>" is acceptable here */
      if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr + MINBPC(enc);
        return tok;
      }
      /* fall through */
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
332 
333 static int PTRCALL
PREFIX(scanCdataSection)334 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
335                          const char **nextTokPtr) {
336   static const char CDATA_LSQB[]
337       = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
338   int i;
339   UNUSED_P(enc);
340   /* CDATA[ */
341   REQUIRE_CHARS(enc, ptr, end, 6);
342   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
343     if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
344       *nextTokPtr = ptr;
345       return XML_TOK_INVALID;
346     }
347   }
348   *nextTokPtr = ptr;
349   return XML_TOK_CDATA_SECT_OPEN;
350 }
351 
/* Returns the next token inside a CDATA section.

   Possible results:
   - XML_TOK_NONE when ptr >= end.
   - XML_TOK_CDATA_SECT_CLOSE for "]]>", with *nextTokPtr just past the '>'.
   - XML_TOK_DATA_NEWLINE for CR, CR LF or LF.
   - XML_TOK_DATA_CHARS for a run of data, with *nextTokPtr at the first
     character that ends the run.
   - XML_TOK_INVALID / XML_TOK_PARTIAL_CHAR from INVALID_CASES when the very
     first character is bad or truncated.
   - XML_TOK_PARTIAL when more input is needed to decide. */
static int PTRCALL
PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
                        const char **nextTokPtr) {
  if (ptr >= end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* trim end so that [ptr, end) holds a whole number of MINBPC units */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  /* the first character decides whether this is a special token */
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_RSQB:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" without '>': step back so only the first ']' counts as data */
      ptr -= MINBPC(enc);
      break;
    }
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_CDATA_SECT_CLOSE;
  case BT_CR:
    ptr += MINBPC(enc);
    /* need the next character to know whether this is CR LF */
    REQUIRE_CHAR(enc, ptr, end);
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
    INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* extend the data run until something that must start a new token */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* a truncated or invalid multi-byte character ends the run instead of
   being reported here */
#  define LEAD_CASE(n)                                                         \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) {                       \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_DATA_CHARS;                                               \
    }                                                                          \
    ptr += n;                                                                  \
    break;
      LEAD_CASE(2)
      LEAD_CASE(3)
      LEAD_CASE(4)
#  undef LEAD_CASE
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
    case BT_RSQB:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
425 
426 /* ptr points to character following "</" */
427 
428 static int PTRCALL
PREFIX(scanEndTag)429 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
430                    const char **nextTokPtr) {
431   REQUIRE_CHAR(enc, ptr, end);
432   switch (BYTE_TYPE(enc, ptr)) {
433     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
434   default:
435     *nextTokPtr = ptr;
436     return XML_TOK_INVALID;
437   }
438   while (HAS_CHAR(enc, ptr, end)) {
439     switch (BYTE_TYPE(enc, ptr)) {
440       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
441     case BT_S:
442     case BT_CR:
443     case BT_LF:
444       for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
445         switch (BYTE_TYPE(enc, ptr)) {
446         case BT_S:
447         case BT_CR:
448         case BT_LF:
449           break;
450         case BT_GT:
451           *nextTokPtr = ptr + MINBPC(enc);
452           return XML_TOK_END_TAG;
453         default:
454           *nextTokPtr = ptr;
455           return XML_TOK_INVALID;
456         }
457       }
458       return XML_TOK_PARTIAL;
459 #  ifdef XML_NS
460     case BT_COLON:
461       /* no need to check qname syntax here,
462          since end-tag must match exactly */
463       ptr += MINBPC(enc);
464       break;
465 #  endif
466     case BT_GT:
467       *nextTokPtr = ptr + MINBPC(enc);
468       return XML_TOK_END_TAG;
469     default:
470       *nextTokPtr = ptr;
471       return XML_TOK_INVALID;
472     }
473   }
474   return XML_TOK_PARTIAL;
475 }
476 
477 /* ptr points to character following "&#X" */
478 
479 static int PTRCALL
PREFIX(scanHexCharRef)480 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
481                        const char **nextTokPtr) {
482   if (HAS_CHAR(enc, ptr, end)) {
483     switch (BYTE_TYPE(enc, ptr)) {
484     case BT_DIGIT:
485     case BT_HEX:
486       break;
487     default:
488       *nextTokPtr = ptr;
489       return XML_TOK_INVALID;
490     }
491     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
492       switch (BYTE_TYPE(enc, ptr)) {
493       case BT_DIGIT:
494       case BT_HEX:
495         break;
496       case BT_SEMI:
497         *nextTokPtr = ptr + MINBPC(enc);
498         return XML_TOK_CHAR_REF;
499       default:
500         *nextTokPtr = ptr;
501         return XML_TOK_INVALID;
502       }
503     }
504   }
505   return XML_TOK_PARTIAL;
506 }
507 
508 /* ptr points to character following "&#" */
509 
510 static int PTRCALL
PREFIX(scanCharRef)511 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
512                     const char **nextTokPtr) {
513   if (HAS_CHAR(enc, ptr, end)) {
514     if (CHAR_MATCHES(enc, ptr, ASCII_x))
515       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
516     switch (BYTE_TYPE(enc, ptr)) {
517     case BT_DIGIT:
518       break;
519     default:
520       *nextTokPtr = ptr;
521       return XML_TOK_INVALID;
522     }
523     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
524       switch (BYTE_TYPE(enc, ptr)) {
525       case BT_DIGIT:
526         break;
527       case BT_SEMI:
528         *nextTokPtr = ptr + MINBPC(enc);
529         return XML_TOK_CHAR_REF;
530       default:
531         *nextTokPtr = ptr;
532         return XML_TOK_INVALID;
533       }
534     }
535   }
536   return XML_TOK_PARTIAL;
537 }
538 
539 /* ptr points to character following "&" */
540 
541 static int PTRCALL
PREFIX(scanRef)542 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
543                 const char **nextTokPtr) {
544   REQUIRE_CHAR(enc, ptr, end);
545   switch (BYTE_TYPE(enc, ptr)) {
546     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
547   case BT_NUM:
548     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
549   default:
550     *nextTokPtr = ptr;
551     return XML_TOK_INVALID;
552   }
553   while (HAS_CHAR(enc, ptr, end)) {
554     switch (BYTE_TYPE(enc, ptr)) {
555       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
556     case BT_SEMI:
557       *nextTokPtr = ptr + MINBPC(enc);
558       return XML_TOK_ENTITY_REF;
559     default:
560       *nextTokPtr = ptr;
561       return XML_TOK_INVALID;
562     }
563   }
564   return XML_TOK_PARTIAL;
565 }
566 
/* ptr points to character following first character of attribute name

   Scans the attribute list of a start tag through to the end of the tag.
   Each attribute is a name, optional whitespace, '=', optional whitespace
   and a quoted value; attributes are separated by whitespace.

   Returns XML_TOK_START_TAG_WITH_ATTS for '>' or
   XML_TOK_EMPTY_ELEMENT_WITH_ATTS for "/>", with *nextTokPtr just past the
   '>'.  Returns XML_TOK_INVALID with *nextTokPtr at the offending
   character, XML_TOK_PARTIAL when the input ends first, or
   XML_TOK_PARTIAL_CHAR when a multi-byte character is cut off by end.  A
   non-positive result from scanRef inside an attribute value is returned
   unchanged. */

static int PTRCALL
PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
                 const char **nextTokPtr) {
#  ifdef XML_NS
  /* set once the current attribute name contains a ':' */
  int hadColon = 0;
#  endif
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#  ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a name-start
         character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#  endif
    case BT_S:
    case BT_CR:
    case BT_LF:
      /* whitespace after the name: skip it; only '=' may follow */
      for (;;) {
        int t;

        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        t = BYTE_TYPE(enc, ptr);
        if (t == BT_EQUALS)
          break;
        switch (t) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
      /* fall through */
    case BT_EQUALS: {
      /* byte type of the opening quote (BT_QUOT or BT_APOS) */
      int open;
#  ifdef XML_NS
      hadColon = 0;
#  endif
      /* skip whitespace between '=' and the opening quote */
      for (;;) {
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        open = BYTE_TYPE(enc, ptr);
        if (open == BT_QUOT || open == BT_APOS)
          break;
        switch (open) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
      ptr += MINBPC(enc);
      /* in attribute value */
      for (;;) {
        int t;
        REQUIRE_CHAR(enc, ptr, end);
        t = BYTE_TYPE(enc, ptr);
        /* a character of the same type as the opening quote ends the value */
        if (t == open)
          break;
        switch (t) {
          INVALID_CASES(ptr, nextTokPtr)
        case BT_AMP: {
          /* scanRef writes the position after the reference into ptr */
          int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
          if (tok <= 0) {
            if (tok == XML_TOK_INVALID)
              *nextTokPtr = ptr;
            return tok;
          }
          break;
        }
        case BT_LT:
          /* '<' is not allowed inside an attribute value */
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        default:
          ptr += MINBPC(enc);
          break;
        }
      }
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      /* the closing quote must be followed by whitespace, '/' or '>' */
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_S:
      case BT_CR:
      case BT_LF:
        break;
      case BT_SOL:
        goto sol;
      case BT_GT:
        goto gt;
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      /* ptr points to closing quote */
      for (;;) {
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        switch (BYTE_TYPE(enc, ptr)) {
          /* a name-start character begins the next attribute: the macro's
             break leaves this switch, and the breaks after the switch
             return control to the outer loop */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
        case BT_S:
        case BT_CR:
        case BT_LF:
          continue;
        case BT_GT:
        gt:
          *nextTokPtr = ptr + MINBPC(enc);
          return XML_TOK_START_TAG_WITH_ATTS;
        case BT_SOL:
        sol:
          ptr += MINBPC(enc);
          REQUIRE_CHAR(enc, ptr, end);
          if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          *nextTokPtr = ptr + MINBPC(enc);
          return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        break;
      }
      break;
    }
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
719 
/* ptr points to character following "<"

   Dispatches on what follows '<':
   - "!-"  -> scanComment
   - "!["  -> scanCdataSection
   - "?"   -> scanPi
   - "/"   -> scanEndTag
   - a name-start character -> start tag, scanned here; once whitespace and
     then the first character of an attribute name have been seen, the rest
     is delegated to scanAtts.

   For a start tag without attributes, returns XML_TOK_START_TAG_NO_ATTS for
   '>' or XML_TOK_EMPTY_ELEMENT_NO_ATTS for "/>", with *nextTokPtr just past
   the '>'.  Returns XML_TOK_INVALID with *nextTokPtr at the offending
   character, XML_TOK_PARTIAL when the input ends first, or
   XML_TOK_PARTIAL_CHAR when a multi-byte character is cut off by end. */

static int PTRCALL
PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
               const char **nextTokPtr) {
#  ifdef XML_NS
  /* set once the element name contains a ':' */
  int hadColon;
#  endif
  REQUIRE_CHAR(enc, ptr, end);
  switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_EXCL:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_MINUS:
      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_LSQB:
      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    }
    /* "<!" followed by anything else is rejected here */
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_QUEST:
    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_SOL:
    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
#  ifdef XML_NS
  hadColon = 0;
#  endif
  /* we have a start-tag */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#  ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a name-start
         character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#  endif
    case BT_S:
    case BT_CR:
    case BT_LF: {
      /* whitespace after the element name: skip it and see what follows */
      ptr += MINBPC(enc);
      while (HAS_CHAR(enc, ptr, end)) {
        switch (BYTE_TYPE(enc, ptr)) {
          /* a name-start character is consumed by the macro, whose break
             leaves the switch and reaches the scanAtts call below */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
        case BT_GT:
          goto gt;
        case BT_SOL:
          goto sol;
        case BT_S:
        case BT_CR:
        case BT_LF:
          ptr += MINBPC(enc);
          continue;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
      }
      return XML_TOK_PARTIAL;
    }
    case BT_GT:
    gt:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_START_TAG_NO_ATTS;
    case BT_SOL:
    sol:
      /* '/' must be immediately followed by '>' */
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
819 
/* Returns the next token in element content.

   Possible results:
   - XML_TOK_NONE when ptr >= end.
   - Whatever scanLt / scanRef return when the first character is '<' / '&'.
   - XML_TOK_DATA_NEWLINE for CR, CR LF or LF; XML_TOK_TRAILING_CR when a CR
     is the last available character.
   - XML_TOK_TRAILING_RSQB when the input ends after "]" or "]]".
   - XML_TOK_INVALID for "]]>" (*nextTokPtr at the '>' when it starts the
     token, at the second ']' when found inside a data run), and from
     INVALID_CASES for a bad first character.
   - XML_TOK_DATA_CHARS for a run of character data, with *nextTokPtr at the
     first character that ends the run.
   - XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when more input is needed. */
static int PTRCALL
PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
                   const char **nextTokPtr) {
  if (ptr >= end)
    return XML_TOK_NONE;
  if (MINBPC(enc) > 1) {
    /* trim end so that [ptr, end) holds a whole number of MINBPC units */
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  /* the first character decides whether this is a special token */
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_LT:
    return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_AMP:
    return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_CR:
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_CR;
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
  case BT_RSQB:
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_RSQB;
    if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_RSQB;
    if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" without '>': step back so only the first ']' counts as data */
      ptr -= MINBPC(enc);
      break;
    }
    /* "]]>" is rejected in content */
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
    INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* extend the data run until something that must start a new token */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* a truncated or invalid multi-byte character ends the run instead of
   being reported here */
#  define LEAD_CASE(n)                                                         \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) {                       \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_DATA_CHARS;                                               \
    }                                                                          \
    ptr += n;                                                                  \
    break;
      LEAD_CASE(2)
      LEAD_CASE(3)
      LEAD_CASE(4)
#  undef LEAD_CASE
    case BT_RSQB:
      /* look ahead for "]]>"; when too few characters are available to
         decide, fall through and end the run before this ']' */
      if (HAS_CHARS(enc, ptr, end, 2)) {
        if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
          ptr += MINBPC(enc);
          break;
        }
        if (HAS_CHARS(enc, ptr, end, 3)) {
          if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
            ptr += MINBPC(enc);
            break;
          }
          *nextTokPtr = ptr + 2 * MINBPC(enc);
          return XML_TOK_INVALID;
        }
      }
      /* fall through */
    case BT_AMP:
    case BT_LT:
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
917 
918 /* ptr points to character following "%" */
919 
920 static int PTRCALL
PREFIX(scanPercent)921 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
922                     const char **nextTokPtr) {
923   REQUIRE_CHAR(enc, ptr, end);
924   switch (BYTE_TYPE(enc, ptr)) {
925     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
926   case BT_S:
927   case BT_LF:
928   case BT_CR:
929   case BT_PERCNT:
930     *nextTokPtr = ptr;
931     return XML_TOK_PERCENT;
932   default:
933     *nextTokPtr = ptr;
934     return XML_TOK_INVALID;
935   }
936   while (HAS_CHAR(enc, ptr, end)) {
937     switch (BYTE_TYPE(enc, ptr)) {
938       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
939     case BT_SEMI:
940       *nextTokPtr = ptr + MINBPC(enc);
941       return XML_TOK_PARAM_ENTITY_REF;
942     default:
943       *nextTokPtr = ptr;
944       return XML_TOK_INVALID;
945     }
946   }
947   return XML_TOK_PARTIAL;
948 }
949 
950 static int PTRCALL
PREFIX(scanPoundName)951 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
952                       const char **nextTokPtr) {
953   REQUIRE_CHAR(enc, ptr, end);
954   switch (BYTE_TYPE(enc, ptr)) {
955     CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
956   default:
957     *nextTokPtr = ptr;
958     return XML_TOK_INVALID;
959   }
960   while (HAS_CHAR(enc, ptr, end)) {
961     switch (BYTE_TYPE(enc, ptr)) {
962       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
963     case BT_CR:
964     case BT_LF:
965     case BT_S:
966     case BT_RPAR:
967     case BT_GT:
968     case BT_PERCNT:
969     case BT_VERBAR:
970       *nextTokPtr = ptr;
971       return XML_TOK_POUND_NAME;
972     default:
973       *nextTokPtr = ptr;
974       return XML_TOK_INVALID;
975     }
976   }
977   return -XML_TOK_POUND_NAME;
978 }
979 
980 static int PTRCALL
PREFIX(scanLit)981 PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
982                 const char **nextTokPtr) {
983   while (HAS_CHAR(enc, ptr, end)) {
984     int t = BYTE_TYPE(enc, ptr);
985     switch (t) {
986       INVALID_CASES(ptr, nextTokPtr)
987     case BT_QUOT:
988     case BT_APOS:
989       ptr += MINBPC(enc);
990       if (t != open)
991         break;
992       if (! HAS_CHAR(enc, ptr, end))
993         return -XML_TOK_LITERAL;
994       *nextTokPtr = ptr;
995       switch (BYTE_TYPE(enc, ptr)) {
996       case BT_S:
997       case BT_CR:
998       case BT_LF:
999       case BT_GT:
1000       case BT_PERCNT:
1001       case BT_LSQB:
1002         return XML_TOK_LITERAL;
1003       default:
1004         return XML_TOK_INVALID;
1005       }
1006     default:
1007       ptr += MINBPC(enc);
1008       break;
1009     }
1010   }
1011   return XML_TOK_PARTIAL;
1012 }
1013 
1014 static int PTRCALL
PREFIX(prologTok)1015 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
1016                   const char **nextTokPtr) {
1017   int tok;
1018   if (ptr >= end)
1019     return XML_TOK_NONE;
1020   if (MINBPC(enc) > 1) {
1021     size_t n = end - ptr;
1022     if (n & (MINBPC(enc) - 1)) {
1023       n &= ~(MINBPC(enc) - 1);
1024       if (n == 0)
1025         return XML_TOK_PARTIAL;
1026       end = ptr + n;
1027     }
1028   }
1029   switch (BYTE_TYPE(enc, ptr)) {
1030   case BT_QUOT:
1031     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1032   case BT_APOS:
1033     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1034   case BT_LT: {
1035     ptr += MINBPC(enc);
1036     REQUIRE_CHAR(enc, ptr, end);
1037     switch (BYTE_TYPE(enc, ptr)) {
1038     case BT_EXCL:
1039       return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1040     case BT_QUEST:
1041       return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1042     case BT_NMSTRT:
1043     case BT_HEX:
1044     case BT_NONASCII:
1045     case BT_LEAD2:
1046     case BT_LEAD3:
1047     case BT_LEAD4:
1048       *nextTokPtr = ptr - MINBPC(enc);
1049       return XML_TOK_INSTANCE_START;
1050     }
1051     *nextTokPtr = ptr;
1052     return XML_TOK_INVALID;
1053   }
1054   case BT_CR:
1055     if (ptr + MINBPC(enc) == end) {
1056       *nextTokPtr = end;
1057       /* indicate that this might be part of a CR/LF pair */
1058       return -XML_TOK_PROLOG_S;
1059     }
1060     /* fall through */
1061   case BT_S:
1062   case BT_LF:
1063     for (;;) {
1064       ptr += MINBPC(enc);
1065       if (! HAS_CHAR(enc, ptr, end))
1066         break;
1067       switch (BYTE_TYPE(enc, ptr)) {
1068       case BT_S:
1069       case BT_LF:
1070         break;
1071       case BT_CR:
1072         /* don't split CR/LF pair */
1073         if (ptr + MINBPC(enc) != end)
1074           break;
1075         /* fall through */
1076       default:
1077         *nextTokPtr = ptr;
1078         return XML_TOK_PROLOG_S;
1079       }
1080     }
1081     *nextTokPtr = ptr;
1082     return XML_TOK_PROLOG_S;
1083   case BT_PERCNT:
1084     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1085   case BT_COMMA:
1086     *nextTokPtr = ptr + MINBPC(enc);
1087     return XML_TOK_COMMA;
1088   case BT_LSQB:
1089     *nextTokPtr = ptr + MINBPC(enc);
1090     return XML_TOK_OPEN_BRACKET;
1091   case BT_RSQB:
1092     ptr += MINBPC(enc);
1093     if (! HAS_CHAR(enc, ptr, end))
1094       return -XML_TOK_CLOSE_BRACKET;
1095     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1096       REQUIRE_CHARS(enc, ptr, end, 2);
1097       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1098         *nextTokPtr = ptr + 2 * MINBPC(enc);
1099         return XML_TOK_COND_SECT_CLOSE;
1100       }
1101     }
1102     *nextTokPtr = ptr;
1103     return XML_TOK_CLOSE_BRACKET;
1104   case BT_LPAR:
1105     *nextTokPtr = ptr + MINBPC(enc);
1106     return XML_TOK_OPEN_PAREN;
1107   case BT_RPAR:
1108     ptr += MINBPC(enc);
1109     if (! HAS_CHAR(enc, ptr, end))
1110       return -XML_TOK_CLOSE_PAREN;
1111     switch (BYTE_TYPE(enc, ptr)) {
1112     case BT_AST:
1113       *nextTokPtr = ptr + MINBPC(enc);
1114       return XML_TOK_CLOSE_PAREN_ASTERISK;
1115     case BT_QUEST:
1116       *nextTokPtr = ptr + MINBPC(enc);
1117       return XML_TOK_CLOSE_PAREN_QUESTION;
1118     case BT_PLUS:
1119       *nextTokPtr = ptr + MINBPC(enc);
1120       return XML_TOK_CLOSE_PAREN_PLUS;
1121     case BT_CR:
1122     case BT_LF:
1123     case BT_S:
1124     case BT_GT:
1125     case BT_COMMA:
1126     case BT_VERBAR:
1127     case BT_RPAR:
1128       *nextTokPtr = ptr;
1129       return XML_TOK_CLOSE_PAREN;
1130     }
1131     *nextTokPtr = ptr;
1132     return XML_TOK_INVALID;
1133   case BT_VERBAR:
1134     *nextTokPtr = ptr + MINBPC(enc);
1135     return XML_TOK_OR;
1136   case BT_GT:
1137     *nextTokPtr = ptr + MINBPC(enc);
1138     return XML_TOK_DECL_CLOSE;
1139   case BT_NUM:
1140     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1141 #  define LEAD_CASE(n)                                                         \
1142   case BT_LEAD##n:                                                             \
1143     if (end - ptr < n)                                                         \
1144       return XML_TOK_PARTIAL_CHAR;                                             \
1145     if (IS_NMSTRT_CHAR(enc, ptr, n)) {                                         \
1146       ptr += n;                                                                \
1147       tok = XML_TOK_NAME;                                                      \
1148       break;                                                                   \
1149     }                                                                          \
1150     if (IS_NAME_CHAR(enc, ptr, n)) {                                           \
1151       ptr += n;                                                                \
1152       tok = XML_TOK_NMTOKEN;                                                   \
1153       break;                                                                   \
1154     }                                                                          \
1155     *nextTokPtr = ptr;                                                         \
1156     return XML_TOK_INVALID;
1157     LEAD_CASE(2)
1158     LEAD_CASE(3)
1159     LEAD_CASE(4)
1160 #  undef LEAD_CASE
1161   case BT_NMSTRT:
1162   case BT_HEX:
1163     tok = XML_TOK_NAME;
1164     ptr += MINBPC(enc);
1165     break;
1166   case BT_DIGIT:
1167   case BT_NAME:
1168   case BT_MINUS:
1169 #  ifdef XML_NS
1170   case BT_COLON:
1171 #  endif
1172     tok = XML_TOK_NMTOKEN;
1173     ptr += MINBPC(enc);
1174     break;
1175   case BT_NONASCII:
1176     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1177       ptr += MINBPC(enc);
1178       tok = XML_TOK_NAME;
1179       break;
1180     }
1181     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1182       ptr += MINBPC(enc);
1183       tok = XML_TOK_NMTOKEN;
1184       break;
1185     }
1186     /* fall through */
1187   default:
1188     *nextTokPtr = ptr;
1189     return XML_TOK_INVALID;
1190   }
1191   while (HAS_CHAR(enc, ptr, end)) {
1192     switch (BYTE_TYPE(enc, ptr)) {
1193       CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1194     case BT_GT:
1195     case BT_RPAR:
1196     case BT_COMMA:
1197     case BT_VERBAR:
1198     case BT_LSQB:
1199     case BT_PERCNT:
1200     case BT_S:
1201     case BT_CR:
1202     case BT_LF:
1203       *nextTokPtr = ptr;
1204       return tok;
1205 #  ifdef XML_NS
1206     case BT_COLON:
1207       ptr += MINBPC(enc);
1208       switch (tok) {
1209       case XML_TOK_NAME:
1210         REQUIRE_CHAR(enc, ptr, end);
1211         tok = XML_TOK_PREFIXED_NAME;
1212         switch (BYTE_TYPE(enc, ptr)) {
1213           CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1214         default:
1215           tok = XML_TOK_NMTOKEN;
1216           break;
1217         }
1218         break;
1219       case XML_TOK_PREFIXED_NAME:
1220         tok = XML_TOK_NMTOKEN;
1221         break;
1222       }
1223       break;
1224 #  endif
1225     case BT_PLUS:
1226       if (tok == XML_TOK_NMTOKEN) {
1227         *nextTokPtr = ptr;
1228         return XML_TOK_INVALID;
1229       }
1230       *nextTokPtr = ptr + MINBPC(enc);
1231       return XML_TOK_NAME_PLUS;
1232     case BT_AST:
1233       if (tok == XML_TOK_NMTOKEN) {
1234         *nextTokPtr = ptr;
1235         return XML_TOK_INVALID;
1236       }
1237       *nextTokPtr = ptr + MINBPC(enc);
1238       return XML_TOK_NAME_ASTERISK;
1239     case BT_QUEST:
1240       if (tok == XML_TOK_NMTOKEN) {
1241         *nextTokPtr = ptr;
1242         return XML_TOK_INVALID;
1243       }
1244       *nextTokPtr = ptr + MINBPC(enc);
1245       return XML_TOK_NAME_QUESTION;
1246     default:
1247       *nextTokPtr = ptr;
1248       return XML_TOK_INVALID;
1249     }
1250   }
1251   return -tok;
1252 }
1253 
1254 static int PTRCALL
PREFIX(attributeValueTok)1255 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1256                           const char **nextTokPtr) {
1257   const char *start;
1258   if (ptr >= end)
1259     return XML_TOK_NONE;
1260   else if (! HAS_CHAR(enc, ptr, end)) {
1261     /* This line cannot be executed.  The incoming data has already
1262      * been tokenized once, so incomplete characters like this have
1263      * already been eliminated from the input.  Retaining the paranoia
1264      * check is still valuable, however.
1265      */
1266     return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1267   }
1268   start = ptr;
1269   while (HAS_CHAR(enc, ptr, end)) {
1270     switch (BYTE_TYPE(enc, ptr)) {
1271 #  define LEAD_CASE(n)                                                         \
1272   case BT_LEAD##n:                                                             \
1273     ptr += n;                                                                  \
1274     break;
1275       LEAD_CASE(2)
1276       LEAD_CASE(3)
1277       LEAD_CASE(4)
1278 #  undef LEAD_CASE
1279     case BT_AMP:
1280       if (ptr == start)
1281         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1282       *nextTokPtr = ptr;
1283       return XML_TOK_DATA_CHARS;
1284     case BT_LT:
1285       /* this is for inside entity references */
1286       *nextTokPtr = ptr;
1287       return XML_TOK_INVALID;
1288     case BT_LF:
1289       if (ptr == start) {
1290         *nextTokPtr = ptr + MINBPC(enc);
1291         return XML_TOK_DATA_NEWLINE;
1292       }
1293       *nextTokPtr = ptr;
1294       return XML_TOK_DATA_CHARS;
1295     case BT_CR:
1296       if (ptr == start) {
1297         ptr += MINBPC(enc);
1298         if (! HAS_CHAR(enc, ptr, end))
1299           return XML_TOK_TRAILING_CR;
1300         if (BYTE_TYPE(enc, ptr) == BT_LF)
1301           ptr += MINBPC(enc);
1302         *nextTokPtr = ptr;
1303         return XML_TOK_DATA_NEWLINE;
1304       }
1305       *nextTokPtr = ptr;
1306       return XML_TOK_DATA_CHARS;
1307     case BT_S:
1308       if (ptr == start) {
1309         *nextTokPtr = ptr + MINBPC(enc);
1310         return XML_TOK_ATTRIBUTE_VALUE_S;
1311       }
1312       *nextTokPtr = ptr;
1313       return XML_TOK_DATA_CHARS;
1314     default:
1315       ptr += MINBPC(enc);
1316       break;
1317     }
1318   }
1319   *nextTokPtr = ptr;
1320   return XML_TOK_DATA_CHARS;
1321 }
1322 
1323 static int PTRCALL
PREFIX(entityValueTok)1324 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1325                        const char **nextTokPtr) {
1326   const char *start;
1327   if (ptr >= end)
1328     return XML_TOK_NONE;
1329   else if (! HAS_CHAR(enc, ptr, end)) {
1330     /* This line cannot be executed.  The incoming data has already
1331      * been tokenized once, so incomplete characters like this have
1332      * already been eliminated from the input.  Retaining the paranoia
1333      * check is still valuable, however.
1334      */
1335     return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1336   }
1337   start = ptr;
1338   while (HAS_CHAR(enc, ptr, end)) {
1339     switch (BYTE_TYPE(enc, ptr)) {
1340 #  define LEAD_CASE(n)                                                         \
1341   case BT_LEAD##n:                                                             \
1342     ptr += n;                                                                  \
1343     break;
1344       LEAD_CASE(2)
1345       LEAD_CASE(3)
1346       LEAD_CASE(4)
1347 #  undef LEAD_CASE
1348     case BT_AMP:
1349       if (ptr == start)
1350         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1351       *nextTokPtr = ptr;
1352       return XML_TOK_DATA_CHARS;
1353     case BT_PERCNT:
1354       if (ptr == start) {
1355         int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1356         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1357       }
1358       *nextTokPtr = ptr;
1359       return XML_TOK_DATA_CHARS;
1360     case BT_LF:
1361       if (ptr == start) {
1362         *nextTokPtr = ptr + MINBPC(enc);
1363         return XML_TOK_DATA_NEWLINE;
1364       }
1365       *nextTokPtr = ptr;
1366       return XML_TOK_DATA_CHARS;
1367     case BT_CR:
1368       if (ptr == start) {
1369         ptr += MINBPC(enc);
1370         if (! HAS_CHAR(enc, ptr, end))
1371           return XML_TOK_TRAILING_CR;
1372         if (BYTE_TYPE(enc, ptr) == BT_LF)
1373           ptr += MINBPC(enc);
1374         *nextTokPtr = ptr;
1375         return XML_TOK_DATA_NEWLINE;
1376       }
1377       *nextTokPtr = ptr;
1378       return XML_TOK_DATA_CHARS;
1379     default:
1380       ptr += MINBPC(enc);
1381       break;
1382     }
1383   }
1384   *nextTokPtr = ptr;
1385   return XML_TOK_DATA_CHARS;
1386 }
1387 
1388 #  ifdef XML_DTD
1389 
1390 static int PTRCALL
PREFIX(ignoreSectionTok)1391 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1392                          const char **nextTokPtr) {
1393   int level = 0;
1394   if (MINBPC(enc) > 1) {
1395     size_t n = end - ptr;
1396     if (n & (MINBPC(enc) - 1)) {
1397       n &= ~(MINBPC(enc) - 1);
1398       end = ptr + n;
1399     }
1400   }
1401   while (HAS_CHAR(enc, ptr, end)) {
1402     switch (BYTE_TYPE(enc, ptr)) {
1403       INVALID_CASES(ptr, nextTokPtr)
1404     case BT_LT:
1405       ptr += MINBPC(enc);
1406       REQUIRE_CHAR(enc, ptr, end);
1407       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1408         ptr += MINBPC(enc);
1409         REQUIRE_CHAR(enc, ptr, end);
1410         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1411           ++level;
1412           ptr += MINBPC(enc);
1413         }
1414       }
1415       break;
1416     case BT_RSQB:
1417       ptr += MINBPC(enc);
1418       REQUIRE_CHAR(enc, ptr, end);
1419       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1420         ptr += MINBPC(enc);
1421         REQUIRE_CHAR(enc, ptr, end);
1422         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1423           ptr += MINBPC(enc);
1424           if (level == 0) {
1425             *nextTokPtr = ptr;
1426             return XML_TOK_IGNORE_SECT;
1427           }
1428           --level;
1429         }
1430       }
1431       break;
1432     default:
1433       ptr += MINBPC(enc);
1434       break;
1435     }
1436   }
1437   return XML_TOK_PARTIAL;
1438 }
1439 
1440 #  endif /* XML_DTD */
1441 
1442 static int PTRCALL
PREFIX(isPublicId)1443 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1444                    const char **badPtr) {
1445   ptr += MINBPC(enc);
1446   end -= MINBPC(enc);
1447   for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1448     switch (BYTE_TYPE(enc, ptr)) {
1449     case BT_DIGIT:
1450     case BT_HEX:
1451     case BT_MINUS:
1452     case BT_APOS:
1453     case BT_LPAR:
1454     case BT_RPAR:
1455     case BT_PLUS:
1456     case BT_COMMA:
1457     case BT_SOL:
1458     case BT_EQUALS:
1459     case BT_QUEST:
1460     case BT_CR:
1461     case BT_LF:
1462     case BT_SEMI:
1463     case BT_EXCL:
1464     case BT_AST:
1465     case BT_PERCNT:
1466     case BT_NUM:
1467 #  ifdef XML_NS
1468     case BT_COLON:
1469 #  endif
1470       break;
1471     case BT_S:
1472       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1473         *badPtr = ptr;
1474         return 0;
1475       }
1476       break;
1477     case BT_NAME:
1478     case BT_NMSTRT:
1479       if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1480         break;
1481       /* fall through */
1482     default:
1483       switch (BYTE_TO_ASCII(enc, ptr)) {
1484       case 0x24: /* $ */
1485       case 0x40: /* @ */
1486         break;
1487       default:
1488         *badPtr = ptr;
1489         return 0;
1490       }
1491       break;
1492     }
1493   }
1494   return 1;
1495 }
1496 
1497 /* This must only be called for a well-formed start-tag or empty
1498    element tag.  Returns the number of attributes.  Pointers to the
1499    first attsMax attributes are stored in atts.
1500 */
1501 
1502 static int PTRCALL
PREFIX(getAtts)1503 PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
1504                 ATTRIBUTE *atts) {
1505   enum { other, inName, inValue } state = inName;
1506   int nAtts = 0;
1507   int open = 0; /* defined when state == inValue;
1508                    initialization just to shut up compilers */
1509 
1510   for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1511     switch (BYTE_TYPE(enc, ptr)) {
1512 #  define START_NAME                                                           \
1513     if (state == other) {                                                      \
1514       if (nAtts < attsMax) {                                                   \
1515         atts[nAtts].name = ptr;                                                \
1516         atts[nAtts].normalized = 1;                                            \
1517       }                                                                        \
1518       state = inName;                                                          \
1519     }
1520 #  define LEAD_CASE(n)                                                         \
1521   case BT_LEAD##n:                                                             \
1522     START_NAME ptr += (n - MINBPC(enc));                                       \
1523     break;
1524       LEAD_CASE(2)
1525       LEAD_CASE(3)
1526       LEAD_CASE(4)
1527 #  undef LEAD_CASE
1528     case BT_NONASCII:
1529     case BT_NMSTRT:
1530     case BT_HEX:
1531       START_NAME
1532       break;
1533 #  undef START_NAME
1534     case BT_QUOT:
1535       if (state != inValue) {
1536         if (nAtts < attsMax)
1537           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1538         state = inValue;
1539         open = BT_QUOT;
1540       } else if (open == BT_QUOT) {
1541         state = other;
1542         if (nAtts < attsMax)
1543           atts[nAtts].valueEnd = ptr;
1544         nAtts++;
1545       }
1546       break;
1547     case BT_APOS:
1548       if (state != inValue) {
1549         if (nAtts < attsMax)
1550           atts[nAtts].valuePtr = ptr + MINBPC(enc);
1551         state = inValue;
1552         open = BT_APOS;
1553       } else if (open == BT_APOS) {
1554         state = other;
1555         if (nAtts < attsMax)
1556           atts[nAtts].valueEnd = ptr;
1557         nAtts++;
1558       }
1559       break;
1560     case BT_AMP:
1561       if (nAtts < attsMax)
1562         atts[nAtts].normalized = 0;
1563       break;
1564     case BT_S:
1565       if (state == inName)
1566         state = other;
1567       else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
1568                && (ptr == atts[nAtts].valuePtr
1569                    || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1570                    || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1571                    || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1572         atts[nAtts].normalized = 0;
1573       break;
1574     case BT_CR:
1575     case BT_LF:
1576       /* This case ensures that the first attribute name is counted
1577          Apart from that we could just change state on the quote. */
1578       if (state == inName)
1579         state = other;
1580       else if (state == inValue && nAtts < attsMax)
1581         atts[nAtts].normalized = 0;
1582       break;
1583     case BT_GT:
1584     case BT_SOL:
1585       if (state != inValue)
1586         return nAtts;
1587       break;
1588     default:
1589       break;
1590     }
1591   }
1592   /* not reached */
1593 }
1594 
1595 static int PTRFASTCALL
PREFIX(charRefNumber)1596 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
1597   int result = 0;
1598   /* skip &# */
1599   UNUSED_P(enc);
1600   ptr += 2 * MINBPC(enc);
1601   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1602     for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1603          ptr += MINBPC(enc)) {
1604       int c = BYTE_TO_ASCII(enc, ptr);
1605       switch (c) {
1606       case ASCII_0:
1607       case ASCII_1:
1608       case ASCII_2:
1609       case ASCII_3:
1610       case ASCII_4:
1611       case ASCII_5:
1612       case ASCII_6:
1613       case ASCII_7:
1614       case ASCII_8:
1615       case ASCII_9:
1616         result <<= 4;
1617         result |= (c - ASCII_0);
1618         break;
1619       case ASCII_A:
1620       case ASCII_B:
1621       case ASCII_C:
1622       case ASCII_D:
1623       case ASCII_E:
1624       case ASCII_F:
1625         result <<= 4;
1626         result += 10 + (c - ASCII_A);
1627         break;
1628       case ASCII_a:
1629       case ASCII_b:
1630       case ASCII_c:
1631       case ASCII_d:
1632       case ASCII_e:
1633       case ASCII_f:
1634         result <<= 4;
1635         result += 10 + (c - ASCII_a);
1636         break;
1637       }
1638       if (result >= 0x110000)
1639         return -1;
1640     }
1641   } else {
1642     for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1643       int c = BYTE_TO_ASCII(enc, ptr);
1644       result *= 10;
1645       result += (c - ASCII_0);
1646       if (result >= 0x110000)
1647         return -1;
1648     }
1649   }
1650   return checkCharRefNumber(result);
1651 }
1652 
1653 static int PTRCALL
PREFIX(predefinedEntityName)1654 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1655                              const char *end) {
1656   UNUSED_P(enc);
1657   switch ((end - ptr) / MINBPC(enc)) {
1658   case 2:
1659     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1660       switch (BYTE_TO_ASCII(enc, ptr)) {
1661       case ASCII_l:
1662         return ASCII_LT;
1663       case ASCII_g:
1664         return ASCII_GT;
1665       }
1666     }
1667     break;
1668   case 3:
1669     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1670       ptr += MINBPC(enc);
1671       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1672         ptr += MINBPC(enc);
1673         if (CHAR_MATCHES(enc, ptr, ASCII_p))
1674           return ASCII_AMP;
1675       }
1676     }
1677     break;
1678   case 4:
1679     switch (BYTE_TO_ASCII(enc, ptr)) {
1680     case ASCII_q:
1681       ptr += MINBPC(enc);
1682       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1683         ptr += MINBPC(enc);
1684         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1685           ptr += MINBPC(enc);
1686           if (CHAR_MATCHES(enc, ptr, ASCII_t))
1687             return ASCII_QUOT;
1688         }
1689       }
1690       break;
1691     case ASCII_a:
1692       ptr += MINBPC(enc);
1693       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1694         ptr += MINBPC(enc);
1695         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1696           ptr += MINBPC(enc);
1697           if (CHAR_MATCHES(enc, ptr, ASCII_s))
1698             return ASCII_APOS;
1699         }
1700       }
1701       break;
1702     }
1703   }
1704   return 0;
1705 }
1706 
1707 static int PTRCALL
PREFIX(nameMatchesAscii)1708 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1709                          const char *end1, const char *ptr2) {
1710   UNUSED_P(enc);
1711   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1712     if (end1 - ptr1 < MINBPC(enc)) {
1713       /* This line cannot be executed.  The incoming data has already
1714        * been tokenized once, so incomplete characters like this have
1715        * already been eliminated from the input.  Retaining the
1716        * paranoia check is still valuable, however.
1717        */
1718       return 0; /* LCOV_EXCL_LINE */
1719     }
1720     if (! CHAR_MATCHES(enc, ptr1, *ptr2))
1721       return 0;
1722   }
1723   return ptr1 == end1;
1724 }
1725 
1726 static int PTRFASTCALL
PREFIX(nameLength)1727 PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
1728   const char *start = ptr;
1729   for (;;) {
1730     switch (BYTE_TYPE(enc, ptr)) {
1731 #  define LEAD_CASE(n)                                                         \
1732   case BT_LEAD##n:                                                             \
1733     ptr += n;                                                                  \
1734     break;
1735       LEAD_CASE(2)
1736       LEAD_CASE(3)
1737       LEAD_CASE(4)
1738 #  undef LEAD_CASE
1739     case BT_NONASCII:
1740     case BT_NMSTRT:
1741 #  ifdef XML_NS
1742     case BT_COLON:
1743 #  endif
1744     case BT_HEX:
1745     case BT_DIGIT:
1746     case BT_NAME:
1747     case BT_MINUS:
1748       ptr += MINBPC(enc);
1749       break;
1750     default:
1751       return (int)(ptr - start);
1752     }
1753   }
1754 }
1755 
1756 static const char *PTRFASTCALL
PREFIX(skipS)1757 PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
1758   for (;;) {
1759     switch (BYTE_TYPE(enc, ptr)) {
1760     case BT_LF:
1761     case BT_CR:
1762     case BT_S:
1763       ptr += MINBPC(enc);
1764       break;
1765     default:
1766       return ptr;
1767     }
1768   }
1769 }
1770 
1771 static void PTRCALL
PREFIX(updatePosition)1772 PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
1773                        POSITION *pos) {
1774   while (HAS_CHAR(enc, ptr, end)) {
1775     switch (BYTE_TYPE(enc, ptr)) {
1776 #  define LEAD_CASE(n)                                                         \
1777   case BT_LEAD##n:                                                             \
1778     ptr += n;                                                                  \
1779     pos->columnNumber++;                                                       \
1780     break;
1781       LEAD_CASE(2)
1782       LEAD_CASE(3)
1783       LEAD_CASE(4)
1784 #  undef LEAD_CASE
1785     case BT_LF:
1786       pos->columnNumber = 0;
1787       pos->lineNumber++;
1788       ptr += MINBPC(enc);
1789       break;
1790     case BT_CR:
1791       pos->lineNumber++;
1792       ptr += MINBPC(enc);
1793       if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1794         ptr += MINBPC(enc);
1795       pos->columnNumber = 0;
1796       break;
1797     default:
1798       ptr += MINBPC(enc);
1799       pos->columnNumber++;
1800       break;
1801     }
1802   }
1803 }
1804 
1805 #  undef DO_LEAD_CASE
1806 #  undef MULTIBYTE_CASES
1807 #  undef INVALID_CASES
1808 #  undef CHECK_NAME_CASE
1809 #  undef CHECK_NAME_CASES
1810 #  undef CHECK_NMSTRT_CASE
1811 #  undef CHECK_NMSTRT_CASES
1812 
1813 #endif /* XML_TOK_IMPL_C */
1814