1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2    See the file COPYING for copying permission.
3 */
4 
5 #include <stddef.h>
6 
7 #ifdef COMPILED_FROM_DSP
8 #include "winconfig.h"
9 #elif defined(MACOS_CLASSIC)
10 #include "macconfig.h"
11 #else
12 #ifdef HAVE_EXPAT_CONFIG_H
13 #include <expat_config.h>
14 #endif
15 #endif /* ndef COMPILED_FROM_DSP */
16 
17 #include "expat_external.h"
18 #include "internal.h"
19 #include "xmltok.h"
20 #include "nametab.h"
21 
/* With XML_DTD defined, the scanner list built by VTABLE1 gains one extra
   entry, PREFIX(ignoreSectionTok); otherwise the entry is omitted. */
#ifdef XML_DTD
#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment naming the PREFIX()-qualified tokenizer functions.
   PREFIX is expanded where the fragment is used, so the same text serves
   every encoding family.
   NOTE(review): the entry order presumably mirrors the member order of
   ENCODING in xmltok.h -- confirm before reordering anything here. */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), \
    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the family's own toUtf8 / toUtf16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
43 
/* Look up the character with high byte hi and low byte lo in namingBitmap.
   pages[hi] selects a group of eight bitmap words, the top 3 bits of lo
   select the word within the group, and the bottom 5 bits of lo select
   the bit.  The mask is built from an unsigned 1 so that selecting bit 31
   never shifts into the sign bit of an int.  Non-zero means "flagged". */
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
46 
/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
   The mask uses an unsigned 1 so that bit 31 can be selected without
   shifting into the sign bit of an int.
*/
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))
56 
/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.  The mask uses an unsigned 1 so that bit 31 can be selected
   without shifting into the sign bit of an int.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))
69 
/* Dispatch on the sequence length n: 2- and 3-byte sequences starting at p
   are looked up through UTF8_GET_NAMING2 / UTF8_GET_NAMING3; any other
   length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))
76 
77 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
78    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
79    with the additional restriction of not allowing the Unicode
80    code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
81    Implementation details:
82      (A & 0x80) == 0     means A < 0x80
83    and
84      (A & 0xC0) == 0xC0  means A > 0xBF
85 */
86 
/* Non-zero when the 2-byte sequence at p (an unsigned char pointer) is not
   acceptable: lead byte below 0xC2, or second byte outside 0x80..0xBF.
   The argument is fully parenthesized so pointer expressions such as
   buf + 1 are evaluated as intended. */
#define UTF8_INVALID2(p) \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
89 
/* Non-zero when the 3-byte sequence at p (an unsigned char pointer) is not
   acceptable.  Checks, in order: the third byte (with EF,BF,BE and
   EF,BF,BF rejected), then the second byte, whose allowed range is
   narrowed for lead bytes 0xE0 (at least 0xA0) and 0xED (at most 0x9F).
   The argument is fully parenthesized so pointer expressions such as
   buf + 1 are evaluated as intended. */
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
  || \
  ((*(p)) == 0xEF && (p)[1] == 0xBF \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
  || \
  ((*(p)) == 0xE0 \
    ? \
    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*(p)) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
106 
/* Non-zero when the 4-byte sequence at p (an unsigned char pointer) is not
   acceptable.  The fourth and third bytes must lie in 0x80..0xBF; the
   second byte's range is narrowed for lead bytes 0xF0 (at least 0x90) and
   0xF4 (at most 0x8F).  The argument is fully parenthesized so pointer
   expressions such as buf + 1 are evaluated as intended. */
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  || \
  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  || \
  ((*(p)) == 0xF0 \
    ? \
    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*(p)) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
119 
120 static int PTRFASTCALL
isNever(const ENCODING * enc,const char * p)121 isNever(const ENCODING *enc, const char *p)
122 {
123   return 0;
124 }
125 
126 static int PTRFASTCALL
utf8_isName2(const ENCODING * enc,const char * p)127 utf8_isName2(const ENCODING *enc, const char *p)
128 {
129   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
130 }
131 
132 static int PTRFASTCALL
utf8_isName3(const ENCODING * enc,const char * p)133 utf8_isName3(const ENCODING *enc, const char *p)
134 {
135   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
136 }
137 
138 #define utf8_isName4 isNever
139 
140 static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING * enc,const char * p)141 utf8_isNmstrt2(const ENCODING *enc, const char *p)
142 {
143   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
144 }
145 
146 static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING * enc,const char * p)147 utf8_isNmstrt3(const ENCODING *enc, const char *p)
148 {
149   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
150 }
151 
152 #define utf8_isNmstrt4 isNever
153 
154 static int PTRFASTCALL
utf8_isInvalid2(const ENCODING * enc,const char * p)155 utf8_isInvalid2(const ENCODING *enc, const char *p)
156 {
157   return UTF8_INVALID2((const unsigned char *)p);
158 }
159 
160 static int PTRFASTCALL
utf8_isInvalid3(const ENCODING * enc,const char * p)161 utf8_isInvalid3(const ENCODING *enc, const char *p)
162 {
163   return UTF8_INVALID3((const unsigned char *)p);
164 }
165 
166 static int PTRFASTCALL
utf8_isInvalid4(const ENCODING * enc,const char * p)167 utf8_isInvalid4(const ENCODING *enc, const char *p)
168 {
169   return UTF8_INVALID4((const unsigned char *)p);
170 }
171 
/* An ENCODING extended with a per-byte classification table and the
   predicates the shared tokenizer code calls through AS_NORMAL_ENCODING.
   enc is the first member, which is what lets an ENCODING pointer be cast
   to this type. */
struct normal_encoding {
  ENCODING enc;
  /* Byte type (a BT_* value) for each possible byte value. */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Extra indirections used only in the XML_MIN_SIZE build, where the
     accessor macros call through these pointers. */
  int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name-character tests for 2-, 3- and 4-byte sequences. */
  int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
  /* Name-start-character tests for 2-, 3- and 4-byte sequences. */
  int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  /* Malformed-sequence tests for 2-, 3- and 4-byte sequences. */
  int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};

/* View an ENCODING pointer as the enclosing struct normal_encoding. */
#define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))
194 
/* Initializer fragment for the XML_MIN_SIZE-only members of
   struct normal_encoding, built from functions named with prefix E.  It
   ends with a comma so NORMAL_VTABLE can follow directly.  Without
   XML_MIN_SIZE those members do not exist and it expands to nothing. */
#ifdef XML_MIN_SIZE

#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#else

#define STANDARD_VTABLE(E) /* as nothing */

#endif
209 
/* Initializer fragment for the nine isName / isNmstrt / isInvalid members
   of struct normal_encoding, built from functions named with prefix E. */
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4
220 
221 static int FASTCALL checkCharRefNumber(int);
222 
223 #include "xmltok_impl.h"
224 #include "ascii.h"
225 
/* In the XML_MIN_SIZE build the single-byte ("sb_") family's
   minimum-width name tests are aliases for isNever. */
#ifdef XML_MIN_SIZE
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

/* MINBPC: with XML_MIN_SIZE the value is read from the encoding at run
   time; otherwise it is the constant 1 for this instantiation. */
#ifdef XML_MIN_SIZE
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif
237 
/* Byte type of the single byte at p, read from the encoding's type table. */
#define SB_BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

/* BYTE_TYPE: with XML_MIN_SIZE it calls through the encoding's byteType
   pointer (sb_byteType is the single-byte implementation); otherwise it
   is the direct table lookup. */
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
252 
/* BYTE_TO_ASCII: for the single-byte family the character at p is returned
   as-is.  With XML_MIN_SIZE the macro calls through the encoding's
   byteToAscii pointer, for which sb_byteToAscii is the implementation. */
#ifdef XML_MIN_SIZE
#define BYTE_TO_ASCII(enc, p) \
 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p)
{
  return *p;
}
#else
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif
264 
/* Multi-byte predicates: n (2, 3 or 4) is pasted onto the member name, so
   IS_NAME_CHAR(enc, p, 2) calls the encoding's isName2, and so on. */
#define IS_NAME_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))

/* Minimum-width name tests: dispatched through the encoding with
   XML_MIN_SIZE, constant 0 otherwise for this instantiation. */
#ifdef XML_MIN_SIZE
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
281 
/* CHAR_MATCHES: true when the character at p equals the ASCII character c.
   With XML_MIN_SIZE the macro calls through the encoding's charMatches
   pointer, for which sb_charMatches is the single-byte implementation.
   The direct form parenthesizes c so that an expression argument cannot
   re-associate with the comparison. */
#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
294 
295 #define PREFIX(ident) normal_ ## ident
296 #include "xmltok_impl.c"
297 
298 #undef MINBPC
299 #undef BYTE_TYPE
300 #undef BYTE_TO_ASCII
301 #undef CHAR_MATCHES
302 #undef IS_NAME_CHAR
303 #undef IS_NAME_CHAR_MINBPC
304 #undef IS_NMSTRT_CHAR
305 #undef IS_NMSTRT_CHAR_MINBPC
306 #undef IS_INVALID_CHAR
307 
/* Marker bits OR-ed into the first byte when emitting an N-byte UTF-8
   sequence (see the toUtf8 converters in this file). */
enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};
314 
315 static void PTRCALL
utf8_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)316 utf8_toUtf8(const ENCODING *enc,
317             const char **fromP, const char *fromLim,
318             char **toP, const char *toLim)
319 {
320   char *to;
321   const char *from;
322   if (fromLim - *fromP > toLim - *toP) {
323     /* Avoid copying partial characters. */
324     for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
325       if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
326         break;
327   }
328   for (to = *toP, from = *fromP; from != fromLim; from++, to++)
329     *to = *from;
330   *fromP = from;
331   *toP = to;
332 }
333 
334 static void PTRCALL
utf8_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)335 utf8_toUtf16(const ENCODING *enc,
336              const char **fromP, const char *fromLim,
337              unsigned short **toP, const unsigned short *toLim)
338 {
339   unsigned short *to = *toP;
340   const char *from = *fromP;
341   while (from != fromLim && to != toLim) {
342     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
343     case BT_LEAD2:
344       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
345       from += 2;
346       break;
347     case BT_LEAD3:
348       *to++ = (unsigned short)(((from[0] & 0xf) << 12)
349                                | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
350       from += 3;
351       break;
352     case BT_LEAD4:
353       {
354         unsigned long n;
355         if (to + 1 == toLim)
356           goto after;
357         n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
358             | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
359         n -= 0x10000;
360         to[0] = (unsigned short)((n >> 10) | 0xD800);
361         to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
362         to += 2;
363         from += 4;
364       }
365       break;
366     default:
367       *to++ = *from++;
368       break;
369     }
370   }
371 after:
372   *fromP = from;
373   *toP = to;
374 }
375 
#ifdef XML_NS
/* UTF-8 encoding object built only with XML_NS.  The byte-type table is
   asciitab.h followed by utf8tab.h, with asciitab.h included unmodified
   (BT_COLON is not remapped here). */
static const struct normal_encoding utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
#endif
386 
/* UTF-8 encoding object.  BT_COLON is defined as BT_NMSTRT while
   asciitab.h is included, so any BT_COLON entry in that table becomes
   BT_NMSTRT in this variant. */
static const struct normal_encoding utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
397 
#ifdef XML_NS

/* Like utf8_encoding_ns, but the ASCII half of the byte-type table comes
   from iasciitab.h instead of asciitab.h. */
static const struct normal_encoding internal_utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include "iasciitab.h"
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};

#endif
410 
/* Like utf8_encoding, but the ASCII half of the byte-type table comes from
   iasciitab.h; BT_COLON is again defined as BT_NMSTRT while it is
   included. */
static const struct normal_encoding internal_utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
421 
422 static void PTRCALL
latin1_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)423 latin1_toUtf8(const ENCODING *enc,
424               const char **fromP, const char *fromLim,
425               char **toP, const char *toLim)
426 {
427   for (;;) {
428     unsigned char c;
429     if (*fromP == fromLim)
430       break;
431     c = (unsigned char)**fromP;
432     if (c & 0x80) {
433       if (toLim - *toP < 2)
434         break;
435       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
436       *(*toP)++ = (char)((c & 0x3f) | 0x80);
437       (*fromP)++;
438     }
439     else {
440       if (*toP == toLim)
441         break;
442       *(*toP)++ = *(*fromP)++;
443     }
444   }
445 }
446 
447 static void PTRCALL
latin1_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)448 latin1_toUtf16(const ENCODING *enc,
449                const char **fromP, const char *fromLim,
450                unsigned short **toP, const unsigned short *toLim)
451 {
452   while (*fromP != fromLim && *toP != toLim)
453     *(*toP)++ = (unsigned char)*(*fromP)++;
454 }
455 
#ifdef XML_NS

/* Latin-1 encoding object built only with XML_NS.  The byte-type table is
   asciitab.h followed by latin1tab.h, with asciitab.h unmodified.  Only
   the STANDARD_VTABLE members are given explicitly; the remaining
   function pointers are left to default initialization. */
static const struct normal_encoding latin1_encoding_ns = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_)
};

#endif
468 
/* Latin-1 encoding object.  BT_COLON is defined as BT_NMSTRT while
   asciitab.h is included.  Only the STANDARD_VTABLE members are given
   explicitly; the remaining function pointers are left to default
   initialization. */
static const struct normal_encoding latin1_encoding = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(sb_)
};
479 
480 static void PTRCALL
ascii_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)481 ascii_toUtf8(const ENCODING *enc,
482              const char **fromP, const char *fromLim,
483              char **toP, const char *toLim)
484 {
485   while (*fromP != fromLim && *toP != toLim)
486     *(*toP)++ = *(*fromP)++;
487 }
488 
#ifdef XML_NS

/* US-ASCII encoding object built only with XML_NS.  Only asciitab.h
   entries are listed in the byte-type table; the rest of the table is
   zero-initialized, which per the note below means BT_NONXML.  UTF-16
   output reuses latin1_toUtf16. */
static const struct normal_encoding ascii_encoding_ns = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#include "asciitab.h"
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_)
};

#endif
501 
/* US-ASCII encoding object.  BT_COLON is defined as BT_NMSTRT while
   asciitab.h is included.  The rest of the byte-type table is
   zero-initialized, which per the note below means BT_NONXML.  UTF-16
   output reuses latin1_toUtf16. */
static const struct normal_encoding ascii_encoding = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_)
};
512 
513 static int PTRFASTCALL
unicode_byte_type(char hi,char lo)514 unicode_byte_type(char hi, char lo)
515 {
516   switch ((unsigned char)hi) {
517   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
518     return BT_LEAD4;
519   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
520     return BT_TRAIL;
521   case 0xFF:
522     switch ((unsigned char)lo) {
523     case 0xFF:
524     case 0xFE:
525       return BT_NONXML;
526     }
527     break;
528   }
529   return BT_NONASCII;
530 }
531 
/* Generate E##toUtf8, which converts 16-bit code units read through the
   GET_LO / GET_HI macros in effect at the point of expansion into UTF-8.
   Output length is chosen from the high byte: one byte for values below
   0x80, two for high bytes 0x00..0x07, four for a unit with high byte
   0xD8..0xDB together with the unit that follows it, three otherwise.
   When the output lacks room for the next character, *fromP is set to
   that unit and the function returns; otherwise *fromP ends at fromLim.

   NOTE(review): the loop advances by 2 and tests from != fromLim, so it
   expects an even byte count; the 0xD8..0xDB case reads the following
   unit without comparing against fromLim.  Confirm that callers only pass
   complete, validated input. */
#define DEFINE_UTF16_TO_UTF8(E) \
static void  PTRCALL \
E ## toUtf8(const ENCODING *enc, \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim -  *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
594 
/* Generate E##toUtf16, which copies 16-bit code units from a byte stream
   (read through the GET_LO / GET_HI macros in effect at the point of
   expansion) into unsigned short units.  When the input holds more units
   than the output has room for and the final input unit has a high byte
   in 0xD8..0xDF, fromLim is first pulled back by one unit. */
#define DEFINE_UTF16_TO_UTF16(E) \
static void  PTRCALL \
E ## toUtf16(const ENCODING *enc, \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
608 
/* Little-endian byte accessors: the low byte is at offset 0 and the high
   byte at offset 1.  They are defined only long enough to instantiate
   little2_toUtf8 and little2_toUtf16, then removed. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI
620 
/* Big-endian byte accessors: the high byte is at offset 0 and the low
   byte at offset 1.  They are defined only long enough to instantiate
   big2_toUtf8 and big2_toUtf16, then removed. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
632 
/* Accessors for little-endian 16-bit characters: the low byte is p[0] and
   the high byte is p[1].  A character whose high byte is 0 is classified
   through the encoding's byte-type table; any other goes through
   unicode_byte_type.  Every macro argument is parenthesized so expression
   arguments keep their meaning. */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The ASCII value of the character, or -1 when the high byte is non-zero. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* True when the character equals the ASCII character c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
643 
644 #ifdef XML_MIN_SIZE
645 
/* Function form of LITTLE2_BYTE_TYPE for the XML_MIN_SIZE build; it is
   named by STANDARD_VTABLE(little2_) as the byteType entry. */
static int PTRFASTCALL
little2_byteType(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TYPE(enc, p);
}
651 
/* Function form of LITTLE2_BYTE_TO_ASCII for the XML_MIN_SIZE build; it is
   named by STANDARD_VTABLE(little2_) as the byteToAscii entry. */
static int PTRFASTCALL
little2_byteToAscii(const ENCODING *enc, const char *p)
{
  return LITTLE2_BYTE_TO_ASCII(enc, p);
}
657 
/* Function form of LITTLE2_CHAR_MATCHES for the XML_MIN_SIZE build; it is
   named by STANDARD_VTABLE(little2_) as the charMatches entry. */
static int PTRCALL
little2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return LITTLE2_CHAR_MATCHES(enc, p, c);
}
663 
/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC for the XML_MIN_SIZE build;
   it is named by STANDARD_VTABLE(little2_) as the isNameMin entry. */
static int PTRFASTCALL
little2_isNameMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
}
669 
/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC for the XML_MIN_SIZE
   build; it is named by STANDARD_VTABLE(little2_) as the isNmstrtMin
   entry. */
static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}
675 
676 #undef VTABLE
677 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
678 
679 #else /* not XML_MIN_SIZE */
680 
/* Instantiate the shared tokenizer template (xmltok_impl.c) for the
   little-endian 16-bit family: every generated function gets the
   little2_ prefix, the character width is fixed at 2 bytes, and the
   accessor macros map to their LITTLE2_ forms.  The multi-byte tests
   IS_NAME_CHAR / IS_NMSTRT_CHAR are constant 0 for this family.  The
   accessor macros are removed again afterwards. */
#undef PREFIX
#define PREFIX(ident) little2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
704 
705 #endif /* not XML_MIN_SIZE */
706 
#ifdef XML_NS

/* Little-endian 16-bit encoding object built only with XML_NS.  The last
   ENCODING initializer is 1 only when BYTEORDER == 1234.
   NOTE(review): presumably that field marks data already in the host's
   UTF-16 layout -- confirm against ENCODING in xmltok.h. */
static const struct normal_encoding little2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif
725 
/* Little-endian 16-bit encoding object.  BT_COLON is defined as BT_NMSTRT
   while asciitab.h is included.  The last ENCODING initializer is 1 only
   when BYTEORDER == 1234. */
static const struct normal_encoding little2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};
742 
743 #if BYTEORDER != 4321
744 
#ifdef XML_NS

/* Internal little-endian 16-bit encoding object built only with XML_NS
   (and, from the enclosing conditional, only when BYTEORDER != 4321).
   The ASCII half of the byte-type table comes from iasciitab.h and the
   last ENCODING initializer is always 1. */
static const struct normal_encoding internal_little2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};

#endif
757 
/* Internal little-endian 16-bit encoding object (from the enclosing
   conditional, only when BYTEORDER != 4321).  The ASCII half of the
   byte-type table comes from iasciitab.h with BT_COLON defined as
   BT_NMSTRT; the last ENCODING initializer is always 1. */
static const struct normal_encoding internal_little2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(little2_)
};
768 
769 #endif
770 
771 
/* Accessors for big-endian 16-bit characters: the high byte is p[0] and
   the low byte is p[1].  A character whose high byte is 0 is classified
   through the encoding's byte-type table; any other goes through
   unicode_byte_type.  Every macro argument is parenthesized so expression
   arguments keep their meaning. */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* The ASCII value of the character, or -1 when the high byte is non-zero. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* True when the character equals the ASCII character c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
782 
783 #ifdef XML_MIN_SIZE
784 
/* Function form of BIG2_BYTE_TYPE for the XML_MIN_SIZE build; it is named
   by STANDARD_VTABLE(big2_) as the byteType entry. */
static int PTRFASTCALL
big2_byteType(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TYPE(enc, p);
}
790 
/* Function form of BIG2_BYTE_TO_ASCII for the XML_MIN_SIZE build; it is
   named by STANDARD_VTABLE(big2_) as the byteToAscii entry. */
static int PTRFASTCALL
big2_byteToAscii(const ENCODING *enc, const char *p)
{
  return BIG2_BYTE_TO_ASCII(enc, p);
}
796 
/* Function form of BIG2_CHAR_MATCHES for the XML_MIN_SIZE build; it is
   named by STANDARD_VTABLE(big2_) as the charMatches entry. */
static int PTRCALL
big2_charMatches(const ENCODING *enc, const char *p, int c)
{
  return BIG2_CHAR_MATCHES(enc, p, c);
}
802 
/* Function form of BIG2_IS_NAME_CHAR_MINBPC for the XML_MIN_SIZE build; it
   is named by STANDARD_VTABLE(big2_) as the isNameMin entry. */
static int PTRFASTCALL
big2_isNameMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
}
808 
/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC for the XML_MIN_SIZE build;
   it is named by STANDARD_VTABLE(big2_) as the isNmstrtMin entry. */
static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING *enc, const char *p)
{
  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}
814 
815 #undef VTABLE
816 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
817 
818 #else /* not XML_MIN_SIZE */
819 
/* Instantiate the shared tokenizer template (xmltok_impl.c) for the
   big-endian 16-bit family: every generated function gets the big2_
   prefix, the character width is fixed at 2 bytes, and the accessor
   macros map to their BIG2_ forms.  The multi-byte tests IS_NAME_CHAR /
   IS_NMSTRT_CHAR are constant 0 for this family.  The accessor macros are
   removed again afterwards. */
#undef PREFIX
#define PREFIX(ident) big2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#include "xmltok_impl.c"

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
843 
844 #endif /* not XML_MIN_SIZE */
845 
#ifdef XML_NS

/* Big-endian 16-bit encoding object built only with XML_NS.  The last
   ENCODING initializer is 1 only when BYTEORDER == 4321.
   NOTE(review): presumably that field marks data already in the host's
   UTF-16 layout -- confirm against ENCODING in xmltok.h. */
static const struct normal_encoding big2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
#include "asciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif
864 
/* Big-endian 16-bit encoding object.  BT_COLON is defined as BT_NMSTRT
   while asciitab.h is included.  The last ENCODING initializer is 1 only
   when BYTEORDER == 4321. */
static const struct normal_encoding big2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};
881 
882 #if BYTEORDER != 1234
883 
#ifdef XML_NS

/* Internal big-endian 16-bit encoding object built only with XML_NS (and,
   from the enclosing conditional, only when BYTEORDER != 1234).  The
   ASCII half of the byte-type table comes from iasciitab.h and the last
   ENCODING initializer is always 1. */
static const struct normal_encoding internal_big2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include "iasciitab.h"
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};

#endif
896 
/* Internal big-endian 16-bit encoding object (from the enclosing
   conditional, only when BYTEORDER != 1234).  The ASCII half of the
   byte-type table comes from iasciitab.h with BT_COLON defined as
   BT_NMSTRT; the last ENCODING initializer is always 1. */
static const struct normal_encoding internal_big2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
  },
  STANDARD_VTABLE(big2_)
};
907 
908 #endif
909 
910 #undef PREFIX
911 
912 static int FASTCALL
streqci(const char * s1,const char * s2)913 streqci(const char *s1, const char *s2)
914 {
915   for (;;) {
916     char c1 = *s1++;
917     char c2 = *s2++;
918     if (ASCII_a <= c1 && c1 <= ASCII_z)
919       c1 += ASCII_A - ASCII_a;
920     if (ASCII_a <= c2 && c2 <= ASCII_z)
921       c2 += ASCII_A - ASCII_a;
922     if (c1 != c2)
923       return 0;
924     if (!c1)
925       break;
926   }
927   return 1;
928 }
929 
/* Position update that ignores enc and always delegates to
   normal_updatePosition with the UTF-8 encoding object. */
static void PTRCALL
initUpdatePosition(const ENCODING *enc, const char *ptr,
                   const char *end, POSITION *pos)
{
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}
936 
937 static int
toAscii(const ENCODING * enc,const char * ptr,const char * end)938 toAscii(const ENCODING *enc, const char *ptr, const char *end)
939 {
940   char buf[1];
941   char *p = buf;
942   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
943   if (p == buf)
944     return -1;
945   else
946     return buf[0];
947 }
948 
949 static int FASTCALL
isSpace(int c)950 isSpace(int c)
951 {
952   switch (c) {
953   case 0x20:
954   case 0xD:
955   case 0xA:
956   case 0x9:
957     return 1;
958   }
959   return 0;
960 }
961 
962 /* Return 1 if there's just optional white space or there's an S
963    followed by name=val.
964 */
965 static int
parsePseudoAttribute(const ENCODING * enc,const char * ptr,const char * end,const char ** namePtr,const char ** nameEndPtr,const char ** valPtr,const char ** nextTokPtr)966 parsePseudoAttribute(const ENCODING *enc,
967                      const char *ptr,
968                      const char *end,
969                      const char **namePtr,
970                      const char **nameEndPtr,
971                      const char **valPtr,
972                      const char **nextTokPtr)
973 {
974   int c;
975   char open;
976   if (ptr == end) {
977     *namePtr = NULL;
978     return 1;
979   }
980   if (!isSpace(toAscii(enc, ptr, end))) {
981     *nextTokPtr = ptr;
982     return 0;
983   }
984   do {
985     ptr += enc->minBytesPerChar;
986   } while (isSpace(toAscii(enc, ptr, end)));
987   if (ptr == end) {
988     *namePtr = NULL;
989     return 1;
990   }
991   *namePtr = ptr;
992   for (;;) {
993     c = toAscii(enc, ptr, end);
994     if (c == -1) {
995       *nextTokPtr = ptr;
996       return 0;
997     }
998     if (c == ASCII_EQUALS) {
999       *nameEndPtr = ptr;
1000       break;
1001     }
1002     if (isSpace(c)) {
1003       *nameEndPtr = ptr;
1004       do {
1005         ptr += enc->minBytesPerChar;
1006       } while (isSpace(c = toAscii(enc, ptr, end)));
1007       if (c != ASCII_EQUALS) {
1008         *nextTokPtr = ptr;
1009         return 0;
1010       }
1011       break;
1012     }
1013     ptr += enc->minBytesPerChar;
1014   }
1015   if (ptr == *namePtr) {
1016     *nextTokPtr = ptr;
1017     return 0;
1018   }
1019   ptr += enc->minBytesPerChar;
1020   c = toAscii(enc, ptr, end);
1021   while (isSpace(c)) {
1022     ptr += enc->minBytesPerChar;
1023     c = toAscii(enc, ptr, end);
1024   }
1025   if (c != ASCII_QUOT && c != ASCII_APOS) {
1026     *nextTokPtr = ptr;
1027     return 0;
1028   }
1029   open = (char)c;
1030   ptr += enc->minBytesPerChar;
1031   *valPtr = ptr;
1032   for (;; ptr += enc->minBytesPerChar) {
1033     c = toAscii(enc, ptr, end);
1034     if (c == open)
1035       break;
1036     if (!(ASCII_a <= c && c <= ASCII_z)
1037         && !(ASCII_A <= c && c <= ASCII_Z)
1038         && !(ASCII_0 <= c && c <= ASCII_9)
1039         && c != ASCII_PERIOD
1040         && c != ASCII_MINUS
1041         && c != ASCII_UNDERSCORE) {
1042       *nextTokPtr = ptr;
1043       return 0;
1044     }
1045   }
1046   *nextTokPtr = ptr + enc->minBytesPerChar;
1047   return 1;
1048 }
1049 
/* Keyword strings spelled with the ASCII_* constants rather than character
   literals, each NUL-terminated. */

/* "version" */
static const char KW_version[] = {
  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
};

/* "encoding" */
static const char KW_encoding[] = {
  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
};

/* "standalone" */
static const char KW_standalone[] = {
  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
  ASCII_n, ASCII_e, '\0'
};

/* "yes" */
static const char KW_yes[] = {
  ASCII_y, ASCII_e, ASCII_s,  '\0'
};

/* "no" */
static const char KW_no[] = {
  ASCII_n, ASCII_o,  '\0'
};
1070 
/* Parse the pseudo-attributes (version, encoding, standalone) of an
   XML declaration or text declaration held in [ptr, end).

   encodingFinder      - called as encodingFinder(enc, nameStart, nameEnd)
                         to resolve the declared encoding name; only
                         called when `encoding` is non-NULL.
   isGeneralTextEntity - non-zero for a text declaration: the version
                         is then optional, an encoding is required once
                         a version has been seen, and standalone is
                         rejected.
   enc                 - encoding used to read the declaration.
   ptr, end            - bounds of the whole declaration; the first 5
                         and the last 2 characters are skipped
                         (presumably the "<?xml" and "?>" delimiters;
                         confirm against the callers).
   badPtr              - receives the offending position on failure;
                         must not be NULL.
   versionPtr, versionEndPtr, encodingName, encoding, standalone
                       - optional outputs (may be NULL); each is written
                         only when the matching pseudo-attribute is
                         present.

   Returns 1 on success and 0 on failure (with *badPtr set).
*/
static int
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
                                                 const char *,
                                                 const char *),
               int isGeneralTextEntity,
               const ENCODING *enc,
               const char *ptr,
               const char *end,
               const char **badPtr,
               const char **versionPtr,
               const char **versionEndPtr,
               const char **encodingName,
               const ENCODING **encoding,
               int *standalone)
{
  const char *val = NULL;
  const char *name = NULL;
  const char *nameEnd = NULL;
  /* Narrow [ptr, end) to the part between the leading 5 and the
     trailing 2 characters of the declaration. */
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  /* At least one pseudo-attribute is required.  The checks below treat
     a NULL name after a successful call as "no more pseudo-attributes". */
  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
      || !name) {
    *badPtr = ptr;
    return 0;
  }
  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
    /* Only a text declaration may omit the version; in that case the
       attribute just read is examined again as a possible encoding. */
    if (!isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  }
  else {
    if (versionPtr)
      *versionPtr = val;
    /* ptr is now the position just past the closing quote of the value. */
    if (versionEndPtr)
      *versionEndPtr = ptr;
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name) {
      if (isGeneralTextEntity) {
        /* a TextDecl must have an EncodingDecl */
        *badPtr = ptr;
        return 0;
      }
      return 1;
    }
  }
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
    /* The encoding name must start with an ASCII letter. */
    int c = toAscii(enc, val, end);
    if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* ptr is just past the closing quote, so the name ends one
       character before it. */
    if (encoding)
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (!name)
      return 1;
  }
  /* Whatever is left must be standalone, which is not allowed in a
     text declaration. */
  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
      || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
    if (standalone)
      *standalone = 1;
  }
  else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
    if (standalone)
      *standalone = 0;
  }
  else {
    *badPtr = val;
    return 0;
  }
  /* Only whitespace may follow the standalone value. */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}
1162 
1163 static int FASTCALL
checkCharRefNumber(int result)1164 checkCharRefNumber(int result)
1165 {
1166   switch (result >> 8) {
1167   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1168   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1169     return -1;
1170   case 0:
1171     if (latin1_encoding.type[result] == BT_NONXML)
1172       return -1;
1173     break;
1174   case 0xFF:
1175     if (result == 0xFFFE || result == 0xFFFF)
1176       return -1;
1177     break;
1178   }
1179   return result;
1180 }
1181 
1182 int FASTCALL
XmlUtf8Encode(int c,char * buf)1183 XmlUtf8Encode(int c, char *buf)
1184 {
1185   enum {
1186     /* minN is minimum legal resulting value for N byte sequence */
1187     min2 = 0x80,
1188     min3 = 0x800,
1189     min4 = 0x10000
1190   };
1191 
1192   if (c < 0)
1193     return 0;
1194   if (c < min2) {
1195     buf[0] = (char)(c | UTF8_cval1);
1196     return 1;
1197   }
1198   if (c < min3) {
1199     buf[0] = (char)((c >> 6) | UTF8_cval2);
1200     buf[1] = (char)((c & 0x3f) | 0x80);
1201     return 2;
1202   }
1203   if (c < min4) {
1204     buf[0] = (char)((c >> 12) | UTF8_cval3);
1205     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1206     buf[2] = (char)((c & 0x3f) | 0x80);
1207     return 3;
1208   }
1209   if (c < 0x110000) {
1210     buf[0] = (char)((c >> 18) | UTF8_cval4);
1211     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1212     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1213     buf[3] = (char)((c & 0x3f) | 0x80);
1214     return 4;
1215   }
1216   return 0;
1217 }
1218 
1219 int FASTCALL
XmlUtf16Encode(int charNum,unsigned short * buf)1220 XmlUtf16Encode(int charNum, unsigned short *buf)
1221 {
1222   if (charNum < 0)
1223     return 0;
1224   if (charNum < 0x10000) {
1225     buf[0] = (unsigned short)charNum;
1226     return 1;
1227   }
1228   if (charNum < 0x110000) {
1229     charNum -= 0x10000;
1230     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1231     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1232     return 2;
1233   }
1234   return 0;
1235 }
1236 
/* State of a user-supplied ("unknown") encoding, filled in by
   XmlInitUnknownEncoding.
*/
struct unknown_encoding {
  /* Must stay the first member: the ENCODING pointer handed out is
     &normal.enc and is cast back with AS_UNKNOWN_ENCODING. */
  struct normal_encoding normal;
  /* Callback used for bytes that start a multi-byte sequence; invoked
     as convert(userData, pointerToSequence). */
  CONVERTER convert;
  /* Opaque pointer passed through to convert. */
  void *userData;
  /* Per-byte UTF-16 code unit; 0 means "call convert" instead. */
  unsigned short utf16[256];
  /* Per-byte UTF-8 form: element [0] is the byte count and the bytes
     follow; a count of 0 means "call convert" instead. */
  char utf8[256][4];
};

/* View an ENCODING pointer as the unknown_encoding that contains it. */
#define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
1246 
1247 int
XmlSizeOfUnknownEncoding(void)1248 XmlSizeOfUnknownEncoding(void)
1249 {
1250   return sizeof(struct unknown_encoding);
1251 }
1252 
1253 static int PTRFASTCALL
unknown_isName(const ENCODING * enc,const char * p)1254 unknown_isName(const ENCODING *enc, const char *p)
1255 {
1256   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1257   int c = uenc->convert(uenc->userData, p);
1258   if (c & ~0xFFFF)
1259     return 0;
1260   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1261 }
1262 
1263 static int PTRFASTCALL
unknown_isNmstrt(const ENCODING * enc,const char * p)1264 unknown_isNmstrt(const ENCODING *enc, const char *p)
1265 {
1266   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1267   int c = uenc->convert(uenc->userData, p);
1268   if (c & ~0xFFFF)
1269     return 0;
1270   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1271 }
1272 
1273 static int PTRFASTCALL
unknown_isInvalid(const ENCODING * enc,const char * p)1274 unknown_isInvalid(const ENCODING *enc, const char *p)
1275 {
1276   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1277   int c = uenc->convert(uenc->userData, p);
1278   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1279 }
1280 
1281 static void PTRCALL
unknown_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)1282 unknown_toUtf8(const ENCODING *enc,
1283                const char **fromP, const char *fromLim,
1284                char **toP, const char *toLim)
1285 {
1286   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1287   char buf[XML_UTF8_ENCODE_MAX];
1288   for (;;) {
1289     const char *utf8;
1290     int n;
1291     if (*fromP == fromLim)
1292       break;
1293     utf8 = uenc->utf8[(unsigned char)**fromP];
1294     n = *utf8++;
1295     if (n == 0) {
1296       int c = uenc->convert(uenc->userData, *fromP);
1297       n = XmlUtf8Encode(c, buf);
1298       if (n > toLim - *toP)
1299         break;
1300       utf8 = buf;
1301       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1302                  - (BT_LEAD2 - 2));
1303     }
1304     else {
1305       if (n > toLim - *toP)
1306         break;
1307       (*fromP)++;
1308     }
1309     do {
1310       *(*toP)++ = *utf8++;
1311     } while (--n != 0);
1312   }
1313 }
1314 
1315 static void PTRCALL
unknown_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)1316 unknown_toUtf16(const ENCODING *enc,
1317                 const char **fromP, const char *fromLim,
1318                 unsigned short **toP, const unsigned short *toLim)
1319 {
1320   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1321   while (*fromP != fromLim && *toP != toLim) {
1322     unsigned short c = uenc->utf16[(unsigned char)**fromP];
1323     if (c == 0) {
1324       c = (unsigned short)
1325           uenc->convert(uenc->userData, *fromP);
1326       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1327                  - (BT_LEAD2 - 2));
1328     }
1329     else
1330       (*fromP)++;
1331     *(*toP)++ = c;
1332   }
1333 }
1334 
1335 ENCODING *
XmlInitUnknownEncoding(void * mem,int * table,CONVERTER convert,void * userData)1336 XmlInitUnknownEncoding(void *mem,
1337                        int *table,
1338                        CONVERTER convert,
1339                        void *userData)
1340 {
1341   int i;
1342   struct unknown_encoding *e = (struct unknown_encoding *)mem;
1343   for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1344     ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1345   for (i = 0; i < 128; i++)
1346     if (latin1_encoding.type[i] != BT_OTHER
1347         && latin1_encoding.type[i] != BT_NONXML
1348         && table[i] != i)
1349       return 0;
1350   for (i = 0; i < 256; i++) {
1351     int c = table[i];
1352     if (c == -1) {
1353       e->normal.type[i] = BT_MALFORM;
1354       /* This shouldn't really get used. */
1355       e->utf16[i] = 0xFFFF;
1356       e->utf8[i][0] = 1;
1357       e->utf8[i][1] = 0;
1358     }
1359     else if (c < 0) {
1360       if (c < -4)
1361         return 0;
1362       e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1363       e->utf8[i][0] = 0;
1364       e->utf16[i] = 0;
1365     }
1366     else if (c < 0x80) {
1367       if (latin1_encoding.type[c] != BT_OTHER
1368           && latin1_encoding.type[c] != BT_NONXML
1369           && c != i)
1370         return 0;
1371       e->normal.type[i] = latin1_encoding.type[c];
1372       e->utf8[i][0] = 1;
1373       e->utf8[i][1] = (char)c;
1374       e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1375     }
1376     else if (checkCharRefNumber(c) < 0) {
1377       e->normal.type[i] = BT_NONXML;
1378       /* This shouldn't really get used. */
1379       e->utf16[i] = 0xFFFF;
1380       e->utf8[i][0] = 1;
1381       e->utf8[i][1] = 0;
1382     }
1383     else {
1384       if (c > 0xFFFF)
1385         return 0;
1386       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1387         e->normal.type[i] = BT_NMSTRT;
1388       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1389         e->normal.type[i] = BT_NAME;
1390       else
1391         e->normal.type[i] = BT_OTHER;
1392       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1393       e->utf16[i] = (unsigned short)c;
1394     }
1395   }
1396   e->userData = userData;
1397   e->convert = convert;
1398   if (convert) {
1399     e->normal.isName2 = unknown_isName;
1400     e->normal.isName3 = unknown_isName;
1401     e->normal.isName4 = unknown_isName;
1402     e->normal.isNmstrt2 = unknown_isNmstrt;
1403     e->normal.isNmstrt3 = unknown_isNmstrt;
1404     e->normal.isNmstrt4 = unknown_isNmstrt;
1405     e->normal.isInvalid2 = unknown_isInvalid;
1406     e->normal.isInvalid3 = unknown_isInvalid;
1407     e->normal.isInvalid4 = unknown_isInvalid;
1408   }
1409   e->normal.enc.utf8Convert = unknown_toUtf8;
1410   e->normal.enc.utf16Convert = unknown_toUtf16;
1411   return &(e->normal.enc);
1412 }
1413 
/* Indices of the built-in encodings.
   If this enumeration is changed, getEncodingIndex and encodings
must also be changed. */
enum {
  /* returned by getEncodingIndex for a name it does not recognize */
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  /* returned by getEncodingIndex when no name is given */
  NO_ENC
};
1427 
/* Names of the built-in encodings, spelled with the ASCII_* character
   constants and NUL-terminated.  getEncodingIndex compares against
   them with streqci.
*/
static const char KW_ISO_8859_1[] = {
  ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
  ASCII_MINUS, ASCII_1, '\0'
};
static const char KW_US_ASCII[] = {
  ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
  '\0'
};
static const char KW_UTF_8[] =  {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
};
static const char KW_UTF_16[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
};
static const char KW_UTF_16BE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
  '\0'
};
static const char KW_UTF_16LE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
  '\0'
};
1450 
1451 static int FASTCALL
getEncodingIndex(const char * name)1452 getEncodingIndex(const char *name)
1453 {
1454   static const char *encodingNames[] = {
1455     KW_ISO_8859_1,
1456     KW_US_ASCII,
1457     KW_UTF_8,
1458     KW_UTF_16,
1459     KW_UTF_16BE,
1460     KW_UTF_16LE,
1461   };
1462   int i;
1463   if (name == NULL)
1464     return NO_ENC;
1465   for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1466     if (streqci(name, encodingNames[i]))
1467       return i;
1468   return UNKNOWN_ENC;
1469 }
1470 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int.
   SET_INIT_ENC_INDEX stores it; the index argument is parenthesized
   so that the narrowing cast applies to the whole expression passed
   in, not just to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1477 
/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   Looks at the first bytes of [ptr, end) and returns:
     XML_TOK_NONE     when no input is available;
     XML_TOK_PARTIAL  when more bytes are needed to decide;
     XML_TOK_BOM      when a byte order mark was recognized; the
                      detected encoding is stored through enc->encPtr
                      and *nextTokPtr points just past the mark;
     otherwise        the result of XmlTok for the chosen encoding,
                      after storing that encoding through enc->encPtr.
*/


static int
initScan(const ENCODING **encodingTable,
         const INIT_ENCODING *enc,
         int state,
         const char *ptr,
         const char *end,
         const char **nextTokPtr)
{
  const ENCODING **encPtr;

  if (ptr == end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      /* With ISO-8859-1 specified for an external text entity these
         bytes are treated as data, so there is no need to wait. */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      /* fall through */
    case 0x00:
    case 0x3C:
      /* wait for a second byte before deciding */
      return XML_TOK_PARTIAL;
    }
  }
  else {
    /* At least two bytes: dispatch on the first two as a big-endian
       16-bit value. */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      /* UTF-16 big-endian byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      /* 3C 00: 0x3C as a little-endian 16-bit unit */
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      /* UTF-16 little-endian byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
            || e == UTF_16LE_ENC || e == UTF_16_ENC)
          break;
      }
      /* the third byte of the candidate BOM is not available yet */
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* Nothing conclusive was detected: use the externally specified
     encoding. */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
1607 
1608 
/* xmltok_ns.c is used as a template and is included up to twice.
   For this first inclusion NS(x) and ns(x) expand to x unchanged, so
   whatever names the template wraps in NS()/ns() keep their plain
   spelling. */
#define NS(x) x
#define ns(x) x
#include "xmltok_ns.c"
#undef NS
#undef ns

#ifdef XML_NS

/* Second inclusion, only when XML_NS is defined: the same template is
   expanded again with "NS" / "_ns" pasted onto each wrapped name. */
#define NS(x) x ## NS
#define ns(x) x ## _ns

#include "xmltok_ns.c"

#undef NS
#undef ns
1624 
1625 ENCODING *
XmlInitUnknownEncodingNS(void * mem,int * table,CONVERTER convert,void * userData)1626 XmlInitUnknownEncodingNS(void *mem,
1627                          int *table,
1628                          CONVERTER convert,
1629                          void *userData)
1630 {
1631   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1632   if (enc)
1633     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1634   return enc;
1635 }
1636 
1637 #endif /* XML_NS */
1638