1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2    See the file COPYING for copying permission.
3 */
4 
5 #include <stddef.h>
6 
7 #include <cutl/details/expat/config.h>
8 
9 #include <cutl/details/expat/expat_external.h>
10 #include <cutl/details/expat/internal.h>
11 #include <cutl/details/expat/xmltok.h>
12 #include <cutl/details/expat/nametab.h>
13 
14 #ifdef XML_DTD
15 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
16 #else
17 #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
18 #endif
19 
20 #define VTABLE1 \
21   { PREFIX(prologTok), PREFIX(contentTok), \
22     PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
23   { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
24   PREFIX(sameName), \
25   PREFIX(nameMatchesAscii), \
26   PREFIX(nameLength), \
27   PREFIX(skipS), \
28   PREFIX(getAtts), \
29   PREFIX(charRefNumber), \
30   PREFIX(predefinedEntityName), \
31   PREFIX(updatePosition), \
32   PREFIX(isPublicId)
33 
34 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
35 
/* Test the namingBitmap bit for the 16-bit character whose high byte is
   `hi` and low byte is `lo`.  pages[hi] selects a group of eight
   consecutive namingBitmap entries, the top three bits of `lo` select the
   entry within that group, and the bottom five bits of `lo` select the bit.
   The result is non-zero (not necessarily 1) when the bit is set.
   In this file `pages` is always namePages or nmstrtPages.
   Note that `pages` is not parenthesized in the expansion.
*/
36 #define UCS2_GET_NAMING(pages, hi, lo) \
37    (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
38 
39 /* A 2 byte UTF-8 representation splits the characters 11 bits between
40    the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
41    pages, 3 bits to add to that index and 5 bits to generate the mask.
42 */
43 #define UTF8_GET_NAMING2(pages, byte) \
44     (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
45                       + ((((byte)[0]) & 3) << 1) \
46                       + ((((byte)[1]) >> 5) & 1)] \
47          & (1 << (((byte)[1]) & 0x1F)))
48 
49 /* A 3 byte UTF-8 representation splits the characters 16 bits between
50    the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
51    into pages, 3 bits to add to that index and 5 bits to generate the
52    mask.
53 */
54 #define UTF8_GET_NAMING3(pages, byte) \
55   (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
56                              + ((((byte)[1]) >> 2) & 0xF)] \
57                        << 3) \
58                       + ((((byte)[1]) & 3) << 1) \
59                       + ((((byte)[2]) >> 5) & 1)] \
60          & (1 << (((byte)[2]) & 0x1F)))
61 
/* Length-dispatching naming lookup for a UTF-8 sequence of `n` bytes
   starting at `p`: n == 2 uses UTF8_GET_NAMING2, n == 3 uses
   UTF8_GET_NAMING3, and any other length evaluates to 0.  `p` is cast
   to (const unsigned char *) before being handed to the helpers.
*/
62 #define UTF8_GET_NAMING(pages, p, n) \
63   ((n) == 2 \
64   ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
65   : ((n) == 3 \
66      ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
67      : 0))
68 
69 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
70    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
71    with the additional restriction of not allowing the Unicode
72    code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
73    Implementation details:
74      (A & 0x80) == 0     means A < 0x80
75    and
76      (A & 0xC0) == 0xC0  means A > 0xBF
77 */
78 
/* Non-zero when the 2-byte sequence at `p` is rejected: the lead byte is
   below 0xC2, or the second byte lies outside 0x80..0xBF.
   The comparisons assume `p` points to unsigned bytes; the callers in
   this file cast to (const unsigned char *) first.  The first test
   expands as (*p), so `p` must be a simple or cast expression.
*/
79 #define UTF8_INVALID2(p) \
80   ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
81 
/* Non-zero when the 3-byte sequence at `p` is rejected.
     - third byte: must be >= 0x80; for the prefix EF,BF it must also be
       <= 0xBD (this excludes EF,BF,BE and EF,BF,BF), otherwise <= 0xBF.
     - second byte: for lead 0xE0 it must lie in 0xA0..0xBF; for lead
       0xED it must lie in 0x80..0x9F; for any other lead in 0x80..0xBF.
   The lead byte itself is only compared against 0xEF, 0xE0 and 0xED.
   Assumes `p` points to unsigned bytes (callers cast before use).
*/
82 #define UTF8_INVALID3(p) \
83   (((p)[2] & 0x80) == 0 \
84   || \
85   ((*p) == 0xEF && (p)[1] == 0xBF \
86     ? \
87     (p)[2] > 0xBD \
88     : \
89     ((p)[2] & 0xC0) == 0xC0) \
90   || \
91   ((*p) == 0xE0 \
92     ? \
93     (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
94     : \
95     ((p)[1] & 0x80) == 0 \
96     || \
97     ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
98 
/* Non-zero when the 4-byte sequence at `p` is rejected.
     - third and fourth bytes: each must lie in 0x80..0xBF.
     - second byte: for lead 0xF0 it must lie in 0x90..0xBF; for lead
       0xF4 it must lie in 0x80..0x8F; for any other lead in 0x80..0xBF.
   The lead byte itself is only compared against 0xF0 and 0xF4 here.
   Assumes `p` points to unsigned bytes (callers cast before use).
*/
99 #define UTF8_INVALID4(p) \
100   (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
101   || \
102   ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
103   || \
104   ((*p) == 0xF0 \
105     ? \
106     (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
107     : \
108     ((p)[1] & 0x80) == 0 \
109     || \
110     ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
111 
112 static int PTRFASTCALL
isNever(const ENCODING * enc,const char * p)113 isNever(const ENCODING *enc, const char *p)
114 {
115   UNUSED(enc);
116   UNUSED(p);
117 
118   return 0;
119 }
120 
121 static int PTRFASTCALL
utf8_isName2(const ENCODING * enc,const char * p)122 utf8_isName2(const ENCODING *enc, const char *p)
123 {
124   UNUSED(enc);
125 
126   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
127 }
128 
129 static int PTRFASTCALL
utf8_isName3(const ENCODING * enc,const char * p)130 utf8_isName3(const ENCODING *enc, const char *p)
131 {
132   UNUSED(enc);
133 
134   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
135 }
136 
137 #define utf8_isName4 isNever
138 
139 static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING * enc,const char * p)140 utf8_isNmstrt2(const ENCODING *enc, const char *p)
141 {
142   UNUSED(enc);
143 
144   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
145 }
146 
147 static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING * enc,const char * p)148 utf8_isNmstrt3(const ENCODING *enc, const char *p)
149 {
150   UNUSED(enc);
151 
152   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
153 }
154 
155 #define utf8_isNmstrt4 isNever
156 
157 static int PTRFASTCALL
utf8_isInvalid2(const ENCODING * enc,const char * p)158 utf8_isInvalid2(const ENCODING *enc, const char *p)
159 {
160   UNUSED(enc);
161 
162   return UTF8_INVALID2((const unsigned char *)p);
163 }
164 
165 static int PTRFASTCALL
utf8_isInvalid3(const ENCODING * enc,const char * p)166 utf8_isInvalid3(const ENCODING *enc, const char *p)
167 {
168   UNUSED(enc);
169 
170   return UTF8_INVALID3((const unsigned char *)p);
171 }
172 
173 static int PTRFASTCALL
utf8_isInvalid4(const ENCODING * enc,const char * p)174 utf8_isInvalid4(const ENCODING *enc, const char *p)
175 {
176   UNUSED(enc);
177 
178   return UTF8_INVALID4((const unsigned char *)p);
179 }
180 
/* An ENCODING extended with a per-byte classification table and
   character-predicate function pointers.
   `enc` is the first member, and the code in this file converts between
   `const ENCODING *` and `struct normal_encoding *` by plain casts
   (see AS_NORMAL_ENCODING and SB_BYTE_TYPE).
*/
181 struct normal_encoding {
182   ENCODING enc;
    /* Byte type (BT_* value) for each possible byte value; indexed with
       an unsigned char. */
183   unsigned char type[256];
184 #ifdef XML_MIN_SIZE
    /* Only present in XML_MIN_SIZE builds: the BYTE_TYPE, BYTE_TO_ASCII,
       CHAR_MATCHES and IS_*_CHAR_MINBPC macros call through these
       pointers instead of expanding inline. */
185   int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
186   int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
187   int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
188   int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
189   int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
190 #endif /* XML_MIN_SIZE */
    /* Predicates for 2-, 3- and 4-byte characters, reached through the
       IS_NAME_CHAR / IS_NMSTRT_CHAR / IS_INVALID_CHAR macros.  Tables
       built with ZERO_VTABLE leave all nine as null pointers. */
191   int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
192   int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
193   int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
194   int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
195   int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
196   int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
197   int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
198   int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
199   int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
200 };
201 
202 #define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))
203 
204 #ifdef XML_MIN_SIZE
205 
206 #define STANDARD_VTABLE(E) \
207  E ## byteType, \
208  E ## isNameMin, \
209  E ## isNmstrtMin, \
210  E ## byteToAscii, \
211  E ## charMatches,
212 
213 #define ZERO_VTABLE /* as nothing */
214 
215 #else
216 
217 #define STANDARD_VTABLE(E) /* as nothing */
218 
219 #define ZERO_VTABLE \
220  0, \
221  0, \
222  0, \
223  0, \
224  0, \
225  0, \
226  0, \
227  0, \
228  0
229 
230 #endif
231 
232 #define NORMAL_VTABLE(E) \
233  E ## isName2, \
234  E ## isName3, \
235  E ## isName4, \
236  E ## isNmstrt2, \
237  E ## isNmstrt3, \
238  E ## isNmstrt4, \
239  E ## isInvalid2, \
240  E ## isInvalid3, \
241  E ## isInvalid4
242 
243 static int FASTCALL checkCharRefNumber(int);
244 
245 #include <cutl/details/expat/xmltok_impl.h>
246 #include <cutl/details/expat/ascii.h>
247 
248 #ifdef XML_MIN_SIZE
249 #define sb_isNameMin isNever
250 #define sb_isNmstrtMin isNever
251 #endif
252 
253 #ifdef XML_MIN_SIZE
254 #define MINBPC(enc) ((enc)->minBytesPerChar)
255 #else
256 /* minimum bytes per character */
257 #define MINBPC(enc) 1
258 #endif
259 
260 #define SB_BYTE_TYPE(enc, p) \
261   (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
262 
263 #ifdef XML_MIN_SIZE
264 static int PTRFASTCALL
sb_byteType(const ENCODING * enc,const char * p)265 sb_byteType(const ENCODING *enc, const char *p)
266 {
267   return SB_BYTE_TYPE(enc, p);
268 }
269 #define BYTE_TYPE(enc, p) \
270  (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
271 #else
272 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
273 #endif
274 
275 #ifdef XML_MIN_SIZE
276 #define BYTE_TO_ASCII(enc, p) \
277  (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
278 static int PTRFASTCALL
sb_byteToAscii(const ENCODING * enc,const char * p)279 sb_byteToAscii(const ENCODING *enc, const char *p)
280 {
281   return *p;
282 }
283 #else
284 #define BYTE_TO_ASCII(enc, p) (*(p))
285 #endif
286 
287 #define IS_NAME_CHAR(enc, p, n) \
288  (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
289 #define IS_NMSTRT_CHAR(enc, p, n) \
290  (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
291 #define IS_INVALID_CHAR(enc, p, n) \
292  (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
293 
294 #ifdef XML_MIN_SIZE
295 #define IS_NAME_CHAR_MINBPC(enc, p) \
296  (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
297 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
298  (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
299 #else
300 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
301 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
302 #endif
303 
304 #ifdef XML_MIN_SIZE
305 #define CHAR_MATCHES(enc, p, c) \
306  (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
307 static int PTRCALL
sb_charMatches(const ENCODING * enc,const char * p,int c)308 sb_charMatches(const ENCODING *enc, const char *p, int c)
309 {
310   return *p == c;
311 }
312 #else
313 /* c is an ASCII character */
314 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
315 #endif
316 
317 #define PREFIX(ident) normal_ ## ident
318 #define XML_TOK_IMPL_C
319 #include <cutl/details/expat/xmltok_impl.c>
320 #undef XML_TOK_IMPL_C
321 
322 #undef MINBPC
323 #undef BYTE_TYPE
324 #undef BYTE_TO_ASCII
325 #undef CHAR_MATCHES
326 #undef IS_NAME_CHAR
327 #undef IS_NAME_CHAR_MINBPC
328 #undef IS_NMSTRT_CHAR
329 #undef IS_NMSTRT_CHAR_MINBPC
330 #undef IS_INVALID_CHAR
331 
/* Marker bits for the first byte of a UTF-8 sequence, by sequence length.
   The converters in this file OR UTF8_cval2, UTF8_cval3 and UTF8_cval4
   into the payload bits when emitting a lead byte; UTF8_cval1 is not
   referenced in the visible part of this file.
*/
332 enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
333   UTF8_cval1 = 0x00,
334   UTF8_cval2 = 0xc0,
335   UTF8_cval3 = 0xe0,
336   UTF8_cval4 = 0xf0
337 };
338 
339 static void PTRCALL
utf8_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)340 utf8_toUtf8(const ENCODING *enc,
341             const char **fromP, const char *fromLim,
342             char **toP, const char *toLim)
343 {
344   char *to;
345   const char *from;
346 
347   UNUSED(enc);
348 
349   if (fromLim - *fromP > toLim - *toP) {
350     /* Avoid copying partial characters. */
351     for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
352       if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
353         break;
354   }
355   for (to = *toP, from = *fromP; from != fromLim; from++, to++)
356     *to = *from;
357   *fromP = from;
358   *toP = to;
359 }
360 
361 static void PTRCALL
utf8_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)362 utf8_toUtf16(const ENCODING *enc,
363              const char **fromP, const char *fromLim,
364              unsigned short **toP, const unsigned short *toLim)
365 {
366   unsigned short *to = *toP;
367   const char *from = *fromP;
368   while (from != fromLim && to != toLim) {
369     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
370     case BT_LEAD2:
371       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
372       from += 2;
373       break;
374     case BT_LEAD3:
375       *to++ = (unsigned short)(((from[0] & 0xf) << 12)
376                                | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
377       from += 3;
378       break;
379     case BT_LEAD4:
380       {
381         unsigned long n;
382         if (to + 1 == toLim)
383           goto after;
384         n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
385             | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
386         n -= 0x10000;
387         to[0] = (unsigned short)((n >> 10) | 0xD800);
388         to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
389         to += 2;
390         from += 4;
391       }
392       break;
393     default:
394       *to++ = *from++;
395       break;
396     }
397   }
398 after:
399   *fromP = from;
400   *toP = to;
401 }
402 
/* UTF-8 encoding tables.  Both use the normal_ tokenizer functions
   (VTABLE1), the utf8_ converters, a type[] table assembled from
   asciitab.h followed by utf8tab.h, and the utf8_ character predicates.
   The three trailing integers of the ENCODING initializer are its
   remaining fields (presumably minBytesPerChar, isUtf8, isUtf16 --
   confirm against xmltok.h).
   The _ns variant is only built when XML_NS is defined.
*/
403 #ifdef XML_NS
404 static const struct normal_encoding utf8_encoding_ns = {
405   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
406   {
407 #include <cutl/details/expat/asciitab.h>
408 #include <cutl/details/expat/utf8tab.h>
409   },
410   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
411 };
412 #endif
413
414 static const struct normal_encoding utf8_encoding = {
415   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
416   {
/* BT_COLON is redefined to BT_NMSTRT while asciitab.h is expanded, so
   any table entry written as BT_COLON becomes BT_NMSTRT in this
   variant. */
417 #define BT_COLON BT_NMSTRT
418 #include <cutl/details/expat/asciitab.h>
419 #undef BT_COLON
420 #include <cutl/details/expat/utf8tab.h>
421   },
422   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
423 };
424 
/* "Internal" UTF-8 encoding tables.  They are identical to
   utf8_encoding / utf8_encoding_ns except that the first part of the
   type[] table comes from iasciitab.h instead of asciitab.h.
   The _ns variant is only built when XML_NS is defined.
*/
425 #ifdef XML_NS
426
427 static const struct normal_encoding internal_utf8_encoding_ns = {
428   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
429   {
430 #include <cutl/details/expat/iasciitab.h>
431 #include <cutl/details/expat/utf8tab.h>
432   },
433   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
434 };
435
436 #endif
437
438 static const struct normal_encoding internal_utf8_encoding = {
439   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
440   {
/* Entries written as BT_COLON in iasciitab.h become BT_NMSTRT here. */
441 #define BT_COLON BT_NMSTRT
442 #include <cutl/details/expat/iasciitab.h>
443 #undef BT_COLON
444 #include <cutl/details/expat/utf8tab.h>
445   },
446   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
447 };
448 
449 static void PTRCALL
latin1_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)450 latin1_toUtf8(const ENCODING *enc,
451               const char **fromP, const char *fromLim,
452               char **toP, const char *toLim)
453 {
454   UNUSED(enc);
455 
456   for (;;) {
457     unsigned char c;
458     if (*fromP == fromLim)
459       break;
460     c = (unsigned char)**fromP;
461     if (c & 0x80) {
462       if (toLim - *toP < 2)
463         break;
464       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
465       *(*toP)++ = (char)((c & 0x3f) | 0x80);
466       (*fromP)++;
467     }
468     else {
469       if (*toP == toLim)
470         break;
471       *(*toP)++ = *(*fromP)++;
472     }
473   }
474 }
475 
476 static void PTRCALL
latin1_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)477 latin1_toUtf16(const ENCODING *enc,
478                const char **fromP, const char *fromLim,
479                unsigned short **toP, const unsigned short *toLim)
480 {
481   UNUSED(enc);
482 
483   while (*fromP != fromLim && *toP != toLim)
484     *(*toP)++ = (unsigned char)*(*fromP)++;
485 }
486 
/* Latin-1 encoding tables: normal_ tokenizer functions (VTABLE1), the
   latin1_ converters, and a type[] table assembled from asciitab.h
   followed by latin1tab.h.  ZERO_VTABLE supplies no multi-byte
   predicates (null pointers, or nothing at all in XML_MIN_SIZE builds).
   The _ns variant is only built when XML_NS is defined.
*/
487 #ifdef XML_NS
488
489 static const struct normal_encoding latin1_encoding_ns = {
490   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
491   {
492 #include <cutl/details/expat/asciitab.h>
493 #include <cutl/details/expat/latin1tab.h>
494   },
495   STANDARD_VTABLE(sb_) ZERO_VTABLE
496 };
497
498 #endif
499
500 static const struct normal_encoding latin1_encoding = {
501   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
502   {
/* Entries written as BT_COLON in asciitab.h become BT_NMSTRT here. */
503 #define BT_COLON BT_NMSTRT
504 #include <cutl/details/expat/asciitab.h>
505 #undef BT_COLON
506 #include <cutl/details/expat/latin1tab.h>
507   },
508   STANDARD_VTABLE(sb_) ZERO_VTABLE
509 };
510 
511 static void PTRCALL
ascii_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)512 ascii_toUtf8(const ENCODING *enc,
513              const char **fromP, const char *fromLim,
514              char **toP, const char *toLim)
515 {
516   UNUSED(enc);
517 
518   while (*fromP != fromLim && *toP != toLim)
519     *(*toP)++ = *(*fromP)++;
520 }
521 
/* US-ASCII encoding tables: normal_ tokenizer functions (VTABLE1),
   ascii_toUtf8 paired with latin1_toUtf16, and a type[] table that is
   initialized from asciitab.h only.  The remaining type[] entries are
   therefore zero-initialized, which the original comment equates with
   BT_NONXML.
   The _ns variant is only built when XML_NS is defined.
*/
522 #ifdef XML_NS
523
524 static const struct normal_encoding ascii_encoding_ns = {
525   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
526   {
527 #include <cutl/details/expat/asciitab.h>
528 /* BT_NONXML == 0 */
529   },
530   STANDARD_VTABLE(sb_) ZERO_VTABLE
531 };
532
533 #endif
534
535 static const struct normal_encoding ascii_encoding = {
536   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
537   {
/* Entries written as BT_COLON in asciitab.h become BT_NMSTRT here. */
538 #define BT_COLON BT_NMSTRT
539 #include <cutl/details/expat/asciitab.h>
540 #undef BT_COLON
541 /* BT_NONXML == 0 */
542   },
543   STANDARD_VTABLE(sb_) ZERO_VTABLE
544 };
545 
546 static int PTRFASTCALL
unicode_byte_type(char hi,char lo)547 unicode_byte_type(char hi, char lo)
548 {
549   switch ((unsigned char)hi) {
550   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
551     return BT_LEAD4;
552   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
553     return BT_TRAIL;
554   case 0xFF:
555     switch ((unsigned char)lo) {
556     case 0xFF:
557     case 0xFE:
558       return BT_NONXML;
559     }
560     break;
561   }
562   return BT_NONASCII;
563 }
564 
/* Generate the function E##toUtf8, which converts 16-bit code units in
   [*fromP, fromLim) to UTF-8 written into [*toP, toLim).

   The macro relies on GET_LO(ptr) / GET_HI(ptr) being defined at the
   point of instantiation; they select the low and high byte of a unit
   and thereby fix the byte order of the generated function.

   Per code unit, switching on the high byte:
     - hi == 0 and lo < 0x80   -> 1 output byte
     - hi in 0x00..0x07        -> 2 output bytes
     - hi in 0xD8..0xDB        -> combined with the following unit into
                                  4 output bytes
     - anything else           -> 3 output bytes (this includes units
                                  with hi in 0xDC..0xDF)
   Before writing, each branch checks that the whole sequence fits; if
   it does not, *fromP is set to the current unit and the function
   returns without writing part of a character.  On normal completion
   *fromP is set to the end position.  *toP is advanced in place.

   NOTE(review): the loop advances by two bytes and terminates on
   `from != fromLim`, so it relies on the input length being even; the
   0xD8..0xDB branch reads the following unit without comparing against
   fromLim.  Presumably callers only pass complete characters -- confirm.
*/
565 #define DEFINE_UTF16_TO_UTF8(E) \
566 static void  PTRCALL \
567 E ## toUtf8(const ENCODING *enc, \
568             const char **fromP, const char *fromLim, \
569             char **toP, const char *toLim) \
570 { \
571   const char *from; \
572   UNUSED(enc); \
573   for (from = *fromP; from != fromLim; from += 2) { \
574     int plane; \
575     unsigned char lo2; \
576     unsigned char lo = GET_LO(from); \
577     unsigned char hi = GET_HI(from); \
578     switch (hi) { \
579     case 0: \
580       if (lo < 0x80) { \
581         if (*toP == toLim) { \
582           *fromP = from; \
583           return; \
584         } \
585         *(*toP)++ = lo; \
586         break; \
587       } \
588       /* fall through */ \
589     case 0x1: case 0x2: case 0x3: \
590     case 0x4: case 0x5: case 0x6: case 0x7: \
591       if (toLim -  *toP < 2) { \
592         *fromP = from; \
593         return; \
594       } \
595       *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
596       *(*toP)++ = ((lo & 0x3f) | 0x80); \
597       break; \
598     default: \
599       if (toLim -  *toP < 3)  { \
600         *fromP = from; \
601         return; \
602       } \
603       /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
604       *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
605       *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
606       *(*toP)++ = ((lo & 0x3f) | 0x80); \
607       break; \
608     case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
609       if (toLim -  *toP < 4) { \
610         *fromP = from; \
611         return; \
612       } \
613       plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
614       *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
615       *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
616       from += 2; \
617       lo2 = GET_LO(from); \
618       *(*toP)++ = (((lo & 0x3) << 4) \
619                    | ((GET_HI(from) & 0x3) << 2) \
620                    | (lo2 >> 6) \
621                    | 0x80); \
622       *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
623       break; \
624     } \
625   } \
626   *fromP = from; \
627 }
628 
/* Generate the function E##toUtf16, which copies 16-bit code units from
   the byte range [*fromP, fromLim) into the unsigned short range
   [*toP, toLim).  Each unit is assembled as (GET_HI << 8) | GET_LO, so
   the GET_LO / GET_HI definitions in effect at instantiation fix the
   input byte order.

   Before copying, if the input holds more units than the output has
   room for and the high byte of the LAST input unit satisfies
   (hi & 0xF8) == 0xD8, that last unit is dropped from the range.
   Copying then runs until either range is exhausted; *fromP and *toP
   are advanced in place.  enc is not used.
*/
629 #define DEFINE_UTF16_TO_UTF16(E) \
630 static void  PTRCALL \
631 E ## toUtf16(const ENCODING *enc, \
632              const char **fromP, const char *fromLim, \
633              unsigned short **toP, const unsigned short *toLim) \
634 { \
635    UNUSED(enc); \
636   /* Avoid copying first half only of surrogate */ \
637   if (fromLim - *fromP > ((toLim - *toP) << 1) \
638       && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
639     fromLim -= 2; \
640   for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
641     *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
642 }
643 
644 #define SET2(ptr, ch) \
645   (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
646 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
647 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
648 
649 DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)650 DEFINE_UTF16_TO_UTF16(little2_)
651 
652 #undef SET2
653 #undef GET_LO
654 #undef GET_HI
655 
656 #define SET2(ptr, ch) \
657   (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
658 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
659 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
660 
661 DEFINE_UTF16_TO_UTF8(big2_)
662 DEFINE_UTF16_TO_UTF16(big2_)
663 
664 #undef SET2
665 #undef GET_LO
666 #undef GET_HI
667 
/* Helpers for 2-byte little-endian characters: p[0] is the low byte and
   p[1] the high byte.  None of them checks that two bytes are
   available at p.
*/

/* Byte type of the character at p: when the high byte is zero the
   encoding's type[] table is indexed with the low byte, otherwise
   unicode_byte_type(high, low) decides. */
668 #define LITTLE2_BYTE_TYPE(enc, p) \
669  ((p)[1] == 0 \
670   ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
671   : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is zero, otherwise -1. */
672 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* True when the high byte is zero and the low byte equals c
   (c is not parenthesized in the expansion). */
673 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
/* Naming-bitmap lookups for name and name-start characters; p is not
   parenthesized in the expansion. */
674 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
675   UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
676 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
677   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
678 
679 #ifdef XML_MIN_SIZE
680 
681 static int PTRFASTCALL
682 little2_byteType(const ENCODING *enc, const char *p)
683 {
684   return LITTLE2_BYTE_TYPE(enc, p);
685 }
686 
687 static int PTRFASTCALL
little2_byteToAscii(const ENCODING * enc,const char * p)688 little2_byteToAscii(const ENCODING *enc, const char *p)
689 {
690   return LITTLE2_BYTE_TO_ASCII(enc, p);
691 }
692 
693 static int PTRCALL
little2_charMatches(const ENCODING * enc,const char * p,int c)694 little2_charMatches(const ENCODING *enc, const char *p, int c)
695 {
696   return LITTLE2_CHAR_MATCHES(enc, p, c);
697 }
698 
699 static int PTRFASTCALL
little2_isNameMin(const ENCODING * enc,const char * p)700 little2_isNameMin(const ENCODING *enc, const char *p)
701 {
702   return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
703 }
704 
705 static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING * enc,const char * p)706 little2_isNmstrtMin(const ENCODING *enc, const char *p)
707 {
708   return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
709 }
710 
711 #undef VTABLE
712 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
713 
714 #else /* not XML_MIN_SIZE */
715 
716 #undef PREFIX
717 #define PREFIX(ident) little2_ ## ident
718 #define MINBPC(enc) 2
719 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
720 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
721 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
722 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
723 #define IS_NAME_CHAR(enc, p, n) 0
724 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
725 #define IS_NMSTRT_CHAR(enc, p, n) (0)
726 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
727 
728 #define XML_TOK_IMPL_C
729 #include <cutl/details/expat/xmltok_impl.c>
730 #undef XML_TOK_IMPL_C
731 
732 #undef MINBPC
733 #undef BYTE_TYPE
734 #undef BYTE_TO_ASCII
735 #undef CHAR_MATCHES
736 #undef IS_NAME_CHAR
737 #undef IS_NAME_CHAR_MINBPC
738 #undef IS_NMSTRT_CHAR
739 #undef IS_NMSTRT_CHAR_MINBPC
740 #undef IS_INVALID_CHAR
741 
742 #endif /* not XML_MIN_SIZE */
743 
/* Little-endian 2-byte encoding tables.  The ENCODING part is built from
   VTABLE followed by 2, 0 and a final field that is 1 only when
   BYTEORDER == 1234 (presumably minBytesPerChar, isUtf8, isUtf16 --
   confirm against xmltok.h).  type[] is assembled from asciitab.h
   followed by latin1tab.h and is consulted when the high byte of a
   character is zero (see LITTLE2_BYTE_TYPE).
   The _ns variant is only built when XML_NS is defined.
*/
744 #ifdef XML_NS
745
746 static const struct normal_encoding little2_encoding_ns = {
747   { VTABLE, 2, 0,
748 #if BYTEORDER == 1234
749     1
750 #else
751     0
752 #endif
753   },
754   {
755 #include <cutl/details/expat/asciitab.h>
756 #include <cutl/details/expat/latin1tab.h>
757   },
758   STANDARD_VTABLE(little2_) ZERO_VTABLE
759 };
760
761 #endif
762
763 static const struct normal_encoding little2_encoding = {
764   { VTABLE, 2, 0,
765 #if BYTEORDER == 1234
766     1
767 #else
768     0
769 #endif
770   },
771   {
/* Entries written as BT_COLON in asciitab.h become BT_NMSTRT here. */
772 #define BT_COLON BT_NMSTRT
773 #include <cutl/details/expat/asciitab.h>
774 #undef BT_COLON
775 #include <cutl/details/expat/latin1tab.h>
776   },
777   STANDARD_VTABLE(little2_) ZERO_VTABLE
778 };
779 
/* "Internal" little-endian 2-byte encoding tables.  They are compiled
   unless BYTEORDER is 4321, always set the last ENCODING field to 1, and
   take the first part of type[] from iasciitab.h instead of asciitab.h.
   The _ns variant additionally requires XML_NS.
*/
780 #if BYTEORDER != 4321
781
782 #ifdef XML_NS
783
784 static const struct normal_encoding internal_little2_encoding_ns = {
785   { VTABLE, 2, 0, 1 },
786   {
787 #include <cutl/details/expat/iasciitab.h>
788 #include <cutl/details/expat/latin1tab.h>
789   },
790   STANDARD_VTABLE(little2_) ZERO_VTABLE
791 };
792
793 #endif
794
795 static const struct normal_encoding internal_little2_encoding = {
796   { VTABLE, 2, 0, 1 },
797   {
/* Entries written as BT_COLON in iasciitab.h become BT_NMSTRT here. */
798 #define BT_COLON BT_NMSTRT
799 #include <cutl/details/expat/iasciitab.h>
800 #undef BT_COLON
801 #include <cutl/details/expat/latin1tab.h>
802   },
803   STANDARD_VTABLE(little2_) ZERO_VTABLE
804 };
805
806 #endif
807 
808 
/* Helpers for 2-byte big-endian characters: p[0] is the high byte and
   p[1] the low byte.  None of them checks that two bytes are available
   at p.
*/

/* Byte type of the character at p: when the high byte is zero the
   encoding's type[] table is indexed with the low byte, otherwise
   unicode_byte_type(high, low) decides. */
809 #define BIG2_BYTE_TYPE(enc, p) \
810  ((p)[0] == 0 \
811   ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
812   : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is zero, otherwise -1. */
813 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* True when the high byte is zero and the low byte equals c
   (c is not parenthesized in the expansion). */
814 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
/* Naming-bitmap lookups for name and name-start characters; p is not
   parenthesized in the expansion. */
815 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
816   UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
817 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
818   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
819 
820 #ifdef XML_MIN_SIZE
821 
822 static int PTRFASTCALL
big2_byteType(const ENCODING * enc,const char * p)823 big2_byteType(const ENCODING *enc, const char *p)
824 {
825   return BIG2_BYTE_TYPE(enc, p);
826 }
827 
828 static int PTRFASTCALL
big2_byteToAscii(const ENCODING * enc,const char * p)829 big2_byteToAscii(const ENCODING *enc, const char *p)
830 {
831   return BIG2_BYTE_TO_ASCII(enc, p);
832 }
833 
834 static int PTRCALL
big2_charMatches(const ENCODING * enc,const char * p,int c)835 big2_charMatches(const ENCODING *enc, const char *p, int c)
836 {
837   return BIG2_CHAR_MATCHES(enc, p, c);
838 }
839 
840 static int PTRFASTCALL
big2_isNameMin(const ENCODING * enc,const char * p)841 big2_isNameMin(const ENCODING *enc, const char *p)
842 {
843   return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
844 }
845 
846 static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING * enc,const char * p)847 big2_isNmstrtMin(const ENCODING *enc, const char *p)
848 {
849   return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
850 }
851 
852 #undef VTABLE
853 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
854 
855 #else /* not XML_MIN_SIZE */
856 
857 #undef PREFIX
858 #define PREFIX(ident) big2_ ## ident
859 #define MINBPC(enc) 2
860 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
861 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
862 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
863 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
864 #define IS_NAME_CHAR(enc, p, n) 0
865 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
866 #define IS_NMSTRT_CHAR(enc, p, n) (0)
867 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
868 
869 #define XML_TOK_IMPL_C
870 #include <cutl/details/expat/xmltok_impl.c>
871 #undef XML_TOK_IMPL_C
872 
873 #undef MINBPC
874 #undef BYTE_TYPE
875 #undef BYTE_TO_ASCII
876 #undef CHAR_MATCHES
877 #undef IS_NAME_CHAR
878 #undef IS_NAME_CHAR_MINBPC
879 #undef IS_NMSTRT_CHAR
880 #undef IS_NMSTRT_CHAR_MINBPC
881 #undef IS_INVALID_CHAR
882 
883 #endif /* not XML_MIN_SIZE */
884 
/* Big-endian 2-byte encoding tables.  The ENCODING part is built from
   VTABLE followed by 2, 0 and a final field that is 1 only when
   BYTEORDER == 4321 (presumably minBytesPerChar, isUtf8, isUtf16 --
   confirm against xmltok.h).  type[] is assembled from asciitab.h
   followed by latin1tab.h and is consulted when the high byte of a
   character is zero (see BIG2_BYTE_TYPE).
   The _ns variant is only built when XML_NS is defined.
*/
885 #ifdef XML_NS
886
887 static const struct normal_encoding big2_encoding_ns = {
888   { VTABLE, 2, 0,
889 #if BYTEORDER == 4321
890   1
891 #else
892   0
893 #endif
894   },
895   {
896 #include <cutl/details/expat/asciitab.h>
897 #include <cutl/details/expat/latin1tab.h>
898   },
899   STANDARD_VTABLE(big2_) ZERO_VTABLE
900 };
901
902 #endif
903
904 static const struct normal_encoding big2_encoding = {
905   { VTABLE, 2, 0,
906 #if BYTEORDER == 4321
907   1
908 #else
909   0
910 #endif
911   },
912   {
/* Entries written as BT_COLON in asciitab.h become BT_NMSTRT here. */
913 #define BT_COLON BT_NMSTRT
914 #include <cutl/details/expat/asciitab.h>
915 #undef BT_COLON
916 #include <cutl/details/expat/latin1tab.h>
917   },
918   STANDARD_VTABLE(big2_) ZERO_VTABLE
919 };
920 
/* "Internal" big-endian 2-byte encoding tables.  They are compiled
   unless BYTEORDER is 1234, always set the last ENCODING field to 1, and
   take the first part of type[] from iasciitab.h instead of asciitab.h.
   The _ns variant additionally requires XML_NS.
*/
921 #if BYTEORDER != 1234
922
923 #ifdef XML_NS
924
925 static const struct normal_encoding internal_big2_encoding_ns = {
926   { VTABLE, 2, 0, 1 },
927   {
928 #include <cutl/details/expat/iasciitab.h>
929 #include <cutl/details/expat/latin1tab.h>
930   },
931   STANDARD_VTABLE(big2_) ZERO_VTABLE
932 };
933
934 #endif
935
936 static const struct normal_encoding internal_big2_encoding = {
937   { VTABLE, 2, 0, 1 },
938   {
/* Entries written as BT_COLON in iasciitab.h become BT_NMSTRT here. */
939 #define BT_COLON BT_NMSTRT
940 #include <cutl/details/expat/iasciitab.h>
941 #undef BT_COLON
942 #include <cutl/details/expat/latin1tab.h>
943   },
944   STANDARD_VTABLE(big2_) ZERO_VTABLE
945 };
946
947 #endif
948 
949 #undef PREFIX
950 
951 static int FASTCALL
streqci(const char * s1,const char * s2)952 streqci(const char *s1, const char *s2)
953 {
954   for (;;) {
955     char c1 = *s1++;
956     char c2 = *s2++;
957     if (ASCII_a <= c1 && c1 <= ASCII_z)
958       c1 += ASCII_A - ASCII_a;
959     if (ASCII_a <= c2 && c2 <= ASCII_z)
960       c2 += ASCII_A - ASCII_a;
961     if (c1 != c2)
962       return 0;
963     if (!c1)
964       break;
965   }
966   return 1;
967 }
968 
969 static void PTRCALL
initUpdatePosition(const ENCODING * enc,const char * ptr,const char * end,POSITION * pos)970 initUpdatePosition(const ENCODING *enc, const char *ptr,
971                    const char *end, POSITION *pos)
972 {
973   UNUSED(enc);
974   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
975 }
976 
977 static int
toAscii(const ENCODING * enc,const char * ptr,const char * end)978 toAscii(const ENCODING *enc, const char *ptr, const char *end)
979 {
980   char buf[1];
981   char *p = buf;
982   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
983   if (p == buf)
984     return -1;
985   else
986     return buf[0];
987 }
988 
989 static int FASTCALL
isSpace(int c)990 isSpace(int c)
991 {
992   switch (c) {
993   case 0x20:
994   case 0xD:
995   case 0xA:
996   case 0x9:
997     return 1;
998   }
999   return 0;
1000 }
1001 
1002 /* Return 1 if there's just optional white space or there's an S
1003    followed by name=val.
1004 */
1005 static int
parsePseudoAttribute(const ENCODING * enc,const char * ptr,const char * end,const char ** namePtr,const char ** nameEndPtr,const char ** valPtr,const char ** nextTokPtr)1006 parsePseudoAttribute(const ENCODING *enc,
1007                      const char *ptr,
1008                      const char *end,
1009                      const char **namePtr,
1010                      const char **nameEndPtr,
1011                      const char **valPtr,
1012                      const char **nextTokPtr)
1013 {
1014   int c;
1015   char open;
1016   if (ptr == end) {
1017     *namePtr = NULL;
1018     return 1;
1019   }
1020   if (!isSpace(toAscii(enc, ptr, end))) {
1021     *nextTokPtr = ptr;
1022     return 0;
1023   }
1024   do {
1025     ptr += enc->minBytesPerChar;
1026   } while (isSpace(toAscii(enc, ptr, end)));
1027   if (ptr == end) {
1028     *namePtr = NULL;
1029     return 1;
1030   }
1031   *namePtr = ptr;
1032   for (;;) {
1033     c = toAscii(enc, ptr, end);
1034     if (c == -1) {
1035       *nextTokPtr = ptr;
1036       return 0;
1037     }
1038     if (c == ASCII_EQUALS) {
1039       *nameEndPtr = ptr;
1040       break;
1041     }
1042     if (isSpace(c)) {
1043       *nameEndPtr = ptr;
1044       do {
1045         ptr += enc->minBytesPerChar;
1046       } while (isSpace(c = toAscii(enc, ptr, end)));
1047       if (c != ASCII_EQUALS) {
1048         *nextTokPtr = ptr;
1049         return 0;
1050       }
1051       break;
1052     }
1053     ptr += enc->minBytesPerChar;
1054   }
1055   if (ptr == *namePtr) {
1056     *nextTokPtr = ptr;
1057     return 0;
1058   }
1059   ptr += enc->minBytesPerChar;
1060   c = toAscii(enc, ptr, end);
1061   while (isSpace(c)) {
1062     ptr += enc->minBytesPerChar;
1063     c = toAscii(enc, ptr, end);
1064   }
1065   if (c != ASCII_QUOT && c != ASCII_APOS) {
1066     *nextTokPtr = ptr;
1067     return 0;
1068   }
1069   open = (char)c;
1070   ptr += enc->minBytesPerChar;
1071   *valPtr = ptr;
1072   for (;; ptr += enc->minBytesPerChar) {
1073     c = toAscii(enc, ptr, end);
1074     if (c == open)
1075       break;
1076     if (!(ASCII_a <= c && c <= ASCII_z)
1077         && !(ASCII_A <= c && c <= ASCII_Z)
1078         && !(ASCII_0 <= c && c <= ASCII_9)
1079         && c != ASCII_PERIOD
1080         && c != ASCII_MINUS
1081         && c != ASCII_UNDERSCORE) {
1082       *nextTokPtr = ptr;
1083       return 0;
1084     }
1085   }
1086   *nextTokPtr = ptr + enc->minBytesPerChar;
1087   return 1;
1088 }
1089 
/* NUL-terminated keywords, spelled with the ASCII_* constants, that
   doParseXmlDecl compares against via XmlNameMatchesAscii: the three
   pseudo-attribute names followed by the two legal standalone values.
*/
static const char KW_version[] = {
  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
};

static const char KW_encoding[] = {
  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
};

static const char KW_standalone[] = {
  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
  ASCII_n, ASCII_e, '\0'
};

/* Accepted values of the standalone pseudo-attribute. */
static const char KW_yes[] = {
  ASCII_y, ASCII_e, ASCII_s,  '\0'
};

static const char KW_no[] = {
  ASCII_n, ASCII_o,  '\0'
};
1110 
1111 static int
doParseXmlDecl(const ENCODING * (* encodingFinder)(const ENCODING *,const char *,const char *),int isGeneralTextEntity,const ENCODING * enc,const char * ptr,const char * end,const char ** badPtr,const char ** versionPtr,const char ** versionEndPtr,const char ** encodingName,const ENCODING ** encoding,int * standalone)1112 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1113                                                  const char *,
1114                                                  const char *),
1115                int isGeneralTextEntity,
1116                const ENCODING *enc,
1117                const char *ptr,
1118                const char *end,
1119                const char **badPtr,
1120                const char **versionPtr,
1121                const char **versionEndPtr,
1122                const char **encodingName,
1123                const ENCODING **encoding,
1124                int *standalone)
1125 {
1126   const char *val = NULL;
1127   const char *name = NULL;
1128   const char *nameEnd = NULL;
1129   ptr += 5 * enc->minBytesPerChar;
1130   end -= 2 * enc->minBytesPerChar;
1131   if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1132       || !name) {
1133     *badPtr = ptr;
1134     return 0;
1135   }
1136   if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1137     if (!isGeneralTextEntity) {
1138       *badPtr = name;
1139       return 0;
1140     }
1141   }
1142   else {
1143     if (versionPtr)
1144       *versionPtr = val;
1145     if (versionEndPtr)
1146       *versionEndPtr = ptr;
1147     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1148       *badPtr = ptr;
1149       return 0;
1150     }
1151     if (!name) {
1152       if (isGeneralTextEntity) {
1153         /* a TextDecl must have an EncodingDecl */
1154         *badPtr = ptr;
1155         return 0;
1156       }
1157       return 1;
1158     }
1159   }
1160   if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1161     int c = toAscii(enc, val, end);
1162     if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1163       *badPtr = val;
1164       return 0;
1165     }
1166     if (encodingName)
1167       *encodingName = val;
1168     if (encoding)
1169       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1170     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1171       *badPtr = ptr;
1172       return 0;
1173     }
1174     if (!name)
1175       return 1;
1176   }
1177   if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1178       || isGeneralTextEntity) {
1179     *badPtr = name;
1180     return 0;
1181   }
1182   if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1183     if (standalone)
1184       *standalone = 1;
1185   }
1186   else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1187     if (standalone)
1188       *standalone = 0;
1189   }
1190   else {
1191     *badPtr = val;
1192     return 0;
1193   }
1194   while (isSpace(toAscii(enc, ptr, end)))
1195     ptr += enc->minBytesPerChar;
1196   if (ptr != end) {
1197     *badPtr = ptr;
1198     return 0;
1199   }
1200   return 1;
1201 }
1202 
1203 static int FASTCALL
checkCharRefNumber(int result)1204 checkCharRefNumber(int result)
1205 {
1206   switch (result >> 8) {
1207   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1208   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1209     return -1;
1210   case 0:
1211     if (latin1_encoding.type[result] == BT_NONXML)
1212       return -1;
1213     break;
1214   case 0xFF:
1215     if (result == 0xFFFE || result == 0xFFFF)
1216       return -1;
1217     break;
1218   }
1219   return result;
1220 }
1221 
1222 int FASTCALL
XmlUtf8Encode(int c,char * buf)1223 XmlUtf8Encode(int c, char *buf)
1224 {
1225   enum {
1226     /* minN is minimum legal resulting value for N byte sequence */
1227     min2 = 0x80,
1228     min3 = 0x800,
1229     min4 = 0x10000
1230   };
1231 
1232   if (c < 0)
1233     return 0;
1234   if (c < min2) {
1235     buf[0] = (char)(c | UTF8_cval1);
1236     return 1;
1237   }
1238   if (c < min3) {
1239     buf[0] = (char)((c >> 6) | UTF8_cval2);
1240     buf[1] = (char)((c & 0x3f) | 0x80);
1241     return 2;
1242   }
1243   if (c < min4) {
1244     buf[0] = (char)((c >> 12) | UTF8_cval3);
1245     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1246     buf[2] = (char)((c & 0x3f) | 0x80);
1247     return 3;
1248   }
1249   if (c < 0x110000) {
1250     buf[0] = (char)((c >> 18) | UTF8_cval4);
1251     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1252     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1253     buf[3] = (char)((c & 0x3f) | 0x80);
1254     return 4;
1255   }
1256   return 0;
1257 }
1258 
1259 int FASTCALL
XmlUtf16Encode(int charNum,unsigned short * buf)1260 XmlUtf16Encode(int charNum, unsigned short *buf)
1261 {
1262   if (charNum < 0)
1263     return 0;
1264   if (charNum < 0x10000) {
1265     buf[0] = (unsigned short)charNum;
1266     return 1;
1267   }
1268   if (charNum < 0x110000) {
1269     charNum -= 0x10000;
1270     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1271     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1272     return 2;
1273   }
1274   return 0;
1275 }
1276 
/* Encoding object for a user-described encoding, filled in by
   XmlInitUnknownEncoding.
*/
struct unknown_encoding {
  /* Kept first: the same object is handed out as &normal.enc and is
     cast back and forth between ENCODING, struct normal_encoding and
     struct unknown_encoding pointers. */
  struct normal_encoding normal;
  /* Callback that decodes the multi-byte sequence at a given input
     position into a code point; invoked as convert(userData, p). */
  CONVERTER convert;
  /* Opaque pointer passed as the first argument of convert. */
  void *userData;
  /* Per input byte: the UTF-16 unit it maps to.  0 marks a byte that
     starts a multi-byte sequence and must go through convert. */
  unsigned short utf16[256];
  /* Per input byte: [0] is the number of UTF-8 bytes that follow in
     [1..3]; a count of 0 marks a byte that must go through convert. */
  char utf8[256][4];
};

/* View an ENCODING pointer as the unknown_encoding that contains it. */
#define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
1286 
1287 int
XmlSizeOfUnknownEncoding(void)1288 XmlSizeOfUnknownEncoding(void)
1289 {
1290   return sizeof(struct unknown_encoding);
1291 }
1292 
1293 static int PTRFASTCALL
unknown_isName(const ENCODING * enc,const char * p)1294 unknown_isName(const ENCODING *enc, const char *p)
1295 {
1296   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1297   int c = uenc->convert(uenc->userData, p);
1298   if (c & ~0xFFFF)
1299     return 0;
1300   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1301 }
1302 
1303 static int PTRFASTCALL
unknown_isNmstrt(const ENCODING * enc,const char * p)1304 unknown_isNmstrt(const ENCODING *enc, const char *p)
1305 {
1306   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1307   int c = uenc->convert(uenc->userData, p);
1308   if (c & ~0xFFFF)
1309     return 0;
1310   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1311 }
1312 
1313 static int PTRFASTCALL
unknown_isInvalid(const ENCODING * enc,const char * p)1314 unknown_isInvalid(const ENCODING *enc, const char *p)
1315 {
1316   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1317   int c = uenc->convert(uenc->userData, p);
1318   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1319 }
1320 
1321 static void PTRCALL
unknown_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)1322 unknown_toUtf8(const ENCODING *enc,
1323                const char **fromP, const char *fromLim,
1324                char **toP, const char *toLim)
1325 {
1326   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1327   char buf[XML_UTF8_ENCODE_MAX];
1328   for (;;) {
1329     const char *utf8;
1330     int n;
1331     if (*fromP == fromLim)
1332       break;
1333     utf8 = uenc->utf8[(unsigned char)**fromP];
1334     n = *utf8++;
1335     if (n == 0) {
1336       int c = uenc->convert(uenc->userData, *fromP);
1337       n = XmlUtf8Encode(c, buf);
1338       if (n > toLim - *toP)
1339         break;
1340       utf8 = buf;
1341       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1342                  - (BT_LEAD2 - 2));
1343     }
1344     else {
1345       if (n > toLim - *toP)
1346         break;
1347       (*fromP)++;
1348     }
1349     do {
1350       *(*toP)++ = *utf8++;
1351     } while (--n != 0);
1352   }
1353 }
1354 
1355 static void PTRCALL
unknown_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)1356 unknown_toUtf16(const ENCODING *enc,
1357                 const char **fromP, const char *fromLim,
1358                 unsigned short **toP, const unsigned short *toLim)
1359 {
1360   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1361   while (*fromP != fromLim && *toP != toLim) {
1362     unsigned short c = uenc->utf16[(unsigned char)**fromP];
1363     if (c == 0) {
1364       c = (unsigned short)
1365           uenc->convert(uenc->userData, *fromP);
1366       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1367                  - (BT_LEAD2 - 2));
1368     }
1369     else
1370       (*fromP)++;
1371     *(*toP)++ = c;
1372   }
1373 }
1374 
1375 ENCODING *
XmlInitUnknownEncoding(void * mem,int * table,CONVERTER convert,void * userData)1376 XmlInitUnknownEncoding(void *mem,
1377                        int *table,
1378                        CONVERTER convert,
1379                        void *userData)
1380 {
1381   int i;
1382   struct unknown_encoding *e = (struct unknown_encoding *)mem;
1383   for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1384     ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1385   for (i = 0; i < 128; i++)
1386     if (latin1_encoding.type[i] != BT_OTHER
1387         && latin1_encoding.type[i] != BT_NONXML
1388         && table[i] != i)
1389       return 0;
1390   for (i = 0; i < 256; i++) {
1391     int c = table[i];
1392     if (c == -1) {
1393       e->normal.type[i] = BT_MALFORM;
1394       /* This shouldn't really get used. */
1395       e->utf16[i] = 0xFFFF;
1396       e->utf8[i][0] = 1;
1397       e->utf8[i][1] = 0;
1398     }
1399     else if (c < 0) {
1400       if (c < -4)
1401         return 0;
1402       e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1403       e->utf8[i][0] = 0;
1404       e->utf16[i] = 0;
1405     }
1406     else if (c < 0x80) {
1407       if (latin1_encoding.type[c] != BT_OTHER
1408           && latin1_encoding.type[c] != BT_NONXML
1409           && c != i)
1410         return 0;
1411       e->normal.type[i] = latin1_encoding.type[c];
1412       e->utf8[i][0] = 1;
1413       e->utf8[i][1] = (char)c;
1414       e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1415     }
1416     else if (checkCharRefNumber(c) < 0) {
1417       e->normal.type[i] = BT_NONXML;
1418       /* This shouldn't really get used. */
1419       e->utf16[i] = 0xFFFF;
1420       e->utf8[i][0] = 1;
1421       e->utf8[i][1] = 0;
1422     }
1423     else {
1424       if (c > 0xFFFF)
1425         return 0;
1426       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1427         e->normal.type[i] = BT_NMSTRT;
1428       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1429         e->normal.type[i] = BT_NAME;
1430       else
1431         e->normal.type[i] = BT_OTHER;
1432       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1433       e->utf16[i] = (unsigned short)c;
1434     }
1435   }
1436   e->userData = userData;
1437   e->convert = convert;
1438   if (convert) {
1439     e->normal.isName2 = unknown_isName;
1440     e->normal.isName3 = unknown_isName;
1441     e->normal.isName4 = unknown_isName;
1442     e->normal.isNmstrt2 = unknown_isNmstrt;
1443     e->normal.isNmstrt3 = unknown_isNmstrt;
1444     e->normal.isNmstrt4 = unknown_isNmstrt;
1445     e->normal.isInvalid2 = unknown_isInvalid;
1446     e->normal.isInvalid3 = unknown_isInvalid;
1447     e->normal.isInvalid4 = unknown_isInvalid;
1448   }
1449   e->normal.enc.utf8Convert = unknown_toUtf8;
1450   e->normal.enc.utf16Convert = unknown_toUtf16;
1451   return &(e->normal.enc);
1452 }
1453 
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed.

   getEncodingIndex returns UNKNOWN_ENC for a name it does not know and
   NO_ENC when no name was given; the values in between are positions
   in its encodingNames array.
*/
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC
};
1467 
/* NUL-terminated names of the built-in encodings, spelled with the
   ASCII_* constants.  getEncodingIndex compares a requested name
   against these.
*/
static const char KW_ISO_8859_1[] = {
  ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
  ASCII_MINUS, ASCII_1, '\0'
};
static const char KW_US_ASCII[] = {
  ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
  '\0'
};
static const char KW_UTF_8[] =  {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
};
static const char KW_UTF_16[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
};
static const char KW_UTF_16BE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
  '\0'
};
static const char KW_UTF_16LE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
  '\0'
};
1490 
1491 static int FASTCALL
getEncodingIndex(const char * name)1492 getEncodingIndex(const char *name)
1493 {
1494   static const char * const encodingNames[] = {
1495     KW_ISO_8859_1,
1496     KW_US_ASCII,
1497     KW_UTF_8,
1498     KW_UTF_16,
1499     KW_UTF_16BE,
1500     KW_UTF_16LE,
1501   };
1502   int i;
1503   if (name == NULL)
1504     return NO_ENC;
1505   for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1506     if (streqci(name, encodingNames[i]))
1507       return i;
1508   return UNKNOWN_ENC;
1509 }
1510 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i, narrowed to char.  The argument
   is parenthesised so that the cast applies to the whole expression
   passed in, not just to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1517 
/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   The first one or two bytes at ptr (three for a UTF-8 byte order
   mark) are inspected to choose an encoding, which is stored through
   enc->encPtr.

   Returns XML_TOK_NONE for empty input, XML_TOK_PARTIAL when more
   bytes are needed to decide, XML_TOK_BOM (with *nextTokPtr just past
   the mark) when a byte order mark was recognised, and otherwise the
   result of XmlTok() run with the chosen encoding.
*/


static int
initScan(const ENCODING * const *encodingTable,
         const INIT_ENCODING *enc,
         int state,
         const char *ptr,
         const char *end,
         const char **nextTokPtr)
{
  const ENCODING **encPtr;

  if (ptr == end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      /* with an explicit ISO-8859-1 label these bytes are treated as
         data, so tokenize with the specified encoding */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      /* fall through */
    case 0x00:
    case 0x3C:
      return XML_TOK_PARTIAL;
    }
  }
  else {
    /* dispatch on the first two bytes combined as a big-endian value */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      /* UTF-16 big-endian byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      /* '<' followed by a zero byte: little-endian UTF-16 */
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      /* UTF-16 little-endian byte order mark */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
          && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
            || e == UTF_16LE_ENC || e == UTF_16_ENC)
          break;
      }
      /* the third byte of the mark has not arrived yet */
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* nothing was detected: use the externally specified encoding */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
1647 
1648 
/* First inclusion of xmltok_ns.c: NS() and ns() expand to their
   argument unchanged, so identifiers wrapped in them keep their plain
   names.  XML_TOK_NS_C is defined only for the duration of the
   include. */
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include <cutl/details/expat/xmltok_ns.c>
#undef XML_TOK_NS_C
#undef NS
#undef ns

#ifdef XML_NS

/* Second inclusion, only when XML_NS is defined: NS() appends "NS"
   and ns() appends "_ns" to its argument, giving this copy of
   xmltok_ns.c a distinct set of names. */
#define NS(x) x ## NS
#define ns(x) x ## _ns

#define XML_TOK_NS_C
#include <cutl/details/expat/xmltok_ns.c>
#undef XML_TOK_NS_C

#undef NS
#undef ns
1668 
1669 ENCODING *
XmlInitUnknownEncodingNS(void * mem,int * table,CONVERTER convert,void * userData)1670 XmlInitUnknownEncodingNS(void *mem,
1671                          int *table,
1672                          CONVERTER convert,
1673                          void *userData)
1674 {
1675   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1676   if (enc)
1677     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1678   return enc;
1679 }
1680 
1681 #endif /* XML_NS */
1682