1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2 See the file COPYING for copying permission.
3 */
4
5 #include <stddef.h>
6
7 #include <cutl/details/expat/config.h>
8
9 #include <cutl/details/expat/expat_external.h>
10 #include <cutl/details/expat/internal.h>
11 #include <cutl/details/expat/xmltok.h>
12 #include <cutl/details/expat/nametab.h>
13
14 #ifdef XML_DTD
15 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
16 #else
17 #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
18 #endif
19
20 #define VTABLE1 \
21 { PREFIX(prologTok), PREFIX(contentTok), \
22 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
23 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
24 PREFIX(sameName), \
25 PREFIX(nameMatchesAscii), \
26 PREFIX(nameLength), \
27 PREFIX(skipS), \
28 PREFIX(getAtts), \
29 PREFIX(charRefNumber), \
30 PREFIX(predefinedEntityName), \
31 PREFIX(updatePosition), \
32 PREFIX(isPublicId)
33
34 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
35
/* Test the naming bitmap for the UCS-2 character with high byte hi and
   low byte lo.  pages[hi] selects a group of eight bitmap words, the top
   3 bits of lo pick the word inside that group and the low 5 bits of lo
   pick the bit.  The mask is built from an unsigned 1 so that selecting
   bit 31 never shifts into the sign bit of an int (undefined behaviour).
   Yields non-zero when the bit is set.
*/
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
38
/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes. We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
   The mask is built from an unsigned 1 so that selecting bit 31 never
   shifts into the sign bit of an int (undefined behaviour).
*/
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))
48
/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes. We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.  The mask is built from an unsigned 1 so that selecting bit 31
   never shifts into the sign bit of an int (undefined behaviour).
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))
61
/* Dispatch on n, the byte length of the UTF-8 sequence at p: sequences of
   3 and 2 bytes are looked up through UTF8_GET_NAMING3 and
   UTF8_GET_NAMING2 respectively; every other length yields 0.
*/
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 3 \
   ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
   : ((n) == 2 \
      ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
      : 0))
68
69 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
70 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
71 with the additional restriction of not allowing the Unicode
72 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
73 Implementation details:
74 (A & 0x80) == 0 means A < 0x80
75 and
76 (A & 0xC0) == 0xC0 means A > 0xBF
77 */
78
/* Non-zero when the 2 byte sequence at p is not well-formed: the lead
   byte is below 0xC2, or the second byte is outside 0x80..0xBF.
   The lead byte is read as (p)[0] so that any pointer expression may be
   passed as the argument.
*/
#define UTF8_INVALID2(p) \
  ((p)[0] < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
81
/* Non-zero when the 3 byte sequence at p is not well-formed.  Checks, in
   order: the third byte (with EF,BF,BE and EF,BF,BF rejected), then the
   second byte, whose allowed range is narrowed after a lead byte of 0xE0
   (must be >= 0xA0) and 0xED (must be <= 0x9F).
   The lead byte is read as (p)[0] so that any pointer expression may be
   passed as the argument.
*/
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
  || \
  ((p)[0] == 0xEF && (p)[1] == 0xBF \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
  || \
  ((p)[0] == 0xE0 \
    ? \
    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((p)[0] == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
98
/* Non-zero when the 4 byte sequence at p is not well-formed.  The fourth
   and third bytes must lie in 0x80..0xBF; the second byte's allowed range
   is narrowed after a lead byte of 0xF0 (must be >= 0x90) and 0xF4 (must
   be <= 0x8F).
   The lead byte is read as (p)[0] so that any pointer expression may be
   passed as the argument.
*/
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  || \
  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  || \
  ((p)[0] == 0xF0 \
    ? \
    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((p)[0] == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
111
112 static int PTRFASTCALL
isNever(const ENCODING * enc,const char * p)113 isNever(const ENCODING *enc, const char *p)
114 {
115 UNUSED(enc);
116 UNUSED(p);
117
118 return 0;
119 }
120
121 static int PTRFASTCALL
utf8_isName2(const ENCODING * enc,const char * p)122 utf8_isName2(const ENCODING *enc, const char *p)
123 {
124 UNUSED(enc);
125
126 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
127 }
128
129 static int PTRFASTCALL
utf8_isName3(const ENCODING * enc,const char * p)130 utf8_isName3(const ENCODING *enc, const char *p)
131 {
132 UNUSED(enc);
133
134 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
135 }
136
137 #define utf8_isName4 isNever
138
139 static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING * enc,const char * p)140 utf8_isNmstrt2(const ENCODING *enc, const char *p)
141 {
142 UNUSED(enc);
143
144 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
145 }
146
147 static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING * enc,const char * p)148 utf8_isNmstrt3(const ENCODING *enc, const char *p)
149 {
150 UNUSED(enc);
151
152 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
153 }
154
155 #define utf8_isNmstrt4 isNever
156
157 static int PTRFASTCALL
utf8_isInvalid2(const ENCODING * enc,const char * p)158 utf8_isInvalid2(const ENCODING *enc, const char *p)
159 {
160 UNUSED(enc);
161
162 return UTF8_INVALID2((const unsigned char *)p);
163 }
164
165 static int PTRFASTCALL
utf8_isInvalid3(const ENCODING * enc,const char * p)166 utf8_isInvalid3(const ENCODING *enc, const char *p)
167 {
168 UNUSED(enc);
169
170 return UTF8_INVALID3((const unsigned char *)p);
171 }
172
173 static int PTRFASTCALL
utf8_isInvalid4(const ENCODING * enc,const char * p)174 utf8_isInvalid4(const ENCODING *enc, const char *p)
175 {
176 UNUSED(enc);
177
178 return UTF8_INVALID4((const unsigned char *)p);
179 }
180
181 struct normal_encoding {
182 ENCODING enc;
183 unsigned char type[256];
184 #ifdef XML_MIN_SIZE
185 int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
186 int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
187 int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
188 int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
189 int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
190 #endif /* XML_MIN_SIZE */
191 int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
192 int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
193 int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
194 int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
195 int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
196 int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
197 int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
198 int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
199 int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
200 };
201
202 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *) (enc))
203
204 #ifdef XML_MIN_SIZE
205
206 #define STANDARD_VTABLE(E) \
207 E ## byteType, \
208 E ## isNameMin, \
209 E ## isNmstrtMin, \
210 E ## byteToAscii, \
211 E ## charMatches,
212
213 #define ZERO_VTABLE /* as nothing */
214
215 #else
216
217 #define STANDARD_VTABLE(E) /* as nothing */
218
219 #define ZERO_VTABLE \
220 0, \
221 0, \
222 0, \
223 0, \
224 0, \
225 0, \
226 0, \
227 0, \
228 0
229
230 #endif
231
232 #define NORMAL_VTABLE(E) \
233 E ## isName2, \
234 E ## isName3, \
235 E ## isName4, \
236 E ## isNmstrt2, \
237 E ## isNmstrt3, \
238 E ## isNmstrt4, \
239 E ## isInvalid2, \
240 E ## isInvalid3, \
241 E ## isInvalid4
242
243 static int FASTCALL checkCharRefNumber(int);
244
245 #include <cutl/details/expat/xmltok_impl.h>
246 #include <cutl/details/expat/ascii.h>
247
248 #ifdef XML_MIN_SIZE
249 #define sb_isNameMin isNever
250 #define sb_isNmstrtMin isNever
251 #endif
252
253 #ifdef XML_MIN_SIZE
254 #define MINBPC(enc) ((enc)->minBytesPerChar)
255 #else
256 /* minimum bytes per character */
257 #define MINBPC(enc) 1
258 #endif
259
260 #define SB_BYTE_TYPE(enc, p) \
261 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
262
263 #ifdef XML_MIN_SIZE
264 static int PTRFASTCALL
sb_byteType(const ENCODING * enc,const char * p)265 sb_byteType(const ENCODING *enc, const char *p)
266 {
267 return SB_BYTE_TYPE(enc, p);
268 }
269 #define BYTE_TYPE(enc, p) \
270 (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
271 #else
272 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
273 #endif
274
275 #ifdef XML_MIN_SIZE
276 #define BYTE_TO_ASCII(enc, p) \
277 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
278 static int PTRFASTCALL
sb_byteToAscii(const ENCODING * enc,const char * p)279 sb_byteToAscii(const ENCODING *enc, const char *p)
280 {
281 return *p;
282 }
283 #else
284 #define BYTE_TO_ASCII(enc, p) (*(p))
285 #endif
286
287 #define IS_NAME_CHAR(enc, p, n) \
288 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
289 #define IS_NMSTRT_CHAR(enc, p, n) \
290 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
291 #define IS_INVALID_CHAR(enc, p, n) \
292 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
293
294 #ifdef XML_MIN_SIZE
295 #define IS_NAME_CHAR_MINBPC(enc, p) \
296 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
297 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
298 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
299 #else
300 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
301 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
302 #endif
303
304 #ifdef XML_MIN_SIZE
305 #define CHAR_MATCHES(enc, p, c) \
306 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
307 static int PTRCALL
sb_charMatches(const ENCODING * enc,const char * p,int c)308 sb_charMatches(const ENCODING *enc, const char *p, int c)
309 {
310 return *p == c;
311 }
312 #else
313 /* c is an ASCII character */
314 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
315 #endif
316
317 #define PREFIX(ident) normal_ ## ident
318 #define XML_TOK_IMPL_C
319 #include <cutl/details/expat/xmltok_impl.c>
320 #undef XML_TOK_IMPL_C
321
322 #undef MINBPC
323 #undef BYTE_TYPE
324 #undef BYTE_TO_ASCII
325 #undef CHAR_MATCHES
326 #undef IS_NAME_CHAR
327 #undef IS_NAME_CHAR_MINBPC
328 #undef IS_NMSTRT_CHAR
329 #undef IS_NMSTRT_CHAR_MINBPC
330 #undef IS_INVALID_CHAR
331
332 enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
333 UTF8_cval1 = 0x00,
334 UTF8_cval2 = 0xc0,
335 UTF8_cval3 = 0xe0,
336 UTF8_cval4 = 0xf0
337 };
338
339 static void PTRCALL
utf8_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)340 utf8_toUtf8(const ENCODING *enc,
341 const char **fromP, const char *fromLim,
342 char **toP, const char *toLim)
343 {
344 char *to;
345 const char *from;
346
347 UNUSED(enc);
348
349 if (fromLim - *fromP > toLim - *toP) {
350 /* Avoid copying partial characters. */
351 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
352 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
353 break;
354 }
355 for (to = *toP, from = *fromP; from != fromLim; from++, to++)
356 *to = *from;
357 *fromP = from;
358 *toP = to;
359 }
360
361 static void PTRCALL
utf8_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)362 utf8_toUtf16(const ENCODING *enc,
363 const char **fromP, const char *fromLim,
364 unsigned short **toP, const unsigned short *toLim)
365 {
366 unsigned short *to = *toP;
367 const char *from = *fromP;
368 while (from != fromLim && to != toLim) {
369 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
370 case BT_LEAD2:
371 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
372 from += 2;
373 break;
374 case BT_LEAD3:
375 *to++ = (unsigned short)(((from[0] & 0xf) << 12)
376 | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
377 from += 3;
378 break;
379 case BT_LEAD4:
380 {
381 unsigned long n;
382 if (to + 1 == toLim)
383 goto after;
384 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
385 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
386 n -= 0x10000;
387 to[0] = (unsigned short)((n >> 10) | 0xD800);
388 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
389 to += 2;
390 from += 4;
391 }
392 break;
393 default:
394 *to++ = *from++;
395 break;
396 }
397 }
398 after:
399 *fromP = from;
400 *toP = to;
401 }
402
403 #ifdef XML_NS
/* UTF-8 encoding descriptor, XML_NS variant.  Members in order: the
   embedded ENCODING (function table, the UTF-8 converters and three
   integers -- presumably minBytesPerChar, isUtf8, isUtf16; confirm
   against xmltok.h), the byte type table assembled from asciitab.h and
   utf8tab.h, then the classifier callbacks.
*/
static const struct normal_encoding utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include <cutl/details/expat/asciitab.h>
#include <cutl/details/expat/utf8tab.h>
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
412 #endif
413
/* UTF-8 encoding descriptor.  BT_COLON is defined as BT_NMSTRT while
   asciitab.h is included, so every table entry spelled BT_COLON there
   becomes BT_NMSTRT in this table.
*/
static const struct normal_encoding utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include <cutl/details/expat/asciitab.h>
#undef BT_COLON
#include <cutl/details/expat/utf8tab.h>
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
424
425 #ifdef XML_NS
426
/* Internal UTF-8 encoding descriptor, XML_NS variant.  Same layout as
   utf8_encoding_ns but the ASCII part of the byte type table comes from
   iasciitab.h instead of asciitab.h.
*/
static const struct normal_encoding internal_utf8_encoding_ns = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#include <cutl/details/expat/iasciitab.h>
#include <cutl/details/expat/utf8tab.h>
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
435
436 #endif
437
/* Internal UTF-8 encoding descriptor.  The ASCII part of the byte type
   table comes from iasciitab.h, included with BT_COLON defined as
   BT_NMSTRT.
*/
static const struct normal_encoding internal_utf8_encoding = {
  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include <cutl/details/expat/iasciitab.h>
#undef BT_COLON
#include <cutl/details/expat/utf8tab.h>
  },
  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
};
448
449 static void PTRCALL
latin1_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)450 latin1_toUtf8(const ENCODING *enc,
451 const char **fromP, const char *fromLim,
452 char **toP, const char *toLim)
453 {
454 UNUSED(enc);
455
456 for (;;) {
457 unsigned char c;
458 if (*fromP == fromLim)
459 break;
460 c = (unsigned char)**fromP;
461 if (c & 0x80) {
462 if (toLim - *toP < 2)
463 break;
464 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
465 *(*toP)++ = (char)((c & 0x3f) | 0x80);
466 (*fromP)++;
467 }
468 else {
469 if (*toP == toLim)
470 break;
471 *(*toP)++ = *(*fromP)++;
472 }
473 }
474 }
475
476 static void PTRCALL
latin1_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)477 latin1_toUtf16(const ENCODING *enc,
478 const char **fromP, const char *fromLim,
479 unsigned short **toP, const unsigned short *toLim)
480 {
481 UNUSED(enc);
482
483 while (*fromP != fromLim && *toP != toLim)
484 *(*toP)++ = (unsigned char)*(*fromP)++;
485 }
486
487 #ifdef XML_NS
488
/* Latin-1 encoding descriptor, XML_NS variant.  The byte type table is
   assembled from asciitab.h and latin1tab.h; the multi-byte classifier
   slots are filled by ZERO_VTABLE.
*/
static const struct normal_encoding latin1_encoding_ns = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#include <cutl/details/expat/asciitab.h>
#include <cutl/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(sb_) ZERO_VTABLE
};
497
498 #endif
499
/* Latin-1 encoding descriptor.  asciitab.h is included with BT_COLON
   defined as BT_NMSTRT; the multi-byte classifier slots are filled by
   ZERO_VTABLE.
*/
static const struct normal_encoding latin1_encoding = {
  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
#define BT_COLON BT_NMSTRT
#include <cutl/details/expat/asciitab.h>
#undef BT_COLON
#include <cutl/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(sb_) ZERO_VTABLE
};
510
511 static void PTRCALL
ascii_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)512 ascii_toUtf8(const ENCODING *enc,
513 const char **fromP, const char *fromLim,
514 char **toP, const char *toLim)
515 {
516 UNUSED(enc);
517
518 while (*fromP != fromLim && *toP != toLim)
519 *(*toP)++ = *(*fromP)++;
520 }
521
522 #ifdef XML_NS
523
/* US-ASCII encoding descriptor, XML_NS variant.  Only asciitab.h is
   included in the byte type table; the remaining entries are left to
   zero initialization, which the inline note identifies as BT_NONXML.
   UTF-16 conversion reuses latin1_toUtf16.
*/
static const struct normal_encoding ascii_encoding_ns = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#include <cutl/details/expat/asciitab.h>
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_) ZERO_VTABLE
};
532
533 #endif
534
/* US-ASCII encoding descriptor.  asciitab.h is included with BT_COLON
   defined as BT_NMSTRT; the remaining table entries are left to zero
   initialization, which the inline note identifies as BT_NONXML.
   UTF-16 conversion reuses latin1_toUtf16.
*/
static const struct normal_encoding ascii_encoding = {
  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
  {
#define BT_COLON BT_NMSTRT
#include <cutl/details/expat/asciitab.h>
#undef BT_COLON
/* BT_NONXML == 0 */
  },
  STANDARD_VTABLE(sb_) ZERO_VTABLE
};
545
546 static int PTRFASTCALL
unicode_byte_type(char hi,char lo)547 unicode_byte_type(char hi, char lo)
548 {
549 switch ((unsigned char)hi) {
550 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
551 return BT_LEAD4;
552 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
553 return BT_TRAIL;
554 case 0xFF:
555 switch ((unsigned char)lo) {
556 case 0xFF:
557 case 0xFE:
558 return BT_NONXML;
559 }
560 break;
561 }
562 return BT_NONASCII;
563 }
564
/* Defines E##toUtf8, which converts 16-bit units from [*fromP, fromLim)
   to UTF-8 in [*toP, toLim).  GET_LO/GET_HI, as defined at the point of
   expansion, select the byte order.

   The generated function returns with *fromP at the first unit it could
   not convert when the output has no room for the next character or when
   a high surrogate is not followed by a second unit inside the input.
   A trailing odd byte is ignored, so the loop never steps past fromLim.
*/
#define DEFINE_UTF16_TO_UTF8(E) \
static void PTRCALL \
E ## toUtf8(const ENCODING *enc, \
            const char **fromP, const char *fromLim, \
            char **toP, const char *toLim) \
{ \
  const char *from = *fromP; \
  UNUSED(enc); \
  /* shrink to a whole number of 16-bit units */ \
  fromLim = from + (((fromLim - from) >> 1) << 1); \
  for (; from < fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim -  *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      /* the low surrogate must lie inside the input */ \
      if (fromLim - from < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
628
/* Defines E##toUtf16, which copies 16-bit units from [*fromP, fromLim)
   into [*toP, toLim), assembling each unit from GET_HI/GET_LO as defined
   at the point of expansion.  A trailing odd byte is ignored, so the
   loop never steps past fromLim.
*/
#define DEFINE_UTF16_TO_UTF16(E) \
static void PTRCALL \
E ## toUtf16(const ENCODING *enc, \
             const char **fromP, const char *fromLim, \
             unsigned short **toP, const unsigned short *toLim) \
{ \
  UNUSED(enc); \
  /* shrink to a whole number of 16-bit units */ \
  fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP < fromLim && *toP < toLim; *fromP += 2) \
    *(*toP)++ = (unsigned short)((GET_HI(*fromP) << 8) | GET_LO(*fromP)); \
}
643
644 #define SET2(ptr, ch) \
645 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
646 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
647 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
648
649 DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)650 DEFINE_UTF16_TO_UTF16(little2_)
651
652 #undef SET2
653 #undef GET_LO
654 #undef GET_HI
655
656 #define SET2(ptr, ch) \
657 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
658 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
659 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
660
661 DEFINE_UTF16_TO_UTF8(big2_)
662 DEFINE_UTF16_TO_UTF16(big2_)
663
664 #undef SET2
665 #undef GET_LO
666 #undef GET_HI
667
/* Primitives for little-endian 2 byte units: (p)[0] is the low byte and
   (p)[1] the high byte.  When the high byte is 0 the unit is classified
   through the encoding's byte type table, otherwise through
   unicode_byte_type.  All macro arguments are parenthesized so that
   arbitrary expressions may be passed.
*/
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is 0, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* Non-zero when the unit at p is exactly the ASCII character c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
678
679 #ifdef XML_MIN_SIZE
680
681 static int PTRFASTCALL
682 little2_byteType(const ENCODING *enc, const char *p)
683 {
684 return LITTLE2_BYTE_TYPE(enc, p);
685 }
686
687 static int PTRFASTCALL
little2_byteToAscii(const ENCODING * enc,const char * p)688 little2_byteToAscii(const ENCODING *enc, const char *p)
689 {
690 return LITTLE2_BYTE_TO_ASCII(enc, p);
691 }
692
693 static int PTRCALL
little2_charMatches(const ENCODING * enc,const char * p,int c)694 little2_charMatches(const ENCODING *enc, const char *p, int c)
695 {
696 return LITTLE2_CHAR_MATCHES(enc, p, c);
697 }
698
699 static int PTRFASTCALL
little2_isNameMin(const ENCODING * enc,const char * p)700 little2_isNameMin(const ENCODING *enc, const char *p)
701 {
702 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
703 }
704
705 static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING * enc,const char * p)706 little2_isNmstrtMin(const ENCODING *enc, const char *p)
707 {
708 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
709 }
710
711 #undef VTABLE
712 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
713
714 #else /* not XML_MIN_SIZE */
715
716 #undef PREFIX
717 #define PREFIX(ident) little2_ ## ident
718 #define MINBPC(enc) 2
719 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
720 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
721 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
722 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
723 #define IS_NAME_CHAR(enc, p, n) 0
724 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
725 #define IS_NMSTRT_CHAR(enc, p, n) (0)
726 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
727
728 #define XML_TOK_IMPL_C
729 #include <cutl/details/expat/xmltok_impl.c>
730 #undef XML_TOK_IMPL_C
731
732 #undef MINBPC
733 #undef BYTE_TYPE
734 #undef BYTE_TO_ASCII
735 #undef CHAR_MATCHES
736 #undef IS_NAME_CHAR
737 #undef IS_NAME_CHAR_MINBPC
738 #undef IS_NMSTRT_CHAR
739 #undef IS_NMSTRT_CHAR_MINBPC
740 #undef IS_INVALID_CHAR
741
742 #endif /* not XML_MIN_SIZE */
743
744 #ifdef XML_NS
745
/* Little-endian 2 byte encoding descriptor, XML_NS variant.  The last
   integer of the embedded ENCODING is 1 only when BYTEORDER is 1234
   (presumably the isUtf16 flag, set when the host byte order matches --
   confirm against xmltok.h).
*/
static const struct normal_encoding little2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#include <cutl/details/expat/asciitab.h>
#include <cutl/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(little2_) ZERO_VTABLE
};
760
761 #endif
762
/* Little-endian 2 byte encoding descriptor.  asciitab.h is included with
   BT_COLON defined as BT_NMSTRT.  The last integer of the embedded
   ENCODING is 1 only when BYTEORDER is 1234 (presumably the isUtf16
   flag -- confirm against xmltok.h).
*/
static const struct normal_encoding little2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 1234
    1
#else
    0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include <cutl/details/expat/asciitab.h>
#undef BT_COLON
#include <cutl/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(little2_) ZERO_VTABLE
};
779
780 #if BYTEORDER != 4321
781
782 #ifdef XML_NS
783
/* Internal little-endian 2 byte encoding descriptor, XML_NS variant;
   compiled only when BYTEORDER is not 4321.  The ASCII part of the byte
   type table comes from iasciitab.h.
*/
static const struct normal_encoding internal_little2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include <cutl/details/expat/iasciitab.h>
#include <cutl/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(little2_) ZERO_VTABLE
};
792
793 #endif
794
/* Internal little-endian 2 byte encoding descriptor; compiled only when
   BYTEORDER is not 4321.  iasciitab.h is included with BT_COLON defined
   as BT_NMSTRT.
*/
static const struct normal_encoding internal_little2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include <cutl/details/expat/iasciitab.h>
#undef BT_COLON
#include <cutl/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(little2_) ZERO_VTABLE
};
805
806 #endif
807
808
/* Primitives for big-endian 2 byte units: (p)[0] is the high byte and
   (p)[1] the low byte.  When the high byte is 0 the unit is classified
   through the encoding's byte type table, otherwise through
   unicode_byte_type.  All macro arguments are parenthesized so that
   arbitrary expressions may be passed.
*/
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is 0, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* Non-zero when the unit at p is exactly the ASCII character c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
819
820 #ifdef XML_MIN_SIZE
821
822 static int PTRFASTCALL
big2_byteType(const ENCODING * enc,const char * p)823 big2_byteType(const ENCODING *enc, const char *p)
824 {
825 return BIG2_BYTE_TYPE(enc, p);
826 }
827
828 static int PTRFASTCALL
big2_byteToAscii(const ENCODING * enc,const char * p)829 big2_byteToAscii(const ENCODING *enc, const char *p)
830 {
831 return BIG2_BYTE_TO_ASCII(enc, p);
832 }
833
834 static int PTRCALL
big2_charMatches(const ENCODING * enc,const char * p,int c)835 big2_charMatches(const ENCODING *enc, const char *p, int c)
836 {
837 return BIG2_CHAR_MATCHES(enc, p, c);
838 }
839
840 static int PTRFASTCALL
big2_isNameMin(const ENCODING * enc,const char * p)841 big2_isNameMin(const ENCODING *enc, const char *p)
842 {
843 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
844 }
845
846 static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING * enc,const char * p)847 big2_isNmstrtMin(const ENCODING *enc, const char *p)
848 {
849 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
850 }
851
852 #undef VTABLE
853 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
854
855 #else /* not XML_MIN_SIZE */
856
857 #undef PREFIX
858 #define PREFIX(ident) big2_ ## ident
859 #define MINBPC(enc) 2
860 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
861 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
862 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
863 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
864 #define IS_NAME_CHAR(enc, p, n) 0
865 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
866 #define IS_NMSTRT_CHAR(enc, p, n) (0)
867 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
868
869 #define XML_TOK_IMPL_C
870 #include <cutl/details/expat/xmltok_impl.c>
871 #undef XML_TOK_IMPL_C
872
873 #undef MINBPC
874 #undef BYTE_TYPE
875 #undef BYTE_TO_ASCII
876 #undef CHAR_MATCHES
877 #undef IS_NAME_CHAR
878 #undef IS_NAME_CHAR_MINBPC
879 #undef IS_NMSTRT_CHAR
880 #undef IS_NMSTRT_CHAR_MINBPC
881 #undef IS_INVALID_CHAR
882
883 #endif /* not XML_MIN_SIZE */
884
885 #ifdef XML_NS
886
/* Big-endian 2 byte encoding descriptor, XML_NS variant.  The last
   integer of the embedded ENCODING is 1 only when BYTEORDER is 4321
   (presumably the isUtf16 flag, set when the host byte order matches --
   confirm against xmltok.h).
*/
static const struct normal_encoding big2_encoding_ns = {
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
#include <cutl/details/expat/asciitab.h>
#include <cutl/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(big2_) ZERO_VTABLE
};
901
902 #endif
903
/* Big-endian 2 byte encoding descriptor.  asciitab.h is included with
   BT_COLON defined as BT_NMSTRT.  The last integer of the embedded
   ENCODING is 1 only when BYTEORDER is 4321 (presumably the isUtf16
   flag -- confirm against xmltok.h).
*/
static const struct normal_encoding big2_encoding = {
  { VTABLE, 2, 0,
#if BYTEORDER == 4321
  1
#else
  0
#endif
  },
  {
#define BT_COLON BT_NMSTRT
#include <cutl/details/expat/asciitab.h>
#undef BT_COLON
#include <cutl/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(big2_) ZERO_VTABLE
};
920
921 #if BYTEORDER != 1234
922
923 #ifdef XML_NS
924
/* Internal big-endian 2 byte encoding descriptor, XML_NS variant;
   compiled only when BYTEORDER is not 1234.  The ASCII part of the byte
   type table comes from iasciitab.h.
*/
static const struct normal_encoding internal_big2_encoding_ns = {
  { VTABLE, 2, 0, 1 },
  {
#include <cutl/details/expat/iasciitab.h>
#include <cutl/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(big2_) ZERO_VTABLE
};
933
934 #endif
935
/* Internal big-endian 2 byte encoding descriptor; compiled only when
   BYTEORDER is not 1234.  iasciitab.h is included with BT_COLON defined
   as BT_NMSTRT.
*/
static const struct normal_encoding internal_big2_encoding = {
  { VTABLE, 2, 0, 1 },
  {
#define BT_COLON BT_NMSTRT
#include <cutl/details/expat/iasciitab.h>
#undef BT_COLON
#include <cutl/details/expat/latin1tab.h>
  },
  STANDARD_VTABLE(big2_) ZERO_VTABLE
};
946
947 #endif
948
949 #undef PREFIX
950
951 static int FASTCALL
streqci(const char * s1,const char * s2)952 streqci(const char *s1, const char *s2)
953 {
954 for (;;) {
955 char c1 = *s1++;
956 char c2 = *s2++;
957 if (ASCII_a <= c1 && c1 <= ASCII_z)
958 c1 += ASCII_A - ASCII_a;
959 if (ASCII_a <= c2 && c2 <= ASCII_z)
960 c2 += ASCII_A - ASCII_a;
961 if (c1 != c2)
962 return 0;
963 if (!c1)
964 break;
965 }
966 return 1;
967 }
968
969 static void PTRCALL
initUpdatePosition(const ENCODING * enc,const char * ptr,const char * end,POSITION * pos)970 initUpdatePosition(const ENCODING *enc, const char *ptr,
971 const char *end, POSITION *pos)
972 {
973 UNUSED(enc);
974 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
975 }
976
977 static int
toAscii(const ENCODING * enc,const char * ptr,const char * end)978 toAscii(const ENCODING *enc, const char *ptr, const char *end)
979 {
980 char buf[1];
981 char *p = buf;
982 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
983 if (p == buf)
984 return -1;
985 else
986 return buf[0];
987 }
988
989 static int FASTCALL
isSpace(int c)990 isSpace(int c)
991 {
992 switch (c) {
993 case 0x20:
994 case 0xD:
995 case 0xA:
996 case 0x9:
997 return 1;
998 }
999 return 0;
1000 }
1001
1002 /* Return 1 if there's just optional white space or there's an S
1003 followed by name=val.
1004 */
1005 static int
parsePseudoAttribute(const ENCODING * enc,const char * ptr,const char * end,const char ** namePtr,const char ** nameEndPtr,const char ** valPtr,const char ** nextTokPtr)1006 parsePseudoAttribute(const ENCODING *enc,
1007 const char *ptr,
1008 const char *end,
1009 const char **namePtr,
1010 const char **nameEndPtr,
1011 const char **valPtr,
1012 const char **nextTokPtr)
1013 {
1014 int c;
1015 char open;
1016 if (ptr == end) {
1017 *namePtr = NULL;
1018 return 1;
1019 }
1020 if (!isSpace(toAscii(enc, ptr, end))) {
1021 *nextTokPtr = ptr;
1022 return 0;
1023 }
1024 do {
1025 ptr += enc->minBytesPerChar;
1026 } while (isSpace(toAscii(enc, ptr, end)));
1027 if (ptr == end) {
1028 *namePtr = NULL;
1029 return 1;
1030 }
1031 *namePtr = ptr;
1032 for (;;) {
1033 c = toAscii(enc, ptr, end);
1034 if (c == -1) {
1035 *nextTokPtr = ptr;
1036 return 0;
1037 }
1038 if (c == ASCII_EQUALS) {
1039 *nameEndPtr = ptr;
1040 break;
1041 }
1042 if (isSpace(c)) {
1043 *nameEndPtr = ptr;
1044 do {
1045 ptr += enc->minBytesPerChar;
1046 } while (isSpace(c = toAscii(enc, ptr, end)));
1047 if (c != ASCII_EQUALS) {
1048 *nextTokPtr = ptr;
1049 return 0;
1050 }
1051 break;
1052 }
1053 ptr += enc->minBytesPerChar;
1054 }
1055 if (ptr == *namePtr) {
1056 *nextTokPtr = ptr;
1057 return 0;
1058 }
1059 ptr += enc->minBytesPerChar;
1060 c = toAscii(enc, ptr, end);
1061 while (isSpace(c)) {
1062 ptr += enc->minBytesPerChar;
1063 c = toAscii(enc, ptr, end);
1064 }
1065 if (c != ASCII_QUOT && c != ASCII_APOS) {
1066 *nextTokPtr = ptr;
1067 return 0;
1068 }
1069 open = (char)c;
1070 ptr += enc->minBytesPerChar;
1071 *valPtr = ptr;
1072 for (;; ptr += enc->minBytesPerChar) {
1073 c = toAscii(enc, ptr, end);
1074 if (c == open)
1075 break;
1076 if (!(ASCII_a <= c && c <= ASCII_z)
1077 && !(ASCII_A <= c && c <= ASCII_Z)
1078 && !(ASCII_0 <= c && c <= ASCII_9)
1079 && c != ASCII_PERIOD
1080 && c != ASCII_MINUS
1081 && c != ASCII_UNDERSCORE) {
1082 *nextTokPtr = ptr;
1083 return 0;
1084 }
1085 }
1086 *nextTokPtr = ptr + enc->minBytesPerChar;
1087 return 1;
1088 }
1089
/* NUL-terminated keyword strings spelled with ASCII_* constants.
   doParseXmlDecl compares pseudo-attribute names and values against
   them with XmlNameMatchesAscii.
*/
static const char KW_version[] = {
  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
};

static const char KW_encoding[] = {
  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
};

static const char KW_standalone[] = {
  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
  ASCII_n, ASCII_e, '\0'
};

/* Accepted values of the standalone pseudo-attribute. */
static const char KW_yes[] = {
  ASCII_y, ASCII_e, ASCII_s,  '\0'
};

static const char KW_no[] = {
  ASCII_n, ASCII_o,  '\0'
};
1110
1111 static int
doParseXmlDecl(const ENCODING * (* encodingFinder)(const ENCODING *,const char *,const char *),int isGeneralTextEntity,const ENCODING * enc,const char * ptr,const char * end,const char ** badPtr,const char ** versionPtr,const char ** versionEndPtr,const char ** encodingName,const ENCODING ** encoding,int * standalone)1112 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1113 const char *,
1114 const char *),
1115 int isGeneralTextEntity,
1116 const ENCODING *enc,
1117 const char *ptr,
1118 const char *end,
1119 const char **badPtr,
1120 const char **versionPtr,
1121 const char **versionEndPtr,
1122 const char **encodingName,
1123 const ENCODING **encoding,
1124 int *standalone)
1125 {
1126 const char *val = NULL;
1127 const char *name = NULL;
1128 const char *nameEnd = NULL;
1129 ptr += 5 * enc->minBytesPerChar;
1130 end -= 2 * enc->minBytesPerChar;
1131 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1132 || !name) {
1133 *badPtr = ptr;
1134 return 0;
1135 }
1136 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1137 if (!isGeneralTextEntity) {
1138 *badPtr = name;
1139 return 0;
1140 }
1141 }
1142 else {
1143 if (versionPtr)
1144 *versionPtr = val;
1145 if (versionEndPtr)
1146 *versionEndPtr = ptr;
1147 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1148 *badPtr = ptr;
1149 return 0;
1150 }
1151 if (!name) {
1152 if (isGeneralTextEntity) {
1153 /* a TextDecl must have an EncodingDecl */
1154 *badPtr = ptr;
1155 return 0;
1156 }
1157 return 1;
1158 }
1159 }
1160 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1161 int c = toAscii(enc, val, end);
1162 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1163 *badPtr = val;
1164 return 0;
1165 }
1166 if (encodingName)
1167 *encodingName = val;
1168 if (encoding)
1169 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1170 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1171 *badPtr = ptr;
1172 return 0;
1173 }
1174 if (!name)
1175 return 1;
1176 }
1177 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1178 || isGeneralTextEntity) {
1179 *badPtr = name;
1180 return 0;
1181 }
1182 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1183 if (standalone)
1184 *standalone = 1;
1185 }
1186 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1187 if (standalone)
1188 *standalone = 0;
1189 }
1190 else {
1191 *badPtr = val;
1192 return 0;
1193 }
1194 while (isSpace(toAscii(enc, ptr, end)))
1195 ptr += enc->minBytesPerChar;
1196 if (ptr != end) {
1197 *badPtr = ptr;
1198 return 0;
1199 }
1200 return 1;
1201 }
1202
1203 static int FASTCALL
checkCharRefNumber(int result)1204 checkCharRefNumber(int result)
1205 {
1206 switch (result >> 8) {
1207 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1208 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1209 return -1;
1210 case 0:
1211 if (latin1_encoding.type[result] == BT_NONXML)
1212 return -1;
1213 break;
1214 case 0xFF:
1215 if (result == 0xFFFE || result == 0xFFFF)
1216 return -1;
1217 break;
1218 }
1219 return result;
1220 }
1221
1222 int FASTCALL
XmlUtf8Encode(int c,char * buf)1223 XmlUtf8Encode(int c, char *buf)
1224 {
1225 enum {
1226 /* minN is minimum legal resulting value for N byte sequence */
1227 min2 = 0x80,
1228 min3 = 0x800,
1229 min4 = 0x10000
1230 };
1231
1232 if (c < 0)
1233 return 0;
1234 if (c < min2) {
1235 buf[0] = (char)(c | UTF8_cval1);
1236 return 1;
1237 }
1238 if (c < min3) {
1239 buf[0] = (char)((c >> 6) | UTF8_cval2);
1240 buf[1] = (char)((c & 0x3f) | 0x80);
1241 return 2;
1242 }
1243 if (c < min4) {
1244 buf[0] = (char)((c >> 12) | UTF8_cval3);
1245 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1246 buf[2] = (char)((c & 0x3f) | 0x80);
1247 return 3;
1248 }
1249 if (c < 0x110000) {
1250 buf[0] = (char)((c >> 18) | UTF8_cval4);
1251 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1252 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1253 buf[3] = (char)((c & 0x3f) | 0x80);
1254 return 4;
1255 }
1256 return 0;
1257 }
1258
1259 int FASTCALL
XmlUtf16Encode(int charNum,unsigned short * buf)1260 XmlUtf16Encode(int charNum, unsigned short *buf)
1261 {
1262 if (charNum < 0)
1263 return 0;
1264 if (charNum < 0x10000) {
1265 buf[0] = (unsigned short)charNum;
1266 return 1;
1267 }
1268 if (charNum < 0x110000) {
1269 charNum -= 0x10000;
1270 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1271 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1272 return 2;
1273 }
1274 return 0;
1275 }
1276
1277 struct unknown_encoding {
1278 struct normal_encoding normal;
1279 CONVERTER convert;
1280 void *userData;
1281 unsigned short utf16[256];
1282 char utf8[256][4];
1283 };
1284
1285 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *) (enc))
1286
1287 int
XmlSizeOfUnknownEncoding(void)1288 XmlSizeOfUnknownEncoding(void)
1289 {
1290 return sizeof(struct unknown_encoding);
1291 }
1292
1293 static int PTRFASTCALL
unknown_isName(const ENCODING * enc,const char * p)1294 unknown_isName(const ENCODING *enc, const char *p)
1295 {
1296 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1297 int c = uenc->convert(uenc->userData, p);
1298 if (c & ~0xFFFF)
1299 return 0;
1300 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1301 }
1302
1303 static int PTRFASTCALL
unknown_isNmstrt(const ENCODING * enc,const char * p)1304 unknown_isNmstrt(const ENCODING *enc, const char *p)
1305 {
1306 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1307 int c = uenc->convert(uenc->userData, p);
1308 if (c & ~0xFFFF)
1309 return 0;
1310 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1311 }
1312
1313 static int PTRFASTCALL
unknown_isInvalid(const ENCODING * enc,const char * p)1314 unknown_isInvalid(const ENCODING *enc, const char *p)
1315 {
1316 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1317 int c = uenc->convert(uenc->userData, p);
1318 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1319 }
1320
1321 static void PTRCALL
unknown_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)1322 unknown_toUtf8(const ENCODING *enc,
1323 const char **fromP, const char *fromLim,
1324 char **toP, const char *toLim)
1325 {
1326 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1327 char buf[XML_UTF8_ENCODE_MAX];
1328 for (;;) {
1329 const char *utf8;
1330 int n;
1331 if (*fromP == fromLim)
1332 break;
1333 utf8 = uenc->utf8[(unsigned char)**fromP];
1334 n = *utf8++;
1335 if (n == 0) {
1336 int c = uenc->convert(uenc->userData, *fromP);
1337 n = XmlUtf8Encode(c, buf);
1338 if (n > toLim - *toP)
1339 break;
1340 utf8 = buf;
1341 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1342 - (BT_LEAD2 - 2));
1343 }
1344 else {
1345 if (n > toLim - *toP)
1346 break;
1347 (*fromP)++;
1348 }
1349 do {
1350 *(*toP)++ = *utf8++;
1351 } while (--n != 0);
1352 }
1353 }
1354
1355 static void PTRCALL
unknown_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)1356 unknown_toUtf16(const ENCODING *enc,
1357 const char **fromP, const char *fromLim,
1358 unsigned short **toP, const unsigned short *toLim)
1359 {
1360 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1361 while (*fromP != fromLim && *toP != toLim) {
1362 unsigned short c = uenc->utf16[(unsigned char)**fromP];
1363 if (c == 0) {
1364 c = (unsigned short)
1365 uenc->convert(uenc->userData, *fromP);
1366 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1367 - (BT_LEAD2 - 2));
1368 }
1369 else
1370 (*fromP)++;
1371 *(*toP)++ = c;
1372 }
1373 }
1374
1375 ENCODING *
XmlInitUnknownEncoding(void * mem,int * table,CONVERTER convert,void * userData)1376 XmlInitUnknownEncoding(void *mem,
1377 int *table,
1378 CONVERTER convert,
1379 void *userData)
1380 {
1381 int i;
1382 struct unknown_encoding *e = (struct unknown_encoding *)mem;
1383 for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1384 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1385 for (i = 0; i < 128; i++)
1386 if (latin1_encoding.type[i] != BT_OTHER
1387 && latin1_encoding.type[i] != BT_NONXML
1388 && table[i] != i)
1389 return 0;
1390 for (i = 0; i < 256; i++) {
1391 int c = table[i];
1392 if (c == -1) {
1393 e->normal.type[i] = BT_MALFORM;
1394 /* This shouldn't really get used. */
1395 e->utf16[i] = 0xFFFF;
1396 e->utf8[i][0] = 1;
1397 e->utf8[i][1] = 0;
1398 }
1399 else if (c < 0) {
1400 if (c < -4)
1401 return 0;
1402 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1403 e->utf8[i][0] = 0;
1404 e->utf16[i] = 0;
1405 }
1406 else if (c < 0x80) {
1407 if (latin1_encoding.type[c] != BT_OTHER
1408 && latin1_encoding.type[c] != BT_NONXML
1409 && c != i)
1410 return 0;
1411 e->normal.type[i] = latin1_encoding.type[c];
1412 e->utf8[i][0] = 1;
1413 e->utf8[i][1] = (char)c;
1414 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1415 }
1416 else if (checkCharRefNumber(c) < 0) {
1417 e->normal.type[i] = BT_NONXML;
1418 /* This shouldn't really get used. */
1419 e->utf16[i] = 0xFFFF;
1420 e->utf8[i][0] = 1;
1421 e->utf8[i][1] = 0;
1422 }
1423 else {
1424 if (c > 0xFFFF)
1425 return 0;
1426 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1427 e->normal.type[i] = BT_NMSTRT;
1428 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1429 e->normal.type[i] = BT_NAME;
1430 else
1431 e->normal.type[i] = BT_OTHER;
1432 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1433 e->utf16[i] = (unsigned short)c;
1434 }
1435 }
1436 e->userData = userData;
1437 e->convert = convert;
1438 if (convert) {
1439 e->normal.isName2 = unknown_isName;
1440 e->normal.isName3 = unknown_isName;
1441 e->normal.isName4 = unknown_isName;
1442 e->normal.isNmstrt2 = unknown_isNmstrt;
1443 e->normal.isNmstrt3 = unknown_isNmstrt;
1444 e->normal.isNmstrt4 = unknown_isNmstrt;
1445 e->normal.isInvalid2 = unknown_isInvalid;
1446 e->normal.isInvalid3 = unknown_isInvalid;
1447 e->normal.isInvalid4 = unknown_isInvalid;
1448 }
1449 e->normal.enc.utf8Convert = unknown_toUtf8;
1450 e->normal.enc.utf16Convert = unknown_toUtf16;
1451 return &(e->normal.enc);
1452 }
1453
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed: getEncodingIndex
   returns a position in its encodingNames table as one of these values. */
enum {
  UNKNOWN_ENC    = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  /* must match encodingNames up to here */
  NO_ENC         = 6
};
1467
1468 static const char KW_ISO_8859_1[] = {
1469 ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
1470 ASCII_MINUS, ASCII_1, '\0'
1471 };
1472 static const char KW_US_ASCII[] = {
1473 ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
1474 '\0'
1475 };
1476 static const char KW_UTF_8[] = {
1477 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1478 };
1479 static const char KW_UTF_16[] = {
1480 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1481 };
1482 static const char KW_UTF_16BE[] = {
1483 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
1484 '\0'
1485 };
1486 static const char KW_UTF_16LE[] = {
1487 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
1488 '\0'
1489 };
1490
1491 static int FASTCALL
getEncodingIndex(const char * name)1492 getEncodingIndex(const char *name)
1493 {
1494 static const char * const encodingNames[] = {
1495 KW_ISO_8859_1,
1496 KW_US_ASCII,
1497 KW_UTF_8,
1498 KW_UTF_16,
1499 KW_UTF_16BE,
1500 KW_UTF_16LE,
1501 };
1502 int i;
1503 if (name == NULL)
1504 return NO_ENC;
1505 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1506 if (streqci(name, encodingNames[i]))
1507 return i;
1508 return UNKNOWN_ENC;
1509 }
1510
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int;
   SET_INIT_ENC_INDEX(enc, i) stores i narrowed to char.  Both arguments
   are parenthesised so the cast applies to the whole of a compound
   expression passed as i, not just to its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1517
1518 /* This is what detects the encoding. encodingTable maps from
1519 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1520 the external (protocol) specified encoding; state is
1521 XML_CONTENT_STATE if we're parsing an external text entity, and
1522 XML_PROLOG_STATE otherwise.
1523 */
1524
1525
1526 static int
initScan(const ENCODING * const * encodingTable,const INIT_ENCODING * enc,int state,const char * ptr,const char * end,const char ** nextTokPtr)1527 initScan(const ENCODING * const *encodingTable,
1528 const INIT_ENCODING *enc,
1529 int state,
1530 const char *ptr,
1531 const char *end,
1532 const char **nextTokPtr)
1533 {
1534 const ENCODING **encPtr;
1535
1536 if (ptr == end)
1537 return XML_TOK_NONE;
1538 encPtr = enc->encPtr;
1539 if (ptr + 1 == end) {
1540 /* only a single byte available for auto-detection */
1541 #ifndef XML_DTD /* FIXME */
1542 /* a well-formed document entity must have more than one byte */
1543 if (state != XML_CONTENT_STATE)
1544 return XML_TOK_PARTIAL;
1545 #endif
1546 /* so we're parsing an external text entity... */
1547 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1548 switch (INIT_ENC_INDEX(enc)) {
1549 case UTF_16_ENC:
1550 case UTF_16LE_ENC:
1551 case UTF_16BE_ENC:
1552 return XML_TOK_PARTIAL;
1553 }
1554 switch ((unsigned char)*ptr) {
1555 case 0xFE:
1556 case 0xFF:
1557 case 0xEF: /* possibly first byte of UTF-8 BOM */
1558 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1559 && state == XML_CONTENT_STATE)
1560 break;
1561 /* fall through */
1562 case 0x00:
1563 case 0x3C:
1564 return XML_TOK_PARTIAL;
1565 }
1566 }
1567 else {
1568 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1569 case 0xFEFF:
1570 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1571 && state == XML_CONTENT_STATE)
1572 break;
1573 *nextTokPtr = ptr + 2;
1574 *encPtr = encodingTable[UTF_16BE_ENC];
1575 return XML_TOK_BOM;
1576 /* 00 3C is handled in the default case */
1577 case 0x3C00:
1578 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1579 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1580 && state == XML_CONTENT_STATE)
1581 break;
1582 *encPtr = encodingTable[UTF_16LE_ENC];
1583 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1584 case 0xFFFE:
1585 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1586 && state == XML_CONTENT_STATE)
1587 break;
1588 *nextTokPtr = ptr + 2;
1589 *encPtr = encodingTable[UTF_16LE_ENC];
1590 return XML_TOK_BOM;
1591 case 0xEFBB:
1592 /* Maybe a UTF-8 BOM (EF BB BF) */
1593 /* If there's an explicitly specified (external) encoding
1594 of ISO-8859-1 or some flavour of UTF-16
1595 and this is an external text entity,
1596 don't look for the BOM,
1597 because it might be a legal data.
1598 */
1599 if (state == XML_CONTENT_STATE) {
1600 int e = INIT_ENC_INDEX(enc);
1601 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
1602 || e == UTF_16LE_ENC || e == UTF_16_ENC)
1603 break;
1604 }
1605 if (ptr + 2 == end)
1606 return XML_TOK_PARTIAL;
1607 if ((unsigned char)ptr[2] == 0xBF) {
1608 *nextTokPtr = ptr + 3;
1609 *encPtr = encodingTable[UTF_8_ENC];
1610 return XML_TOK_BOM;
1611 }
1612 break;
1613 default:
1614 if (ptr[0] == '\0') {
1615 /* 0 isn't a legal data character. Furthermore a document
1616 entity can only start with ASCII characters. So the only
1617 way this can fail to be big-endian UTF-16 if it it's an
1618 external parsed general entity that's labelled as
1619 UTF-16LE.
1620 */
1621 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1622 break;
1623 *encPtr = encodingTable[UTF_16BE_ENC];
1624 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1625 }
1626 else if (ptr[1] == '\0') {
1627 /* We could recover here in the case:
1628 - parsing an external entity
1629 - second byte is 0
1630 - no externally specified encoding
1631 - no encoding declaration
1632 by assuming UTF-16LE. But we don't, because this would mean when
1633 presented just with a single byte, we couldn't reliably determine
1634 whether we needed further bytes.
1635 */
1636 if (state == XML_CONTENT_STATE)
1637 break;
1638 *encPtr = encodingTable[UTF_16LE_ENC];
1639 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1640 }
1641 break;
1642 }
1643 }
1644 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1645 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1646 }
1647
1648
1649 #define NS(x) x
1650 #define ns(x) x
1651 #define XML_TOK_NS_C
1652 #include <cutl/details/expat/xmltok_ns.c>
1653 #undef XML_TOK_NS_C
1654 #undef NS
1655 #undef ns
1656
1657 #ifdef XML_NS
1658
1659 #define NS(x) x ## NS
1660 #define ns(x) x ## _ns
1661
1662 #define XML_TOK_NS_C
1663 #include <cutl/details/expat/xmltok_ns.c>
1664 #undef XML_TOK_NS_C
1665
1666 #undef NS
1667 #undef ns
1668
1669 ENCODING *
XmlInitUnknownEncodingNS(void * mem,int * table,CONVERTER convert,void * userData)1670 XmlInitUnknownEncodingNS(void *mem,
1671 int *table,
1672 CONVERTER convert,
1673 void *userData)
1674 {
1675 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1676 if (enc)
1677 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1678 return enc;
1679 }
1680
1681 #endif /* XML_NS */
1682