1 /*
2 The contents of this file are subject to the Mozilla Public License
3 Version 1.1 (the "License"); you may not use this file except in
4 compliance with the License. You may obtain a copy of the License at
5 http://www.mozilla.org/MPL/
6 
7 Software distributed under the License is distributed on an "AS IS"
8 basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
9 License for the specific language governing rights and limitations
10 under the License.
11 
12 The Original Code is expat.
13 
14 The Initial Developer of the Original Code is James Clark.
15 Portions created by James Clark are Copyright (C) 1998, 1999
16 James Clark. All Rights Reserved.
17 
18 Contributor(s):
19 
20 Alternatively, the contents of this file may be used under the terms
21 of the GNU General Public License (the "GPL"), in which case the
22 provisions of the GPL are applicable instead of those above.  If you
23 wish to allow use of your version of this file only under the terms of
24 the GPL and not to allow others to use your version of this file under
25 the MPL, indicate your decision by deleting the provisions above and
26 replace them with the notice and other provisions required by the
27 GPL. If you do not delete the provisions above, a recipient may use
28 your version of this file under either the MPL or the GPL.
29 */
30 
31 #include "xmldef.h"
32 #include "xmltok.h"
33 #include "nametab.h"
34 
/* Initializer list for the leading function-pointer members of an ENCODING.
   Every entry is spelled through PREFIX(), so the list resolves to whichever
   instantiation PREFIX names at the point of use (presumably the functions
   produced by including xmltok_impl_c.h -- confirm against that header). */
#define VTABLE1 \
  { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
  PREFIX(nameLength), \
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX-named UTF-8 and UTF-16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
49 
/* Test the naming bitmap for the UCS-2 character with high byte hi and low
   byte lo.  pages[hi] selects a group of eight words in namingBitmap, the top
   three bits of lo select the word and the bottom five bits select the bit.
   The result is nonzero when the bit is set.  The mask is built from 1u so
   that selecting bit 31 never shifts into the sign bit of an int. */
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
52 
/* A 2-byte UTF-8 sequence carries 11 character bits: the bottom 5 bits of
the first byte and the bottom 6 bits of the second.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask.  The mask is built from 1u so that bit 31
can be selected without shifting into the sign bit of an int. */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                  + ((((byte)[0]) & 3) << 1) \
                  + ((((byte)[1]) >> 5) & 1)] \
     & (1u << (((byte)[1]) & 0x1F)))
62 
/* A 3-byte UTF-8 sequence carries 16 character bits: the bottom 4 bits of
the first byte and the bottom 6 bits of each of the next two.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask.  The mask is built from 1u so that bit 31
can be selected without shifting into the sign bit of an int. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                         + ((((byte)[1]) >> 2) & 0xF)] \
                 << 3) \
                + ((((byte)[1]) & 3) << 1) \
                + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))
74 
/* Naming-bitmap lookup for the n-byte UTF-8 sequence at p: dispatches to the
   2- or 3-byte lookup and yields 0 for any other n. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))
81 
/* Nonzero when the 3-byte sequence at p (a pointer to unsigned char) must be
   rejected: lead byte 0xED with bit 0x20 set in the second byte, or the
   sequences EF BF BE / EF BF BF.  Every use of p is parenthesized so that an
   expression argument such as buf + 1 expands correctly. */
#define UTF8_INVALID3(p) \
  ((*(p)) == 0xED \
  ? (((p)[1] & 0x20) != 0) \
  : ((*(p)) == 0xEF \
     ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
     : 0))
88 
/* Nonzero when the 4-byte sequence at p has lead byte 0xF4 and a second byte
   with either of the 0x30 bits set.  p is parenthesized at every use so
   expression arguments expand correctly. */
#define UTF8_INVALID4(p) ((*(p)) == 0xF4 && ((p)[1] & 0x30) != 0)
90 
91 static
isNever(const ENCODING * enc,const char * p)92 int isNever(const ENCODING *enc, const char *p)
93 {
94     return 0;
95 }
96 
97 static
utf8_isName2(const ENCODING * enc,const char * p)98 int utf8_isName2(const ENCODING *enc, const char *p)
99 {
100     return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
101 }
102 
103 static
utf8_isName3(const ENCODING * enc,const char * p)104 int utf8_isName3(const ENCODING *enc, const char *p)
105 {
106     return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
107 }
108 
/* No 4-byte UTF-8 sequence is accepted as a name character. */
#define utf8_isName4 isNever
110 
111 static
utf8_isNmstrt2(const ENCODING * enc,const char * p)112 int utf8_isNmstrt2(const ENCODING *enc, const char *p)
113 {
114     return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
115 }
116 
117 static
utf8_isNmstrt3(const ENCODING * enc,const char * p)118 int utf8_isNmstrt3(const ENCODING *enc, const char *p)
119 {
120     return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
121 }
122 
/* No 4-byte UTF-8 sequence is accepted as a name-start character. */
#define utf8_isNmstrt4 isNever

/* The 2-byte validity slot never reports a sequence as invalid. */
#define utf8_isInvalid2 isNever
126 
127 static
utf8_isInvalid3(const ENCODING * enc,const char * p)128 int utf8_isInvalid3(const ENCODING *enc, const char *p)
129 {
130     return UTF8_INVALID3((const unsigned char *)p);
131 }
132 
133 static
utf8_isInvalid4(const ENCODING * enc,const char * p)134 int utf8_isInvalid4(const ENCODING *enc, const char *p)
135 {
136     return UTF8_INVALID4((const unsigned char *)p);
137 }
138 
/* An ENCODING extended with a per-byte classification table and the
   character-class predicates the tokenizer macros dispatch through.
   enc must stay the first member: code in this file casts ENCODING pointers
   to struct normal_encoding pointers. */
struct normal_encoding {
    ENCODING enc;
    /* Byte type (a BT_* value) for each possible byte value. */
    unsigned char type[256];
#ifdef XML_MIN_SIZE
    /* Per-encoding hooks used in place of macros when XML_MIN_SIZE is set. */
    int (*byteType)(const ENCODING *, const char *);
    int (*isNameMin)(const ENCODING *, const char *);
    int (*isNmstrtMin)(const ENCODING *, const char *);
    int (*byteToAscii)(const ENCODING *, const char *);
    int (*charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
    /* Predicates selected by IS_NAME_CHAR / IS_NMSTRT_CHAR / IS_INVALID_CHAR;
       the digit in each name is the n those macros paste on. */
    int (*isName2)(const ENCODING *, const char *);
    int (*isName3)(const ENCODING *, const char *);
    int (*isName4)(const ENCODING *, const char *);
    int (*isNmstrt2)(const ENCODING *, const char *);
    int (*isNmstrt3)(const ENCODING *, const char *);
    int (*isNmstrt4)(const ENCODING *, const char *);
    int (*isInvalid2)(const ENCODING *, const char *);
    int (*isInvalid3)(const ENCODING *, const char *);
    int (*isInvalid4)(const ENCODING *, const char *);
};
159 
#ifdef XML_MIN_SIZE

/* Initializers for the five XML_MIN_SIZE-only function pointers of
   struct normal_encoding, formed by pasting prefix E onto each member name.
   The list ends with a comma so further initializers can follow directly. */
#define STANDARD_VTABLE(E) \
 E ## byteType, \
 E ## isNameMin, \
 E ## isNmstrtMin, \
 E ## byteToAscii, \
 E ## charMatches,

#else

/* Without XML_MIN_SIZE those members do not exist, so nothing is emitted. */
#define STANDARD_VTABLE(E) /* as nothing */

#endif
174 
/* Initializers for the nine isName/isNmstrt/isInvalid function pointers of
   struct normal_encoding, formed by pasting prefix E onto each member name. */
#define NORMAL_VTABLE(E) \
 E ## isName2, \
 E ## isName3, \
 E ## isName4, \
 E ## isNmstrt2, \
 E ## isNmstrt3, \
 E ## isNmstrt4, \
 E ## isInvalid2, \
 E ## isInvalid3, \
 E ## isInvalid4
185 
186 static int checkCharRefNumber(int);
187 
188 #include "xmltok_impl.h"
189 
#ifdef XML_MIN_SIZE
/* The sb_ isNameMin/isNmstrtMin slots always answer 0. */
#define sb_isNameMin isNever
#define sb_isNmstrtMin isNever
#endif

#ifdef XML_MIN_SIZE
/* One shared instantiation serves every encoding, so read the width at run time. */
#define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#define MINBPC(enc) 1
#endif
201 
/* Byte type of the single byte at p, looked up in the encoding's type table.
   The byte is converted to unsigned char first so bytes >= 0x80 index
   correctly where char is signed.  The cast keeps the const qualifier, in
   line with the other accessor macros in this file. */
#define SB_BYTE_TYPE(enc, p) \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
204 
#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE, installed in the byteType slot. */
static
int sb_byteType(const ENCODING *enc, const char *p)
{
    return SB_BYTE_TYPE(enc, p);
}
/* With XML_MIN_SIZE the byte type is obtained through the encoding's hook. */
#define BYTE_TYPE(enc, p) \
 (((const struct normal_encoding *)(enc))->byteType(enc, p))
#else
/* Otherwise the table lookup is expanded inline. */
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
216 
#ifdef XML_MIN_SIZE
/* With XML_MIN_SIZE the conversion goes through the encoding's hook. */
#define BYTE_TO_ASCII(enc, p) \
 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
/* Single-byte hook: the byte is its own ASCII value. */
static
int sb_byteToAscii(const ENCODING *enc, const char *p)
{
    return *p;
}
#else
/* Single-byte encodings: the byte at p is its own ASCII value.  p is
   parenthesized so pointer expressions such as s + 1 expand correctly. */
#define BYTE_TO_ASCII(enc, p) (*(p))
#endif
228 
/* Character-class tests for an n-byte sequence at p; n is pasted onto the
   member name, selecting e.g. isName2 / isNmstrt3 / isInvalid4 of the
   encoding's struct normal_encoding. */
#define IS_NAME_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
235 
#ifdef XML_MIN_SIZE
/* With XML_MIN_SIZE the minimum-width tests go through the encoding's hooks. */
#define IS_NAME_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
#else
/* For this instantiation the minimum-width tests are constant 0. */
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif
245 
#ifdef XML_MIN_SIZE
/* With XML_MIN_SIZE the comparison goes through the encoding's hook. */
#define CHAR_MATCHES(enc, p, c) \
 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
/* Single-byte hook: compare the byte at p with c. */
static
int sb_charMatches(const ENCODING *enc, const char *p, int c)
{
    return *p == c;
}
#else
/* c is an ASCII character.  It is parenthesized so that an expression
   argument cannot re-associate with the == comparison. */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
258 
/* Instantiate the tokenizer template with the macros defined above; every
   function it names through PREFIX() gets the "normal_" prefix. */
#define PREFIX(ident) normal_ ## ident
#include "xmltok_impl_c.h"

/* Drop the per-instantiation macros so later sections can redefine them. */
#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
271 
/* UTF8_cvalN is the fixed high-bit pattern OR-ed into the first byte of an
   N-byte UTF-8 sequence by the encoders in this file. */
enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
    UTF8_cval1 = 0x00,
    UTF8_cval2 = 0xc0,
    UTF8_cval3 = 0xe0,
    UTF8_cval4 = 0xf0
};
278 
279 static
utf8_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)280 void utf8_toUtf8(const ENCODING *enc,
281 		 const char **fromP, const char *fromLim,
282 		 char **toP, const char *toLim)
283 {
284     char *to;
285     const char *from;
286     if (fromLim - *fromP > toLim - *toP) {
287 	/* Avoid copying partial characters. */
288 	for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
289 	    if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
290 		break;
291     }
292     for (to = *toP, from = *fromP; from != fromLim; from++, to++)
293 	*to = *from;
294     *fromP = from;
295     *toP = to;
296 }
297 
298 static
utf8_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)299 void utf8_toUtf16(const ENCODING *enc,
300 		  const char **fromP, const char *fromLim,
301 		  unsigned short **toP, const unsigned short *toLim)
302 {
303     unsigned short *to = *toP;
304     const char *from = *fromP;
305     while (from != fromLim && to != toLim) {
306 	switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
307 	case BT_LEAD2:
308 	    *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
309 	    from += 2;
310 	    break;
311 	case BT_LEAD3:
312 	    *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
313 	    from += 3;
314 	    break;
315 	case BT_LEAD4:
316 	    {
317 		unsigned long n;
318 		if (to + 1 == toLim)
319 		    break;
320 		n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
321 		n -= 0x10000;
322 		to[0] = (unsigned short)((n >> 10) | 0xD800);
323 		to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
324 		to += 2;
325 		from += 4;
326 	    }
327 	    break;
328 	default:
329 	    *to++ = *from++;
330 	    break;
331 	}
332     }
333     *fromP = from;
334     *toP = to;
335 }
336 
#ifdef XML_NS
/* UTF-8 encoding for XML_NS builds.  The type table is asciitab.h followed
   by utf8tab.h, included here without the BT_COLON override that
   utf8_encoding applies.  The three trailing integers of the ENCODING
   initializer start with minBytesPerChar (presumably followed by
   isUtf8/isUtf16 flags -- confirm against xmltok.h). */
static const struct normal_encoding utf8_encoding_ns = {
	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
	    {
#include "asciitab.h"
#include "utf8tab.h"
	    },
	    STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	};
#endif
347 
/* UTF-8 encoding.  BT_COLON is defined as BT_NMSTRT while asciitab.h is
   included, so the colon entry of the type table becomes an ordinary
   name-start byte. */
static const struct normal_encoding utf8_encoding = {
	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
	    {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
	    },
	    STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	};
358 
#ifdef XML_NS

/* As utf8_encoding_ns, but the ASCII part of the type table comes from
   iasciitab.h (presumably the table for internal entities -- confirm). */
static const struct normal_encoding internal_utf8_encoding_ns = {
	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
	    {
#include "iasciitab.h"
#include "utf8tab.h"
	    },
	    STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	};

#endif
371 
/* As utf8_encoding, but the ASCII part of the type table comes from
   iasciitab.h; BT_COLON is again mapped to BT_NMSTRT for that include. */
static const struct normal_encoding internal_utf8_encoding = {
	{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
	    {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
	    },
	    STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
	};
382 
383 static
latin1_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)384 void latin1_toUtf8(const ENCODING *enc,
385 		   const char **fromP, const char *fromLim,
386 		   char **toP, const char *toLim)
387 {
388     for (;;) {
389 	unsigned char c;
390 	if (*fromP == fromLim)
391 	    break;
392 	c = (unsigned char)**fromP;
393 	if (c & 0x80) {
394 	    if (toLim - *toP < 2)
395 		break;
396 	    *(*toP)++ = ((c >> 6) | UTF8_cval2);
397 	    *(*toP)++ = ((c & 0x3f) | 0x80);
398 	    (*fromP)++;
399 	}
400 	else {
401 	    if (*toP == toLim)
402 		break;
403 	    *(*toP)++ = *(*fromP)++;
404 	}
405     }
406 }
407 
408 static
latin1_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)409 void latin1_toUtf16(const ENCODING *enc,
410 		    const char **fromP, const char *fromLim,
411 		    unsigned short **toP, const unsigned short *toLim)
412 {
413     while (*fromP != fromLim && *toP != toLim)
414 	*(*toP)++ = (unsigned char)*(*fromP)++;
415 }
416 
#ifdef XML_NS

/* Latin-1 encoding for XML_NS builds: type table from asciitab.h and
   latin1tab.h, without the BT_COLON override.  Only the STANDARD_VTABLE
   slots are initialized; the isName/isNmstrt/isInvalid pointers are left
   zero-initialized. */
static const struct normal_encoding latin1_encoding_ns = {
	{ VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
	    {
#include "asciitab.h"
#include "latin1tab.h"
	    },
	    STANDARD_VTABLE(sb_)
	};

#endif
429 
/* Latin-1 encoding; BT_COLON is mapped to BT_NMSTRT while asciitab.h is
   included.  Its type table is also consulted by checkCharRefNumber. */
static const struct normal_encoding latin1_encoding = {
	{ VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
	    {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
	    },
	    STANDARD_VTABLE(sb_)
	};
440 
441 static
ascii_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)442 void ascii_toUtf8(const ENCODING *enc,
443 		  const char **fromP, const char *fromLim,
444 		  char **toP, const char *toLim)
445 {
446     while (*fromP != fromLim && *toP != toLim)
447 	*(*toP)++ = *(*fromP)++;
448 }
449 
#ifdef XML_NS

/* ASCII encoding for XML_NS builds.  Only asciitab.h fills the type table;
   the remaining entries are zero-initialized, which the comment below
   relies on being BT_NONXML. */
static const struct normal_encoding ascii_encoding_ns = {
	{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
	    {
#include "asciitab.h"
		/* BT_NONXML == 0 */
	    },
	    STANDARD_VTABLE(sb_)
	};

#endif
462 
/* ASCII encoding; BT_COLON is mapped to BT_NMSTRT while asciitab.h is
   included.  Type-table entries not supplied by asciitab.h stay zero, i.e.
   BT_NONXML per the comment below. */
static const struct normal_encoding ascii_encoding = {
	{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
	    {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
		/* BT_NONXML == 0 */
	    },
	    STANDARD_VTABLE(sb_)
	};
473 
unicode_byte_type(char hi,char lo)474 static int unicode_byte_type(char hi, char lo)
475 {
476     switch ((unsigned char)hi) {
477 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
478 	return BT_LEAD4;
479 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
480 	return BT_TRAIL;
481     case 0xFF:
482 	switch ((unsigned char)lo) {
483 	case 0xFF:
484 	case 0xFE:
485 	    return BT_NONXML;
486 	}
487 	break;
488     }
489     return BT_NONASCII;
490 }
491 
/* Defines E##toUtf8, which converts 2-byte code units from [*fromP, fromLim)
   into UTF-8 at [*toP, toLim).  The bytes of each unit are read with GET_LO
   and GET_HI, which must be defined for the byte order at the point of
   instantiation.  Units with hi == 0 and lo < 0x80 yield 1 byte, other units
   with hi <= 7 yield 2 bytes, a unit with hi in 0xD8-0xDB is combined with
   the following unit into 4 bytes, and every other unit yields 3 bytes.
   When the output cannot hold the next character the function returns with
   *fromP at that unconverted unit; otherwise *fromP ends at the stop point.
   NOTE(review): the loop tests from != fromLim and the surrogate case reads
   the following unit without a bounds check, so the input is assumed to be
   an even number of bytes with complete surrogate pairs -- confirm. */
#define DEFINE_UTF16_TO_UTF8(E) \
static \
void E ## toUtf8(const ENCODING *enc, \
	 const char **fromP, const char *fromLim, \
	 char **toP, const char *toLim) \
{ \
  const char *from; \
  for (from = *fromP; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
	if (*toP == toLim) { \
	  *fromP = from; \
      return; \
	} \
	*(*toP)++ = lo; \
	break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
	*fromP = from; \
    return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
	*fromP = from; \
    return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim -  *toP < 4) { \
    *fromP = from; \
    return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
	       | ((GET_HI(from) & 0x3) << 2) \
	   | (lo2 >> 6) \
	   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
554 
/* Defines E##toUtf16, which copies 2-byte code units from [*fromP, fromLim)
   into unsigned short values at [*toP, toLim), assembling each value from
   GET_HI and GET_LO.  When the input holds more units than the output has
   room for and the last input unit has a high byte in 0xD8-0xDF, that unit
   is excluded from the copy. */
#define DEFINE_UTF16_TO_UTF16(E) \
static \
void E ## toUtf16(const ENCODING *enc, \
	  const char **fromP, const char *fromLim, \
	  unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
568 
/* Little-endian byte accessors: byte 0 is the low byte, byte 1 the high. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

/* Generates little2_toUtf8 and little2_toUtf16. */
DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI

/* Big-endian byte accessors: byte 0 is the high byte, byte 1 the low. */
#define SET2(ptr, ch) \
  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

/* Generates big2_toUtf8 and big2_toUtf16. */
DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
592 
/* Accessors for little-endian 2-byte characters: (p)[0] is the low byte and
   (p)[1] the high byte.  Every macro argument is parenthesized at each use,
   and the encoding cast keeps the const qualifier. */

/* Byte type: the encoding's type table when the high byte is 0, otherwise
   the classification from unicode_byte_type. */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The low byte when the high byte is 0, otherwise -1. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* True when the character equals the ASCII character c. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
/* Naming-bitmap lookups keyed by the character's high and low bytes. */
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
603 
#ifdef XML_MIN_SIZE

/* With XML_MIN_SIZE the LITTLE2_* macros are wrapped in functions that fill
   the STANDARD_VTABLE(little2_) slots of struct normal_encoding. */

/* Function form of LITTLE2_BYTE_TYPE. */
static
int little2_byteType(const ENCODING *enc, const char *p)
{
    return LITTLE2_BYTE_TYPE(enc, p);
}

/* Function form of LITTLE2_BYTE_TO_ASCII. */
static
int little2_byteToAscii(const ENCODING *enc, const char *p)
{
    return LITTLE2_BYTE_TO_ASCII(enc, p);
}

/* Function form of LITTLE2_CHAR_MATCHES. */
static
int little2_charMatches(const ENCODING *enc, const char *p, int c)
{
    return LITTLE2_CHAR_MATCHES(enc, p, c);
}

/* Function form of LITTLE2_IS_NAME_CHAR_MINBPC. */
static
int little2_isNameMin(const ENCODING *enc, const char *p)
{
    return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
}

/* Function form of LITTLE2_IS_NMSTRT_CHAR_MINBPC. */
static
int little2_isNmstrtMin(const ENCODING *enc, const char *p)
{
    return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

/* PREFIX is left unchanged here, so VTABLE1 keeps naming the functions of
   the earlier instantiation; only the two converters are little2-specific. */
#undef VTABLE
#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16

#else /* not XML_MIN_SIZE */

/* Instantiate the tokenizer template a second time, with the "little2_"
   prefix and the accessor macros bound to the LITTLE2_* versions. */
#undef PREFIX
#define PREFIX(ident) little2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#include "xmltok_impl_c.h"

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
666 
#ifdef XML_NS

/* Little-endian 2-byte encoding for XML_NS builds.  minBytesPerChar is 2;
   the last ENCODING field is 1 only when XML_BYTE_ORDER == 12 (presumably
   "this is the host's native UTF-16 order" -- confirm against xmltok.h).
   The type table (asciitab.h + latin1tab.h) serves characters whose high
   byte is 0. */
static const struct normal_encoding little2_encoding_ns = {
	    { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 12
		1
#else
0
#endif
	    },
	    {
#include "asciitab.h"
#include "latin1tab.h"
	    },
	    STANDARD_VTABLE(little2_)
	};

#endif
685 
/* Little-endian 2-byte encoding; BT_COLON is mapped to BT_NMSTRT while
   asciitab.h is included.  The last ENCODING field is 1 only when
   XML_BYTE_ORDER == 12. */
static const struct normal_encoding little2_encoding = {
	    { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 12
		1
#else
		0
#endif
	    },
	    {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
	    },
	    STANDARD_VTABLE(little2_)
	};
702 
/* The internal little-endian encodings are compiled unless XML_BYTE_ORDER
   is 21. */
#if XML_BYTE_ORDER != 21

#ifdef XML_NS

/* As little2_encoding_ns, but with iasciitab.h for the ASCII range and the
   last ENCODING field fixed at 1. */
static const struct normal_encoding internal_little2_encoding_ns = {
	{ VTABLE, 2, 0, 1 },
	    {
#include "iasciitab.h"
#include "latin1tab.h"
	    },
	    STANDARD_VTABLE(little2_)
	};

#endif

/* As little2_encoding, but with iasciitab.h for the ASCII range and the
   last ENCODING field fixed at 1. */
static const struct normal_encoding internal_little2_encoding = {
	{ VTABLE, 2, 0, 1 },
	    {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
	    },
	    STANDARD_VTABLE(little2_)
	};

#endif
730 
731 
/* Accessors for big-endian 2-byte characters: (p)[0] is the high byte and
   (p)[1] the low byte.  Every macro argument is parenthesized at each use,
   and the encoding cast keeps the const qualifier. */

/* Byte type: the encoding's type table when the high byte is 0, otherwise
   the classification from unicode_byte_type. */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* The low byte when the high byte is 0, otherwise -1. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* True when the character equals the ASCII character c. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
/* Naming-bitmap lookups keyed by the character's high and low bytes. */
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
742 
#ifdef XML_MIN_SIZE

/* With XML_MIN_SIZE the BIG2_* macros are wrapped in functions that fill
   the STANDARD_VTABLE(big2_) slots of struct normal_encoding. */

/* Function form of BIG2_BYTE_TYPE. */
static
int big2_byteType(const ENCODING *enc, const char *p)
{
    return BIG2_BYTE_TYPE(enc, p);
}

/* Function form of BIG2_BYTE_TO_ASCII. */
static
int big2_byteToAscii(const ENCODING *enc, const char *p)
{
    return BIG2_BYTE_TO_ASCII(enc, p);
}

/* Function form of BIG2_CHAR_MATCHES. */
static
int big2_charMatches(const ENCODING *enc, const char *p, int c)
{
    return BIG2_CHAR_MATCHES(enc, p, c);
}

/* Function form of BIG2_IS_NAME_CHAR_MINBPC. */
static
int big2_isNameMin(const ENCODING *enc, const char *p)
{
    return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
}

/* Function form of BIG2_IS_NMSTRT_CHAR_MINBPC. */
static
int big2_isNmstrtMin(const ENCODING *enc, const char *p)
{
    return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
}

/* PREFIX is left unchanged here, so VTABLE1 keeps naming the functions of
   the earlier instantiation; only the two converters are big2-specific. */
#undef VTABLE
#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16

#else /* not XML_MIN_SIZE */

/* Instantiate the tokenizer template again, with the "big2_" prefix and the
   accessor macros bound to the BIG2_* versions. */
#undef PREFIX
#define PREFIX(ident) big2_ ## ident
#define MINBPC(enc) 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)

#include "xmltok_impl_c.h"

#undef MINBPC
#undef BYTE_TYPE
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR

#endif /* not XML_MIN_SIZE */
805 
#ifdef XML_NS

/* Big-endian 2-byte encoding for XML_NS builds.  minBytesPerChar is 2; the
   last ENCODING field is 1 only when XML_BYTE_ORDER == 21 (presumably "this
   is the host's native UTF-16 order" -- confirm against xmltok.h). */
static const struct normal_encoding big2_encoding_ns = {
	    { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 21
		1
#else
0
#endif
	    },
	    {
#include "asciitab.h"
#include "latin1tab.h"
	    },
	    STANDARD_VTABLE(big2_)
	};

#endif
824 
/* Big-endian 2-byte encoding; BT_COLON is mapped to BT_NMSTRT while
   asciitab.h is included.  The last ENCODING field is 1 only when
   XML_BYTE_ORDER == 21. */
static const struct normal_encoding big2_encoding = {
	    { VTABLE, 2, 0,
#if XML_BYTE_ORDER == 21
		1
#else
		0
#endif
	    },
	    {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
	    },
	    STANDARD_VTABLE(big2_)
	};
841 
/* The internal big-endian encodings are compiled unless XML_BYTE_ORDER
   is 12. */
#if XML_BYTE_ORDER != 12

#ifdef XML_NS

/* As big2_encoding_ns, but with iasciitab.h for the ASCII range and the
   last ENCODING field fixed at 1. */
static const struct normal_encoding internal_big2_encoding_ns = {
	{ VTABLE, 2, 0, 1 },
	    {
#include "iasciitab.h"
#include "latin1tab.h"
	    },
	    STANDARD_VTABLE(big2_)
	};

#endif

/* As big2_encoding, but with iasciitab.h for the ASCII range and the last
   ENCODING field fixed at 1. */
static const struct normal_encoding internal_big2_encoding = {
	{ VTABLE, 2, 0, 1 },
	    {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "latin1tab.h"
	    },
	    STANDARD_VTABLE(big2_)
	};

#endif
869 
870 #undef PREFIX
871 
/* Compare two NUL-terminated strings, treating ASCII letters a-z and A-Z as
   equal.  Returns 1 when the strings match in full, 0 otherwise. */
static
int streqci(const char *s1, const char *s2)
{
    char a;
    char b;
    do {
        a = *s1++;
        b = *s2++;
        /* Fold ASCII lower case to upper case before comparing. */
        if (a >= 'a' && a <= 'z')
            a += 'A' - 'a';
        if (b >= 'a' && b <= 'z')
            b += 'A' - 'a';
        if (a != b)
            return 0;
    } while (a);
    return 1;
}
889 
890 static
initUpdatePosition(const ENCODING * enc,const char * ptr,const char * end,POSITION * pos)891 void initUpdatePosition(const ENCODING *enc, const char *ptr,
892 			const char *end, POSITION *pos)
893 {
894     normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
895 }
896 
897 static
toAscii(const ENCODING * enc,const char * ptr,const char * end)898 int toAscii(const ENCODING *enc, const char *ptr, const char *end)
899 {
900     char buf[1];
901     char *p = buf;
902     XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
903     if (p == buf)
904 	return -1;
905     else
906 	return buf[0];
907 }
908 
/* Returns 1 for space (0x20), carriage return (0xD), line feed (0xA) and
   tab (0x9); 0 for every other value. */
static
int isSpace(int c)
{
    return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
921 
/* Parse one pseudo-attribute (S name S? '=' S? quoted-value) from
   [ptr, end).

   Return 1 if there's just optional white space
   or there's an S followed by name=val.
   In the white-space-only case *namePtr is set to 0.  On success with a
   pair, *namePtr points at the name, *valPtr at the first character inside
   the quotes and *nextTokPtr just past the closing quote.
   Returns 0 on a syntax error, with *nextTokPtr at the offending position.
   Characters are examined through toAscii, so any character that does not
   convert to a single byte reads as -1. */
static
int parsePseudoAttribute(const ENCODING *enc,
			 const char *ptr,
			 const char *end,
			 const char **namePtr,
			 const char **valPtr,
			 const char **nextTokPtr)
{
    int c;
    char open;
    if (ptr == end) {
	*namePtr = 0;
	return 1;
    }
    /* A pseudo-attribute must be preceded by white space. */
    if (!isSpace(toAscii(enc, ptr, end))) {
	*nextTokPtr = ptr;
	return 0;
    }
    do {
	ptr += enc->minBytesPerChar;
    } while (isSpace(toAscii(enc, ptr, end)));
    if (ptr == end) {
	*namePtr = 0;
	return 1;
    }
    *namePtr = ptr;
    /* Scan the name up to '=' (optionally preceded by white space). */
    for (;;) {
	c = toAscii(enc, ptr, end);
	if (c == -1) {
	    *nextTokPtr = ptr;
	    return 0;
	}
	if (c == '=')
	    break;
	if (isSpace(c)) {
	    do {
		ptr += enc->minBytesPerChar;
	    } while (isSpace(c = toAscii(enc, ptr, end)));
	    if (c != '=') {
		*nextTokPtr = ptr;
		return 0;
	    }
	    break;
	}
	ptr += enc->minBytesPerChar;
    }
    /* An empty name is an error. */
    if (ptr == *namePtr) {
	*nextTokPtr = ptr;
	return 0;
    }
    /* Step over '=' and any white space before the opening quote. */
    ptr += enc->minBytesPerChar;
    c = toAscii(enc, ptr, end);
    while (isSpace(c)) {
	ptr += enc->minBytesPerChar;
	c = toAscii(enc, ptr, end);
    }
    if (c != '"' && c != '\'') {
	*nextTokPtr = ptr;
	return 0;
    }
    open = c;
    ptr += enc->minBytesPerChar;
    *valPtr = ptr;
    /* The value may contain only ASCII letters, digits, '.', '-' and '_'
       up to the matching quote. */
    for (;; ptr += enc->minBytesPerChar) {
	c = toAscii(enc, ptr, end);
	if (c == open)
	    break;
	if (!('a' <= c && c <= 'z')
		&& !('A' <= c && c <= 'Z')
		&& !('0' <= c && c <= '9')
		&& c != '.'
		&& c != '-'
		&& c != '_') {
	    *nextTokPtr = ptr;
	    return 0;
	}
    }
    *nextTokPtr = ptr + enc->minBytesPerChar;
    return 1;
}
1004 
/* Parse an XML declaration or text declaration held in [ptr, end).
   The first 5 and last 2 characters are skipped before parsing (presumably
   the leading "<?xml" and trailing "?>" -- confirm with callers).
   Pseudo-attributes are accepted in the order version, encoding,
   standalone.  When isGeneralTextEntity is nonzero, version may be omitted,
   encoding is required and standalone is rejected.
   Outputs (each optional pointer may be null): *versionPtr receives the
   start of the version value, *encodingName the start of the encoding
   value, *encoding the result of encodingFinder for that value, and
   *standalone 1 for "yes" or 0 for "no".
   Returns 1 on success; on failure returns 0 with *badPtr at the offending
   position. */
static
int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
		   const char *,
		   const char *),
		   int isGeneralTextEntity,
		   const ENCODING *enc,
		   const char *ptr,
		   const char *end,
		   const char **badPtr,
		   const char **versionPtr,
		   const char **encodingName,
		   const ENCODING **encoding,
		   int *standalone)
{
    const char *val = 0;
    const char *name = 0;
    ptr += 5 * enc->minBytesPerChar;
    end -= 2 * enc->minBytesPerChar;
    /* At least one pseudo-attribute is required. */
    if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr) || !name) {
	*badPtr = ptr;
	return 0;
    }
    if (!XmlNameMatchesAscii(enc, name, "version")) {
	if (!isGeneralTextEntity) {
	    *badPtr = name;
	    return 0;
	}
    }
    else {
	if (versionPtr)
	    *versionPtr = val;
	if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
	    *badPtr = ptr;
	    return 0;
	}
	if (!name) {
	    if (isGeneralTextEntity) {
		/* a TextDecl must have an EncodingDecl */
		*badPtr = ptr;
		return 0;
	    }
	    return 1;
	}
    }
    if (XmlNameMatchesAscii(enc, name, "encoding")) {
	/* The encoding name must start with an ASCII letter. */
	int c = toAscii(enc, val, end);
	if (!('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z')) {
	    *badPtr = val;
	    return 0;
	}
	if (encodingName)
	    *encodingName = val;
	/* ptr is just past the closing quote, so step back one character to
	   get the end of the value. */
	if (encoding)
	    *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
	if (!parsePseudoAttribute(enc, ptr, end, &name, &val, &ptr)) {
	    *badPtr = ptr;
	    return 0;
	}
	if (!name)
	    return 1;
    }
    if (!XmlNameMatchesAscii(enc, name, "standalone") || isGeneralTextEntity) {
	*badPtr = name;
	return 0;
    }
    if (XmlNameMatchesAscii(enc, val, "yes")) {
	if (standalone)
	    *standalone = 1;
    }
    else if (XmlNameMatchesAscii(enc, val, "no")) {
	if (standalone)
	    *standalone = 0;
    }
    else {
	*badPtr = val;
	return 0;
    }
    /* Only white space may follow the standalone pseudo-attribute. */
    while (isSpace(toAscii(enc, ptr, end)))
	ptr += enc->minBytesPerChar;
    if (ptr != end) {
	*badPtr = ptr;
	return 0;
    }
    return 1;
}
1090 
1091 static
checkCharRefNumber(int result)1092 int checkCharRefNumber(int result)
1093 {
1094     switch (result >> 8) {
1095 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1096 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1097 	return -1;
1098     case 0:
1099 	if (latin1_encoding.type[result] == BT_NONXML)
1100 	    return -1;
1101 	break;
1102     case 0xFF:
1103 	if (result == 0xFFFE || result == 0xFFFF)
1104 	    return -1;
1105 	break;
1106     }
1107     return result;
1108 }
1109 
XmlUtf8Encode(int c,char * buf)1110 int XmlUtf8Encode(int c, char *buf)
1111 {
1112     enum {
1113 	/* minN is minimum legal resulting value for N byte sequence */
1114 	min2 = 0x80,
1115 	min3 = 0x800,
1116 	min4 = 0x10000
1117     };
1118 
1119     if (c < 0)
1120 	return 0;
1121     if (c < min2) {
1122 	buf[0] = (c | UTF8_cval1);
1123 	return 1;
1124     }
1125     if (c < min3) {
1126 	buf[0] = ((c >> 6) | UTF8_cval2);
1127 	buf[1] = ((c & 0x3f) | 0x80);
1128 	return 2;
1129     }
1130     if (c < min4) {
1131 	buf[0] = ((c >> 12) | UTF8_cval3);
1132 	buf[1] = (((c >> 6) & 0x3f) | 0x80);
1133 	buf[2] = ((c & 0x3f) | 0x80);
1134 	return 3;
1135     }
1136     if (c < 0x110000) {
1137 	buf[0] = ((c >> 18) | UTF8_cval4);
1138 	buf[1] = (((c >> 12) & 0x3f) | 0x80);
1139 	buf[2] = (((c >> 6) & 0x3f) | 0x80);
1140 	buf[3] = ((c & 0x3f) | 0x80);
1141 	return 4;
1142     }
1143     return 0;
1144 }
1145 
/* Encode code point charNum as UTF-16 code units into buf.
   Returns 1 for values below 0x10000 (stored as a single unit, with no
   check for surrogate values), 2 for values from 0x10000 to 0x10FFFF
   (stored as a high/low surrogate pair), and 0 for negative values or
   values of 0x110000 and above, in which case buf is left untouched. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
    int offset;

    if (charNum < 0 || charNum >= 0x110000)
        return 0;

    if (charNum < 0x10000) {
        buf[0] = charNum;
        return 1;
    }

    /* supplementary plane: split the 20-bit offset into two 10-bit halves */
    offset = charNum - 0x10000;
    buf[0] = (offset >> 10) + 0xD800;
    buf[1] = (offset & 0x3FF) + 0xDC00;
    return 2;
}
1162 
1163 struct unknown_encoding {
1164     struct normal_encoding normal;
1165     int (*convert)(void *userData, const char *p);
1166     void *userData;
1167     unsigned short utf16[256];
1168     char utf8[256][4];
1169 };
1170 
XmlSizeOfUnknownEncoding()1171 int XmlSizeOfUnknownEncoding()
1172 {
1173     return sizeof(struct unknown_encoding);
1174 }
1175 
1176 static
unknown_isName(const ENCODING * enc,const char * p)1177 int unknown_isName(const ENCODING *enc, const char *p)
1178 {
1179     int c = ((const struct unknown_encoding *)enc)
1180 	    ->convert(((const struct unknown_encoding *)enc)->userData, p);
1181     if (c & ~0xFFFF)
1182 	return 0;
1183     return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1184 }
1185 
1186 static
unknown_isNmstrt(const ENCODING * enc,const char * p)1187 int unknown_isNmstrt(const ENCODING *enc, const char *p)
1188 {
1189     int c = ((const struct unknown_encoding *)enc)
1190 	    ->convert(((const struct unknown_encoding *)enc)->userData, p);
1191     if (c & ~0xFFFF)
1192 	return 0;
1193     return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1194 }
1195 
1196 static
unknown_isInvalid(const ENCODING * enc,const char * p)1197 int unknown_isInvalid(const ENCODING *enc, const char *p)
1198 {
1199     int c = ((const struct unknown_encoding *)enc)
1200 	    ->convert(((const struct unknown_encoding *)enc)->userData, p);
1201     return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1202 }
1203 
1204 static
unknown_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)1205 void unknown_toUtf8(const ENCODING *enc,
1206 		    const char **fromP, const char *fromLim,
1207 		    char **toP, const char *toLim)
1208 {
1209     char buf[XML_UTF8_ENCODE_MAX];
1210     for (;;) {
1211 	const char *utf8;
1212 	int n;
1213 	if (*fromP == fromLim)
1214 	    break;
1215 	utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
1216 	n = *utf8++;
1217 	if (n == 0) {
1218 	    int c = ((const struct unknown_encoding *)enc)
1219 		    ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1220 	    n = XmlUtf8Encode(c, buf);
1221 	    if (n > toLim - *toP)
1222 		break;
1223 	    utf8 = buf;
1224 	    *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1225 		      - (BT_LEAD2 - 2);
1226 	}
1227 	else {
1228 	    if (n > toLim - *toP)
1229 		break;
1230 	    (*fromP)++;
1231 	}
1232 	do {
1233 	    *(*toP)++ = *utf8++;
1234 	} while (--n != 0);
1235     }
1236 }
1237 
1238 static
unknown_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)1239 void unknown_toUtf16(const ENCODING *enc,
1240 		     const char **fromP, const char *fromLim,
1241 		     unsigned short **toP, const unsigned short *toLim)
1242 {
1243     while (*fromP != fromLim && *toP != toLim) {
1244 	unsigned short c
1245 	= ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
1246 	if (c == 0) {
1247 	    c = (unsigned short)((const struct unknown_encoding *)enc)
1248 		->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1249 	    *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1250 		      - (BT_LEAD2 - 2);
1251 	}
1252 	else
1253 	    (*fromP)++;
1254 	*(*toP)++ = c;
1255     }
1256 }
1257 
1258 ENCODING *
XmlInitUnknownEncoding(void * mem,int * table,int (* convert)(void * userData,const char * p),void * userData)1259 XmlInitUnknownEncoding(void *mem,
1260 		       int *table,
1261 		       int (*convert)(void *userData, const char *p),
1262 		       void *userData)
1263 {
1264     int i;
1265     struct unknown_encoding *e = mem;
1266     for (i = 0; i < sizeof(struct normal_encoding); i++)
1267 	((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1268     for (i = 0; i < 128; i++)
1269 	if (latin1_encoding.type[i] != BT_OTHER
1270 		&& latin1_encoding.type[i] != BT_NONXML
1271 		&& table[i] != i)
1272 	    return 0;
1273     for (i = 0; i < 256; i++) {
1274 	int c = table[i];
1275 	if (c == -1) {
1276 	    e->normal.type[i] = BT_MALFORM;
1277 	    /* This shouldn't really get used. */
1278 	    e->utf16[i] = 0xFFFF;
1279 	    e->utf8[i][0] = 1;
1280 	    e->utf8[i][1] = 0;
1281 	}
1282 	else if (c < 0) {
1283 	    if (c < -4)
1284 		return 0;
1285 	    e->normal.type[i] = BT_LEAD2 - (c + 2);
1286 	    e->utf8[i][0] = 0;
1287 	    e->utf16[i] = 0;
1288 	}
1289 	else if (c < 0x80) {
1290 	    if (latin1_encoding.type[c] != BT_OTHER
1291 		    && latin1_encoding.type[c] != BT_NONXML
1292 		    && c != i)
1293 		return 0;
1294 	    e->normal.type[i] = latin1_encoding.type[c];
1295 	    e->utf8[i][0] = 1;
1296 	    e->utf8[i][1] = (char)c;
1297 	    e->utf16[i] = c == 0 ? 0xFFFF : c;
1298 	}
1299 	else if (checkCharRefNumber(c) < 0) {
1300 	    e->normal.type[i] = BT_NONXML;
1301 	    /* This shouldn't really get used. */
1302 	    e->utf16[i] = 0xFFFF;
1303 	    e->utf8[i][0] = 1;
1304 	    e->utf8[i][1] = 0;
1305 	}
1306 	else {
1307 	    if (c > 0xFFFF)
1308 		return 0;
1309 	    if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1310 		e->normal.type[i] = BT_NMSTRT;
1311 	    else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1312 		e->normal.type[i] = BT_NAME;
1313 	    else
1314 		e->normal.type[i] = BT_OTHER;
1315 	    e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1316 	    e->utf16[i] = c;
1317 	}
1318     }
1319     e->userData = userData;
1320     e->convert = convert;
1321     if (convert) {
1322 	e->normal.isName2 = unknown_isName;
1323 	e->normal.isName3 = unknown_isName;
1324 	e->normal.isName4 = unknown_isName;
1325 	e->normal.isNmstrt2 = unknown_isNmstrt;
1326 	e->normal.isNmstrt3 = unknown_isNmstrt;
1327 	e->normal.isNmstrt4 = unknown_isNmstrt;
1328 	e->normal.isInvalid2 = unknown_isInvalid;
1329 	e->normal.isInvalid3 = unknown_isInvalid;
1330 	e->normal.isInvalid4 = unknown_isInvalid;
1331     }
1332     e->normal.enc.utf8Convert = unknown_toUtf8;
1333     e->normal.enc.utf16Convert = unknown_toUtf16;
1334     return &(e->normal.enc);
1335 }
1336 
/* Indices identifying the built-in encodings.
   If this enumeration is changed, getEncodingIndex and encodings
   must also be changed. */
enum {
    UNKNOWN_ENC    = -1,
    ISO_8859_1_ENC = 0,
    US_ASCII_ENC   = 1,
    UTF_8_ENC      = 2,
    UTF_16_ENC     = 3,
    UTF_16BE_ENC   = 4,
    UTF_16LE_ENC   = 5,
    /* must match encodingNames up to here */
    NO_ENC         = 6
};
1350 
1351 static
getEncodingIndex(const char * name)1352 int getEncodingIndex(const char *name)
1353 {
1354     static const char *encodingNames[] = {
1355 	"ISO-8859-1",
1356 	"US-ASCII",
1357 	"UTF-8",
1358 	"UTF-16",
1359 	"UTF-16BE"
1360 	"UTF-16LE",
1361     };
1362     int i;
1363     if (name == 0)
1364 	return NO_ENC;
1365     for (i = 0; i < sizeof(encodingNames)/sizeof(encodingNames[0]); i++)
1366 	if (streqci(name, encodingNames[i]))
1367 	    return i;
1368     return UNKNOWN_ENC;
1369 }
1370 
/* For binary compatibility, the index of the encoding specified at
   initialization is kept in the isUtf16 member of the initEnc member
   instead of in a field of its own.  The macro yields that member. */

#define INIT_ENC_INDEX(e) ((e)->initEnc.isUtf16)
1375 
1376 /* This is what detects the encoding.
1377 encodingTable maps from encoding indices to encodings;
1378 INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1379 state is XML_CONTENT_STATE if we're parsing an external text entity,
1380 and XML_PROLOG_STATE otherwise.
1381 */
1382 
1383 
1384 static
initScan(const ENCODING ** encodingTable,const INIT_ENCODING * enc,int state,const char * ptr,const char * end,const char ** nextTokPtr)1385 int initScan(const ENCODING **encodingTable,
1386 	     const INIT_ENCODING *enc,
1387 	     int state,
1388 	     const char *ptr,
1389 	     const char *end,
1390 	     const char **nextTokPtr)
1391 {
1392     const ENCODING **encPtr;
1393 
1394     if (ptr == end)
1395 	return XML_TOK_NONE;
1396     encPtr = enc->encPtr;
1397     if (ptr + 1 == end) {
1398 	/* only a single byte available for auto-detection */
1399 	/* a well-formed document entity must have more than one byte */
1400 	if (state != XML_CONTENT_STATE)
1401 	    return XML_TOK_PARTIAL;
1402 	/* so we're parsing an external text entity... */
1403 	/* if UTF-16 was externally specified, then we need at least 2 bytes */
1404 	switch (INIT_ENC_INDEX(enc)) {
1405 	case UTF_16_ENC:
1406 	case UTF_16LE_ENC:
1407 	case UTF_16BE_ENC:
1408 	    return XML_TOK_PARTIAL;
1409 	}
1410 	switch ((unsigned char)*ptr) {
1411 	case 0xFE:
1412 	case 0xFF:
1413 	case 0xEF: /* possibly first byte of UTF-8 BOM */
1414 	    if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1415 		    && state == XML_CONTENT_STATE)
1416 		break;
1417 	    /* fall through */
1418 	case 0x00:
1419 	case 0x3C:
1420 	    return XML_TOK_PARTIAL;
1421 	}
1422     }
1423     else {
1424 	switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1425 	case 0xFEFF:
1426 	    if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1427 		    && state == XML_CONTENT_STATE)
1428 		break;
1429 	    *nextTokPtr = ptr + 2;
1430 	    *encPtr = encodingTable[UTF_16BE_ENC];
1431 	    return XML_TOK_BOM;
1432 	    /* 00 3C is handled in the default case */
1433 	case 0x3C00:
1434 	    if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1435 		    || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1436 		    && state == XML_CONTENT_STATE)
1437 		break;
1438 	    *encPtr = encodingTable[UTF_16LE_ENC];
1439 	    return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1440 	case 0xFFFE:
1441 	    if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1442 		    && state == XML_CONTENT_STATE)
1443 		break;
1444 	    *nextTokPtr = ptr + 2;
1445 	    *encPtr = encodingTable[UTF_16LE_ENC];
1446 	    return XML_TOK_BOM;
1447 	case 0xEFBB:
1448 	    /* Maybe a UTF-8 BOM (EF BB BF) */
1449 	    /* If there's an explicitly specified (external) encoding
1450 	       of ISO-8859-1 or some flavour of UTF-16
1451 	       and this is an external text entity,
1452 	    don't look for the BOM,
1453 	       because it might be a legal data. */
1454 	    if (state == XML_CONTENT_STATE) {
1455 		int e = INIT_ENC_INDEX(enc);
1456 		if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
1457 		    break;
1458 	    }
1459 	    if (ptr + 2 == end)
1460 		return XML_TOK_PARTIAL;
1461 	    if ((unsigned char)ptr[2] == 0xBF) {
1462 		*encPtr = encodingTable[UTF_8_ENC];
1463 		return XML_TOK_BOM;
1464 	    }
1465 	    break;
1466 	default:
1467 	    if (ptr[0] == '\0') {
1468 		/* 0 isn't a legal data character. Furthermore a document entity can only
1469 		   start with ASCII characters.  So the only way this can fail to be big-endian
1470 		   UTF-16 if it it's an external parsed general entity that's labelled as
1471 		   UTF-16LE. */
1472 		if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1473 		    break;
1474 		*encPtr = encodingTable[UTF_16BE_ENC];
1475 		return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1476 	    }
1477 	    else if (ptr[1] == '\0') {
1478 		/* We could recover here in the case:
1479 		    - parsing an external entity
1480 		    - second byte is 0
1481 		    - no externally specified encoding
1482 		    - no encoding declaration
1483 		   by assuming UTF-16LE.  But we don't, because this would mean when
1484 		   presented just with a single byte, we couldn't reliably determine
1485 		   whether we needed further bytes. */
1486 		if (state == XML_CONTENT_STATE)
1487 		    break;
1488 		*encPtr = encodingTable[UTF_16LE_ENC];
1489 		return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1490 	    }
1491 	    break;
1492 	}
1493     }
1494     *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1495     return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1496 }
1497 
1498 
1499 #define NS(x) x
1500 #define ns(x) x
1501 #include "xmltok_ns_c.h"
1502 #undef NS
1503 #undef ns
1504 
1505 #ifdef XML_NS
1506 
1507 #define NS(x) x ## NS
1508 #define ns(x) x ## _ns
1509 
1510 #include "xmltok_ns_c.h"
1511 
1512 #undef NS
1513 #undef ns
1514 
1515 ENCODING *
XmlInitUnknownEncodingNS(void * mem,int * table,int (* convert)(void * userData,const char * p),void * userData)1516 XmlInitUnknownEncodingNS(void *mem,
1517 			 int *table,
1518 			 int (*convert)(void *userData, const char *p),
1519 			 void *userData)
1520 {
1521     ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1522     if (enc)
1523 	((struct normal_encoding *)enc)->type[':'] = BT_COLON;
1524     return enc;
1525 }
1526 
1527 #endif /* XML_NS */
1528