1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <assert.h>
4 
5 #ifdef FOR_LT
6 
7 #include "lt-memory.h"
8 #include "nsllib.h"
9 
10 #define ERR(m) LT_ERROR(NECHAR,m)
11 #define ERR1(m,x) LT_ERROR1(NECHAR,m,x)
12 #define ERR2(m,x,y) LT_ERROR2(NECHAR,m,x,y)
13 #define ERR3(m,x,y,z) LT_ERROR3(NECHAR,m,x,y,z)
14 
15 #define Malloc salloc
16 #define Realloc srealloc
17 #define Free sfree
18 
19 #else
20 
21 #include "system.h"
22 #define ERR(m) fprintf(stderr,m)
23 #define ERR1(m,x) fprintf(stderr,m,x)
24 #define ERR2(m,x,y) fprintf(stderr,m,x,y)
25 #define ERR3(m,x,y,z) fprintf(stderr,m,x,y,z)
26 
27 #endif
28 
29 #include "charset.h"
30 #include "string16.h"
31 #include "dtd.h"
32 #include "input.h"
33 #include "url.h"
34 #include "ctype16.h"
35 
36 static void internal_reader(InputSource s);
37 static void external_reader(InputSource s);
38 
SourceFromFILE16(const char8 * description,FILE16 * file16)39 InputSource SourceFromFILE16(const char8 *description, FILE16 *file16)
40 {
41     Entity e;
42 
43     e = NewExternalEntity(0, 0, description, 0, 0);
44     if(!strchr8(description, '/'))
45     {
46 	char8 *base = default_base_url();
47 	EntitySetBaseURL(e, base);
48 	Free(base);
49     }
50 
51     return NewInputSource(e, file16);
52 }
53 
SourceFromStream(const char8 * description,FILE * file)54 InputSource SourceFromStream(const char8 *description, FILE *file)
55 {
56     FILE16 *file16;
57 
58     if(!(file16 = MakeFILE16FromFILE(file, "r")))
59 	return 0;
60 
61     return SourceFromFILE16(description, file16);
62 }
63 
EntityOpen(Entity e)64 InputSource EntityOpen(Entity e)
65 {
66     FILE16 *f16;
67     char8 *r_url;
68 
69     if(e->type == ET_external)
70     {
71 	const char8 *url = EntityURL(e);
72 
73 	if(!url || !(f16 = url_open(url, 0, "r", &r_url)))
74 	    return 0;
75 	if(r_url && !e->base_url)
76 	    EntitySetBaseURL(e, r_url);
77 	Free(r_url);
78     }
79     else
80     {
81 	f16 = MakeFILE16FromString(e->text, -1, "r");
82     }
83 
84     return NewInputSource(e, f16);
85 }
86 
87 
NewInputSource(Entity e,FILE16 * f16)88 InputSource NewInputSource(Entity e, FILE16 *f16)
89 {
90     InputSource source;
91 
92     if(!(source = Malloc(sizeof(*source))))
93 	return 0;
94 
95     source->line = 0;
96     source->line_alloc = 0;
97     source->line_length = 0;
98     source->expecting_low_surrogate = 0;
99     source->complicated_utf8_line = 0;
100     source->line_is_incomplete = 0;
101     source->next = 0;
102     source->seen_eoe = 0;
103 
104     source->entity = e;
105 
106     source->reader =
107 	(e->type == ET_external) ? external_reader :internal_reader;
108     source->map = xml_char_map;	/* 1.0 map unless changed by parser */
109 
110     source->file16 = f16;
111 
112     source->bytes_consumed = 0;
113     source->bytes_before_current_line = 0;
114     source->line_end_was_cr = 0;
115     source->line_number = 0;
116     source->not_read_yet = 1;
117     source->read_carefully = 0;
118 
119     source->nextin = source->insize = 0;
120 
121     source->parent = 0;
122 
123     source->seen_error = 0;
124     strcpy(source->error_msg, "no error (you should never see this)");
125 
126     return source;
127 }
128 
/*
 * Close the source's FILE16 and release the source itself.  The line
 * buffer is freed only for external entities; for other entities
 * s->line is set by internal_reader to point into the FILE16's data
 * rather than to a buffer allocated by this module.
 */
void SourceClose(InputSource source)
{
    Entity owner;

    Fclose(source->file16);

    owner = source->entity;
    if(owner->type == ET_external)
    {
        Free(source->line);
    }

    Free(source);
}
137 
/*
 * Report a line and character position for the source's current
 * read point.
 *
 * Returns  1 when *linenum / *charnum are derived from the source's
 *            own read position (external entity, or an entity whose
 *            text matches its external parent's text);
 *          0 when they are taken only from recorded entity offsets;
 *         -1 when no position is available (outputs left untouched).
 */
int SourceLineAndChar(InputSource s, int *linenum, int *charnum)
{
    Entity ent = s->entity;
    Entity up = ent->parent;
    int column;

    /* Reading an external entity directly: the source knows exactly */
    if(ent->type == ET_external)
    {
        *linenum = s->line_number;
        *charnum = s->next;
        return 1;
    }

    if(!up)
        return -1;

    if(up->type == ET_external)
    {
        if(!ent->matches_parent_text)
        {
            *linenum = ent->line_offset;
            *charnum = ent->line1_char_offset;
            return 0;
        }

        /* Only the first line is shifted by the entity's start column */
        *linenum = ent->line_offset + s->line_number;
        column = (s->line_number == 0) ? ent->line1_char_offset : 0;
        *charnum = column + s->next;
        return 1;
    }

    if(!up->matches_parent_text)
        return -1;

    *linenum = up->line_offset + ent->line_offset;
    column = (ent->line_offset == 0) ? up->line1_char_offset : 0;
    *charnum = column + ent->line1_char_offset;
    return 0;
}
176 
/*
 * Store the source's entity in *entity and its current byte offset,
 * as computed by SourceTell(), in *byte_offset.
 */
void SourcePosition(InputSource s, Entity *entity, int *byte_offset)
{
    int where;

    *entity = s->entity;

    where = SourceTell(s);
    *byte_offset = where;
}
182 
/*
 * Return the byte offset that corresponds to the current read
 * position: the bytes before the current line plus the encoded size
 * of the first s->next characters of the line.
 *
 * With 8-bit Chars, and for the single-byte encodings, that is one
 * byte per character; for the UCS-2/UTF-16 encodings, two bytes per
 * Char.  For UTF-8 the line is scanned only if it was marked as
 * containing multi-byte sequences (complicated_utf8_line); the result
 * is cached in cached_line_char / cached_line_byte so that repeated
 * calls on a long line do not rescan from the start.
 *
 * Returns -1 for an encoding not listed here.
 */
int SourceTell(InputSource s)
{
#if CHAR_SIZE == 8
    return s->bytes_before_current_line + s->next;
#else
    switch(s->entity->encoding)
    {
    case CE_ISO_10646_UCS_2B:
    case CE_UTF_16B:
    case CE_ISO_10646_UCS_2L:
    case CE_UTF_16L:
        return s->bytes_before_current_line + 2 * s->next;
    case CE_ISO_646:
    case CE_ISO_8859_1:
    case CE_ISO_8859_2:
    case CE_ISO_8859_3:
    case CE_ISO_8859_4:
    case CE_ISO_8859_5:
    case CE_ISO_8859_6:
    case CE_ISO_8859_7:
    case CE_ISO_8859_8:
    case CE_ISO_8859_9:
    case CE_ISO_8859_10:
    case CE_ISO_8859_11:
    case CE_ISO_8859_13:
    case CE_ISO_8859_14:
    case CE_ISO_8859_15:
    case CE_unspecified_ascii_superset:
        return s->bytes_before_current_line + s->next;
    case CE_UTF_8:
        if(s->complicated_utf8_line)
        {
            /* examine earlier chars in line to see how many bytes they used */
            int i, c, n;

            /* We cache the last result to avoid N^2 slowness on very
               long lines.  Thanks to Gait Boxman for suggesting this. */

            if(s->next < s->cached_line_char)
            {
                /* Moved backwards in line; doesn't happen, I think */
                s->cached_line_char = 0;
                s->cached_line_byte = 0;
            }

            n = s->cached_line_byte;
            for(i = s->cached_line_char; i < s->next; i++)
            {
                c = s->line[i];
                if(c <= 0x7f)
                    n += 1;
                else if(c <= 0x7ff)
                    n += 2;
                else if(c >= 0xd800 && c <= 0xdfff)
                    /* One of a surrogate pair, count 2 each: the pair
                       together stands for one 4-byte sequence */
                    n += 2;
                else if(c <= 0xffff)
                    n += 3;
                /* The limits below mirror the mincode values used by
                   translate_utf8 (0x200000 and 0x4000000 start the
                   5- and 6-byte forms).  They can only be reached if
                   Char is wider than 16 bits. */
                else if(c <= 0x1fffff)
                    n += 4;
                else if(c <= 0x3ffffff)
                    n += 5;
                else
                    n += 6;
            }

            s->cached_line_char = s->next;
            s->cached_line_byte = n;

            return s->bytes_before_current_line + n;
        }
        else
            return s->bytes_before_current_line + s->next;
    default:
        return -1;
    }
#endif
}
262 
/*
 * Reposition the source at byte_offset in its FILE16.  The current
 * line and any buffered raw input are discarded, and the end-of-entity
 * flag is cleared.  Returns whatever Fseek() returns.
 */
int SourceSeek(InputSource s, int byte_offset)
{
    /* Forget the current line */
    s->next = 0;
    s->line_length = 0;
    s->seen_eoe = 0;

    /* Forget buffered input bytes */
    s->insize = 0;
    s->nextin = 0;

    /* Byte accounting restarts from the target offset */
    s->bytes_before_current_line = byte_offset;
    s->bytes_consumed = byte_offset;

    /* XXX line number will be wrong!  It is set to an obviously
       bogus value rather than left looking plausible. */
    s->line_number = -999999;

    return Fseek(s->file16, byte_offset, SEEK_SET);
}
274 
275 /* reader for internal entities, doesn't need to do any encoding translation */
276 
/*
 * Make the next line of an internal entity the current line.
 *
 * No copying is done: s->line is pointed straight at the text held by
 * the string FILE16 (handle + handle2 bytes), and handle2 is advanced
 * past the line, including its '\n' if there is one.
 *
 * At the end of the text the function reports an empty line
 * (s->line_length == 0) and leaves the previous line in place.
 * get_with_fill() reacts to an empty line by restoring the old length
 * and position, so s->line must still point at that old line; moving
 * it to the terminator would leave the restored length describing
 * memory beyond the end of the text.
 */
static void internal_reader(InputSource s)
{
    /* XXX reconsider use of FILE16 for internal entities */

    /* Local view of the leading members of the FILE16 structure.
       NOTE(review): this relies on the real structure starting with
       exactly these members -- confirm against the FILE16 definition. */
    struct _FILE16 {
	void *handle;
	int handle2, handle3;
	/* we don't need the rest here */
    };

    Char *start, *p;
    struct _FILE16 *f16 = (struct _FILE16 *)s->file16;

    start = (Char *)((char *)f16->handle + f16->handle2);

    if(!*start)
    {
	/* Nothing left.  Keep the old line, but make sure s->line is
	   not left null for an entity whose text is empty. */
	if(!s->line)
	    s->line = start;
	s->line_length = 0;
	return;
    }

    s->line = start;
    for(p = start; *p && *p != '\n'; p++)
	;
    if(*p)
	p++;			/* the newline belongs to this line */
    f16->handle2 = (char *)p - (char *)f16->handle;
    s->line_length = p - s->line;

    s->bytes_before_current_line = f16->handle2;
    s->next = 0;

    /* The first line read is line 0; each later one bumps the count */
    if(s->not_read_yet)
	s->not_read_yet = 0;
    else
	s->line_number++;
}
307 
308 /*
309  * Translate bytes starting at s->inbuf[s->nextin] until end of line
310  * or until s->nextin == s->insize.
311  * The output is placed starting at s->line[s->nextout], which must
312  * have enough space.
313  * Returns zero at end of line or error, one if more input is needed.
314  * In the case of an error (encoding error or illegal XML character) we
315  * set s->seen_error and put a BADCHAR in the output as a marker.
316  */
317 
318 
/*
 * The macros below are the shared skeleton of the translate_* functions.
 * They all expect an InputSource called `s' to be in scope and they
 * communicate through the local variables that SETUP declares, so the
 * names used here must not be changed independently of one another.
 * Several of them contain `goto' or `continue' and therefore cannot be
 * wrapped in do { } while(0).
 */

/*
 * SETUP: declare the working variables.  c holds the character being
 * processed (-1 marks an error).  inbuf/insize/startin/outbuf/map are
 * read-only snapshots of the source; nextin, nextout (which starts at
 * s->line_length) and ignore_linefeed are working copies that
 * MORE_BYTES and END_OF_LINE store back into the source.
 */
#define SETUP \
    int c;		/* can't use Char, it might be >0x10000 */ \
\
    /* local copies of fields of s, that are not modified */ \
\
    unsigned char * const inbuf = s->inbuf; \
    const int insize = s->insize; \
    const int startin = s->nextin; \
    Char * const outbuf = s->line; \
    unsigned char *map = s->map; \
\
    /* local copies of fields of s, that are modified (and restored) */ \
\
    int nextin = s->nextin; \
    int nextout = s->line_length; \
    int ignore_linefeed = s->ignore_linefeed; \

/*
 * ERROR_CHECK: if c is -1, append BADCHAR to the output, set
 * s->seen_error and jump to end_of_line.  The caller is expected to
 * have filled in s->error_msg already.
 */
#define ERROR_CHECK \
    if(c == -1) \
    { \
	/* There was an error.  Put a BADCHAR character (see input.h) in \
	   as a marker, and end the line. */ \
	outbuf[nextout++] = BADCHAR; \
	s->seen_error = 1; \
	goto end_of_line; \
    }

/*
 * LINEFEED: line-end normalisation.  Must be used directly inside the
 * translation loop because it executes `continue'.
 *  - If ignore_linefeed is set and c is '\n' (or 0x85 when the map is
 *    xml_char_map_11), the character is dropped and the bytes read so
 *    far in this call are added to s->bytes_before_current_line.
 *  - Otherwise ignore_linefeed is cleared; '\r' becomes '\n' and
 *    s->line_end_was_cr is set; with xml_char_map_11, 0x85 and 0x2028
 *    also become '\n'.
 */
#define LINEFEED \
    if((c == '\n' || (c == 0x85 && map == xml_char_map_11)) && \
       ignore_linefeed) \
    { \
	/* Ignore lf at start of line if last line ended with cr */ \
	ignore_linefeed = 0; \
	s->bytes_before_current_line += (nextin - startin); \
	continue; \
    } \
\
    ignore_linefeed = 0; \
\
    if(c == '\r') \
    { \
	s->line_end_was_cr = 1; \
	c = '\n'; \
    } \
    if((c == 0x85 || c == 0x2028) && map == xml_char_map_11) \
        c = '\n';

/*
 * OUTPUT: append c to the line and jump to end_of_line if it is '\n'.
 * The macro has no trailing semicolon; the call site supplies it.
 */
#define OUTPUT \
    outbuf[nextout++] = c; \
\
    if(c == '\n') \
        goto end_of_line

/*
 * OUTPUT_WITH_SURROGATES: like OUTPUT, but a character of 0x10000 or
 * above is written as two output elements (0xd800-based then
 * 0xdc00-based), i.e. as a surrogate pair.
 */
#define OUTPUT_WITH_SURROGATES \
    if(c >= 0x10000) \
    { \
	/* Use surrogates */ \
	outbuf[nextout++] = ((c - 0x10000) >> 10) + 0xd800; \
	outbuf[nextout++] = ((c - 0x10000) & 0x3ff) + 0xdc00; \
    } \
    else \
	outbuf[nextout++] = c; \
\
    if(c == '\n') \
        goto end_of_line

/*
 * MORE_BYTES: defines the label more_bytes.  Stores the working copies
 * back into the source and returns 1 ("more input is needed").  Placed
 * directly after the translation loop so that running out of buffered
 * bytes falls into it.
 */
#define MORE_BYTES \
 more_bytes: \
	s->nextin = nextin; \
	s->line_length = nextout; \
        s->ignore_linefeed = ignore_linefeed; \
	return 1 \

/*
 * END_OF_LINE: defines the label end_of_line.  Stores the working
 * copies back into the source and returns 0 ("line finished, or
 * error").
 */
#define END_OF_LINE \
 end_of_line: \
	s->nextin = nextin; \
	s->line_length = nextout; \
        s->ignore_linefeed = ignore_linefeed; \
	return 0
398 
399 #if CHAR_SIZE == 8
400 
/*
 * Translator used when Char is 8 bits wide: each buffered input byte
 * is passed through as one character, after a legality check against
 * the source's character map and line-end normalisation.
 * Returns 0 at end of line or on error, 1 when the buffer ran out.
 */
static int translate_8bit(InputSource s)
{
    SETUP;

    while(nextin < insize)
    {
        /* Index of the byte being examined in this iteration */
        const int at = nextin++;

        c = inbuf[at];

        if(!is_xml_legal(c, map))
        {
            sprintf(s->error_msg,
                    "Illegal character <0x%x> at file offset %d",
                    c, s->bytes_consumed + at - startin);
            c = -1;
        }

        ERROR_CHECK;

        LINEFEED;

        OUTPUT;
    }

    MORE_BYTES;

    END_OF_LINE;
}
428 
429 #else
430 
/*
 * Translator for the table-driven single-byte encodings (the
 * CE_ISO_8859_2 ... CE_ISO_8859_15 cases dispatched by
 * external_reader).  Each input byte is mapped through the
 * iso_to_unicode table for the entity's encoding; a table entry of -1
 * marks a byte that has no character in that encoding.
 *
 * Returns 0 at end of line or on error, and 1 when the buffered input
 * has been used up without finishing the line.  The second case is
 * what makes external_reader fetch the next block; without it a line
 * would be cut off at the end of whatever happened to be buffered.
 */
static int translate_latin(InputSource s)
{
    CharacterEncoding enc = s->entity->encoding;
    int *to_unicode = iso_to_unicode[enc - CE_ISO_8859_2];
    SETUP;

    while(nextin < insize)
    {
	c = to_unicode[inbuf[nextin++]];
	if(c == -1)
	{
	    sprintf(s->error_msg,
		    "Illegal byte <0x%x> for encoding %s at file offset %d",
		    inbuf[nextin-1], CharacterEncodingName[enc],
		    s->bytes_consumed + nextin - 1 - startin);
	}
	else if(!is_xml_legal(c, map))
	{
	    sprintf(s->error_msg,
		    "Illegal character <0x%x> "
		    "immediately before file offset %d",
		    c, s->bytes_consumed + nextin - startin);
	    c = -1;
	}

	ERROR_CHECK;

	LINEFEED;

	OUTPUT;
    }

    /* Buffer exhausted before the end of the line: ask for more input */
    MORE_BYTES;

    END_OF_LINE;
}
465 
/*
 * Translator for encodings whose bytes are used directly as character
 * codes (external_reader selects it for CE_ISO_646, CE_ISO_8859_1 and
 * CE_unspecified_ascii_superset).
 *
 * Returns 0 at end of line or on error, and 1 when the buffered input
 * has been used up without finishing the line, so that external_reader
 * reads another block instead of treating the end of the buffer as the
 * end of the line.
 */
static int translate_latin1(InputSource s)
{
    SETUP;

    while(nextin < insize)
    {
	c = inbuf[nextin++];
	if(!is_xml_legal(c, map))
	{
	    sprintf(s->error_msg,
		    "Illegal character <0x%x> "
		    "immediately before file offset %d",
		    c, s->bytes_consumed + nextin - startin);
	    c = -1;
	}

	ERROR_CHECK;

	LINEFEED;

	OUTPUT;
    }

    /* Buffer exhausted before the end of the line: ask for more input */
    MORE_BYTES;

    END_OF_LINE;
}
491 
/*
 * Translator for UTF-8 input.
 *
 * Bytes up to 0x7f are taken as they are.  Any other lead byte selects
 * the number of continuation bytes and the smallest code point that
 * may legitimately use that length; shorter-than-necessary encodings,
 * bad lead bytes and bad continuation bytes are reported as errors.
 * If a multi-byte sequence is not completely buffered, the lead byte
 * is pushed back and more input is requested.
 *
 * When s->read_carefully is set, the line is ended early after each
 * '>' and marked incomplete.
 *
 * Returns 0 at end of line or on error, 1 when more input is needed.
 */
static int translate_utf8(InputSource s)
{
    int more, i, mincode;
    SETUP;

    while(nextin < insize)
    {
        c = inbuf[nextin++];

        if(c > 0x7f)
        {
            if(c <= 0xc0 || c >= 0xfe)
            {
                sprintf(s->error_msg,
                   "Illegal UTF-8 start byte <0x%x> at file offset %d",
                        c, s->bytes_consumed + nextin - 1 - startin);
                c = -1;
            }
            else
            {
                /* Classify the lead byte: payload bits, number of
                   continuation bytes, smallest legal result */
                if(c <= 0xdf)
                {
                    more = 1;
                    mincode = 0x80;
                    c &= 0x1f;
                }
                else if(c <= 0xef)
                {
                    more = 2;
                    mincode = 0x800;
                    c &= 0x0f;
                }
                else if(c <= 0xf7)
                {
                    more = 3;
                    mincode = 0x10000;
                    c &= 0x07;
                }
                else if(c <= 0xfb)
                {
                    more = 4;
                    mincode = 0x200000;
                    c &= 0x03;
                }
                else
                {
                    more = 5;
                    mincode = 0x4000000;
                    c &= 0x01;
                }

                if(nextin + more > insize)
                {
                    /* Sequence straddles the buffer end: un-read the
                       lead byte and wait for the rest */
                    nextin--;
                    goto more_bytes;
                }

                /* Byte and character offsets now differ on this line */
                s->complicated_utf8_line = 1;
                s->cached_line_char = 0;
                s->cached_line_byte = 0;

                for(i = 0; i < more; i++)
                {
                    const int t = inbuf[nextin++];

                    if((t & 0xc0) == 0x80)
                        c = (c << 6) + (t & 0x3f);
                    else
                    {
                        c = -1;
                        sprintf(s->error_msg,
                          "Illegal UTF-8 byte %d <0x%x> at file offset %d",
                                i+2, t,
                                s->bytes_consumed + nextin - 1 - startin);
                        break;
                    }
                }

                if(c != -1 && c < mincode)
                {
                    sprintf(s->error_msg,
                            "Illegal (non-shortest) UTF-8 sequence for "
                            "character <0x%x> "
                            "immediately before file offset %d",
                            c, s->bytes_consumed + nextin - startin);
                    c = -1;
                }
            }
        }

        /* c is now a decoded code point, or -1 after a decoding error */
        if(c >= 0 && !is_xml_legal(c, map))
        {
            sprintf(s->error_msg,
                    "Illegal character <0x%x> "
                    "immediately before file offset %d",
                    c, s->bytes_consumed + nextin - startin);
            c = -1;
        }

        ERROR_CHECK;

        LINEFEED;

        OUTPUT_WITH_SURROGATES;

        if(s->read_carefully && c == '>')
        {
            s->line_is_incomplete = 1;
            goto end_of_line;
        }
    }

    MORE_BYTES;

    END_OF_LINE;
}
601 
/*
 * Translator for the 2-byte encodings (UCS-2 and UTF-16, either byte
 * order).  Each pair of bytes becomes one output element.  Surrogates
 * are passed through, but a low surrogate must follow a high one and
 * vice versa; s->expecting_low_surrogate carries that state from one
 * unit to the next.
 *
 * Returns 0 at end of line or on error, 1 when more input is needed
 * (including when only half of a 2-byte unit is buffered).
 */
static int translate_utf16(InputSource s)
{
    const int little_endian = (s->entity->encoding == CE_ISO_10646_UCS_2L ||
                               s->entity->encoding == CE_UTF_16L);
    /* Positions of the high- and low-order byte inside each unit */
    const int hi_at = little_endian ? 1 : 0;
    const int lo_at = little_endian ? 0 : 1;
    SETUP;

    while(nextin < insize)
    {
        if(insize - nextin < 2)
            goto more_bytes;

        c = (inbuf[nextin + hi_at] << 8) + inbuf[nextin + lo_at];
        nextin += 2;

        if(c >= 0xdc00 && c <= 0xdfff) /* low (2nd) surrogate */
        {
            if(!s->expecting_low_surrogate)
            {
                sprintf(s->error_msg,
                        "Unexpected low surrogate <0x%x> "
                        "at file offset %d",
                        c, s->bytes_consumed + nextin - startin - 2);
                c = -1;
            }
            else
                s->expecting_low_surrogate = 0;
        }
        else if(s->expecting_low_surrogate)
        {
            sprintf(s->error_msg,
                    "Expected low surrogate but got <0x%x> "
                    "at file offset %d",
                    c, s->bytes_consumed + nextin - startin - 2);
            c = -1;
        }

        if(c >= 0xd800 && c <= 0xdbff) /* high (1st) surrogate */
            s->expecting_low_surrogate = 1;

        /* surrogates are legal in utf-16, whatever the map says */
        if(c >= 0 && !is_xml_legal(c, map) &&
           (c < 0xd800 || c > 0xdfff))
        {
            sprintf(s->error_msg,
                    "Illegal character <0x%x> "
                    "immediately before file offset %d",
                    c, s->bytes_consumed + nextin - startin);
            c = -1;
        }

        ERROR_CHECK;

        LINEFEED;

        OUTPUT;
    }

    MORE_BYTES;

    END_OF_LINE;
}
665 
666 #endif
667 
/*
 * Record a failure that is not tied to a particular input byte:
 * store msg as the source's error message, append a BADCHAR marker to
 * the current line if the line buffer has room for it, and set
 * s->seen_error.
 */
static void external_reader_fail(InputSource s, const char *msg)
{
    strcpy(s->error_msg, msg);
    if(s->line_length < s->line_alloc)
	s->line[s->line_length++] = BADCHAR;
    s->seen_error = 1;
}

/*
 * Read the next line of an external entity into s->line, translating
 * from the entity's encoding with the matching translate_* function.
 *
 * Unless the previous call left the line marked incomplete, the line
 * state is reset first.  The translator is then called repeatedly;
 * whenever it asks for more input, any unconsumed bytes are moved to
 * the front of s->inbuf and another block is read from the FILE16.
 * The function returns when the translator reports end of line or an
 * error, or when no more input can be read.
 *
 * Does nothing if an error has already been seen on this source.
 * A failure to grow the line buffer, or (with assertions compiled out)
 * an encoding with no translator, is reported through s->seen_error
 * and s->error_msg like any other input error.
 */
static void external_reader(InputSource s)
{
    int startin = s->nextin;
    int (*trans)(InputSource) = 0;
    int continuing_incomplete_line = s->line_is_incomplete;

    if(s->seen_error)
	return;

    s->line_is_incomplete = 0;
    if(!continuing_incomplete_line)
    {
	s->ignore_linefeed = s->line_end_was_cr;
	s->line_end_was_cr = 0;
	s->complicated_utf8_line = 0;
	s->line_length = 0;
	s->bytes_before_current_line = s->bytes_consumed;
	s->next = 0;
    }

#if CHAR_SIZE == 8
    trans = translate_8bit;
#else
    switch(s->entity->encoding)
    {
    case CE_ISO_646:	/* should really check for >127 in this case */
    case CE_ISO_8859_1:
    case CE_unspecified_ascii_superset:
	trans = translate_latin1;
	break;
    case CE_ISO_8859_2:
    case CE_ISO_8859_3:
    case CE_ISO_8859_4:
    case CE_ISO_8859_5:
    case CE_ISO_8859_6:
    case CE_ISO_8859_7:
    case CE_ISO_8859_8:
    case CE_ISO_8859_9:
    case CE_ISO_8859_10:
    case CE_ISO_8859_11:
    case CE_ISO_8859_13:
    case CE_ISO_8859_14:
    case CE_ISO_8859_15:
	trans = translate_latin;
	break;
    case CE_UTF_8:
	trans = translate_utf8;
	break;
    case CE_ISO_10646_UCS_2B:
    case CE_UTF_16B:
    case CE_ISO_10646_UCS_2L:
    case CE_UTF_16L:
	trans = translate_utf16;
	break;
    default:
	assert(1==0);
	break;
    }
#endif

    if(!trans)
    {
	/* Only reachable when assert() is compiled out; calling
	   through an unset pointer would be undefined behaviour. */
	external_reader_fail(s, "Unsupported character encoding");
	return;
    }

    while(1)
    {
	/* There are never more characters than bytes in the input */
	int needed = s->line_length + (s->insize - s->nextin);

	if(s->line_alloc < needed)
	{
	    Char *grown = Realloc(s->line, needed * sizeof(Char));

	    if(!grown)
	    {
		/* Keep the old buffer so it can still be freed */
		external_reader_fail(s, "Out of memory while reading a line");
		return;
	    }
	    s->line = grown;
	    s->line_alloc = needed;
	}

	if(trans(s) == 0)
	{
	    s->bytes_consumed += (s->nextin - startin);
	    if(s->not_read_yet)
		s->not_read_yet = 0;
	    else if(!continuing_incomplete_line)
		s->line_number++;
	    return;
	}
	else
	{
	    int i, bytes_read, remaining = 0;

	    /* more input needed */

	    /* Copy down any partial character */

	    remaining = s->insize - s->nextin;
	    for(i=0; i<remaining; i++)
		s->inbuf[i] = s->inbuf[s->nextin + i];

	    /* Get another block */

	    s->bytes_consumed += (s->nextin - startin);

	    bytes_read = Readu(s->file16,
			       s->inbuf+remaining, sizeof(s->inbuf)-remaining);
	    s->nextin = startin = 0;

	    if(bytes_read <= 0)
	    {
		if(remaining > 0)
		{
		    /* EOF or error in the middle of a character */
		    sprintf(s->error_msg, "EOF or error inside character at "
					  "file offset %d",
			    s->bytes_consumed + remaining);
		    /* There must be space because there is unconsumed input */
		    s->line[s->line_length++] = BADCHAR;
		    s->seen_error = 1;
		}

		s->insize = 0;

		if(s->not_read_yet)
		    s->not_read_yet = 0;
		else if(!continuing_incomplete_line)
		    s->line_number++;

		return;
	    }

	    s->insize = bytes_read + remaining;
	}
    }
}
793 
/*
 * Guess the entity's encoding from the first bytes of its input.
 *
 * Up to four bytes are read into the start of s->inbuf (missing bytes
 * read as zero).  A UTF-8 or UTF-16 byte order mark selects that
 * encoding and is skipped; the byte pattern of "<?" in either 16-bit
 * byte order selects UTF-16 without skipping anything.  Otherwise the
 * default is CE_unspecified_ascii_superset with 8-bit Chars, or UTF-8
 * with s->read_carefully set.
 *
 * If the read fails (Readu returns -1) the function returns without
 * setting an encoding.
 *
 * (Detection of the 4-byte UCS-4 forms existed here but was disabled.)
 */
void determine_character_encoding(InputSource s)
{
    Entity e = s->entity;
    unsigned char *b = (unsigned char *)s->inbuf;
    int nread;
    int bom_length = 0;         /* bytes to skip when a BOM is present */

    b[0] = b[1] = b[2] = b[3] = 0;

    while(s->insize < 4)
    {
        nread = Readu(s->file16, s->inbuf + s->insize, 4 - s->insize);
        if(nread == -1)
            return;
        else if(nread == 0)
            break;
        s->insize += nread;
    }

    if(b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf)
    {
        e->encoding = CE_UTF_8;
        bom_length = 3;
    }
    else if(b[0] == 0xfe && b[1] == 0xff)
    {
        e->encoding = CE_UTF_16B;
        bom_length = 2;
    }
    else if(b[0] == 0 && b[1] == '<' && b[2] == 0 && b[3] == '?')
    {
        e->encoding = CE_UTF_16B;
    }
    else if(b[0] == 0xff && b[1] == 0xfe)
    {
        e->encoding = CE_UTF_16L;
        bom_length = 2;
    }
    else if(b[0] == '<' && b[1] == 0 && b[2] == '?' && b[3] == 0)
    {
        e->encoding = CE_UTF_16L;
    }
    else
    {
#if CHAR_SIZE == 8
        e->encoding = CE_unspecified_ascii_superset;
#else
        e->encoding = CE_UTF_8;
        s->read_carefully = 1;
#endif
    }

    /* A byte order mark is not part of the text: step over it */
    if(bom_length > 0)
    {
        s->nextin = bom_length;
        s->bytes_consumed = bom_length;
    }
}
852 
/*
 * Fetch the next line through the source's reader and return its
 * first unread character, or XEOE at end of entity.
 *
 * XEOE is returned, and s->seen_eoe set, when an error has already
 * been seen, when the reader produces an empty line (the previous
 * line's length, position and bookkeeping are put back first), or
 * when the reader leaves nothing unread (an "incomplete" line that
 * turned out to end at EOF).
 *
 * Must not be called again once end of entity has been reported.
 */
int get_with_fill(InputSource s)
{
    const int saved_next = s->next;
    const int saved_complicated = s->complicated_utf8_line;
    const int saved_bytes_before = s->bytes_before_current_line;
    const int saved_line_number = s->line_number;

    assert(!s->seen_eoe);

    if(!s->seen_error)
    {
        s->reader(s);

        if(s->line_length == 0)
        {
            /* Nothing more to read: restore old line */
            s->line_length = s->next = saved_next;
            s->complicated_utf8_line = saved_complicated;
            s->bytes_before_current_line = saved_bytes_before;
            s->line_number = saved_line_number;
        }
        else if(s->next != s->line_length)
        {
            return s->line[s->next++];
        }
        /* else: "incomplete" line turned out to be at EOF */
    }

    s->seen_eoe = 1;
    return XEOE;
}
901