1 /* XmlParser.java --
2    Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
3 
4 This file is part of GNU Classpath.
5 
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING.  If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
20 
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library.  Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
25 
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module.  An independent module is a module which is not derived from
33 or based on this library.  If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so.  If you do not wish to do so, delete this
36 exception statement from your version.
37 
38 Partly derived from code which carried the following notice:
39 
40   Copyright (c) 1997, 1998 by Microstar Software Ltd.
41 
42   AElfred is free for both commercial and non-commercial use and
43   redistribution, provided that Microstar's copyright and disclaimer are
44   retained intact.  You are free to modify AElfred for your own use and
45   to redistribute AElfred with your modifications, provided that the
46   modifications are clearly documented.
47 
48   This program is distributed in the hope that it will be useful, but
49   WITHOUT ANY WARRANTY; without even the implied warranty of
50   merchantability or fitness for a particular purpose.  Please use it AT
51   YOUR OWN RISK.
52 */
53 
54 package gnu.xml.aelfred2;
55 
import gnu.java.security.action.GetPropertyAction;

import java.io.BufferedInputStream;
import java.io.CharConversionException;
import java.io.EOFException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.security.AccessController;

import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Locale;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
76 
77 
78 /**
79  * Parse XML documents and return parse events through call-backs.
80  * Use the <code>SAXDriver</code> class as your entry point, as all
81  * internal parser interfaces are subject to change.
82  *
83  * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
84  *      (version 1.2a with bugfixes)
85  * @author Updated by David Brownell &lt;dbrownell@users.sourceforge.net&gt;
86  * @see SAXDriver
87  */
88 final class XmlParser
89 {
90 
  // avoid slow per-character readCh()
  // NOTE(review): presumably switches on bulk-scanning shortcuts in the
  // reading code; the code that tests this flag is not declared next to
  // it -- confirm at the use sites.
  private final static boolean USE_CHEATS = true;
93 
  ////////////////////////////////////////////////////////////////////////
  // Constants.
  ////////////////////////////////////////////////////////////////////////

  //
  // Constants for element content type.
  // Plain int codes; each value in this family is distinct.
  //

  /**
   * Constant: an element has not been declared.
   * @see #getElementContentType
   */
  public final static int CONTENT_UNDECLARED = 0;

  /**
   * Constant: the element has a content model of ANY.
   * @see #getElementContentType
   */
  public final static int CONTENT_ANY = 1;

  /**
   * Constant: the element has declared content of EMPTY.
   * @see #getElementContentType
   */
  public final static int CONTENT_EMPTY = 2;

  /**
   * Constant: the element has mixed content.
   * @see #getElementContentType
   */
  public final static int CONTENT_MIXED = 3;

  /**
   * Constant: the element has element content.
   * @see #getElementContentType
   */
  public final static int CONTENT_ELEMENTS = 4;


  //
  // Constants for the entity type.
  // These reuse the small integers of the content type codes above
  // (ENTITY_UNDECLARED and CONTENT_UNDECLARED are both 0), so a value from
  // one family must never be compared against the other.
  //

  /**
   * Constant: the entity has not been declared.
   * @see #getEntityType
   */
  public final static int ENTITY_UNDECLARED = 0;

  /**
   * Constant: the entity is internal.
   * @see #getEntityType
   */
  public final static int ENTITY_INTERNAL = 1;

  /**
   * Constant: the entity is external, non-parsable data.
   * @see #getEntityType
   */
  public final static int ENTITY_NDATA = 2;

  /**
   * Constant: the entity is external XML data.
   * @see #getEntityType
   */
  public final static int ENTITY_TEXT = 3;
160 
  //
  // Attribute type constants are interned literal strings.
  //

  //
  // Constants for supported encodings.  "external" is just a flag.
  //
  // These are the values held by the "encoding" field.  setupDecoding()
  // treats ENCODING_EXTERNAL as "not autodetected": only in that state may
  // it hand decoding over to an InputStreamReader.  parseXMLDecl() and
  // setupDecoding() pair ENCODING_UCS_2_12 with "UTF-16BE" and
  // ENCODING_UCS_2_21 with "UTF-16LE".
  // NOTE(review): the digit suffixes of the UCS-4 codes presumably spell
  // the byte order of each variant -- confirm against the detection code.
  //
  private final static int ENCODING_EXTERNAL = 0;
  private final static int ENCODING_UTF_8 = 1;
  private final static int ENCODING_ISO_8859_1 = 2;
  private final static int ENCODING_UCS_2_12 = 3;
  private final static int ENCODING_UCS_2_21 = 4;
  private final static int ENCODING_UCS_4_1234 = 5;
  private final static int ENCODING_UCS_4_4321 = 6;
  private final static int ENCODING_UCS_4_2143 = 7;
  private final static int ENCODING_UCS_4_3412 = 8;
  private final static int ENCODING_ASCII = 9;
178 
  //
  // Constants for attribute default value.
  // These codes start at 30, so they do not overlap the 0-based content
  // type and entity type codes.
  //

  /**
   * Constant: the attribute is not declared.
   * @see #getAttributeDefaultValueType
   */
  public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;

  /**
   * Constant: the attribute has a literal default value specified.
   * @see #getAttributeDefaultValueType
   * @see #getAttributeDefaultValue
   */
  public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;

  /**
   * Constant: the attribute was declared #IMPLIED.
   * @see #getAttributeDefaultValueType
   */
  public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;

  /**
   * Constant: the attribute was declared #REQUIRED.
   * @see #getAttributeDefaultValueType
   */
  public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;

  /**
   * Constant: the attribute was declared #FIXED.
   * @see #getAttributeDefaultValueType
   * @see #getAttributeDefaultValue
   */
  public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
214 
  //
  // Constants for input.
  // Values of the "sourceType" field; setupDecoding() switches it to
  // INPUT_READER once it has wrapped the byte stream in a reader.
  //
  private final static int INPUT_NONE = 0;
  private final static int INPUT_INTERNAL = 1;
  private final static int INPUT_STREAM = 3;
  private final static int INPUT_READER = 5;

  //
  // Flags for reading literals.
  // Each flag is a distinct bit, so they are combined with "|"; the XML
  // and text declaration parsers, for example, pass
  // LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF to readLiteral().
  //
  // expand general entity refs (attribute values in dtd and content)
  private final static int LIT_ENTITY_REF = 2;
  // normalize this value (space chars) (attributes, public ids)
  private final static int LIT_NORMALIZE = 4;
  // literal is an attribute value
  private final static int LIT_ATTRIBUTE = 8;
  // don't expand parameter entities
  private final static int LIT_DISABLE_PE = 16;
  // don't expand [or parse] character refs
  private final static int LIT_DISABLE_CREF = 32;
  // don't parse general entity refs
  private final static int LIT_DISABLE_EREF = 64;
  // literal is a public ID value
  private final static int LIT_PUBID = 256;

  //
  // Flags affecting PE handling in DTDs (if expandPE is true).
  // PEs expand with space padding, except inside literals.
  //
  private final static int CONTEXT_NORMAL = 0;
  private final static int CONTEXT_LITERAL = 1;
247 
248   // Emit warnings for relative URIs with no base URI.
249   static boolean uriWarnings;
250   static
251   {
252     String key = "gnu.xml.aelfred2.XmlParser.uriWarnings";
253     GetPropertyAction a = new GetPropertyAction(key);
254     uriWarnings = "true".equals(AccessController.doPrivileged(a));
255   }
256 
  //
  // The current XML handler interface.
  // Installed through setHandler(); doParse() refuses to run while this
  // is still null.
  //
  private SAXDriver handler;

  //
  // I/O information.
  //
  private Reader reader;   // current reader
  private InputStream is;     // current input stream (closed by doParse)
  private int line;     // current line number
  private int column;   // current column number
  private int sourceType;   // type of input source (an INPUT_* code)
  private LinkedList inputStack;   // stack of input sources
  private URLConnection externalEntity; // current external entity
  private int encoding;   // current character encoding (an ENCODING_* code)
  private int currentByteCount; // bytes read from current source
  private InputSource scratch;  // temporary; cleared at the end of doParse

  //
  // Buffers for decoded but unparsed character input.
  //
  private char[] readBuffer;
  private int readBufferPos;
  private int readBufferLength;
  private int readBufferOverflow;  // overflow from last data chunk.

  //
  // Buffer for undecoded raw byte input.
  //
  private final static int READ_BUFFER_MAX = 16384;
  private byte[] rawReadBuffer;
290 
291   //
292   // Buffer for attribute values, char refs, DTD stuff.
293   //
294   private static int DATA_BUFFER_INITIAL = 4096;
295   private char[] dataBuffer;
296   private int dataBufferPos;
297 
298   //
299   // Buffer for parsed names.
300   //
301   private static int NAME_BUFFER_INITIAL = 1024;
302   private char[] nameBuffer;
303   private int nameBufferPos;
304 
  //
  // Save any standalone flag
  // (set to true by parseXMLDecl() when the declaration says
  // standalone='yes').
  //
  private boolean docIsStandalone;

  //
  // Hashtables for DTD information on elements, entities, and notations.
  // Populated until we start ignoring decls (because of skipping a PE)
  //
  private HashMap elementInfo;
  private HashMap entityInfo;
  private HashMap notationInfo;
  private boolean skippedPE;

  //
  // Element type currently in force.
  //
  private String currentElement;
  private int currentElementContent;

  //
  // Stack of entity names, to detect recursion.
  //
  private LinkedList entityStack;

  //
  // PE expansion is enabled in most chunks of the DTD, not all.
  // When it's enabled, literals are treated differently.
  // parseComment() and parsePI() save expandPE, clear it while they scan
  // their content, and restore it afterwards.
  //
  private boolean inLiteral;
  private boolean expandPE;
  private boolean peIsError;

  //
  // can't report entity expansion inside two constructs:
  // - attribute expansions (internal entities only)
  // - markup declarations (parameter entities only)
  //
  private boolean doReport;
344 
  //
  // Symbol table, for caching interned names.
  //
  // These show up wherever XML names or nmtokens are used:  naming elements,
  // attributes, PIs, notations, entities, and enumerated attribute values.
  //
  // NOTE:  This hashtable doesn't grow.  The default size is intended to be
  // rather large for most documents.  Example:  one snapshot of the DocBook
  // XML 4.1 DTD used only about 350 such names.  As a rule, only pathological
  // documents (ones that don't reuse names) should ever see much collision.
  //
  // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing.
  // "2039" keeps the hash table size at about two memory pages on typical
  // 32 bit hardware.
  //
  private final static int SYMBOL_TABLE_LENGTH = 2039;

  // NOTE(review): presumably one Object[] bucket per hash slot, with
  // SYMBOL_TABLE_LENGTH slots -- confirm where the table is allocated.
  private Object[][] symbolTable;
363 
  //
  // Hash table of attributes found in current start tag.
  // (Declared as a plain String[] together with a fill position.)
  //
  private String[] tagAttributes;
  private int tagAttributePos;

  //
  // Utility flag: have we noticed a CR while reading the last
  // data chunk?  If so, we will have to go back and normalise
  // CR or CR/LF line ends.
  //
  private boolean sawCR;

  //
  // Utility flag: are we in CDATA?  If so, whitespace isn't ignorable.
  //
  private boolean inCDATA;

  //
  // Xml version.
  // Defaults to XML_10; parseXMLDecl() and parseTextDecl() switch it to
  // XML_11 when a declaration carries version "1.1".
  //
  private static final int XML_10 = 0;
  private static final int XML_11 = 1;
  private int xmlVersion = XML_10;
388 
389   //////////////////////////////////////////////////////////////////////
390   // Constructors.
391   ////////////////////////////////////////////////////////////////////////
392 
  /**
   * Construct a new parser with no associated handler.
   * A handler has to be installed with <code>setHandler</code> before
   * <code>doParse</code> is called; <code>doParse</code> throws
   * <code>IllegalStateException</code> otherwise.
   * @see #setHandler
   * @see #doParse
   */
  // package private
  XmlParser()
  {
  }
402 
  /**
   * Set the handler that will receive parsing events.
   * The reference is simply stored; it is not validated here, but
   * <code>doParse</code> rejects a null handler.
   * @param handler The handler to receive callback events.
   * @see #doParse
   */
  // package private
  void setHandler(SAXDriver handler)
  {
    this.handler = handler;
  }
413 
414   /**
415    * Parse an XML document from the character stream, byte stream, or URI
416    * that you provide (in that order of preference).  Any URI that you
417    * supply will become the base URI for resolving relative URI, and may
418    * be used to acquire a reader or byte stream.
419    *
420    * <p> Only one thread at a time may use this parser; since it is
421    * private to this package, post-parse cleanup is done by the caller,
422    * which MUST NOT REUSE the parser (just null it).
423    *
424    * @param systemId Absolute URI of the document; should never be null,
425    *    but may be so iff a reader <em>or</em> a stream is provided.
426    * @param publicId The public identifier of the document, or null.
427    * @param reader A character stream; must be null if stream isn't.
428    * @param stream A byte input stream; must be null if reader isn't.
429    * @param encoding The suggested encoding, or null if unknown.
430    * @exception java.lang.Exception Basically SAXException or IOException
431    */
432   // package private
doParse(String systemId, String publicId, Reader reader, InputStream stream, String encoding)433   void doParse(String systemId, String publicId, Reader reader,
434                InputStream stream, String encoding)
435     throws Exception
436   {
437     if (handler == null)
438       {
439         throw new IllegalStateException("no callback handler");
440       }
441 
442     initializeVariables();
443 
444     // predeclare the built-in entities here (replacement texts)
445     // we don't need to intern(), since we're guaranteed literals
446     // are always (globally) interned.
447     setInternalEntity("amp", "&#38;");
448     setInternalEntity("lt", "&#60;");
449     setInternalEntity("gt", "&#62;");
450     setInternalEntity("apos", "&#39;");
451     setInternalEntity("quot", "&#34;");
452 
453     try
454       {
455         // pushURL first to ensure locator is correct in startDocument
456         // ... it might report an IO or encoding exception.
457         handler.startDocument();
458         pushURL(false, "[document]",
459                 // default baseURI: null
460                 new ExternalIdentifiers(publicId, systemId, null),
461                 reader, stream, encoding, false);
462 
463         parseDocument();
464       }
465     catch (EOFException e)
466       {
467         //empty input
468         error("empty document, with no root element.");
469       }
470     finally
471       {
472         if (reader != null)
473           {
474             try
475               {
476                 reader.close();
477               }
478             catch (IOException e)
479               {
480                 /* ignore */
481               }
482           }
483         if (stream != null)
484           {
485             try
486               {
487                 stream.close();
488               }
489             catch (IOException e)
490               {
491                 /* ignore */
492               }
493           }
494         if (is != null)
495           {
496             try
497               {
498                 is.close();
499               }
500             catch (IOException e)
501               {
502                 /* ignore */
503               }
504           }
505         scratch = null;
506       }
507   }
508 
509   //////////////////////////////////////////////////////////////////////
510   // Error reporting.
511   //////////////////////////////////////////////////////////////////////
512 
513   /**
514    * Report an error.
515    * @param message The error message.
516    * @param textFound The text that caused the error (or null).
517    * @see SAXDriver#error
518    * @see #line
519    */
error(String message, String textFound, String textExpected)520   private void error(String message, String textFound, String textExpected)
521     throws SAXException
522   {
523     if (textFound != null)
524       {
525         message = message + " (found \"" + textFound + "\")";
526       }
527     if (textExpected != null)
528       {
529         message = message + " (expected \"" + textExpected + "\")";
530       }
531     handler.fatal(message);
532 
533     // "can't happen"
534     throw new SAXException(message);
535   }
536 
537   /**
538    * Report a serious error.
539    * @param message The error message.
540    * @param textFound The text that caused the error (or null).
541    */
error(String message, char textFound, String textExpected)542   private void error(String message, char textFound, String textExpected)
543     throws SAXException
544   {
545     error(message, Character.toString(textFound), textExpected);
546   }
547 
548   /**
549    * Report typical case fatal errors.
550    */
error(String message)551   private void error(String message)
552     throws SAXException
553   {
554     handler.fatal(message);
555   }
556 
557   //////////////////////////////////////////////////////////////////////
558   // Major syntactic productions.
559   //////////////////////////////////////////////////////////////////////
560 
561   /**
562    * Parse an XML document.
563    * <pre>
564    * [1] document ::= prolog element Misc*
565    * </pre>
566    * <p>This is the top-level parsing function for a single XML
567    * document.  As a minimum, a well-formed document must have
568    * a document element, and a valid document must have a prolog
569    * (one with doctype) as well.
570    */
parseDocument()571   private void parseDocument()
572     throws Exception
573   {
574     try
575       {                                       // added by MHK
576         boolean sawDTD = parseProlog();
577         require('<');
578         parseElement(!sawDTD);
579       }
580     catch (EOFException ee)
581       {                 // added by MHK
582         error("premature end of file", "[EOF]", null);
583       }
584 
585     try
586       {
587         parseMisc();   //skip all white, PIs, and comments
588         char c = readCh();    //if this doesn't throw an exception...
589         error("unexpected characters after document end", c, null);
590       }
591     catch (EOFException e)
592       {
593         return;
594       }
595   }
596 
  // Comment delimiters as char arrays; parseComment() scans up to
  // endDelimComment and then requires the closing '>' separately.
  static final char[] startDelimComment = { '<', '!', '-', '-' };
  static final char[] endDelimComment = { '-', '-' };
599 
600   /**
601    * Skip a comment.
602    * <pre>
603    * [15] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
604    * </pre>
605    * <p> (The <code>&lt;!--</code> has already been read.)
606    */
parseComment()607   private void parseComment()
608     throws Exception
609   {
610     char c;
611     boolean saved = expandPE;
612 
613     expandPE = false;
614     parseUntil(endDelimComment);
615     require('>');
616     expandPE = saved;
617     handler.comment(dataBuffer, 0, dataBufferPos);
618     dataBufferPos = 0;
619   }
620 
  // Processing instruction delimiters as char arrays; parsePI() uses
  // endDelimPI both to detect an empty PI and to find the end of the data.
  static final char[] startDelimPI = { '<', '?' };
  static final char[] endDelimPI = { '?', '>' };
623 
624   /**
625    * Parse a processing instruction and do a call-back.
626    * <pre>
627    * [16] PI ::= '&lt;?' PITarget
628    *    (S (Char* - (Char* '?&gt;' Char*)))?
629    *    '?&gt;'
630    * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
631    * </pre>
632    * <p> (The <code>&lt;?</code> has already been read.)
633    */
parsePI()634   private void parsePI()
635     throws SAXException, IOException
636   {
637     String name;
638     boolean saved = expandPE;
639 
640     expandPE = false;
641     name = readNmtoken(true);
642     //NE08
643     if (name.indexOf(':') >= 0)
644       {
645         error("Illegal character(':') in processing instruction name ",
646               name, null);
647       }
648     if ("xml".equalsIgnoreCase(name))
649       {
650         error("Illegal processing instruction target", name, null);
651       }
652     if (!tryRead(endDelimPI))
653       {
654         requireWhitespace();
655         parseUntil(endDelimPI);
656       }
657     expandPE = saved;
658     handler.processingInstruction(name, dataBufferToString());
659   }
660 
  // Closing delimiter of a CDATA section, as scanned for by parseCDSect().
  static final char[] endDelimCDATA = { ']', ']', '>' };

  // NOTE(review): purpose not evident from the declaration; presumably
  // tracks whether state for the current element needs refreshing --
  // confirm at the places that read and write it.
  private boolean isDirtyCurrentElement;
664 
  /**
   * Parse a CDATA section.
   * <pre>
   * [18] CDSect ::= CDStart CData CDEnd
   * [19] CDStart ::= '&lt;![CDATA['
   * [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
   * [21] CDEnd ::= ']]&gt;'
   * </pre>
   * <p> (The '&lt;![CDATA[' has already been read.)
   * <p> The section is scanned up to the <code>]]&gt;</code> delimiter
   * and the data buffer is then flushed.
   */
  private void parseCDSect()
    throws Exception
  {
    // scan up to the closing "]]>"
    parseUntil(endDelimCDATA);
    // NOTE(review): presumably delivers the buffered text to the handler
    // -- confirm in dataBufferFlush()
    dataBufferFlush();
  }
681 
682   /**
683    * Parse the prolog of an XML document.
684    * <pre>
685    * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
686    * </pre>
687    * <p>We do not look for the XML declaration here, because it was
688    * handled by pushURL ().
689    * @see pushURL
690    * @return true if a DTD was read.
691    */
parseProlog()692   private boolean parseProlog()
693     throws Exception
694   {
695     parseMisc();
696 
697     if (tryRead("<!DOCTYPE"))
698       {
699         parseDoctypedecl();
700         parseMisc();
701         return true;
702       }
703     return false;
704   }
705 
checkLegalVersion(String version)706   private void checkLegalVersion(String version)
707     throws SAXException
708   {
709     int len = version.length();
710     for (int i = 0; i < len; i++)
711       {
712         char c = version.charAt(i);
713         if ('0' <= c && c <= '9')
714           {
715             continue;
716           }
717         if (c == '_' || c == '.' || c == ':' || c == '-')
718           {
719             continue;
720           }
721         if ('a' <= c && c <= 'z')
722           {
723             continue;
724           }
725         if ('A' <= c && c <= 'Z')
726           {
727             continue;
728           }
729         error ("illegal character in version", version, "1.0");
730       }
731   }
732 
733   /**
734    * Parse the XML declaration.
735    * <pre>
736    * [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
737    * [24] VersionInfo ::= S 'version' Eq
738    *    ("'" VersionNum "'" | '"' VersionNum '"' )
739    * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
740    * [32] SDDecl ::= S 'standalone' Eq
741    *    ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' )
742    * [80] EncodingDecl ::= S 'encoding' Eq
743    *    ( "'" EncName "'" | "'" EncName "'" )
744    * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
745    * </pre>
746    * <p> (The <code>&lt;?xml</code> and whitespace have already been read.)
747    * @return the encoding in the declaration, uppercased; or null
748    * @see #parseTextDecl
749    * @see #setupDecoding
750    */
parseXMLDecl(boolean ignoreEncoding)751   private String parseXMLDecl(boolean ignoreEncoding)
752     throws SAXException, IOException
753   {
754     String version;
755     String encodingName = null;
756     String standalone = null;
757     int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
758     String inputEncoding = null;
759 
760     switch (this.encoding)
761       {
762       case ENCODING_EXTERNAL:
763       case ENCODING_UTF_8:
764         inputEncoding = "UTF-8";
765         break;
766       case ENCODING_ISO_8859_1:
767         inputEncoding = "ISO-8859-1";
768         break;
769       case ENCODING_UCS_2_12:
770         inputEncoding = "UTF-16BE";
771         break;
772       case ENCODING_UCS_2_21:
773         inputEncoding = "UTF-16LE";
774         break;
775       }
776 
777     // Read the version.
778     require("version");
779     parseEq();
780     checkLegalVersion(version = readLiteral(flags));
781     if (!version.equals("1.0"))
782       {
783         if (version.equals("1.1"))
784           {
785             handler.warn("expected XML version 1.0, not: " + version);
786             xmlVersion = XML_11;
787           }
788         else
789           {
790             error("illegal XML version", version, "1.0 or 1.1");
791           }
792       }
793     else
794       {
795         xmlVersion = XML_10;
796       }
797     // Try reading an encoding declaration.
798     boolean white = tryWhitespace();
799 
800     if (tryRead("encoding"))
801       {
802         if (!white)
803           {
804             error("whitespace required before 'encoding='");
805           }
806         parseEq();
807         encodingName = readLiteral(flags);
808         if (!ignoreEncoding)
809           {
810             setupDecoding(encodingName);
811           }
812       }
813 
814     // Try reading a standalone declaration
815     if (encodingName != null)
816       {
817         white = tryWhitespace();
818       }
819     if (tryRead("standalone"))
820       {
821         if (!white)
822           {
823             error("whitespace required before 'standalone='");
824           }
825         parseEq();
826         standalone = readLiteral(flags);
827         if ("yes".equals(standalone))
828           {
829             docIsStandalone = true;
830           }
831         else if (!"no".equals(standalone))
832           {
833             error("standalone flag must be 'yes' or 'no'");
834           }
835       }
836 
837     skipWhitespace();
838     require("?>");
839 
840     if (inputEncoding == null)
841       {
842         inputEncoding = encodingName;
843       }
844     return encodingName;
845   }
846 
847   /**
848    * Parse a text declaration.
849    * <pre>
850    * [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'
851    * [80] EncodingDecl ::= S 'encoding' Eq
852    *    ( '"' EncName '"' | "'" EncName "'" )
853    * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
854    * </pre>
855    * <p> (The <code>&lt;?xml</code>' and whitespace have already been read.)
856    * @return the encoding in the declaration, uppercased; or null
857    * @see #parseXMLDecl
858    * @see #setupDecoding
859    */
parseTextDecl(boolean ignoreEncoding)860   private String parseTextDecl(boolean ignoreEncoding)
861     throws SAXException, IOException
862   {
863     String encodingName = null;
864     int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
865 
866     // Read an optional version.
867     if (tryRead ("version"))
868       {
869         String version;
870         parseEq();
871         checkLegalVersion(version = readLiteral(flags));
872 
873         if (version.equals("1.1"))
874           {
875             if (xmlVersion == XML_10)
876               {
877                 error("external subset has later version number.", "1.0",
878                       version);
879               }
880             handler.warn("expected XML version 1.0, not: " + version);
881             xmlVersion = XML_11;
882           }
883         else if (!version.equals("1.0"))
884           {
885             error("illegal XML version", version, "1.0 or 1.1");
886           }
887         requireWhitespace();
888       }
889 
890     // Read the encoding.
891     require("encoding");
892     parseEq();
893     encodingName = readLiteral(flags);
894     if (!ignoreEncoding)
895       {
896         setupDecoding(encodingName);
897       }
898     skipWhitespace();
899     require("?>");
900 
901     return encodingName;
902   }
903 
904   /**
905    * Sets up internal state so that we can decode an entity using the
906    * specified encoding.  This is used when we start to read an entity
907    * and we have been given knowledge of its encoding before we start to
908    * read any data (e.g. from a SAX input source or from a MIME type).
909    *
910    * <p> It is also used after autodetection, at which point only very
911    * limited adjustments to the encoding may be used (switching between
912    * related builtin decoders).
913    *
914    * @param encodingName The name of the encoding specified by the user.
915    * @exception IOException if the encoding isn't supported either
916    *  internally to this parser, or by the hosting JVM.
917    * @see #parseXMLDecl
918    * @see #parseTextDecl
919      */
setupDecoding(String encodingName)920   private void setupDecoding(String encodingName)
921     throws SAXException, IOException
922   {
923     encodingName = encodingName.toUpperCase();
924 
925     // ENCODING_EXTERNAL indicates an encoding that wasn't
926     // autodetected ... we can use builtin decoders, or
927     // ones from the JVM (InputStreamReader).
928 
929     // Otherwise we can only tweak what was autodetected, and
930     // only for single byte (ASCII derived) builtin encodings.
931 
932     // ASCII-derived encodings
933     if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL)
934       {
935         if (encodingName.equals("ISO-8859-1")
936             || encodingName.equals("8859_1")
937             || encodingName.equals("ISO8859_1"))
938           {
939             encoding = ENCODING_ISO_8859_1;
940             return;
941           }
942         else if (encodingName.equals("US-ASCII")
943                  || encodingName.equals("ASCII"))
944           {
945             encoding = ENCODING_ASCII;
946             return;
947           }
948         else if (encodingName.equals("UTF-8")
949                  || encodingName.equals("UTF8"))
950           {
951             encoding = ENCODING_UTF_8;
952             return;
953           }
954         else if (encoding != ENCODING_EXTERNAL)
955           {
956             // used to start with a new reader ...
957             throw new UnsupportedEncodingException(encodingName);
958           }
959         // else fallthrough ...
960         // it's ASCII-ish and something other than a builtin
961       }
962 
963     // Unicode and such
964     if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21)
965       {
966         if (!(encodingName.equals("ISO-10646-UCS-2")
967               || encodingName.equals("UTF-16")
968               || encodingName.equals("UTF-16BE")
969               || encodingName.equals("UTF-16LE")))
970           {
971             error("unsupported Unicode encoding", encodingName, "UTF-16");
972           }
973         return;
974       }
975 
976     // four byte encodings
977     if (encoding == ENCODING_UCS_4_1234
978         || encoding == ENCODING_UCS_4_4321
979         || encoding == ENCODING_UCS_4_2143
980         || encoding == ENCODING_UCS_4_3412)
981       {
982         // Strictly:  "UCS-4" == "UTF-32BE"; also, "UTF-32LE" exists
983         if (!encodingName.equals("ISO-10646-UCS-4"))
984           {
985             error("unsupported 32-bit encoding", encodingName,
986                   "ISO-10646-UCS-4");
987           }
988         return;
989       }
990 
991     // assert encoding == ENCODING_EXTERNAL
992     // if (encoding != ENCODING_EXTERNAL)
993     //     throw new RuntimeException ("encoding = " + encoding);
994 
995     if (encodingName.equals("UTF-16BE"))
996       {
997         encoding = ENCODING_UCS_2_12;
998         return;
999       }
1000     if (encodingName.equals("UTF-16LE"))
1001       {
1002         encoding = ENCODING_UCS_2_21;
1003         return;
1004       }
1005 
1006     // We couldn't use the builtin decoders at all.  But we can try to
1007     // create a reader, since we haven't messed up buffering.  Tweak
1008     // the encoding name if necessary.
1009 
1010     if (encodingName.equals("UTF-16")
1011         || encodingName.equals("ISO-10646-UCS-2"))
1012       {
1013         encodingName = "Unicode";
1014       }
1015     // Ignoring all the EBCDIC aliases here
1016 
1017     reader = new InputStreamReader(is, encodingName);
1018     sourceType = INPUT_READER;
1019   }
1020 
1021   /**
1022    * Parse miscellaneous markup outside the document element and DOCTYPE
1023    * declaration.
1024    * <pre>
1025    * [27] Misc ::= Comment | PI | S
1026    * </pre>
1027    */
parseMisc()1028   private void parseMisc()
1029     throws Exception
1030   {
1031     while (true)
1032       {
1033         skipWhitespace();
1034         if (tryRead(startDelimPI))
1035           {
1036             parsePI();
1037           }
1038         else if (tryRead(startDelimComment))
1039           {
1040             parseComment();
1041           }
1042         else
1043           {
1044             return;
1045           }
1046       }
1047   }
1048 
1049   /**
1050    * Parse a document type declaration.
1051    * <pre>
1052    * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
1053    *    ('[' (markupdecl | PEReference | S)* ']' S?)? '&gt;'
1054    * </pre>
1055    * <p> (The <code>&lt;!DOCTYPE</code> has already been read.)
1056    */
parseDoctypedecl()1057   private void parseDoctypedecl()
1058     throws Exception
1059   {
1060     String rootName;
1061     ExternalIdentifiers ids;
1062 
1063     // Read the document type name.
1064     requireWhitespace();
1065     rootName = readNmtoken(true);
1066 
1067     // Read the External subset's IDs
1068     skipWhitespace();
1069     ids = readExternalIds(false, true);
1070 
1071     // report (a) declaration of name, (b) lexical info (ids)
1072     handler.doctypeDecl(rootName, ids.publicId, ids.systemId);
1073 
1074     // Internal subset is parsed first, if present
1075     skipWhitespace();
1076     if (tryRead('['))
1077       {
1078 
1079         // loop until the subset ends
1080         while (true)
1081           {
1082             doReport = expandPE = true;
1083             skipWhitespace();
1084             doReport = expandPE = false;
1085             if (tryRead(']'))
1086               {
1087                 break;     // end of subset
1088               }
1089             else
1090               {
1091                 // WFC, PEs in internal subset (only between decls)
1092                 peIsError = expandPE = true;
1093                 parseMarkupdecl();
1094                 peIsError = expandPE = false;
1095               }
1096           }
1097       }
1098     skipWhitespace();
1099     require('>');
1100 
1101     // Read the external subset, if any
1102     InputSource subset;
1103 
1104     if (ids.systemId == null)
1105       {
1106         subset = handler.getExternalSubset(rootName,
1107                                            handler.getSystemId());
1108       }
1109     else
1110       {
1111         subset = null;
1112       }
1113     if (ids.systemId != null || subset != null)
1114       {
1115         pushString(null, ">");
1116 
1117         // NOTE:  [dtd] is so we say what SAX2 expects,
1118         // though it's misleading (subset, not entire dtd)
1119         if (ids.systemId != null)
1120           {
1121             pushURL(true, "[dtd]", ids, null, null, null, true);
1122           }
1123         else
1124           {
1125             handler.warn("modifying document by adding external subset");
1126             pushURL(true, "[dtd]",
1127                     new ExternalIdentifiers(subset.getPublicId(),
1128                                             subset.getSystemId(),
1129                                             null),
1130                     subset.getCharacterStream(),
1131                     subset.getByteStream(),
1132                     subset.getEncoding(),
1133                     false);
1134           }
1135 
1136         // Loop until we end up back at '>'
1137         while (true)
1138           {
1139             doReport = expandPE = true;
1140             skipWhitespace();
1141             doReport = expandPE = false;
1142             if (tryRead('>'))
1143               {
1144                 break;
1145               }
1146             else
1147               {
1148                 expandPE = true;
1149                 parseMarkupdecl();
1150                 expandPE = false;
1151               }
1152           }
1153 
1154         // the ">" string isn't popped yet
1155         if (inputStack.size() != 1)
1156           {
1157             error("external subset has unmatched '>'");
1158           }
1159       }
1160 
1161     // done dtd
1162     handler.endDoctype();
1163     expandPE = false;
1164     doReport = true;
1165   }
1166 
1167   /**
1168    * Parse a markup declaration in the internal or external DTD subset.
1169    * <pre>
1170    * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
1171    *    | NotationDecl | PI | Comment
1172    * [30] extSubsetDecl ::= (markupdecl | conditionalSect
1173    *    | PEReference | S) *
1174    * </pre>
1175    * <p> Reading toplevel PE references is handled as a lexical issue
1176    * by the caller, as is whitespace.
1177    */
parseMarkupdecl()1178   private void parseMarkupdecl()
1179     throws Exception
1180   {
1181     char[] saved = null;
1182     boolean savedPE = expandPE;
1183 
1184     // prevent "<%foo;" and ensures saved entity is right
1185     require('<');
1186     unread('<');
1187     expandPE = false;
1188 
1189     if (tryRead("<!ELEMENT"))
1190       {
1191         saved = readBuffer;
1192         expandPE = savedPE;
1193         parseElementDecl();
1194       }
1195     else if (tryRead("<!ATTLIST"))
1196       {
1197         saved = readBuffer;
1198         expandPE = savedPE;
1199         parseAttlistDecl();
1200       }
1201     else if (tryRead("<!ENTITY"))
1202       {
1203         saved = readBuffer;
1204         expandPE = savedPE;
1205         parseEntityDecl();
1206       }
1207     else if (tryRead("<!NOTATION"))
1208       {
1209         saved = readBuffer;
1210         expandPE = savedPE;
1211         parseNotationDecl();
1212       }
1213     else if (tryRead(startDelimPI))
1214       {
1215         saved = readBuffer;
1216         expandPE = savedPE;
1217         parsePI();
1218       }
1219     else if (tryRead(startDelimComment))
1220       {
1221         saved = readBuffer;
1222         expandPE = savedPE;
1223         parseComment();
1224       }
1225     else if (tryRead("<!["))
1226       {
1227         saved = readBuffer;
1228         expandPE = savedPE;
1229         if (inputStack.size() > 0)
1230           {
1231             parseConditionalSect(saved);
1232           }
1233         else
1234           {
1235             error("conditional sections illegal in internal subset");
1236           }
1237       }
1238     else
1239       {
1240         error("expected markup declaration");
1241       }
1242 
1243     // VC: Proper Decl/PE Nesting
1244     if (readBuffer != saved)
1245       {
1246         handler.verror("Illegal Declaration/PE nesting");
1247       }
1248   }
1249 
1250   /**
1251    * Parse an element, with its tags.
1252    * <pre>
1253    * [39] element ::= EmptyElementTag | STag content ETag
1254    * [40] STag ::= '&lt;' Name (S Attribute)* S? '&gt;'
1255    * [44] EmptyElementTag ::= '&lt;' Name (S Attribute)* S? '/&gt;'
1256    * </pre>
1257    * <p> (The '&lt;' has already been read.)
1258    * <p>NOTE: this method actually chains onto parseContent (), if necessary,
1259    * and parseContent () will take care of calling parseETag ().
1260    */
parseElement(boolean maybeGetSubset)1261   private void parseElement(boolean maybeGetSubset)
1262     throws Exception
1263   {
1264     String gi;
1265     char c;
1266     int oldElementContent = currentElementContent;
1267     String oldElement = currentElement;
1268     ElementDecl element;
1269 
1270     // This is the (global) counter for the
1271     // array of specified attributes.
1272     tagAttributePos = 0;
1273 
1274     // Read the element type name.
1275     gi = readNmtoken(true);
1276 
1277     // If we saw no DTD, and this is the document root element,
1278     // let the application modify the input stream by providing one.
1279     if (maybeGetSubset)
1280       {
1281         InputSource subset = handler.getExternalSubset(gi,
1282                                                        handler.getSystemId());
1283         if (subset != null)
1284           {
1285             String publicId = subset.getPublicId();
1286             String systemId = subset.getSystemId();
1287 
1288             handler.warn("modifying document by adding DTD");
1289             handler.doctypeDecl(gi, publicId, systemId);
1290             pushString(null, ">");
1291 
1292             // NOTE:  [dtd] is so we say what SAX2 expects,
1293             // though it's misleading (subset, not entire dtd)
1294             pushURL(true, "[dtd]",
1295                     new ExternalIdentifiers(publicId, systemId, null),
1296                     subset.getCharacterStream(),
1297                     subset.getByteStream(),
1298                     subset.getEncoding(),
1299                     false);
1300 
1301             // Loop until we end up back at '>'
1302             while (true)
1303               {
1304                 doReport = expandPE = true;
1305                 skipWhitespace();
1306                 doReport = expandPE = false;
1307                 if (tryRead('>'))
1308                   {
1309                     break;
1310                   }
1311                 else
1312                   {
1313                     expandPE = true;
1314                     parseMarkupdecl();
1315                     expandPE = false;
1316                   }
1317               }
1318 
1319             // the ">" string isn't popped yet
1320             if (inputStack.size() != 1)
1321               {
1322                 error("external subset has unmatched '>'");
1323               }
1324 
1325             handler.endDoctype();
1326           }
1327       }
1328 
1329     // Determine the current content type.
1330     currentElement = gi;
1331     element = (ElementDecl) elementInfo.get(gi);
1332     currentElementContent = getContentType(element, CONTENT_ANY);
1333 
1334     // Read the attributes, if any.
1335     // After this loop, "c" is the closing delimiter.
1336     boolean white = tryWhitespace();
1337     c = readCh();
1338     while (c != '/' && c != '>')
1339       {
1340         unread(c);
1341         if (!white)
1342           {
1343             error("need whitespace between attributes");
1344           }
1345         parseAttribute(gi);
1346         white = tryWhitespace();
1347         c = readCh();
1348       }
1349 
1350     // Supply any defaulted attributes.
1351     Iterator atts = declaredAttributes(element);
1352     if (atts != null)
1353       {
1354         String aname;
1355 loop:
1356         while (atts.hasNext())
1357           {
1358             aname = (String) atts.next();
1359             // See if it was specified.
1360             for (int i = 0; i < tagAttributePos; i++)
1361               {
1362                 if (tagAttributes[i] == aname)
1363                   {
1364                     continue loop;
1365                   }
1366               }
1367             // ... or has a default
1368             String value = getAttributeDefaultValue(gi, aname);
1369 
1370             if (value == null)
1371               {
1372                 continue;
1373               }
1374             handler.attribute(aname, value, false);
1375           }
1376       }
1377 
1378     // Figure out if this is a start tag
1379     // or an empty element, and dispatch an
1380     // event accordingly.
1381     switch (c)
1382       {
1383       case '>':
1384         handler.startElement(gi);
1385         parseContent();
1386         break;
1387       case '/':
1388         require('>');
1389         handler.startElement(gi);
1390         handler.endElement(gi);
1391         break;
1392       }
1393 
1394     // Restore the previous state.
1395     currentElement = oldElement;
1396     currentElementContent = oldElementContent;
1397   }
1398 
1399   /**
1400    * Parse an attribute assignment.
1401    * <pre>
1402    * [41] Attribute ::= Name Eq AttValue
1403    * </pre>
1404    * @param name The name of the attribute's element.
1405    * @see SAXDriver#attribute
1406    */
parseAttribute(String name)1407   private void parseAttribute(String name)
1408     throws Exception
1409   {
1410     String aname;
1411     String type;
1412     String value;
1413     int flags = LIT_ATTRIBUTE |  LIT_ENTITY_REF;
1414 
1415     // Read the attribute name.
1416     aname = readNmtoken(true);
1417     type = getAttributeType(name, aname);
1418 
1419     // Parse '='
1420     parseEq();
1421 
1422     // Read the value, normalizing whitespace
1423     // unless it is CDATA.
1424     if (handler.stringInterning)
1425       {
1426         if (type == "CDATA" || type == null)
1427           {
1428             value = readLiteral(flags);
1429           }
1430         else
1431           {
1432             value = readLiteral(flags | LIT_NORMALIZE);
1433           }
1434       }
1435     else
1436       {
1437         if (type == null || type.equals("CDATA"))
1438           {
1439             value = readLiteral(flags);
1440           }
1441         else
1442           {
1443             value = readLiteral(flags | LIT_NORMALIZE);
1444           }
1445       }
1446 
1447     // WFC: no duplicate attributes
1448     for (int i = 0; i < tagAttributePos; i++)
1449       {
1450         if (aname.equals(tagAttributes [i]))
1451           {
1452             error("duplicate attribute", aname, null);
1453           }
1454       }
1455 
1456     // Inform the handler about the
1457     // attribute.
1458     handler.attribute(aname, value, true);
1459     dataBufferPos = 0;
1460 
1461     // Note that the attribute has been
1462     // specified.
1463     if (tagAttributePos == tagAttributes.length)
1464       {
1465         String newAttrib[] = new String[tagAttributes.length * 2];
1466         System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
1467         tagAttributes = newAttrib;
1468       }
1469     tagAttributes[tagAttributePos++] = aname;
1470   }
1471 
1472   /**
1473    * Parse an equals sign surrounded by optional whitespace.
1474    * <pre>
1475    * [25] Eq ::= S? '=' S?
1476    * </pre>
1477    */
parseEq()1478   private void parseEq()
1479     throws SAXException, IOException
1480   {
1481     skipWhitespace();
1482     require('=');
1483     skipWhitespace();
1484   }
1485 
1486   /**
1487    * Parse an end tag.
1488    * <pre>
   * [42] ETag ::= '&lt;/' Name S? '&gt;'
1490    * </pre>
1491    * <p>NOTE: parseContent () chains to here, we already read the
1492    * "&lt;/".
1493    */
parseETag()1494   private void parseETag()
1495     throws Exception
1496   {
1497     require(currentElement);
1498     skipWhitespace();
1499     require('>');
1500     handler.endElement(currentElement);
1501     // not re-reporting any SAXException re bogus end tags,
1502     // even though that diagnostic might be clearer ...
1503   }
1504 
1505   /**
1506    * Parse the content of an element.
1507    * <pre>
1508    * [43] content ::= (element | CharData | Reference
1509    *    | CDSect | PI | Comment)*
1510    * [67] Reference ::= EntityRef | CharRef
1511    * </pre>
   * <p> NOTE: consumes the ETag.
1513    */
parseContent()1514   private void parseContent()
1515     throws Exception
1516   {
1517     char c;
1518 
1519     while (true)
1520       {
1521         // consume characters (or ignorable whitspace) until delimiter
1522         parseCharData();
1523 
1524         // Handle delimiters
1525         c = readCh();
1526         switch (c)
1527           {
1528           case '&':       // Found "&"
1529             c = readCh();
1530             if (c == '#')
1531               {
1532                 parseCharRef();
1533               }
1534             else
1535               {
1536                 unread(c);
1537                 parseEntityRef(true);
1538               }
1539             isDirtyCurrentElement = true;
1540             break;
1541 
1542           case '<':       // Found "<"
1543             dataBufferFlush();
1544             c = readCh();
1545             switch (c)
1546               {
1547               case '!':       // Found "<!"
1548                 c = readCh();
1549                 switch (c)
1550                   {
1551                   case '-':     // Found "<!-"
1552                     require('-');
1553                     isDirtyCurrentElement = false;
1554                     parseComment();
1555                     break;
1556                   case '[':     // Found "<!["
1557                     isDirtyCurrentElement = false;
1558                     require("CDATA[");
1559                     handler.startCDATA();
1560                     inCDATA = true;
1561                     parseCDSect();
1562                     inCDATA = false;
1563                     handler.endCDATA();
1564                     break;
1565                   default:
1566                     error("expected comment or CDATA section", c, null);
1567                     break;
1568                   }
1569                 break;
1570 
1571               case '?':     // Found "<?"
1572                 isDirtyCurrentElement = false;
1573                 parsePI();
1574                 break;
1575 
1576               case '/':     // Found "</"
1577                 isDirtyCurrentElement = false;
1578                 parseETag();
1579                 return;
1580 
1581               default:     // Found "<" followed by something else
1582                 isDirtyCurrentElement = false;
1583                 unread(c);
1584                 parseElement(false);
1585                 break;
1586               }
1587           }
1588       }
1589   }
1590 
1591   /**
1592    * Parse an element type declaration.
1593    * <pre>
1594    * [45] elementdecl ::= '&lt;!ELEMENT' S Name S contentspec S? '&gt;'
1595    * </pre>
1596    * <p> NOTE: the '&lt;!ELEMENT' has already been read.
1597    */
parseElementDecl()1598   private void parseElementDecl()
1599     throws Exception
1600   {
1601     String name;
1602 
1603     requireWhitespace();
1604     // Read the element type name.
1605     name = readNmtoken(true);
1606 
1607     requireWhitespace();
1608     // Read the content model.
1609     parseContentspec(name);
1610 
1611     skipWhitespace();
1612     require('>');
1613   }
1614 
1615   /**
1616    * Content specification.
1617    * <pre>
1618    * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1619    * </pre>
1620    */
parseContentspec(String name)1621   private void parseContentspec(String name)
1622     throws Exception
1623   {
1624     // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ...
1625     if (tryRead("EMPTY"))
1626       {
1627         setElement(name, CONTENT_EMPTY, null, null);
1628         if (!skippedPE)
1629           {
1630             handler.getDeclHandler().elementDecl(name, "EMPTY");
1631           }
1632         return;
1633       }
1634     else if (tryRead("ANY"))
1635       {
1636         setElement(name, CONTENT_ANY, null, null);
1637         if (!skippedPE)
1638           {
1639             handler.getDeclHandler().elementDecl(name, "ANY");
1640           }
1641         return;
1642       }
1643     else
1644       {
1645         String model;
1646         char[] saved;
1647 
1648         require('(');
1649         saved = readBuffer;
1650         dataBufferAppend('(');
1651         skipWhitespace();
1652         if (tryRead("#PCDATA"))
1653           {
1654             dataBufferAppend("#PCDATA");
1655             parseMixed(saved);
1656             model = dataBufferToString();
1657             setElement(name, CONTENT_MIXED, model, null);
1658           }
1659         else
1660           {
1661             parseElements(saved);
1662             model = dataBufferToString();
1663             setElement(name, CONTENT_ELEMENTS, model, null);
1664           }
1665         if (!skippedPE)
1666           {
1667             handler.getDeclHandler().elementDecl(name, model);
1668           }
1669       }
1670   }
1671 
1672   /**
1673    * Parse an element-content model.
1674    * <pre>
1675    * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
1676    * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
1677    * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
1678    * </pre>
1679    *
1680    * <p> NOTE: the opening '(' and S have already been read.
1681    *
1682    * @param saved Buffer for entity that should have the terminal ')'
1683    */
parseElements(char[] saved)1684   private void parseElements(char[] saved)
1685     throws Exception
1686   {
1687     char c;
1688     char sep;
1689 
1690     // Parse the first content particle
1691     skipWhitespace();
1692     parseCp();
1693 
1694     // Check for end or for a separator.
1695     skipWhitespace();
1696     c = readCh();
1697     switch (c)
1698       {
1699       case ')':
1700         // VC: Proper Group/PE Nesting
1701         if (readBuffer != saved)
1702           {
1703             handler.verror("Illegal Group/PE nesting");
1704           }
1705 
1706         dataBufferAppend(')');
1707         c = readCh();
1708         switch (c)
1709           {
1710           case '*':
1711           case '+':
1712           case '?':
1713             dataBufferAppend(c);
1714             break;
1715           default:
1716             unread(c);
1717           }
1718         return;
1719       case ',':       // Register the separator.
1720       case '|':
1721         sep = c;
1722         dataBufferAppend(c);
1723         break;
1724       default:
1725         error("bad separator in content model", c, null);
1726         return;
1727       }
1728 
1729     // Parse the rest of the content model.
1730     while (true)
1731       {
1732         skipWhitespace();
1733         parseCp();
1734         skipWhitespace();
1735         c = readCh();
1736         if (c == ')')
1737           {
1738             // VC: Proper Group/PE Nesting
1739             if (readBuffer != saved)
1740               {
1741                 handler.verror("Illegal Group/PE nesting");
1742               }
1743 
1744             dataBufferAppend(')');
1745             break;
1746           }
1747         else if (c != sep)
1748           {
1749             error("bad separator in content model", c, null);
1750             return;
1751           }
1752         else
1753           {
1754             dataBufferAppend(c);
1755           }
1756       }
1757 
1758     // Check for the occurrence indicator.
1759     c = readCh();
1760     switch (c)
1761       {
1762       case '?':
1763       case '*':
1764       case '+':
1765         dataBufferAppend(c);
1766         return;
1767       default:
1768         unread(c);
1769         return;
1770       }
1771   }
1772 
1773   /**
1774    * Parse a content particle.
1775    * <pre>
1776    * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1777    * </pre>
1778    */
parseCp()1779   private void parseCp()
1780     throws Exception
1781   {
1782     if (tryRead('('))
1783       {
1784         dataBufferAppend('(');
1785         parseElements(readBuffer);
1786       }
1787     else
1788       {
1789         dataBufferAppend(readNmtoken(true));
1790         char c = readCh();
1791         switch (c)
1792           {
1793           case '?':
1794           case '*':
1795           case '+':
1796             dataBufferAppend(c);
1797             break;
1798           default:
1799             unread(c);
1800             break;
1801           }
1802       }
1803   }
1804 
1805   /**
1806    * Parse mixed content.
1807    * <pre>
1808    * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
1809    *        | '(' S? ('#PCDATA') S? ')'
1810    * </pre>
1811    *
1812    * @param saved Buffer for entity that should have the terminal ')'
1813    */
parseMixed(char[] saved)1814   private void parseMixed(char[] saved)
1815     throws Exception
1816   {
1817     // Check for PCDATA alone.
1818     skipWhitespace();
1819     if (tryRead(')'))
1820       {
1821         // VC: Proper Group/PE Nesting
1822         if (readBuffer != saved)
1823           {
1824             handler.verror("Illegal Group/PE nesting");
1825           }
1826 
1827         dataBufferAppend(")*");
1828         tryRead('*');
1829         return;
1830       }
1831 
1832     // Parse mixed content.
1833     skipWhitespace();
1834     while (!tryRead(")"))
1835       {
1836         require('|');
1837         dataBufferAppend('|');
1838         skipWhitespace();
1839         dataBufferAppend(readNmtoken(true));
1840         skipWhitespace();
1841       }
1842 
1843     // VC: Proper Group/PE Nesting
1844     if (readBuffer != saved)
1845       {
1846         handler.verror("Illegal Group/PE nesting");
1847       }
1848 
1849     require('*');
1850     dataBufferAppend(")*");
1851   }
1852 
1853   /**
1854    * Parse an attribute list declaration.
1855    * <pre>
1856    * [52] AttlistDecl ::= '&lt;!ATTLIST' S Name AttDef* S? '&gt;'
1857    * </pre>
1858    * <p>NOTE: the '&lt;!ATTLIST' has already been read.
1859    */
parseAttlistDecl()1860   private void parseAttlistDecl()
1861     throws Exception
1862   {
1863     String elementName;
1864 
1865     requireWhitespace();
1866     elementName = readNmtoken(true);
1867     boolean white = tryWhitespace();
1868     while (!tryRead('>'))
1869       {
1870         if (!white)
1871           {
1872             error("whitespace required before attribute definition");
1873           }
1874         parseAttDef(elementName);
1875         white = tryWhitespace();
1876       }
1877   }
1878 
1879   /**
1880    * Parse a single attribute definition.
1881    * <pre>
1882    * [53] AttDef ::= S Name S AttType S DefaultDecl
1883    * </pre>
1884    */
parseAttDef(String elementName)1885   private void parseAttDef(String elementName)
1886     throws Exception
1887   {
1888     String name;
1889     String type;
1890     String enumer = null;
1891 
1892     // Read the attribute name.
1893     name = readNmtoken(true);
1894 
1895     // Read the attribute type.
1896     requireWhitespace();
1897     type = readAttType();
1898 
1899     // Get the string of enumerated values if necessary.
1900     if (handler.stringInterning)
1901       {
1902         if ("ENUMERATION" == type || "NOTATION" == type)
1903           {
1904             enumer = dataBufferToString();
1905           }
1906       }
1907     else
1908       {
1909         if ("ENUMERATION".equals(type) || "NOTATION".equals(type))
1910           {
1911             enumer = dataBufferToString();
1912           }
1913       }
1914 
1915     // Read the default value.
1916     requireWhitespace();
1917     parseDefault(elementName, name, type, enumer);
1918   }
1919 
1920   /**
1921    * Parse the attribute type.
1922    * <pre>
1923    * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1924    * [55] StringType ::= 'CDATA'
1925    * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
1926    *    | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
1927    * [57] EnumeratedType ::= NotationType | Enumeration
1928    * </pre>
1929    */
readAttType()1930   private String readAttType()
1931     throws Exception
1932   {
1933     if (tryRead('('))
1934       {
1935         parseEnumeration(false);
1936         return "ENUMERATION";
1937       }
1938     else
1939       {
1940         String typeString = readNmtoken(true);
1941         if (handler.stringInterning)
1942           {
1943             if ("NOTATION" == typeString)
1944               {
1945                 parseNotationType();
1946                 return typeString;
1947               }
1948             else if ("CDATA" == typeString
1949                      || "ID" == typeString
1950                      || "IDREF" == typeString
1951                      || "IDREFS" == typeString
1952                      || "ENTITY" == typeString
1953                      || "ENTITIES" == typeString
1954                      || "NMTOKEN" == typeString
1955                      || "NMTOKENS" == typeString)
1956               {
1957                 return typeString;
1958               }
1959           }
1960         else
1961           {
1962             if ("NOTATION".equals(typeString))
1963               {
1964                 parseNotationType();
1965                 return typeString;
1966               }
1967             else if ("CDATA".equals(typeString)
1968                      || "ID".equals(typeString)
1969                      || "IDREF".equals(typeString)
1970                      || "IDREFS".equals(typeString)
1971                      || "ENTITY".equals(typeString)
1972                      || "ENTITIES".equals(typeString)
1973                      || "NMTOKEN".equals(typeString)
1974                      || "NMTOKENS".equals(typeString))
1975               {
1976                 return typeString;
1977               }
1978           }
1979         error("illegal attribute type", typeString, null);
1980         return null;
1981       }
1982   }
1983 
1984   /**
1985    * Parse an enumeration.
1986    * <pre>
1987    * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1988    * </pre>
1989    * <p>NOTE: the '(' has already been read.
1990    */
parseEnumeration(boolean isNames)1991   private void parseEnumeration(boolean isNames)
1992     throws Exception
1993   {
1994     dataBufferAppend('(');
1995 
1996     // Read the first token.
1997     skipWhitespace();
1998     dataBufferAppend(readNmtoken(isNames));
1999     // Read the remaining tokens.
2000     skipWhitespace();
2001     while (!tryRead(')'))
2002       {
2003         require('|');
2004         dataBufferAppend('|');
2005         skipWhitespace();
2006         dataBufferAppend(readNmtoken (isNames));
2007         skipWhitespace();
2008       }
2009     dataBufferAppend(')');
2010   }
2011 
2012   /**
2013    * Parse a notation type for an attribute.
2014    * <pre>
   * [58] NotationType ::= 'NOTATION' S '(' S? Name
   *    (S? '|' S? Name)* S? ')'
2017    * </pre>
2018    * <p>NOTE: the 'NOTATION' has already been read
2019    */
parseNotationType()2020   private void parseNotationType()
2021     throws Exception
2022   {
2023     requireWhitespace();
2024     require('(');
2025 
2026     parseEnumeration(true);
2027   }
2028 
2029   /**
2030    * Parse the default value for an attribute.
2031    * <pre>
2032    * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
2033    *    | (('#FIXED' S)? AttValue)
2034    * </pre>
2035    */
parseDefault(String elementName, String name, String type, String enumer)2036   private void parseDefault(String elementName, String name,
2037                             String type, String enumer)
2038     throws Exception
2039   {
2040     int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
2041     String value = null;
2042     int flags = LIT_ATTRIBUTE;
2043     boolean saved = expandPE;
2044     String defaultType = null;
2045 
2046     // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace
2047     // chars to spaces (doesn't matter when that's done if it doesn't
2048     // interfere with char refs expanding to whitespace).
2049 
2050     if (!skippedPE)
2051       {
2052         flags |= LIT_ENTITY_REF;
2053         if (handler.stringInterning)
2054           {
2055             if ("CDATA" != type)
2056               {
2057                 flags |= LIT_NORMALIZE;
2058               }
2059           }
2060         else
2061           {
2062             if (!"CDATA".equals(type))
2063               {
2064                 flags |= LIT_NORMALIZE;
2065               }
2066           }
2067       }
2068 
2069     expandPE = false;
2070     if (tryRead('#'))
2071       {
2072         if (tryRead("FIXED"))
2073           {
2074             defaultType = "#FIXED";
2075             valueType = ATTRIBUTE_DEFAULT_FIXED;
2076             requireWhitespace();
2077             value = readLiteral(flags);
2078           }
2079         else if (tryRead("REQUIRED"))
2080           {
2081             defaultType = "#REQUIRED";
2082             valueType = ATTRIBUTE_DEFAULT_REQUIRED;
2083           }
2084         else if (tryRead("IMPLIED"))
2085           {
2086             defaultType = "#IMPLIED";
2087             valueType = ATTRIBUTE_DEFAULT_IMPLIED;
2088           }
2089         else
2090           {
2091             error("illegal keyword for attribute default value");
2092           }
2093       }
2094     else
2095       {
2096         value = readLiteral(flags);
2097       }
2098     expandPE = saved;
2099     setAttribute(elementName, name, type, enumer, value, valueType);
2100     if (handler.stringInterning)
2101       {
2102         if ("ENUMERATION" == type)
2103           {
2104             type = enumer;
2105           }
2106         else if ("NOTATION" == type)
2107           {
2108             type = "NOTATION " + enumer;
2109           }
2110       }
2111     else
2112       {
2113         if ("ENUMERATION".equals(type))
2114           {
2115             type = enumer;
2116           }
2117         else if ("NOTATION".equals(type))
2118           {
2119             type = "NOTATION " + enumer;
2120           }
2121       }
2122     if (!skippedPE)
2123       {
2124         handler.getDeclHandler().attributeDecl(elementName, name, type,
2125                                                defaultType, value);
2126       }
2127   }
2128 
2129   /**
2130    * Parse a conditional section.
2131    * <pre>
2132    * [61] conditionalSect ::= includeSect || ignoreSect
2133    * [62] includeSect ::= '&lt;![' S? 'INCLUDE' S? '['
2134    *    extSubsetDecl ']]&gt;'
2135    * [63] ignoreSect ::= '&lt;![' S? 'IGNORE' S? '['
2136    *    ignoreSectContents* ']]&gt;'
2137    * [64] ignoreSectContents ::= Ignore
2138    *    ('&lt;![' ignoreSectContents* ']]&gt;' Ignore )*
2139    * [65] Ignore ::= Char* - (Char* ( '&lt;![' | ']]&gt;') Char* )
2140    * </pre>
2141    * <p> NOTE: the '&gt;![' has already been read.
2142    */
parseConditionalSect(char[] saved)2143   private void parseConditionalSect(char[] saved)
2144     throws Exception
2145   {
2146     skipWhitespace();
2147     if (tryRead("INCLUDE"))
2148       {
2149         skipWhitespace();
2150         require('[');
2151         // VC: Proper Conditional Section/PE Nesting
2152         if (readBuffer != saved)
2153           {
2154             handler.verror("Illegal Conditional Section/PE nesting");
2155           }
2156         skipWhitespace();
2157         while (!tryRead("]]>"))
2158           {
2159             parseMarkupdecl();
2160             skipWhitespace();
2161           }
2162       }
2163     else if (tryRead("IGNORE"))
2164       {
2165         skipWhitespace();
2166         require('[');
2167         // VC: Proper Conditional Section/PE Nesting
2168         if (readBuffer != saved)
2169           {
2170             handler.verror("Illegal Conditional Section/PE nesting");
2171           }
2172         int nesting = 1;
2173         char c;
2174         expandPE = false;
2175         for (int nest = 1; nest > 0; )
2176           {
2177             c = readCh();
2178             switch (c)
2179               {
2180               case '<':
2181                 if (tryRead("!["))
2182                   {
2183                     nest++;
2184                   }
2185                 break;
2186               case ']':
2187                 if (tryRead("]>"))
2188                   {
2189                     nest--;
2190                   }
2191               }
2192           }
2193         expandPE = true;
2194       }
2195     else
2196       {
2197         error("conditional section must begin with INCLUDE or IGNORE");
2198       }
2199   }
2200 
parseCharRef()2201   private void parseCharRef()
2202     throws SAXException, IOException
2203   {
2204     parseCharRef(true /* do flushDataBuffer by default */);
2205   }
2206 
2207   /**
2208    * Try to read a character reference without consuming data from buffer.
2209    * <pre>
2210    * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
2211    * </pre>
2212    * <p>NOTE: the '&#' has already been read.
2213    */
tryReadCharRef()2214   private void tryReadCharRef()
2215     throws SAXException, IOException
2216   {
2217     int value = 0;
2218     char c;
2219 
2220     if (tryRead('x'))
2221       {
2222 loop1:
2223         while (true)
2224           {
2225             c = readCh();
2226             if (c == ';')
2227               {
2228                 break loop1;
2229               }
2230             else
2231               {
2232                 int n = Character.digit(c, 16);
2233                 if (n == -1)
2234                   {
2235                     error("illegal character in character reference", c, null);
2236                     break loop1;
2237                   }
2238                 value *= 16;
2239                 value += n;
2240               }
2241           }
2242       }
2243     else
2244       {
2245 loop2:
2246         while (true)
2247           {
2248             c = readCh();
2249             if (c == ';')
2250               {
2251                 break loop2;
2252               }
2253             else
2254               {
2255                 int n = Character.digit(c, 10);
2256                 if (n == -1)
2257                   {
2258                     error("illegal character in character reference", c, null);
2259                     break loop2;
2260                   }
2261                 value *= 10;
2262                 value += n;
2263               }
2264           }
2265       }
2266 
2267     // check for character refs being legal XML
2268     if ((value < 0x0020
2269          && ! (value == '\n' || value == '\t' || value == '\r'))
2270         || (value >= 0xD800 && value <= 0xDFFF)
2271         || value == 0xFFFE || value == 0xFFFF
2272         || value > 0x0010ffff)
2273       {
2274         error("illegal XML character reference U+"
2275               + Integer.toHexString(value));
2276       }
2277 
2278     // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2279     //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2280     if (value > 0x0010ffff)
2281       {
2282         // too big for surrogate
2283         error("character reference " + value + " is too large for UTF-16",
2284               Integer.toString(value), null);
2285       }
2286 
2287   }
2288 
2289   /**
2290    * Read and interpret a character reference.
2291    * <pre>
2292    * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
2293    * </pre>
2294    * <p>NOTE: the '&#' has already been read.
2295    */
parseCharRef(boolean doFlush)2296   private void parseCharRef(boolean doFlush)
2297     throws SAXException, IOException
2298   {
2299     int value = 0;
2300     char c;
2301 
2302     if (tryRead('x'))
2303       {
2304 loop1:
2305         while (true)
2306           {
2307             c = readCh();
2308             if (c == ';')
2309               {
2310                 break loop1;
2311               }
2312             else
2313               {
2314                 int n = Character.digit(c, 16);
2315                 if (n == -1)
2316                   {
2317                     error("illegal character in character reference", c, null);
2318                     break loop1;
2319                   }
2320                 value *= 16;
2321                 value += n;
2322               }
2323           }
2324       }
2325     else
2326       {
2327 loop2:
2328         while (true)
2329           {
2330             c = readCh();
2331             if (c == ';')
2332               {
2333                 break loop2;
2334               }
2335             else
2336               {
2337                 int n = Character.digit(c, 10);
2338                 if (n == -1)
2339                   {
2340                     error("illegal character in character reference", c, null);
2341                     break loop2;
2342                   }
2343                 value *= 10;
2344                 value += c - '0';
2345               }
2346           }
2347       }
2348 
2349     // check for character refs being legal XML
2350     if ((value < 0x0020
2351          && ! (value == '\n' || value == '\t' || value == '\r'))
2352         || (value >= 0xD800 && value <= 0xDFFF)
2353         || value == 0xFFFE || value == 0xFFFF
2354         || value > 0x0010ffff)
2355       {
2356         error("illegal XML character reference U+"
2357               + Integer.toHexString(value));
2358       }
2359 
2360     // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2361     //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2362     if (value <= 0x0000ffff)
2363       {
2364         // no surrogates needed
2365         dataBufferAppend((char) value);
2366       }
2367     else if (value <= 0x0010ffff)
2368       {
2369         value -= 0x10000;
2370         // > 16 bits, surrogate needed
2371         dataBufferAppend((char) (0xd800 | (value >> 10)));
2372         dataBufferAppend((char) (0xdc00 | (value & 0x0003ff)));
2373       }
2374     else
2375       {
2376         // too big for surrogate
2377         error("character reference " + value + " is too large for UTF-16",
2378               Integer.toString(value), null);
2379       }
2380     if (doFlush)
2381       {
2382         dataBufferFlush();
2383       }
2384   }
2385 
2386   /**
2387    * Parse and expand an entity reference.
2388    * <pre>
2389    * [68] EntityRef ::= '&' Name ';'
2390    * </pre>
2391    * <p>NOTE: the '&amp;' has already been read.
2392    * @param externalAllowed External entities are allowed here.
2393    */
parseEntityRef(boolean externalAllowed)2394   private void parseEntityRef(boolean externalAllowed)
2395     throws SAXException, IOException
2396   {
2397     String name;
2398 
2399     name = readNmtoken(true);
2400     require(';');
2401     switch (getEntityType(name))
2402       {
2403       case ENTITY_UNDECLARED:
2404         // NOTE:  XML REC describes amazingly convoluted handling for
2405         // this case.  Nothing as meaningful as being a WFness error
2406         // unless the processor might _legitimately_ not have seen a
2407         // declaration ... which is what this implements.
2408         String message;
2409 
2410         message = "reference to undeclared general entity " + name;
2411         if (skippedPE && !docIsStandalone)
2412           {
2413             handler.verror(message);
2414             // we don't know this entity, and it might be external...
2415             if (externalAllowed)
2416               {
2417                 handler.skippedEntity(name);
2418               }
2419           }
2420         else
2421           {
2422             error(message);
2423           }
2424         break;
2425       case ENTITY_INTERNAL:
2426           pushString(name, getEntityValue(name));
2427 
2428           //workaround for possible input pop before marking
2429           //the buffer reading position
2430           char t = readCh();
2431           unread(t);
2432           int bufferPosMark = readBufferPos;
2433 
2434           int end = readBufferPos + getEntityValue(name).length();
2435           for (int k = readBufferPos; k < end; k++)
2436             {
2437               t = readCh();
2438               if (t == '&')
2439                 {
2440                   t = readCh();
2441                   if (t  == '#')
2442                     {
2443                       //try to match a character ref
2444                       tryReadCharRef();
2445 
2446                       //everything has been read
2447                       if (readBufferPos >= end)
2448                         {
2449                           break;
2450                         }
2451                       k = readBufferPos;
2452                       continue;
2453                     }
2454                   else if (Character.isLetter(t))
2455                     {
2456                       //looks like an entity ref
2457                       unread(t);
2458                       readNmtoken(true);
2459                       require(';');
2460 
2461                       //everything has been read
2462                       if (readBufferPos >= end)
2463                         {
2464                           break;
2465                         }
2466                       k = readBufferPos;
2467                       continue;
2468                     }
2469                   error(" malformed entity reference");
2470                 }
2471 
2472             }
2473           readBufferPos = bufferPosMark;
2474           break;
2475       case ENTITY_TEXT:
2476           if (externalAllowed)
2477             {
2478               pushURL(false, name, getEntityIds(name),
2479                       null, null, null, true);
2480             }
2481           else
2482             {
2483               error("reference to external entity in attribute value.",
2484                     name, null);
2485             }
2486           break;
2487       case ENTITY_NDATA:
2488           if (externalAllowed)
2489             {
2490               error("unparsed entity reference in content", name, null);
2491             }
2492           else
2493             {
2494               error("reference to external entity in attribute value.",
2495                     name, null);
2496             }
2497           break;
2498       default:
2499           throw new RuntimeException();
2500       }
2501   }
2502 
2503   /**
2504    * Parse and expand a parameter entity reference.
2505    * <pre>
2506    * [69] PEReference ::= '%' Name ';'
2507    * </pre>
2508    * <p>NOTE: the '%' has already been read.
2509    */
parsePEReference()2510   private void parsePEReference()
2511     throws SAXException, IOException
2512   {
2513     String name;
2514 
2515     name = "%" + readNmtoken(true);
2516     require(';');
2517     switch (getEntityType(name))
2518       {
2519       case ENTITY_UNDECLARED:
2520         // VC: Entity Declared
2521         handler.verror("reference to undeclared parameter entity " + name);
2522 
2523         // we should disable handling of all subsequent declarations
2524         // unless this is a standalone document (info discarded)
2525         break;
2526       case ENTITY_INTERNAL:
2527         if (inLiteral)
2528           {
2529             pushString(name, getEntityValue(name));
2530           }
2531         else
2532           {
2533             pushString(name, ' ' + getEntityValue(name) + ' ');
2534           }
2535         break;
2536       case ENTITY_TEXT:
2537         if (!inLiteral)
2538           {
2539             pushString(null, " ");
2540           }
2541         pushURL(true, name, getEntityIds(name), null, null, null, true);
2542         if (!inLiteral)
2543           {
2544             pushString(null, " ");
2545           }
2546         break;
2547       }
2548   }
2549 
2550   /**
2551    * Parse an entity declaration.
2552    * <pre>
2553    * [70] EntityDecl ::= GEDecl | PEDecl
2554    * [71] GEDecl ::= '&lt;!ENTITY' S Name S EntityDef S? '&gt;'
2555    * [72] PEDecl ::= '&lt;!ENTITY' S '%' S Name S PEDef S? '&gt;'
2556    * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
2557    * [74] PEDef ::= EntityValue | ExternalID
2558    * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2559    *       | 'PUBLIC' S PubidLiteral S SystemLiteral
2560    * [76] NDataDecl ::= S 'NDATA' S Name
2561    * </pre>
2562    * <p>NOTE: the '&lt;!ENTITY' has already been read.
2563    */
parseEntityDecl()2564   private void parseEntityDecl()
2565     throws Exception
2566   {
2567     boolean peFlag = false;
2568     int flags = 0;
2569 
2570     // Check for a parameter entity.
2571     expandPE = false;
2572     requireWhitespace();
2573     if (tryRead('%'))
2574       {
2575         peFlag = true;
2576         requireWhitespace();
2577       }
2578     expandPE = true;
2579 
2580     // Read the entity name, and prepend
2581     // '%' if necessary.
2582     String name = readNmtoken(true);
2583     //NE08
2584     if (name.indexOf(':') >= 0)
2585       {
2586         error("Illegal character(':') in entity name ", name, null);
2587       }
2588     if (peFlag)
2589       {
2590         name = "%" + name;
2591       }
2592 
2593     // Read the entity value.
2594     requireWhitespace();
2595     char c = readCh();
2596     unread (c);
2597     if (c == '"' || c == '\'')
2598       {
2599         // Internal entity ... replacement text has expanded refs
2600         // to characters and PEs, but not to general entities
2601         String value = readLiteral(flags);
2602         setInternalEntity(name, value);
2603       }
2604     else
2605       {
2606         // Read the external IDs
2607         ExternalIdentifiers ids = readExternalIds(false, false);
2608 
2609         // Check for NDATA declaration.
2610         boolean white = tryWhitespace();
2611         if (!peFlag && tryRead("NDATA"))
2612           {
2613             if (!white)
2614               {
2615                 error("whitespace required before NDATA");
2616               }
2617             requireWhitespace();
2618             String notationName = readNmtoken(true);
2619             if (!skippedPE)
2620               {
2621                 setExternalEntity(name, ENTITY_NDATA, ids, notationName);
2622                 handler.unparsedEntityDecl(name, ids.publicId, ids.systemId,
2623                                            ids.baseUri, notationName);
2624               }
2625           }
2626         else if (!skippedPE)
2627           {
2628             setExternalEntity(name, ENTITY_TEXT, ids, null);
2629             handler.getDeclHandler()
2630               .externalEntityDecl(name, ids.publicId,
2631                                    handler.resolveURIs()
2632                                    // FIXME: ASSUMES not skipped
2633                                    // "false" forces error on bad URI
2634                                    ? handler.absolutize(ids.baseUri,
2635                                                         ids.systemId,
2636                                                         false)
2637                                    : ids.systemId);
2638           }
2639       }
2640 
2641     // Finish the declaration.
2642     skipWhitespace();
2643     require('>');
2644   }
2645 
2646   /**
2647    * Parse a notation declaration.
2648    * <pre>
2649    * [82] NotationDecl ::= '&lt;!NOTATION' S Name S
2650    *    (ExternalID | PublicID) S? '&gt;'
2651    * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2652    * </pre>
2653    * <P>NOTE: the '&lt;!NOTATION' has already been read.
2654    */
parseNotationDecl()2655   private void parseNotationDecl()
2656     throws Exception
2657   {
2658     String nname;
2659     ExternalIdentifiers ids;
2660 
2661     requireWhitespace();
2662     nname = readNmtoken(true);
2663     //NE08
2664     if (nname.indexOf(':') >= 0)
2665       {
2666         error("Illegal character(':') in notation name ", nname, null);
2667       }
2668     requireWhitespace();
2669 
2670     // Read the external identifiers.
2671     ids = readExternalIds(true, false);
2672 
2673     // Register the notation.
2674     setNotation(nname, ids);
2675 
2676     skipWhitespace();
2677     require('>');
2678   }
2679 
2680   /**
2681    * Parse character data.
2682    * <pre>
2683    * [14] CharData ::= [^&lt;&amp;]* - ([^&lt;&amp;]* ']]&gt;' [^&lt;&amp;]*)
2684    * </pre>
2685    */
parseCharData()2686   private void parseCharData()
2687     throws Exception
2688   {
2689     char c;
2690     int state = 0;
2691     boolean pureWhite = false;
2692 
2693     // assert (dataBufferPos == 0);
2694 
2695     // are we expecting pure whitespace?  it might be dirty...
2696     if ((currentElementContent == CONTENT_ELEMENTS) && !isDirtyCurrentElement)
2697       {
2698         pureWhite = true;
2699       }
2700 
2701     // always report right out of readBuffer
2702     // to minimize (pointless) buffer copies
2703     while (true)
2704       {
2705         int lineAugment = 0;
2706         int columnAugment = 0;
2707         int i;
2708 
2709 loop:
2710         for (i = readBufferPos; i < readBufferLength; i++)
2711           {
2712             switch (c = readBuffer[i])
2713               {
2714               case '\n':
2715                 lineAugment++;
2716                 columnAugment = 0;
2717                 // pureWhite unmodified
2718                 break;
2719               case '\r':  // should not happen!!
2720               case '\t':
2721               case ' ':
2722                 // pureWhite unmodified
2723                 columnAugment++;
2724                 break;
2725               case '&':
2726               case '<':
2727                 columnAugment++;
2728                 // pureWhite unmodified
2729                 // CLEAN end of text sequence
2730                 state = 1;
2731                 break loop;
2732               case ']':
2733                 // that's not a whitespace char, and
2734                 // can not terminate pure whitespace either
2735                 pureWhite = false;
2736                 if ((i + 2) < readBufferLength)
2737                   {
2738                     if (readBuffer [i + 1] == ']'
2739                         && readBuffer [i + 2] == '>')
2740                       {
2741                         // ERROR end of text sequence
2742                         state = 2;
2743                         break loop;
2744                       }
2745                   }
2746                 else
2747                   {
2748                     // FIXME missing two end-of-buffer cases
2749                   }
2750                 columnAugment++;
2751                 break;
2752               default:
2753                 if ((c < 0x0020 || c > 0xFFFD)
2754                     || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
2755                         && xmlVersion == XML_11))
2756                   {
2757                     error("illegal XML character U+"
2758                           + Integer.toHexString(c));
2759                   }
2760                 // that's not a whitespace char
2761                 pureWhite = false;
2762                 columnAugment++;
2763               }
2764           }
2765 
2766         // report text thus far
2767         if (lineAugment > 0)
2768           {
2769             line += lineAugment;
2770             column = columnAugment;
2771           }
2772         else
2773           {
2774             column += columnAugment;
2775           }
2776 
2777         // report characters/whitspace
2778         int length = i - readBufferPos;
2779 
2780         if (length != 0)
2781           {
2782             if (pureWhite)
2783               {
2784                 handler.ignorableWhitespace(readBuffer,
2785                                             readBufferPos, length);
2786               }
2787             else
2788               {
2789                 handler.charData(readBuffer, readBufferPos, length);
2790               }
2791             readBufferPos = i;
2792           }
2793 
2794         if (state != 0)
2795           {
2796             break;
2797           }
2798 
2799         // fill next buffer from this entity, or
2800         // pop stack and continue with previous entity
2801         unread(readCh());
2802       }
2803     if (!pureWhite)
2804       {
2805         isDirtyCurrentElement = true;
2806       }
2807     // finish, maybe with error
2808     if (state != 1)  // finish, no error
2809       {
2810         error("character data may not contain ']]>'");
2811       }
2812   }
2813 
2814   //////////////////////////////////////////////////////////////////////
2815   // High-level reading and scanning methods.
2816   //////////////////////////////////////////////////////////////////////
2817 
2818   /**
2819    * Require whitespace characters.
2820    */
requireWhitespace()2821   private void requireWhitespace()
2822     throws SAXException, IOException
2823   {
2824     char c = readCh();
2825     if (isWhitespace(c))
2826       {
2827         skipWhitespace();
2828       }
2829     else
2830       {
2831         error("whitespace required", c, null);
2832       }
2833   }
2834 
2835   /**
2836    * Skip whitespace characters.
2837    * <pre>
2838    * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2839    * </pre>
2840    */
skipWhitespace()2841   private void skipWhitespace()
2842     throws SAXException, IOException
2843   {
2844     // Start with a little cheat.  Most of
2845     // the time, the white space will fall
2846     // within the current read buffer; if
2847     // not, then fall through.
2848     if (USE_CHEATS)
2849       {
2850         int lineAugment = 0;
2851         int columnAugment = 0;
2852 
2853 loop:
2854         for (int i = readBufferPos; i < readBufferLength; i++)
2855           {
2856             switch (readBuffer[i])
2857               {
2858               case ' ':
2859               case '\t':
2860               case '\r':
2861                 columnAugment++;
2862                 break;
2863               case '\n':
2864                 lineAugment++;
2865                 columnAugment = 0;
2866                 break;
2867               case '%':
2868                 if (expandPE)
2869                   {
2870                     break loop;
2871                   }
2872                 // else fall through...
2873               default:
2874                 readBufferPos = i;
2875                 if (lineAugment > 0)
2876                   {
2877                     line += lineAugment;
2878                     column = columnAugment;
2879                   }
2880                 else
2881                   {
2882                     column += columnAugment;
2883                   }
2884                 return;
2885               }
2886           }
2887       }
2888 
2889     // OK, do it the slow way.
2890     char c = readCh ();
2891     while (isWhitespace(c))
2892       {
2893         c = readCh();
2894       }
2895     unread(c);
2896   }
2897 
2898   /**
2899    * Read a name or (when parsing an enumeration) name token.
2900    * <pre>
2901    * [5] Name ::= (Letter | '_' | ':') (NameChar)*
2902    * [7] Nmtoken ::= (NameChar)+
2903    * </pre>
2904    */
readNmtoken(boolean isName)2905   private String readNmtoken(boolean isName)
2906     throws SAXException, IOException
2907   {
2908     char c;
2909 
2910     if (USE_CHEATS)
2911       {
2912 loop:
2913         for (int i = readBufferPos; i < readBufferLength; i++)
2914           {
2915             c = readBuffer[i];
2916             switch (c)
2917               {
2918               case '%':
2919                 if (expandPE)
2920                   {
2921                     break loop;
2922                   }
2923                 // else fall through...
2924 
2925                 // What may legitimately come AFTER a name/nmtoken?
2926               case '<': case '>': case '&':
2927               case ',': case '|': case '*': case '+': case '?':
2928               case ')':
2929               case '=':
2930               case '\'': case '"':
2931               case '[':
2932               case ' ': case '\t': case '\r': case '\n':
2933               case ';':
2934               case '/':
2935                 int start = readBufferPos;
2936                 if (i == start)
2937                   {
2938                     error("name expected", readBuffer[i], null);
2939                   }
2940                 readBufferPos = i;
2941                 return intern(readBuffer, start, i - start);
2942 
2943               default:
2944                 // FIXME ... per IBM's OASIS test submission, these:
2945                 //   ?    U+06dd
2946                 //   Combining  U+309B
2947                 //these switches are kind of ugly but at least we won't
2948                 //have to go over the whole lits for each char
2949                 if (isName && i == readBufferPos)
2950                   {
2951                     char c2 = (char) (c & 0x00f0);
2952                     switch (c & 0xff00)
2953                       {
2954                         //starting with 01
2955                       case 0x0100:
2956                         switch (c2)
2957                           {
2958                           case 0x0030:
2959                             if (c == 0x0132 || c == 0x0133 || c == 0x013f)
2960                               {
2961                                 error("Not a name start character, U+"
2962                                       + Integer.toHexString(c));
2963                               }
2964                             break;
2965                           case 0x0040:
2966                             if (c == 0x0140 || c == 0x0149)
2967                               {
2968                                 error("Not a name start character, U+"
2969                                       + Integer.toHexString(c));
2970                               }
2971                             break;
2972                           case 0x00c0:
2973                             if (c == 0x01c4 || c == 0x01cc)
2974                               {
2975                                 error("Not a name start character, U+"
2976                                       + Integer.toHexString(c));
2977                               }
2978                             break;
2979                           case 0x00f0:
2980                             if (c == 0x01f1 || c == 0x01f3)
2981                               {
2982                                 error("Not a name start character, U+"
2983                                       + Integer.toHexString(c));
2984                               }
2985                             break;
2986                           case 0x00b0:
2987                             if (c == 0x01f1 || c == 0x01f3)
2988                               {
2989                                 error("Not a name start character, U+"
2990                                       + Integer.toHexString(c));
2991                               }
2992                             break;
2993                           default:
2994                             if (c == 0x017f)
2995                               {
2996                                 error("Not a name start character, U+"
2997                                       + Integer.toHexString(c));
2998                               }
2999                           }
3000 
3001                         break;
3002                         //starting with 11
3003                       case 0x1100:
3004                         switch (c2)
3005                           {
3006                           case 0x0000:
3007                             if (c == 0x1104 || c == 0x1108 ||
3008                                 c == 0x110a || c == 0x110d)
3009                               {
3010                                 error("Not a name start character, U+"
3011                                       + Integer.toHexString(c));
3012                               }
3013                             break;
3014                           case 0x0030:
3015                             if (c == 0x113b || c == 0x113f)
3016                               {
3017                                 error("Not a name start character, U+"
3018                                       + Integer.toHexString(c));
3019                               }
3020                             break;
3021                           case 0x0040:
3022                             if (c == 0x1141 || c == 0x114d
3023                                 || c == 0x114f )
3024                               {
3025                                 error("Not a name start character, U+"
3026                                       + Integer.toHexString(c));
3027                               }
3028                             break;
3029                           case 0x0050:
3030                             if (c == 0x1151 || c == 0x1156)
3031                               {
3032                                 error("Not a name start character, U+"
3033                                       + Integer.toHexString(c));
3034                               }
3035                             break;
3036                           case 0x0060:
3037                             if (c == 0x1162 || c == 0x1164
3038                                 || c == 0x1166 || c == 0x116b
3039                                 || c == 0x116f)
3040                               {
3041                                 error("Not a name start character, U+"
3042                                       + Integer.toHexString(c));
3043                               }
3044                             break;
3045                           case 0x00b0:
3046                             if (c == 0x11b6 || c == 0x11b9
3047                                 || c == 0x11bb || c == 0x116f)
3048                               {
3049                                 error("Not a name start character, U+"
3050                                       + Integer.toHexString(c));
3051                               }
3052                             break;
3053                           default:
3054                             if (c == 0x1174 || c == 0x119f
3055                                 || c == 0x11ac || c == 0x11c3
3056                                 || c == 0x11f1)
3057                               {
3058                                 error("Not a name start character, U+"
3059                                       + Integer.toHexString(c));
3060                               }
3061                           }
3062                         break;
3063                       default:
3064                         if (c == 0x0e46 || c == 0x1011
3065                             || c == 0x212f || c == 0x0587
3066                             || c == 0x0230 )
3067                           {
3068                             error("Not a name start character, U+"
3069                                   + Integer.toHexString(c));
3070                           }
3071                       }
3072                   }
3073                 // punt on exact tests from Appendix A; approximate
3074                 // them using the Unicode ID start/part rules
3075                 if (i == readBufferPos && isName)
3076                   {
3077                     if (!Character.isUnicodeIdentifierStart(c)
3078                         && c != ':' && c != '_')
3079                       {
3080                         error("Not a name start character, U+"
3081                               + Integer.toHexString(c));
3082                       }
3083                   }
3084                 else if (!Character.isUnicodeIdentifierPart(c)
3085                          && c != '-' && c != ':' && c != '_' && c != '.'
3086                          && !isExtender(c))
3087                   {
3088                     error("Not a name character, U+"
3089                           + Integer.toHexString(c));
3090                   }
3091               }
3092           }
3093       }
3094 
3095     nameBufferPos = 0;
3096 
3097     // Read the first character.
3098     while (true)
3099       {
3100         c = readCh();
3101         switch (c)
3102           {
3103           case '%':
3104           case '<': case '>': case '&':
3105           case ',': case '|': case '*': case '+': case '?':
3106           case ')':
3107           case '=':
3108           case '\'': case '"':
3109           case '[':
3110           case ' ': case '\t': case '\n': case '\r':
3111           case ';':
3112           case '/':
3113             unread(c);
3114             if (nameBufferPos == 0)
3115               {
3116                 error ("name expected");
3117               }
3118             // punt on exact tests from Appendix A, but approximate them
3119             if (isName
3120                 && !Character.isUnicodeIdentifierStart(nameBuffer[0])
3121                 && ":_".indexOf(nameBuffer[0]) == -1)
3122               {
3123                 error("Not a name start character, U+"
3124                       + Integer.toHexString(nameBuffer[0]));
3125               }
3126             String s = intern(nameBuffer, 0, nameBufferPos);
3127             nameBufferPos = 0;
3128             return s;
3129           default:
3130             // punt on exact tests from Appendix A, but approximate them
3131 
3132             if ((nameBufferPos != 0 || !isName)
3133                 && !Character.isUnicodeIdentifierPart(c)
3134                 && ":-_.".indexOf(c) == -1
3135                 && !isExtender(c))
3136               {
3137                 error("Not a name character, U+"
3138                       + Integer.toHexString(c));
3139               }
3140             if (nameBufferPos >= nameBuffer.length)
3141               {
3142                 nameBuffer =
3143                   (char[]) extendArray(nameBuffer,
3144                                        nameBuffer.length, nameBufferPos);
3145               }
3146             nameBuffer[nameBufferPos++] = c;
3147           }
3148       }
3149   }
3150 
isExtender(char c)3151   private static boolean isExtender(char c)
3152   {
3153     // [88] Extender ::= ...
3154     return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
3155       || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
3156       || (c >= 0x3031 && c <= 0x3035)
3157       || (c >= 0x309d && c <= 0x309e)
3158       || (c >= 0x30fc && c <= 0x30fe);
3159   }
3160 
3161   /**
3162    * Read a literal.  With matching single or double quotes as
3163    * delimiters (and not embedded!) this is used to parse:
3164    * <pre>
3165    *  [9] EntityValue ::= ... ([^%&amp;] | PEReference | Reference)* ...
3166    *  [10] AttValue ::= ... ([^<&] | Reference)* ...
3167    *  [11] SystemLiteral ::= ... (URLchar - "'")* ...
3168    *  [12] PubidLiteral ::= ... (PubidChar - "'")* ...
3169    * </pre>
3170    * as well as the quoted strings in XML and text declarations
3171    * (for version, encoding, and standalone) which have their
3172    * own constraints.
3173    */
readLiteral(int flags)3174   private String readLiteral(int flags)
3175     throws SAXException, IOException
3176   {
3177     char delim, c;
3178     int startLine = line;
3179     boolean saved = expandPE;
3180     boolean savedReport = doReport;
3181 
3182     // Find the first delimiter.
3183     delim = readCh();
3184     if (delim != '"' && delim != '\'')
3185       {
3186         error("expected '\"' or \"'\"", delim, null);
3187         return null;
3188       }
3189     inLiteral = true;
3190     if ((flags & LIT_DISABLE_PE) != 0)
3191       {
3192         expandPE = false;
3193       }
3194     doReport = false;
3195 
3196     // Each level of input source has its own buffer; remember
3197     // ours, so we won't read the ending delimiter from any
3198     // other input source, regardless of entity processing.
3199     char[] ourBuf = readBuffer;
3200 
3201     // Read the literal.
3202     try
3203       {
3204         c = readCh();
3205         boolean ampRead = false;
3206 loop:
3207         while (! (c == delim && readBuffer == ourBuf))
3208           {
3209             switch (c)
3210               {
3211                 // attributes and public ids are normalized
3212                 // in almost the same ways
3213               case '\n':
3214               case '\r':
3215                 if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0)
3216                   {
3217                     c = ' ';
3218                   }
3219                 break;
3220               case '\t':
3221                 if ((flags & LIT_ATTRIBUTE) != 0)
3222                   {
3223                     c = ' ';
3224                   }
3225                 break;
3226               case '&':
3227                 c = readCh();
3228                 // Char refs are expanded immediately, except for
3229                 // all the cases where it's deferred.
3230                 if (c == '#')
3231                   {
3232                     if ((flags & LIT_DISABLE_CREF) != 0)
3233                       {
3234                         dataBufferAppend('&');
3235                         break;
3236                       }
3237                     parseCharRef(false /* Do not do flushDataBuffer */);
3238 
3239                     // exotic WFness risk: this is an entity literal,
3240                     // dataBuffer [dataBufferPos - 1] == '&', and
3241                     // following chars are a _partial_ entity/char ref
3242 
3243                     // It looks like an entity ref ...
3244                   }
3245                 else
3246                   {
3247                     unread(c);
3248                     // Expand it?
3249                     if ((flags & LIT_ENTITY_REF) > 0)
3250                       {
3251                         parseEntityRef(false);
3252                         if (String.valueOf(readBuffer).equals("&#38;"))
3253                           {
3254                             ampRead = true;
3255                           }
3256                         //Is it just data?
3257                       }
3258                     else if ((flags & LIT_DISABLE_EREF) != 0)
3259                       {
3260                         dataBufferAppend('&');
3261 
3262                         // OK, it will be an entity ref -- expanded later.
3263                       }
3264                     else
3265                       {
3266                         String name = readNmtoken(true);
3267                         require(';');
3268                         dataBufferAppend('&');
3269                         dataBufferAppend(name);
3270                         dataBufferAppend(';');
3271                       }
3272                   }
3273                 c = readCh();
3274                 continue loop;
3275 
3276               case '<':
3277                 // and why?  Perhaps so "&foo;" expands the same
3278                 // inside and outside an attribute?
3279                 if ((flags & LIT_ATTRIBUTE) != 0)
3280                   {
3281                     error("attribute values may not contain '<'");
3282                   }
3283                 break;
3284 
3285                 // We don't worry about case '%' and PE refs, readCh does.
3286 
3287               default:
3288                 break;
3289               }
3290             dataBufferAppend(c);
3291             c = readCh();
3292           }
3293       }
3294     catch (EOFException e)
3295       {
3296         error("end of input while looking for delimiter (started on line "
3297               + startLine + ')', null, Character.toString(delim));
3298       }
3299     inLiteral = false;
3300     expandPE = saved;
3301     doReport = savedReport;
3302 
3303     // Normalise whitespace if necessary.
3304     if ((flags & LIT_NORMALIZE) > 0)
3305       {
3306         dataBufferNormalize();
3307       }
3308 
3309     // Return the value.
3310     return dataBufferToString();
3311   }
3312 
3313   /**
3314    * Try reading external identifiers.
3315    * A system identifier is not required for notations.
3316    * @param inNotation Are we parsing a notation decl?
3317    * @param isSubset Parsing external subset decl (may be omitted)?
3318    * @return A three-member String array containing the identifiers,
3319    *  or nulls. Order: public, system, baseURI.
3320    */
readExternalIds(boolean inNotation, boolean isSubset)3321   private ExternalIdentifiers readExternalIds(boolean inNotation,
3322                                               boolean isSubset)
3323     throws Exception
3324   {
3325     char c;
3326     ExternalIdentifiers ids = new ExternalIdentifiers();
3327     int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
3328 
3329     if (tryRead("PUBLIC"))
3330       {
3331         requireWhitespace();
3332         ids.publicId = readLiteral(LIT_NORMALIZE | LIT_PUBID | flags);
3333         if (inNotation)
3334           {
3335             skipWhitespace();
3336             c = readCh();
3337             unread(c);
3338             if (c == '"' || c == '\'')
3339               {
3340                 ids.systemId = readLiteral(flags);
3341               }
3342           }
3343         else
3344           {
3345             requireWhitespace();
3346             ids.systemId = readLiteral(flags);
3347           }
3348 
3349         for (int i = 0; i < ids.publicId.length(); i++)
3350           {
3351             c = ids.publicId.charAt(i);
3352             if (c >= 'a' && c <= 'z')
3353               {
3354                 continue;
3355               }
3356             if (c >= 'A' && c <= 'Z')
3357               {
3358                 continue;
3359               }
3360             if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf(c) != -1)
3361               {
3362                 continue;
3363               }
3364             error("illegal PUBLIC id character U+"
3365                   + Integer.toHexString(c));
3366           }
3367       }
3368     else if (tryRead("SYSTEM"))
3369       {
3370         requireWhitespace();
3371         ids.systemId = readLiteral(flags);
3372       }
3373     else if (!isSubset)
3374       {
3375         error("missing SYSTEM or PUBLIC keyword");
3376       }
3377 
3378     if (ids.systemId != null)
3379       {
3380         if (ids.systemId.indexOf('#') != -1)
3381           {
3382             handler.verror("SYSTEM id has a URI fragment: " + ids.systemId);
3383           }
3384         ids.baseUri = handler.getSystemId();
3385         if (ids.baseUri == null && uriWarnings)
3386           {
3387             handler.warn("No base URI; hope URI is absolute: "
3388                          + ids.systemId);
3389           }
3390       }
3391 
3392     return ids;
3393   }
3394 
3395   /**
3396    * Test if a character is whitespace.
3397    * <pre>
3398    * [3] S ::= (#x20 | #x9 | #xd | #xa)+
3399    * </pre>
3400    * @param c The character to test.
3401    * @return true if the character is whitespace.
3402    */
isWhitespace(char c)3403   private final boolean isWhitespace(char c)
3404   {
3405     if (c > 0x20)
3406       {
3407         return false;
3408       }
3409     if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d)
3410       {
3411         return true;
3412       }
3413     return false;  // illegal ...
3414   }
3415 
3416   //////////////////////////////////////////////////////////////////////
3417   // Utility routines.
3418   //////////////////////////////////////////////////////////////////////
3419 
3420   /**
3421    * Add a character to the data buffer.
3422    */
dataBufferAppend(char c)3423   private void dataBufferAppend(char c)
3424   {
3425     // Expand buffer if necessary.
3426     if (dataBufferPos >= dataBuffer.length)
3427       {
3428         dataBuffer = (char[]) extendArray(dataBuffer,
3429                                           dataBuffer.length, dataBufferPos);
3430       }
3431     dataBuffer[dataBufferPos++] = c;
3432   }
3433 
3434   /**
3435    * Add a string to the data buffer.
3436    */
dataBufferAppend(String s)3437   private void dataBufferAppend(String s)
3438   {
3439     dataBufferAppend(s.toCharArray(), 0, s.length());
3440   }
3441 
3442   /**
3443    * Append (part of) a character array to the data buffer.
3444    */
dataBufferAppend(char[] ch, int start, int length)3445   private void dataBufferAppend(char[] ch, int start, int length)
3446   {
3447     dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length,
3448                                       dataBufferPos + length);
3449 
3450     System.arraycopy(ch, start, dataBuffer, dataBufferPos, length);
3451     dataBufferPos += length;
3452   }
3453 
3454   /**
3455    * Normalise space characters in the data buffer.
3456    */
dataBufferNormalize()3457   private void dataBufferNormalize()
3458   {
3459     int i = 0;
3460     int j = 0;
3461     int end = dataBufferPos;
3462 
3463     // Skip spaces at the start.
3464     while (j < end && dataBuffer[j] == ' ')
3465       {
3466         j++;
3467       }
3468 
3469     // Skip whitespace at the end.
3470     while (end > j && dataBuffer[end - 1] == ' ')
3471       {
3472         end --;
3473       }
3474 
3475     // Start copying to the left.
3476     while (j < end)
3477       {
3478 
3479         char c = dataBuffer[j++];
3480 
3481         // Normalise all other spaces to
3482         // a single space.
3483         if (c == ' ')
3484           {
3485             while (j < end && dataBuffer[j++] == ' ')
3486               {
3487                 continue;
3488               }
3489             dataBuffer[i++] = ' ';
3490             dataBuffer[i++] = dataBuffer[j - 1];
3491           }
3492         else
3493           {
3494             dataBuffer[i++] = c;
3495           }
3496       }
3497 
3498     // The new length is <= the old one.
3499     dataBufferPos = i;
3500   }
3501 
3502   /**
3503    * Convert the data buffer to a string.
3504    */
dataBufferToString()3505   private String dataBufferToString()
3506   {
3507     String s = new String(dataBuffer, 0, dataBufferPos);
3508     dataBufferPos = 0;
3509     return s;
3510   }
3511 
3512   /**
3513    * Flush the contents of the data buffer to the handler, as
3514    * appropriate, and reset the buffer for new input.
3515    */
dataBufferFlush()3516   private void dataBufferFlush()
3517     throws SAXException
3518   {
3519     if (currentElementContent == CONTENT_ELEMENTS
3520         && dataBufferPos > 0
3521         && !inCDATA)
3522       {
3523         // We can't just trust the buffer to be whitespace, there
3524         // are (error) cases when it isn't
3525         for (int i = 0; i < dataBufferPos; i++)
3526           {
3527             if (!isWhitespace(dataBuffer[i]))
3528               {
3529                 handler.charData(dataBuffer, 0, dataBufferPos);
3530                 dataBufferPos = 0;
3531               }
3532           }
3533         if (dataBufferPos > 0)
3534           {
3535             handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos);
3536             dataBufferPos = 0;
3537           }
3538       }
3539     else if (dataBufferPos > 0)
3540       {
3541         handler.charData(dataBuffer, 0, dataBufferPos);
3542         dataBufferPos = 0;
3543       }
3544   }
3545 
3546   /**
3547    * Require a string to appear, or throw an exception.
3548    * <p><em>Precondition:</em> Entity expansion is not required.
3549    * <p><em>Precondition:</em> data buffer has no characters that
3550    * will get sent to the application.
3551    */
require(String delim)3552   private void require(String delim)
3553     throws SAXException, IOException
3554   {
3555     int length = delim.length();
3556     char[] ch;
3557 
3558     if (length < dataBuffer.length)
3559       {
3560         ch = dataBuffer;
3561         delim.getChars(0, length, ch, 0);
3562       }
3563     else
3564       {
3565         ch = delim.toCharArray();
3566       }
3567 
3568     if (USE_CHEATS && length <= (readBufferLength - readBufferPos))
3569       {
3570         int offset = readBufferPos;
3571 
3572         for (int i = 0; i < length; i++, offset++)
3573           {
3574             if (ch[i] != readBuffer[offset])
3575               {
3576                 error ("required string", null, delim);
3577               }
3578           }
3579         readBufferPos = offset;
3580 
3581       }
3582     else
3583       {
3584         for (int i = 0; i < length; i++)
3585           {
3586             require(ch[i]);
3587           }
3588       }
3589   }
3590 
3591   /**
3592    * Require a character to appear, or throw an exception.
3593    */
require(char delim)3594   private void require(char delim)
3595     throws SAXException, IOException
3596   {
3597     char c = readCh();
3598 
3599     if (c != delim)
3600       {
3601         error("required character", c, Character.toString(delim));
3602       }
3603   }
3604 
3605   /**
3606    * Create an interned string from a character array.
3607    * &AElig;lfred uses this method to create an interned version
3608    * of all names and name tokens, so that it can test equality
3609    * with <code>==</code> instead of <code>String.equals ()</code>.
3610    *
3611    * <p>This is much more efficient than constructing a non-interned
3612    * string first, and then interning it.
3613    *
3614    * @param ch an array of characters for building the string.
3615    * @param start the starting position in the array.
3616    * @param length the number of characters to place in the string.
3617    * @return an interned string.
3618    * @see #intern (String)
3619    * @see java.lang.String#intern
3620    */
intern(char[] ch, int start, int length)3621   public String intern(char[] ch, int start, int length)
3622   {
3623     int index = 0;
3624     int hash = 0;
3625     Object[] bucket;
3626 
3627     // Generate a hash code.  This is a widely used string hash,
3628     // often attributed to Brian Kernighan.
3629     for (int i = start; i < start + length; i++)
3630       {
3631         hash = 31 * hash + ch[i];
3632       }
3633     hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH;
3634 
3635     // Get the bucket -- consists of {array,String} pairs
3636     if ((bucket = symbolTable[hash]) == null)
3637       {
3638         // first string in this bucket
3639         bucket = new Object[8];
3640 
3641         // Search for a matching tuple, and
3642         // return the string if we find one.
3643       }
3644     else
3645       {
3646         while (index < bucket.length)
3647           {
3648             char[] chFound = (char[]) bucket[index];
3649 
3650             // Stop when we hit an empty entry.
3651             if (chFound == null)
3652               {
3653                 break;
3654               }
3655 
3656             // If they're the same length, check for a match.
3657             if (chFound.length == length)
3658               {
3659                 for (int i = 0; i < chFound.length; i++)
3660                   {
3661                     // continue search on failure
3662                     if (ch[start + i] != chFound[i])
3663                       {
3664                         break;
3665                       }
3666                     else if (i == length - 1)
3667                       {
3668                         // That's it, we have a match!
3669                         return (String) bucket[index + 1];
3670                       }
3671                   }
3672               }
3673             index += 2;
3674           }
3675         // Not found -- we'll have to add it.
3676 
3677         // Do we have to grow the bucket?
3678         bucket = (Object[]) extendArray(bucket, bucket.length, index);
3679       }
3680     symbolTable[hash] = bucket;
3681 
3682     // OK, add it to the end of the bucket -- "local" interning.
3683     // Intern "globally" to let applications share interning benefits.
3684     // That is, "!=" and "==" work on our strings, not just equals().
3685     String s = new String(ch, start, length).intern();
3686     bucket[index] = s.toCharArray();
3687     bucket[index + 1] = s;
3688     return s;
3689   }
3690 
3691   /**
3692    * Ensure the capacity of an array, allocating a new one if
3693    * necessary.  Usually extends only for name hash collisions.
3694    */
extendArray(Object array, int currentSize, int requiredSize)3695   private Object extendArray(Object array, int currentSize, int requiredSize)
3696   {
3697     if (requiredSize < currentSize)
3698       {
3699         return array;
3700       }
3701     else
3702       {
3703         Object newArray = null;
3704         int newSize = currentSize * 2;
3705 
3706         if (newSize <= requiredSize)
3707           {
3708             newSize = requiredSize + 1;
3709           }
3710 
3711         if (array instanceof char[])
3712           {
3713             newArray = new char[newSize];
3714           }
3715         else if (array instanceof Object[])
3716           {
3717             newArray = new Object[newSize];
3718           }
3719         else
3720           {
3721             throw new RuntimeException();
3722           }
3723 
3724         System.arraycopy(array, 0, newArray, 0, currentSize);
3725         return newArray;
3726       }
3727   }
3728 
3729   //////////////////////////////////////////////////////////////////////
3730   // XML query routines.
3731   //////////////////////////////////////////////////////////////////////
3732 
isStandalone()3733   boolean isStandalone()
3734   {
3735     return docIsStandalone;
3736   }
3737 
3738   //
3739   // Elements
3740   //
3741 
getContentType(ElementDecl element, int defaultType)3742   private int getContentType(ElementDecl element, int defaultType)
3743   {
3744     int retval;
3745 
3746     if (element == null)
3747       {
3748         return defaultType;
3749       }
3750     retval = element.contentType;
3751     if (retval == CONTENT_UNDECLARED)
3752       {
3753         retval = defaultType;
3754       }
3755     return retval;
3756   }
3757 
3758   /**
3759    * Look up the content type of an element.
3760    * @param name The element type name.
3761    * @return An integer constant representing the content type.
3762    * @see #CONTENT_UNDECLARED
3763    * @see #CONTENT_ANY
3764    * @see #CONTENT_EMPTY
3765    * @see #CONTENT_MIXED
3766    * @see #CONTENT_ELEMENTS
3767    */
getElementContentType(String name)3768   public int getElementContentType(String name)
3769   {
3770     ElementDecl element = (ElementDecl) elementInfo.get(name);
3771     return getContentType(element, CONTENT_UNDECLARED);
3772   }
3773 
3774   /**
3775    * Register an element.
3776    * Array format:
3777    *  [0] element type name
3778    *  [1] content model (mixed, elements only)
3779    *  [2] attribute hash table
3780    */
setElement(String name, int contentType, String contentModel, HashMap attributes)3781   private void setElement(String name, int contentType,
3782                           String contentModel, HashMap attributes)
3783     throws SAXException
3784   {
3785     if (skippedPE)
3786       {
3787         return;
3788       }
3789 
3790     ElementDecl element = (ElementDecl) elementInfo.get(name);
3791 
3792     // first <!ELEMENT ...> or <!ATTLIST ...> for this type?
3793     if (element == null)
3794       {
3795         element = new ElementDecl();
3796         element.contentType = contentType;
3797         element.contentModel = contentModel;
3798         element.attributes = attributes;
3799         elementInfo.put(name, element);
3800         return;
3801       }
3802 
3803     // <!ELEMENT ...> declaration?
3804     if (contentType != CONTENT_UNDECLARED)
3805       {
3806         // ... following an associated <!ATTLIST ...>
3807         if (element.contentType == CONTENT_UNDECLARED)
3808           {
3809             element.contentType = contentType;
3810             element.contentModel = contentModel;
3811           }
3812         else
3813           {
3814             // VC: Unique Element Type Declaration
3815             handler.verror("multiple declarations for element type: "
3816                            + name);
3817           }
3818       }
3819 
3820     // first <!ATTLIST ...>, before <!ELEMENT ...> ?
3821     else if (attributes != null)
3822       {
3823         element.attributes = attributes;
3824       }
3825   }
3826 
3827   /**
3828    * Look up the attribute hash table for an element.
3829    * The hash table is the second item in the element array.
3830    */
getElementAttributes(String name)3831   private HashMap getElementAttributes(String name)
3832   {
3833     ElementDecl element = (ElementDecl) elementInfo.get(name);
3834     return (element == null) ? null : element.attributes;
3835   }
3836 
3837   //
3838   // Attributes
3839   //
3840 
3841   /**
3842    * Get the declared attributes for an element type.
3843    * @param elname The name of the element type.
3844    * @return An iterator over all the attributes declared for
3845    *   a specific element type.  The results will be valid only
3846    *   after the DTD (if any) has been parsed.
3847    * @see #getAttributeType
3848    * @see #getAttributeEnumeration
3849    * @see #getAttributeDefaultValueType
3850    * @see #getAttributeDefaultValue
3851    * @see #getAttributeExpandedValue
3852    */
declaredAttributes(ElementDecl element)3853   private Iterator declaredAttributes(ElementDecl element)
3854   {
3855     HashMap attlist;
3856 
3857     if (element == null)
3858       {
3859         return null;
3860       }
3861     if ((attlist = element.attributes) == null)
3862       {
3863         return null;
3864       }
3865     return attlist.keySet().iterator();
3866   }
3867 
3868   /**
3869    * Get the declared attributes for an element type.
3870    * @param elname The name of the element type.
3871    * @return An iterator over all the attributes declared for
3872    *   a specific element type.  The results will be valid only
3873    *   after the DTD (if any) has been parsed.
3874    * @see #getAttributeType
3875    * @see #getAttributeEnumeration
3876    * @see #getAttributeDefaultValueType
3877    * @see #getAttributeDefaultValue
3878    * @see #getAttributeExpandedValue
3879    */
declaredAttributes(String elname)3880   public Iterator declaredAttributes(String elname)
3881   {
3882     return declaredAttributes((ElementDecl) elementInfo.get(elname));
3883   }
3884 
3885   /**
3886    * Retrieve the declared type of an attribute.
3887    * @param name The name of the associated element.
3888    * @param aname The name of the attribute.
3889    * @return An interend string denoting the type, or null
3890    *  indicating an undeclared attribute.
3891    */
getAttributeType(String name, String aname)3892   public String getAttributeType(String name, String aname)
3893   {
3894     AttributeDecl attribute = getAttribute(name, aname);
3895     return (attribute == null) ? null : attribute.type;
3896   }
3897 
3898   /**
3899    * Retrieve the allowed values for an enumerated attribute type.
3900    * @param name The name of the associated element.
3901    * @param aname The name of the attribute.
3902    * @return A string containing the token list.
3903    */
getAttributeEnumeration(String name, String aname)3904   public String getAttributeEnumeration(String name, String aname)
3905   {
3906     AttributeDecl attribute = getAttribute(name, aname);
3907     // assert:  attribute.enumeration is "ENUMERATION" or "NOTATION"
3908     return (attribute == null) ? null : attribute.enumeration;
3909   }
3910 
3911   /**
3912    * Retrieve the default value of a declared attribute.
3913    * @param name The name of the associated element.
3914    * @param aname The name of the attribute.
3915    * @return The default value, or null if the attribute was
3916    *   #IMPLIED or simply undeclared and unspecified.
3917    * @see #getAttributeExpandedValue
3918    */
getAttributeDefaultValue(String name, String aname)3919   public String getAttributeDefaultValue(String name, String aname)
3920   {
3921     AttributeDecl attribute = getAttribute(name, aname);
3922     return (attribute == null) ? null : attribute.value;
3923   }
3924 
3925     /*
3926 
3927 // FIXME:  Leaving this in, until W3C finally resolves the confusion
// between parts of the XML 2nd REC about when entity declarations
3929 // are guaranteed to be known.  Current code matches what section 5.1
3930 // (conformance) describes, but some readings of the self-contradicting
3931 // text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that
3932 // attribute expansion/normalization must be deferred in some cases
3933 // (just TRY to identify them!).
3934 
3935      * Retrieve the expanded value of a declared attribute.
3936      * <p>General entities (and char refs) will be expanded (once).
3937      * @param name The name of the associated element.
3938      * @param aname The name of the attribute.
3939      * @return The expanded default value, or null if the attribute was
3940      *   #IMPLIED or simply undeclared
3941      * @see #getAttributeDefaultValue
3942     public String getAttributeExpandedValue (String name, String aname)
3943     throws Exception
3944     {
3945   AttributeDecl attribute = getAttribute (name, aname);
3946 
3947   if (attribute == null) {
3948       return null;
3949   } else if (attribute.defaultValue == null && attribute.value != null) {
3950       // we MUST use the same buf for both quotes else the literal
3951       // can't be properly terminated
3952       char buf [] = new char [1];
3953       int  flags = LIT_ENTITY_REF | LIT_ATTRIBUTE;
3954       String type = getAttributeType (name, aname);
3955 
3956       if (type != "CDATA" && type != null)
3957     flags |= LIT_NORMALIZE;
3958       buf [0] = '"';
3959       pushCharArray (null, buf, 0, 1);
3960       pushString (null, attribute.value);
3961       pushCharArray (null, buf, 0, 1);
3962       attribute.defaultValue = readLiteral (flags);
3963   }
3964   return attribute.defaultValue;
3965     }
3966      */
3967 
3968   /**
3969    * Retrieve the default value mode of a declared attribute.
3970    * @see #ATTRIBUTE_DEFAULT_SPECIFIED
3971    * @see #ATTRIBUTE_DEFAULT_IMPLIED
3972    * @see #ATTRIBUTE_DEFAULT_REQUIRED
3973    * @see #ATTRIBUTE_DEFAULT_FIXED
3974    */
getAttributeDefaultValueType(String name, String aname)3975   public int getAttributeDefaultValueType(String name, String aname)
3976   {
3977     AttributeDecl attribute = getAttribute(name, aname);
3978     return (attribute == null) ? ATTRIBUTE_DEFAULT_UNDECLARED :
3979       attribute.valueType;
3980   }
3981 
3982   /**
3983    * Register an attribute declaration for later retrieval.
3984    * Format:
3985    * - String type
3986    * - String default value
3987    * - int value type
3988    * - enumeration
3989    * - processed default value
3990    */
setAttribute(String elName, String name, String type, String enumeration, String value, int valueType)3991   private void setAttribute(String elName, String name, String type,
3992                             String enumeration, String value, int valueType)
3993     throws Exception
3994   {
3995     HashMap attlist;
3996 
3997     if (skippedPE)
3998       {
3999         return;
4000       }
4001 
4002     // Create a new hashtable if necessary.
4003     attlist = getElementAttributes(elName);
4004     if (attlist == null)
4005       {
4006         attlist = new HashMap();
4007       }
4008 
4009     // ignore multiple attribute declarations!
4010     if (attlist.get(name) != null)
4011       {
4012         // warn ...
4013         return;
4014       }
4015     else
4016       {
4017         AttributeDecl attribute = new AttributeDecl();
4018         attribute.type = type;
4019         attribute.value = value;
4020         attribute.valueType = valueType;
4021         attribute.enumeration = enumeration;
4022         attlist.put(name, attribute);
4023 
4024         // save; but don't overwrite any existing <!ELEMENT ...>
4025         setElement(elName, CONTENT_UNDECLARED, null, attlist);
4026       }
4027   }
4028 
4029   /**
4030    * Retrieve the attribute declaration for the given element name and name.
4031    */
getAttribute(String elName, String name)4032   private AttributeDecl getAttribute(String elName, String name)
4033   {
4034     HashMap attlist = getElementAttributes(elName);
4035     return (attlist == null) ? null : (AttributeDecl) attlist.get(name);
4036   }
4037 
4038   //
4039   // Entities
4040   //
4041 
4042   /**
4043    * Find the type of an entity.
   * @return An integer constant representing the entity type.
4045    * @see #ENTITY_UNDECLARED
4046    * @see #ENTITY_INTERNAL
4047    * @see #ENTITY_NDATA
4048    * @see #ENTITY_TEXT
4049    */
getEntityType(String ename)4050   public int getEntityType(String ename)
4051   {
4052     EntityInfo entity = (EntityInfo) entityInfo.get(ename);
4053     return (entity == null) ?  ENTITY_UNDECLARED : entity.type;
4054   }
4055 
4056   /**
4057    * Return an external entity's identifiers.
4058    * @param ename The name of the external entity.
4059    * @return The entity's public identifier, system identifier, and base URI.
4060    *  Null if the entity was not declared as an external entity.
4061    * @see #getEntityType
4062    */
getEntityIds(String ename)4063   public ExternalIdentifiers getEntityIds(String ename)
4064   {
4065     EntityInfo entity = (EntityInfo) entityInfo.get(ename);
4066     return (entity == null) ? null : entity.ids;
4067   }
4068 
4069   /**
4070    * Return an internal entity's replacement text.
4071    * @param ename The name of the internal entity.
4072    * @return The entity's replacement text, or null if
4073    *   the entity was not declared as an internal entity.
4074    * @see #getEntityType
4075    */
getEntityValue(String ename)4076   public String getEntityValue(String ename)
4077   {
4078     EntityInfo entity = (EntityInfo) entityInfo.get(ename);
4079     return (entity == null) ? null : entity.value;
4080   }
4081 
4082   /**
4083    * Register an entity declaration for later retrieval.
4084    */
setInternalEntity(String eName, String value)4085   private void setInternalEntity(String eName, String value)
4086     throws SAXException
4087   {
4088     if (skippedPE)
4089       {
4090         return;
4091       }
4092 
4093     if (entityInfo.get(eName) == null)
4094       {
4095         EntityInfo entity = new EntityInfo();
4096         entity.type = ENTITY_INTERNAL;
4097         entity.value = value;
4098         entityInfo.put(eName, entity);
4099       }
4100     if (handler.stringInterning)
4101       {
4102         if ("lt" == eName || "gt" == eName || "quot" == eName
4103             || "apos" == eName || "amp" == eName)
4104           {
4105             return;
4106           }
4107       }
4108     else
4109       {
4110         if ("lt".equals(eName) || "gt".equals(eName) || "quot".equals(eName)
4111             || "apos".equals(eName) || "amp".equals(eName))
4112           {
4113             return;
4114           }
4115       }
4116     handler.getDeclHandler().internalEntityDecl(eName, value);
4117   }
4118 
4119   /**
4120    * Register an external entity declaration for later retrieval.
4121    */
setExternalEntity(String eName, int eClass, ExternalIdentifiers ids, String nName)4122   private void setExternalEntity(String eName, int eClass,
4123                                  ExternalIdentifiers ids, String nName)
4124   {
4125     if (entityInfo.get(eName) == null)
4126       {
4127         EntityInfo entity = new EntityInfo();
4128         entity.type = eClass;
4129         entity.ids = ids;
4130         entity.notationName = nName;
4131         entityInfo.put(eName, entity);
4132       }
4133   }
4134 
4135   //
4136   // Notations.
4137   //
4138 
4139   /**
4140    * Report a notation declaration, checking for duplicates.
4141    */
setNotation(String nname, ExternalIdentifiers ids)4142   private void setNotation(String nname, ExternalIdentifiers ids)
4143     throws SAXException
4144   {
4145     if (skippedPE)
4146       {
4147         return;
4148       }
4149 
4150     handler.notationDecl(nname, ids.publicId, ids.systemId, ids.baseUri);
4151     if (notationInfo.get(nname) == null)
4152       {
4153         notationInfo.put(nname, nname);
4154       }
4155     else
4156       {
4157         // VC: Unique Notation Name
4158         handler.verror("Duplicate notation name decl: " + nname);
4159       }
4160   }
4161 
4162   //
4163   // Location.
4164   //
4165 
4166   /**
4167    * Return the current line number.
4168    */
getLineNumber()4169   public int getLineNumber()
4170   {
4171     return line;
4172   }
4173 
4174   /**
4175    * Return the current column number.
4176    */
getColumnNumber()4177   public int getColumnNumber()
4178   {
4179     return column;
4180   }
4181 
4182   //////////////////////////////////////////////////////////////////////
4183   // High-level I/O.
4184   //////////////////////////////////////////////////////////////////////
4185 
4186   /**
4187    * Read a single character from the readBuffer.
4188    * <p>The readDataChunk () method maintains the buffer.
4189    * <p>If we hit the end of an entity, try to pop the stack and
4190    * keep going.
4191    * <p> (This approach doesn't really enforce XML's rules about
4192    * entity boundaries, but this is not currently a validating
4193    * parser).
4194    * <p>This routine also attempts to keep track of the current
4195    * position in external entities, but it's not entirely accurate.
4196    * @return The next available input character.
4197    * @see #unread (char)
4198    * @see #readDataChunk
4199    * @see #readBuffer
4200    * @see #line
4201    * @return The next character from the current input source.
4202    */
readCh()4203   private char readCh()
4204     throws SAXException, IOException
4205   {
4206     // As long as there's nothing in the
4207     // read buffer, try reading more data
4208     // (for an external entity) or popping
4209     // the entity stack (for either).
4210     while (readBufferPos >= readBufferLength)
4211       {
4212         switch (sourceType)
4213           {
4214           case INPUT_READER:
4215           case INPUT_STREAM:
4216             readDataChunk();
4217             while (readBufferLength < 1)
4218               {
4219                 popInput();
4220                 if (readBufferLength < 1)
4221                   {
4222                     readDataChunk();
4223                   }
4224               }
4225             break;
4226 
4227           default:
4228 
4229             popInput();
4230             break;
4231           }
4232       }
4233 
4234     char c = readBuffer[readBufferPos++];
4235 
4236     if (c == '\n')
4237       {
4238         line++;
4239         column = 0;
4240       }
4241     else
4242       {
4243         if (c == '<')
4244           {
4245             /* the most common return to parseContent () ... NOP */
4246           }
4247         else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
4248                  || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
4249                      && xmlVersion == XML_11))
4250           {
4251             error("illegal XML character U+" + Integer.toHexString(c));
4252           }
4253 
4254         // If we're in the DTD and in a context where PEs get expanded,
4255         // do so ... 1/14/2000 errata identify those contexts.  There
4256         // are also spots in the internal subset where PE refs are fatal
4257         // errors, hence yet another flag.
4258         else if (c == '%' && expandPE)
4259           {
4260             if (peIsError)
4261               {
4262                 error("PE reference within decl in internal subset.");
4263               }
4264             parsePEReference();
4265             return readCh();
4266           }
4267         column++;
4268       }
4269 
4270     return c;
4271   }
4272 
4273   /**
4274    * Push a single character back onto the current input stream.
4275    * <p>This method usually pushes the character back onto
4276    * the readBuffer.
4277    * <p>I don't think that this would ever be called with
4278    * readBufferPos = 0, because the methods always reads a character
4279    * before unreading it, but just in case, I've added a boundary
4280    * condition.
4281    * @param c The character to push back.
4282    * @see #readCh
4283    * @see #unread (char[])
4284    * @see #readBuffer
4285    */
unread(char c)4286   private void unread(char c)
4287     throws SAXException
4288   {
4289     // Normal condition.
4290     if (c == '\n')
4291       {
4292         line--;
4293         column = -1;
4294       }
4295     if (readBufferPos > 0)
4296       {
4297         readBuffer[--readBufferPos] = c;
4298       }
4299     else
4300       {
4301         pushString(null, Character.toString(c));
4302       }
4303   }
4304 
4305   /**
4306    * Push a char array back onto the current input stream.
4307    * <p>NOTE: you must <em>never</em> push back characters that you
4308    * haven't actually read: use pushString () instead.
4309    * @see #readCh
4310    * @see #unread (char)
4311    * @see #readBuffer
4312    * @see #pushString
4313    */
unread(char[] ch, int length)4314   private void unread(char[] ch, int length)
4315     throws SAXException
4316   {
4317     for (int i = 0; i < length; i++)
4318       {
4319         if (ch[i] == '\n')
4320           {
4321             line--;
4322             column = -1;
4323           }
4324       }
4325     if (length < readBufferPos)
4326       {
4327         readBufferPos -= length;
4328       }
4329     else
4330       {
4331         pushCharArray(null, ch, 0, length);
4332       }
4333   }
4334 
4335   /**
4336    * Push, or skip, a new external input source.
4337    * The source will be some kind of parsed entity, such as a PE
4338    * (including the external DTD subset) or content for the body.
4339    *
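   * @param isPE True if the entity is a parameter entity.
   * @param ename The name of the entity.
   * @param ids The public ID, system ID and base URI of the entity.
   * @param reader A character stream to read from, or null; used only
   *   when doResolve is false.
   * @param stream A byte stream to read from, or null; used only
   *   when doResolve is false.
   * @param encoding The encoding of the byte stream, or null; used only
   *   when doResolve is false.
   * @param doResolve True to ask the handler to resolve the entity,
   *   false to use the reader/stream/encoding arguments directly.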
4341    * @see SAXDriver#resolveEntity
4342    * @see #pushString
4343    * @see #sourceType
4344    * @see #pushInput
4345    * @see #detectEncoding
4346    * @see #sourceType
4347    * @see #readBuffer
4348    */
pushURL(boolean isPE, String ename, ExternalIdentifiers ids, Reader reader, InputStream stream, String encoding, boolean doResolve)4349   private void pushURL(boolean isPE,
4350                        String ename,
4351                        ExternalIdentifiers ids,
4352                        Reader reader,
4353                        InputStream stream,
4354                        String encoding,
4355                        boolean doResolve)
4356     throws SAXException, IOException
4357   {
4358     boolean ignoreEncoding;
4359     String systemId;
4360     InputSource source;
4361 
4362     if (!isPE)
4363       {
4364         dataBufferFlush();
4365       }
4366 
4367     scratch.setPublicId(ids.publicId);
4368     scratch.setSystemId(ids.systemId);
4369 
4370     // See if we should skip or substitute the entity.
4371     // If we're not skipping, resolving reports startEntity()
4372     // and updates the (handler's) stack of URIs.
4373     if (doResolve)
4374       {
4375         // assert (stream == null && reader == null && encoding == null)
4376         source = handler.resolveEntity(isPE, ename, scratch, ids.baseUri);
4377         if (source == null)
4378           {
4379             handler.warn("skipping entity: " + ename);
4380             handler.skippedEntity(ename);
4381             if (isPE)
4382               {
4383                 skippedPE = true;
4384               }
4385             return;
4386           }
4387 
4388         // we might be using alternate IDs/encoding
4389         systemId = source.getSystemId();
4390         // The following warning and setting systemId was deleted bcause
4391         // the application has the option of not setting systemId
4392         // provided that it has set the characte/byte stream.
4393         /*
4394            if (systemId == null) {
4395            handler.warn ("missing system ID, using " + ids.systemId);
4396            systemId = ids.systemId;
4397            }
4398          */
4399       }
4400     else
4401       {
4402         // "[document]", or "[dtd]" via getExternalSubset()
4403         scratch.setCharacterStream(reader);
4404         scratch.setByteStream(stream);
4405         scratch.setEncoding(encoding);
4406         source = scratch;
4407         systemId = ids.systemId;
4408         if (handler.stringInterning)
4409           {
4410             handler.startExternalEntity(ename, systemId,
4411                                         "[document]" == ename);
4412           }
4413         else
4414           {
4415             handler.startExternalEntity(ename, systemId,
4416                                         "[document]".equals(ename));
4417           }
4418       }
4419 
4420     // we may have been given I/O streams directly
4421     if (source.getCharacterStream() != null)
4422       {
4423         if (source.getByteStream() != null)
4424           error("InputSource has two streams!");
4425         reader = source.getCharacterStream();
4426       }
4427     else if (source.getByteStream() != null)
4428       {
4429         encoding = source.getEncoding();
4430         if (encoding == null)
4431           {
4432             stream = source.getByteStream();
4433           }
4434         else
4435           {
4436             try
4437               {
4438                 reader = new InputStreamReader(source.getByteStream(),
4439                                                encoding);
4440               }
4441             catch (IOException e)
4442               {
4443                 stream = source.getByteStream();
4444               }
4445           }
4446       }
4447     else if (systemId == null)
4448       {
4449         error("InputSource has no URI!");
4450       }
4451     scratch.setCharacterStream(null);
4452     scratch.setByteStream(null);
4453     scratch.setEncoding(null);
4454 
4455     // Push the existing status.
4456     pushInput(ename);
4457 
4458     // Create a new read buffer.
4459     // (Note the four-character margin)
4460     readBuffer = new char[READ_BUFFER_MAX + 4];
4461     readBufferPos = 0;
4462     readBufferLength = 0;
4463     readBufferOverflow = -1;
4464     is = null;
4465     line = 1;
4466     column = 0;
4467     currentByteCount = 0;
4468 
4469     // If there's an explicit character stream, just
4470     // ignore encoding declarations.
4471     if (reader != null)
4472       {
4473         sourceType = INPUT_READER;
4474         this.reader = reader;
4475         tryEncodingDecl(true);
4476         return;
4477       }
4478 
4479     // Else we handle the conversion, and need to ensure
4480     // it's done right.
4481     sourceType = INPUT_STREAM;
4482     if (stream != null)
4483       {
4484         is = stream;
4485       }
4486     else
4487       {
4488         // We have to open our own stream to the URL.
4489         URL url = new URL(systemId);
4490 
4491         externalEntity = url.openConnection();
4492         externalEntity.connect();
4493         is = externalEntity.getInputStream();
4494       }
4495 
4496     // If we get to here, there must be
4497     // an InputStream available.
4498     if (!is.markSupported())
4499       {
4500         is = new BufferedInputStream(is);
4501       }
4502 
4503     // Get any external encoding label.
4504     if (encoding == null && externalEntity != null)
4505       {
4506         // External labels can be untrustworthy; filesystems in
4507         // particular often have the wrong default for content
4508         // that wasn't locally originated.  Those we autodetect.
4509         if (!"file".equals(externalEntity.getURL().getProtocol()))
4510           {
4511             int temp;
4512 
4513             // application/xml;charset=something;otherAttr=...
4514             // ... with many variants on 'something'
4515             encoding = externalEntity.getContentType();
4516 
4517             // MHK code (fix for Saxon 5.5.1/007):
4518             // protect against encoding==null
4519             if (encoding == null)
4520               {
4521                 temp = -1;
4522               }
4523             else
4524               {
4525                 temp = encoding.indexOf("charset");
4526               }
4527 
4528             // RFC 2376 sez MIME text defaults to ASCII, but since the
4529             // JDK will create a MIME type out of thin air, we always
4530             // autodetect when there's no explicit charset attribute.
4531             if (temp < 0)
4532               {
4533                 encoding = null;  // autodetect
4534               }
4535             else
4536               {
4537                 // only this one attribute
4538                 if ((temp = encoding.indexOf(';')) > 0)
4539                   {
4540                     encoding = encoding.substring(0, temp);
4541                   }
4542 
4543                 if ((temp = encoding.indexOf('=', temp + 7)) > 0)
4544                   {
4545                     encoding = encoding.substring(temp + 1);
4546 
4547                     // attributes can have comment fields (RFC 822)
4548                     if ((temp = encoding.indexOf('(')) > 0)
4549                       {
4550                         encoding = encoding.substring(0, temp);
4551                       }
4552                     // ... and values may be quoted
4553                     if ((temp = encoding.indexOf('"')) > 0)
4554                       {
4555                         encoding =
4556                           encoding.substring(temp + 1,
4557                                              encoding.indexOf('"', temp + 2));
4558                       }
4559                     encoding = encoding.trim();
4560                   }
4561                 else
4562                   {
4563                     handler.warn("ignoring illegal MIME attribute: "
4564                                  + encoding);
4565                     encoding = null;
4566                   }
4567               }
4568           }
4569       }
4570 
4571     // if we got an external encoding label, use it ...
4572     if (encoding != null)
4573       {
4574         this.encoding = ENCODING_EXTERNAL;
4575         setupDecoding(encoding);
4576         ignoreEncoding = true;
4577 
4578         // ... else autodetect from first bytes.
4579       }
4580     else
4581       {
4582         detectEncoding();
4583         ignoreEncoding = false;
4584       }
4585 
4586     // Read any XML or text declaration.
4587     // If we autodetected, it may tell us the "real" encoding.
4588     try
4589       {
4590         tryEncodingDecl(ignoreEncoding);
4591       }
4592     catch (UnsupportedEncodingException x)
4593       {
4594         encoding = x.getMessage();
4595 
4596         // if we don't handle the declared encoding,
4597         // try letting a JVM InputStreamReader do it
4598         try
4599           {
4600             if (sourceType != INPUT_STREAM)
4601               {
4602                 throw x;
4603               }
4604 
4605             is.reset();
4606             readBufferPos = 0;
4607             readBufferLength = 0;
4608             readBufferOverflow = -1;
4609             line = 1;
4610             currentByteCount = column = 0;
4611 
4612             sourceType = INPUT_READER;
4613             this.reader = new InputStreamReader(is, encoding);
4614             is = null;
4615 
4616             tryEncodingDecl(true);
4617 
4618           }
4619         catch (IOException e)
4620           {
4621             error("unsupported text encoding",
4622                   encoding,
4623                   null);
4624           }
4625       }
4626   }
4627 
4628   /**
4629    * Check for an encoding declaration.  This is the second part of the
4630    * XML encoding autodetection algorithm, relying on detectEncoding to
4631    * get to the point that this part can read any encoding declaration
4632    * in the document (using only US-ASCII characters).
4633    *
4634    * <p> Because this part starts to fill parser buffers with this data,
4635    * it's tricky to setup a reader so that Java's built-in decoders can be
4636    * used for the character encodings that aren't built in to this parser
4637    * (such as EUC-JP, KOI8-R, Big5, etc).
4638    *
4639    * @return any encoding in the declaration, uppercased; or null
   * @see #detectEncoding
4641    */
tryEncodingDecl(boolean ignoreEncoding)4642   private String tryEncodingDecl(boolean ignoreEncoding)
4643     throws SAXException, IOException
4644   {
4645     // Read the XML/text declaration.
4646     if (tryRead("<?xml"))
4647       {
4648         if (tryWhitespace())
4649           {
4650             if (inputStack.size() > 0)
4651               {
4652                 return parseTextDecl(ignoreEncoding);
4653               }
4654             else
4655               {
4656                 return parseXMLDecl(ignoreEncoding);
4657               }
4658           }
4659         else
4660           {
4661             // <?xml-stylesheet ...?> or similar
4662             unread('l');
4663             unread('m');
4664             unread('x');
4665             unread('?');
4666             unread('<');
4667           }
4668       }
4669     return null;
4670   }
4671 
4672   /**
4673    * Attempt to detect the encoding of an entity.
4674    * <p>The trick here (as suggested in the XML standard) is that
4675    * any entity not in UTF-8, or in UCS-2 with a byte-order mark,
4676    * <b>must</b> begin with an XML declaration or an encoding
4677    * declaration; we simply have to look for "&lt;?xml" in various
4678    * encodings.
4679    * <p>This method has no way to distinguish among 8-bit encodings.
4680    * Instead, it sets up for UTF-8, then (possibly) revises its assumption
4681    * later in setupDecoding ().  Any ASCII-derived 8-bit encoding
4682    * should work, but most will be rejected later by setupDecoding ().
4683    * @see #tryEncoding (byte[], byte, byte, byte, byte)
4684    * @see #tryEncoding (byte[], byte, byte)
4685    * @see #setupDecoding
4686    */
detectEncoding()4687   private void detectEncoding()
4688     throws SAXException, IOException
4689   {
4690     byte[] signature = new byte[4];
4691 
4692     // Read the first four bytes for
4693     // autodetection.
4694     is.mark(4);
4695     is.read(signature);
4696     is.reset();
4697 
4698     //
4699     // FIRST:  four byte encodings (who uses these?)
4700     //
4701     if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4702                     (byte) 0x00, (byte) 0x3c))
4703       {
4704         // UCS-4 must begin with "<?xml"
4705         // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
4706         // "UTF-32BE"
4707         encoding = ENCODING_UCS_4_1234;
4708       }
4709     else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4710                          (byte) 0x00, (byte) 0x00))
4711       {
4712         // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
4713         // "UTF-32LE"
4714         encoding = ENCODING_UCS_4_4321;
4715       }
4716     else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4717                          (byte) 0x3c, (byte) 0x00))
4718       {
4719         // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
4720         encoding = ENCODING_UCS_4_2143;
4721       }
4722     else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4723                          (byte) 0x00, (byte) 0x00))
4724       {
4725         // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
4726         encoding = ENCODING_UCS_4_3412;
4727 
4728         // 00 00 fe ff UCS_4_1234 (with BOM)
4729         // ff fe 00 00 UCS_4_4321 (with BOM)
4730       }
4731 
4732     //
4733     // SECOND:  two byte encodings
4734     // note ... with 1/14/2000 errata the XML spec identifies some
4735     // more "broken UTF-16" autodetection cases, with no XML decl,
4736     // which we don't handle here (that's legal too).
4737     //
4738     else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff))
4739       {
4740         // UCS-2 with a byte-order marker. (UTF-16)
4741         // 0xfe 0xff: UCS-2, big-endian (12)
4742         encoding = ENCODING_UCS_2_12;
4743         is.read(); is.read();
4744       }
4745     else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe))
4746       {
4747         // UCS-2 with a byte-order marker. (UTF-16)
4748         // 0xff 0xfe: UCS-2, little-endian (21)
4749         encoding = ENCODING_UCS_2_21;
4750         is.read(); is.read();
4751       }
4752     else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4753                          (byte) 0x00, (byte) 0x3f))
4754       {
4755         // UTF-16BE (otherwise, malformed UTF-16)
4756         // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
4757         encoding = ENCODING_UCS_2_12;
4758         error("no byte-order mark for UCS-2 entity");
4759       }
4760     else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4761                          (byte) 0x3f, (byte) 0x00))
4762       {
4763         // UTF-16LE (otherwise, malformed UTF-16)
4764         // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
4765         encoding = ENCODING_UCS_2_21;
4766         error("no byte-order mark for UCS-2 entity");
4767       }
4768 
4769     //
4770     // THIRD:  ASCII-derived encodings, fixed and variable lengths
4771     //
4772     else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f,
4773                          (byte) 0x78, (byte) 0x6d))
4774       {
4775         // ASCII derived
4776         // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
4777         encoding = ENCODING_UTF_8;
4778         prefetchASCIIEncodingDecl();
4779       }
4780     else if (signature[0] == (byte) 0xef
4781              && signature[1] == (byte) 0xbb
4782              && signature[2] == (byte) 0xbf)
4783       {
4784         // 0xef 0xbb 0xbf: UTF-8 BOM (not part of document text)
4785         // this un-needed notion slipped into XML 2nd ed through a
4786         // "non-normative" erratum; now required by MSFT and UDDI,
4787         // and E22 made it normative.
4788         encoding = ENCODING_UTF_8;
4789         is.read(); is.read(); is.read();
4790       }
4791     else
4792       {
4793         // 4c 6f a7 94 ... we don't understand EBCDIC flavors
4794         // ... but we COULD at least kick in some fixed code page
4795 
4796         // (default) UTF-8 without encoding/XML declaration
4797         encoding = ENCODING_UTF_8;
4798       }
4799   }
4800 
4801   /**
4802    * Check for a four-byte signature.
4803    * <p>Utility routine for detectEncoding ().
4804    * <p>Always looks for some part of "<?XML" in a specific encoding.
4805    * @param sig The first four bytes read.
4806    * @param b1 The first byte of the signature
4807    * @param b2 The second byte of the signature
4808    * @param b3 The third byte of the signature
4809    * @param b4 The fourth byte of the signature
4810    * @see #detectEncoding
4811    */
tryEncoding(byte[] sig, byte b1, byte b2, byte b3, byte b4)4812   private static boolean tryEncoding(byte[] sig, byte b1, byte b2,
4813                                      byte b3, byte b4)
4814   {
4815     return (sig[0] == b1 && sig[1] == b2
4816             && sig[2] == b3 && sig[3] == b4);
4817   }
4818 
4819   /**
4820    * Check for a two-byte signature.
4821    * <p>Looks for a UCS-2 byte-order mark.
4822    * <p>Utility routine for detectEncoding ().
4823    * @param sig The first four bytes read.
4824    * @param b1 The first byte of the signature
4825    * @param b2 The second byte of the signature
4826    * @see #detectEncoding
4827    */
tryEncoding(byte[] sig, byte b1, byte b2)4828   private static boolean tryEncoding(byte[] sig, byte b1, byte b2)
4829   {
4830     return ((sig[0] == b1) && (sig[1] == b2));
4831   }
4832 
4833   /**
4834    * This method pushes a string back onto input.
4835    * <p>It is useful either as the expansion of an internal entity,
4836    * or for backtracking during the parse.
4837    * <p>Call pushCharArray () to do the actual work.
4838    * @param s The string to push back onto input.
4839    * @see #pushCharArray
4840    */
pushString(String ename, String s)4841   private void pushString(String ename, String s)
4842     throws SAXException
4843   {
4844     char[] ch = s.toCharArray();
4845     pushCharArray(ename, ch, 0, ch.length);
4846   }
4847 
4848   /**
4849    * Push a new internal input source.
4850    * <p>This method is useful for expanding an internal entity,
4851    * or for unreading a string of characters.  It creates a new
4852    * readBuffer containing the characters in the array, instead
4853    * of characters converted from an input byte stream.
4854    * @param ch The char array to push.
4855    * @see #pushString
4856    * @see #pushURL
4857    * @see #readBuffer
4858    * @see #sourceType
4859    * @see #pushInput
4860    */
pushCharArray(String ename, char[] ch, int start, int length)4861   private void pushCharArray(String ename, char[] ch, int start, int length)
4862     throws SAXException
4863   {
4864     // Push the existing status
4865     pushInput(ename);
4866     if (ename != null && doReport)
4867       {
4868         dataBufferFlush();
4869         handler.startInternalEntity(ename);
4870       }
4871     sourceType = INPUT_INTERNAL;
4872     readBuffer = ch;
4873     readBufferPos = start;
4874     readBufferLength = length;
4875     readBufferOverflow = -1;
4876   }
4877 
4878   /**
4879    * Save the current input source onto the stack.
4880    * <p>This method saves all of the global variables associated with
4881    * the current input source, so that they can be restored when a new
4882    * input source has finished.  It also tests for entity recursion.
4883    * <p>The method saves the following global variables onto a stack
4884    * using a fixed-length array:
4885    * <ol>
4886    * <li>sourceType
4887    * <li>externalEntity
4888    * <li>readBuffer
4889    * <li>readBufferPos
4890    * <li>readBufferLength
4891    * <li>line
4892    * <li>encoding
4893    * </ol>
4894    * @param ename The name of the entity (if any) causing the new input.
4895    * @see #popInput
4896    * @see #sourceType
4897    * @see #externalEntity
4898    * @see #readBuffer
4899    * @see #readBufferPos
4900    * @see #readBufferLength
4901    * @see #line
4902    * @see #encoding
4903    */
pushInput(String ename)4904   private void pushInput(String ename)
4905     throws SAXException
4906   {
4907     // Check for entity recursion.
4908     if (ename != null)
4909       {
4910         Iterator entities = entityStack.iterator();
4911         while (entities.hasNext())
4912           {
4913             String e = (String) entities.next();
4914             if (e != null && e == ename)
4915               {
4916                 error("recursive reference to entity", ename, null);
4917               }
4918           }
4919       }
4920     entityStack.addLast(ename);
4921 
4922     // Don't bother if there is no current input.
4923     if (sourceType == INPUT_NONE)
4924       {
4925         return;
4926       }
4927 
4928     // Set up a snapshot of the current
4929     // input source.
4930     Input input = new Input();
4931 
4932     input.sourceType = sourceType;
4933     input.externalEntity = externalEntity;
4934     input.readBuffer = readBuffer;
4935     input.readBufferPos = readBufferPos;
4936     input.readBufferLength = readBufferLength;
4937     input.line = line;
4938     input.encoding = encoding;
4939     input.readBufferOverflow = readBufferOverflow;
4940     input.is = is;
4941     input.currentByteCount = currentByteCount;
4942     input.column = column;
4943     input.reader = reader;
4944 
4945     // Push it onto the stack.
4946     inputStack.addLast(input);
4947   }
4948 
4949   /**
4950    * Restore a previous input source.
4951    * <p>This method restores all of the global variables associated with
4952    * the current input source.
4953    * @exception java.io.EOFException
4954    *    If there are no more entries on the input stack.
4955    * @see #pushInput
4956    * @see #sourceType
4957    * @see #externalEntity
4958    * @see #readBuffer
4959    * @see #readBufferPos
4960    * @see #readBufferLength
4961    * @see #line
4962    * @see #encoding
4963    */
popInput()4964   private void popInput()
4965     throws SAXException, IOException
4966   {
4967     String ename = (String) entityStack.removeLast();
4968 
4969     if (ename != null && doReport)
4970       {
4971         dataBufferFlush();
4972       }
4973     switch (sourceType)
4974       {
4975       case INPUT_STREAM:
4976         handler.endExternalEntity(ename);
4977         is.close();
4978         break;
4979       case INPUT_READER:
4980         handler.endExternalEntity(ename);
4981         reader.close();
4982         break;
4983       case INPUT_INTERNAL:
4984         if (ename != null && doReport)
4985           {
4986             handler.endInternalEntity(ename);
4987           }
4988         break;
4989       }
4990 
4991     // Throw an EOFException if there
4992     // is nothing else to pop.
4993     if (inputStack.isEmpty())
4994       {
4995         throw new EOFException("no more input");
4996       }
4997 
4998     Input input = (Input) inputStack.removeLast();
4999 
5000     sourceType = input.sourceType;
5001     externalEntity = input.externalEntity;
5002     readBuffer = input.readBuffer;
5003     readBufferPos = input.readBufferPos;
5004     readBufferLength = input.readBufferLength;
5005     line = input.line;
5006     encoding = input.encoding;
5007     readBufferOverflow = input.readBufferOverflow;
5008     is = input.is;
5009     currentByteCount = input.currentByteCount;
5010     column = input.column;
5011     reader = input.reader;
5012   }
5013 
5014   /**
5015    * Return true if we can read the expected character.
5016    * <p>Note that the character will be removed from the input stream
5017    * on success, but will be put back on failure.  Do not attempt to
5018    * read the character again if the method succeeds.
5019    * @param delim The character that should appear next.  For a
5020    *        insensitive match, you must supply this in upper-case.
5021    * @return true if the character was successfully read, or false if
5022    *   it was not.
5023    * @see #tryRead (String)
5024    */
tryRead(char delim)5025   private boolean tryRead(char delim)
5026     throws SAXException, IOException
5027   {
5028     char c;
5029 
5030     // Read the character
5031     c = readCh();
5032 
5033     // Test for a match, and push the character
5034     // back if the match fails.
5035     if (c == delim)
5036       {
5037         return true;
5038       }
5039     else
5040       {
5041         unread(c);
5042         return false;
5043       }
5044   }
5045 
5046   /**
5047    * Return true if we can read the expected string.
5048    * <p>This is simply a convenience method.
5049    * <p>Note that the string will be removed from the input stream
5050    * on success, but will be put back on failure.  Do not attempt to
5051    * read the string again if the method succeeds.
5052    * <p>This method will push back a character rather than an
5053    * array whenever possible (probably the majority of cases).
5054    * @param delim The string that should appear next.
5055    * @return true if the string was successfully read, or false if
5056    *   it was not.
5057    * @see #tryRead (char)
5058    */
tryRead(String delim)5059   private boolean tryRead(String delim)
5060     throws SAXException, IOException
5061   {
5062     return tryRead(delim.toCharArray());
5063   }
5064 
tryRead(char[] ch)5065   private boolean tryRead(char[] ch)
5066     throws SAXException, IOException
5067   {
5068     char c;
5069 
5070     // Compare the input, character-
5071     // by character.
5072 
5073     for (int i = 0; i < ch.length; i++)
5074       {
5075         c = readCh();
5076         if (c != ch[i])
5077           {
5078             unread(c);
5079             if (i != 0)
5080               {
5081                 unread(ch, i);
5082               }
5083             return false;
5084           }
5085       }
5086     return true;
5087   }
5088 
5089   /**
5090    * Return true if we can read some whitespace.
5091    * <p>This is simply a convenience method.
5092    * <p>This method will push back a character rather than an
5093    * array whenever possible (probably the majority of cases).
5094    * @return true if whitespace was found.
5095    */
tryWhitespace()5096   private boolean tryWhitespace()
5097     throws SAXException, IOException
5098   {
5099     char c;
5100     c = readCh();
5101     if (isWhitespace(c))
5102       {
5103         skipWhitespace();
5104         return true;
5105       }
5106     else
5107       {
5108         unread(c);
5109         return false;
5110       }
5111   }
5112 
5113   /**
5114    * Read all data until we find the specified string.
5115    * This is useful for scanning CDATA sections and PIs.
5116    * <p>This is inefficient right now, since it calls tryRead ()
5117    * for every character.
5118    * @param delim The string delimiter
5119    * @see #tryRead (String, boolean)
5120    * @see #readCh
5121    */
parseUntil(String delim)5122   private void parseUntil(String delim)
5123     throws SAXException, IOException
5124   {
5125     parseUntil(delim.toCharArray());
5126   }
5127 
parseUntil(char[] delim)5128   private void parseUntil(char[] delim)
5129     throws SAXException, IOException
5130   {
5131     char c;
5132     int startLine = line;
5133 
5134     try
5135       {
5136         while (!tryRead(delim))
5137           {
5138             c = readCh();
5139             dataBufferAppend(c);
5140           }
5141       }
5142     catch (EOFException e)
5143       {
5144         error("end of input while looking for delimiter "
5145               + "(started on line " + startLine
5146               + ')', null, new String(delim));
5147       }
5148   }
5149 
5150   //////////////////////////////////////////////////////////////////////
5151   // Low-level I/O.
5152   //////////////////////////////////////////////////////////////////////
5153 
5154   /**
5155    * Prefetch US-ASCII XML/text decl from input stream into read buffer.
5156    * Doesn't buffer more than absolutely needed, so that when an encoding
5157    * decl says we need to create an InputStreamReader, we can discard our
5158    * buffer and reset().  Caller knows the first chars of the decl exist
5159    * in the input stream.
5160    */
prefetchASCIIEncodingDecl()5161   private void prefetchASCIIEncodingDecl()
5162     throws SAXException, IOException
5163   {
5164     int ch;
5165     readBufferPos = readBufferLength = 0;
5166 
5167     is.mark(readBuffer.length);
5168     while (true)
5169       {
5170         ch = is.read();
5171         readBuffer[readBufferLength++] = (char) ch;
5172         switch (ch)
5173           {
5174           case (int) '>':
5175             return;
5176           case -1:
5177             error("file ends before end of XML or encoding declaration.",
5178                   null, "?>");
5179           }
5180         if (readBuffer.length == readBufferLength)
5181           {
5182             error("unfinished XML or encoding declaration");
5183           }
5184       }
5185   }
5186 
5187   /**
5188    * Read a chunk of data from an external input source.
5189    * <p>This is simply a front-end that fills the rawReadBuffer
5190    * with bytes, then calls the appropriate encoding handler.
5191    * @see #encoding
5192    * @see #rawReadBuffer
5193    * @see #readBuffer
5194    * @see #filterCR
5195    * @see #copyUtf8ReadBuffer
5196    * @see #copyIso8859_1ReadBuffer
5197    * @see #copyUcs_2ReadBuffer
5198    * @see #copyUcs_4ReadBuffer
5199    */
readDataChunk()5200   private void readDataChunk()
5201     throws SAXException, IOException
5202   {
5203     int count;
5204 
5205     // See if we have any overflow (filterCR sets for CR at end)
5206     if (readBufferOverflow > -1)
5207       {
5208         readBuffer[0] = (char) readBufferOverflow;
5209         readBufferOverflow = -1;
5210         readBufferPos = 1;
5211         sawCR = true;
5212       }
5213     else
5214       {
5215         readBufferPos = 0;
5216         sawCR = false;
5217       }
5218 
5219     // input from a character stream.
5220     if (sourceType == INPUT_READER)
5221       {
5222         count = reader.read(readBuffer,
5223                             readBufferPos, READ_BUFFER_MAX - readBufferPos);
5224         if (count < 0)
5225           {
5226             readBufferLength = readBufferPos;
5227           }
5228         else
5229           {
5230             readBufferLength = readBufferPos + count;
5231           }
5232         if (readBufferLength > 0)
5233           {
5234             filterCR(count >= 0);
5235           }
5236         sawCR = false;
5237         return;
5238       }
5239 
5240     // Read as many bytes as possible into the raw buffer.
5241     count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX);
5242 
5243     // Dispatch to an encoding-specific reader method to populate
5244     // the readBuffer.  In most parser speed profiles, these routines
5245     // show up at the top of the CPU usage chart.
5246     if (count > 0)
5247       {
5248         switch (encoding)
5249           {
5250             // one byte builtins
5251           case ENCODING_ASCII:
5252             copyIso8859_1ReadBuffer(count, (char) 0x0080);
5253             break;
5254           case ENCODING_UTF_8:
5255             copyUtf8ReadBuffer(count);
5256             break;
5257           case ENCODING_ISO_8859_1:
5258             copyIso8859_1ReadBuffer(count, (char) 0);
5259             break;
5260 
5261             // two byte builtins
5262           case ENCODING_UCS_2_12:
5263             copyUcs2ReadBuffer(count, 8, 0);
5264             break;
5265           case ENCODING_UCS_2_21:
5266             copyUcs2ReadBuffer(count, 0, 8);
5267             break;
5268 
5269             // four byte builtins
5270           case ENCODING_UCS_4_1234:
5271             copyUcs4ReadBuffer(count, 24, 16, 8, 0);
5272             break;
5273           case ENCODING_UCS_4_4321:
5274             copyUcs4ReadBuffer(count, 0, 8, 16, 24);
5275             break;
5276           case ENCODING_UCS_4_2143:
5277             copyUcs4ReadBuffer(count, 16, 24, 0, 8);
5278             break;
5279           case ENCODING_UCS_4_3412:
5280             copyUcs4ReadBuffer(count, 8, 0, 24, 16);
5281             break;
5282           }
5283       }
5284     else
5285       {
5286         readBufferLength = readBufferPos;
5287       }
5288 
5289     readBufferPos = 0;
5290 
5291     // Filter out all carriage returns if we've seen any
5292     // (including any saved from a previous read)
5293     if (sawCR)
5294       {
5295         filterCR(count >= 0);
5296         sawCR = false;
5297 
5298         // must actively report EOF, lest some CRs get lost.
5299         if (readBufferLength == 0 && count >= 0)
5300           {
5301             readDataChunk();
5302           }
5303       }
5304 
5305     if (count > 0)
5306       {
5307         currentByteCount += count;
5308       }
5309   }
5310 
5311   /**
5312    * Filter carriage returns in the read buffer.
5313    * CRLF becomes LF; CR becomes LF.
5314    * @param moreData true iff more data might come from the same source
5315    * @see #readDataChunk
5316    * @see #readBuffer
5317    * @see #readBufferOverflow
5318    */
filterCR(boolean moreData)5319   private void filterCR(boolean moreData)
5320   {
5321     int i, j;
5322 
5323     readBufferOverflow = -1;
5324 
5325 loop:
5326     for (i = j = readBufferPos; j < readBufferLength; i++, j++)
5327       {
5328         switch (readBuffer[j])
5329           {
5330           case '\r':
5331             if (j == readBufferLength - 1)
5332               {
5333                 if (moreData)
5334                   {
5335                     readBufferOverflow = '\r';
5336                     readBufferLength--;
5337                   }
5338                 else   // CR at end of buffer
5339                   {
5340                     readBuffer[i++] = '\n';
5341                   }
5342                 break loop;
5343               }
5344             else if (readBuffer[j + 1] == '\n')
5345               {
5346                 j++;
5347               }
5348             readBuffer[i] = '\n';
5349             break;
5350 
5351           case '\n':
5352           default:
5353             readBuffer[i] = readBuffer[j];
5354             break;
5355           }
5356       }
5357     readBufferLength = i;
5358   }
5359 
5360   /**
5361    * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
5362    * <p>When readDataChunk () calls this method, the raw bytes are in
5363    * rawReadBuffer, and the final characters will appear in
5364    * readBuffer.
5365    * <p>Note that as of Unicode 3.1, good practice became a requirement,
5366    * so that each Unicode character has exactly one UTF-8 representation.
5367    * @param count The number of bytes to convert.
5368    * @see #readDataChunk
5369    * @see #rawReadBuffer
5370    * @see #readBuffer
5371    * @see #getNextUtf8Byte
5372    */
copyUtf8ReadBuffer(int count)5373   private void copyUtf8ReadBuffer(int count)
5374     throws SAXException, IOException
5375   {
5376     int i = 0;
5377     int j = readBufferPos;
5378     int b1;
5379     char c = 0;
5380 
5381     /*
5382     // check once, so the runtime won't (if it's smart enough)
5383     if (count < 0 || count > rawReadBuffer.length)
5384     throw new ArrayIndexOutOfBoundsException (Integer.toString (count));
5385      */
5386 
5387     while (i < count)
5388       {
5389         b1 = rawReadBuffer[i++];
5390 
5391         // Determine whether we are dealing
5392         // with a one-, two-, three-, or four-
5393         // byte sequence.
5394         if (b1 < 0)
5395           {
5396             if ((b1 & 0xe0) == 0xc0)
5397               {
5398                 // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
5399                 c = (char) (((b1 & 0x1f) << 6)
5400                             | getNextUtf8Byte(i++, count));
5401                 if (c < 0x0080)
5402                   {
5403                     encodingError("Illegal two byte UTF-8 sequence",
5404                                   c, 0);
5405                   }
5406 
5407                 //Sec 2.11
5408                 // [1] the two-character sequence #xD #xA
5409                 // [2] the two-character sequence #xD #x85
5410                 if ((c == 0x0085 || c == 0x000a) && sawCR)
5411                   {
5412                     continue;
5413                   }
5414 
5415                 // Sec 2.11
5416                 // [3] the single character #x85
5417 
5418                 if (c == 0x0085 && xmlVersion == XML_11)
5419                   {
5420                     readBuffer[j++] = '\r';
5421                   }
5422               }
5423             else if ((b1 & 0xf0) == 0xe0)
5424               {
5425                 // 3-byte sequence:
5426                 // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
5427                 // most CJKV characters
5428                 c = (char) (((b1 & 0x0f) << 12) |
5429                             (getNextUtf8Byte(i++, count) << 6) |
5430                             getNextUtf8Byte(i++, count));
5431                 //sec 2.11
5432                 //[4] the single character #x2028
5433                 if (c == 0x2028 && xmlVersion == XML_11)
5434                   {
5435                     readBuffer[j++] = '\r';
5436                     sawCR = true;
5437                     continue;
5438                   }
5439                 if (c < 0x0800 || (c >= 0xd800 && c <= 0xdfff))
5440                   {
5441                     encodingError("Illegal three byte UTF-8 sequence",
5442                                   c, 0);
5443                   }
5444               }
5445             else if ((b1 & 0xf8) == 0xf0)
5446               {
5447                 // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
5448                 //     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
5449                 // (uuuuu = wwww + 1)
5450                 // "Surrogate Pairs" ... from the "Astral Planes"
5451                 // Unicode 3.1 assigned the first characters there
5452                 int iso646 = b1 & 07;
5453                 iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
5454                 iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
5455                 iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
5456 
5457                 if (iso646 <= 0xffff)
5458                   {
5459                     encodingError("Illegal four byte UTF-8 sequence",
5460                                   iso646, 0);
5461                   }
5462                 else
5463                   {
5464                     if (iso646 > 0x0010ffff)
5465                       {
5466                         encodingError("UTF-8 value out of range for Unicode",
5467                                       iso646, 0);
5468                       }
5469                     iso646 -= 0x010000;
5470                     readBuffer[j++] = (char) (0xd800 | (iso646 >> 10));
5471                     readBuffer[j++] = (char) (0xdc00 | (iso646 & 0x03ff));
5472                     continue;
5473                   }
5474               }
5475             else
5476               {
5477                 // The five and six byte encodings aren't supported;
5478                 // they exceed the Unicode (and XML) range.
5479                 encodingError("unsupported five or six byte UTF-8 sequence",
5480                               0xff & b1, i);
5481                 // NOTREACHED
5482                 c = 0;
5483               }
5484           }
5485         else
5486           {
5487             // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
5488             // (US-ASCII character, "common" case, one branch to here)
5489             c = (char) b1;
5490           }
5491         readBuffer[j++] = c;
5492         if (c == '\r')
5493           {
5494             sawCR = true;
5495           }
5496       }
5497     // How many characters have we read?
5498     readBufferLength = j;
5499   }
5500 
5501   /**
5502    * Return the next byte value in a UTF-8 sequence.
5503    * If it is not possible to get a byte from the current
5504    * entity, throw an exception.
5505    * @param pos The current position in the rawReadBuffer.
5506    * @param count The number of bytes in the rawReadBuffer
5507    * @return The significant six bits of a non-initial byte in
5508    *   a UTF-8 sequence.
5509    * @exception EOFException If the sequence is incomplete.
5510    */
getNextUtf8Byte(int pos, int count)5511   private int getNextUtf8Byte(int pos, int count)
5512     throws SAXException, IOException
5513   {
5514     int val;
5515 
5516     // Take a character from the buffer
5517     // or from the actual input stream.
5518     if (pos < count)
5519       {
5520         val = rawReadBuffer[pos];
5521       }
5522     else
5523       {
5524         val = is.read();
5525         if (val == -1)
5526           {
5527             encodingError("unfinished multi-byte UTF-8 sequence at EOF",
5528                           -1, pos);
5529           }
5530       }
5531 
5532     // Check for the correct bits at the start.
5533     if ((val & 0xc0) != 0x80)
5534       {
5535         encodingError("bad continuation of multi-byte UTF-8 sequence",
5536                       val, pos + 1);
5537       }
5538 
5539     // Return the significant bits.
5540     return (val & 0x3f);
5541   }
5542 
5543   /**
5544    * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into
5545    * UTF-16 characters.
5546    *
5547    * <p>When readDataChunk () calls this method, the raw bytes are in
5548    * rawReadBuffer, and the final characters will appear in
5549    * readBuffer.
5550    *
5551    * @param count The number of bytes to convert.
5552    * @param mask For ASCII conversion, 0x7f; else, 0xff.
5553    * @see #readDataChunk
5554    * @see #rawReadBuffer
5555    * @see #readBuffer
5556    */
copyIso8859_1ReadBuffer(int count, char mask)5557   private void copyIso8859_1ReadBuffer(int count, char mask)
5558     throws IOException
5559   {
5560     int i, j;
5561     for (i = 0, j = readBufferPos; i < count; i++, j++)
5562       {
5563         char c = (char) (rawReadBuffer[i] & 0xff);
5564         if ((c & mask) != 0)
5565           {
5566             throw new CharConversionException("non-ASCII character U+"
5567                                               + Integer.toHexString(c));
5568           }
5569         if (c == 0x0085 && xmlVersion == XML_11)
5570           {
5571             c = '\r';
5572           }
5573         readBuffer[j] = c;
5574         if (c == '\r')
5575           {
5576             sawCR = true;
5577           }
5578       }
5579     readBufferLength = j;
5580   }
5581 
5582   /**
5583    * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters
5584    * (as used in Java string manipulation).
5585    *
5586    * <p>When readDataChunk () calls this method, the raw bytes are in
5587    * rawReadBuffer, and the final characters will appear in
5588    * readBuffer.
5589    * @param count The number of bytes to convert.
5590    * @param shift1 The number of bits to shift byte 1.
5591    * @param shift2 The number of bits to shift byte 2
5592    * @see #readDataChunk
5593    * @see #rawReadBuffer
5594    * @see #readBuffer
5595    */
copyUcs2ReadBuffer(int count, int shift1, int shift2)5596   private void copyUcs2ReadBuffer(int count, int shift1, int shift2)
5597     throws SAXException
5598   {
5599     int j = readBufferPos;
5600 
5601     if (count > 0 && (count % 2) != 0)
5602       {
5603         encodingError("odd number of bytes in UCS-2 encoding", -1, count);
5604       }
5605     // The loops are faster with less internal brancing; hence two
5606     if (shift1 == 0)
5607       {  // "UTF-16-LE"
5608         for (int i = 0; i < count; i += 2)
5609           {
5610             char c = (char) (rawReadBuffer[i + 1] << 8);
5611             c |= 0xff & rawReadBuffer[i];
5612             readBuffer[j++] = c;
5613             if (c == '\r')
5614               {
5615                 sawCR = true;
5616               }
5617           }
5618       }
5619     else
5620       {  // "UTF-16-BE"
5621         for (int i = 0; i < count; i += 2)
5622           {
5623             char c = (char) (rawReadBuffer[i] << 8);
5624             c |= 0xff & rawReadBuffer[i + 1];
5625             readBuffer[j++] = c;
5626             if (c == '\r')
5627               {
5628                 sawCR = true;
5629               }
5630           }
5631       }
5632     readBufferLength = j;
5633   }
5634 
5635   /**
5636    * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
5637    *
5638    * <p>When readDataChunk () calls this method, the raw bytes are in
5639    * rawReadBuffer, and the final characters will appear in
5640    * readBuffer.
5641    * <p>Java has Unicode chars, and this routine uses surrogate pairs
5642    * for ISO-10646 values between 0x00010000 and 0x000fffff.  An
5643    * exception is thrown if the ISO-10646 character has no Unicode
5644    * representation.
5645    *
5646    * @param count The number of bytes to convert.
5647    * @param shift1 The number of bits to shift byte 1.
5648    * @param shift2 The number of bits to shift byte 2
5649    * @param shift3 The number of bits to shift byte 2
5650    * @param shift4 The number of bits to shift byte 2
5651    * @see #readDataChunk
5652    * @see #rawReadBuffer
5653    * @see #readBuffer
5654    */
copyUcs4ReadBuffer(int count, int shift1, int shift2, int shift3, int shift4)5655   private void copyUcs4ReadBuffer(int count, int shift1, int shift2,
5656                                   int shift3, int shift4)
5657     throws SAXException
5658   {
5659     int j = readBufferPos;
5660 
5661     if (count > 0 && (count % 4) != 0)
5662       {
5663         encodingError("number of bytes in UCS-4 encoding " +
5664                       "not divisible by 4",
5665                       -1, count);
5666       }
5667     for (int i = 0; i < count; i += 4)
5668       {
5669         int value = (((rawReadBuffer [i] & 0xff) << shift1) |
5670                      ((rawReadBuffer [i + 1] & 0xff) << shift2) |
5671                      ((rawReadBuffer [i + 2] & 0xff) << shift3) |
5672                      ((rawReadBuffer [i + 3] & 0xff) << shift4));
5673         if (value < 0x0000ffff)
5674           {
5675             readBuffer [j++] = (char) value;
5676             if (value == (int) '\r')
5677               {
5678                 sawCR = true;
5679               }
5680           }
5681         else if (value < 0x0010ffff)
5682           {
5683             value -= 0x010000;
5684             readBuffer[j++] = (char) (0xd8 | ((value >> 10) & 0x03ff));
5685             readBuffer[j++] = (char) (0xdc | (value & 0x03ff));
5686           }
5687         else
5688           {
5689             encodingError("UCS-4 value out of range for Unicode",
5690                           value, i);
5691           }
5692       }
5693     readBufferLength = j;
5694   }
5695 
5696   /**
5697    * Report a character encoding error.
5698    */
encodingError(String message, int value, int offset)5699   private void encodingError(String message, int value, int offset)
5700     throws SAXException
5701   {
5702     if (value != -1)
5703       {
5704         message = message + " (character code: 0x" +
5705           Integer.toHexString(value) + ')';
5706         error(message);
5707       }
5708   }
5709 
5710   //////////////////////////////////////////////////////////////////////
5711   // Local Variables.
5712   //////////////////////////////////////////////////////////////////////
5713 
5714   /**
5715    * Re-initialize the variables for each parse.
5716    */
initializeVariables()5717   private void initializeVariables()
5718   {
5719     // First line
5720     line = 1;
5721     column = 0;
5722 
5723     // Set up the buffers for data and names
5724     dataBufferPos = 0;
5725     dataBuffer = new char[DATA_BUFFER_INITIAL];
5726     nameBufferPos = 0;
5727     nameBuffer = new char[NAME_BUFFER_INITIAL];
5728 
5729     // Set up the DTD hash tables
5730     elementInfo = new HashMap();
5731     entityInfo = new HashMap();
5732     notationInfo = new HashMap();
5733     skippedPE = false;
5734 
5735     // Set up the variables for the current
5736     // element context.
5737     currentElement = null;
5738     currentElementContent = CONTENT_UNDECLARED;
5739 
5740     // Set up the input variables
5741     sourceType = INPUT_NONE;
5742     inputStack = new LinkedList();
5743     entityStack = new LinkedList();
5744     externalEntity = null;
5745     tagAttributePos = 0;
5746     tagAttributes = new String[100];
5747     rawReadBuffer = new byte[READ_BUFFER_MAX];
5748     readBufferOverflow = -1;
5749 
5750     scratch = new InputSource();
5751 
5752     inLiteral = false;
5753     expandPE = false;
5754     peIsError = false;
5755 
5756     doReport = false;
5757 
5758     inCDATA = false;
5759 
5760     symbolTable = new Object[SYMBOL_TABLE_LENGTH][];
5761   }
5762 
5763   static class ExternalIdentifiers
5764   {
5765 
5766     String publicId;
5767     String systemId;
5768     String baseUri;
5769 
ExternalIdentifiers()5770     ExternalIdentifiers()
5771     {
5772     }
5773 
ExternalIdentifiers(String publicId, String systemId, String baseUri)5774     ExternalIdentifiers(String publicId, String systemId, String baseUri)
5775     {
5776       this.publicId = publicId;
5777       this.systemId = systemId;
5778       this.baseUri = baseUri;
5779     }
5780 
5781   }
5782 
5783   static class EntityInfo
5784   {
5785 
5786     int type;
5787     ExternalIdentifiers ids;
5788     String value;
5789     String notationName;
5790 
5791   }
5792 
5793   static class AttributeDecl
5794   {
5795 
5796     String type;
5797     String value;
5798     int valueType;
5799     String enumeration;
5800     String defaultValue;
5801 
5802   }
5803 
5804   static class ElementDecl
5805   {
5806 
5807     int contentType;
5808     String contentModel;
5809     HashMap attributes;
5810 
5811   }
5812 
5813   static class Input
5814   {
5815 
5816     int sourceType;
5817     URLConnection externalEntity;
5818     char[] readBuffer;
5819     int readBufferPos;
5820     int readBufferLength;
5821     int line;
5822     int encoding;
5823     int readBufferOverflow;
5824     InputStream is;
5825     int currentByteCount;
5826     int column;
5827     Reader reader;
5828 
5829   }
5830 
5831 }
5832