1 /* XmlParser.java -- 2 Copyright (C) 1999,2000,2001 Free Software Foundation, Inc. 3 4 This file is part of GNU Classpath. 5 6 GNU Classpath is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 GNU Classpath is distributed in the hope that it will be useful, but 12 WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GNU Classpath; see the file COPYING. If not, write to the 18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19 02110-1301 USA. 20 21 Linking this library statically or dynamically with other modules is 22 making a combined work based on this library. Thus, the terms and 23 conditions of the GNU General Public License cover the whole 24 combination. 25 26 As a special exception, the copyright holders of this library give you 27 permission to link this library with independent modules to produce an 28 executable, regardless of the license terms of these independent 29 modules, and to copy and distribute the resulting executable under 30 terms of your choice, provided that you also meet, for each linked 31 independent module, the terms and conditions of the license of that 32 module. An independent module is a module which is not derived from 33 or based on this library. If you modify this library, you may extend 34 this exception to your version of the library, but you are not 35 obligated to do so. If you do not wish to do so, delete this 36 exception statement from your version. 37 38 Partly derived from code which carried the following notice: 39 40 Copyright (c) 1997, 1998 by Microstar Software Ltd. 
41 42 AElfred is free for both commercial and non-commercial use and 43 redistribution, provided that Microstar's copyright and disclaimer are 44 retained intact. You are free to modify AElfred for your own use and 45 to redistribute AElfred with your modifications, provided that the 46 modifications are clearly documented. 47 48 This program is distributed in the hope that it will be useful, but 49 WITHOUT ANY WARRANTY; without even the implied warranty of 50 merchantability or fitness for a particular purpose. Please use it AT 51 YOUR OWN RISK. 52 */ 53 54 package gnu.xml.aelfred2; 55 56 import gnu.java.security.action.GetPropertyAction; 57 58 import java.io.BufferedInputStream; 59 import java.io.CharConversionException; 60 import java.io.EOFException; 61 import java.io.InputStream; 62 import java.io.InputStreamReader; 63 import java.io.IOException; 64 import java.io.Reader; 65 import java.io.UnsupportedEncodingException; 66 import java.net.URL; 67 import java.net.URLConnection; 68 import java.security.AccessController; 69 70 import java.util.Iterator; 71 import java.util.HashMap; 72 import java.util.LinkedList; 73 74 import org.xml.sax.InputSource; 75 import org.xml.sax.SAXException; 76 77 78 /** 79 * Parse XML documents and return parse events through call-backs. 80 * Use the <code>SAXDriver</code> class as your entry point, as all 81 * internal parser interfaces are subject to change. 82 * 83 * @author Written by David Megginson <dmeggins@microstar.com> 84 * (version 1.2a with bugfixes) 85 * @author Updated by David Brownell <dbrownell@users.sourceforge.net> 86 * @see SAXDriver 87 */ 88 final class XmlParser 89 { 90 91 // avoid slow per-character readCh() 92 private final static boolean USE_CHEATS = true; 93 94 //////////////////////////////////////////////////////////////////////// 95 // Constants. 96 //////////////////////////////////////////////////////////////////////// 97 98 // 99 // Constants for element content type. 
  //

  /**
   * Constant: an element has not been declared.
   * @see #getElementContentType
   */
  public final static int CONTENT_UNDECLARED = 0;

  /**
   * Constant: the element has a content model of ANY.
   * @see #getElementContentType
   */
  public final static int CONTENT_ANY = 1;

  /**
   * Constant: the element has declared content of EMPTY.
   * @see #getElementContentType
   */
  public final static int CONTENT_EMPTY = 2;

  /**
   * Constant: the element has mixed content.
   * @see #getElementContentType
   */
  public final static int CONTENT_MIXED = 3;

  /**
   * Constant: the element has element content.
   * @see #getElementContentType
   */
  public final static int CONTENT_ELEMENTS = 4;


  //
  // Constants for the entity type.
  //

  /**
   * Constant: the entity has not been declared.
   * @see #getEntityType
   */
  public final static int ENTITY_UNDECLARED = 0;

  /**
   * Constant: the entity is internal.
   * @see #getEntityType
   */
  public final static int ENTITY_INTERNAL = 1;

  /**
   * Constant: the entity is external, non-parsable (NDATA) data.
   * @see #getEntityType
   */
  public final static int ENTITY_NDATA = 2;

  /**
   * Constant: the entity is external XML (text) data.
   * @see #getEntityType
   */
  public final static int ENTITY_TEXT = 3;

  //
  // Attribute type constants are interned literal strings.
  //

  //
  // Constants for supported encodings.  "external" is just a flag.
  //
  // ENCODING_EXTERNAL marks an encoding that was not autodetected.
  private final static int ENCODING_EXTERNAL = 0;
  private final static int ENCODING_UTF_8 = 1;
  private final static int ENCODING_ISO_8859_1 = 2;
  // setupDecoding selects UCS_2_12 for "UTF-16BE" and UCS_2_21 for "UTF-16LE".
  private final static int ENCODING_UCS_2_12 = 3;
  private final static int ENCODING_UCS_2_21 = 4;
  // Four byte encodings; the numeric suffix presumably names the byte
  // order -- TODO confirm against the autodetection code.
  private final static int ENCODING_UCS_4_1234 = 5;
  private final static int ENCODING_UCS_4_4321 = 6;
  private final static int ENCODING_UCS_4_2143 = 7;
  private final static int ENCODING_UCS_4_3412 = 8;
  private final static int ENCODING_ASCII = 9;

  //
  // Constants for attribute default value.
  //

  /**
   * Constant: the attribute is not declared.
   * @see #getAttributeDefaultValueType
   */
  public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;

  /**
   * Constant: the attribute has a literal default value specified.
   * @see #getAttributeDefaultValueType
   * @see #getAttributeDefaultValue
   */
  public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;

  /**
   * Constant: the attribute was declared #IMPLIED.
   * @see #getAttributeDefaultValueType
   */
  public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;

  /**
   * Constant: the attribute was declared #REQUIRED.
   * @see #getAttributeDefaultValueType
   */
  public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;

  /**
   * Constant: the attribute was declared #FIXED.
   * @see #getAttributeDefaultValueType
   * @see #getAttributeDefaultValue
   */
  public final static int ATTRIBUTE_DEFAULT_FIXED = 34;

  //
  // Constants for input (values stored in sourceType).
  //
  private final static int INPUT_NONE = 0;
  private final static int INPUT_INTERNAL = 1;
  private final static int INPUT_STREAM = 3;
  private final static int INPUT_READER = 5;

  //
  // Flags for reading literals.
  //
  // These are bit flags; callers combine them with '|'.
  // expand general entity refs (attribute values in dtd and content)
  private final static int LIT_ENTITY_REF = 2;
  // normalize this value (space chars) (attributes, public ids)
  private final static int LIT_NORMALIZE = 4;
  // literal is an attribute value
  private final static int LIT_ATTRIBUTE = 8;
  // don't expand parameter entities
  private final static int LIT_DISABLE_PE = 16;
  // don't expand [or parse] character refs
  private final static int LIT_DISABLE_CREF = 32;
  // don't parse general entity refs
  private final static int LIT_DISABLE_EREF = 64;
  // literal is a public ID value
  private final static int LIT_PUBID = 256;

  //
  // Flags affecting PE handling in DTDs (if expandPE is true).
  // PEs expand with space padding, except inside literals.
  //
  private final static int CONTEXT_NORMAL = 0;
  private final static int CONTEXT_LITERAL = 1;

  // Emit warnings for relative URIs with no base URI.
  // Enabled only when the system property named below is exactly "true";
  // the property is read through a privileged action.
  static boolean uriWarnings;
  static
  {
    String key = "gnu.xml.aelfred2.XmlParser.uriWarnings";
    GetPropertyAction a = new GetPropertyAction(key);
    uriWarnings = "true".equals(AccessController.doPrivileged(a));
  }

  //
  // The current XML handler interface.
  //
  private SAXDriver handler;

  //
  // I/O information.
  //
  private Reader reader; // current reader
  private InputStream is; // current input stream
  private int line; // current line number
  private int column; // current column number
  private int sourceType; // type of input source
  private LinkedList inputStack; // stack of input sources
  private URLConnection externalEntity; // current external entity
  private int encoding; // current character encoding
  private int currentByteCount; // bytes read from current source
  private InputSource scratch; // temporary

  //
  // Buffers for decoded but unparsed character input.
  //
  private char[] readBuffer;
  private int readBufferPos;
  private int readBufferLength;
  private int readBufferOverflow; // overflow from last data chunk.

  //
  // Buffer for undecoded raw byte input.
  //
  private final static int READ_BUFFER_MAX = 16384;
  private byte[] rawReadBuffer;


  //
  // Buffer for attribute values, char refs, DTD stuff.
  //
  // NOTE(review): DATA_BUFFER_INITIAL and NAME_BUFFER_INITIAL look like
  // constants but are not declared final -- confirm nothing assigns them.
  private static int DATA_BUFFER_INITIAL = 4096;
  private char[] dataBuffer;
  private int dataBufferPos;

  //
  // Buffer for parsed names.
  //
  private static int NAME_BUFFER_INITIAL = 1024;
  private char[] nameBuffer;
  private int nameBufferPos;

  //
  // Save any standalone flag
  //
  private boolean docIsStandalone;

  //
  // Hashtables for DTD information on elements, entities, and notations.
  // Populated until we start ignoring decls (because of skipping a PE)
  //
  private HashMap elementInfo;
  private HashMap entityInfo;
  private HashMap notationInfo;
  private boolean skippedPE;

  //
  // Element type currently in force.
  //
  private String currentElement;
  private int currentElementContent;

  //
  // Stack of entity names, to detect recursion.
  //
  private LinkedList entityStack;

  //
  // PE expansion is enabled in most chunks of the DTD, not all.
  // When it's enabled, literals are treated differently.
  //
  private boolean inLiteral;
  private boolean expandPE;
  private boolean peIsError;

  //
  // can't report entity expansion inside two constructs:
  // - attribute expansions (internal entities only)
  // - markup declarations (parameter entities only)
  //
  private boolean doReport;

  //
  // Symbol table, for caching interned names.
  //
  // These show up wherever XML names or nmtokens are used:  naming elements,
  // attributes, PIs, notations, entities, and enumerated attribute values.
  //
  // NOTE:  This hashtable doesn't grow.  The default size is intended to be
  // rather large for most documents.  Example:  one snapshot of the DocBook
  // XML 4.1 DTD used only about 350 such names.  As a rule, only pathological
  // documents (ones that don't reuse names) should ever see much collision.
  //
  // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing.
  // "2039" keeps the hash table size at about two memory pages on typical
  // 32 bit hardware.
  //
  private final static int SYMBOL_TABLE_LENGTH = 2039;

  private Object[][] symbolTable;

  //
  // Hash table of attributes found in current start tag.
  //
  private String[] tagAttributes;
  private int tagAttributePos;

  //
  // Utility flag: have we noticed a CR while reading the last
  // data chunk?  If so, we will have to go back and normalise
  // CR or CR/LF line ends.
  //
  private boolean sawCR;

  //
  // Utility flag: are we in CDATA?  If so, whitespace isn't ignorable.
  //
  private boolean inCDATA;

  //
  // XML version of the document being parsed; starts out as XML_10.
  //
  private static final int XML_10 = 0;
  private static final int XML_11 = 1;
  private int xmlVersion = XML_10;

  //////////////////////////////////////////////////////////////////////
  // Constructors.
  ////////////////////////////////////////////////////////////////////////

  /**
   * Construct a new parser with no associated handler.
   * A handler must be supplied through setHandler before doParse is
   * called; doParse rejects a parser without one.
   * @see #setHandler
   * @see #doParse
   */
  // package private
  XmlParser()
  {
  }

  /**
   * Set the handler that will receive parsing events.
   * @param handler The handler to receive callback events.
406 * @see #parse 407 */ 408 // package private setHandler(SAXDriver handler)409 void setHandler(SAXDriver handler) 410 { 411 this.handler = handler; 412 } 413 414 /** 415 * Parse an XML document from the character stream, byte stream, or URI 416 * that you provide (in that order of preference). Any URI that you 417 * supply will become the base URI for resolving relative URI, and may 418 * be used to acquire a reader or byte stream. 419 * 420 * <p> Only one thread at a time may use this parser; since it is 421 * private to this package, post-parse cleanup is done by the caller, 422 * which MUST NOT REUSE the parser (just null it). 423 * 424 * @param systemId Absolute URI of the document; should never be null, 425 * but may be so iff a reader <em>or</em> a stream is provided. 426 * @param publicId The public identifier of the document, or null. 427 * @param reader A character stream; must be null if stream isn't. 428 * @param stream A byte input stream; must be null if reader isn't. 429 * @param encoding The suggested encoding, or null if unknown. 430 * @exception java.lang.Exception Basically SAXException or IOException 431 */ 432 // package private doParse(String systemId, String publicId, Reader reader, InputStream stream, String encoding)433 void doParse(String systemId, String publicId, Reader reader, 434 InputStream stream, String encoding) 435 throws Exception 436 { 437 if (handler == null) 438 { 439 throw new IllegalStateException("no callback handler"); 440 } 441 442 initializeVariables(); 443 444 // predeclare the built-in entities here (replacement texts) 445 // we don't need to intern(), since we're guaranteed literals 446 // are always (globally) interned. 447 setInternalEntity("amp", "&"); 448 setInternalEntity("lt", "<"); 449 setInternalEntity("gt", ">"); 450 setInternalEntity("apos", "'"); 451 setInternalEntity("quot", """); 452 453 try 454 { 455 // pushURL first to ensure locator is correct in startDocument 456 // ... 
it might report an IO or encoding exception. 457 handler.startDocument(); 458 pushURL(false, "[document]", 459 // default baseURI: null 460 new ExternalIdentifiers(publicId, systemId, null), 461 reader, stream, encoding, false); 462 463 parseDocument(); 464 } 465 catch (EOFException e) 466 { 467 //empty input 468 error("empty document, with no root element."); 469 } 470 finally 471 { 472 if (reader != null) 473 { 474 try 475 { 476 reader.close(); 477 } 478 catch (IOException e) 479 { 480 /* ignore */ 481 } 482 } 483 if (stream != null) 484 { 485 try 486 { 487 stream.close(); 488 } 489 catch (IOException e) 490 { 491 /* ignore */ 492 } 493 } 494 if (is != null) 495 { 496 try 497 { 498 is.close(); 499 } 500 catch (IOException e) 501 { 502 /* ignore */ 503 } 504 } 505 scratch = null; 506 } 507 } 508 509 ////////////////////////////////////////////////////////////////////// 510 // Error reporting. 511 ////////////////////////////////////////////////////////////////////// 512 513 /** 514 * Report an error. 515 * @param message The error message. 516 * @param textFound The text that caused the error (or null). 517 * @see SAXDriver#error 518 * @see #line 519 */ error(String message, String textFound, String textExpected)520 private void error(String message, String textFound, String textExpected) 521 throws SAXException 522 { 523 if (textFound != null) 524 { 525 message = message + " (found \"" + textFound + "\")"; 526 } 527 if (textExpected != null) 528 { 529 message = message + " (expected \"" + textExpected + "\")"; 530 } 531 handler.fatal(message); 532 533 // "can't happen" 534 throw new SAXException(message); 535 } 536 537 /** 538 * Report a serious error. 539 * @param message The error message. 540 * @param textFound The text that caused the error (or null). 
541 */ error(String message, char textFound, String textExpected)542 private void error(String message, char textFound, String textExpected) 543 throws SAXException 544 { 545 error(message, Character.toString(textFound), textExpected); 546 } 547 548 /** 549 * Report typical case fatal errors. 550 */ error(String message)551 private void error(String message) 552 throws SAXException 553 { 554 handler.fatal(message); 555 } 556 557 ////////////////////////////////////////////////////////////////////// 558 // Major syntactic productions. 559 ////////////////////////////////////////////////////////////////////// 560 561 /** 562 * Parse an XML document. 563 * <pre> 564 * [1] document ::= prolog element Misc* 565 * </pre> 566 * <p>This is the top-level parsing function for a single XML 567 * document. As a minimum, a well-formed document must have 568 * a document element, and a valid document must have a prolog 569 * (one with doctype) as well. 570 */ parseDocument()571 private void parseDocument() 572 throws Exception 573 { 574 try 575 { // added by MHK 576 boolean sawDTD = parseProlog(); 577 require('<'); 578 parseElement(!sawDTD); 579 } 580 catch (EOFException ee) 581 { // added by MHK 582 error("premature end of file", "[EOF]", null); 583 } 584 585 try 586 { 587 parseMisc(); //skip all white, PIs, and comments 588 char c = readCh(); //if this doesn't throw an exception... 589 error("unexpected characters after document end", c, null); 590 } 591 catch (EOFException e) 592 { 593 return; 594 } 595 } 596 597 static final char[] startDelimComment = { '<', '!', '-', '-' }; 598 static final char[] endDelimComment = { '-', '-' }; 599 600 /** 601 * Skip a comment. 602 * <pre> 603 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->" 604 * </pre> 605 * <p> (The <code><!--</code> has already been read.) 
606 */ parseComment()607 private void parseComment() 608 throws Exception 609 { 610 char c; 611 boolean saved = expandPE; 612 613 expandPE = false; 614 parseUntil(endDelimComment); 615 require('>'); 616 expandPE = saved; 617 handler.comment(dataBuffer, 0, dataBufferPos); 618 dataBufferPos = 0; 619 } 620 621 static final char[] startDelimPI = { '<', '?' }; 622 static final char[] endDelimPI = { '?', '>' }; 623 624 /** 625 * Parse a processing instruction and do a call-back. 626 * <pre> 627 * [16] PI ::= '<?' PITarget 628 * (S (Char* - (Char* '?>' Char*)))? 629 * '?>' 630 * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') ) 631 * </pre> 632 * <p> (The <code><?</code> has already been read.) 633 */ parsePI()634 private void parsePI() 635 throws SAXException, IOException 636 { 637 String name; 638 boolean saved = expandPE; 639 640 expandPE = false; 641 name = readNmtoken(true); 642 //NE08 643 if (name.indexOf(':') >= 0) 644 { 645 error("Illegal character(':') in processing instruction name ", 646 name, null); 647 } 648 if ("xml".equalsIgnoreCase(name)) 649 { 650 error("Illegal processing instruction target", name, null); 651 } 652 if (!tryRead(endDelimPI)) 653 { 654 requireWhitespace(); 655 parseUntil(endDelimPI); 656 } 657 expandPE = saved; 658 handler.processingInstruction(name, dataBufferToString()); 659 } 660 661 static final char[] endDelimCDATA = { ']', ']', '>' }; 662 663 private boolean isDirtyCurrentElement; 664 665 /** 666 * Parse a CDATA section. 667 * <pre> 668 * [18] CDSect ::= CDStart CData CDEnd 669 * [19] CDStart ::= '<![CDATA[' 670 * [20] CData ::= (Char* - (Char* ']]>' Char*)) 671 * [21] CDEnd ::= ']]>' 672 * </pre> 673 * <p> (The '<![CDATA[' has already been read.) 674 */ parseCDSect()675 private void parseCDSect() 676 throws Exception 677 { 678 parseUntil(endDelimCDATA); 679 dataBufferFlush(); 680 } 681 682 /** 683 * Parse the prolog of an XML document. 684 * <pre> 685 * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)? 
686 * </pre> 687 * <p>We do not look for the XML declaration here, because it was 688 * handled by pushURL (). 689 * @see pushURL 690 * @return true if a DTD was read. 691 */ parseProlog()692 private boolean parseProlog() 693 throws Exception 694 { 695 parseMisc(); 696 697 if (tryRead("<!DOCTYPE")) 698 { 699 parseDoctypedecl(); 700 parseMisc(); 701 return true; 702 } 703 return false; 704 } 705 checkLegalVersion(String version)706 private void checkLegalVersion(String version) 707 throws SAXException 708 { 709 int len = version.length(); 710 for (int i = 0; i < len; i++) 711 { 712 char c = version.charAt(i); 713 if ('0' <= c && c <= '9') 714 { 715 continue; 716 } 717 if (c == '_' || c == '.' || c == ':' || c == '-') 718 { 719 continue; 720 } 721 if ('a' <= c && c <= 'z') 722 { 723 continue; 724 } 725 if ('A' <= c && c <= 'Z') 726 { 727 continue; 728 } 729 error ("illegal character in version", version, "1.0"); 730 } 731 } 732 733 /** 734 * Parse the XML declaration. 735 * <pre> 736 * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' 737 * [24] VersionInfo ::= S 'version' Eq 738 * ("'" VersionNum "'" | '"' VersionNum '"' ) 739 * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')* 740 * [32] SDDecl ::= S 'standalone' Eq 741 * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' ) 742 * [80] EncodingDecl ::= S 'encoding' Eq 743 * ( "'" EncName "'" | "'" EncName "'" ) 744 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* 745 * </pre> 746 * <p> (The <code><?xml</code> and whitespace have already been read.) 
747 * @return the encoding in the declaration, uppercased; or null 748 * @see #parseTextDecl 749 * @see #setupDecoding 750 */ parseXMLDecl(boolean ignoreEncoding)751 private String parseXMLDecl(boolean ignoreEncoding) 752 throws SAXException, IOException 753 { 754 String version; 755 String encodingName = null; 756 String standalone = null; 757 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; 758 String inputEncoding = null; 759 760 switch (this.encoding) 761 { 762 case ENCODING_EXTERNAL: 763 case ENCODING_UTF_8: 764 inputEncoding = "UTF-8"; 765 break; 766 case ENCODING_ISO_8859_1: 767 inputEncoding = "ISO-8859-1"; 768 break; 769 case ENCODING_UCS_2_12: 770 inputEncoding = "UTF-16BE"; 771 break; 772 case ENCODING_UCS_2_21: 773 inputEncoding = "UTF-16LE"; 774 break; 775 } 776 777 // Read the version. 778 require("version"); 779 parseEq(); 780 checkLegalVersion(version = readLiteral(flags)); 781 if (!version.equals("1.0")) 782 { 783 if (version.equals("1.1")) 784 { 785 handler.warn("expected XML version 1.0, not: " + version); 786 xmlVersion = XML_11; 787 } 788 else 789 { 790 error("illegal XML version", version, "1.0 or 1.1"); 791 } 792 } 793 else 794 { 795 xmlVersion = XML_10; 796 } 797 // Try reading an encoding declaration. 
798 boolean white = tryWhitespace(); 799 800 if (tryRead("encoding")) 801 { 802 if (!white) 803 { 804 error("whitespace required before 'encoding='"); 805 } 806 parseEq(); 807 encodingName = readLiteral(flags); 808 if (!ignoreEncoding) 809 { 810 setupDecoding(encodingName); 811 } 812 } 813 814 // Try reading a standalone declaration 815 if (encodingName != null) 816 { 817 white = tryWhitespace(); 818 } 819 if (tryRead("standalone")) 820 { 821 if (!white) 822 { 823 error("whitespace required before 'standalone='"); 824 } 825 parseEq(); 826 standalone = readLiteral(flags); 827 if ("yes".equals(standalone)) 828 { 829 docIsStandalone = true; 830 } 831 else if (!"no".equals(standalone)) 832 { 833 error("standalone flag must be 'yes' or 'no'"); 834 } 835 } 836 837 skipWhitespace(); 838 require("?>"); 839 840 if (inputEncoding == null) 841 { 842 inputEncoding = encodingName; 843 } 844 return encodingName; 845 } 846 847 /** 848 * Parse a text declaration. 849 * <pre> 850 * [79] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' 851 * [80] EncodingDecl ::= S 'encoding' Eq 852 * ( '"' EncName '"' | "'" EncName "'" ) 853 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* 854 * </pre> 855 * <p> (The <code><?xml</code>' and whitespace have already been read.) 856 * @return the encoding in the declaration, uppercased; or null 857 * @see #parseXMLDecl 858 * @see #setupDecoding 859 */ parseTextDecl(boolean ignoreEncoding)860 private String parseTextDecl(boolean ignoreEncoding) 861 throws SAXException, IOException 862 { 863 String encodingName = null; 864 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; 865 866 // Read an optional version. 
867 if (tryRead ("version")) 868 { 869 String version; 870 parseEq(); 871 checkLegalVersion(version = readLiteral(flags)); 872 873 if (version.equals("1.1")) 874 { 875 if (xmlVersion == XML_10) 876 { 877 error("external subset has later version number.", "1.0", 878 version); 879 } 880 handler.warn("expected XML version 1.0, not: " + version); 881 xmlVersion = XML_11; 882 } 883 else if (!version.equals("1.0")) 884 { 885 error("illegal XML version", version, "1.0 or 1.1"); 886 } 887 requireWhitespace(); 888 } 889 890 // Read the encoding. 891 require("encoding"); 892 parseEq(); 893 encodingName = readLiteral(flags); 894 if (!ignoreEncoding) 895 { 896 setupDecoding(encodingName); 897 } 898 skipWhitespace(); 899 require("?>"); 900 901 return encodingName; 902 } 903 904 /** 905 * Sets up internal state so that we can decode an entity using the 906 * specified encoding. This is used when we start to read an entity 907 * and we have been given knowledge of its encoding before we start to 908 * read any data (e.g. from a SAX input source or from a MIME type). 909 * 910 * <p> It is also used after autodetection, at which point only very 911 * limited adjustments to the encoding may be used (switching between 912 * related builtin decoders). 913 * 914 * @param encodingName The name of the encoding specified by the user. 915 * @exception IOException if the encoding isn't supported either 916 * internally to this parser, or by the hosting JVM. 917 * @see #parseXMLDecl 918 * @see #parseTextDecl 919 */ setupDecoding(String encodingName)920 private void setupDecoding(String encodingName) 921 throws SAXException, IOException 922 { 923 encodingName = encodingName.toUpperCase(); 924 925 // ENCODING_EXTERNAL indicates an encoding that wasn't 926 // autodetected ... we can use builtin decoders, or 927 // ones from the JVM (InputStreamReader). 928 929 // Otherwise we can only tweak what was autodetected, and 930 // only for single byte (ASCII derived) builtin encodings. 
931 932 // ASCII-derived encodings 933 if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL) 934 { 935 if (encodingName.equals("ISO-8859-1") 936 || encodingName.equals("8859_1") 937 || encodingName.equals("ISO8859_1")) 938 { 939 encoding = ENCODING_ISO_8859_1; 940 return; 941 } 942 else if (encodingName.equals("US-ASCII") 943 || encodingName.equals("ASCII")) 944 { 945 encoding = ENCODING_ASCII; 946 return; 947 } 948 else if (encodingName.equals("UTF-8") 949 || encodingName.equals("UTF8")) 950 { 951 encoding = ENCODING_UTF_8; 952 return; 953 } 954 else if (encoding != ENCODING_EXTERNAL) 955 { 956 // used to start with a new reader ... 957 throw new UnsupportedEncodingException(encodingName); 958 } 959 // else fallthrough ... 960 // it's ASCII-ish and something other than a builtin 961 } 962 963 // Unicode and such 964 if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21) 965 { 966 if (!(encodingName.equals("ISO-10646-UCS-2") 967 || encodingName.equals("UTF-16") 968 || encodingName.equals("UTF-16BE") 969 || encodingName.equals("UTF-16LE"))) 970 { 971 error("unsupported Unicode encoding", encodingName, "UTF-16"); 972 } 973 return; 974 } 975 976 // four byte encodings 977 if (encoding == ENCODING_UCS_4_1234 978 || encoding == ENCODING_UCS_4_4321 979 || encoding == ENCODING_UCS_4_2143 980 || encoding == ENCODING_UCS_4_3412) 981 { 982 // Strictly: "UCS-4" == "UTF-32BE"; also, "UTF-32LE" exists 983 if (!encodingName.equals("ISO-10646-UCS-4")) 984 { 985 error("unsupported 32-bit encoding", encodingName, 986 "ISO-10646-UCS-4"); 987 } 988 return; 989 } 990 991 // assert encoding == ENCODING_EXTERNAL 992 // if (encoding != ENCODING_EXTERNAL) 993 // throw new RuntimeException ("encoding = " + encoding); 994 995 if (encodingName.equals("UTF-16BE")) 996 { 997 encoding = ENCODING_UCS_2_12; 998 return; 999 } 1000 if (encodingName.equals("UTF-16LE")) 1001 { 1002 encoding = ENCODING_UCS_2_21; 1003 return; 1004 } 1005 1006 // We couldn't use the builtin 
decoders at all. But we can try to 1007 // create a reader, since we haven't messed up buffering. Tweak 1008 // the encoding name if necessary. 1009 1010 if (encodingName.equals("UTF-16") 1011 || encodingName.equals("ISO-10646-UCS-2")) 1012 { 1013 encodingName = "Unicode"; 1014 } 1015 // Ignoring all the EBCDIC aliases here 1016 1017 reader = new InputStreamReader(is, encodingName); 1018 sourceType = INPUT_READER; 1019 } 1020 1021 /** 1022 * Parse miscellaneous markup outside the document element and DOCTYPE 1023 * declaration. 1024 * <pre> 1025 * [27] Misc ::= Comment | PI | S 1026 * </pre> 1027 */ parseMisc()1028 private void parseMisc() 1029 throws Exception 1030 { 1031 while (true) 1032 { 1033 skipWhitespace(); 1034 if (tryRead(startDelimPI)) 1035 { 1036 parsePI(); 1037 } 1038 else if (tryRead(startDelimComment)) 1039 { 1040 parseComment(); 1041 } 1042 else 1043 { 1044 return; 1045 } 1046 } 1047 } 1048 1049 /** 1050 * Parse a document type declaration. 1051 * <pre> 1052 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? 1053 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' 1054 * </pre> 1055 * <p> (The <code><!DOCTYPE</code> has already been read.) 1056 */ parseDoctypedecl()1057 private void parseDoctypedecl() 1058 throws Exception 1059 { 1060 String rootName; 1061 ExternalIdentifiers ids; 1062 1063 // Read the document type name. 
1064 requireWhitespace(); 1065 rootName = readNmtoken(true); 1066 1067 // Read the External subset's IDs 1068 skipWhitespace(); 1069 ids = readExternalIds(false, true); 1070 1071 // report (a) declaration of name, (b) lexical info (ids) 1072 handler.doctypeDecl(rootName, ids.publicId, ids.systemId); 1073 1074 // Internal subset is parsed first, if present 1075 skipWhitespace(); 1076 if (tryRead('[')) 1077 { 1078 1079 // loop until the subset ends 1080 while (true) 1081 { 1082 doReport = expandPE = true; 1083 skipWhitespace(); 1084 doReport = expandPE = false; 1085 if (tryRead(']')) 1086 { 1087 break; // end of subset 1088 } 1089 else 1090 { 1091 // WFC, PEs in internal subset (only between decls) 1092 peIsError = expandPE = true; 1093 parseMarkupdecl(); 1094 peIsError = expandPE = false; 1095 } 1096 } 1097 } 1098 skipWhitespace(); 1099 require('>'); 1100 1101 // Read the external subset, if any 1102 InputSource subset; 1103 1104 if (ids.systemId == null) 1105 { 1106 subset = handler.getExternalSubset(rootName, 1107 handler.getSystemId()); 1108 } 1109 else 1110 { 1111 subset = null; 1112 } 1113 if (ids.systemId != null || subset != null) 1114 { 1115 pushString(null, ">"); 1116 1117 // NOTE: [dtd] is so we say what SAX2 expects, 1118 // though it's misleading (subset, not entire dtd) 1119 if (ids.systemId != null) 1120 { 1121 pushURL(true, "[dtd]", ids, null, null, null, true); 1122 } 1123 else 1124 { 1125 handler.warn("modifying document by adding external subset"); 1126 pushURL(true, "[dtd]", 1127 new ExternalIdentifiers(subset.getPublicId(), 1128 subset.getSystemId(), 1129 null), 1130 subset.getCharacterStream(), 1131 subset.getByteStream(), 1132 subset.getEncoding(), 1133 false); 1134 } 1135 1136 // Loop until we end up back at '>' 1137 while (true) 1138 { 1139 doReport = expandPE = true; 1140 skipWhitespace(); 1141 doReport = expandPE = false; 1142 if (tryRead('>')) 1143 { 1144 break; 1145 } 1146 else 1147 { 1148 expandPE = true; 1149 parseMarkupdecl(); 1150 
expandPE = false; 1151 } 1152 } 1153 1154 // the ">" string isn't popped yet 1155 if (inputStack.size() != 1) 1156 { 1157 error("external subset has unmatched '>'"); 1158 } 1159 } 1160 1161 // done dtd 1162 handler.endDoctype(); 1163 expandPE = false; 1164 doReport = true; 1165 } 1166 1167 /** 1168 * Parse a markup declaration in the internal or external DTD subset. 1169 * <pre> 1170 * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl 1171 * | NotationDecl | PI | Comment 1172 * [30] extSubsetDecl ::= (markupdecl | conditionalSect 1173 * | PEReference | S) * 1174 * </pre> 1175 * <p> Reading toplevel PE references is handled as a lexical issue 1176 * by the caller, as is whitespace. 1177 */ parseMarkupdecl()1178 private void parseMarkupdecl() 1179 throws Exception 1180 { 1181 char[] saved = null; 1182 boolean savedPE = expandPE; 1183 1184 // prevent "<%foo;" and ensures saved entity is right 1185 require('<'); 1186 unread('<'); 1187 expandPE = false; 1188 1189 if (tryRead("<!ELEMENT")) 1190 { 1191 saved = readBuffer; 1192 expandPE = savedPE; 1193 parseElementDecl(); 1194 } 1195 else if (tryRead("<!ATTLIST")) 1196 { 1197 saved = readBuffer; 1198 expandPE = savedPE; 1199 parseAttlistDecl(); 1200 } 1201 else if (tryRead("<!ENTITY")) 1202 { 1203 saved = readBuffer; 1204 expandPE = savedPE; 1205 parseEntityDecl(); 1206 } 1207 else if (tryRead("<!NOTATION")) 1208 { 1209 saved = readBuffer; 1210 expandPE = savedPE; 1211 parseNotationDecl(); 1212 } 1213 else if (tryRead(startDelimPI)) 1214 { 1215 saved = readBuffer; 1216 expandPE = savedPE; 1217 parsePI(); 1218 } 1219 else if (tryRead(startDelimComment)) 1220 { 1221 saved = readBuffer; 1222 expandPE = savedPE; 1223 parseComment(); 1224 } 1225 else if (tryRead("<![")) 1226 { 1227 saved = readBuffer; 1228 expandPE = savedPE; 1229 if (inputStack.size() > 0) 1230 { 1231 parseConditionalSect(saved); 1232 } 1233 else 1234 { 1235 error("conditional sections illegal in internal subset"); 1236 } 1237 } 1238 else 1239 { 1240 
error("expected markup declaration"); 1241 } 1242 1243 // VC: Proper Decl/PE Nesting 1244 if (readBuffer != saved) 1245 { 1246 handler.verror("Illegal Declaration/PE nesting"); 1247 } 1248 } 1249 1250 /** 1251 * Parse an element, with its tags. 1252 * <pre> 1253 * [39] element ::= EmptyElementTag | STag content ETag 1254 * [40] STag ::= '<' Name (S Attribute)* S? '>' 1255 * [44] EmptyElementTag ::= '<' Name (S Attribute)* S? '/>' 1256 * </pre> 1257 * <p> (The '<' has already been read.) 1258 * <p>NOTE: this method actually chains onto parseContent (), if necessary, 1259 * and parseContent () will take care of calling parseETag (). 1260 */ parseElement(boolean maybeGetSubset)1261 private void parseElement(boolean maybeGetSubset) 1262 throws Exception 1263 { 1264 String gi; 1265 char c; 1266 int oldElementContent = currentElementContent; 1267 String oldElement = currentElement; 1268 ElementDecl element; 1269 1270 // This is the (global) counter for the 1271 // array of specified attributes. 1272 tagAttributePos = 0; 1273 1274 // Read the element type name. 1275 gi = readNmtoken(true); 1276 1277 // If we saw no DTD, and this is the document root element, 1278 // let the application modify the input stream by providing one. 
1279 if (maybeGetSubset) 1280 { 1281 InputSource subset = handler.getExternalSubset(gi, 1282 handler.getSystemId()); 1283 if (subset != null) 1284 { 1285 String publicId = subset.getPublicId(); 1286 String systemId = subset.getSystemId(); 1287 1288 handler.warn("modifying document by adding DTD"); 1289 handler.doctypeDecl(gi, publicId, systemId); 1290 pushString(null, ">"); 1291 1292 // NOTE: [dtd] is so we say what SAX2 expects, 1293 // though it's misleading (subset, not entire dtd) 1294 pushURL(true, "[dtd]", 1295 new ExternalIdentifiers(publicId, systemId, null), 1296 subset.getCharacterStream(), 1297 subset.getByteStream(), 1298 subset.getEncoding(), 1299 false); 1300 1301 // Loop until we end up back at '>' 1302 while (true) 1303 { 1304 doReport = expandPE = true; 1305 skipWhitespace(); 1306 doReport = expandPE = false; 1307 if (tryRead('>')) 1308 { 1309 break; 1310 } 1311 else 1312 { 1313 expandPE = true; 1314 parseMarkupdecl(); 1315 expandPE = false; 1316 } 1317 } 1318 1319 // the ">" string isn't popped yet 1320 if (inputStack.size() != 1) 1321 { 1322 error("external subset has unmatched '>'"); 1323 } 1324 1325 handler.endDoctype(); 1326 } 1327 } 1328 1329 // Determine the current content type. 1330 currentElement = gi; 1331 element = (ElementDecl) elementInfo.get(gi); 1332 currentElementContent = getContentType(element, CONTENT_ANY); 1333 1334 // Read the attributes, if any. 1335 // After this loop, "c" is the closing delimiter. 1336 boolean white = tryWhitespace(); 1337 c = readCh(); 1338 while (c != '/' && c != '>') 1339 { 1340 unread(c); 1341 if (!white) 1342 { 1343 error("need whitespace between attributes"); 1344 } 1345 parseAttribute(gi); 1346 white = tryWhitespace(); 1347 c = readCh(); 1348 } 1349 1350 // Supply any defaulted attributes. 1351 Iterator atts = declaredAttributes(element); 1352 if (atts != null) 1353 { 1354 String aname; 1355 loop: 1356 while (atts.hasNext()) 1357 { 1358 aname = (String) atts.next(); 1359 // See if it was specified. 
1360 for (int i = 0; i < tagAttributePos; i++) 1361 { 1362 if (tagAttributes[i] == aname) 1363 { 1364 continue loop; 1365 } 1366 } 1367 // ... or has a default 1368 String value = getAttributeDefaultValue(gi, aname); 1369 1370 if (value == null) 1371 { 1372 continue; 1373 } 1374 handler.attribute(aname, value, false); 1375 } 1376 } 1377 1378 // Figure out if this is a start tag 1379 // or an empty element, and dispatch an 1380 // event accordingly. 1381 switch (c) 1382 { 1383 case '>': 1384 handler.startElement(gi); 1385 parseContent(); 1386 break; 1387 case '/': 1388 require('>'); 1389 handler.startElement(gi); 1390 handler.endElement(gi); 1391 break; 1392 } 1393 1394 // Restore the previous state. 1395 currentElement = oldElement; 1396 currentElementContent = oldElementContent; 1397 } 1398 1399 /** 1400 * Parse an attribute assignment. 1401 * <pre> 1402 * [41] Attribute ::= Name Eq AttValue 1403 * </pre> 1404 * @param name The name of the attribute's element. 1405 * @see SAXDriver#attribute 1406 */ parseAttribute(String name)1407 private void parseAttribute(String name) 1408 throws Exception 1409 { 1410 String aname; 1411 String type; 1412 String value; 1413 int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF; 1414 1415 // Read the attribute name. 1416 aname = readNmtoken(true); 1417 type = getAttributeType(name, aname); 1418 1419 // Parse '=' 1420 parseEq(); 1421 1422 // Read the value, normalizing whitespace 1423 // unless it is CDATA. 
1424 if (handler.stringInterning) 1425 { 1426 if (type == "CDATA" || type == null) 1427 { 1428 value = readLiteral(flags); 1429 } 1430 else 1431 { 1432 value = readLiteral(flags | LIT_NORMALIZE); 1433 } 1434 } 1435 else 1436 { 1437 if (type == null || type.equals("CDATA")) 1438 { 1439 value = readLiteral(flags); 1440 } 1441 else 1442 { 1443 value = readLiteral(flags | LIT_NORMALIZE); 1444 } 1445 } 1446 1447 // WFC: no duplicate attributes 1448 for (int i = 0; i < tagAttributePos; i++) 1449 { 1450 if (aname.equals(tagAttributes [i])) 1451 { 1452 error("duplicate attribute", aname, null); 1453 } 1454 } 1455 1456 // Inform the handler about the 1457 // attribute. 1458 handler.attribute(aname, value, true); 1459 dataBufferPos = 0; 1460 1461 // Note that the attribute has been 1462 // specified. 1463 if (tagAttributePos == tagAttributes.length) 1464 { 1465 String newAttrib[] = new String[tagAttributes.length * 2]; 1466 System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos); 1467 tagAttributes = newAttrib; 1468 } 1469 tagAttributes[tagAttributePos++] = aname; 1470 } 1471 1472 /** 1473 * Parse an equals sign surrounded by optional whitespace. 1474 * <pre> 1475 * [25] Eq ::= S? '=' S? 1476 * </pre> 1477 */ parseEq()1478 private void parseEq() 1479 throws SAXException, IOException 1480 { 1481 skipWhitespace(); 1482 require('='); 1483 skipWhitespace(); 1484 } 1485 1486 /** 1487 * Parse an end tag. 1488 * <pre> 1489 * [42] ETag ::= '</' Name S? '>' 1490 * </pre> 1491 * <p>NOTE: parseContent () chains to here, we already read the 1492 * "</". 1493 */ parseETag()1494 private void parseETag() 1495 throws Exception 1496 { 1497 require(currentElement); 1498 skipWhitespace(); 1499 require('>'); 1500 handler.endElement(currentElement); 1501 // not re-reporting any SAXException re bogus end tags, 1502 // even though that diagnostic might be clearer ... 1503 } 1504 1505 /** 1506 * Parse the content of an element. 
1507 * <pre> 1508 * [43] content ::= (element | CharData | Reference 1509 * | CDSect | PI | Comment)* 1510 * [67] Reference ::= EntityRef | CharRef 1511 * </pre> 1512 * <p> NOTE: consumes ETtag. 1513 */ parseContent()1514 private void parseContent() 1515 throws Exception 1516 { 1517 char c; 1518 1519 while (true) 1520 { 1521 // consume characters (or ignorable whitspace) until delimiter 1522 parseCharData(); 1523 1524 // Handle delimiters 1525 c = readCh(); 1526 switch (c) 1527 { 1528 case '&': // Found "&" 1529 c = readCh(); 1530 if (c == '#') 1531 { 1532 parseCharRef(); 1533 } 1534 else 1535 { 1536 unread(c); 1537 parseEntityRef(true); 1538 } 1539 isDirtyCurrentElement = true; 1540 break; 1541 1542 case '<': // Found "<" 1543 dataBufferFlush(); 1544 c = readCh(); 1545 switch (c) 1546 { 1547 case '!': // Found "<!" 1548 c = readCh(); 1549 switch (c) 1550 { 1551 case '-': // Found "<!-" 1552 require('-'); 1553 isDirtyCurrentElement = false; 1554 parseComment(); 1555 break; 1556 case '[': // Found "<![" 1557 isDirtyCurrentElement = false; 1558 require("CDATA["); 1559 handler.startCDATA(); 1560 inCDATA = true; 1561 parseCDSect(); 1562 inCDATA = false; 1563 handler.endCDATA(); 1564 break; 1565 default: 1566 error("expected comment or CDATA section", c, null); 1567 break; 1568 } 1569 break; 1570 1571 case '?': // Found "<?" 1572 isDirtyCurrentElement = false; 1573 parsePI(); 1574 break; 1575 1576 case '/': // Found "</" 1577 isDirtyCurrentElement = false; 1578 parseETag(); 1579 return; 1580 1581 default: // Found "<" followed by something else 1582 isDirtyCurrentElement = false; 1583 unread(c); 1584 parseElement(false); 1585 break; 1586 } 1587 } 1588 } 1589 } 1590 1591 /** 1592 * Parse an element type declaration. 1593 * <pre> 1594 * [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' 1595 * </pre> 1596 * <p> NOTE: the '<!ELEMENT' has already been read. 
1597 */ parseElementDecl()1598 private void parseElementDecl() 1599 throws Exception 1600 { 1601 String name; 1602 1603 requireWhitespace(); 1604 // Read the element type name. 1605 name = readNmtoken(true); 1606 1607 requireWhitespace(); 1608 // Read the content model. 1609 parseContentspec(name); 1610 1611 skipWhitespace(); 1612 require('>'); 1613 } 1614 1615 /** 1616 * Content specification. 1617 * <pre> 1618 * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements 1619 * </pre> 1620 */ parseContentspec(String name)1621 private void parseContentspec(String name) 1622 throws Exception 1623 { 1624 // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ... 1625 if (tryRead("EMPTY")) 1626 { 1627 setElement(name, CONTENT_EMPTY, null, null); 1628 if (!skippedPE) 1629 { 1630 handler.getDeclHandler().elementDecl(name, "EMPTY"); 1631 } 1632 return; 1633 } 1634 else if (tryRead("ANY")) 1635 { 1636 setElement(name, CONTENT_ANY, null, null); 1637 if (!skippedPE) 1638 { 1639 handler.getDeclHandler().elementDecl(name, "ANY"); 1640 } 1641 return; 1642 } 1643 else 1644 { 1645 String model; 1646 char[] saved; 1647 1648 require('('); 1649 saved = readBuffer; 1650 dataBufferAppend('('); 1651 skipWhitespace(); 1652 if (tryRead("#PCDATA")) 1653 { 1654 dataBufferAppend("#PCDATA"); 1655 parseMixed(saved); 1656 model = dataBufferToString(); 1657 setElement(name, CONTENT_MIXED, model, null); 1658 } 1659 else 1660 { 1661 parseElements(saved); 1662 model = dataBufferToString(); 1663 setElement(name, CONTENT_ELEMENTS, model, null); 1664 } 1665 if (!skippedPE) 1666 { 1667 handler.getDeclHandler().elementDecl(name, model); 1668 } 1669 } 1670 } 1671 1672 /** 1673 * Parse an element-content model. 1674 * <pre> 1675 * [47] elements ::= (choice | seq) ('?' | '*' | '+')? 1676 * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')' 1677 * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')' 1678 * </pre> 1679 * 1680 * <p> NOTE: the opening '(' and S have already been read. 
1681 * 1682 * @param saved Buffer for entity that should have the terminal ')' 1683 */ parseElements(char[] saved)1684 private void parseElements(char[] saved) 1685 throws Exception 1686 { 1687 char c; 1688 char sep; 1689 1690 // Parse the first content particle 1691 skipWhitespace(); 1692 parseCp(); 1693 1694 // Check for end or for a separator. 1695 skipWhitespace(); 1696 c = readCh(); 1697 switch (c) 1698 { 1699 case ')': 1700 // VC: Proper Group/PE Nesting 1701 if (readBuffer != saved) 1702 { 1703 handler.verror("Illegal Group/PE nesting"); 1704 } 1705 1706 dataBufferAppend(')'); 1707 c = readCh(); 1708 switch (c) 1709 { 1710 case '*': 1711 case '+': 1712 case '?': 1713 dataBufferAppend(c); 1714 break; 1715 default: 1716 unread(c); 1717 } 1718 return; 1719 case ',': // Register the separator. 1720 case '|': 1721 sep = c; 1722 dataBufferAppend(c); 1723 break; 1724 default: 1725 error("bad separator in content model", c, null); 1726 return; 1727 } 1728 1729 // Parse the rest of the content model. 1730 while (true) 1731 { 1732 skipWhitespace(); 1733 parseCp(); 1734 skipWhitespace(); 1735 c = readCh(); 1736 if (c == ')') 1737 { 1738 // VC: Proper Group/PE Nesting 1739 if (readBuffer != saved) 1740 { 1741 handler.verror("Illegal Group/PE nesting"); 1742 } 1743 1744 dataBufferAppend(')'); 1745 break; 1746 } 1747 else if (c != sep) 1748 { 1749 error("bad separator in content model", c, null); 1750 return; 1751 } 1752 else 1753 { 1754 dataBufferAppend(c); 1755 } 1756 } 1757 1758 // Check for the occurrence indicator. 1759 c = readCh(); 1760 switch (c) 1761 { 1762 case '?': 1763 case '*': 1764 case '+': 1765 dataBufferAppend(c); 1766 return; 1767 default: 1768 unread(c); 1769 return; 1770 } 1771 } 1772 1773 /** 1774 * Parse a content particle. 1775 * <pre> 1776 * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')? 
1777 * </pre> 1778 */ parseCp()1779 private void parseCp() 1780 throws Exception 1781 { 1782 if (tryRead('(')) 1783 { 1784 dataBufferAppend('('); 1785 parseElements(readBuffer); 1786 } 1787 else 1788 { 1789 dataBufferAppend(readNmtoken(true)); 1790 char c = readCh(); 1791 switch (c) 1792 { 1793 case '?': 1794 case '*': 1795 case '+': 1796 dataBufferAppend(c); 1797 break; 1798 default: 1799 unread(c); 1800 break; 1801 } 1802 } 1803 } 1804 1805 /** 1806 * Parse mixed content. 1807 * <pre> 1808 * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*' 1809 * | '(' S? ('#PCDATA') S? ')' 1810 * </pre> 1811 * 1812 * @param saved Buffer for entity that should have the terminal ')' 1813 */ parseMixed(char[] saved)1814 private void parseMixed(char[] saved) 1815 throws Exception 1816 { 1817 // Check for PCDATA alone. 1818 skipWhitespace(); 1819 if (tryRead(')')) 1820 { 1821 // VC: Proper Group/PE Nesting 1822 if (readBuffer != saved) 1823 { 1824 handler.verror("Illegal Group/PE nesting"); 1825 } 1826 1827 dataBufferAppend(")*"); 1828 tryRead('*'); 1829 return; 1830 } 1831 1832 // Parse mixed content. 1833 skipWhitespace(); 1834 while (!tryRead(")")) 1835 { 1836 require('|'); 1837 dataBufferAppend('|'); 1838 skipWhitespace(); 1839 dataBufferAppend(readNmtoken(true)); 1840 skipWhitespace(); 1841 } 1842 1843 // VC: Proper Group/PE Nesting 1844 if (readBuffer != saved) 1845 { 1846 handler.verror("Illegal Group/PE nesting"); 1847 } 1848 1849 require('*'); 1850 dataBufferAppend(")*"); 1851 } 1852 1853 /** 1854 * Parse an attribute list declaration. 1855 * <pre> 1856 * [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' 1857 * </pre> 1858 * <p>NOTE: the '<!ATTLIST' has already been read. 
1859 */ parseAttlistDecl()1860 private void parseAttlistDecl() 1861 throws Exception 1862 { 1863 String elementName; 1864 1865 requireWhitespace(); 1866 elementName = readNmtoken(true); 1867 boolean white = tryWhitespace(); 1868 while (!tryRead('>')) 1869 { 1870 if (!white) 1871 { 1872 error("whitespace required before attribute definition"); 1873 } 1874 parseAttDef(elementName); 1875 white = tryWhitespace(); 1876 } 1877 } 1878 1879 /** 1880 * Parse a single attribute definition. 1881 * <pre> 1882 * [53] AttDef ::= S Name S AttType S DefaultDecl 1883 * </pre> 1884 */ parseAttDef(String elementName)1885 private void parseAttDef(String elementName) 1886 throws Exception 1887 { 1888 String name; 1889 String type; 1890 String enumer = null; 1891 1892 // Read the attribute name. 1893 name = readNmtoken(true); 1894 1895 // Read the attribute type. 1896 requireWhitespace(); 1897 type = readAttType(); 1898 1899 // Get the string of enumerated values if necessary. 1900 if (handler.stringInterning) 1901 { 1902 if ("ENUMERATION" == type || "NOTATION" == type) 1903 { 1904 enumer = dataBufferToString(); 1905 } 1906 } 1907 else 1908 { 1909 if ("ENUMERATION".equals(type) || "NOTATION".equals(type)) 1910 { 1911 enumer = dataBufferToString(); 1912 } 1913 } 1914 1915 // Read the default value. 1916 requireWhitespace(); 1917 parseDefault(elementName, name, type, enumer); 1918 } 1919 1920 /** 1921 * Parse the attribute type. 
1922 * <pre> 1923 * [54] AttType ::= StringType | TokenizedType | EnumeratedType 1924 * [55] StringType ::= 'CDATA' 1925 * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' 1926 * | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' 1927 * [57] EnumeratedType ::= NotationType | Enumeration 1928 * </pre> 1929 */ readAttType()1930 private String readAttType() 1931 throws Exception 1932 { 1933 if (tryRead('(')) 1934 { 1935 parseEnumeration(false); 1936 return "ENUMERATION"; 1937 } 1938 else 1939 { 1940 String typeString = readNmtoken(true); 1941 if (handler.stringInterning) 1942 { 1943 if ("NOTATION" == typeString) 1944 { 1945 parseNotationType(); 1946 return typeString; 1947 } 1948 else if ("CDATA" == typeString 1949 || "ID" == typeString 1950 || "IDREF" == typeString 1951 || "IDREFS" == typeString 1952 || "ENTITY" == typeString 1953 || "ENTITIES" == typeString 1954 || "NMTOKEN" == typeString 1955 || "NMTOKENS" == typeString) 1956 { 1957 return typeString; 1958 } 1959 } 1960 else 1961 { 1962 if ("NOTATION".equals(typeString)) 1963 { 1964 parseNotationType(); 1965 return typeString; 1966 } 1967 else if ("CDATA".equals(typeString) 1968 || "ID".equals(typeString) 1969 || "IDREF".equals(typeString) 1970 || "IDREFS".equals(typeString) 1971 || "ENTITY".equals(typeString) 1972 || "ENTITIES".equals(typeString) 1973 || "NMTOKEN".equals(typeString) 1974 || "NMTOKENS".equals(typeString)) 1975 { 1976 return typeString; 1977 } 1978 } 1979 error("illegal attribute type", typeString, null); 1980 return null; 1981 } 1982 } 1983 1984 /** 1985 * Parse an enumeration. 1986 * <pre> 1987 * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' 1988 * </pre> 1989 * <p>NOTE: the '(' has already been read. 1990 */ parseEnumeration(boolean isNames)1991 private void parseEnumeration(boolean isNames) 1992 throws Exception 1993 { 1994 dataBufferAppend('('); 1995 1996 // Read the first token. 
1997 skipWhitespace(); 1998 dataBufferAppend(readNmtoken(isNames)); 1999 // Read the remaining tokens. 2000 skipWhitespace(); 2001 while (!tryRead(')')) 2002 { 2003 require('|'); 2004 dataBufferAppend('|'); 2005 skipWhitespace(); 2006 dataBufferAppend(readNmtoken (isNames)); 2007 skipWhitespace(); 2008 } 2009 dataBufferAppend(')'); 2010 } 2011 2012 /** 2013 * Parse a notation type for an attribute. 2014 * <pre> 2015 * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks 2016 * (S? '|' S? name)* S? ')' 2017 * </pre> 2018 * <p>NOTE: the 'NOTATION' has already been read 2019 */ parseNotationType()2020 private void parseNotationType() 2021 throws Exception 2022 { 2023 requireWhitespace(); 2024 require('('); 2025 2026 parseEnumeration(true); 2027 } 2028 2029 /** 2030 * Parse the default value for an attribute. 2031 * <pre> 2032 * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' 2033 * | (('#FIXED' S)? AttValue) 2034 * </pre> 2035 */ parseDefault(String elementName, String name, String type, String enumer)2036 private void parseDefault(String elementName, String name, 2037 String type, String enumer) 2038 throws Exception 2039 { 2040 int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; 2041 String value = null; 2042 int flags = LIT_ATTRIBUTE; 2043 boolean saved = expandPE; 2044 String defaultType = null; 2045 2046 // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace 2047 // chars to spaces (doesn't matter when that's done if it doesn't 2048 // interfere with char refs expanding to whitespace). 
2049 2050 if (!skippedPE) 2051 { 2052 flags |= LIT_ENTITY_REF; 2053 if (handler.stringInterning) 2054 { 2055 if ("CDATA" != type) 2056 { 2057 flags |= LIT_NORMALIZE; 2058 } 2059 } 2060 else 2061 { 2062 if (!"CDATA".equals(type)) 2063 { 2064 flags |= LIT_NORMALIZE; 2065 } 2066 } 2067 } 2068 2069 expandPE = false; 2070 if (tryRead('#')) 2071 { 2072 if (tryRead("FIXED")) 2073 { 2074 defaultType = "#FIXED"; 2075 valueType = ATTRIBUTE_DEFAULT_FIXED; 2076 requireWhitespace(); 2077 value = readLiteral(flags); 2078 } 2079 else if (tryRead("REQUIRED")) 2080 { 2081 defaultType = "#REQUIRED"; 2082 valueType = ATTRIBUTE_DEFAULT_REQUIRED; 2083 } 2084 else if (tryRead("IMPLIED")) 2085 { 2086 defaultType = "#IMPLIED"; 2087 valueType = ATTRIBUTE_DEFAULT_IMPLIED; 2088 } 2089 else 2090 { 2091 error("illegal keyword for attribute default value"); 2092 } 2093 } 2094 else 2095 { 2096 value = readLiteral(flags); 2097 } 2098 expandPE = saved; 2099 setAttribute(elementName, name, type, enumer, value, valueType); 2100 if (handler.stringInterning) 2101 { 2102 if ("ENUMERATION" == type) 2103 { 2104 type = enumer; 2105 } 2106 else if ("NOTATION" == type) 2107 { 2108 type = "NOTATION " + enumer; 2109 } 2110 } 2111 else 2112 { 2113 if ("ENUMERATION".equals(type)) 2114 { 2115 type = enumer; 2116 } 2117 else if ("NOTATION".equals(type)) 2118 { 2119 type = "NOTATION " + enumer; 2120 } 2121 } 2122 if (!skippedPE) 2123 { 2124 handler.getDeclHandler().attributeDecl(elementName, name, type, 2125 defaultType, value); 2126 } 2127 } 2128 2129 /** 2130 * Parse a conditional section. 2131 * <pre> 2132 * [61] conditionalSect ::= includeSect || ignoreSect 2133 * [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' 2134 * extSubsetDecl ']]>' 2135 * [63] ignoreSect ::= '<![' S? 'IGNORE' S? 
'[' 2136 * ignoreSectContents* ']]>' 2137 * [64] ignoreSectContents ::= Ignore 2138 * ('<![' ignoreSectContents* ']]>' Ignore )* 2139 * [65] Ignore ::= Char* - (Char* ( '<![' | ']]>') Char* ) 2140 * </pre> 2141 * <p> NOTE: the '>![' has already been read. 2142 */ parseConditionalSect(char[] saved)2143 private void parseConditionalSect(char[] saved) 2144 throws Exception 2145 { 2146 skipWhitespace(); 2147 if (tryRead("INCLUDE")) 2148 { 2149 skipWhitespace(); 2150 require('['); 2151 // VC: Proper Conditional Section/PE Nesting 2152 if (readBuffer != saved) 2153 { 2154 handler.verror("Illegal Conditional Section/PE nesting"); 2155 } 2156 skipWhitespace(); 2157 while (!tryRead("]]>")) 2158 { 2159 parseMarkupdecl(); 2160 skipWhitespace(); 2161 } 2162 } 2163 else if (tryRead("IGNORE")) 2164 { 2165 skipWhitespace(); 2166 require('['); 2167 // VC: Proper Conditional Section/PE Nesting 2168 if (readBuffer != saved) 2169 { 2170 handler.verror("Illegal Conditional Section/PE nesting"); 2171 } 2172 int nesting = 1; 2173 char c; 2174 expandPE = false; 2175 for (int nest = 1; nest > 0; ) 2176 { 2177 c = readCh(); 2178 switch (c) 2179 { 2180 case '<': 2181 if (tryRead("![")) 2182 { 2183 nest++; 2184 } 2185 break; 2186 case ']': 2187 if (tryRead("]>")) 2188 { 2189 nest--; 2190 } 2191 } 2192 } 2193 expandPE = true; 2194 } 2195 else 2196 { 2197 error("conditional section must begin with INCLUDE or IGNORE"); 2198 } 2199 } 2200 parseCharRef()2201 private void parseCharRef() 2202 throws SAXException, IOException 2203 { 2204 parseCharRef(true /* do flushDataBuffer by default */); 2205 } 2206 2207 /** 2208 * Try to read a character reference without consuming data from buffer. 2209 * <pre> 2210 * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' 2211 * </pre> 2212 * <p>NOTE: the '&#' has already been read. 
2213 */ tryReadCharRef()2214 private void tryReadCharRef() 2215 throws SAXException, IOException 2216 { 2217 int value = 0; 2218 char c; 2219 2220 if (tryRead('x')) 2221 { 2222 loop1: 2223 while (true) 2224 { 2225 c = readCh(); 2226 if (c == ';') 2227 { 2228 break loop1; 2229 } 2230 else 2231 { 2232 int n = Character.digit(c, 16); 2233 if (n == -1) 2234 { 2235 error("illegal character in character reference", c, null); 2236 break loop1; 2237 } 2238 value *= 16; 2239 value += n; 2240 } 2241 } 2242 } 2243 else 2244 { 2245 loop2: 2246 while (true) 2247 { 2248 c = readCh(); 2249 if (c == ';') 2250 { 2251 break loop2; 2252 } 2253 else 2254 { 2255 int n = Character.digit(c, 10); 2256 if (n == -1) 2257 { 2258 error("illegal character in character reference", c, null); 2259 break loop2; 2260 } 2261 value *= 10; 2262 value += n; 2263 } 2264 } 2265 } 2266 2267 // check for character refs being legal XML 2268 if ((value < 0x0020 2269 && ! (value == '\n' || value == '\t' || value == '\r')) 2270 || (value >= 0xD800 && value <= 0xDFFF) 2271 || value == 0xFFFE || value == 0xFFFF 2272 || value > 0x0010ffff) 2273 { 2274 error("illegal XML character reference U+" 2275 + Integer.toHexString(value)); 2276 } 2277 2278 // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz 2279 // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: 2280 if (value > 0x0010ffff) 2281 { 2282 // too big for surrogate 2283 error("character reference " + value + " is too large for UTF-16", 2284 Integer.toString(value), null); 2285 } 2286 2287 } 2288 2289 /** 2290 * Read and interpret a character reference. 2291 * <pre> 2292 * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' 2293 * </pre> 2294 * <p>NOTE: the '&#' has already been read. 
2295 */ parseCharRef(boolean doFlush)2296 private void parseCharRef(boolean doFlush) 2297 throws SAXException, IOException 2298 { 2299 int value = 0; 2300 char c; 2301 2302 if (tryRead('x')) 2303 { 2304 loop1: 2305 while (true) 2306 { 2307 c = readCh(); 2308 if (c == ';') 2309 { 2310 break loop1; 2311 } 2312 else 2313 { 2314 int n = Character.digit(c, 16); 2315 if (n == -1) 2316 { 2317 error("illegal character in character reference", c, null); 2318 break loop1; 2319 } 2320 value *= 16; 2321 value += n; 2322 } 2323 } 2324 } 2325 else 2326 { 2327 loop2: 2328 while (true) 2329 { 2330 c = readCh(); 2331 if (c == ';') 2332 { 2333 break loop2; 2334 } 2335 else 2336 { 2337 int n = Character.digit(c, 10); 2338 if (n == -1) 2339 { 2340 error("illegal character in character reference", c, null); 2341 break loop2; 2342 } 2343 value *= 10; 2344 value += c - '0'; 2345 } 2346 } 2347 } 2348 2349 // check for character refs being legal XML 2350 if ((value < 0x0020 2351 && ! (value == '\n' || value == '\t' || value == '\r')) 2352 || (value >= 0xD800 && value <= 0xDFFF) 2353 || value == 0xFFFE || value == 0xFFFF 2354 || value > 0x0010ffff) 2355 { 2356 error("illegal XML character reference U+" 2357 + Integer.toHexString(value)); 2358 } 2359 2360 // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz 2361 // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: 2362 if (value <= 0x0000ffff) 2363 { 2364 // no surrogates needed 2365 dataBufferAppend((char) value); 2366 } 2367 else if (value <= 0x0010ffff) 2368 { 2369 value -= 0x10000; 2370 // > 16 bits, surrogate needed 2371 dataBufferAppend((char) (0xd800 | (value >> 10))); 2372 dataBufferAppend((char) (0xdc00 | (value & 0x0003ff))); 2373 } 2374 else 2375 { 2376 // too big for surrogate 2377 error("character reference " + value + " is too large for UTF-16", 2378 Integer.toString(value), null); 2379 } 2380 if (doFlush) 2381 { 2382 dataBufferFlush(); 2383 } 2384 } 2385 2386 /** 2387 * Parse and expand an entity reference. 
2388 * <pre> 2389 * [68] EntityRef ::= '&' Name ';' 2390 * </pre> 2391 * <p>NOTE: the '&' has already been read. 2392 * @param externalAllowed External entities are allowed here. 2393 */ parseEntityRef(boolean externalAllowed)2394 private void parseEntityRef(boolean externalAllowed) 2395 throws SAXException, IOException 2396 { 2397 String name; 2398 2399 name = readNmtoken(true); 2400 require(';'); 2401 switch (getEntityType(name)) 2402 { 2403 case ENTITY_UNDECLARED: 2404 // NOTE: XML REC describes amazingly convoluted handling for 2405 // this case. Nothing as meaningful as being a WFness error 2406 // unless the processor might _legitimately_ not have seen a 2407 // declaration ... which is what this implements. 2408 String message; 2409 2410 message = "reference to undeclared general entity " + name; 2411 if (skippedPE && !docIsStandalone) 2412 { 2413 handler.verror(message); 2414 // we don't know this entity, and it might be external... 2415 if (externalAllowed) 2416 { 2417 handler.skippedEntity(name); 2418 } 2419 } 2420 else 2421 { 2422 error(message); 2423 } 2424 break; 2425 case ENTITY_INTERNAL: 2426 pushString(name, getEntityValue(name)); 2427 2428 //workaround for possible input pop before marking 2429 //the buffer reading position 2430 char t = readCh(); 2431 unread(t); 2432 int bufferPosMark = readBufferPos; 2433 2434 int end = readBufferPos + getEntityValue(name).length(); 2435 for (int k = readBufferPos; k < end; k++) 2436 { 2437 t = readCh(); 2438 if (t == '&') 2439 { 2440 t = readCh(); 2441 if (t == '#') 2442 { 2443 //try to match a character ref 2444 tryReadCharRef(); 2445 2446 //everything has been read 2447 if (readBufferPos >= end) 2448 { 2449 break; 2450 } 2451 k = readBufferPos; 2452 continue; 2453 } 2454 else if (Character.isLetter(t)) 2455 { 2456 //looks like an entity ref 2457 unread(t); 2458 readNmtoken(true); 2459 require(';'); 2460 2461 //everything has been read 2462 if (readBufferPos >= end) 2463 { 2464 break; 2465 } 2466 k = 
readBufferPos; 2467 continue; 2468 } 2469 error(" malformed entity reference"); 2470 } 2471 2472 } 2473 readBufferPos = bufferPosMark; 2474 break; 2475 case ENTITY_TEXT: 2476 if (externalAllowed) 2477 { 2478 pushURL(false, name, getEntityIds(name), 2479 null, null, null, true); 2480 } 2481 else 2482 { 2483 error("reference to external entity in attribute value.", 2484 name, null); 2485 } 2486 break; 2487 case ENTITY_NDATA: 2488 if (externalAllowed) 2489 { 2490 error("unparsed entity reference in content", name, null); 2491 } 2492 else 2493 { 2494 error("reference to external entity in attribute value.", 2495 name, null); 2496 } 2497 break; 2498 default: 2499 throw new RuntimeException(); 2500 } 2501 } 2502 2503 /** 2504 * Parse and expand a parameter entity reference. 2505 * <pre> 2506 * [69] PEReference ::= '%' Name ';' 2507 * </pre> 2508 * <p>NOTE: the '%' has already been read. 2509 */ parsePEReference()2510 private void parsePEReference() 2511 throws SAXException, IOException 2512 { 2513 String name; 2514 2515 name = "%" + readNmtoken(true); 2516 require(';'); 2517 switch (getEntityType(name)) 2518 { 2519 case ENTITY_UNDECLARED: 2520 // VC: Entity Declared 2521 handler.verror("reference to undeclared parameter entity " + name); 2522 2523 // we should disable handling of all subsequent declarations 2524 // unless this is a standalone document (info discarded) 2525 break; 2526 case ENTITY_INTERNAL: 2527 if (inLiteral) 2528 { 2529 pushString(name, getEntityValue(name)); 2530 } 2531 else 2532 { 2533 pushString(name, ' ' + getEntityValue(name) + ' '); 2534 } 2535 break; 2536 case ENTITY_TEXT: 2537 if (!inLiteral) 2538 { 2539 pushString(null, " "); 2540 } 2541 pushURL(true, name, getEntityIds(name), null, null, null, true); 2542 if (!inLiteral) 2543 { 2544 pushString(null, " "); 2545 } 2546 break; 2547 } 2548 } 2549 2550 /** 2551 * Parse an entity declaration. 
* <pre>
   * [70] EntityDecl ::= GEDecl | PEDecl
   * [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
   * [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
   * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
   * [74] PEDef ::= EntityValue | ExternalID
   * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
   *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
   * [76] NDataDecl ::= S 'NDATA' S Name
   * </pre>
   * <p>NOTE: the '<!ENTITY' has already been read.
   * <p>Parameter entities are registered under their name with a leading
   * '%'.  External declarations are neither recorded nor reported when
   * <code>skippedPE</code> is set.
   */
  private void parseEntityDecl()
    throws Exception
  {
    boolean peFlag = false;
    int flags = 0;  // never modified below; passed to readLiteral as-is

    // Check for a parameter entity.
    // PE expansion is switched off while looking for the '%' marker and
    // switched back on straight afterwards.
    expandPE = false;
    requireWhitespace();
    if (tryRead('%'))
    {
      peFlag = true;
      requireWhitespace();
    }
    expandPE = true;

    // Read the entity name, and prepend
    // '%' if necessary.
    String name = readNmtoken(true);
    //NE08
    if (name.indexOf(':') >= 0)
    {
      error("Illegal character(':') in entity name ", name, null);
    }
    if (peFlag)
    {
      name = "%" + name;
    }

    // Read the entity value.
    requireWhitespace();
    // Peek at the next character: a quote means an internal entity.
    char c = readCh();
    unread (c);
    if (c == '"' || c == '\'')
    {
      // Internal entity ... replacement text has expanded refs
      // to characters and PEs, but not to general entities
      String value = readLiteral(flags);
      setInternalEntity(name, value);
    }
    else
    {
      // Read the external IDs
      ExternalIdentifiers ids = readExternalIds(false, false);

      // Check for NDATA declaration.
      // NDATA is only looked for on general entities (!peFlag).
      boolean white = tryWhitespace();
      if (!peFlag && tryRead("NDATA"))
      {
        if (!white)
        {
          error("whitespace required before NDATA");
        }
        requireWhitespace();
        String notationName = readNmtoken(true);
        if (!skippedPE)
        {
          setExternalEntity(name, ENTITY_NDATA, ids, notationName);
          handler.unparsedEntityDecl(name, ids.publicId, ids.systemId,
                                     ids.baseUri, notationName);
        }
      }
      else if (!skippedPE)
      {
        setExternalEntity(name, ENTITY_TEXT, ids, null);
        // The system id is absolutized against the base URI only when
        // handler.resolveURIs() says so; otherwise it is passed as read.
        handler.getDeclHandler()
          .externalEntityDecl(name, ids.publicId,
                              handler.resolveURIs()
                              // FIXME: ASSUMES not skipped
                              // "false" forces error on bad URI
                              ? handler.absolutize(ids.baseUri,
                                                   ids.systemId,
                                                   false)
                              : ids.systemId);
      }
    }

    // Finish the declaration.
    skipWhitespace();
    require('>');
  }

  /**
   * Parse a notation declaration.
   * <pre>
   * [82] NotationDecl ::= '<!NOTATION' S Name S
   *                       (ExternalID | PublicID) S? '>'
   * [83] PublicID ::= 'PUBLIC' S PubidLiteral
   * </pre>
   * <P>NOTE: the '<!NOTATION' has already been read.
   * <p>The identifiers are read with <code>inNotation</code> set, so a
   * system identifier after PUBLIC is optional.
   */
  private void parseNotationDecl()
    throws Exception
  {
    String nname;
    ExternalIdentifiers ids;

    requireWhitespace();
    nname = readNmtoken(true);
    //NE08
    if (nname.indexOf(':') >= 0)
    {
      error("Illegal character(':') in notation name ", nname, null);
    }
    requireWhitespace();

    // Read the external identifiers.
    ids = readExternalIds(true, false);

    // Register the notation.
    setNotation(nname, ids);

    skipWhitespace();
    require('>');
  }

  /**
   * Parse character data.
* <pre>
   * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
   * </pre>
   * <p>Scans directly in <code>readBuffer</code> and reports each scanned
   * run to the handler, as ignorable whitespace or as character data.
   * Scanning stops at '&' or '<' (clean end) or at "]]>" (reported as
   * an error).
   */
  private void parseCharData()
    throws Exception
  {
    char c;
    // 0 = keep scanning, 1 = stopped at '&' or '<', 2 = stopped at "]]>"
    int state = 0;
    boolean pureWhite = false;

    // assert (dataBufferPos == 0);

    // are we expecting pure whitespace?  it might be dirty...
    if ((currentElementContent == CONTENT_ELEMENTS) && !isDirtyCurrentElement)
    {
      pureWhite = true;
    }

    // always report right out of readBuffer
    // to minimize (pointless) buffer copies
    while (true)
    {
      int lineAugment = 0;
      int columnAugment = 0;
      int i;

      loop:
      for (i = readBufferPos; i < readBufferLength; i++)
      {
        switch (c = readBuffer[i])
        {
        case '\n':
          lineAugment++;
          columnAugment = 0;
          // pureWhite unmodified
          break;
        case '\r': // should not happen!!
        case '\t':
        case ' ':
          // pureWhite unmodified
          columnAugment++;
          break;
        case '&':
        case '<':
          columnAugment++;
          // pureWhite unmodified
          // CLEAN end of text sequence
          state = 1;
          break loop;
        case ']':
          // that's not a whitespace char, and
          // can not terminate pure whitespace either
          pureWhite = false;
          // "]]>" is only detected when all three characters are
          // already inside the current buffer.
          if ((i + 2) < readBufferLength)
          {
            if (readBuffer [i + 1] == ']'
                && readBuffer [i + 2] == '>')
            {
              // ERROR end of text sequence
              state = 2;
              break loop;
            }
          }
          else
          {
            // FIXME missing two end-of-buffer cases
          }
          columnAugment++;
          break;
        default:
          // The 0x7f-0x9f range (except 0x85) is rejected only when
          // xmlVersion == XML_11.
          if ((c < 0x0020 || c > 0xFFFD)
              || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
                  && xmlVersion == XML_11))
          {
            error("illegal XML character U+"
                  + Integer.toHexString(c));
          }
          // that's not a whitespace char
          pureWhite = false;
          columnAugment++;
        }
      }

      // report text thus far
      if (lineAugment > 0)
      {
        line += lineAugment;
        column = columnAugment;
      }
      else
      {
        column += columnAugment;
      }

      // report characters/whitespace
      int length = i - readBufferPos;

      if (length != 0)
      {
        if (pureWhite)
        {
          handler.ignorableWhitespace(readBuffer,
                                      readBufferPos, length);
        }
        else
        {
          handler.charData(readBuffer, readBufferPos, length);
        }
        readBufferPos = i;
      }

      if (state != 0)
      {
        break;
      }

      // fill next buffer from this entity, or
      // pop stack and continue with previous entity
      unread(readCh());
    }
    if (!pureWhite)
    {
      isDirtyCurrentElement = true;
    }
    // The loop above only exits with state 1 or 2; anything other than
    // the clean end (1) means the scan stopped on "]]>".
    if (state != 1)
    {
      error("character data may not contain ']]>'");
    }
  }

  //////////////////////////////////////////////////////////////////////
  // High-level reading and scanning methods.
  //////////////////////////////////////////////////////////////////////

  /**
   * Require whitespace characters.
   * Reads one character; if it is whitespace the rest of the run is
   * skipped, otherwise an error is reported.
   */
  private void requireWhitespace()
    throws SAXException, IOException
  {
    char c = readCh();
    if (isWhitespace(c))
    {
      skipWhitespace();
    }
    else
    {
      error("whitespace required", c, null);
    }
  }

  /**
   * Skip whitespace characters.
   * <pre>
   * [3] S ::= (#x20 | #x9 | #xd | #xa)+
   * </pre>
   */
  private void skipWhitespace()
    throws SAXException, IOException
  {
    // Start with a little cheat.  Most of
    // the time, the white space will fall
    // within the current read buffer; if
    // not, then fall through.
    if (USE_CHEATS)
    {
      int lineAugment = 0;
      int columnAugment = 0;

      loop:
      for (int i = readBufferPos; i < readBufferLength; i++)
      {
        switch (readBuffer[i])
        {
        case ' ':
        case '\t':
        case '\r':
          columnAugment++;
          break;
        case '\n':
          lineAugment++;
          columnAugment = 0;
          break;
        case '%':
          // With PE expansion on, leave the fast path without
          // committing anything; the slow path below takes over.
          if (expandPE)
          {
            break loop;
          }
          // else fall through...
        default:
          // First non-whitespace character: commit the position and
          // the line/column counters, then finish.
          readBufferPos = i;
          if (lineAugment > 0)
          {
            line += lineAugment;
            column = columnAugment;
          }
          else
          {
            column += columnAugment;
          }
          return;
        }
      }
    }

    // OK, do it the slow way.
    // Reached when the buffer ran out or a '%' was seen with expandPE
    // set; readBufferPos was not advanced by the loop above.
    char c = readCh ();
    while (isWhitespace(c))
    {
      c = readCh();
    }
    unread(c);
  }

  /**
   * Read a name or (when parsing an enumeration) name token.
   * <pre>
   * [5] Name ::= (Letter | '_' | ':') (NameChar)*
   * [7] Nmtoken ::= (NameChar)+
   * </pre>
   * <p>A fast path scans the current read buffer in place; if the buffer
   * ends before a terminator is found (or a '%' is met while
   * <code>expandPE</code> is set) the method falls back to reading
   * character by character into <code>nameBuffer</code>.
   * @param isName true to apply the name-start checks to the first
   *   character; false to treat every character as a name character.
   * @return the token, obtained through <code>intern</code>.
   */
  private String readNmtoken(boolean isName)
    throws SAXException, IOException
  {
    char c;

    if (USE_CHEATS)
    {
      loop:
      for (int i = readBufferPos; i < readBufferLength; i++)
      {
        c = readBuffer[i];
        switch (c)
        {
        case '%':
          if (expandPE)
          {
            break loop;
          }
          // else fall through...

          // What may legitimately come AFTER a name/nmtoken?
        case '<': case '>': case '&':
        case ',': case '|': case '*': case '+': case '?':
        case ')':
        case '=':
        case '\'': case '"':
        case '[':
        case ' ': case '\t': case '\r': case '\n':
        case ';':
        case '/':
          int start = readBufferPos;
          if (i == start)
          {
            error("name expected", readBuffer[i], null);
          }
          readBufferPos = i;
          return intern(readBuffer, start, i - start);

        default:
          // FIXME ... per IBM's OASIS test submission, these:
          //   ?                U+06dd
          //   Combining        U+309B
          //these switches are kind of ugly but at least we won't
          //have to go over the whole list for each char
          // Explicit rejections for the first character of a name.  The
          // outer switch selects on the high byte, the inner ones on
          // the high nibble of the low byte (c2).
          if (isName && i == readBufferPos)
          {
            char c2 = (char) (c & 0x00f0);
            switch (c & 0xff00)
            {
              //starting with 01
            case 0x0100:
              switch (c2)
              {
              case 0x0030:
                if (c == 0x0132 || c == 0x0133 || c == 0x013f)
                {
                  error("Not a name start character, U+"
                        + Integer.toHexString(c));
                }
                break;
              case 0x0040:
                if (c == 0x0140 || c == 0x0149)
                {
                  error("Not a name start character, U+"
                        + Integer.toHexString(c));
                }
                break;
              case 0x00c0:
                if (c == 0x01c4 || c == 0x01cc)
                {
                  error("Not a name start character, U+"
                        + Integer.toHexString(c));
                }
                break;
              case 0x00f0:
                if (c == 0x01f1 || c == 0x01f3)
                {
                  error("Not a name start character, U+"
                        + Integer.toHexString(c));
                }
                break;
              case 0x00b0:
                // NOTE(review): this condition can never hold here --
                // 0x01f1 and 0x01f3 have c2 == 0x00f0, not 0x00b0.  It
                // duplicates the case above; the intended code points
                // are unknown.
                if (c == 0x01f1 || c == 0x01f3)
                {
                  error("Not a name start character, U+"
                        + Integer.toHexString(c));
                }
                break;
              default:
                if (c == 0x017f)
                {
                  error("Not a name start character, U+"
                        + Integer.toHexString(c));
                }
              }

              break;
              //starting with 11
            case 0x1100:
              switch (c2)
              {
              case 0x0000:
                if (c == 0x1104 || c == 0x1108 ||
                    c == 0x110a || c == 0x110d)
                {
                  error("Not a name start character, U+"
                        + Integer.toHexString(c));
                }
                break;
              case 0x0030:
                if (c == 0x113b || c == 0x113f)
                {
                  error("Not a name start character, U+"
                        + Integer.toHexString(c));
                }
                break;
              case 0x0040:
                if (c == 0x1141 || c == 0x114d
                    || c == 0x114f )
                {
                  error("Not a name start character, U+"
                        + Integer.toHexString(c));
                }
                break;
              case 0x0050:
                if (c == 0x1151 || c == 0x1156)
                {
                  error("Not a name start character, U+"
                        + Integer.toHexString(c));
                }
                break;
              case 0x0060:
                if (c == 0x1162 || c == 0x1164
                    || c == 0x1166 || c == 0x116b
                    || c == 0x116f)
                {
                  error("Not a name start character, U+"
                        + Integer.toHexString(c));
                }
                break;
              case 0x00b0:
                // NOTE(review): the 0x116f test can never hold under
                // this label (its c2 is 0x0060, handled above).
                if (c == 0x11b6 || c == 0x11b9
                    || c == 0x11bb || c == 0x116f)
                {
                  error("Not a name start character, U+"
                        + Integer.toHexString(c));
                }
                break;
              default:
                if (c == 0x1174 || c == 0x119f
                    || c == 0x11ac || c == 0x11c3
                    || c == 0x11f1)
                {
                  error("Not a name start character, U+"
                        + Integer.toHexString(c));
                }
              }
              break;
            default:
              if (c == 0x0e46 || c == 0x1011
                  || c == 0x212f || c == 0x0587
                  || c == 0x0230 )
              {
                error("Not a name start character, U+"
                      + Integer.toHexString(c));
              }
            }
          }
          // punt on exact tests from Appendix A; approximate
          // them using the Unicode ID start/part rules
          if (i == readBufferPos && isName)
          {
            if (!Character.isUnicodeIdentifierStart(c)
                && c != ':' && c != '_')
            {
              error("Not a name start character, U+"
                    + Integer.toHexString(c));
            }
          }
          else if (!Character.isUnicodeIdentifierPart(c)
                   && c != '-' && c != ':' && c != '_' && c != '.'
                   && !isExtender(c))
          {
            error("Not a name character, U+"
                  + Integer.toHexString(c));
          }
        }
      }
    }

    // Slow path.  readBufferPos was not advanced by the fast path, so
    // the token is read again from its first character.
    nameBufferPos = 0;

    // Read the first character.
    while (true)
    {
      c = readCh();
      switch (c)
      {
        // Unlike the fast path, '%' always terminates the token here.
      case '%':
      case '<': case '>': case '&':
      case ',': case '|': case '*': case '+': case '?':
      case ')':
      case '=':
      case '\'': case '"':
      case '[':
      case ' ': case '\t': case '\n': case '\r':
      case ';':
      case '/':
        unread(c);
        if (nameBufferPos == 0)
        {
          error ("name expected");
        }
        // punt on exact tests from Appendix A, but approximate them
        if (isName
            && !Character.isUnicodeIdentifierStart(nameBuffer[0])
            && ":_".indexOf(nameBuffer[0]) == -1)
        {
          error("Not a name start character, U+"
                + Integer.toHexString(nameBuffer[0]));
        }
        String s = intern(nameBuffer, 0, nameBufferPos);
        nameBufferPos = 0;
        return s;
      default:
        // punt on exact tests from Appendix A, but approximate them

        // The first character of a Name is checked when the token
        // ends (above), not here.
        if ((nameBufferPos != 0 || !isName)
            && !Character.isUnicodeIdentifierPart(c)
            && ":-_.".indexOf(c) == -1
            && !isExtender(c))
        {
          error("Not a name character, U+"
                + Integer.toHexString(c));
        }
        if (nameBufferPos >= nameBuffer.length)
        {
          nameBuffer =
            (char[]) extendArray(nameBuffer,
                                 nameBuffer.length, nameBufferPos);
        }
        nameBuffer[nameBufferPos++] = c;
      }
    }
  }

  /**
   * Test whether a character is one of the code points listed for the
   * Extender production.
   * @param c The character to test.
   * @return true if <code>c</code> is in the list below.
   */
  private static boolean isExtender(char c)
  {
    // [88] Extender ::= ...
    return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
      || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
      || (c >= 0x3031 && c <= 0x3035)
      || (c >= 0x309d && c <= 0x309e)
      || (c >= 0x30fc && c <= 0x30fe);
  }

  /**
   * Read a literal.  With matching single or double quotes as
   * delimiters (and not embedded!) this is used to parse:
   * <pre>
   * [9] EntityValue ::= ... ([^%&] | PEReference | Reference)* ...
   * [10] AttValue ::= ... ([^<&] | Reference)* ...
   * [11] SystemLiteral ::= ...
(URLchar - "'")* ... 3168 * [12] PubidLiteral ::= ... (PubidChar - "'")* ... 3169 * </pre> 3170 * as well as the quoted strings in XML and text declarations 3171 * (for version, encoding, and standalone) which have their 3172 * own constraints. 3173 */ readLiteral(int flags)3174 private String readLiteral(int flags) 3175 throws SAXException, IOException 3176 { 3177 char delim, c; 3178 int startLine = line; 3179 boolean saved = expandPE; 3180 boolean savedReport = doReport; 3181 3182 // Find the first delimiter. 3183 delim = readCh(); 3184 if (delim != '"' && delim != '\'') 3185 { 3186 error("expected '\"' or \"'\"", delim, null); 3187 return null; 3188 } 3189 inLiteral = true; 3190 if ((flags & LIT_DISABLE_PE) != 0) 3191 { 3192 expandPE = false; 3193 } 3194 doReport = false; 3195 3196 // Each level of input source has its own buffer; remember 3197 // ours, so we won't read the ending delimiter from any 3198 // other input source, regardless of entity processing. 3199 char[] ourBuf = readBuffer; 3200 3201 // Read the literal. 3202 try 3203 { 3204 c = readCh(); 3205 boolean ampRead = false; 3206 loop: 3207 while (! (c == delim && readBuffer == ourBuf)) 3208 { 3209 switch (c) 3210 { 3211 // attributes and public ids are normalized 3212 // in almost the same ways 3213 case '\n': 3214 case '\r': 3215 if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0) 3216 { 3217 c = ' '; 3218 } 3219 break; 3220 case '\t': 3221 if ((flags & LIT_ATTRIBUTE) != 0) 3222 { 3223 c = ' '; 3224 } 3225 break; 3226 case '&': 3227 c = readCh(); 3228 // Char refs are expanded immediately, except for 3229 // all the cases where it's deferred. 
3230 if (c == '#') 3231 { 3232 if ((flags & LIT_DISABLE_CREF) != 0) 3233 { 3234 dataBufferAppend('&'); 3235 break; 3236 } 3237 parseCharRef(false /* Do not do flushDataBuffer */); 3238 3239 // exotic WFness risk: this is an entity literal, 3240 // dataBuffer [dataBufferPos - 1] == '&', and 3241 // following chars are a _partial_ entity/char ref 3242 3243 // It looks like an entity ref ... 3244 } 3245 else 3246 { 3247 unread(c); 3248 // Expand it? 3249 if ((flags & LIT_ENTITY_REF) > 0) 3250 { 3251 parseEntityRef(false); 3252 if (String.valueOf(readBuffer).equals("&")) 3253 { 3254 ampRead = true; 3255 } 3256 //Is it just data? 3257 } 3258 else if ((flags & LIT_DISABLE_EREF) != 0) 3259 { 3260 dataBufferAppend('&'); 3261 3262 // OK, it will be an entity ref -- expanded later. 3263 } 3264 else 3265 { 3266 String name = readNmtoken(true); 3267 require(';'); 3268 dataBufferAppend('&'); 3269 dataBufferAppend(name); 3270 dataBufferAppend(';'); 3271 } 3272 } 3273 c = readCh(); 3274 continue loop; 3275 3276 case '<': 3277 // and why? Perhaps so "&foo;" expands the same 3278 // inside and outside an attribute? 3279 if ((flags & LIT_ATTRIBUTE) != 0) 3280 { 3281 error("attribute values may not contain '<'"); 3282 } 3283 break; 3284 3285 // We don't worry about case '%' and PE refs, readCh does. 3286 3287 default: 3288 break; 3289 } 3290 dataBufferAppend(c); 3291 c = readCh(); 3292 } 3293 } 3294 catch (EOFException e) 3295 { 3296 error("end of input while looking for delimiter (started on line " 3297 + startLine + ')', null, Character.toString(delim)); 3298 } 3299 inLiteral = false; 3300 expandPE = saved; 3301 doReport = savedReport; 3302 3303 // Normalise whitespace if necessary. 3304 if ((flags & LIT_NORMALIZE) > 0) 3305 { 3306 dataBufferNormalize(); 3307 } 3308 3309 // Return the value. 3310 return dataBufferToString(); 3311 } 3312 3313 /** 3314 * Try reading external identifiers. 3315 * A system identifier is not required for notations. 
3316 * @param inNotation Are we parsing a notation decl? 3317 * @param isSubset Parsing external subset decl (may be omitted)? 3318 * @return A three-member String array containing the identifiers, 3319 * or nulls. Order: public, system, baseURI. 3320 */ readExternalIds(boolean inNotation, boolean isSubset)3321 private ExternalIdentifiers readExternalIds(boolean inNotation, 3322 boolean isSubset) 3323 throws Exception 3324 { 3325 char c; 3326 ExternalIdentifiers ids = new ExternalIdentifiers(); 3327 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; 3328 3329 if (tryRead("PUBLIC")) 3330 { 3331 requireWhitespace(); 3332 ids.publicId = readLiteral(LIT_NORMALIZE | LIT_PUBID | flags); 3333 if (inNotation) 3334 { 3335 skipWhitespace(); 3336 c = readCh(); 3337 unread(c); 3338 if (c == '"' || c == '\'') 3339 { 3340 ids.systemId = readLiteral(flags); 3341 } 3342 } 3343 else 3344 { 3345 requireWhitespace(); 3346 ids.systemId = readLiteral(flags); 3347 } 3348 3349 for (int i = 0; i < ids.publicId.length(); i++) 3350 { 3351 c = ids.publicId.charAt(i); 3352 if (c >= 'a' && c <= 'z') 3353 { 3354 continue; 3355 } 3356 if (c >= 'A' && c <= 'Z') 3357 { 3358 continue; 3359 } 3360 if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf(c) != -1) 3361 { 3362 continue; 3363 } 3364 error("illegal PUBLIC id character U+" 3365 + Integer.toHexString(c)); 3366 } 3367 } 3368 else if (tryRead("SYSTEM")) 3369 { 3370 requireWhitespace(); 3371 ids.systemId = readLiteral(flags); 3372 } 3373 else if (!isSubset) 3374 { 3375 error("missing SYSTEM or PUBLIC keyword"); 3376 } 3377 3378 if (ids.systemId != null) 3379 { 3380 if (ids.systemId.indexOf('#') != -1) 3381 { 3382 handler.verror("SYSTEM id has a URI fragment: " + ids.systemId); 3383 } 3384 ids.baseUri = handler.getSystemId(); 3385 if (ids.baseUri == null && uriWarnings) 3386 { 3387 handler.warn("No base URI; hope URI is absolute: " 3388 + ids.systemId); 3389 } 3390 } 3391 3392 return ids; 3393 } 3394 3395 /** 3396 * Test if a 
character is whitespace. 3397 * <pre> 3398 * [3] S ::= (#x20 | #x9 | #xd | #xa)+ 3399 * </pre> 3400 * @param c The character to test. 3401 * @return true if the character is whitespace. 3402 */ isWhitespace(char c)3403 private final boolean isWhitespace(char c) 3404 { 3405 if (c > 0x20) 3406 { 3407 return false; 3408 } 3409 if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d) 3410 { 3411 return true; 3412 } 3413 return false; // illegal ... 3414 } 3415 3416 ////////////////////////////////////////////////////////////////////// 3417 // Utility routines. 3418 ////////////////////////////////////////////////////////////////////// 3419 3420 /** 3421 * Add a character to the data buffer. 3422 */ dataBufferAppend(char c)3423 private void dataBufferAppend(char c) 3424 { 3425 // Expand buffer if necessary. 3426 if (dataBufferPos >= dataBuffer.length) 3427 { 3428 dataBuffer = (char[]) extendArray(dataBuffer, 3429 dataBuffer.length, dataBufferPos); 3430 } 3431 dataBuffer[dataBufferPos++] = c; 3432 } 3433 3434 /** 3435 * Add a string to the data buffer. 3436 */ dataBufferAppend(String s)3437 private void dataBufferAppend(String s) 3438 { 3439 dataBufferAppend(s.toCharArray(), 0, s.length()); 3440 } 3441 3442 /** 3443 * Append (part of) a character array to the data buffer. 3444 */ dataBufferAppend(char[] ch, int start, int length)3445 private void dataBufferAppend(char[] ch, int start, int length) 3446 { 3447 dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length, 3448 dataBufferPos + length); 3449 3450 System.arraycopy(ch, start, dataBuffer, dataBufferPos, length); 3451 dataBufferPos += length; 3452 } 3453 3454 /** 3455 * Normalise space characters in the data buffer. 3456 */ dataBufferNormalize()3457 private void dataBufferNormalize() 3458 { 3459 int i = 0; 3460 int j = 0; 3461 int end = dataBufferPos; 3462 3463 // Skip spaces at the start. 3464 while (j < end && dataBuffer[j] == ' ') 3465 { 3466 j++; 3467 } 3468 3469 // Skip whitespace at the end. 
3470 while (end > j && dataBuffer[end - 1] == ' ') 3471 { 3472 end --; 3473 } 3474 3475 // Start copying to the left. 3476 while (j < end) 3477 { 3478 3479 char c = dataBuffer[j++]; 3480 3481 // Normalise all other spaces to 3482 // a single space. 3483 if (c == ' ') 3484 { 3485 while (j < end && dataBuffer[j++] == ' ') 3486 { 3487 continue; 3488 } 3489 dataBuffer[i++] = ' '; 3490 dataBuffer[i++] = dataBuffer[j - 1]; 3491 } 3492 else 3493 { 3494 dataBuffer[i++] = c; 3495 } 3496 } 3497 3498 // The new length is <= the old one. 3499 dataBufferPos = i; 3500 } 3501 3502 /** 3503 * Convert the data buffer to a string. 3504 */ dataBufferToString()3505 private String dataBufferToString() 3506 { 3507 String s = new String(dataBuffer, 0, dataBufferPos); 3508 dataBufferPos = 0; 3509 return s; 3510 } 3511 3512 /** 3513 * Flush the contents of the data buffer to the handler, as 3514 * appropriate, and reset the buffer for new input. 3515 */ dataBufferFlush()3516 private void dataBufferFlush() 3517 throws SAXException 3518 { 3519 if (currentElementContent == CONTENT_ELEMENTS 3520 && dataBufferPos > 0 3521 && !inCDATA) 3522 { 3523 // We can't just trust the buffer to be whitespace, there 3524 // are (error) cases when it isn't 3525 for (int i = 0; i < dataBufferPos; i++) 3526 { 3527 if (!isWhitespace(dataBuffer[i])) 3528 { 3529 handler.charData(dataBuffer, 0, dataBufferPos); 3530 dataBufferPos = 0; 3531 } 3532 } 3533 if (dataBufferPos > 0) 3534 { 3535 handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos); 3536 dataBufferPos = 0; 3537 } 3538 } 3539 else if (dataBufferPos > 0) 3540 { 3541 handler.charData(dataBuffer, 0, dataBufferPos); 3542 dataBufferPos = 0; 3543 } 3544 } 3545 3546 /** 3547 * Require a string to appear, or throw an exception. 3548 * <p><em>Precondition:</em> Entity expansion is not required. 3549 * <p><em>Precondition:</em> data buffer has no characters that 3550 * will get sent to the application. 
3551 */ require(String delim)3552 private void require(String delim) 3553 throws SAXException, IOException 3554 { 3555 int length = delim.length(); 3556 char[] ch; 3557 3558 if (length < dataBuffer.length) 3559 { 3560 ch = dataBuffer; 3561 delim.getChars(0, length, ch, 0); 3562 } 3563 else 3564 { 3565 ch = delim.toCharArray(); 3566 } 3567 3568 if (USE_CHEATS && length <= (readBufferLength - readBufferPos)) 3569 { 3570 int offset = readBufferPos; 3571 3572 for (int i = 0; i < length; i++, offset++) 3573 { 3574 if (ch[i] != readBuffer[offset]) 3575 { 3576 error ("required string", null, delim); 3577 } 3578 } 3579 readBufferPos = offset; 3580 3581 } 3582 else 3583 { 3584 for (int i = 0; i < length; i++) 3585 { 3586 require(ch[i]); 3587 } 3588 } 3589 } 3590 3591 /** 3592 * Require a character to appear, or throw an exception. 3593 */ require(char delim)3594 private void require(char delim) 3595 throws SAXException, IOException 3596 { 3597 char c = readCh(); 3598 3599 if (c != delim) 3600 { 3601 error("required character", c, Character.toString(delim)); 3602 } 3603 } 3604 3605 /** 3606 * Create an interned string from a character array. 3607 * Ælfred uses this method to create an interned version 3608 * of all names and name tokens, so that it can test equality 3609 * with <code>==</code> instead of <code>String.equals ()</code>. 3610 * 3611 * <p>This is much more efficient than constructing a non-interned 3612 * string first, and then interning it. 3613 * 3614 * @param ch an array of characters for building the string. 3615 * @param start the starting position in the array. 3616 * @param length the number of characters to place in the string. 3617 * @return an interned string. 3618 * @see #intern (String) 3619 * @see java.lang.String#intern 3620 */ intern(char[] ch, int start, int length)3621 public String intern(char[] ch, int start, int length) 3622 { 3623 int index = 0; 3624 int hash = 0; 3625 Object[] bucket; 3626 3627 // Generate a hash code. 
This is a widely used string hash, 3628 // often attributed to Brian Kernighan. 3629 for (int i = start; i < start + length; i++) 3630 { 3631 hash = 31 * hash + ch[i]; 3632 } 3633 hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH; 3634 3635 // Get the bucket -- consists of {array,String} pairs 3636 if ((bucket = symbolTable[hash]) == null) 3637 { 3638 // first string in this bucket 3639 bucket = new Object[8]; 3640 3641 // Search for a matching tuple, and 3642 // return the string if we find one. 3643 } 3644 else 3645 { 3646 while (index < bucket.length) 3647 { 3648 char[] chFound = (char[]) bucket[index]; 3649 3650 // Stop when we hit an empty entry. 3651 if (chFound == null) 3652 { 3653 break; 3654 } 3655 3656 // If they're the same length, check for a match. 3657 if (chFound.length == length) 3658 { 3659 for (int i = 0; i < chFound.length; i++) 3660 { 3661 // continue search on failure 3662 if (ch[start + i] != chFound[i]) 3663 { 3664 break; 3665 } 3666 else if (i == length - 1) 3667 { 3668 // That's it, we have a match! 3669 return (String) bucket[index + 1]; 3670 } 3671 } 3672 } 3673 index += 2; 3674 } 3675 // Not found -- we'll have to add it. 3676 3677 // Do we have to grow the bucket? 3678 bucket = (Object[]) extendArray(bucket, bucket.length, index); 3679 } 3680 symbolTable[hash] = bucket; 3681 3682 // OK, add it to the end of the bucket -- "local" interning. 3683 // Intern "globally" to let applications share interning benefits. 3684 // That is, "!=" and "==" work on our strings, not just equals(). 3685 String s = new String(ch, start, length).intern(); 3686 bucket[index] = s.toCharArray(); 3687 bucket[index + 1] = s; 3688 return s; 3689 } 3690 3691 /** 3692 * Ensure the capacity of an array, allocating a new one if 3693 * necessary. Usually extends only for name hash collisions. 
3694 */ extendArray(Object array, int currentSize, int requiredSize)3695 private Object extendArray(Object array, int currentSize, int requiredSize) 3696 { 3697 if (requiredSize < currentSize) 3698 { 3699 return array; 3700 } 3701 else 3702 { 3703 Object newArray = null; 3704 int newSize = currentSize * 2; 3705 3706 if (newSize <= requiredSize) 3707 { 3708 newSize = requiredSize + 1; 3709 } 3710 3711 if (array instanceof char[]) 3712 { 3713 newArray = new char[newSize]; 3714 } 3715 else if (array instanceof Object[]) 3716 { 3717 newArray = new Object[newSize]; 3718 } 3719 else 3720 { 3721 throw new RuntimeException(); 3722 } 3723 3724 System.arraycopy(array, 0, newArray, 0, currentSize); 3725 return newArray; 3726 } 3727 } 3728 3729 ////////////////////////////////////////////////////////////////////// 3730 // XML query routines. 3731 ////////////////////////////////////////////////////////////////////// 3732 isStandalone()3733 boolean isStandalone() 3734 { 3735 return docIsStandalone; 3736 } 3737 3738 // 3739 // Elements 3740 // 3741 getContentType(ElementDecl element, int defaultType)3742 private int getContentType(ElementDecl element, int defaultType) 3743 { 3744 int retval; 3745 3746 if (element == null) 3747 { 3748 return defaultType; 3749 } 3750 retval = element.contentType; 3751 if (retval == CONTENT_UNDECLARED) 3752 { 3753 retval = defaultType; 3754 } 3755 return retval; 3756 } 3757 3758 /** 3759 * Look up the content type of an element. 3760 * @param name The element type name. 3761 * @return An integer constant representing the content type. 3762 * @see #CONTENT_UNDECLARED 3763 * @see #CONTENT_ANY 3764 * @see #CONTENT_EMPTY 3765 * @see #CONTENT_MIXED 3766 * @see #CONTENT_ELEMENTS 3767 */ getElementContentType(String name)3768 public int getElementContentType(String name) 3769 { 3770 ElementDecl element = (ElementDecl) elementInfo.get(name); 3771 return getContentType(element, CONTENT_UNDECLARED); 3772 } 3773 3774 /** 3775 * Register an element. 
3776 * Array format: 3777 * [0] element type name 3778 * [1] content model (mixed, elements only) 3779 * [2] attribute hash table 3780 */ setElement(String name, int contentType, String contentModel, HashMap attributes)3781 private void setElement(String name, int contentType, 3782 String contentModel, HashMap attributes) 3783 throws SAXException 3784 { 3785 if (skippedPE) 3786 { 3787 return; 3788 } 3789 3790 ElementDecl element = (ElementDecl) elementInfo.get(name); 3791 3792 // first <!ELEMENT ...> or <!ATTLIST ...> for this type? 3793 if (element == null) 3794 { 3795 element = new ElementDecl(); 3796 element.contentType = contentType; 3797 element.contentModel = contentModel; 3798 element.attributes = attributes; 3799 elementInfo.put(name, element); 3800 return; 3801 } 3802 3803 // <!ELEMENT ...> declaration? 3804 if (contentType != CONTENT_UNDECLARED) 3805 { 3806 // ... following an associated <!ATTLIST ...> 3807 if (element.contentType == CONTENT_UNDECLARED) 3808 { 3809 element.contentType = contentType; 3810 element.contentModel = contentModel; 3811 } 3812 else 3813 { 3814 // VC: Unique Element Type Declaration 3815 handler.verror("multiple declarations for element type: " 3816 + name); 3817 } 3818 } 3819 3820 // first <!ATTLIST ...>, before <!ELEMENT ...> ? 3821 else if (attributes != null) 3822 { 3823 element.attributes = attributes; 3824 } 3825 } 3826 3827 /** 3828 * Look up the attribute hash table for an element. 3829 * The hash table is the second item in the element array. 3830 */ getElementAttributes(String name)3831 private HashMap getElementAttributes(String name) 3832 { 3833 ElementDecl element = (ElementDecl) elementInfo.get(name); 3834 return (element == null) ? null : element.attributes; 3835 } 3836 3837 // 3838 // Attributes 3839 // 3840 3841 /** 3842 * Get the declared attributes for an element type. 3843 * @param elname The name of the element type. 
3844 * @return An iterator over all the attributes declared for 3845 * a specific element type. The results will be valid only 3846 * after the DTD (if any) has been parsed. 3847 * @see #getAttributeType 3848 * @see #getAttributeEnumeration 3849 * @see #getAttributeDefaultValueType 3850 * @see #getAttributeDefaultValue 3851 * @see #getAttributeExpandedValue 3852 */ declaredAttributes(ElementDecl element)3853 private Iterator declaredAttributes(ElementDecl element) 3854 { 3855 HashMap attlist; 3856 3857 if (element == null) 3858 { 3859 return null; 3860 } 3861 if ((attlist = element.attributes) == null) 3862 { 3863 return null; 3864 } 3865 return attlist.keySet().iterator(); 3866 } 3867 3868 /** 3869 * Get the declared attributes for an element type. 3870 * @param elname The name of the element type. 3871 * @return An iterator over all the attributes declared for 3872 * a specific element type. The results will be valid only 3873 * after the DTD (if any) has been parsed. 3874 * @see #getAttributeType 3875 * @see #getAttributeEnumeration 3876 * @see #getAttributeDefaultValueType 3877 * @see #getAttributeDefaultValue 3878 * @see #getAttributeExpandedValue 3879 */ declaredAttributes(String elname)3880 public Iterator declaredAttributes(String elname) 3881 { 3882 return declaredAttributes((ElementDecl) elementInfo.get(elname)); 3883 } 3884 3885 /** 3886 * Retrieve the declared type of an attribute. 3887 * @param name The name of the associated element. 3888 * @param aname The name of the attribute. 3889 * @return An interend string denoting the type, or null 3890 * indicating an undeclared attribute. 3891 */ getAttributeType(String name, String aname)3892 public String getAttributeType(String name, String aname) 3893 { 3894 AttributeDecl attribute = getAttribute(name, aname); 3895 return (attribute == null) ? null : attribute.type; 3896 } 3897 3898 /** 3899 * Retrieve the allowed values for an enumerated attribute type. 
3900 * @param name The name of the associated element. 3901 * @param aname The name of the attribute. 3902 * @return A string containing the token list. 3903 */ getAttributeEnumeration(String name, String aname)3904 public String getAttributeEnumeration(String name, String aname) 3905 { 3906 AttributeDecl attribute = getAttribute(name, aname); 3907 // assert: attribute.enumeration is "ENUMERATION" or "NOTATION" 3908 return (attribute == null) ? null : attribute.enumeration; 3909 } 3910 3911 /** 3912 * Retrieve the default value of a declared attribute. 3913 * @param name The name of the associated element. 3914 * @param aname The name of the attribute. 3915 * @return The default value, or null if the attribute was 3916 * #IMPLIED or simply undeclared and unspecified. 3917 * @see #getAttributeExpandedValue 3918 */ getAttributeDefaultValue(String name, String aname)3919 public String getAttributeDefaultValue(String name, String aname) 3920 { 3921 AttributeDecl attribute = getAttribute(name, aname); 3922 return (attribute == null) ? null : attribute.value; 3923 } 3924 3925 /* 3926 3927 // FIXME: Leaving this in, until W3C finally resolves the confusion 3928 // between parts of the XML 2nd REC about when entity declararations 3929 // are guaranteed to be known. Current code matches what section 5.1 3930 // (conformance) describes, but some readings of the self-contradicting 3931 // text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that 3932 // attribute expansion/normalization must be deferred in some cases 3933 // (just TRY to identify them!). 3934 3935 * Retrieve the expanded value of a declared attribute. 3936 * <p>General entities (and char refs) will be expanded (once). 3937 * @param name The name of the associated element. 3938 * @param aname The name of the attribute. 
3939 * @return The expanded default value, or null if the attribute was 3940 * #IMPLIED or simply undeclared 3941 * @see #getAttributeDefaultValue 3942 public String getAttributeExpandedValue (String name, String aname) 3943 throws Exception 3944 { 3945 AttributeDecl attribute = getAttribute (name, aname); 3946 3947 if (attribute == null) { 3948 return null; 3949 } else if (attribute.defaultValue == null && attribute.value != null) { 3950 // we MUST use the same buf for both quotes else the literal 3951 // can't be properly terminated 3952 char buf [] = new char [1]; 3953 int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE; 3954 String type = getAttributeType (name, aname); 3955 3956 if (type != "CDATA" && type != null) 3957 flags |= LIT_NORMALIZE; 3958 buf [0] = '"'; 3959 pushCharArray (null, buf, 0, 1); 3960 pushString (null, attribute.value); 3961 pushCharArray (null, buf, 0, 1); 3962 attribute.defaultValue = readLiteral (flags); 3963 } 3964 return attribute.defaultValue; 3965 } 3966 */ 3967 3968 /** 3969 * Retrieve the default value mode of a declared attribute. 3970 * @see #ATTRIBUTE_DEFAULT_SPECIFIED 3971 * @see #ATTRIBUTE_DEFAULT_IMPLIED 3972 * @see #ATTRIBUTE_DEFAULT_REQUIRED 3973 * @see #ATTRIBUTE_DEFAULT_FIXED 3974 */ getAttributeDefaultValueType(String name, String aname)3975 public int getAttributeDefaultValueType(String name, String aname) 3976 { 3977 AttributeDecl attribute = getAttribute(name, aname); 3978 return (attribute == null) ? ATTRIBUTE_DEFAULT_UNDECLARED : 3979 attribute.valueType; 3980 } 3981 3982 /** 3983 * Register an attribute declaration for later retrieval. 
3984 * Format: 3985 * - String type 3986 * - String default value 3987 * - int value type 3988 * - enumeration 3989 * - processed default value 3990 */ setAttribute(String elName, String name, String type, String enumeration, String value, int valueType)3991 private void setAttribute(String elName, String name, String type, 3992 String enumeration, String value, int valueType) 3993 throws Exception 3994 { 3995 HashMap attlist; 3996 3997 if (skippedPE) 3998 { 3999 return; 4000 } 4001 4002 // Create a new hashtable if necessary. 4003 attlist = getElementAttributes(elName); 4004 if (attlist == null) 4005 { 4006 attlist = new HashMap(); 4007 } 4008 4009 // ignore multiple attribute declarations! 4010 if (attlist.get(name) != null) 4011 { 4012 // warn ... 4013 return; 4014 } 4015 else 4016 { 4017 AttributeDecl attribute = new AttributeDecl(); 4018 attribute.type = type; 4019 attribute.value = value; 4020 attribute.valueType = valueType; 4021 attribute.enumeration = enumeration; 4022 attlist.put(name, attribute); 4023 4024 // save; but don't overwrite any existing <!ELEMENT ...> 4025 setElement(elName, CONTENT_UNDECLARED, null, attlist); 4026 } 4027 } 4028 4029 /** 4030 * Retrieve the attribute declaration for the given element name and name. 4031 */ getAttribute(String elName, String name)4032 private AttributeDecl getAttribute(String elName, String name) 4033 { 4034 HashMap attlist = getElementAttributes(elName); 4035 return (attlist == null) ? null : (AttributeDecl) attlist.get(name); 4036 } 4037 4038 // 4039 // Entities 4040 // 4041 4042 /** 4043 * Find the type of an entity. 4044 * @returns An integer constant representing the entity type. 4045 * @see #ENTITY_UNDECLARED 4046 * @see #ENTITY_INTERNAL 4047 * @see #ENTITY_NDATA 4048 * @see #ENTITY_TEXT 4049 */ getEntityType(String ename)4050 public int getEntityType(String ename) 4051 { 4052 EntityInfo entity = (EntityInfo) entityInfo.get(ename); 4053 return (entity == null) ? 
ENTITY_UNDECLARED : entity.type; 4054 } 4055 4056 /** 4057 * Return an external entity's identifiers. 4058 * @param ename The name of the external entity. 4059 * @return The entity's public identifier, system identifier, and base URI. 4060 * Null if the entity was not declared as an external entity. 4061 * @see #getEntityType 4062 */ getEntityIds(String ename)4063 public ExternalIdentifiers getEntityIds(String ename) 4064 { 4065 EntityInfo entity = (EntityInfo) entityInfo.get(ename); 4066 return (entity == null) ? null : entity.ids; 4067 } 4068 4069 /** 4070 * Return an internal entity's replacement text. 4071 * @param ename The name of the internal entity. 4072 * @return The entity's replacement text, or null if 4073 * the entity was not declared as an internal entity. 4074 * @see #getEntityType 4075 */ getEntityValue(String ename)4076 public String getEntityValue(String ename) 4077 { 4078 EntityInfo entity = (EntityInfo) entityInfo.get(ename); 4079 return (entity == null) ? null : entity.value; 4080 } 4081 4082 /** 4083 * Register an entity declaration for later retrieval. 4084 */ setInternalEntity(String eName, String value)4085 private void setInternalEntity(String eName, String value) 4086 throws SAXException 4087 { 4088 if (skippedPE) 4089 { 4090 return; 4091 } 4092 4093 if (entityInfo.get(eName) == null) 4094 { 4095 EntityInfo entity = new EntityInfo(); 4096 entity.type = ENTITY_INTERNAL; 4097 entity.value = value; 4098 entityInfo.put(eName, entity); 4099 } 4100 if (handler.stringInterning) 4101 { 4102 if ("lt" == eName || "gt" == eName || "quot" == eName 4103 || "apos" == eName || "amp" == eName) 4104 { 4105 return; 4106 } 4107 } 4108 else 4109 { 4110 if ("lt".equals(eName) || "gt".equals(eName) || "quot".equals(eName) 4111 || "apos".equals(eName) || "amp".equals(eName)) 4112 { 4113 return; 4114 } 4115 } 4116 handler.getDeclHandler().internalEntityDecl(eName, value); 4117 } 4118 4119 /** 4120 * Register an external entity declaration for later retrieval. 
4121 */ setExternalEntity(String eName, int eClass, ExternalIdentifiers ids, String nName)4122 private void setExternalEntity(String eName, int eClass, 4123 ExternalIdentifiers ids, String nName) 4124 { 4125 if (entityInfo.get(eName) == null) 4126 { 4127 EntityInfo entity = new EntityInfo(); 4128 entity.type = eClass; 4129 entity.ids = ids; 4130 entity.notationName = nName; 4131 entityInfo.put(eName, entity); 4132 } 4133 } 4134 4135 // 4136 // Notations. 4137 // 4138 4139 /** 4140 * Report a notation declaration, checking for duplicates. 4141 */ setNotation(String nname, ExternalIdentifiers ids)4142 private void setNotation(String nname, ExternalIdentifiers ids) 4143 throws SAXException 4144 { 4145 if (skippedPE) 4146 { 4147 return; 4148 } 4149 4150 handler.notationDecl(nname, ids.publicId, ids.systemId, ids.baseUri); 4151 if (notationInfo.get(nname) == null) 4152 { 4153 notationInfo.put(nname, nname); 4154 } 4155 else 4156 { 4157 // VC: Unique Notation Name 4158 handler.verror("Duplicate notation name decl: " + nname); 4159 } 4160 } 4161 4162 // 4163 // Location. 4164 // 4165 4166 /** 4167 * Return the current line number. 4168 */ getLineNumber()4169 public int getLineNumber() 4170 { 4171 return line; 4172 } 4173 4174 /** 4175 * Return the current column number. 4176 */ getColumnNumber()4177 public int getColumnNumber() 4178 { 4179 return column; 4180 } 4181 4182 ////////////////////////////////////////////////////////////////////// 4183 // High-level I/O. 4184 ////////////////////////////////////////////////////////////////////// 4185 4186 /** 4187 * Read a single character from the readBuffer. 4188 * <p>The readDataChunk () method maintains the buffer. 4189 * <p>If we hit the end of an entity, try to pop the stack and 4190 * keep going. 4191 * <p> (This approach doesn't really enforce XML's rules about 4192 * entity boundaries, but this is not currently a validating 4193 * parser). 
* <p>This routine also attempts to keep track of the current
   * position in external entities, but it's not entirely accurate.
   * <p>When parameter-entity expansion is enabled (expandPE) a '%'
   * is not returned: the reference is parsed and reading continues
   * with the following character.
   * @return The next character from the current input source.
   * @exception java.io.EOFException
   *    Propagated from popInput () when the input stack is empty.
   * @see #unread (char)
   * @see #readDataChunk
   * @see #readBuffer
   * @see #line
   */
  private char readCh()
    throws SAXException, IOException
  {
    // As long as there's nothing in the
    // read buffer, try reading more data
    // (for an external entity) or popping
    // the entity stack (for either).
    while (readBufferPos >= readBufferLength)
      {
        switch (sourceType)
          {
          case INPUT_READER:
          case INPUT_STREAM:
            readDataChunk();
            // Still nothing: this entity is exhausted, so fall back to
            // the enclosing input (popInput throws EOFException once
            // the stack is empty, which ends this loop).
            while (readBufferLength < 1)
              {
                popInput();
                if (readBufferLength < 1)
                  {
                    readDataChunk();
                  }
              }
            break;

          default:

            // Internal (in-memory) input cannot be refilled.
            popInput();
            break;
          }
      }

    char c = readBuffer[readBufferPos++];

    // Position tracking: a newline starts a new line, anything else
    // that is returned advances the column.
    if (c == '\n')
      {
        line++;
        column = 0;
      }
    else
      {
        if (c == '<')
          {
            /* the most common return to parseContent () ... NOP */
          }
        // Control characters other than tab/CR (newline was handled
        // above), anything above U+FFFD, and -- for XML 1.1 only --
        // U+007F..U+009F except U+0085 are rejected.
        else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
                 || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
                     && xmlVersion == XML_11))
          {
            error("illegal XML character U+" + Integer.toHexString(c));
          }

        // If we're in the DTD and in a context where PEs get expanded,
        // do so ... 1/14/2000 errata identify those contexts.  There
        // are also spots in the internal subset where PE refs are fatal
        // errors, hence yet another flag.
        else if (c == '%' && expandPE)
          {
            if (peIsError)
              {
                error("PE reference within decl in internal subset.");
              }
            parsePEReference();
            // The '%' itself is consumed without touching the column.
            return readCh();
          }
        column++;
      }

    return c;
  }

  /**
   * Push a single character back onto the current input stream.
   * <p>This method usually pushes the character back onto
   * the readBuffer.
   * <p>I don't think that this would ever be called with
   * readBufferPos = 0, because the methods always reads a character
   * before unreading it, but just in case, I've added a boundary
   * condition.
   * @param c The character to push back.
   * @see #readCh
   * @see #unread (char[])
   * @see #readBuffer
   */
  private void unread(char c)
    throws SAXException
  {
    // Undo the line accounting done by readCh (); the column on the
    // previous line is not known, so it is set to -1.
    if (c == '\n')
      {
        line--;
        column = -1;
      }
    // Normal condition: step back inside the current buffer.
    if (readBufferPos > 0)
      {
        readBuffer[--readBufferPos] = c;
      }
    // Boundary condition: nothing to step back over, so push a new
    // one-character input instead.
    else
      {
        pushString(null, Character.toString(c));
      }
  }

  /**
   * Push a char array back onto the current input stream.
   * <p>NOTE: you must <em>never</em> push back characters that you
   * haven't actually read: use pushString () instead.
   * @param ch The characters that were read.
   * @param length The number of characters of ch to push back.
   * @see #readCh
   * @see #unread (char)
   * @see #readBuffer
   * @see #pushString
   */
  private void unread(char[] ch, int length)
    throws SAXException
  {
    // Undo the line accounting for every newline being returned.
    for (int i = 0; i < length; i++)
      {
        if (ch[i] == '\n')
          {
            line--;
            column = -1;
          }
      }
    // Rewinding only moves the read position; the contents of ch are
    // not copied back, hence the NOTE above.
    if (length < readBufferPos)
      {
        readBufferPos -= length;
      }
    // Otherwise the characters become a new internal input.
    else
      {
        pushCharArray(null, ch, 0, length);
      }
  }

  /**
   * Push, or skip, a new external input source.
   * The source will be some kind of parsed entity, such as a PE
   * (including the external DTD subset) or content for the body.
4339 * 4340 * @param url The java.net.URL object for the entity. 4341 * @see SAXDriver#resolveEntity 4342 * @see #pushString 4343 * @see #sourceType 4344 * @see #pushInput 4345 * @see #detectEncoding 4346 * @see #sourceType 4347 * @see #readBuffer 4348 */ pushURL(boolean isPE, String ename, ExternalIdentifiers ids, Reader reader, InputStream stream, String encoding, boolean doResolve)4349 private void pushURL(boolean isPE, 4350 String ename, 4351 ExternalIdentifiers ids, 4352 Reader reader, 4353 InputStream stream, 4354 String encoding, 4355 boolean doResolve) 4356 throws SAXException, IOException 4357 { 4358 boolean ignoreEncoding; 4359 String systemId; 4360 InputSource source; 4361 4362 if (!isPE) 4363 { 4364 dataBufferFlush(); 4365 } 4366 4367 scratch.setPublicId(ids.publicId); 4368 scratch.setSystemId(ids.systemId); 4369 4370 // See if we should skip or substitute the entity. 4371 // If we're not skipping, resolving reports startEntity() 4372 // and updates the (handler's) stack of URIs. 4373 if (doResolve) 4374 { 4375 // assert (stream == null && reader == null && encoding == null) 4376 source = handler.resolveEntity(isPE, ename, scratch, ids.baseUri); 4377 if (source == null) 4378 { 4379 handler.warn("skipping entity: " + ename); 4380 handler.skippedEntity(ename); 4381 if (isPE) 4382 { 4383 skippedPE = true; 4384 } 4385 return; 4386 } 4387 4388 // we might be using alternate IDs/encoding 4389 systemId = source.getSystemId(); 4390 // The following warning and setting systemId was deleted bcause 4391 // the application has the option of not setting systemId 4392 // provided that it has set the characte/byte stream. 
4393 /* 4394 if (systemId == null) { 4395 handler.warn ("missing system ID, using " + ids.systemId); 4396 systemId = ids.systemId; 4397 } 4398 */ 4399 } 4400 else 4401 { 4402 // "[document]", or "[dtd]" via getExternalSubset() 4403 scratch.setCharacterStream(reader); 4404 scratch.setByteStream(stream); 4405 scratch.setEncoding(encoding); 4406 source = scratch; 4407 systemId = ids.systemId; 4408 if (handler.stringInterning) 4409 { 4410 handler.startExternalEntity(ename, systemId, 4411 "[document]" == ename); 4412 } 4413 else 4414 { 4415 handler.startExternalEntity(ename, systemId, 4416 "[document]".equals(ename)); 4417 } 4418 } 4419 4420 // we may have been given I/O streams directly 4421 if (source.getCharacterStream() != null) 4422 { 4423 if (source.getByteStream() != null) 4424 error("InputSource has two streams!"); 4425 reader = source.getCharacterStream(); 4426 } 4427 else if (source.getByteStream() != null) 4428 { 4429 encoding = source.getEncoding(); 4430 if (encoding == null) 4431 { 4432 stream = source.getByteStream(); 4433 } 4434 else 4435 { 4436 try 4437 { 4438 reader = new InputStreamReader(source.getByteStream(), 4439 encoding); 4440 } 4441 catch (IOException e) 4442 { 4443 stream = source.getByteStream(); 4444 } 4445 } 4446 } 4447 else if (systemId == null) 4448 { 4449 error("InputSource has no URI!"); 4450 } 4451 scratch.setCharacterStream(null); 4452 scratch.setByteStream(null); 4453 scratch.setEncoding(null); 4454 4455 // Push the existing status. 4456 pushInput(ename); 4457 4458 // Create a new read buffer. 4459 // (Note the four-character margin) 4460 readBuffer = new char[READ_BUFFER_MAX + 4]; 4461 readBufferPos = 0; 4462 readBufferLength = 0; 4463 readBufferOverflow = -1; 4464 is = null; 4465 line = 1; 4466 column = 0; 4467 currentByteCount = 0; 4468 4469 // If there's an explicit character stream, just 4470 // ignore encoding declarations. 
4471 if (reader != null) 4472 { 4473 sourceType = INPUT_READER; 4474 this.reader = reader; 4475 tryEncodingDecl(true); 4476 return; 4477 } 4478 4479 // Else we handle the conversion, and need to ensure 4480 // it's done right. 4481 sourceType = INPUT_STREAM; 4482 if (stream != null) 4483 { 4484 is = stream; 4485 } 4486 else 4487 { 4488 // We have to open our own stream to the URL. 4489 URL url = new URL(systemId); 4490 4491 externalEntity = url.openConnection(); 4492 externalEntity.connect(); 4493 is = externalEntity.getInputStream(); 4494 } 4495 4496 // If we get to here, there must be 4497 // an InputStream available. 4498 if (!is.markSupported()) 4499 { 4500 is = new BufferedInputStream(is); 4501 } 4502 4503 // Get any external encoding label. 4504 if (encoding == null && externalEntity != null) 4505 { 4506 // External labels can be untrustworthy; filesystems in 4507 // particular often have the wrong default for content 4508 // that wasn't locally originated. Those we autodetect. 4509 if (!"file".equals(externalEntity.getURL().getProtocol())) 4510 { 4511 int temp; 4512 4513 // application/xml;charset=something;otherAttr=... 4514 // ... with many variants on 'something' 4515 encoding = externalEntity.getContentType(); 4516 4517 // MHK code (fix for Saxon 5.5.1/007): 4518 // protect against encoding==null 4519 if (encoding == null) 4520 { 4521 temp = -1; 4522 } 4523 else 4524 { 4525 temp = encoding.indexOf("charset"); 4526 } 4527 4528 // RFC 2376 sez MIME text defaults to ASCII, but since the 4529 // JDK will create a MIME type out of thin air, we always 4530 // autodetect when there's no explicit charset attribute. 
4531 if (temp < 0) 4532 { 4533 encoding = null; // autodetect 4534 } 4535 else 4536 { 4537 // only this one attribute 4538 if ((temp = encoding.indexOf(';')) > 0) 4539 { 4540 encoding = encoding.substring(0, temp); 4541 } 4542 4543 if ((temp = encoding.indexOf('=', temp + 7)) > 0) 4544 { 4545 encoding = encoding.substring(temp + 1); 4546 4547 // attributes can have comment fields (RFC 822) 4548 if ((temp = encoding.indexOf('(')) > 0) 4549 { 4550 encoding = encoding.substring(0, temp); 4551 } 4552 // ... and values may be quoted 4553 if ((temp = encoding.indexOf('"')) > 0) 4554 { 4555 encoding = 4556 encoding.substring(temp + 1, 4557 encoding.indexOf('"', temp + 2)); 4558 } 4559 encoding = encoding.trim(); 4560 } 4561 else 4562 { 4563 handler.warn("ignoring illegal MIME attribute: " 4564 + encoding); 4565 encoding = null; 4566 } 4567 } 4568 } 4569 } 4570 4571 // if we got an external encoding label, use it ... 4572 if (encoding != null) 4573 { 4574 this.encoding = ENCODING_EXTERNAL; 4575 setupDecoding(encoding); 4576 ignoreEncoding = true; 4577 4578 // ... else autodetect from first bytes. 4579 } 4580 else 4581 { 4582 detectEncoding(); 4583 ignoreEncoding = false; 4584 } 4585 4586 // Read any XML or text declaration. 4587 // If we autodetected, it may tell us the "real" encoding. 
4588 try 4589 { 4590 tryEncodingDecl(ignoreEncoding); 4591 } 4592 catch (UnsupportedEncodingException x) 4593 { 4594 encoding = x.getMessage(); 4595 4596 // if we don't handle the declared encoding, 4597 // try letting a JVM InputStreamReader do it 4598 try 4599 { 4600 if (sourceType != INPUT_STREAM) 4601 { 4602 throw x; 4603 } 4604 4605 is.reset(); 4606 readBufferPos = 0; 4607 readBufferLength = 0; 4608 readBufferOverflow = -1; 4609 line = 1; 4610 currentByteCount = column = 0; 4611 4612 sourceType = INPUT_READER; 4613 this.reader = new InputStreamReader(is, encoding); 4614 is = null; 4615 4616 tryEncodingDecl(true); 4617 4618 } 4619 catch (IOException e) 4620 { 4621 error("unsupported text encoding", 4622 encoding, 4623 null); 4624 } 4625 } 4626 } 4627 4628 /** 4629 * Check for an encoding declaration. This is the second part of the 4630 * XML encoding autodetection algorithm, relying on detectEncoding to 4631 * get to the point that this part can read any encoding declaration 4632 * in the document (using only US-ASCII characters). 4633 * 4634 * <p> Because this part starts to fill parser buffers with this data, 4635 * it's tricky to setup a reader so that Java's built-in decoders can be 4636 * used for the character encodings that aren't built in to this parser 4637 * (such as EUC-JP, KOI8-R, Big5, etc). 4638 * 4639 * @return any encoding in the declaration, uppercased; or null 4640 * @see detectEncoding 4641 */ tryEncodingDecl(boolean ignoreEncoding)4642 private String tryEncodingDecl(boolean ignoreEncoding) 4643 throws SAXException, IOException 4644 { 4645 // Read the XML/text declaration. 
4646 if (tryRead("<?xml")) 4647 { 4648 if (tryWhitespace()) 4649 { 4650 if (inputStack.size() > 0) 4651 { 4652 return parseTextDecl(ignoreEncoding); 4653 } 4654 else 4655 { 4656 return parseXMLDecl(ignoreEncoding); 4657 } 4658 } 4659 else 4660 { 4661 // <?xml-stylesheet ...?> or similar 4662 unread('l'); 4663 unread('m'); 4664 unread('x'); 4665 unread('?'); 4666 unread('<'); 4667 } 4668 } 4669 return null; 4670 } 4671 4672 /** 4673 * Attempt to detect the encoding of an entity. 4674 * <p>The trick here (as suggested in the XML standard) is that 4675 * any entity not in UTF-8, or in UCS-2 with a byte-order mark, 4676 * <b>must</b> begin with an XML declaration or an encoding 4677 * declaration; we simply have to look for "<?xml" in various 4678 * encodings. 4679 * <p>This method has no way to distinguish among 8-bit encodings. 4680 * Instead, it sets up for UTF-8, then (possibly) revises its assumption 4681 * later in setupDecoding (). Any ASCII-derived 8-bit encoding 4682 * should work, but most will be rejected later by setupDecoding (). 4683 * @see #tryEncoding (byte[], byte, byte, byte, byte) 4684 * @see #tryEncoding (byte[], byte, byte) 4685 * @see #setupDecoding 4686 */ detectEncoding()4687 private void detectEncoding() 4688 throws SAXException, IOException 4689 { 4690 byte[] signature = new byte[4]; 4691 4692 // Read the first four bytes for 4693 // autodetection. 4694 is.mark(4); 4695 is.read(signature); 4696 is.reset(); 4697 4698 // 4699 // FIRST: four byte encodings (who uses these?) 
4700 // 4701 if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, 4702 (byte) 0x00, (byte) 0x3c)) 4703 { 4704 // UCS-4 must begin with "<?xml" 4705 // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234) 4706 // "UTF-32BE" 4707 encoding = ENCODING_UCS_4_1234; 4708 } 4709 else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, 4710 (byte) 0x00, (byte) 0x00)) 4711 { 4712 // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321) 4713 // "UTF-32LE" 4714 encoding = ENCODING_UCS_4_4321; 4715 } 4716 else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, 4717 (byte) 0x3c, (byte) 0x00)) 4718 { 4719 // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143) 4720 encoding = ENCODING_UCS_4_2143; 4721 } 4722 else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, 4723 (byte) 0x00, (byte) 0x00)) 4724 { 4725 // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421) 4726 encoding = ENCODING_UCS_4_3412; 4727 4728 // 00 00 fe ff UCS_4_1234 (with BOM) 4729 // ff fe 00 00 UCS_4_4321 (with BOM) 4730 } 4731 4732 // 4733 // SECOND: two byte encodings 4734 // note ... with 1/14/2000 errata the XML spec identifies some 4735 // more "broken UTF-16" autodetection cases, with no XML decl, 4736 // which we don't handle here (that's legal too). 4737 // 4738 else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff)) 4739 { 4740 // UCS-2 with a byte-order marker. (UTF-16) 4741 // 0xfe 0xff: UCS-2, big-endian (12) 4742 encoding = ENCODING_UCS_2_12; 4743 is.read(); is.read(); 4744 } 4745 else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe)) 4746 { 4747 // UCS-2 with a byte-order marker. 
(UTF-16) 4748 // 0xff 0xfe: UCS-2, little-endian (21) 4749 encoding = ENCODING_UCS_2_21; 4750 is.read(); is.read(); 4751 } 4752 else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, 4753 (byte) 0x00, (byte) 0x3f)) 4754 { 4755 // UTF-16BE (otherwise, malformed UTF-16) 4756 // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark 4757 encoding = ENCODING_UCS_2_12; 4758 error("no byte-order mark for UCS-2 entity"); 4759 } 4760 else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, 4761 (byte) 0x3f, (byte) 0x00)) 4762 { 4763 // UTF-16LE (otherwise, malformed UTF-16) 4764 // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark 4765 encoding = ENCODING_UCS_2_21; 4766 error("no byte-order mark for UCS-2 entity"); 4767 } 4768 4769 // 4770 // THIRD: ASCII-derived encodings, fixed and variable lengths 4771 // 4772 else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f, 4773 (byte) 0x78, (byte) 0x6d)) 4774 { 4775 // ASCII derived 4776 // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING) 4777 encoding = ENCODING_UTF_8; 4778 prefetchASCIIEncodingDecl(); 4779 } 4780 else if (signature[0] == (byte) 0xef 4781 && signature[1] == (byte) 0xbb 4782 && signature[2] == (byte) 0xbf) 4783 { 4784 // 0xef 0xbb 0xbf: UTF-8 BOM (not part of document text) 4785 // this un-needed notion slipped into XML 2nd ed through a 4786 // "non-normative" erratum; now required by MSFT and UDDI, 4787 // and E22 made it normative. 4788 encoding = ENCODING_UTF_8; 4789 is.read(); is.read(); is.read(); 4790 } 4791 else 4792 { 4793 // 4c 6f a7 94 ... we don't understand EBCDIC flavors 4794 // ... but we COULD at least kick in some fixed code page 4795 4796 // (default) UTF-8 without encoding/XML declaration 4797 encoding = ENCODING_UTF_8; 4798 } 4799 } 4800 4801 /** 4802 * Check for a four-byte signature. 4803 * <p>Utility routine for detectEncoding (). 4804 * <p>Always looks for some part of "<?XML" in a specific encoding. 4805 * @param sig The first four bytes read. 
4806 * @param b1 The first byte of the signature 4807 * @param b2 The second byte of the signature 4808 * @param b3 The third byte of the signature 4809 * @param b4 The fourth byte of the signature 4810 * @see #detectEncoding 4811 */ tryEncoding(byte[] sig, byte b1, byte b2, byte b3, byte b4)4812 private static boolean tryEncoding(byte[] sig, byte b1, byte b2, 4813 byte b3, byte b4) 4814 { 4815 return (sig[0] == b1 && sig[1] == b2 4816 && sig[2] == b3 && sig[3] == b4); 4817 } 4818 4819 /** 4820 * Check for a two-byte signature. 4821 * <p>Looks for a UCS-2 byte-order mark. 4822 * <p>Utility routine for detectEncoding (). 4823 * @param sig The first four bytes read. 4824 * @param b1 The first byte of the signature 4825 * @param b2 The second byte of the signature 4826 * @see #detectEncoding 4827 */ tryEncoding(byte[] sig, byte b1, byte b2)4828 private static boolean tryEncoding(byte[] sig, byte b1, byte b2) 4829 { 4830 return ((sig[0] == b1) && (sig[1] == b2)); 4831 } 4832 4833 /** 4834 * This method pushes a string back onto input. 4835 * <p>It is useful either as the expansion of an internal entity, 4836 * or for backtracking during the parse. 4837 * <p>Call pushCharArray () to do the actual work. 4838 * @param s The string to push back onto input. 4839 * @see #pushCharArray 4840 */ pushString(String ename, String s)4841 private void pushString(String ename, String s) 4842 throws SAXException 4843 { 4844 char[] ch = s.toCharArray(); 4845 pushCharArray(ename, ch, 0, ch.length); 4846 } 4847 4848 /** 4849 * Push a new internal input source. 4850 * <p>This method is useful for expanding an internal entity, 4851 * or for unreading a string of characters. It creates a new 4852 * readBuffer containing the characters in the array, instead 4853 * of characters converted from an input byte stream. 4854 * @param ch The char array to push. 
4855 * @see #pushString 4856 * @see #pushURL 4857 * @see #readBuffer 4858 * @see #sourceType 4859 * @see #pushInput 4860 */ pushCharArray(String ename, char[] ch, int start, int length)4861 private void pushCharArray(String ename, char[] ch, int start, int length) 4862 throws SAXException 4863 { 4864 // Push the existing status 4865 pushInput(ename); 4866 if (ename != null && doReport) 4867 { 4868 dataBufferFlush(); 4869 handler.startInternalEntity(ename); 4870 } 4871 sourceType = INPUT_INTERNAL; 4872 readBuffer = ch; 4873 readBufferPos = start; 4874 readBufferLength = length; 4875 readBufferOverflow = -1; 4876 } 4877 4878 /** 4879 * Save the current input source onto the stack. 4880 * <p>This method saves all of the global variables associated with 4881 * the current input source, so that they can be restored when a new 4882 * input source has finished. It also tests for entity recursion. 4883 * <p>The method saves the following global variables onto a stack 4884 * using a fixed-length array: 4885 * <ol> 4886 * <li>sourceType 4887 * <li>externalEntity 4888 * <li>readBuffer 4889 * <li>readBufferPos 4890 * <li>readBufferLength 4891 * <li>line 4892 * <li>encoding 4893 * </ol> 4894 * @param ename The name of the entity (if any) causing the new input. 4895 * @see #popInput 4896 * @see #sourceType 4897 * @see #externalEntity 4898 * @see #readBuffer 4899 * @see #readBufferPos 4900 * @see #readBufferLength 4901 * @see #line 4902 * @see #encoding 4903 */ pushInput(String ename)4904 private void pushInput(String ename) 4905 throws SAXException 4906 { 4907 // Check for entity recursion. 4908 if (ename != null) 4909 { 4910 Iterator entities = entityStack.iterator(); 4911 while (entities.hasNext()) 4912 { 4913 String e = (String) entities.next(); 4914 if (e != null && e == ename) 4915 { 4916 error("recursive reference to entity", ename, null); 4917 } 4918 } 4919 } 4920 entityStack.addLast(ename); 4921 4922 // Don't bother if there is no current input. 
4923 if (sourceType == INPUT_NONE) 4924 { 4925 return; 4926 } 4927 4928 // Set up a snapshot of the current 4929 // input source. 4930 Input input = new Input(); 4931 4932 input.sourceType = sourceType; 4933 input.externalEntity = externalEntity; 4934 input.readBuffer = readBuffer; 4935 input.readBufferPos = readBufferPos; 4936 input.readBufferLength = readBufferLength; 4937 input.line = line; 4938 input.encoding = encoding; 4939 input.readBufferOverflow = readBufferOverflow; 4940 input.is = is; 4941 input.currentByteCount = currentByteCount; 4942 input.column = column; 4943 input.reader = reader; 4944 4945 // Push it onto the stack. 4946 inputStack.addLast(input); 4947 } 4948 4949 /** 4950 * Restore a previous input source. 4951 * <p>This method restores all of the global variables associated with 4952 * the current input source. 4953 * @exception java.io.EOFException 4954 * If there are no more entries on the input stack. 4955 * @see #pushInput 4956 * @see #sourceType 4957 * @see #externalEntity 4958 * @see #readBuffer 4959 * @see #readBufferPos 4960 * @see #readBufferLength 4961 * @see #line 4962 * @see #encoding 4963 */ popInput()4964 private void popInput() 4965 throws SAXException, IOException 4966 { 4967 String ename = (String) entityStack.removeLast(); 4968 4969 if (ename != null && doReport) 4970 { 4971 dataBufferFlush(); 4972 } 4973 switch (sourceType) 4974 { 4975 case INPUT_STREAM: 4976 handler.endExternalEntity(ename); 4977 is.close(); 4978 break; 4979 case INPUT_READER: 4980 handler.endExternalEntity(ename); 4981 reader.close(); 4982 break; 4983 case INPUT_INTERNAL: 4984 if (ename != null && doReport) 4985 { 4986 handler.endInternalEntity(ename); 4987 } 4988 break; 4989 } 4990 4991 // Throw an EOFException if there 4992 // is nothing else to pop. 
4993 if (inputStack.isEmpty()) 4994 { 4995 throw new EOFException("no more input"); 4996 } 4997 4998 Input input = (Input) inputStack.removeLast(); 4999 5000 sourceType = input.sourceType; 5001 externalEntity = input.externalEntity; 5002 readBuffer = input.readBuffer; 5003 readBufferPos = input.readBufferPos; 5004 readBufferLength = input.readBufferLength; 5005 line = input.line; 5006 encoding = input.encoding; 5007 readBufferOverflow = input.readBufferOverflow; 5008 is = input.is; 5009 currentByteCount = input.currentByteCount; 5010 column = input.column; 5011 reader = input.reader; 5012 } 5013 5014 /** 5015 * Return true if we can read the expected character. 5016 * <p>Note that the character will be removed from the input stream 5017 * on success, but will be put back on failure. Do not attempt to 5018 * read the character again if the method succeeds. 5019 * @param delim The character that should appear next. For a 5020 * insensitive match, you must supply this in upper-case. 5021 * @return true if the character was successfully read, or false if 5022 * it was not. 5023 * @see #tryRead (String) 5024 */ tryRead(char delim)5025 private boolean tryRead(char delim) 5026 throws SAXException, IOException 5027 { 5028 char c; 5029 5030 // Read the character 5031 c = readCh(); 5032 5033 // Test for a match, and push the character 5034 // back if the match fails. 5035 if (c == delim) 5036 { 5037 return true; 5038 } 5039 else 5040 { 5041 unread(c); 5042 return false; 5043 } 5044 } 5045 5046 /** 5047 * Return true if we can read the expected string. 5048 * <p>This is simply a convenience method. 5049 * <p>Note that the string will be removed from the input stream 5050 * on success, but will be put back on failure. Do not attempt to 5051 * read the string again if the method succeeds. 5052 * <p>This method will push back a character rather than an 5053 * array whenever possible (probably the majority of cases). 5054 * @param delim The string that should appear next. 
5055 * @return true if the string was successfully read, or false if 5056 * it was not. 5057 * @see #tryRead (char) 5058 */ tryRead(String delim)5059 private boolean tryRead(String delim) 5060 throws SAXException, IOException 5061 { 5062 return tryRead(delim.toCharArray()); 5063 } 5064 tryRead(char[] ch)5065 private boolean tryRead(char[] ch) 5066 throws SAXException, IOException 5067 { 5068 char c; 5069 5070 // Compare the input, character- 5071 // by character. 5072 5073 for (int i = 0; i < ch.length; i++) 5074 { 5075 c = readCh(); 5076 if (c != ch[i]) 5077 { 5078 unread(c); 5079 if (i != 0) 5080 { 5081 unread(ch, i); 5082 } 5083 return false; 5084 } 5085 } 5086 return true; 5087 } 5088 5089 /** 5090 * Return true if we can read some whitespace. 5091 * <p>This is simply a convenience method. 5092 * <p>This method will push back a character rather than an 5093 * array whenever possible (probably the majority of cases). 5094 * @return true if whitespace was found. 5095 */ tryWhitespace()5096 private boolean tryWhitespace() 5097 throws SAXException, IOException 5098 { 5099 char c; 5100 c = readCh(); 5101 if (isWhitespace(c)) 5102 { 5103 skipWhitespace(); 5104 return true; 5105 } 5106 else 5107 { 5108 unread(c); 5109 return false; 5110 } 5111 } 5112 5113 /** 5114 * Read all data until we find the specified string. 5115 * This is useful for scanning CDATA sections and PIs. 5116 * <p>This is inefficient right now, since it calls tryRead () 5117 * for every character. 
5118 * @param delim The string delimiter 5119 * @see #tryRead (String, boolean) 5120 * @see #readCh 5121 */ parseUntil(String delim)5122 private void parseUntil(String delim) 5123 throws SAXException, IOException 5124 { 5125 parseUntil(delim.toCharArray()); 5126 } 5127 parseUntil(char[] delim)5128 private void parseUntil(char[] delim) 5129 throws SAXException, IOException 5130 { 5131 char c; 5132 int startLine = line; 5133 5134 try 5135 { 5136 while (!tryRead(delim)) 5137 { 5138 c = readCh(); 5139 dataBufferAppend(c); 5140 } 5141 } 5142 catch (EOFException e) 5143 { 5144 error("end of input while looking for delimiter " 5145 + "(started on line " + startLine 5146 + ')', null, new String(delim)); 5147 } 5148 } 5149 5150 ////////////////////////////////////////////////////////////////////// 5151 // Low-level I/O. 5152 ////////////////////////////////////////////////////////////////////// 5153 5154 /** 5155 * Prefetch US-ASCII XML/text decl from input stream into read buffer. 5156 * Doesn't buffer more than absolutely needed, so that when an encoding 5157 * decl says we need to create an InputStreamReader, we can discard our 5158 * buffer and reset(). Caller knows the first chars of the decl exist 5159 * in the input stream. 5160 */ prefetchASCIIEncodingDecl()5161 private void prefetchASCIIEncodingDecl() 5162 throws SAXException, IOException 5163 { 5164 int ch; 5165 readBufferPos = readBufferLength = 0; 5166 5167 is.mark(readBuffer.length); 5168 while (true) 5169 { 5170 ch = is.read(); 5171 readBuffer[readBufferLength++] = (char) ch; 5172 switch (ch) 5173 { 5174 case (int) '>': 5175 return; 5176 case -1: 5177 error("file ends before end of XML or encoding declaration.", 5178 null, "?>"); 5179 } 5180 if (readBuffer.length == readBufferLength) 5181 { 5182 error("unfinished XML or encoding declaration"); 5183 } 5184 } 5185 } 5186 5187 /** 5188 * Read a chunk of data from an external input source. 
5189 * <p>This is simply a front-end that fills the rawReadBuffer 5190 * with bytes, then calls the appropriate encoding handler. 5191 * @see #encoding 5192 * @see #rawReadBuffer 5193 * @see #readBuffer 5194 * @see #filterCR 5195 * @see #copyUtf8ReadBuffer 5196 * @see #copyIso8859_1ReadBuffer 5197 * @see #copyUcs_2ReadBuffer 5198 * @see #copyUcs_4ReadBuffer 5199 */ readDataChunk()5200 private void readDataChunk() 5201 throws SAXException, IOException 5202 { 5203 int count; 5204 5205 // See if we have any overflow (filterCR sets for CR at end) 5206 if (readBufferOverflow > -1) 5207 { 5208 readBuffer[0] = (char) readBufferOverflow; 5209 readBufferOverflow = -1; 5210 readBufferPos = 1; 5211 sawCR = true; 5212 } 5213 else 5214 { 5215 readBufferPos = 0; 5216 sawCR = false; 5217 } 5218 5219 // input from a character stream. 5220 if (sourceType == INPUT_READER) 5221 { 5222 count = reader.read(readBuffer, 5223 readBufferPos, READ_BUFFER_MAX - readBufferPos); 5224 if (count < 0) 5225 { 5226 readBufferLength = readBufferPos; 5227 } 5228 else 5229 { 5230 readBufferLength = readBufferPos + count; 5231 } 5232 if (readBufferLength > 0) 5233 { 5234 filterCR(count >= 0); 5235 } 5236 sawCR = false; 5237 return; 5238 } 5239 5240 // Read as many bytes as possible into the raw buffer. 5241 count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX); 5242 5243 // Dispatch to an encoding-specific reader method to populate 5244 // the readBuffer. In most parser speed profiles, these routines 5245 // show up at the top of the CPU usage chart. 
5246 if (count > 0) 5247 { 5248 switch (encoding) 5249 { 5250 // one byte builtins 5251 case ENCODING_ASCII: 5252 copyIso8859_1ReadBuffer(count, (char) 0x0080); 5253 break; 5254 case ENCODING_UTF_8: 5255 copyUtf8ReadBuffer(count); 5256 break; 5257 case ENCODING_ISO_8859_1: 5258 copyIso8859_1ReadBuffer(count, (char) 0); 5259 break; 5260 5261 // two byte builtins 5262 case ENCODING_UCS_2_12: 5263 copyUcs2ReadBuffer(count, 8, 0); 5264 break; 5265 case ENCODING_UCS_2_21: 5266 copyUcs2ReadBuffer(count, 0, 8); 5267 break; 5268 5269 // four byte builtins 5270 case ENCODING_UCS_4_1234: 5271 copyUcs4ReadBuffer(count, 24, 16, 8, 0); 5272 break; 5273 case ENCODING_UCS_4_4321: 5274 copyUcs4ReadBuffer(count, 0, 8, 16, 24); 5275 break; 5276 case ENCODING_UCS_4_2143: 5277 copyUcs4ReadBuffer(count, 16, 24, 0, 8); 5278 break; 5279 case ENCODING_UCS_4_3412: 5280 copyUcs4ReadBuffer(count, 8, 0, 24, 16); 5281 break; 5282 } 5283 } 5284 else 5285 { 5286 readBufferLength = readBufferPos; 5287 } 5288 5289 readBufferPos = 0; 5290 5291 // Filter out all carriage returns if we've seen any 5292 // (including any saved from a previous read) 5293 if (sawCR) 5294 { 5295 filterCR(count >= 0); 5296 sawCR = false; 5297 5298 // must actively report EOF, lest some CRs get lost. 5299 if (readBufferLength == 0 && count >= 0) 5300 { 5301 readDataChunk(); 5302 } 5303 } 5304 5305 if (count > 0) 5306 { 5307 currentByteCount += count; 5308 } 5309 } 5310 5311 /** 5312 * Filter carriage returns in the read buffer. 5313 * CRLF becomes LF; CR becomes LF. 
5314 * @param moreData true iff more data might come from the same source 5315 * @see #readDataChunk 5316 * @see #readBuffer 5317 * @see #readBufferOverflow 5318 */ filterCR(boolean moreData)5319 private void filterCR(boolean moreData) 5320 { 5321 int i, j; 5322 5323 readBufferOverflow = -1; 5324 5325 loop: 5326 for (i = j = readBufferPos; j < readBufferLength; i++, j++) 5327 { 5328 switch (readBuffer[j]) 5329 { 5330 case '\r': 5331 if (j == readBufferLength - 1) 5332 { 5333 if (moreData) 5334 { 5335 readBufferOverflow = '\r'; 5336 readBufferLength--; 5337 } 5338 else // CR at end of buffer 5339 { 5340 readBuffer[i++] = '\n'; 5341 } 5342 break loop; 5343 } 5344 else if (readBuffer[j + 1] == '\n') 5345 { 5346 j++; 5347 } 5348 readBuffer[i] = '\n'; 5349 break; 5350 5351 case '\n': 5352 default: 5353 readBuffer[i] = readBuffer[j]; 5354 break; 5355 } 5356 } 5357 readBufferLength = i; 5358 } 5359 5360 /** 5361 * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters. 5362 * <p>When readDataChunk () calls this method, the raw bytes are in 5363 * rawReadBuffer, and the final characters will appear in 5364 * readBuffer. 5365 * <p>Note that as of Unicode 3.1, good practice became a requirement, 5366 * so that each Unicode character has exactly one UTF-8 representation. 5367 * @param count The number of bytes to convert. 
5368 * @see #readDataChunk 5369 * @see #rawReadBuffer 5370 * @see #readBuffer 5371 * @see #getNextUtf8Byte 5372 */ copyUtf8ReadBuffer(int count)5373 private void copyUtf8ReadBuffer(int count) 5374 throws SAXException, IOException 5375 { 5376 int i = 0; 5377 int j = readBufferPos; 5378 int b1; 5379 char c = 0; 5380 5381 /* 5382 // check once, so the runtime won't (if it's smart enough) 5383 if (count < 0 || count > rawReadBuffer.length) 5384 throw new ArrayIndexOutOfBoundsException (Integer.toString (count)); 5385 */ 5386 5387 while (i < count) 5388 { 5389 b1 = rawReadBuffer[i++]; 5390 5391 // Determine whether we are dealing 5392 // with a one-, two-, three-, or four- 5393 // byte sequence. 5394 if (b1 < 0) 5395 { 5396 if ((b1 & 0xe0) == 0xc0) 5397 { 5398 // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx 5399 c = (char) (((b1 & 0x1f) << 6) 5400 | getNextUtf8Byte(i++, count)); 5401 if (c < 0x0080) 5402 { 5403 encodingError("Illegal two byte UTF-8 sequence", 5404 c, 0); 5405 } 5406 5407 //Sec 2.11 5408 // [1] the two-character sequence #xD #xA 5409 // [2] the two-character sequence #xD #x85 5410 if ((c == 0x0085 || c == 0x000a) && sawCR) 5411 { 5412 continue; 5413 } 5414 5415 // Sec 2.11 5416 // [3] the single character #x85 5417 5418 if (c == 0x0085 && xmlVersion == XML_11) 5419 { 5420 readBuffer[j++] = '\r'; 5421 } 5422 } 5423 else if ((b1 & 0xf0) == 0xe0) 5424 { 5425 // 3-byte sequence: 5426 // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx 5427 // most CJKV characters 5428 c = (char) (((b1 & 0x0f) << 12) | 5429 (getNextUtf8Byte(i++, count) << 6) | 5430 getNextUtf8Byte(i++, count)); 5431 //sec 2.11 5432 //[4] the single character #x2028 5433 if (c == 0x2028 && xmlVersion == XML_11) 5434 { 5435 readBuffer[j++] = '\r'; 5436 sawCR = true; 5437 continue; 5438 } 5439 if (c < 0x0800 || (c >= 0xd800 && c <= 0xdfff)) 5440 { 5441 encodingError("Illegal three byte UTF-8 sequence", 5442 c, 0); 5443 } 5444 } 5445 else if ((b1 & 0xf8) == 0xf0) 5446 { 5447 // 4-byte 
sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx 5448 // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 5449 // (uuuuu = wwww + 1) 5450 // "Surrogate Pairs" ... from the "Astral Planes" 5451 // Unicode 3.1 assigned the first characters there 5452 int iso646 = b1 & 07; 5453 iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count); 5454 iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count); 5455 iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count); 5456 5457 if (iso646 <= 0xffff) 5458 { 5459 encodingError("Illegal four byte UTF-8 sequence", 5460 iso646, 0); 5461 } 5462 else 5463 { 5464 if (iso646 > 0x0010ffff) 5465 { 5466 encodingError("UTF-8 value out of range for Unicode", 5467 iso646, 0); 5468 } 5469 iso646 -= 0x010000; 5470 readBuffer[j++] = (char) (0xd800 | (iso646 >> 10)); 5471 readBuffer[j++] = (char) (0xdc00 | (iso646 & 0x03ff)); 5472 continue; 5473 } 5474 } 5475 else 5476 { 5477 // The five and six byte encodings aren't supported; 5478 // they exceed the Unicode (and XML) range. 5479 encodingError("unsupported five or six byte UTF-8 sequence", 5480 0xff & b1, i); 5481 // NOTREACHED 5482 c = 0; 5483 } 5484 } 5485 else 5486 { 5487 // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx 5488 // (US-ASCII character, "common" case, one branch to here) 5489 c = (char) b1; 5490 } 5491 readBuffer[j++] = c; 5492 if (c == '\r') 5493 { 5494 sawCR = true; 5495 } 5496 } 5497 // How many characters have we read? 5498 readBufferLength = j; 5499 } 5500 5501 /** 5502 * Return the next byte value in a UTF-8 sequence. 5503 * If it is not possible to get a byte from the current 5504 * entity, throw an exception. 5505 * @param pos The current position in the rawReadBuffer. 5506 * @param count The number of bytes in the rawReadBuffer 5507 * @return The significant six bits of a non-initial byte in 5508 * a UTF-8 sequence. 5509 * @exception EOFException If the sequence is incomplete. 
5510 */ getNextUtf8Byte(int pos, int count)5511 private int getNextUtf8Byte(int pos, int count) 5512 throws SAXException, IOException 5513 { 5514 int val; 5515 5516 // Take a character from the buffer 5517 // or from the actual input stream. 5518 if (pos < count) 5519 { 5520 val = rawReadBuffer[pos]; 5521 } 5522 else 5523 { 5524 val = is.read(); 5525 if (val == -1) 5526 { 5527 encodingError("unfinished multi-byte UTF-8 sequence at EOF", 5528 -1, pos); 5529 } 5530 } 5531 5532 // Check for the correct bits at the start. 5533 if ((val & 0xc0) != 0x80) 5534 { 5535 encodingError("bad continuation of multi-byte UTF-8 sequence", 5536 val, pos + 1); 5537 } 5538 5539 // Return the significant bits. 5540 return (val & 0x3f); 5541 } 5542 5543 /** 5544 * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into 5545 * UTF-16 characters. 5546 * 5547 * <p>When readDataChunk () calls this method, the raw bytes are in 5548 * rawReadBuffer, and the final characters will appear in 5549 * readBuffer. 5550 * 5551 * @param count The number of bytes to convert. 5552 * @param mask For ASCII conversion, 0x7f; else, 0xff. 5553 * @see #readDataChunk 5554 * @see #rawReadBuffer 5555 * @see #readBuffer 5556 */ copyIso8859_1ReadBuffer(int count, char mask)5557 private void copyIso8859_1ReadBuffer(int count, char mask) 5558 throws IOException 5559 { 5560 int i, j; 5561 for (i = 0, j = readBufferPos; i < count; i++, j++) 5562 { 5563 char c = (char) (rawReadBuffer[i] & 0xff); 5564 if ((c & mask) != 0) 5565 { 5566 throw new CharConversionException("non-ASCII character U+" 5567 + Integer.toHexString(c)); 5568 } 5569 if (c == 0x0085 && xmlVersion == XML_11) 5570 { 5571 c = '\r'; 5572 } 5573 readBuffer[j] = c; 5574 if (c == '\r') 5575 { 5576 sawCR = true; 5577 } 5578 } 5579 readBufferLength = j; 5580 } 5581 5582 /** 5583 * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters 5584 * (as used in Java string manipulation). 
5585 * 5586 * <p>When readDataChunk () calls this method, the raw bytes are in 5587 * rawReadBuffer, and the final characters will appear in 5588 * readBuffer. 5589 * @param count The number of bytes to convert. 5590 * @param shift1 The number of bits to shift byte 1. 5591 * @param shift2 The number of bits to shift byte 2 5592 * @see #readDataChunk 5593 * @see #rawReadBuffer 5594 * @see #readBuffer 5595 */ copyUcs2ReadBuffer(int count, int shift1, int shift2)5596 private void copyUcs2ReadBuffer(int count, int shift1, int shift2) 5597 throws SAXException 5598 { 5599 int j = readBufferPos; 5600 5601 if (count > 0 && (count % 2) != 0) 5602 { 5603 encodingError("odd number of bytes in UCS-2 encoding", -1, count); 5604 } 5605 // The loops are faster with less internal brancing; hence two 5606 if (shift1 == 0) 5607 { // "UTF-16-LE" 5608 for (int i = 0; i < count; i += 2) 5609 { 5610 char c = (char) (rawReadBuffer[i + 1] << 8); 5611 c |= 0xff & rawReadBuffer[i]; 5612 readBuffer[j++] = c; 5613 if (c == '\r') 5614 { 5615 sawCR = true; 5616 } 5617 } 5618 } 5619 else 5620 { // "UTF-16-BE" 5621 for (int i = 0; i < count; i += 2) 5622 { 5623 char c = (char) (rawReadBuffer[i] << 8); 5624 c |= 0xff & rawReadBuffer[i + 1]; 5625 readBuffer[j++] = c; 5626 if (c == '\r') 5627 { 5628 sawCR = true; 5629 } 5630 } 5631 } 5632 readBufferLength = j; 5633 } 5634 5635 /** 5636 * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters. 5637 * 5638 * <p>When readDataChunk () calls this method, the raw bytes are in 5639 * rawReadBuffer, and the final characters will appear in 5640 * readBuffer. 5641 * <p>Java has Unicode chars, and this routine uses surrogate pairs 5642 * for ISO-10646 values between 0x00010000 and 0x000fffff. An 5643 * exception is thrown if the ISO-10646 character has no Unicode 5644 * representation. 5645 * 5646 * @param count The number of bytes to convert. 5647 * @param shift1 The number of bits to shift byte 1. 
5648 * @param shift2 The number of bits to shift byte 2 5649 * @param shift3 The number of bits to shift byte 2 5650 * @param shift4 The number of bits to shift byte 2 5651 * @see #readDataChunk 5652 * @see #rawReadBuffer 5653 * @see #readBuffer 5654 */ copyUcs4ReadBuffer(int count, int shift1, int shift2, int shift3, int shift4)5655 private void copyUcs4ReadBuffer(int count, int shift1, int shift2, 5656 int shift3, int shift4) 5657 throws SAXException 5658 { 5659 int j = readBufferPos; 5660 5661 if (count > 0 && (count % 4) != 0) 5662 { 5663 encodingError("number of bytes in UCS-4 encoding " + 5664 "not divisible by 4", 5665 -1, count); 5666 } 5667 for (int i = 0; i < count; i += 4) 5668 { 5669 int value = (((rawReadBuffer [i] & 0xff) << shift1) | 5670 ((rawReadBuffer [i + 1] & 0xff) << shift2) | 5671 ((rawReadBuffer [i + 2] & 0xff) << shift3) | 5672 ((rawReadBuffer [i + 3] & 0xff) << shift4)); 5673 if (value < 0x0000ffff) 5674 { 5675 readBuffer [j++] = (char) value; 5676 if (value == (int) '\r') 5677 { 5678 sawCR = true; 5679 } 5680 } 5681 else if (value < 0x0010ffff) 5682 { 5683 value -= 0x010000; 5684 readBuffer[j++] = (char) (0xd8 | ((value >> 10) & 0x03ff)); 5685 readBuffer[j++] = (char) (0xdc | (value & 0x03ff)); 5686 } 5687 else 5688 { 5689 encodingError("UCS-4 value out of range for Unicode", 5690 value, i); 5691 } 5692 } 5693 readBufferLength = j; 5694 } 5695 5696 /** 5697 * Report a character encoding error. 5698 */ encodingError(String message, int value, int offset)5699 private void encodingError(String message, int value, int offset) 5700 throws SAXException 5701 { 5702 if (value != -1) 5703 { 5704 message = message + " (character code: 0x" + 5705 Integer.toHexString(value) + ')'; 5706 error(message); 5707 } 5708 } 5709 5710 ////////////////////////////////////////////////////////////////////// 5711 // Local Variables. 
5712 ////////////////////////////////////////////////////////////////////// 5713 5714 /** 5715 * Re-initialize the variables for each parse. 5716 */ initializeVariables()5717 private void initializeVariables() 5718 { 5719 // First line 5720 line = 1; 5721 column = 0; 5722 5723 // Set up the buffers for data and names 5724 dataBufferPos = 0; 5725 dataBuffer = new char[DATA_BUFFER_INITIAL]; 5726 nameBufferPos = 0; 5727 nameBuffer = new char[NAME_BUFFER_INITIAL]; 5728 5729 // Set up the DTD hash tables 5730 elementInfo = new HashMap(); 5731 entityInfo = new HashMap(); 5732 notationInfo = new HashMap(); 5733 skippedPE = false; 5734 5735 // Set up the variables for the current 5736 // element context. 5737 currentElement = null; 5738 currentElementContent = CONTENT_UNDECLARED; 5739 5740 // Set up the input variables 5741 sourceType = INPUT_NONE; 5742 inputStack = new LinkedList(); 5743 entityStack = new LinkedList(); 5744 externalEntity = null; 5745 tagAttributePos = 0; 5746 tagAttributes = new String[100]; 5747 rawReadBuffer = new byte[READ_BUFFER_MAX]; 5748 readBufferOverflow = -1; 5749 5750 scratch = new InputSource(); 5751 5752 inLiteral = false; 5753 expandPE = false; 5754 peIsError = false; 5755 5756 doReport = false; 5757 5758 inCDATA = false; 5759 5760 symbolTable = new Object[SYMBOL_TABLE_LENGTH][]; 5761 } 5762 5763 static class ExternalIdentifiers 5764 { 5765 5766 String publicId; 5767 String systemId; 5768 String baseUri; 5769 ExternalIdentifiers()5770 ExternalIdentifiers() 5771 { 5772 } 5773 ExternalIdentifiers(String publicId, String systemId, String baseUri)5774 ExternalIdentifiers(String publicId, String systemId, String baseUri) 5775 { 5776 this.publicId = publicId; 5777 this.systemId = systemId; 5778 this.baseUri = baseUri; 5779 } 5780 5781 } 5782 5783 static class EntityInfo 5784 { 5785 5786 int type; 5787 ExternalIdentifiers ids; 5788 String value; 5789 String notationName; 5790 5791 } 5792 5793 static class AttributeDecl 5794 { 5795 5796 String type; 
5797 String value; 5798 int valueType; 5799 String enumeration; 5800 String defaultValue; 5801 5802 } 5803 5804 static class ElementDecl 5805 { 5806 5807 int contentType; 5808 String contentModel; 5809 HashMap attributes; 5810 5811 } 5812 5813 static class Input 5814 { 5815 5816 int sourceType; 5817 URLConnection externalEntity; 5818 char[] readBuffer; 5819 int readBufferPos; 5820 int readBufferLength; 5821 int line; 5822 int encoding; 5823 int readBufferOverflow; 5824 InputStream is; 5825 int currentByteCount; 5826 int column; 5827 Reader reader; 5828 5829 } 5830 5831 } 5832