1 /* 2 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 3 */ 4 /* 5 * Licensed to the Apache Software Foundation (ASF) under one or more 6 * contributor license agreements. See the NOTICE file distributed with 7 * this work for additional information regarding copyright ownership. 8 * The ASF licenses this file to You under the Apache License, Version 2.0 9 * (the "License"); you may not use this file except in compliance with 10 * the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 21 // Aug 21, 2000: 22 // Fixed bug in isElement and made HTMLdtd public. 23 // Contributed by Eric SCHAEFFER" <eschaeffer@posterconseil.com> 24 25 26 package com.sun.org.apache.xml.internal.serialize; 27 28 import com.sun.org.apache.xerces.internal.dom.DOMMessageFormatter; 29 import java.io.BufferedReader; 30 import java.io.InputStream; 31 import java.io.InputStreamReader; 32 import java.util.HashMap; 33 import java.util.Locale; 34 import java.util.Map; 35 36 37 /** 38 * Utility class for accessing information specific to HTML documents. 39 * The HTML DTD is expressed as three utility function groups. Two methods 40 * allow for checking whether an element requires an open tag on printing 41 * ({@link #isEmptyTag}) or on parsing ({@link #isOptionalClosing}). 42 * <P> 43 * Two other methods translate character references from name to value and 44 * from value to name. A small entities resource is loaded into memory the 45 * first time any of these methods is called for fast and efficient access. 46 * 47 * 48 * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a> 49 */ 50 public final class HTMLdtd 51 { 52 53 /** 54 * Public identifier for HTML 4.01 (Strict) document type. 55 */ 56 public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN"; 57 58 /** 59 * System identifier for HTML 4.01 (Strict) document type. 60 */ 61 public static final String HTMLSystemId = 62 "http://www.w3.org/TR/html4/strict.dtd"; 63 64 /** 65 * Public identifier for XHTML 1.0 (Strict) document type. 66 */ 67 public static final String XHTMLPublicId = 68 "-//W3C//DTD XHTML 1.0 Strict//EN"; 69 70 /** 71 * System identifier for XHTML 1.0 (Strict) document type. 72 */ 73 public static final String XHTMLSystemId = 74 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; 75 76 /** 77 * Table of reverse character reference mapping. Character codes are held 78 * as single-character strings, mapped to their reference name. 79 */ 80 private static Map<Integer, String> _byChar; 81 82 83 /** 84 * Table of entity name to value mapping. Entities are held as strings, 85 * character references as <TT>Character</TT> objects. 86 */ 87 private static Map<String, Integer> _byName; 88 89 90 private static final Map<String, String[]> _boolAttrs; 91 92 93 /** 94 * Holds element definitions. 95 */ 96 private static final Map<String, Integer> _elemDefs; 97 98 99 /** 100 * Locates the HTML entities file that is loaded upon initialization. 101 * This file is a resource loaded with the default class loader. 102 */ 103 private static final String ENTITIES_RESOURCE = "HTMLEntities.res"; 104 105 106 /** 107 * Only opening tag should be printed. 108 */ 109 private static final int ONLY_OPENING = 0x0001; 110 111 /** 112 * Element contains element content only. 113 */ 114 private static final int ELEM_CONTENT = 0x0002; 115 116 117 /** 118 * Element preserve spaces. 119 */ 120 private static final int PRESERVE = 0x0004; 121 122 123 /** 124 * Optional closing tag. 125 */ 126 private static final int OPT_CLOSING = 0x0008; 127 128 129 /** 130 * Element is empty (also means only opening tag) 131 */ 132 private static final int EMPTY = 0x0010 | ONLY_OPENING; 133 134 135 /** 136 * Allowed to appear in head. 137 */ 138 private static final int ALLOWED_HEAD = 0x0020; 139 140 141 /** 142 * When opened, closes P. 143 */ 144 private static final int CLOSE_P = 0x0040; 145 146 147 /** 148 * When opened, closes DD or DT. 149 */ 150 private static final int CLOSE_DD_DT = 0x0080; 151 152 153 /** 154 * When opened, closes itself. 155 */ 156 private static final int CLOSE_SELF = 0x0100; 157 158 159 /** 160 * When opened, closes another table section. 161 */ 162 private static final int CLOSE_TABLE = 0x0200; 163 164 165 /** 166 * When opened, closes TH or TD. 167 */ 168 private static final int CLOSE_TH_TD = 0x04000; 169 170 171 /** 172 * Returns true if element is declared to be empty. HTML elements are 173 * defines as empty in the DTD, not by the document syntax. 174 * 175 * @param tagName The element tag name (upper case) 176 * @return True if element is empty 177 */ isEmptyTag( String tagName )178 public static boolean isEmptyTag( String tagName ) 179 { 180 return isElement( tagName, EMPTY ); 181 } 182 183 184 /** 185 * Returns true if element is declared to have element content. 186 * Whitespaces appearing inside element content will be ignored, 187 * other text will simply report an error. 188 * 189 * @param tagName The element tag name (upper case) 190 * @return True if element content 191 */ isElementContent( String tagName )192 public static boolean isElementContent( String tagName ) 193 { 194 return isElement( tagName, ELEM_CONTENT ); 195 } 196 197 198 /** 199 * Returns true if element's textual contents preserves spaces. 200 * This only applies to PRE and TEXTAREA, all other HTML elements 201 * do not preserve space. 202 * 203 * @param tagName The element tag name (upper case) 204 * @return True if element's text content preserves spaces 205 */ isPreserveSpace( String tagName )206 public static boolean isPreserveSpace( String tagName ) 207 { 208 return isElement( tagName, PRESERVE ); 209 } 210 211 212 /** 213 * Returns true if element's closing tag is optional and need not 214 * exist. An error will not be reported for such elements if they 215 * are not closed. For example, <tt>LI</tt> is most often not closed. 216 * 217 * @param tagName The element tag name (upper case) 218 * @return True if closing tag implied 219 */ isOptionalClosing( String tagName )220 public static boolean isOptionalClosing( String tagName ) 221 { 222 return isElement( tagName, OPT_CLOSING ); 223 } 224 225 226 /** 227 * Returns true if element's closing tag is generally not printed. 228 * For example, <tt>LI</tt> should not print the closing tag. 229 * 230 * @param tagName The element tag name (upper case) 231 * @return True if only opening tag should be printed 232 */ isOnlyOpening( String tagName )233 public static boolean isOnlyOpening( String tagName ) 234 { 235 return isElement( tagName, ONLY_OPENING ); 236 } 237 238 239 /** 240 * Returns true if the opening of one element (<tt>tagName</tt>) implies 241 * the closing of another open element (<tt>openTag</tt>). For example, 242 * every opening <tt>LI</tt> will close the previously open <tt>LI</tt>, 243 * and every opening <tt>BODY</tt> will close the previously open <tt>HEAD</tt>. 244 * 245 * @param tagName The newly opened element 246 * @param openTag The already opened element 247 * @return True if closing tag closes opening tag 248 */ isClosing( String tagName, String openTag )249 public static boolean isClosing( String tagName, String openTag ) 250 { 251 // Several elements are defined as closing the HEAD 252 if ( openTag.equalsIgnoreCase( "HEAD" ) ) 253 return ! isElement( tagName, ALLOWED_HEAD ); 254 // P closes iteself 255 if ( openTag.equalsIgnoreCase( "P" ) ) 256 return isElement( tagName, CLOSE_P ); 257 // DT closes DD, DD closes DT 258 if ( openTag.equalsIgnoreCase( "DT" ) || openTag.equalsIgnoreCase( "DD" ) ) 259 return isElement( tagName, CLOSE_DD_DT ); 260 // LI and OPTION close themselves 261 if ( openTag.equalsIgnoreCase( "LI" ) || openTag.equalsIgnoreCase( "OPTION" ) ) 262 return isElement( tagName, CLOSE_SELF ); 263 // Each of these table sections closes all the others 264 if ( openTag.equalsIgnoreCase( "THEAD" ) || openTag.equalsIgnoreCase( "TFOOT" ) || 265 openTag.equalsIgnoreCase( "TBODY" ) || openTag.equalsIgnoreCase( "TR" ) || 266 openTag.equalsIgnoreCase( "COLGROUP" ) ) 267 return isElement( tagName, CLOSE_TABLE ); 268 // TD closes TH and TH closes TD 269 if ( openTag.equalsIgnoreCase( "TH" ) || openTag.equalsIgnoreCase( "TD" ) ) 270 return isElement( tagName, CLOSE_TH_TD ); 271 return false; 272 } 273 274 275 /** 276 * Returns true if the specified attribute it a URI and should be 277 * escaped appropriately. In HTML URIs are escaped differently 278 * than normal attributes. 279 * 280 * @param tagName The element's tag name 281 * @param attrName The attribute's name 282 */ isURI( String tagName, String attrName )283 public static boolean isURI( String tagName, String attrName ) 284 { 285 // Stupid checks. 286 return ( attrName.equalsIgnoreCase( "href" ) || attrName.equalsIgnoreCase( "src" ) ); 287 } 288 289 290 /** 291 * Returns true if the specified attribute is a boolean and should be 292 * printed without the value. This applies to attributes that are true 293 * if they exist, such as selected (OPTION/INPUT). 294 * 295 * @param tagName The element's tag name 296 * @param attrName The attribute's name 297 */ isBoolean( String tagName, String attrName )298 public static boolean isBoolean( String tagName, String attrName ) 299 { 300 String[] attrNames; 301 302 attrNames = _boolAttrs.get( tagName.toUpperCase(Locale.ENGLISH) ); 303 if ( attrNames == null ) 304 return false; 305 for ( int i = 0 ; i < attrNames.length ; ++i ) 306 if ( attrNames[ i ].equalsIgnoreCase( attrName ) ) 307 return true; 308 return false; 309 } 310 311 312 /** 313 * Returns the value of an HTML character reference by its name. If the 314 * reference is not found or was not defined as a character reference, 315 * returns EOF (-1). 316 * 317 * @param name Name of character reference 318 * @return Character code or EOF (-1) 319 */ charFromName( String name )320 public static int charFromName( String name ) 321 { 322 Object value; 323 324 initialize(); 325 value = _byName.get( name ); 326 if ( value != null && value instanceof Integer ) 327 return ( (Integer) value ).intValue(); 328 else 329 return -1; 330 } 331 332 333 /** 334 * Returns the name of an HTML character reference based on its character 335 * value. Only valid for entities defined from character references. If no 336 * such character value was defined, return null. 337 * 338 * @param value Character value of entity 339 * @return Entity's name or null 340 */ fromChar(int value )341 public static String fromChar(int value ) 342 { 343 if (value > 0xffff) 344 return null; 345 346 String name; 347 348 initialize(); 349 name = _byChar.get(value); 350 return name; 351 } 352 353 354 /** 355 * Initialize upon first access. Will load all the HTML character references 356 * into a list that is accessible by name or character value and is optimized 357 * for character substitution. This method may be called any number of times 358 * but will execute only once. 359 */ initialize()360 private static void initialize() 361 { 362 InputStream is = null; 363 BufferedReader reader = null; 364 int index; 365 String name; 366 String value; 367 int code; 368 String line; 369 370 // Make sure not to initialize twice. 371 if ( _byName != null ) 372 return; 373 try { 374 _byName = new HashMap<>(); 375 _byChar = new HashMap<>(); 376 is = HTMLdtd.class.getResourceAsStream( ENTITIES_RESOURCE ); 377 if ( is == null ) { 378 throw new RuntimeException( 379 DOMMessageFormatter.formatMessage( 380 DOMMessageFormatter.SERIALIZER_DOMAIN, 381 "ResourceNotFound", new Object[] {ENTITIES_RESOURCE})); 382 } 383 reader = new BufferedReader( new InputStreamReader( is, "ASCII" ) ); 384 line = reader.readLine(); 385 while ( line != null ) { 386 if ( line.length() == 0 || line.charAt( 0 ) == '#' ) { 387 line = reader.readLine(); 388 continue; 389 } 390 index = line.indexOf( ' ' ); 391 if ( index > 1 ) { 392 name = line.substring( 0, index ); 393 ++index; 394 if ( index < line.length() ) { 395 value = line.substring( index ); 396 index = value.indexOf( ' ' ); 397 if ( index > 0 ) 398 value = value.substring( 0, index ); 399 code = Integer.parseInt( value ); 400 defineEntity( name, (char) code ); 401 } 402 } 403 line = reader.readLine(); 404 } 405 is.close(); 406 } catch ( Exception except ) { 407 throw new RuntimeException( 408 DOMMessageFormatter.formatMessage( 409 DOMMessageFormatter.SERIALIZER_DOMAIN, 410 "ResourceNotLoaded", new Object[] {ENTITIES_RESOURCE, except.toString()})); 411 } finally { 412 if ( is != null ) { 413 try { 414 is.close(); 415 } catch ( Exception except ) { } 416 } 417 } 418 } 419 420 421 /** 422 * Defines a new character reference. The reference's name and value are 423 * supplied. Nothing happens if the character reference is already defined. 424 * <P> 425 * Unlike internal entities, character references are a string to single 426 * character mapping. They are used to map non-ASCII characters both on 427 * parsing and printing, primarily for HTML documents. '<amp;' is an 428 * example of a character reference. 429 * 430 * @param name The entity's name 431 * @param value The entity's value 432 */ defineEntity( String name, char value )433 private static void defineEntity( String name, char value ) 434 { 435 if ( _byName.get( name ) == null ) { 436 _byName.put( name, new Integer( value ) ); 437 _byChar.put( new Integer( value ), name ); 438 } 439 } 440 441 defineElement( String name, int flags )442 private static void defineElement( String name, int flags ) 443 { 444 _elemDefs.put(name, flags); 445 } 446 447 defineBoolean( String tagName, String attrName )448 private static void defineBoolean( String tagName, String attrName ) 449 { 450 defineBoolean( tagName, new String[] { attrName } ); 451 } 452 453 defineBoolean( String tagName, String[] attrNames )454 private static void defineBoolean( String tagName, String[] attrNames ) 455 { 456 _boolAttrs.put( tagName, attrNames ); 457 } 458 459 isElement( String name, int flag )460 private static boolean isElement( String name, int flag ) 461 { 462 Integer flags; 463 464 flags = _elemDefs.get( name.toUpperCase(Locale.ENGLISH) ); 465 if ( flags == null ) 466 return false; 467 else 468 return ( ( flags.intValue() & flag ) == flag ); 469 } 470 471 472 static 473 { 474 _elemDefs = new HashMap<>(); 475 defineElement( "ADDRESS", CLOSE_P ); 476 defineElement( "AREA", EMPTY ); 477 defineElement( "BASE", EMPTY | ALLOWED_HEAD ); 478 defineElement( "BASEFONT", EMPTY ); 479 defineElement( "BLOCKQUOTE", CLOSE_P ); 480 defineElement( "BODY", OPT_CLOSING ); 481 defineElement( "BR", EMPTY ); 482 defineElement( "COL", EMPTY ); 483 defineElement( "COLGROUP", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); 484 defineElement( "DD", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT ); 485 defineElement( "DIV", CLOSE_P ); 486 defineElement( "DL", ELEM_CONTENT | CLOSE_P ); 487 defineElement( "DT", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT ); 488 defineElement( "FIELDSET", CLOSE_P ); 489 defineElement( "FORM", CLOSE_P ); 490 defineElement( "FRAME", EMPTY | OPT_CLOSING ); 491 defineElement( "H1", CLOSE_P ); 492 defineElement( "H2", CLOSE_P ); 493 defineElement( "H3", CLOSE_P ); 494 defineElement( "H4", CLOSE_P ); 495 defineElement( "H5", CLOSE_P ); 496 defineElement( "H6", CLOSE_P ); 497 defineElement( "HEAD", ELEM_CONTENT | OPT_CLOSING ); 498 defineElement( "HR", EMPTY | CLOSE_P ); 499 defineElement( "HTML", ELEM_CONTENT | OPT_CLOSING ); 500 defineElement( "IMG", EMPTY ); 501 defineElement( "INPUT", EMPTY ); 502 defineElement( "ISINDEX", EMPTY | ALLOWED_HEAD ); 503 defineElement( "LI", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF ); 504 defineElement( "LINK", EMPTY | ALLOWED_HEAD ); 505 defineElement( "MAP", ALLOWED_HEAD ); 506 defineElement( "META", EMPTY | ALLOWED_HEAD ); 507 defineElement( "OL", ELEM_CONTENT | CLOSE_P ); 508 defineElement( "OPTGROUP", ELEM_CONTENT ); 509 defineElement( "OPTION", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF ); 510 defineElement( "P", OPT_CLOSING | CLOSE_P | CLOSE_SELF ); 511 defineElement( "PARAM", EMPTY ); 512 defineElement( "PRE", PRESERVE | CLOSE_P ); 513 defineElement( "SCRIPT", ALLOWED_HEAD | PRESERVE ); 514 defineElement( "NOSCRIPT", ALLOWED_HEAD | PRESERVE ); 515 defineElement( "SELECT", ELEM_CONTENT ); 516 defineElement( "STYLE", ALLOWED_HEAD | PRESERVE ); 517 defineElement( "TABLE", ELEM_CONTENT | CLOSE_P ); 518 defineElement( "TBODY", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); 519 defineElement( "TD", OPT_CLOSING | CLOSE_TH_TD ); 520 defineElement( "TEXTAREA", PRESERVE ); 521 defineElement( "TFOOT", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); 522 defineElement( "TH", OPT_CLOSING | CLOSE_TH_TD ); 523 defineElement( "THEAD", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); 524 defineElement( "TITLE", ALLOWED_HEAD ); 525 defineElement( "TR", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); 526 defineElement( "UL", ELEM_CONTENT | CLOSE_P ); 527 528 _boolAttrs = new HashMap<>(); 529 defineBoolean( "AREA", "href" ); 530 defineBoolean( "BUTTON", "disabled" ); 531 defineBoolean( "DIR", "compact" ); 532 defineBoolean( "DL", "compact" ); 533 defineBoolean( "FRAME", "noresize" ); 534 defineBoolean( "HR", "noshade" ); 535 defineBoolean( "IMAGE", "ismap" ); 536 defineBoolean( "INPUT", new String[] { "defaultchecked", "checked", "readonly", "disabled" } ); 537 defineBoolean( "LINK", "link" ); 538 defineBoolean( "MENU", "compact" ); 539 defineBoolean( "OBJECT", "declare" ); 540 defineBoolean( "OL", "compact" ); 541 defineBoolean( "OPTGROUP", "disabled" ); 542 defineBoolean( "OPTION", new String[] { "default-selected", "selected", "disabled" } ); 543 defineBoolean( "SCRIPT", "defer" ); 544 defineBoolean( "SELECT", new String[] { "multiple", "disabled" } ); 545 defineBoolean( "STYLE", "disabled" ); 546 defineBoolean( "TD", "nowrap" ); 547 defineBoolean( "TH", "nowrap" ); 548 defineBoolean( "TEXTAREA", new String[] { "disabled", "readonly" } ); 549 defineBoolean( "UL", "compact" ); 550 initialize()551 initialize(); 552 } 553 554 555 556 } 557