1 /*
2  * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
3  */
4 /*
5  * Licensed to the Apache Software Foundation (ASF) under one or more
6  * contributor license agreements.  See the NOTICE file distributed with
7  * this work for additional information regarding copyright ownership.
8  * The ASF licenses this file to You under the Apache License, Version 2.0
9  * (the "License"); you may not use this file except in compliance with
10  * the License.  You may obtain a copy of the License at
11  *
12  *     http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  */
20 
21 // Aug 21, 2000:
22 //   Fixed bug in isElement and made HTMLdtd public.
23 //   Contributed by Eric SCHAEFFER" <eschaeffer@posterconseil.com>
24 
25 
26 package com.sun.org.apache.xml.internal.serialize;
27 
28 import com.sun.org.apache.xerces.internal.dom.DOMMessageFormatter;
29 import java.io.BufferedReader;
30 import java.io.InputStream;
31 import java.io.InputStreamReader;
32 import java.util.HashMap;
33 import java.util.Locale;
34 import java.util.Map;
35 
36 
37 /**
38  * Utility class for accessing information specific to HTML documents.
39  * The HTML DTD is expressed as three utility function groups. Two methods
40  * allow for checking whether an element requires an open tag on printing
41  * ({@link #isEmptyTag}) or on parsing ({@link #isOptionalClosing}).
42  * <P>
43  * Two other methods translate character references from name to value and
44  * from value to name. A small entities resource is loaded into memory the
45  * first time any of these methods is called for fast and efficient access.
46  *
47  *
48  * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
49  */
50 public final class HTMLdtd
51 {
52 
53     /**
54      * Public identifier for HTML 4.01 (Strict) document type.
55      */
56     public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN";
57 
58     /**
59      * System identifier for HTML 4.01 (Strict) document type.
60      */
61     public static final String HTMLSystemId =
62         "http://www.w3.org/TR/html4/strict.dtd";
63 
64     /**
65      * Public identifier for XHTML 1.0 (Strict) document type.
66      */
67     public static final String XHTMLPublicId =
68         "-//W3C//DTD XHTML 1.0 Strict//EN";
69 
70     /**
71      * System identifier for XHTML 1.0 (Strict) document type.
72      */
73     public static final String XHTMLSystemId =
74         "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
75 
76     /**
77      * Table of reverse character reference mapping. Character codes are held
78      * as single-character strings, mapped to their reference name.
79      */
80     private static Map<Integer, String> _byChar;
81 
82 
83     /**
84      * Table of entity name to value mapping. Entities are held as strings,
85      * character references as <TT>Character</TT> objects.
86      */
87     private static Map<String, Integer> _byName;
88 
89 
90     private static final Map<String, String[]> _boolAttrs;
91 
92 
93     /**
94      * Holds element definitions.
95      */
96     private static final Map<String, Integer> _elemDefs;
97 
98 
99     /**
100      * Locates the HTML entities file that is loaded upon initialization.
101      * This file is a resource loaded with the default class loader.
102      */
103     private static final String     ENTITIES_RESOURCE = "HTMLEntities.res";
104 
105 
106     /**
107      * Only opening tag should be printed.
108      */
109     private static final int ONLY_OPENING = 0x0001;
110 
111     /**
112      * Element contains element content only.
113      */
114     private static final int ELEM_CONTENT = 0x0002;
115 
116 
117     /**
118      * Element preserve spaces.
119      */
120     private static final int PRESERVE     = 0x0004;
121 
122 
123     /**
124      * Optional closing tag.
125      */
126     private static final int OPT_CLOSING  = 0x0008;
127 
128 
129     /**
130      * Element is empty (also means only opening tag)
131      */
132     private static final int EMPTY        = 0x0010 | ONLY_OPENING;
133 
134 
135     /**
136      * Allowed to appear in head.
137      */
138     private static final int ALLOWED_HEAD = 0x0020;
139 
140 
141     /**
142      * When opened, closes P.
143      */
144     private static final int CLOSE_P      = 0x0040;
145 
146 
147     /**
148      * When opened, closes DD or DT.
149      */
150     private static final int CLOSE_DD_DT  = 0x0080;
151 
152 
153     /**
154      * When opened, closes itself.
155      */
156     private static final int CLOSE_SELF   = 0x0100;
157 
158 
159     /**
160      * When opened, closes another table section.
161      */
162     private static final int CLOSE_TABLE  = 0x0200;
163 
164 
165     /**
166      * When opened, closes TH or TD.
167      */
168     private static final int CLOSE_TH_TD  = 0x04000;
169 
170 
171     /**
172      * Returns true if element is declared to be empty. HTML elements are
173      * defines as empty in the DTD, not by the document syntax.
174      *
175      * @param tagName The element tag name (upper case)
176      * @return True if element is empty
177      */
isEmptyTag( String tagName )178     public static boolean isEmptyTag( String tagName )
179     {
180         return isElement( tagName, EMPTY );
181     }
182 
183 
184     /**
185      * Returns true if element is declared to have element content.
186      * Whitespaces appearing inside element content will be ignored,
187      * other text will simply report an error.
188      *
189      * @param tagName The element tag name (upper case)
190      * @return True if element content
191      */
isElementContent( String tagName )192     public static boolean isElementContent( String tagName )
193     {
194         return isElement( tagName, ELEM_CONTENT );
195     }
196 
197 
198     /**
199      * Returns true if element's textual contents preserves spaces.
200      * This only applies to PRE and TEXTAREA, all other HTML elements
201      * do not preserve space.
202      *
203      * @param tagName The element tag name (upper case)
204      * @return True if element's text content preserves spaces
205      */
isPreserveSpace( String tagName )206     public static boolean isPreserveSpace( String tagName )
207     {
208         return isElement( tagName, PRESERVE );
209     }
210 
211 
212     /**
213      * Returns true if element's closing tag is optional and need not
214      * exist. An error will not be reported for such elements if they
215      * are not closed. For example, <tt>LI</tt> is most often not closed.
216      *
217      * @param tagName The element tag name (upper case)
218      * @return True if closing tag implied
219      */
isOptionalClosing( String tagName )220     public static boolean isOptionalClosing( String tagName )
221     {
222         return isElement( tagName, OPT_CLOSING );
223     }
224 
225 
226     /**
227      * Returns true if element's closing tag is generally not printed.
228      * For example, <tt>LI</tt> should not print the closing tag.
229      *
230      * @param tagName The element tag name (upper case)
231      * @return True if only opening tag should be printed
232      */
isOnlyOpening( String tagName )233     public static boolean isOnlyOpening( String tagName )
234     {
235         return isElement( tagName, ONLY_OPENING );
236     }
237 
238 
239     /**
240      * Returns true if the opening of one element (<tt>tagName</tt>) implies
241      * the closing of another open element (<tt>openTag</tt>). For example,
242      * every opening <tt>LI</tt> will close the previously open <tt>LI</tt>,
243      * and every opening <tt>BODY</tt> will close the previously open <tt>HEAD</tt>.
244      *
245      * @param tagName The newly opened element
246      * @param openTag The already opened element
247      * @return True if closing tag closes opening tag
248      */
isClosing( String tagName, String openTag )249     public static boolean isClosing( String tagName, String openTag )
250     {
251         // Several elements are defined as closing the HEAD
252         if ( openTag.equalsIgnoreCase( "HEAD" ) )
253             return ! isElement( tagName, ALLOWED_HEAD );
254         // P closes iteself
255         if ( openTag.equalsIgnoreCase( "P" ) )
256             return isElement( tagName, CLOSE_P );
257         // DT closes DD, DD closes DT
258         if ( openTag.equalsIgnoreCase( "DT" ) || openTag.equalsIgnoreCase( "DD" ) )
259             return isElement( tagName, CLOSE_DD_DT );
260         // LI and OPTION close themselves
261         if ( openTag.equalsIgnoreCase( "LI" ) || openTag.equalsIgnoreCase( "OPTION" ) )
262             return isElement( tagName, CLOSE_SELF );
263         // Each of these table sections closes all the others
264         if ( openTag.equalsIgnoreCase( "THEAD" ) || openTag.equalsIgnoreCase( "TFOOT" ) ||
265              openTag.equalsIgnoreCase( "TBODY" ) || openTag.equalsIgnoreCase( "TR" ) ||
266              openTag.equalsIgnoreCase( "COLGROUP" ) )
267             return isElement( tagName, CLOSE_TABLE );
268         // TD closes TH and TH closes TD
269         if ( openTag.equalsIgnoreCase( "TH" ) || openTag.equalsIgnoreCase( "TD" ) )
270             return isElement( tagName, CLOSE_TH_TD );
271         return false;
272     }
273 
274 
275     /**
276      * Returns true if the specified attribute it a URI and should be
277      * escaped appropriately. In HTML URIs are escaped differently
278      * than normal attributes.
279      *
280      * @param tagName The element's tag name
281      * @param attrName The attribute's name
282      */
isURI( String tagName, String attrName )283     public static boolean isURI( String tagName, String attrName )
284     {
285         // Stupid checks.
286         return ( attrName.equalsIgnoreCase( "href" ) || attrName.equalsIgnoreCase( "src" ) );
287     }
288 
289 
290     /**
291      * Returns true if the specified attribute is a boolean and should be
292      * printed without the value. This applies to attributes that are true
293      * if they exist, such as selected (OPTION/INPUT).
294      *
295      * @param tagName The element's tag name
296      * @param attrName The attribute's name
297      */
isBoolean( String tagName, String attrName )298     public static boolean isBoolean( String tagName, String attrName )
299     {
300         String[] attrNames;
301 
302         attrNames = _boolAttrs.get( tagName.toUpperCase(Locale.ENGLISH) );
303         if ( attrNames == null )
304             return false;
305         for ( int i = 0 ; i < attrNames.length ; ++i )
306             if ( attrNames[ i ].equalsIgnoreCase( attrName ) )
307                 return true;
308         return false;
309     }
310 
311 
312     /**
313      * Returns the value of an HTML character reference by its name. If the
314      * reference is not found or was not defined as a character reference,
315      * returns EOF (-1).
316      *
317      * @param name Name of character reference
318      * @return Character code or EOF (-1)
319      */
charFromName( String name )320     public static int charFromName( String name )
321     {
322         Object    value;
323 
324         initialize();
325         value = _byName.get( name );
326         if ( value != null && value instanceof Integer )
327             return ( (Integer) value ).intValue();
328         else
329             return -1;
330     }
331 
332 
333     /**
334      * Returns the name of an HTML character reference based on its character
335      * value. Only valid for entities defined from character references. If no
336      * such character value was defined, return null.
337      *
338      * @param value Character value of entity
339      * @return Entity's name or null
340      */
fromChar(int value )341     public static String fromChar(int value )
342     {
343        if (value > 0xffff)
344             return null;
345 
346         String name;
347 
348         initialize();
349         name = _byChar.get(value);
350         return name;
351     }
352 
353 
354     /**
355      * Initialize upon first access. Will load all the HTML character references
356      * into a list that is accessible by name or character value and is optimized
357      * for character substitution. This method may be called any number of times
358      * but will execute only once.
359      */
initialize()360     private static void initialize()
361     {
362         InputStream     is = null;
363         BufferedReader  reader = null;
364         int             index;
365         String          name;
366         String          value;
367         int             code;
368         String          line;
369 
370         // Make sure not to initialize twice.
371         if ( _byName != null )
372             return;
373         try {
374             _byName = new HashMap<>();
375             _byChar = new HashMap<>();
376             is = HTMLdtd.class.getResourceAsStream( ENTITIES_RESOURCE );
377             if ( is == null ) {
378                 throw new RuntimeException(
379                                     DOMMessageFormatter.formatMessage(
380                                     DOMMessageFormatter.SERIALIZER_DOMAIN,
381                     "ResourceNotFound", new Object[] {ENTITIES_RESOURCE}));
382             }
383             reader = new BufferedReader( new InputStreamReader( is, "ASCII" ) );
384             line = reader.readLine();
385             while ( line != null ) {
386                 if ( line.length() == 0 || line.charAt( 0 ) == '#' ) {
387                     line = reader.readLine();
388                     continue;
389                 }
390                 index = line.indexOf( ' ' );
391                 if ( index > 1 ) {
392                     name = line.substring( 0, index );
393                     ++index;
394                     if ( index < line.length() ) {
395                         value = line.substring( index );
396                         index = value.indexOf( ' ' );
397                         if ( index > 0 )
398                             value = value.substring( 0, index );
399                         code = Integer.parseInt( value );
400                                         defineEntity( name, (char) code );
401                     }
402                 }
403                 line = reader.readLine();
404             }
405             is.close();
406         }  catch ( Exception except ) {
407                         throw new RuntimeException(
408                                 DOMMessageFormatter.formatMessage(
409                                 DOMMessageFormatter.SERIALIZER_DOMAIN,
410                 "ResourceNotLoaded", new Object[] {ENTITIES_RESOURCE, except.toString()}));
411         } finally {
412             if ( is != null ) {
413                 try {
414                     is.close();
415                 } catch ( Exception except ) { }
416             }
417         }
418     }
419 
420 
421     /**
422      * Defines a new character reference. The reference's name and value are
423      * supplied. Nothing happens if the character reference is already defined.
424      * <P>
425      * Unlike internal entities, character references are a string to single
426      * character mapping. They are used to map non-ASCII characters both on
427      * parsing and printing, primarily for HTML documents. '&lt;amp;' is an
428      * example of a character reference.
429      *
430      * @param name The entity's name
431      * @param value The entity's value
432      */
defineEntity( String name, char value )433     private static void defineEntity( String name, char value )
434     {
435         if ( _byName.get( name ) == null ) {
436             _byName.put( name, new Integer( value ) );
437             _byChar.put( new Integer( value ), name );
438         }
439     }
440 
441 
defineElement( String name, int flags )442     private static void defineElement( String name, int flags )
443     {
444         _elemDefs.put(name, flags);
445     }
446 
447 
defineBoolean( String tagName, String attrName )448     private static void defineBoolean( String tagName, String attrName )
449     {
450         defineBoolean( tagName, new String[] { attrName } );
451     }
452 
453 
defineBoolean( String tagName, String[] attrNames )454     private static void defineBoolean( String tagName, String[] attrNames )
455     {
456         _boolAttrs.put( tagName, attrNames );
457     }
458 
459 
isElement( String name, int flag )460     private static boolean isElement( String name, int flag )
461     {
462         Integer flags;
463 
464         flags = _elemDefs.get( name.toUpperCase(Locale.ENGLISH) );
465         if ( flags == null )
466             return false;
467         else
468             return ( ( flags.intValue() & flag ) == flag );
469     }
470 
471 
472     static
473     {
474         _elemDefs = new HashMap<>();
475         defineElement( "ADDRESS", CLOSE_P );
476         defineElement( "AREA", EMPTY );
477         defineElement( "BASE",  EMPTY | ALLOWED_HEAD );
478         defineElement( "BASEFONT", EMPTY );
479         defineElement( "BLOCKQUOTE", CLOSE_P );
480         defineElement( "BODY", OPT_CLOSING );
481         defineElement( "BR", EMPTY );
482         defineElement( "COL", EMPTY );
483         defineElement( "COLGROUP", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
484         defineElement( "DD", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
485         defineElement( "DIV", CLOSE_P );
486         defineElement( "DL", ELEM_CONTENT | CLOSE_P );
487         defineElement( "DT", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
488         defineElement( "FIELDSET", CLOSE_P );
489         defineElement( "FORM", CLOSE_P );
490         defineElement( "FRAME", EMPTY | OPT_CLOSING );
491         defineElement( "H1", CLOSE_P );
492         defineElement( "H2", CLOSE_P );
493         defineElement( "H3", CLOSE_P );
494         defineElement( "H4", CLOSE_P );
495         defineElement( "H5", CLOSE_P );
496         defineElement( "H6", CLOSE_P );
497         defineElement( "HEAD", ELEM_CONTENT | OPT_CLOSING );
498         defineElement( "HR", EMPTY | CLOSE_P );
499         defineElement( "HTML", ELEM_CONTENT | OPT_CLOSING );
500         defineElement( "IMG", EMPTY );
501         defineElement( "INPUT", EMPTY );
502         defineElement( "ISINDEX", EMPTY | ALLOWED_HEAD );
503         defineElement( "LI", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
504         defineElement( "LINK", EMPTY | ALLOWED_HEAD );
505         defineElement( "MAP", ALLOWED_HEAD );
506         defineElement( "META", EMPTY | ALLOWED_HEAD );
507         defineElement( "OL", ELEM_CONTENT | CLOSE_P );
508         defineElement( "OPTGROUP", ELEM_CONTENT );
509         defineElement( "OPTION", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
510         defineElement( "P", OPT_CLOSING | CLOSE_P | CLOSE_SELF );
511         defineElement( "PARAM", EMPTY );
512         defineElement( "PRE", PRESERVE | CLOSE_P );
513         defineElement( "SCRIPT", ALLOWED_HEAD | PRESERVE );
514         defineElement( "NOSCRIPT", ALLOWED_HEAD | PRESERVE );
515         defineElement( "SELECT", ELEM_CONTENT );
516         defineElement( "STYLE", ALLOWED_HEAD | PRESERVE );
517         defineElement( "TABLE", ELEM_CONTENT | CLOSE_P );
518         defineElement( "TBODY", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
519         defineElement( "TD", OPT_CLOSING | CLOSE_TH_TD );
520         defineElement( "TEXTAREA", PRESERVE );
521         defineElement( "TFOOT", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
522         defineElement( "TH", OPT_CLOSING | CLOSE_TH_TD );
523         defineElement( "THEAD", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
524         defineElement( "TITLE", ALLOWED_HEAD );
525         defineElement( "TR", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
526         defineElement( "UL", ELEM_CONTENT | CLOSE_P );
527 
528         _boolAttrs = new HashMap<>();
529         defineBoolean( "AREA", "href" );
530         defineBoolean( "BUTTON", "disabled" );
531         defineBoolean( "DIR", "compact" );
532         defineBoolean( "DL", "compact" );
533         defineBoolean( "FRAME", "noresize" );
534         defineBoolean( "HR", "noshade" );
535         defineBoolean( "IMAGE", "ismap" );
536         defineBoolean( "INPUT", new String[] { "defaultchecked", "checked", "readonly", "disabled" } );
537         defineBoolean( "LINK", "link" );
538         defineBoolean( "MENU", "compact" );
539         defineBoolean( "OBJECT", "declare" );
540         defineBoolean( "OL", "compact" );
541         defineBoolean( "OPTGROUP", "disabled" );
542         defineBoolean( "OPTION", new String[] { "default-selected", "selected", "disabled" } );
543         defineBoolean( "SCRIPT", "defer" );
544         defineBoolean( "SELECT", new String[] { "multiple", "disabled" } );
545         defineBoolean( "STYLE", "disabled" );
546         defineBoolean( "TD", "nowrap" );
547         defineBoolean( "TH", "nowrap" );
548         defineBoolean( "TEXTAREA", new String[] { "disabled", "readonly" } );
549         defineBoolean( "UL", "compact" );
550 
initialize()551         initialize();
552     }
553 
554 
555 
556 }
557