1 /*
2  * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
3  */
4 /*
5  * Licensed to the Apache Software Foundation (ASF) under one or more
6  * contributor license agreements.  See the NOTICE file distributed with
7  * this work for additional information regarding copyright ownership.
8  * The ASF licenses this file to You under the Apache License, Version 2.0
9  * (the "License"); you may not use this file except in compliance with
10  * the License.  You may obtain a copy of the License at
11  *
12  *      http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  */
20 
21 
22 // Aug 21, 2000:
23 //  Added ability to omit DOCTYPE declaration.
24 //  Reported by Lars Martin <lars@smb-tec.com>
25 // Aug 25, 2000:
26 //  Added ability to omit comments.
27 //  Contributed by Anupam Bagchi <abagchi@jtcsv.com>
28 
29 
30 package com.sun.org.apache.xml.internal.serialize;
31 
32 
33 import java.io.UnsupportedEncodingException;
34 
35 import org.w3c.dom.Document;
36 import org.w3c.dom.DocumentType;
37 import org.w3c.dom.Node;
38 
39 
40 /**
41  * Specifies an output format to control the serializer. Based on the
42  * XSLT specification for output format, plus additional parameters.
43  * Used to select the suitable serializer and determine how the
44  * document should be formatted on output.
45  * <p>
46  * The two interesting constructors are:
47  * <ul>
48  * <li>{@link #OutputFormat(String,String,boolean)} creates a format
49  *  for the specified method (XML, HTML, Text, etc), encoding and indentation
50  * <li>{@link #OutputFormat(Document,String,boolean)} creates a format
51  *  compatible with the document type (XML, HTML, Text, etc), encoding and
52  *  indentation
53  * </ul>
54  *
55  *
56  * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
57  *         <a href="mailto:visco@intalio.com">Keith Visco</a>
58  * @see Serializer
59  * @see Method
60  * @see LineSeparator
61  *
62  * @deprecated As of JDK 9, Xerces 2.9.0, Xerces DOM L3 Serializer implementation
63  * is replaced by that of Xalan. Main class
64  * {@link com.sun.org.apache.xml.internal.serialize.DOMSerializerImpl} is replaced
65  * by {@link com.sun.org.apache.xml.internal.serializer.dom3.LSSerializerImpl}.
66  */
67 @Deprecated
68 public class OutputFormat
69 {
70 
71 
72     public static class DTD
73     {
74 
75         /**
76          * Public identifier for HTML 4.01 (Strict) document type.
77          */
78         public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN";
79 
80         /**
81          * System identifier for HTML 4.01 (Strict) document type.
82          */
83         public static final String HTMLSystemId =
84             "http://www.w3.org/TR/html4/strict.dtd";
85 
86         /**
87          * Public identifier for XHTML 1.0 (Strict) document type.
88          */
89         public static final String XHTMLPublicId =
90             "-//W3C//DTD XHTML 1.0 Strict//EN";
91 
92         /**
93          * System identifier for XHTML 1.0 (Strict) document type.
94          */
95         public static final String XHTMLSystemId =
96             "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
97 
98     }
99 
100 
101     public static class Defaults
102     {
103 
104         /**
105          * If indentation is turned on, the default identation
106          * level is 4.
107          *
108          * @see #setIndenting(boolean)
109          */
110         public static final int Indent = 4;
111 
112         /**
113          * The default encoding for Web documents it UTF-8.
114          *
115          * @see #getEncoding()
116          */
117         public static final String Encoding = "UTF-8";
118 
119         /**
120          * The default line width at which to break long lines
121          * when identing. This is set to 72.
122          */
123         public static final int LineWidth = 72;
124 
125     }
126 
127 
128     /**
129      * Holds the output method specified for this document,
130      * or null if no method was specified.
131      */
132     private String _method;
133 
134 
135     /**
136      * Specifies the version of the output method.
137      */
138     private String _version;
139 
140 
141     /**
142      * The indentation level, or zero if no indentation
143      * was requested.
144      */
145     private int _indent = 0;
146 
147 
148     /**
149      * The encoding to use, if an input stream is used.
150      * The default is always UTF-8.
151      */
152     private String _encoding = Defaults.Encoding;
153 
154     /**
155      * The EncodingInfo instance for _encoding.
156      */
157     private EncodingInfo _encodingInfo = null;
158 
159     // whether java names for encodings are permitted
160     private boolean _allowJavaNames = false;
161 
162     /**
163      * The specified media type or null.
164      */
165     private String _mediaType;
166 
167 
168     /**
169      * The specified document type system identifier, or null.
170      */
171     private String _doctypeSystem;
172 
173 
174     /**
175      * The specified document type public identifier, or null.
176      */
177     private String _doctypePublic;
178 
179 
180     /**
181      * Ture if the XML declaration should be ommited;
182      */
183     private boolean _omitXmlDeclaration = false;
184 
185 
186     /**
187      * Ture if the DOCTYPE declaration should be ommited;
188      */
189     private boolean _omitDoctype = false;
190 
191 
192     /**
193      * Ture if comments should be ommited;
194      */
195     private boolean _omitComments = false;
196 
197 
198     /**
199      * Ture if the comments should be ommited;
200      */
201     private boolean _stripComments = false;
202 
203 
204     /**
205      * True if the document type should be marked as standalone.
206      */
207     private boolean _standalone = false;
208 
209 
210     /**
211      * List of element tag names whose text node children must
212      * be output as CDATA.
213      */
214     private String[] _cdataElements;
215 
216 
217     /**
218      * List of element tag names whose text node children must
219      * be output unescaped.
220      */
221     private String[] _nonEscapingElements;
222 
223 
224     /**
225      * The selected line separator.
226      */
227     private String _lineSeparator = LineSeparator.Web;
228 
229 
230     /**
231      * The line width at which to wrap long lines when indenting.
232      */
233     private int _lineWidth = Defaults.LineWidth;
234 
235 
236     /**
237      * True if spaces should be preserved in elements that do not
238      * specify otherwise, or specify the default behavior.
239      */
240     private boolean _preserve = false;
241         /** If true, an empty string valued attribute is output as "". If false and
242          * and we are using the HTMLSerializer, then only the attribute name is
243          * serialized. Defaults to false for backwards compatibility.
244          */
245         private boolean _preserveEmptyAttributes = false;
246 
247     /**
248      * Constructs a new output format with the default values.
249      */
OutputFormat()250     public OutputFormat()
251     {
252     }
253 
254 
255     /**
256      * Constructs a new output format with the default values for
257      * the specified method and encoding. If <tt>indent</tt>
258      * is true, the document will be pretty printed with the default
259      * indentation level and default line wrapping.
260      *
261      * @param method The specified output method
262      * @param encoding The specified encoding
263      * @param indenting True for pretty printing
264      * @see #setEncoding
265      * @see #setIndenting
266      * @see #setMethod
267      */
OutputFormat( String method, String encoding, boolean indenting )268     public OutputFormat( String method, String encoding, boolean indenting )
269     {
270         setMethod( method );
271         setEncoding( encoding );
272         setIndenting( indenting );
273     }
274 
275     /**
276      * Returns the method specified for this output format.
277      * Typically the method will be <tt>xml</tt>, <tt>html</tt>
278      * or <tt>text</tt>, but it might be other values.
279      * If no method was specified, null will be returned
280      * and the most suitable method will be determined for
281      * the document by calling {@link #whichMethod}.
282      *
283      * @return The specified output method, or null
284      */
getMethod()285     public String getMethod()
286     {
287         return _method;
288     }
289 
290 
291     /**
292      * Sets the method for this output format.
293      *
294      * @see #getMethod
295      * @param method The output method, or null
296      */
setMethod( String method )297     public void setMethod( String method )
298     {
299         _method = method;
300     }
301 
302 
303     /**
304      * Returns the version for this output method.
305      * If no version was specified, will return null
306      * and the default version number will be used.
307      * If the serializerr does not support that particular
308      * version, it should default to a supported version.
309      *
310      * @return The specified method version, or null
311      */
getVersion()312     public String getVersion()
313     {
314         return _version;
315     }
316 
317 
318     /**
319      * Sets the version for this output method.
320      * For XML the value would be "1.0", for HTML
321      * it would be "4.0".
322      *
323      * @see #getVersion
324      * @param version The output method version, or null
325      */
setVersion( String version )326     public void setVersion( String version )
327     {
328         _version = version;
329     }
330 
331 
332     /**
333      * Returns the indentation specified. If no indentation
334      * was specified, zero is returned and the document
335      * should not be indented.
336      *
337      * @return The indentation or zero
338      * @see #setIndenting
339      */
getIndent()340     public int getIndent()
341     {
342         return _indent;
343     }
344 
345 
346     /**
347      * Returns true if indentation was specified.
348      */
getIndenting()349     public boolean getIndenting()
350     {
351         return ( _indent > 0 );
352     }
353 
354 
355     /**
356      * Sets the indentation. The document will not be
357      * indented if the indentation is set to zero.
358      * Calling {@link #setIndenting} will reset this
359      * value to zero (off) or the default (on).
360      *
361      * @param indent The indentation, or zero
362      */
setIndent( int indent )363     public void setIndent( int indent )
364     {
365         if ( indent < 0 )
366             _indent = 0;
367         else
368             _indent = indent;
369     }
370 
371 
372     /**
373      * Sets the indentation on and off. When set on, the default
374      * indentation level and default line wrapping is used
375      * (see {@link Defaults#Indent} and {@link Defaults#LineWidth}).
376      * To specify a different indentation level or line wrapping,
377      * use {@link #setIndent} and {@link #setLineWidth}.
378      *
379      * @param on True if indentation should be on
380      */
setIndenting( boolean on )381     public void setIndenting( boolean on )
382     {
383         if ( on ) {
384             _indent = Defaults.Indent;
385             _lineWidth = Defaults.LineWidth;
386         } else {
387             _indent = 0;
388             _lineWidth = 0;
389         }
390     }
391 
392 
393     /**
394      * Returns the specified encoding. If no encoding was
395      * specified, the default is always "UTF-8".
396      *
397      * @return The encoding
398      */
getEncoding()399     public String getEncoding()
400     {
401         return _encoding;
402     }
403 
404 
405     /**
406      * Sets the encoding for this output method. If no
407      * encoding was specified, the default is always "UTF-8".
408      * Make sure the encoding is compatible with the one
409      * used by the {@link java.io.Writer}.
410      *
411      * @see #getEncoding
412      * @param encoding The encoding, or null
413      */
setEncoding( String encoding )414     public void setEncoding( String encoding )
415     {
416         _encoding = encoding;
417         _encodingInfo = null;
418     }
419 
420     /**
421      * Sets the encoding for this output method with an <code>EncodingInfo</code>
422      * instance.
423      */
setEncoding(EncodingInfo encInfo)424     public void setEncoding(EncodingInfo encInfo) {
425         _encoding = encInfo.getIANAName();
426         _encodingInfo = encInfo;
427     }
428 
429     /**
430      * Returns an <code>EncodingInfo<code> instance for the encoding.
431      *
432      * @see #setEncoding
433      */
getEncodingInfo()434     public EncodingInfo getEncodingInfo() throws UnsupportedEncodingException {
435         if (_encodingInfo == null)
436             _encodingInfo = Encodings.getEncodingInfo(_encoding, _allowJavaNames);
437         return _encodingInfo;
438     }
439 
440     /**
441      * Sets whether java encoding names are permitted
442      */
setAllowJavaNames(boolean allow)443     public void setAllowJavaNames (boolean allow) {
444         _allowJavaNames = allow;
445     }
446 
447     /**
448      * Returns whether java encoding names are permitted
449      */
setAllowJavaNames()450     public boolean setAllowJavaNames () {
451         return _allowJavaNames;
452     }
453 
454     /**
455      * Returns the specified media type, or null.
456      * To determine the media type based on the
457      * document type, use {@link #whichMediaType}.
458      *
459      * @return The specified media type, or null
460      */
getMediaType()461     public String getMediaType()
462     {
463         return _mediaType;
464     }
465 
466 
467     /**
468      * Sets the media type.
469      *
470      * @see #getMediaType
471      * @param mediaType The specified media type
472      */
setMediaType( String mediaType )473     public void setMediaType( String mediaType )
474     {
475         _mediaType = mediaType;
476     }
477 
478 
479     /**
480      * Sets the document type public and system identifiers.
481      * Required only if the DOM Document or SAX events do not
482      * specify the document type, and one must be present in
483      * the serialized document. Any document type specified
484      * by the DOM Document or SAX events will override these
485      * values.
486      *
487      * @param publicId The public identifier, or null
488      * @param systemId The system identifier, or null
489      */
setDoctype( String publicId, String systemId )490     public void setDoctype( String publicId, String systemId )
491     {
492         _doctypePublic = publicId;
493         _doctypeSystem = systemId;
494     }
495 
496 
497     /**
498      * Returns the specified document type public identifier,
499      * or null.
500      */
getDoctypePublic()501     public String getDoctypePublic()
502     {
503         return _doctypePublic;
504     }
505 
506 
507     /**
508      * Returns the specified document type system identifier,
509      * or null.
510      */
getDoctypeSystem()511     public String getDoctypeSystem()
512     {
513         return _doctypeSystem;
514     }
515 
516 
517     /**
518      * Returns true if comments should be ommited.
519      * The default is false.
520      */
getOmitComments()521     public boolean getOmitComments()
522     {
523         return _omitComments;
524     }
525 
526 
527     /**
528      * Sets comment omitting on and off.
529      *
530      * @param omit True if comments should be ommited
531      */
setOmitComments( boolean omit )532     public void setOmitComments( boolean omit )
533     {
534         _omitComments = omit;
535     }
536 
537 
538     /**
539      * Returns true if the DOCTYPE declaration should
540      * be ommited. The default is false.
541      */
getOmitDocumentType()542     public boolean getOmitDocumentType()
543     {
544         return _omitDoctype;
545     }
546 
547 
548     /**
549      * Sets DOCTYPE declaration omitting on and off.
550      *
551      * @param omit True if DOCTYPE declaration should be ommited
552      */
setOmitDocumentType( boolean omit )553     public void setOmitDocumentType( boolean omit )
554     {
555         _omitDoctype = omit;
556     }
557 
558 
559     /**
560      * Returns true if the XML document declaration should
561      * be ommited. The default is false.
562      */
getOmitXMLDeclaration()563     public boolean getOmitXMLDeclaration()
564     {
565         return _omitXmlDeclaration;
566     }
567 
568 
569     /**
570      * Sets XML declaration omitting on and off.
571      *
572      * @param omit True if XML declaration should be ommited
573      */
setOmitXMLDeclaration( boolean omit )574     public void setOmitXMLDeclaration( boolean omit )
575     {
576         _omitXmlDeclaration = omit;
577     }
578 
579 
580     /**
581      * Returns true if the document type is standalone.
582      * The default is false.
583      */
getStandalone()584     public boolean getStandalone()
585     {
586         return _standalone;
587     }
588 
589 
590     /**
591      * Sets document DTD standalone. The public and system
592      * identifiers must be null for the document to be
593      * serialized as standalone.
594      *
595      * @param standalone True if document DTD is standalone
596      */
setStandalone( boolean standalone )597     public void setStandalone( boolean standalone )
598     {
599         _standalone = standalone;
600     }
601 
602 
603     /**
604      * Returns a list of all the elements whose text node children
605      * should be output as CDATA, or null if no such elements were
606      * specified.
607      */
getCDataElements()608     public String[] getCDataElements()
609     {
610         return _cdataElements;
611     }
612 
613 
614     /**
615      * Returns true if the text node children of the given elements
616      * should be output as CDATA.
617      *
618      * @param tagName The element's tag name
619      * @return True if should serialize as CDATA
620      */
isCDataElement( String tagName )621     public boolean isCDataElement( String tagName )
622     {
623         int i;
624 
625         if ( _cdataElements == null )
626             return false;
627         for ( i = 0 ; i < _cdataElements.length ; ++i )
628             if ( _cdataElements[ i ].equals( tagName ) )
629                 return true;
630         return false;
631     }
632 
633 
634     /**
635      * Sets the list of elements for which text node children
636      * should be output as CDATA.
637      *
638      * @param cdataElements List of CDATA element tag names
639      */
setCDataElements( String[] cdataElements )640     public void setCDataElements( String[] cdataElements )
641     {
642         _cdataElements = cdataElements;
643     }
644 
645 
646     /**
647      * Returns a list of all the elements whose text node children
648      * should be output unescaped (no character references), or null
649      * if no such elements were specified.
650      */
getNonEscapingElements()651     public String[] getNonEscapingElements()
652     {
653         return _nonEscapingElements;
654     }
655 
656 
657     /**
658      * Returns true if the text node children of the given elements
659      * should be output unescaped.
660      *
661      * @param tagName The element's tag name
662      * @return True if should serialize unescaped
663      */
isNonEscapingElement( String tagName )664     public boolean isNonEscapingElement( String tagName )
665     {
666         int i;
667 
668         if ( _nonEscapingElements == null ) {
669             return false;
670         }
671         for ( i = 0 ; i < _nonEscapingElements.length ; ++i )
672             if ( _nonEscapingElements[ i ].equals( tagName ) )
673                 return true;
674         return false;
675     }
676 
677 
678     /**
679      * Sets the list of elements for which text node children
680      * should be output unescaped (no character references).
681      *
682      * @param nonEscapingElements List of unescaped element tag names
683      */
setNonEscapingElements( String[] nonEscapingElements )684     public void setNonEscapingElements( String[] nonEscapingElements )
685     {
686         _nonEscapingElements = nonEscapingElements;
687     }
688 
689 
690 
691     /**
692      * Returns a specific line separator to use. The default is the
693      * Web line separator (<tt>\n</tt>). A string is returned to
694      * support double codes (CR + LF).
695      *
696      * @return The specified line separator
697      */
getLineSeparator()698     public String getLineSeparator()
699     {
700         return _lineSeparator;
701     }
702 
703 
704     /**
705      * Sets the line separator. The default is the Web line separator
706      * (<tt>\n</tt>). The machine's line separator can be obtained
707      * from the system property <tt>line.separator</tt>, but is only
708      * useful if the document is edited on machines of the same type.
709      * For general documents, use the Web line separator.
710      *
711      * @param lineSeparator The specified line separator
712      */
setLineSeparator( String lineSeparator )713     public void setLineSeparator( String lineSeparator )
714     {
715         if ( lineSeparator == null )
716             _lineSeparator =  LineSeparator.Web;
717         else
718             _lineSeparator = lineSeparator;
719     }
720 
721 
722     /**
723      * Returns true if the default behavior for this format is to
724      * preserve spaces. All elements that do not specify otherwise
725      * or specify the default behavior will be formatted based on
726      * this rule. All elements that specify space preserving will
727      * always preserve space.
728      */
getPreserveSpace()729     public boolean getPreserveSpace()
730     {
731         return _preserve;
732     }
733 
734 
735     /**
736      * Sets space preserving as the default behavior. The default is
737      * space stripping and all elements that do not specify otherwise
738      * or use the default value will not preserve spaces.
739      *
740      * @param preserve True if spaces should be preserved
741      */
setPreserveSpace( boolean preserve )742     public void setPreserveSpace( boolean preserve )
743     {
744         _preserve = preserve;
745     }
746 
747 
748     /**
749      * Return the selected line width for breaking up long lines.
750      * When indenting, and only when indenting, long lines will be
751      * broken at space boundaries based on this line width.
752      * No line wrapping occurs if this value is zero.
753      */
getLineWidth()754     public int getLineWidth()
755     {
756         return _lineWidth;
757     }
758 
759 
760     /**
761      * Sets the line width. If zero then no line wrapping will
762      * occur. Calling {@link #setIndenting} will reset this
763      * value to zero (off) or the default (on).
764      *
765      * @param lineWidth The line width to use, zero for default
766      * @see #getLineWidth
767      * @see #setIndenting
768      */
setLineWidth( int lineWidth )769     public void setLineWidth( int lineWidth )
770     {
771         if ( lineWidth <= 0 )
772             _lineWidth = 0;
773         else
774             _lineWidth = lineWidth;
775     }
776         /**
777          * Returns the preserveEmptyAttribute flag. If flag is false, then'
778          * attributes with empty string values are output as the attribute
779          * name only (in HTML mode).
780          * @return preserve the preserve flag
getPreserveEmptyAttributes()781          */     public boolean getPreserveEmptyAttributes () {          return _preserveEmptyAttributes;        }       /**
782          * Sets the preserveEmptyAttribute flag. If flag is false, then'
783          * attributes with empty string values are output as the attribute
784          * name only (in HTML mode).
785          * @param preserve the preserve flag
setPreserveEmptyAttributes(boolean preserve)786          */     public void setPreserveEmptyAttributes (boolean preserve) {             _preserveEmptyAttributes = preserve;    }
787 
788     /**
789      * Returns the last printable character based on the selected
790      * encoding. Control characters and non-printable characters
791      * are always printed as character references.
792      */
getLastPrintable()793     public char getLastPrintable()
794     {
795         if ( getEncoding() != null &&
796              ( getEncoding().equalsIgnoreCase( "ASCII" ) ) )
797             return 0xFF;
798         else
799             return 0xFFFF;
800     }
801 
802 
803     /**
804      * Returns the suitable media format for a document
805      * output with the specified method.
806      */
whichMediaType( String method )807     public static String whichMediaType( String method )
808     {
809         if ( method.equalsIgnoreCase( Method.XML ) )
810             return "text/xml";
811         if ( method.equalsIgnoreCase( Method.HTML ) )
812             return "text/html";
813         if ( method.equalsIgnoreCase( Method.XHTML ) )
814             return "text/html";
815         if ( method.equalsIgnoreCase( Method.TEXT ) )
816             return "text/plain";
817         if ( method.equalsIgnoreCase( Method.FOP ) )
818             return "application/pdf";
819         return null;
820     }
821 
822 
823 }
824