<?php

namespace Wikimedia\RemexHtml\Serializer;

use Wikimedia\RemexHtml\DOM\DOMFormatter;
use Wikimedia\RemexHtml\DOM\DOMUtils;
use Wikimedia\RemexHtml\HTMLData;

/**
 * A formatter which follows the HTML 5 fragment serialization algorithm.
 *
 * It can be driven either through the Formatter callbacks (startDocument(),
 * characters(), element(), comment(), doctype()) or by handing it a DOM node
 * via formatDOMNode().
 */
class HtmlFormatter implements Formatter, DOMFormatter {
	/**
	 * The elements for which a closing tag is omitted.
	 */
	protected $voidElements = [
		'area' => true,
		'base' => true,
		'basefont' => true,
		'bgsound' => true,
		'br' => true,
		'col' => true,
		'embed' => true,
		'frame' => true,
		'hr' => true,
		'img' => true,
		'input' => true,
		'keygen' => true,
		'link' => true,
		'menuitem' => true,
		'meta' => true,
		'param' => true,
		'source' => true,
		'track' => true,
		'wbr' => true,
	];

	/**
	 * The elements which need a leading newline in their contents to be
	 * duplicated, since the parser strips a leading newline.
	 */
	protected $prefixLfElements = [
		'pre' => true,
		'textarea' => true,
		'listing' => true
	];

	/**
	 * The elements which have unescaped contents.
	 */
	protected $rawTextElements = [
		'style' => true,
		'script' => true,
		'xmp' => true,
		'iframe' => true,
		'noembed' => true,
		'noframes' => true,
		'plaintext' => true,
	];

	/**
	 * The escape table for attribute values. Keys are raw byte sequences
	 * (the second one is the UTF-8 encoding of U+00A0), values are the
	 * character references they are replaced with.
	 */
	protected $attributeEscapes = [
		'&' => '&amp;',
		"\xc2\xa0" => '&nbsp;',
		'"' => '&quot;',
	];

	/**
	 * The escape table for text nodes. Same layout as $attributeEscapes,
	 * but angle brackets are escaped instead of the double quote.
	 */
	protected $textEscapes = [
		'&' => '&amp;',
		"\xc2\xa0" => '&nbsp;',
		'<' => '&lt;',
		'>' => '&gt;',
	];

	/**
	 * Attribute namespaces which have unqualified local names
	 */
	protected $unqualifiedNamespaces = [
		HTMLData::NS_HTML => true,
		HTMLData::NS_MATHML => true,
		HTMLData::NS_SVG => true,
	];

	/** Whether DOM serialization emits the source doctype instead of "<!DOCTYPE html>" */
	protected $useSourceDoctype;

	/** Whether element and attribute names are passed through DOMUtils::uncoerceName() */
	protected $reverseCoercion;

	/**
	 * Constructor.
	 *
	 * @param array $options An associative array of options:
	 *   - scriptingFlag : Set this to false to disable scripting. True by default.
	 *   - useSourceDoctype : Emit the doctype used in the source. If this is
	 *     false or absent, an HTML doctype will be used.
	 *   - reverseCoercion : When formatting a DOM node, reverse the encoding
	 *     of invalid names. False by default.
	 */
	public function __construct( $options = [] ) {
		$options += [
			'scriptingFlag' => true,
			'useSourceDoctype' => false,
			'reverseCoercion' => false,
		];
		if ( $options['scriptingFlag'] ) {
			// With scripting enabled, <noscript> contents are emitted unescaped
			$this->rawTextElements['noscript'] = true;
		}
		$this->useSourceDoctype = $options['useSourceDoctype'];
		$this->reverseCoercion = $options['reverseCoercion'];
	}

	/**
	 * Get the string which starts the document. Both parameters are unused;
	 * an HTML doctype is always returned.
	 *
	 * @param string|null $fragmentNamespace
	 * @param string|null $fragmentName
	 * @return string
	 */
	public function startDocument( $fragmentNamespace, $fragmentName ) {
		return "<!DOCTYPE html>";
	}

	/**
	 * Serialize the byte range [$start, $start + $length) of $text as a
	 * child of $parent. The text is escaped unless the parent is an HTML
	 * element listed in $rawTextElements.
	 *
	 * @param SerializerNode $parent
	 * @param string $text
	 * @param int $start
	 * @param int $length
	 * @return string
	 */
	public function characters( SerializerNode $parent, $text, $start, $length ) {
		$text = substr( $text, $start, $length );
		if ( $parent->namespace !== HTMLData::NS_HTML
			|| !isset( $this->rawTextElements[$parent->name] )
		) {
			$text = strtr( $text, $this->textEscapes );
		}
		return $text;
	}

	/**
	 * Serialize an element whose contents have already been serialized.
	 *
	 * @param SerializerNode $parent
	 * @param SerializerNode $node
	 * @param string|null $contents The serialized contents
	 * @return string
	 */
	public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
		$name = $node->name;
		$s = "<$name";
		foreach ( $node->attrs->getValues() as $attrName => $attrValue ) {
			$encValue = strtr( $attrValue, $this->attributeEscapes );
			$s .= " $attrName=\"$encValue\"";
		}
		$s .= '>';
		if ( $node->namespace === HTMLData::NS_HTML ) {
			if ( isset( $contents[0] ) && $contents[0] === "\n"
				&& isset( $this->prefixLfElements[$name] )
			) {
				// Double the leading newline so that it survives reparsing
				$s .= "\n$contents</$name>";
			} elseif ( !isset( $this->voidElements[$name] ) ) {
				$s .= "$contents</$name>";
			}
			// Void HTML elements get neither contents nor a closing tag
		} else {
			$s .= "$contents</$name>";
		}
		return $s;
	}

	/**
	 * Serialize a comment.
	 *
	 * @param SerializerNode $parent
	 * @param string $text
	 * @return string
	 */
	public function comment( SerializerNode $parent, $text ) {
		return "<!--$text-->";
	}

	/**
	 * Serialize a doctype token. Nothing is emitted here, since
	 * startDocument() already returns an HTML doctype.
	 *
	 * @param string $name
	 * @param string $public
	 * @param string $system
	 * @return string
	 */
	public function doctype( $name, $public, $system ) {
		return '';
	}

	/**
	 * Recursively serialize a DOM node and its descendants.
	 *
	 * @param \DOMNode $node
	 * @return string
	 */
	public function formatDOMNode( \DOMNode $node ) {
		// Children are serialized first; each case below decides how (and
		// whether) to wrap the result.
		$contents = '';
		if ( $node->firstChild ) {
			foreach ( $node->childNodes as $child ) {
				$contents .= $this->formatDOMNode( $child );
			}
		}

		switch ( $node->nodeType ) {
			case XML_ELEMENT_NODE:
				'@phan-var \DOMElement $node'; /** @var \DOMElement $node */
				return $this->formatDOMElement( $node, $contents );

			case XML_DOCUMENT_NODE:
				if ( !$this->useSourceDoctype ) {
					return "<!DOCTYPE html>" . $contents;
				} else {
					// The doctype, if any, comes from the XML_DOCUMENT_TYPE_NODE child
					return $contents;
				}

			case XML_DOCUMENT_FRAG_NODE:
				return $contents;

			case XML_TEXT_NODE:
				'@phan-var \DOMCharacterData $node'; /** @var \DOMCharacterData $node */
				$text = $node->data;
				$parent = $node->parentNode;
				// A detached text node has no raw text parent, so it is escaped
				if ( $parent === null
					|| $parent->namespaceURI !== HTMLData::NS_HTML
					|| !isset( $this->rawTextElements[$parent->nodeName] )
				) {
					$text = strtr( $text, $this->textEscapes );
				}
				return $text;

			case XML_CDATA_SECTION_NODE:
				'@phan-var \DOMCdataSection $node'; /** @var \DOMCdataSection $node */
				$parent = $node->parentNode;
				if ( $parent !== null && $parent->namespaceURI === HTMLData::NS_HTML ) {
					// CDATA is not allowed in HTML nodes
					return $node->data;
				} else {
					return "<![CDATA[{$node->data}]]>";
				}

			case XML_PI_NODE:
				'@phan-var \DOMProcessingInstruction $node'; /** @var \DOMProcessingInstruction $node */
				return "<?{$node->target} {$node->data}>";

			case XML_COMMENT_NODE:
				'@phan-var \DOMComment $node'; /** @var \DOMComment $node */
				return "<!--{$node->data}-->";

			case XML_DOCUMENT_TYPE_NODE:
				'@phan-var \DOMDocumentType $node'; /** @var \DOMDocumentType $node */
				if ( $this->useSourceDoctype ) {
					return "<!DOCTYPE {$node->name}>";
				} else {
					return '';
				}

			default:
				return '';
		}
	}

	/**
	 * Serialize a DOM element whose contents have already been serialized.
	 *
	 * @param \DOMElement $node
	 * @param string $contents The serialized contents
	 * @return string
	 */
	public function formatDOMElement( \DOMElement $node, $contents ) {
		$ns = $node->namespaceURI;
		// An absent prefix is treated the same whether it is reported as null
		// or as an empty string, as the attribute loop below already does.
		// Otherwise the tag name would come out as ":localName".
		if ( $ns === null
			|| isset( $this->unqualifiedNamespaces[$ns] )
			|| (string)$node->prefix === ''
		) {
			$name = $node->localName;
		} else {
			$name = $node->prefix . ':' . $node->localName;
		}
		if ( $this->reverseCoercion ) {
			$name = DOMUtils::uncoerceName( $name );
		}

		$s = '<' . $name;
		foreach ( $node->attributes as $attr ) {
			switch ( $attr->namespaceURI ) {
				case HTMLData::NS_XML:
					$attrName = 'xml:' . $attr->localName;
					break;
				case HTMLData::NS_XMLNS:
					if ( $attr->localName === 'xmlns' ) {
						$attrName = 'xmlns';
					} else {
						$attrName = 'xmlns:' . $attr->localName;
					}
					break;
				case HTMLData::NS_XLINK:
					$attrName = 'xlink:' . $attr->localName;
					break;
				default:
					if ( (string)$attr->prefix !== '' ) {
						$attrName = $attr->prefix . ':' . $attr->localName;
					} else {
						$attrName = $attr->localName;
					}
			}
			if ( $this->reverseCoercion ) {
				$attrName = DOMUtils::uncoerceName( $attrName );
			}
			$encValue = strtr( $attr->value, $this->attributeEscapes );
			$s .= " $attrName=\"$encValue\"";
		}
		$s .= '>';
		if ( $ns === HTMLData::NS_HTML ) {
			if ( isset( $contents[0] ) && $contents[0] === "\n"
				&& isset( $this->prefixLfElements[$name] )
			) {
				// Double the leading newline so that it survives reparsing
				$s .= "\n$contents</$name>";
			} elseif ( !isset( $this->voidElements[$name] ) ) {
				$s .= "$contents</$name>";
			}
			// Void HTML elements get neither contents nor a closing tag
		} else {
			$s .= "$contents</$name>";
		}
		return $s;
	}
}

// Retain the old namespace for backwards compatibility.
class_alias( HtmlFormatter::class, 'RemexHtml\Serializer\HtmlFormatter' );