1<?php
2
// If we want to implement error collecting here, we'll need to use some sort
// of global data (probably trigger_error) because it's impossible to pass
// $config or $context to the callback functions.
6
7/**
 * Handles referencing and dereferencing character entities
9 */
class HTMLPurifier_EntityParser
{

    /**
     * Reference to entity lookup table.
     * @type HTMLPurifier_EntityLookup
     */
    protected $_entity_lookup;

    /**
     * Callback regex string for entities in text.
     * @type string
     */
    protected $_textEntitiesRegex;

    /**
     * Callback regex string for entities in attributes.
     * @type string
     */
    protected $_attrEntitiesRegex;

    /**
     * Regex that matches a semicolon-optional entity name directly after
     * an ampersand. It carries three empty capture groups so that the
     * name lands in capture 4, as entityCallback() expects.
     * @type string
     */
    protected $_semiOptionalPrefixRegex;

    /**
     * Builds the regexes used by substituteTextEntities() and
     * substituteAttrEntities().
     */
    public function __construct()
    {
        // From
        // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon
        $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";

        // NB: three empty captures to put the fourth match in the right
        // place
        $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";

        // Alternatives shared by the text and the attribute regex
        // (captures 1 to 3).
        $common =
            // hex; the "x" marker is case-insensitive in HTML, so both
            // &#x26; and &#X26; must be recognized
            '[#][xX]([a-fA-F0-9]+);?|'.
            // dec
            '[#]0*(\d+);?|'.
            // string (mandatory semicolon)
            // NB: order matters: match semicolon preferentially
            '([A-Za-z_:][A-Za-z0-9.\-_:]*);|';

        $this->_textEntitiesRegex =
            '/&(?:'.
            $common.
            // string (optional semicolon)
            "($semi_optional)".
            ')/';

        $this->_attrEntitiesRegex =
            '/&(?:'.
            $common.
            // string (optional semicolon)
            // don't match if trailing is equals or alphanumeric (URL
            // like)
            "($semi_optional)(?![=;A-Za-z0-9])".
            ')/';
    }

    /**
     * Substitute entities with the parsed equivalents.  Use this on
     * textual data in an HTML document (as opposed to attributes.)
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteTextEntities($string)
    {
        return preg_replace_callback(
            $this->_textEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Substitute entities with the parsed equivalents.  Use this on
     * attribute contents in documents.
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteAttrEntities($string)
    {
        return preg_replace_callback(
            $this->_attrEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteTextEntities() and
     * substituteAttrEntities() that does the work.
     *
     * @param array $matches  PCRE matches array, with 0 the entire match, and
     *                  one of index 1 (hex value), 2 (dec value), 3 (name
     *                  followed by a semicolon) or 4 (semicolon-optional
     *                  name) non-empty.
     * @return string Replacement string.
     */
    protected function entityCallback($matches)
    {
        $entity = $matches[0];
        // PCRE omits trailing groups that did not participate in the
        // match, so every index above 0 may be missing.
        $hex_part = isset($matches[1]) ? $matches[1] : '';
        $dec_part = isset($matches[2]) ? $matches[2] : '';
        $named_part = empty($matches[3]) ? (empty($matches[4]) ? "" : $matches[4]) : $matches[3];

        if ($hex_part !== '') {
            return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
        }
        if ($dec_part !== '') {
            return HTMLPurifier_Encoder::unichr((int) $dec_part);
        }

        $replacement = $this->lookupNamedEntity($named_part);
        if ($replacement !== null) {
            return $replacement;
        }

        // exact match didn't match anything, so test if
        // any of the semicolon optional match the prefix.
        // Test that this is an EXACT match is important to
        // prevent infinite loop
        if (!empty($matches[3])) {
            return preg_replace_callback(
                $this->_semiOptionalPrefixRegex,
                array($this, 'entityCallback'),
                $entity
            );
        }
        return $entity;
    }

    /**
     * Looks up a named entity in the entity lookup table, fetching the
     * table instance on first use.
     *
     * @param string $name Entity name without ampersand and semicolon.
     * @return string|null Table entry, or null if the name is not set
     *                  in the table.
     */
    protected function lookupNamedEntity($name)
    {
        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        return isset($this->_entity_lookup->table[$name]) ?
            $this->_entity_lookup->table[$name] :
            null;
    }

    // LEGACY CODE BELOW

    /**
     * Callback regex string for parsing entities.
     * @type string
     */
    protected $_substituteEntitiesRegex =
        '/&(?:[#][xX]([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
        //     1. hex                2. dec      3. string (XML style)

    /**
     * Decimal to parsed string conversion table for special entities.
     * @type array
     */
    protected $_special_dec2str =
            array(
                    34 => '"',
                    38 => '&',
                    39 => "'",
                    60 => '<',
                    62 => '>'
            );

    /**
     * Stripped entity names to decimal conversion table for special entities.
     * @type array
     */
    protected $_special_ent2dec =
            array(
                    'quot' => 34,
                    'amp'  => 38,
                    'lt'   => 60,
                    'gt'   => 62
            );

    /**
     * Substitutes non-special entities with their parsed equivalents. Since
     * running this whenever you have parsed character is t3h 5uck, we run
     * it before everything else.
     *
     * @param string $string String to have non-special entities parsed.
     * @return string Parsed string.
     */
    public function substituteNonSpecialEntities($string)
    {
        // it will try to detect missing semicolons, but don't rely on it
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'nonSpecialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteNonSpecialEntities() that does the work.
     *
     * @param array $matches  PCRE matches array, with 0 the entire match, and
     *                  either index 1, 2 or 3 set with a hex value, dec value,
     *                  or string (respectively).
     * @return string Replacement string.
     */
    protected function nonSpecialEntityCallback($matches)
    {
        // replaces all but big five
        $entity = $matches[0];
        $code = $this->parseNumericEntity($matches);
        if ($code !== null) {
            // abort for special characters; a float code (overflowed
            // hexdec()) is never one of them, so don't use it as a key
            if (is_int($code) && isset($this->_special_dec2str[$code])) {
                return $entity;
            }
            return HTMLPurifier_Encoder::unichr($code);
        }

        $name = $matches[3];
        if (isset($this->_special_ent2dec[$name])) {
            return $entity;
        }
        $replacement = $this->lookupNamedEntity($name);
        return $replacement !== null ? $replacement : $entity;
    }

    /**
     * Substitutes only special entities with their parsed equivalents.
     *
     * @notice We try to avoid calling this function because otherwise, it
     * would have to be called a lot (for every parsed section).
     *
     * @param string $string String to have special entities parsed.
     * @return string Parsed string.
     */
    public function substituteSpecialEntities($string)
    {
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'specialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteSpecialEntities() that does the work.
     *
     * This callback has same syntax as nonSpecialEntityCallback().
     *
     * @param array $matches  PCRE-style matches array, with 0 the entire match, and
     *                  either index 1, 2 or 3 set with a hex value, dec value,
     *                  or string (respectively).
     * @return string Replacement string.
     */
    protected function specialEntityCallback($matches)
    {
        $entity = $matches[0];
        $code = $this->parseNumericEntity($matches);
        if ($code !== null) {
            // a float code (overflowed hexdec()) cannot be a special
            // character, so it is left untouched
            return (is_int($code) && isset($this->_special_dec2str[$code])) ?
                $this->_special_dec2str[$code] :
                $entity;
        }

        $name = $matches[3];
        return isset($this->_special_ent2dec[$name]) ?
            $this->_special_dec2str[$this->_special_ent2dec[$name]] :
            $entity;
    }

    /**
     * Extracts the code point of a numeric character reference from a
     * match produced by $_substituteEntitiesRegex.
     *
     * @param array $matches  PCRE matches array, with index 1 holding a hex
     *                  value or index 2 holding a dec value.
     * @return int|float|null Code point (float when hexdec() overflows the
     *                  integer range), or null for a named entity.
     */
    protected function parseNumericEntity($matches)
    {
        if (isset($matches[1]) && $matches[1] !== '') {
            return hexdec($matches[1]);
        }
        if (isset($matches[2]) && $matches[2] !== '') {
            return (int) $matches[2];
        }
        return null;
    }
}
284
285// vim: et sw=4 sts=4
286