1<?php
2
3// warning: this file is encoded in UTF-8!
4
/**
 * Static lookup data and helpers for resolving HTML character references:
 * a replacement table for certain numeric references, a lazily loaded
 * table of named references, and a code point to UTF-8 encoder.
 */
class HTML5_Data
{

    // at some point this should be moved to a .ser file. Another
    // possible optimization is to give UTF-8 bytes, not Unicode
    // codepoints
    // XXX: Not quite sure why it's named this; this is
    // actually the numeric entity dereference table.
    //
    // Keys are the numbers as written in a numeric character reference;
    // values are the Unicode code points to use in their place. The
    // 0x80-0x9F rows that have a printable replacement carry it in the
    // trailing comment; the rows marked <control> map to themselves.
    protected static $realCodepointTable = array(
        0x00 => 0xFFFD, // REPLACEMENT CHARACTER
        0x0D => 0x000A, // LINE FEED (LF)
        0x80 => 0x20AC, // EURO SIGN ('€')
        0x81 => 0x0081, // <control>
        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
        0x86 => 0x2020, // DAGGER ('†')
        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
        0x89 => 0x2030, // PER MILLE SIGN ('‰')
        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
        0x8D => 0x008D, // <control>
        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
        0x8F => 0x008F, // <control>
        0x90 => 0x0090, // <control>
        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
        0x95 => 0x2022, // BULLET ('•')
        0x96 => 0x2013, // EN DASH ('–')
        0x97 => 0x2014, // EM DASH ('—')
        0x98 => 0x02DC, // SMALL TILDE ('˜')
        0x99 => 0x2122, // TRADE MARK SIGN ('™')
        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
        0x9D => 0x009D, // <control>
        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
    );

    // Cache for getNamedCharacterReferences(); stays unset until the
    // first call to that method.
    protected static $namedCharacterReferences;

    // Not read or written by any method of this class; presumably the
    // length of the longest named reference -- TODO confirm against
    // whatever code populates it.
    protected static $namedCharacterReferenceMaxLength;

    /**
     * Returns the "real" Unicode codepoint of a malformed character
     * reference.
     *
     * @param int $ref Number taken from a numeric character reference.
     * @return int|false The replacement code point when $ref has an entry
     *         in the table above, false when it has none.
     */
    public static function getRealCodepoint($ref) {
        if (isset(self::$realCodepointTable[$ref])) {
            return self::$realCodepointTable[$ref];
        }
        return false;
    }

    /**
     * Returns the named character reference table, reading and
     * unserializing named-character-references.ser (located in the same
     * directory as this file) on first use.
     *
     * The result is cached in a static property. The cache test is a
     * truthiness check, so a failed or empty load is not remembered and
     * the file is read again on the next call.
     *
     * @return mixed Whatever the .ser file unserializes to (its structure
     *         is not visible from this class), or false if it could not
     *         be read or unserialized.
     */
    public static function getNamedCharacterReferences() {
        if (!self::$namedCharacterReferences) {
            self::$namedCharacterReferences = unserialize(
                file_get_contents(dirname(__FILE__) . '/named-character-references.ser'));
        }
        return self::$namedCharacterReferences;
    }

    /**
     * Converts a Unicode codepoint to sequence of UTF-8 bytes.
     *
     * Values that cannot be represented in well-formed UTF-8 (negative
     * numbers, anything above U+10FFFF, and the surrogate range
     * U+D800..U+DFFF) yield the encoding of U+FFFD REPLACEMENT CHARACTER.
     *
     * @note Shamelessly stolen from HTML Purifier, which is also
     *       shamelessly stolen from Feyd (which is in public domain).
     *
     * @param int $code Unicode code point.
     * @return string One to four bytes of UTF-8.
     */
    public static function utf8chr($code) {
        // Without this guard the bit arithmetic below would still produce
        // bytes, but they would not be valid UTF-8.
        if ($code < 0x0 || $code > 0x10FFFF ||
            ($code >= 0xD800 && $code <= 0xDFFF)) {
            return "\xEF\xBF\xBD";
        }

        if ($code < 0x80) {
            // regular ASCII character: a single byte, including NUL
            return chr($code);
        }

        if ($code < 0x800) {
            // two bytes: 110xxxxx 10xxxxxx
            return chr(0xC0 | ($code >> 6))
                 . chr(0x80 | ($code & 0x3F));
        }

        if ($code < 0x10000) {
            // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            return chr(0xE0 | ($code >> 12))
                 . chr(0x80 | (($code >> 6) & 0x3F))
                 . chr(0x80 | ($code & 0x3F));
        }

        // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        return chr(0xF0 | ($code >> 18))
             . chr(0x80 | (($code >> 12) & 0x3F))
             . chr(0x80 | (($code >> 6) & 0x3F))
             . chr(0x80 | ($code & 0x3F));
    }

}
124