<?php

// warning: this file is encoded in UTF-8!

/**
 * Static lookup data and small helpers for resolving character references:
 * a remapping table for numeric references, a lazily loaded table of named
 * references, and a codepoint-to-UTF-8 encoder.
 */
class HTML5_Data
{

    // at some point this should be moved to a .ser file. Another
    // possible optimization is to give UTF-8 bytes, not Unicode
    // codepoints
    // XXX: Not quite sure why it's named this; this is
    // actually the numeric entity dereference table.
    // Keys are the codepoints as written in a numeric reference; values are
    // the codepoints they are replaced with.
    protected static $realCodepointTable = array(
        0x00 => 0xFFFD, // REPLACEMENT CHARACTER
        0x0D => 0x000A, // LINE FEED (LF)
        0x80 => 0x20AC, // EURO SIGN ('€')
        0x81 => 0x0081, // <control>
        0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('‚')
        0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
        0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
        0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
        0x86 => 0x2020, // DAGGER ('†')
        0x87 => 0x2021, // DOUBLE DAGGER ('‡')
        0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
        0x89 => 0x2030, // PER MILLE SIGN ('‰')
        0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
        0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('‹')
        0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
        0x8D => 0x008D, // <control>
        0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
        0x8F => 0x008F, // <control>
        0x90 => 0x0090, // <control>
        0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('‘')
        0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('’')
        0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
        0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
        0x95 => 0x2022, // BULLET ('•')
        0x96 => 0x2013, // EN DASH ('–')
        0x97 => 0x2014, // EM DASH ('—')
        0x98 => 0x02DC, // SMALL TILDE ('˜')
        0x99 => 0x2122, // TRADE MARK SIGN ('™')
        0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
        0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('›')
        0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
        0x9D => 0x009D, // <control>
        0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
        0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
    );

    // Cache filled on first call to getNamedCharacterReferences().
    protected static $namedCharacterReferences;

    // NOTE(review): never read or written by the methods in this class;
    // presumably used by a subclass or reserved for later use - confirm.
    protected static $namedCharacterReferenceMaxLength;

    /**
     * Returns the "real" Unicode codepoint of a malformed character
     * reference.
     *
     * @param int $ref Codepoint as written in the numeric reference.
     * @return int|false The replacement codepoint from the table, or
     *         false when the table has no entry for $ref.
     */
    public static function getRealCodepoint($ref) {
        if (isset(self::$realCodepointTable[$ref])) {
            return self::$realCodepointTable[$ref];
        }
        return false;
    }

    /**
     * Returns the named character reference data, loading it on first use.
     *
     * The data is unserialized from 'named-character-references.ser' in
     * this file's directory and cached in a static property. A falsy result
     * (for example when the file cannot be read) is not treated as cached,
     * so the load is attempted again on the next call.
     *
     * @return mixed Whatever the serialized file contains; its structure is
     *         not visible from this class.
     */
    public static function getNamedCharacterReferences() {
        if (!self::$namedCharacterReferences) {
            self::$namedCharacterReferences = unserialize(
                file_get_contents(dirname(__FILE__) . '/named-character-references.ser'));
        }
        return self::$namedCharacterReferences;
    }

    /**
     * Converts a Unicode codepoint to sequence of UTF-8 bytes.
     *
     * Values that are not Unicode scalar values (negative, above U+10FFFF,
     * or in the surrogate range U+D800..U+DFFF) yield the UTF-8 encoding of
     * U+FFFD REPLACEMENT CHARACTER rather than an ill-formed byte sequence.
     *
     * @note Shamelessly stolen from HTML Purifier, which is also
     *       shamelessly stolen from Feyd (which is in public domain).
     *
     * @param int $code Unicode codepoint.
     * @return string One to four bytes of UTF-8.
     */
    public static function utf8chr($code) {
        // Reject anything that cannot be encoded as well-formed UTF-8.
        // Without this guard the lead-byte masks below silently wrap and
        // emit the bytes of a different (or invalid) character.
        if ($code < 0x0 || $code > 0x10FFFF
            || ($code >= 0xD800 && $code <= 0xDFFF)) {
            return "\xEF\xBF\xBD";
        }

        // Range is known to be safe now; normalise so the bit operations
        // and chr() always see an integer.
        $code = (int) $code;

        if ($code < 0x80) {
            // regular ASCII character
            return chr($code);
        }

        // Lowest six bits always form the final continuation byte.
        $tail = chr(($code & 0x3F) | 0x80);
        if ($code < 0x800) {
            // two-byte sequence: 110xxxxx 10xxxxxx
            return chr((($code >> 6) & 0x1F) | 0xC0) . $tail;
        }

        $tail = chr((($code >> 6) & 0x3F) | 0x80) . $tail;
        if ($code < 0x10000) {
            // three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
            return chr((($code >> 12) & 0x0F) | 0xE0) . $tail;
        }

        // four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        return chr((($code >> 18) & 0x07) | 0xF0)
            . chr((($code >> 12) & 0x3F) | 0x80)
            . $tail;
    }

}