use strict;
use warnings;
use utf8;

use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric);
use Test::More tests => 20;

# Exercises decode_entities(), encode_entities() and encode_entities_numeric()
# from HTML::Entities.  The string literals below deliberately contain entity
# references (&aring;, &#230;, ...): they must stay escaped in this file,
# otherwise the input and the expected value become identical and the test
# proves nothing.

# Round trip: named + decimal references -> characters -> named -> hex numeric.
my $x = "V&aring;re norske tegn b&oslash;r &#230;res";

decode_entities($x);

is($x, "Våre norske tegn bør æres");

encode_entities($x);

is($x, "V&aring;re norske tegn b&oslash;r &aelig;res");

decode_entities($x);
encode_entities_numeric($x);

is($x, "V&#xE5;re norske tegn b&#xF8;r &#xE6;res");

# Markup-significant characters, with the default set of unsafe characters.
$x = "<&>\"'";
is(encode_entities($x),         "&lt;&amp;&gt;&quot;&#39;");
is(encode_entities_numeric($x), "&#x3C;&#x26;&#x3E;&#x22;&#x27;");

# An explicit second argument selects which characters get encoded; it is
# written like the inside of a character class (ranges, escaped backslash).
$x = "abcdef";
is(encode_entities($x, 'a-c'), "&#97;&#98;&#99;def");

$x = "[24/7]\\";
is(encode_entities($x, '/'),   "[24&#47;7]\\");
is(encode_entities($x, '\\/'), "[24&#47;7]\\");
is(encode_entities($x, '\\'),  "[24/7]&#92;");
is(encode_entities($x, ']\\'), "[24/7&#93;&#92;");

# See how well it does against rfc1866...
# Build two parallel strings from the <!ENTITY ...> declarations after
# __END__: $ent holds "&name;" for every entity, $plain the matching
# characters taken from the declared numeric code.
my $ent   = '';
my $plain = '';
while (my $line = <DATA>) {
    next unless $line =~ /^\s*<!ENTITY\s+(\w+)\s*CDATA\s*\"&\#(\d+)/;
    $ent   .= "&$1;";
    $plain .= chr($2);
}

$x = $ent;
decode_entities($x);
is($x, $plain);

# Try decoding when the ";" are left out
$x = $ent;
$x =~ s/;//g;
decode_entities($x);
is($x, $plain);


$x = $plain;
encode_entities($x);
is($x, $ent);

#RT #84144 - https://rt.cpan.org/Public/Bug/Display.html?id=84144
# decode_entities() must not die when its argument is a hash key, and must
# return the decoded text.
{
    my %hash = (
        "V&aring;re norske tegn b&oslash;r &#230;res" =>
            "Våre norske tegn bør æres",
    );

    local $@;
    my $got;
    my $error;

    #<<<  do not let perltidy touch this
    $error = $@ || 'Error' unless eval {
        $got = decode_entities((keys %hash)[0]);
        1;
    };
    #>>>

    ok(!$error, "decode_entitites() when processing a key as input");
    is($got, (values %hash)[0], "decode_entities() decodes a key properly");
}

# From: Bill Simpson-Young <bill.simpson-young@cmis.csiro.au>
# Subject: HTML entities problem with 5.11
# To: libwww-perl@ics.uci.edu
# Date: Fri, 05 Sep 1997 16:56:55 +1000
# Message-Id: <199709050657.QAA10089@snowy.nsw.cmis.CSIRO.AU>
#
# Hi.  I've got a problem that has surfaced with the changes to
# HTML::Entities.pm for 5.11 (it doesn't happen with 5.08).  It's happening
# in the process of encoding then decoding special entities.  Eg, what goes
# in as "abc&def&ghi" comes out as "abc&def;&ghi;".

# Names that are not known entities must pass through decoding untouched.
is(decode_entities("abc&def&ghi&abc;&def;"), "abc&def&ghi&abc;&def;");

# Decoding of &apos;
is(decode_entities("&apos;"), "'");
is(encode_entities("'", "'"), "&#39;");

# Decimal references above the Latin-1 range.
is(
    decode_entities(
        "Attention Home&#959;&#969;n&#1257;rs...1&#1109;t T&#1110;&#1084;e E&#957;&#1257;&#1075;"
    ),
    "Attention Home\x{3BF}\x{3C9}n\x{4E9}rs...1\x{455}t T\x{456}\x{43C}e E\x{3BD}\x{4E9}\x{433}"
);

# A numeric reference that decodes to "&" must not start a second round of
# decoding: "&#38;amp;" becomes the text "&amp;", not "&".
is(decode_entities("{&#38;amp;&#x26;amp;&amp; also &#x42f;&#339;}"),
    "{&amp;&amp;& also \x{42F}\x{153}}");

__END__
# Quoted from rfc1866.txt

14. Proposed Entities

    The HTML DTD references the "Added Latin 1" entity set, which only
    supplies named entities for a subset of the non-ASCII characters in
    [ISO-8859-1], namely the accented characters. The following entities
    should be supported so that all ISO 8859-1 characters may only be
    referenced symbolically. The names for these entities are taken from
    the appendixes of [SGML].

    <!ENTITY nbsp   CDATA "&#160;" -- no-break space -->
    <!ENTITY iexcl  CDATA "&#161;" -- inverted exclamation mark -->
    <!ENTITY cent   CDATA "&#162;" -- cent sign -->
    <!ENTITY pound  CDATA "&#163;" -- pound sterling sign -->
    <!ENTITY curren CDATA "&#164;" -- general currency sign -->
    <!ENTITY yen    CDATA "&#165;" -- yen sign -->
    <!ENTITY brvbar CDATA "&#166;" -- broken (vertical) bar -->
    <!ENTITY sect   CDATA "&#167;" -- section sign -->
    <!ENTITY uml    CDATA "&#168;" -- umlaut (dieresis) -->
    <!ENTITY copy   CDATA "&#169;" -- copyright sign -->
    <!ENTITY ordf   CDATA "&#170;" -- ordinal indicator, feminine -->
    <!ENTITY laquo  CDATA "&#171;" -- angle quotation mark, left -->
    <!ENTITY not    CDATA "&#172;" -- not sign -->
    <!ENTITY shy    CDATA "&#173;" -- soft hyphen -->
    <!ENTITY reg    CDATA "&#174;" -- registered sign -->
    <!ENTITY macr   CDATA "&#175;" -- macron -->
    <!ENTITY deg    CDATA "&#176;" -- degree sign -->
    <!ENTITY plusmn CDATA "&#177;" -- plus-or-minus sign -->
    <!ENTITY sup2   CDATA "&#178;" -- superscript two -->
    <!ENTITY sup3   CDATA "&#179;" -- superscript three -->
    <!ENTITY acute  CDATA "&#180;" -- acute accent -->
    <!ENTITY micro  CDATA "&#181;" -- micro sign -->
    <!ENTITY para   CDATA "&#182;" -- pilcrow (paragraph sign) -->
    <!ENTITY middot CDATA "&#183;" -- middle dot -->
    <!ENTITY cedil  CDATA "&#184;" -- cedilla -->
    <!ENTITY sup1   CDATA "&#185;" -- superscript one -->
    <!ENTITY ordm   CDATA "&#186;" -- ordinal indicator, masculine -->
    <!ENTITY raquo  CDATA "&#187;" -- angle quotation mark, right -->
    <!ENTITY frac14 CDATA "&#188;" -- fraction one-quarter -->
    <!ENTITY frac12 CDATA "&#189;" -- fraction one-half -->
    <!ENTITY frac34 CDATA "&#190;" -- fraction three-quarters -->
    <!ENTITY iquest CDATA "&#191;" -- inverted question mark -->
    <!ENTITY Agrave CDATA "&#192;" -- capital A, grave accent -->
    <!ENTITY Aacute CDATA "&#193;" -- capital A, acute accent -->
    <!ENTITY Acirc  CDATA "&#194;" -- capital A, circumflex accent -->



Berners-Lee & Connolly      Standards Track                    [Page 75]

RFC 1866            Hypertext Markup Language - 2.0         November 1995


    <!ENTITY Atilde CDATA "&#195;" -- capital A, tilde -->
    <!ENTITY Auml   CDATA "&#196;" -- capital A, dieresis or umlaut mark -->
    <!ENTITY Aring  CDATA "&#197;" -- capital A, ring -->
    <!ENTITY AElig  CDATA "&#198;" -- capital AE diphthong (ligature) -->
    <!ENTITY Ccedil CDATA "&#199;" -- capital C, cedilla -->
    <!ENTITY Egrave CDATA "&#200;" -- capital E, grave accent -->
    <!ENTITY Eacute CDATA "&#201;" -- capital E, acute accent -->
    <!ENTITY Ecirc  CDATA "&#202;" -- capital E, circumflex accent -->
    <!ENTITY Euml   CDATA "&#203;" -- capital E, dieresis or umlaut mark -->
    <!ENTITY Igrave CDATA "&#204;" -- capital I, grave accent -->
    <!ENTITY Iacute CDATA "&#205;" -- capital I, acute accent -->
    <!ENTITY Icirc  CDATA "&#206;" -- capital I, circumflex accent -->
    <!ENTITY Iuml   CDATA "&#207;" -- capital I, dieresis or umlaut mark -->
    <!ENTITY ETH    CDATA "&#208;" -- capital Eth, Icelandic -->
    <!ENTITY Ntilde CDATA "&#209;" -- capital N, tilde -->
    <!ENTITY Ograve CDATA "&#210;" -- capital O, grave accent -->
    <!ENTITY Oacute CDATA "&#211;" -- capital O, acute accent -->
    <!ENTITY Ocirc  CDATA "&#212;" -- capital O, circumflex accent -->
    <!ENTITY Otilde CDATA "&#213;" -- capital O, tilde -->
    <!ENTITY Ouml   CDATA "&#214;" -- capital O, dieresis or umlaut mark -->
    <!ENTITY times  CDATA "&#215;" -- multiply sign -->
    <!ENTITY Oslash CDATA "&#216;" -- capital O, slash -->
    <!ENTITY Ugrave CDATA "&#217;" -- capital U, grave accent -->
    <!ENTITY Uacute CDATA "&#218;" -- capital U, acute accent -->
    <!ENTITY Ucirc  CDATA "&#219;" -- capital U, circumflex accent -->
    <!ENTITY Uuml   CDATA "&#220;" -- capital U, dieresis or umlaut mark -->
    <!ENTITY Yacute CDATA "&#221;" -- capital Y, acute accent -->
    <!ENTITY THORN  CDATA "&#222;" -- capital THORN, Icelandic -->
    <!ENTITY szlig  CDATA "&#223;" -- small sharp s, German (sz ligature) -->
    <!ENTITY agrave CDATA "&#224;" -- small a, grave accent -->
    <!ENTITY aacute CDATA "&#225;" -- small a, acute accent -->
    <!ENTITY acirc  CDATA "&#226;" -- small a, circumflex accent -->
    <!ENTITY atilde CDATA "&#227;" -- small a, tilde -->
    <!ENTITY auml   CDATA "&#228;" -- small a, dieresis or umlaut mark -->
    <!ENTITY aring  CDATA "&#229;" -- small a, ring -->
    <!ENTITY aelig  CDATA "&#230;" -- small ae diphthong (ligature) -->
    <!ENTITY ccedil CDATA "&#231;" -- small c, cedilla -->
    <!ENTITY egrave CDATA "&#232;" -- small e, grave accent -->
    <!ENTITY eacute CDATA "&#233;" -- small e, acute accent -->
    <!ENTITY ecirc  CDATA "&#234;" -- small e, circumflex accent -->
    <!ENTITY euml   CDATA "&#235;" -- small e, dieresis or umlaut mark -->
    <!ENTITY igrave CDATA "&#236;" -- small i, grave accent -->
    <!ENTITY iacute CDATA "&#237;" -- small i, acute accent -->
    <!ENTITY icirc  CDATA "&#238;" -- small i, circumflex accent -->
    <!ENTITY iuml   CDATA "&#239;" -- small i, dieresis or umlaut mark -->
    <!ENTITY eth    CDATA "&#240;" -- small eth, Icelandic -->
    <!ENTITY ntilde CDATA "&#241;" -- small n, tilde -->
    <!ENTITY ograve CDATA "&#242;" -- small o, grave accent -->



Berners-Lee & Connolly      Standards Track                    [Page 76]

RFC 1866            Hypertext Markup Language - 2.0         November 1995


    <!ENTITY oacute CDATA "&#243;" -- small o, acute accent -->
    <!ENTITY ocirc  CDATA "&#244;" -- small o, circumflex accent -->
    <!ENTITY otilde CDATA "&#245;" -- small o, tilde -->
    <!ENTITY ouml   CDATA "&#246;" -- small o, dieresis or umlaut mark -->
    <!ENTITY divide CDATA "&#247;" -- divide sign -->
    <!ENTITY oslash CDATA "&#248;" -- small o, slash -->
    <!ENTITY ugrave CDATA "&#249;" -- small u, grave accent -->
    <!ENTITY uacute CDATA "&#250;" -- small u, acute accent -->
    <!ENTITY ucirc  CDATA "&#251;" -- small u, circumflex accent -->
    <!ENTITY uuml   CDATA "&#252;" -- small u, dieresis or umlaut mark -->
    <!ENTITY yacute CDATA "&#253;" -- small y, acute accent -->
    <!ENTITY thorn  CDATA "&#254;" -- small thorn, Icelandic -->
    <!ENTITY yuml   CDATA "&#255;" -- small y, dieresis or umlaut mark -->
