1use strict;
2use warnings;
3use utf8;
4
5use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric);
6use Test::More tests => 20;
7
# Round-trip a Norwegian phrase through the decoder and both encoders.
# The input mixes named entities with a numeric reference that lacks its
# terminating ";" (&#230), which the decoder is still expected to resolve.
my $x = "V&aring;re norske tegn b&oslash;r &#230res";

# Called in void context, so $x itself is rewritten.
decode_entities($x);
is($x, "Våre norske tegn bør æres", "decode_entities() in place");

# Encoding in place must turn the non-ASCII letters back into named entities.
encode_entities($x);
is(
    $x,
    "V&aring;re norske tegn b&oslash;r &aelig;res",
    "encode_entities() in place produces named entities"
);

# The numeric encoder must emit hexadecimal character references instead.
decode_entities($x);
encode_entities_numeric($x);
is(
    $x,
    "V&#xE5;re norske tegn b&#xF8;r &#xE6;res",
    "encode_entities_numeric() in place produces hex references"
);
22
# The markup-significant characters, first through the default encoder and
# then through the numeric one.
$x = "<&>\"'";
for my $case (
    [\&encode_entities,         "&lt;&amp;&gt;&quot;&#39;"],
    [\&encode_entities_numeric, "&#x3C;&#x26;&#x3E;&#x22;&#x27;"],
  )
{
    my ($encoder, $expected) = @{$case};
    is($encoder->($x), $expected);
}

# An explicit second argument names the characters to encode; the range
# a-c selects a, b and c and leaves the rest alone.
$x = "abcdef";
for my $case (['a-c', "&#97;&#98;&#99;def"]) {
    my ($unsafe, $expected) = @{$case};
    is(encode_entities($x, $unsafe), $expected);
}

# Slashes, backslashes and a closing bracket in the unsafe-character
# argument. Each row is [unsafe set, expected encoding of $x].
$x = "[24/7]\\";
for my $case (
    ['/',   "[24&#47;7]\\"],
    ['\\/', "[24&#47;7]\\"],
    ['\\',  "[24/7]&#92;"],
    [']\\', "[24/7&#93;&#92;"],
  )
{
    my ($unsafe, $expected) = @{$case};
    is(encode_entities($x, $unsafe), $expected);
}
35
# See how well it does against rfc1866...
# Walk the entity declarations stored after __END__ and build two parallel
# strings: $ent collects each entity as "&name;", $plain collects the
# character whose code point the declaration gives.
my $ent   = '';
my $plain = '';
while (my $line = <DATA>) {
    # Lines that are not <!ENTITY ...> declarations yield no captures.
    my ($name, $code)
        = $line =~ /^\s*<!ENTITY\s+(\w+)\s*CDATA\s*\"&\#(\d+)/
        or next;
    $ent   .= "&$name;";
    $plain .= chr($code);
}

# Decode a copy of the entity string in place (void context).
decode_entities($x = $ent);
is($x, $plain);

# Try decoding when the ";" are left out
($x = $ent) =~ s/;//g;
decode_entities($x);
is($x, $plain);


# Encoding the plain characters in place must give back the entity string.
encode_entities($x = $plain);
is($x, $ent);
59
#RT #84144 - https://rt.cpan.org/Public/Bug/Display.html?id=84144
# Pass a hash key straight to decode_entities(). The test requires that the
# call does not die and that the returned value is the decoded text.
{
    my %hash = ("V&aring;re norske tegn b&oslash;r &#230res" =>
            "Våre norske tegn bør æres",);

    # Keep any exception raised here from leaking into the caller's $@.
    local $@;
    my $got;
    my $error;

    # The eval block returns 1 on success, so a false result means it died;
    # fall back to a generic message in case $@ is empty.
    #<<<  do not let perltidy touch this
    $error = $@ || 'Error' unless eval {
        $got = decode_entities((keys %hash)[0]);
        1;
    };
    #>>>

    ok(!$error, "decode_entities() when processing a key as input");
    is($got, (values %hash)[0], "decode_entities() decodes a key properly");
}
79
# From: Bill Simpson-Young <bill.simpson-young@cmis.csiro.au>
# Subject: HTML entities problem with 5.11
# To: libwww-perl@ics.uci.edu
# Date: Fri, 05 Sep 1997 16:56:55 +1000
# Message-Id: <199709050657.QAA10089@snowy.nsw.cmis.CSIRO.AU>
#
# Hi. I've got a problem that has surfaced with the changes to
# HTML::Entities.pm for 5.11 (it doesn't happen with 5.08).  It's happening
# in the process of encoding then decoding special entities.  Eg, what goes
# in as "abc&def&ghi" comes out as "abc&def;&ghi;".

# Regression check for the report above: the input below must come back
# unchanged, with or without a trailing ";" on the "&word" sequences.
{
    my $untouched = "abc&def&ghi&abc;&def;";
    is(decode_entities($untouched), "abc&def&ghi&abc;&def;");
}

# Decoding of &apos;
{
    my $apostrophe = "'";
    is(decode_entities("&apos;"), $apostrophe);

    # Encoding an apostrophe (named explicitly as unsafe) yields the numeric
    # form.
    is(encode_entities("'", $apostrophe), "&#39;");
}

# Decimal references written without a terminating ";" must decode even when
# directly followed by letters or by another reference.
{
    my $encoded
        = "Attention Home&#959&#969n&#1257rs...1&#1109t T&#1110&#1084e E&#957&#1257&#1075";
    my $decoded
        = "Attention Home\x{3BF}\x{3C9}n\x{4E9}rs...1\x{455}t T\x{456}\x{43C}e E\x{3BD}\x{4E9}\x{433}";
    is(decode_entities($encoded), $decoded);
}

# Decoding is a single pass: an ampersand produced by a numeric reference
# must not combine with the following "amp;" into another entity.
{
    my $encoded = "{&#38;amp;&#x26;amp;&amp; also &#x42f;&#339;}";
    my $decoded = "{&amp;&amp;& also \x{42F}\x{153}}";
    is(decode_entities($encoded), $decoded);
}
105
106__END__
107# Quoted from rfc1866.txt
108
10914. Proposed Entities
110
111   The HTML DTD references the "Added Latin 1" entity set, which only
112   supplies named entities for a subset of the non-ASCII characters in
113   [ISO-8859-1], namely the accented characters. The following entities
114   should be supported so that all ISO 8859-1 characters may only be
115   referenced symbolically. The names for these entities are taken from
116   the appendixes of [SGML].
117
118    <!ENTITY nbsp   CDATA "&#160;" -- no-break space -->
119    <!ENTITY iexcl  CDATA "&#161;" -- inverted exclamation mark -->
120    <!ENTITY cent   CDATA "&#162;" -- cent sign -->
121    <!ENTITY pound  CDATA "&#163;" -- pound sterling sign -->
122    <!ENTITY curren CDATA "&#164;" -- general currency sign -->
123    <!ENTITY yen    CDATA "&#165;" -- yen sign -->
124    <!ENTITY brvbar CDATA "&#166;" -- broken (vertical) bar -->
125    <!ENTITY sect   CDATA "&#167;" -- section sign -->
126    <!ENTITY uml    CDATA "&#168;" -- umlaut (dieresis) -->
127    <!ENTITY copy   CDATA "&#169;" -- copyright sign -->
128    <!ENTITY ordf   CDATA "&#170;" -- ordinal indicator, feminine -->
129    <!ENTITY laquo  CDATA "&#171;" -- angle quotation mark, left -->
130    <!ENTITY not    CDATA "&#172;" -- not sign -->
131    <!ENTITY shy    CDATA "&#173;" -- soft hyphen -->
132    <!ENTITY reg    CDATA "&#174;" -- registered sign -->
133    <!ENTITY macr   CDATA "&#175;" -- macron -->
134    <!ENTITY deg    CDATA "&#176;" -- degree sign -->
135    <!ENTITY plusmn CDATA "&#177;" -- plus-or-minus sign -->
136    <!ENTITY sup2   CDATA "&#178;" -- superscript two -->
137    <!ENTITY sup3   CDATA "&#179;" -- superscript three -->
138    <!ENTITY acute  CDATA "&#180;" -- acute accent -->
139    <!ENTITY micro  CDATA "&#181;" -- micro sign -->
140    <!ENTITY para   CDATA "&#182;" -- pilcrow (paragraph sign) -->
141    <!ENTITY middot CDATA "&#183;" -- middle dot -->
142    <!ENTITY cedil  CDATA "&#184;" -- cedilla -->
143    <!ENTITY sup1   CDATA "&#185;" -- superscript one -->
144    <!ENTITY ordm   CDATA "&#186;" -- ordinal indicator, masculine -->
145    <!ENTITY raquo  CDATA "&#187;" -- angle quotation mark, right -->
146    <!ENTITY frac14 CDATA "&#188;" -- fraction one-quarter -->
147    <!ENTITY frac12 CDATA "&#189;" -- fraction one-half -->
148    <!ENTITY frac34 CDATA "&#190;" -- fraction three-quarters -->
149    <!ENTITY iquest CDATA "&#191;" -- inverted question mark -->
150    <!ENTITY Agrave CDATA "&#192;" -- capital A, grave accent -->
151    <!ENTITY Aacute CDATA "&#193;" -- capital A, acute accent -->
152    <!ENTITY Acirc  CDATA "&#194;" -- capital A, circumflex accent -->
153
154
155
156Berners-Lee & Connolly      Standards Track                    [Page 75]
157
158RFC 1866            Hypertext Markup Language - 2.0        November 1995
159
160
161    <!ENTITY Atilde CDATA "&#195;" -- capital A, tilde -->
162    <!ENTITY Auml   CDATA "&#196;" -- capital A, dieresis or umlaut mark -->
163    <!ENTITY Aring  CDATA "&#197;" -- capital A, ring -->
164    <!ENTITY AElig  CDATA "&#198;" -- capital AE diphthong (ligature) -->
165    <!ENTITY Ccedil CDATA "&#199;" -- capital C, cedilla -->
166    <!ENTITY Egrave CDATA "&#200;" -- capital E, grave accent -->
167    <!ENTITY Eacute CDATA "&#201;" -- capital E, acute accent -->
168    <!ENTITY Ecirc  CDATA "&#202;" -- capital E, circumflex accent -->
169    <!ENTITY Euml   CDATA "&#203;" -- capital E, dieresis or umlaut mark -->
170    <!ENTITY Igrave CDATA "&#204;" -- capital I, grave accent -->
171    <!ENTITY Iacute CDATA "&#205;" -- capital I, acute accent -->
172    <!ENTITY Icirc  CDATA "&#206;" -- capital I, circumflex accent -->
173    <!ENTITY Iuml   CDATA "&#207;" -- capital I, dieresis or umlaut mark -->
174    <!ENTITY ETH    CDATA "&#208;" -- capital Eth, Icelandic -->
175    <!ENTITY Ntilde CDATA "&#209;" -- capital N, tilde -->
176    <!ENTITY Ograve CDATA "&#210;" -- capital O, grave accent -->
177    <!ENTITY Oacute CDATA "&#211;" -- capital O, acute accent -->
178    <!ENTITY Ocirc  CDATA "&#212;" -- capital O, circumflex accent -->
179    <!ENTITY Otilde CDATA "&#213;" -- capital O, tilde -->
180    <!ENTITY Ouml   CDATA "&#214;" -- capital O, dieresis or umlaut mark -->
181    <!ENTITY times  CDATA "&#215;" -- multiply sign -->
182    <!ENTITY Oslash CDATA "&#216;" -- capital O, slash -->
183    <!ENTITY Ugrave CDATA "&#217;" -- capital U, grave accent -->
184    <!ENTITY Uacute CDATA "&#218;" -- capital U, acute accent -->
185    <!ENTITY Ucirc  CDATA "&#219;" -- capital U, circumflex accent -->
186    <!ENTITY Uuml   CDATA "&#220;" -- capital U, dieresis or umlaut mark -->
187    <!ENTITY Yacute CDATA "&#221;" -- capital Y, acute accent -->
188    <!ENTITY THORN  CDATA "&#222;" -- capital THORN, Icelandic -->
189    <!ENTITY szlig  CDATA "&#223;" -- small sharp s, German (sz ligature) -->
190    <!ENTITY agrave CDATA "&#224;" -- small a, grave accent -->
191    <!ENTITY aacute CDATA "&#225;" -- small a, acute accent -->
192    <!ENTITY acirc  CDATA "&#226;" -- small a, circumflex accent -->
193    <!ENTITY atilde CDATA "&#227;" -- small a, tilde -->
194    <!ENTITY auml   CDATA "&#228;" -- small a, dieresis or umlaut mark -->
195    <!ENTITY aring  CDATA "&#229;" -- small a, ring -->
196    <!ENTITY aelig  CDATA "&#230;" -- small ae diphthong (ligature) -->
197    <!ENTITY ccedil CDATA "&#231;" -- small c, cedilla -->
198    <!ENTITY egrave CDATA "&#232;" -- small e, grave accent -->
199    <!ENTITY eacute CDATA "&#233;" -- small e, acute accent -->
200    <!ENTITY ecirc  CDATA "&#234;" -- small e, circumflex accent -->
201    <!ENTITY euml   CDATA "&#235;" -- small e, dieresis or umlaut mark -->
202    <!ENTITY igrave CDATA "&#236;" -- small i, grave accent -->
203    <!ENTITY iacute CDATA "&#237;" -- small i, acute accent -->
204    <!ENTITY icirc  CDATA "&#238;" -- small i, circumflex accent -->
205    <!ENTITY iuml   CDATA "&#239;" -- small i, dieresis or umlaut mark -->
206    <!ENTITY eth    CDATA "&#240;" -- small eth, Icelandic -->
207    <!ENTITY ntilde CDATA "&#241;" -- small n, tilde -->
208    <!ENTITY ograve CDATA "&#242;" -- small o, grave accent -->
209
210
211
212Berners-Lee & Connolly      Standards Track                    [Page 76]
213
214RFC 1866            Hypertext Markup Language - 2.0        November 1995
215
216
217    <!ENTITY oacute CDATA "&#243;" -- small o, acute accent -->
218    <!ENTITY ocirc  CDATA "&#244;" -- small o, circumflex accent -->
219    <!ENTITY otilde CDATA "&#245;" -- small o, tilde -->
220    <!ENTITY ouml   CDATA "&#246;" -- small o, dieresis or umlaut mark -->
221    <!ENTITY divide CDATA "&#247;" -- divide sign -->
222    <!ENTITY oslash CDATA "&#248;" -- small o, slash -->
223    <!ENTITY ugrave CDATA "&#249;" -- small u, grave accent -->
224    <!ENTITY uacute CDATA "&#250;" -- small u, acute accent -->
225    <!ENTITY ucirc  CDATA "&#251;" -- small u, circumflex accent -->
226    <!ENTITY uuml   CDATA "&#252;" -- small u, dieresis or umlaut mark -->
227    <!ENTITY yacute CDATA "&#253;" -- small y, acute accent -->
228    <!ENTITY thorn  CDATA "&#254;" -- small thorn, Icelandic -->
229    <!ENTITY yuml   CDATA "&#255;" -- small y, dieresis or umlaut mark -->
230