1package HTML::Entities;
2
3=encoding utf8
4
5=head1 NAME
6
7HTML::Entities - Encode or decode strings with HTML entities
8
9=head1 SYNOPSIS
10
11 use HTML::Entities;
12
13 $a = "Våre norske tegn bør &#230res";
14 decode_entities($a);
15 encode_entities($a, "\200-\377");
16
17For example, this:
18
19 $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
20 print encode_entities($input), "\n"
21
22Prints this out:
23
24 vis-à-vis Beyoncé's naïve
25 papier-mâché résumé
26
27=head1 DESCRIPTION
28
29This module deals with encoding and decoding of strings with HTML
30character entities.  The module provides the following functions:
31
32=over 4
33
34=item decode_entities( $string, ... )
35
36This routine replaces HTML entities found in the $string with the
37corresponding Unicode character.  Unrecognized entities are left alone.
38
39If multiple strings are provided as argument they are each decoded
40separately and the same number of strings are returned.
41
42If called in void context the arguments are decoded in-place.
43
44This routine is exported by default.
45
46=item _decode_entities( $string, \%entity2char )
47
48=item _decode_entities( $string, \%entity2char, $expand_prefix )
49
50This will in-place replace HTML entities in $string.  The %entity2char
51hash must be provided.  Named entities not found in the %entity2char
52hash are left alone.  Numeric entities are expanded unless their value
53overflow.
54
55The keys in %entity2char are the entity names to be expanded and their
56values are what they should expand into.  The values do not have to be
57single character strings.  If a key has ";" as suffix,
58then occurrences in $string are only expanded if properly terminated
59with ";".  Entities without ";" will be expanded regardless of how
60they are terminated for compatibility with how common browsers treat
61entities in the Latin-1 range.
62
63If $expand_prefix is TRUE then entities without trailing ";" in
64%entity2char will even be expanded as a prefix of a longer
65unrecognized name.  The longest matching name in %entity2char will be
66used. This is mainly present for compatibility with an MSIE
67misfeature.
68
69   $string = "foo&nbspbar";
70   _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
71   print $string;  # will print "foo bar"
72
73This routine is exported by default.
74
75=item encode_entities( $string )
76
77=item encode_entities( $string, $unsafe_chars )
78
79This routine replaces unsafe characters in $string with their entity
80representation. A second argument can be given to specify which characters to
81consider unsafe.  The unsafe characters is specified using the regular
82expression character class syntax (what you find within brackets in regular
83expressions).
84
85The default set of characters to encode are control chars, high-bit chars, and
86the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters.  But this,
87for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< "
88>> characters:
89
90  $encoded = encode_entities($input, '<>&"');
91
92and this would only encode non-plain ASCII:
93
94  $encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e');
95
96This routine is exported by default.
97
98=item encode_entities_numeric( $string )
99
100=item encode_entities_numeric( $string, $unsafe_chars )
101
102This routine works just like encode_entities, except that the replacement
103entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>.  For
104example, C<encode_entities("r\xF4le")> returns "r&ocirc;le", but
105C<encode_entities_numeric("r\xF4le")> returns "r&#xF4;le".
106
107This routine is I<not> exported by default.  But you can always
108export it with C<use HTML::Entities qw(encode_entities_numeric);>
109or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
110
111=back
112
113All these routines modify the string passed as the first argument, if
114called in a void context.  In scalar and array contexts, the encoded or
115decoded string is returned (without changing the input string).
116
117If you prefer not to import these routines into your namespace, you can
118call them as:
119
120  use HTML::Entities ();
121  $decoded = HTML::Entities::decode($a);
122  $encoded = HTML::Entities::encode($a);
123  $encoded = HTML::Entities::encode_numeric($a);
124
125The module can also export the %char2entity and the %entity2char
126hashes, which contain the mapping from all characters to the
127corresponding entities (and vice versa, respectively).
128
129=head1 COPYRIGHT
130
131Copyright 1995-2006 Gisle Aas. All rights reserved.
132
133This library is free software; you can redistribute it and/or
134modify it under the same terms as Perl itself.
135
136=cut
137
138use strict;
139our $VERSION = '3.76';
140our (%entity2char, %char2entity);
141
142require 5.004;
143require Exporter;
144our @ISA = qw(Exporter);
145
146our @EXPORT = qw(encode_entities decode_entities _decode_entities);
147our @EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
148
149sub Version { $VERSION; }
150
151require HTML::Parser;  # for fast XS implemented decode_entities
152
153
154%entity2char = (
155 # Some normal chars that have special meaning in SGML context
156 amp    => '&',  # ampersand
157'gt'    => '>',  # greater than
158'lt'    => '<',  # less than
159 quot   => '"',  # double quote
160 apos   => "'",  # single quote
161
162 # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
163 AElig	=> chr(198),  # capital AE diphthong (ligature)
164 Aacute	=> chr(193),  # capital A, acute accent
165 Acirc	=> chr(194),  # capital A, circumflex accent
166 Agrave	=> chr(192),  # capital A, grave accent
167 Aring	=> chr(197),  # capital A, ring
168 Atilde	=> chr(195),  # capital A, tilde
169 Auml	=> chr(196),  # capital A, dieresis or umlaut mark
170 Ccedil	=> chr(199),  # capital C, cedilla
171 ETH	=> chr(208),  # capital Eth, Icelandic
172 Eacute	=> chr(201),  # capital E, acute accent
173 Ecirc	=> chr(202),  # capital E, circumflex accent
174 Egrave	=> chr(200),  # capital E, grave accent
175 Euml	=> chr(203),  # capital E, dieresis or umlaut mark
176 Iacute	=> chr(205),  # capital I, acute accent
177 Icirc	=> chr(206),  # capital I, circumflex accent
178 Igrave	=> chr(204),  # capital I, grave accent
179 Iuml	=> chr(207),  # capital I, dieresis or umlaut mark
180 Ntilde	=> chr(209),  # capital N, tilde
181 Oacute	=> chr(211),  # capital O, acute accent
182 Ocirc	=> chr(212),  # capital O, circumflex accent
183 Ograve	=> chr(210),  # capital O, grave accent
184 Oslash	=> chr(216),  # capital O, slash
185 Otilde	=> chr(213),  # capital O, tilde
186 Ouml	=> chr(214),  # capital O, dieresis or umlaut mark
187 THORN	=> chr(222),  # capital THORN, Icelandic
188 Uacute	=> chr(218),  # capital U, acute accent
189 Ucirc	=> chr(219),  # capital U, circumflex accent
190 Ugrave	=> chr(217),  # capital U, grave accent
191 Uuml	=> chr(220),  # capital U, dieresis or umlaut mark
192 Yacute	=> chr(221),  # capital Y, acute accent
193 aacute	=> chr(225),  # small a, acute accent
194 acirc	=> chr(226),  # small a, circumflex accent
195 aelig	=> chr(230),  # small ae diphthong (ligature)
196 agrave	=> chr(224),  # small a, grave accent
197 aring	=> chr(229),  # small a, ring
198 atilde	=> chr(227),  # small a, tilde
199 auml	=> chr(228),  # small a, dieresis or umlaut mark
200 ccedil	=> chr(231),  # small c, cedilla
201 eacute	=> chr(233),  # small e, acute accent
202 ecirc	=> chr(234),  # small e, circumflex accent
203 egrave	=> chr(232),  # small e, grave accent
204 eth	=> chr(240),  # small eth, Icelandic
205 euml	=> chr(235),  # small e, dieresis or umlaut mark
206 iacute	=> chr(237),  # small i, acute accent
207 icirc	=> chr(238),  # small i, circumflex accent
208 igrave	=> chr(236),  # small i, grave accent
209 iuml	=> chr(239),  # small i, dieresis or umlaut mark
210 ntilde	=> chr(241),  # small n, tilde
211 oacute	=> chr(243),  # small o, acute accent
212 ocirc	=> chr(244),  # small o, circumflex accent
213 ograve	=> chr(242),  # small o, grave accent
214 oslash	=> chr(248),  # small o, slash
215 otilde	=> chr(245),  # small o, tilde
216 ouml	=> chr(246),  # small o, dieresis or umlaut mark
217 szlig	=> chr(223),  # small sharp s, German (sz ligature)
218 thorn	=> chr(254),  # small thorn, Icelandic
219 uacute	=> chr(250),  # small u, acute accent
220 ucirc	=> chr(251),  # small u, circumflex accent
221 ugrave	=> chr(249),  # small u, grave accent
222 uuml	=> chr(252),  # small u, dieresis or umlaut mark
223 yacute	=> chr(253),  # small y, acute accent
224 yuml	=> chr(255),  # small y, dieresis or umlaut mark
225
226 # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
227 copy   => chr(169),  # copyright sign
228 reg    => chr(174),  # registered sign
229 nbsp   => chr(160),  # non breaking space
230
231 # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
232 iexcl  => chr(161),
233 cent   => chr(162),
234 pound  => chr(163),
235 curren => chr(164),
236 yen    => chr(165),
237 brvbar => chr(166),
238 sect   => chr(167),
239 uml    => chr(168),
240 ordf   => chr(170),
241 laquo  => chr(171),
242'not'   => chr(172),    # not is a keyword in perl
243 shy    => chr(173),
244 macr   => chr(175),
245 deg    => chr(176),
246 plusmn => chr(177),
247 sup1   => chr(185),
248 sup2   => chr(178),
249 sup3   => chr(179),
250 acute  => chr(180),
251 micro  => chr(181),
252 para   => chr(182),
253 middot => chr(183),
254 cedil  => chr(184),
255 ordm   => chr(186),
256 raquo  => chr(187),
257 frac14 => chr(188),
258 frac12 => chr(189),
259 frac34 => chr(190),
260 iquest => chr(191),
261'times' => chr(215),    # times is a keyword in perl
262 divide => chr(247),
263
264 ( $] > 5.007 ? (
265  'OElig;'    => chr(338),
266  'oelig;'    => chr(339),
267  'Scaron;'   => chr(352),
268  'scaron;'   => chr(353),
269  'Yuml;'     => chr(376),
270  'fnof;'     => chr(402),
271  'circ;'     => chr(710),
272  'tilde;'    => chr(732),
273  'Alpha;'    => chr(913),
274  'Beta;'     => chr(914),
275  'Gamma;'    => chr(915),
276  'Delta;'    => chr(916),
277  'Epsilon;'  => chr(917),
278  'Zeta;'     => chr(918),
279  'Eta;'      => chr(919),
280  'Theta;'    => chr(920),
281  'Iota;'     => chr(921),
282  'Kappa;'    => chr(922),
283  'Lambda;'   => chr(923),
284  'Mu;'       => chr(924),
285  'Nu;'       => chr(925),
286  'Xi;'       => chr(926),
287  'Omicron;'  => chr(927),
288  'Pi;'       => chr(928),
289  'Rho;'      => chr(929),
290  'Sigma;'    => chr(931),
291  'Tau;'      => chr(932),
292  'Upsilon;'  => chr(933),
293  'Phi;'      => chr(934),
294  'Chi;'      => chr(935),
295  'Psi;'      => chr(936),
296  'Omega;'    => chr(937),
297  'alpha;'    => chr(945),
298  'beta;'     => chr(946),
299  'gamma;'    => chr(947),
300  'delta;'    => chr(948),
301  'epsilon;'  => chr(949),
302  'zeta;'     => chr(950),
303  'eta;'      => chr(951),
304  'theta;'    => chr(952),
305  'iota;'     => chr(953),
306  'kappa;'    => chr(954),
307  'lambda;'   => chr(955),
308  'mu;'       => chr(956),
309  'nu;'       => chr(957),
310  'xi;'       => chr(958),
311  'omicron;'  => chr(959),
312  'pi;'       => chr(960),
313  'rho;'      => chr(961),
314  'sigmaf;'   => chr(962),
315  'sigma;'    => chr(963),
316  'tau;'      => chr(964),
317  'upsilon;'  => chr(965),
318  'phi;'      => chr(966),
319  'chi;'      => chr(967),
320  'psi;'      => chr(968),
321  'omega;'    => chr(969),
322  'thetasym;' => chr(977),
323  'upsih;'    => chr(978),
324  'piv;'      => chr(982),
325  'ensp;'     => chr(8194),
326  'emsp;'     => chr(8195),
327  'thinsp;'   => chr(8201),
328  'zwnj;'     => chr(8204),
329  'zwj;'      => chr(8205),
330  'lrm;'      => chr(8206),
331  'rlm;'      => chr(8207),
332  'ndash;'    => chr(8211),
333  'mdash;'    => chr(8212),
334  'lsquo;'    => chr(8216),
335  'rsquo;'    => chr(8217),
336  'sbquo;'    => chr(8218),
337  'ldquo;'    => chr(8220),
338  'rdquo;'    => chr(8221),
339  'bdquo;'    => chr(8222),
340  'dagger;'   => chr(8224),
341  'Dagger;'   => chr(8225),
342  'bull;'     => chr(8226),
343  'hellip;'   => chr(8230),
344  'permil;'   => chr(8240),
345  'prime;'    => chr(8242),
346  'Prime;'    => chr(8243),
347  'lsaquo;'   => chr(8249),
348  'rsaquo;'   => chr(8250),
349  'oline;'    => chr(8254),
350  'frasl;'    => chr(8260),
351  'euro;'     => chr(8364),
352  'image;'    => chr(8465),
353  'weierp;'   => chr(8472),
354  'real;'     => chr(8476),
355  'trade;'    => chr(8482),
356  'alefsym;'  => chr(8501),
357  'larr;'     => chr(8592),
358  'uarr;'     => chr(8593),
359  'rarr;'     => chr(8594),
360  'darr;'     => chr(8595),
361  'harr;'     => chr(8596),
362  'crarr;'    => chr(8629),
363  'lArr;'     => chr(8656),
364  'uArr;'     => chr(8657),
365  'rArr;'     => chr(8658),
366  'dArr;'     => chr(8659),
367  'hArr;'     => chr(8660),
368  'forall;'   => chr(8704),
369  'part;'     => chr(8706),
370  'exist;'    => chr(8707),
371  'empty;'    => chr(8709),
372  'nabla;'    => chr(8711),
373  'isin;'     => chr(8712),
374  'notin;'    => chr(8713),
375  'ni;'       => chr(8715),
376  'prod;'     => chr(8719),
377  'sum;'      => chr(8721),
378  'minus;'    => chr(8722),
379  'lowast;'   => chr(8727),
380  'radic;'    => chr(8730),
381  'prop;'     => chr(8733),
382  'infin;'    => chr(8734),
383  'ang;'      => chr(8736),
384  'and;'      => chr(8743),
385  'or;'       => chr(8744),
386  'cap;'      => chr(8745),
387  'cup;'      => chr(8746),
388  'int;'      => chr(8747),
389  'there4;'   => chr(8756),
390  'sim;'      => chr(8764),
391  'cong;'     => chr(8773),
392  'asymp;'    => chr(8776),
393  'ne;'       => chr(8800),
394  'equiv;'    => chr(8801),
395  'le;'       => chr(8804),
396  'ge;'       => chr(8805),
397  'sub;'      => chr(8834),
398  'sup;'      => chr(8835),
399  'nsub;'     => chr(8836),
400  'sube;'     => chr(8838),
401  'supe;'     => chr(8839),
402  'oplus;'    => chr(8853),
403  'otimes;'   => chr(8855),
404  'perp;'     => chr(8869),
405  'sdot;'     => chr(8901),
406  'lceil;'    => chr(8968),
407  'rceil;'    => chr(8969),
408  'lfloor;'   => chr(8970),
409  'rfloor;'   => chr(8971),
410  'lang;'     => chr(9001),
411  'rang;'     => chr(9002),
412  'loz;'      => chr(9674),
413  'spades;'   => chr(9824),
414  'clubs;'    => chr(9827),
415  'hearts;'   => chr(9829),
416  'diams;'    => chr(9830),
417 ) : ())
418);
419
420
421# Make the opposite mapping
422while (my($entity, $char) = each(%entity2char)) {
423    $entity =~ s/;\z//;
424    $char2entity{$char} = "&$entity;";
425}
426delete $char2entity{"'"};  # only one-way decoding
427
428# Fill in missing entities
429for (0 .. 255) {
430    next if exists $char2entity{chr($_)};
431    $char2entity{chr($_)} = "&#$_;";
432}
433
434my %subst;  # compiled encoding regexps
435
436sub encode_entities
437{
438    return undef unless defined $_[0];
439    my $ref;
440    if (defined wantarray) {
441	my $x = $_[0];
442	$ref = \$x;     # copy
443    } else {
444	$ref = \$_[0];  # modify in-place
445    }
446    if (defined $_[1] and length $_[1]) {
447	unless (exists $subst{$_[1]}) {
448	    # Because we can't compile regex we fake it with a cached sub
449	    my $chars = $_[1];
450	    $chars =~ s,(?<!\\)([]/]),\\$1,g;
451	    $chars =~ s,(?<!\\)\\\z,\\\\,;
452	    my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
453	    $subst{$_[1]} = eval $code;
454	    die( $@ . " while trying to turn range: \"$_[1]\"\n "
455	      . "into code: $code\n "
456	    ) if $@;
457	}
458	&{$subst{$_[1]}}($$ref);
459    } else {
460	# Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
461	$$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
462    }
463    $$ref;
464}
465
466sub encode_entities_numeric {
467    local %char2entity;
468    return &encode_entities;   # a goto &encode_entities wouldn't work
469}
470
471
472sub num_entity {
473    sprintf "&#x%X;", ord($_[0]);
474}
475
476# Set up aliases
477*encode = \&encode_entities;
478*encode_numeric = \&encode_entities_numeric;
479*encode_numerically = \&encode_entities_numeric;
480*decode = \&decode_entities;
481
4821;
483