1package HTML::Entities; 2 3=encoding utf8 4 5=head1 NAME 6 7HTML::Entities - Encode or decode strings with HTML entities 8 9=head1 SYNOPSIS 10 11 use HTML::Entities; 12 13 $a = "Våre norske tegn bør æres"; 14 decode_entities($a); 15 encode_entities($a, "\200-\377"); 16 17For example, this: 18 19 $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé"; 20 print encode_entities($input), "\n" 21 22Prints this out: 23 24 vis-à-vis Beyoncé's naïve 25 papier-mâché résumé 26 27=head1 DESCRIPTION 28 29This module deals with encoding and decoding of strings with HTML 30character entities. The module provides the following functions: 31 32=over 4 33 34=item decode_entities( $string, ... ) 35 36This routine replaces HTML entities found in the $string with the 37corresponding Unicode character. Unrecognized entities are left alone. 38 39If multiple strings are provided as argument they are each decoded 40separately and the same number of strings are returned. 41 42If called in void context the arguments are decoded in-place. 43 44This routine is exported by default. 45 46=item _decode_entities( $string, \%entity2char ) 47 48=item _decode_entities( $string, \%entity2char, $expand_prefix ) 49 50This will in-place replace HTML entities in $string. The %entity2char 51hash must be provided. Named entities not found in the %entity2char 52hash are left alone. Numeric entities are expanded unless their value 53overflow. 54 55The keys in %entity2char are the entity names to be expanded and their 56values are what they should expand into. The values do not have to be 57single character strings. If a key has ";" as suffix, 58then occurrences in $string are only expanded if properly terminated 59with ";". Entities without ";" will be expanded regardless of how 60they are terminated for compatibility with how common browsers treat 61entities in the Latin-1 range. 62 63If $expand_prefix is TRUE then entities without trailing ";" in 64%entity2char will even be expanded as a prefix of a longer 65unrecognized name. The longest matching name in %entity2char will be 66used. This is mainly present for compatibility with an MSIE 67misfeature. 68 69 $string = "foo bar"; 70 _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1); 71 print $string; # will print "foo bar" 72 73This routine is exported by default. 74 75=item encode_entities( $string ) 76 77=item encode_entities( $string, $unsafe_chars ) 78 79This routine replaces unsafe characters in $string with their entity 80representation. A second argument can be given to specify which characters to 81consider unsafe. The unsafe characters is specified using the regular 82expression character class syntax (what you find within brackets in regular 83expressions). 84 85The default set of characters to encode are control chars, high-bit chars, and 86the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters. But this, 87for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< " 88>> characters: 89 90 $encoded = encode_entities($input, '<>&"'); 91 92and this would only encode non-plain ASCII: 93 94 $encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e'); 95 96This routine is exported by default. 97 98=item encode_entities_numeric( $string ) 99 100=item encode_entities_numeric( $string, $unsafe_chars ) 101 102This routine works just like encode_entities, except that the replacement 103entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For 104example, C<encode_entities("r\xF4le")> returns "rôle", but 105C<encode_entities_numeric("r\xF4le")> returns "rôle". 106 107This routine is I<not> exported by default. But you can always 108export it with C<use HTML::Entities qw(encode_entities_numeric);> 109or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);> 110 111=back 112 113All these routines modify the string passed as the first argument, if 114called in a void context. In scalar and array contexts, the encoded or 115decoded string is returned (without changing the input string). 116 117If you prefer not to import these routines into your namespace, you can 118call them as: 119 120 use HTML::Entities (); 121 $decoded = HTML::Entities::decode($a); 122 $encoded = HTML::Entities::encode($a); 123 $encoded = HTML::Entities::encode_numeric($a); 124 125The module can also export the %char2entity and the %entity2char 126hashes, which contain the mapping from all characters to the 127corresponding entities (and vice versa, respectively). 128 129=head1 COPYRIGHT 130 131Copyright 1995-2006 Gisle Aas. All rights reserved. 132 133This library is free software; you can redistribute it and/or 134modify it under the same terms as Perl itself. 135 136=cut 137 138use strict; 139our $VERSION = '3.76'; 140our (%entity2char, %char2entity); 141 142require 5.004; 143require Exporter; 144our @ISA = qw(Exporter); 145 146our @EXPORT = qw(encode_entities decode_entities _decode_entities); 147our @EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric); 148 149sub Version { $VERSION; } 150 151require HTML::Parser; # for fast XS implemented decode_entities 152 153 154%entity2char = ( 155 # Some normal chars that have special meaning in SGML context 156 amp => '&', # ampersand 157'gt' => '>', # greater than 158'lt' => '<', # less than 159 quot => '"', # double quote 160 apos => "'", # single quote 161 162 # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML 163 AElig => chr(198), # capital AE diphthong (ligature) 164 Aacute => chr(193), # capital A, acute accent 165 Acirc => chr(194), # capital A, circumflex accent 166 Agrave => chr(192), # capital A, grave accent 167 Aring => chr(197), # capital A, ring 168 Atilde => chr(195), # capital A, tilde 169 Auml => chr(196), # capital A, dieresis or umlaut mark 170 Ccedil => chr(199), # capital C, cedilla 171 ETH => chr(208), # capital Eth, Icelandic 172 Eacute => chr(201), # capital E, acute accent 173 Ecirc => chr(202), # capital E, circumflex accent 174 Egrave => chr(200), # capital E, grave accent 175 Euml => chr(203), # capital E, dieresis or umlaut mark 176 Iacute => chr(205), # capital I, acute accent 177 Icirc => chr(206), # capital I, circumflex accent 178 Igrave => chr(204), # capital I, grave accent 179 Iuml => chr(207), # capital I, dieresis or umlaut mark 180 Ntilde => chr(209), # capital N, tilde 181 Oacute => chr(211), # capital O, acute accent 182 Ocirc => chr(212), # capital O, circumflex accent 183 Ograve => chr(210), # capital O, grave accent 184 Oslash => chr(216), # capital O, slash 185 Otilde => chr(213), # capital O, tilde 186 Ouml => chr(214), # capital O, dieresis or umlaut mark 187 THORN => chr(222), # capital THORN, Icelandic 188 Uacute => chr(218), # capital U, acute accent 189 Ucirc => chr(219), # capital U, circumflex accent 190 Ugrave => chr(217), # capital U, grave accent 191 Uuml => chr(220), # capital U, dieresis or umlaut mark 192 Yacute => chr(221), # capital Y, acute accent 193 aacute => chr(225), # small a, acute accent 194 acirc => chr(226), # small a, circumflex accent 195 aelig => chr(230), # small ae diphthong (ligature) 196 agrave => chr(224), # small a, grave accent 197 aring => chr(229), # small a, ring 198 atilde => chr(227), # small a, tilde 199 auml => chr(228), # small a, dieresis or umlaut mark 200 ccedil => chr(231), # small c, cedilla 201 eacute => chr(233), # small e, acute accent 202 ecirc => chr(234), # small e, circumflex accent 203 egrave => chr(232), # small e, grave accent 204 eth => chr(240), # small eth, Icelandic 205 euml => chr(235), # small e, dieresis or umlaut mark 206 iacute => chr(237), # small i, acute accent 207 icirc => chr(238), # small i, circumflex accent 208 igrave => chr(236), # small i, grave accent 209 iuml => chr(239), # small i, dieresis or umlaut mark 210 ntilde => chr(241), # small n, tilde 211 oacute => chr(243), # small o, acute accent 212 ocirc => chr(244), # small o, circumflex accent 213 ograve => chr(242), # small o, grave accent 214 oslash => chr(248), # small o, slash 215 otilde => chr(245), # small o, tilde 216 ouml => chr(246), # small o, dieresis or umlaut mark 217 szlig => chr(223), # small sharp s, German (sz ligature) 218 thorn => chr(254), # small thorn, Icelandic 219 uacute => chr(250), # small u, acute accent 220 ucirc => chr(251), # small u, circumflex accent 221 ugrave => chr(249), # small u, grave accent 222 uuml => chr(252), # small u, dieresis or umlaut mark 223 yacute => chr(253), # small y, acute accent 224 yuml => chr(255), # small y, dieresis or umlaut mark 225 226 # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96) 227 copy => chr(169), # copyright sign 228 reg => chr(174), # registered sign 229 nbsp => chr(160), # non breaking space 230 231 # Additional ISO-8859/1 entities listed in rfc1866 (section 14) 232 iexcl => chr(161), 233 cent => chr(162), 234 pound => chr(163), 235 curren => chr(164), 236 yen => chr(165), 237 brvbar => chr(166), 238 sect => chr(167), 239 uml => chr(168), 240 ordf => chr(170), 241 laquo => chr(171), 242'not' => chr(172), # not is a keyword in perl 243 shy => chr(173), 244 macr => chr(175), 245 deg => chr(176), 246 plusmn => chr(177), 247 sup1 => chr(185), 248 sup2 => chr(178), 249 sup3 => chr(179), 250 acute => chr(180), 251 micro => chr(181), 252 para => chr(182), 253 middot => chr(183), 254 cedil => chr(184), 255 ordm => chr(186), 256 raquo => chr(187), 257 frac14 => chr(188), 258 frac12 => chr(189), 259 frac34 => chr(190), 260 iquest => chr(191), 261'times' => chr(215), # times is a keyword in perl 262 divide => chr(247), 263 264 ( $] > 5.007 ? ( 265 'OElig;' => chr(338), 266 'oelig;' => chr(339), 267 'Scaron;' => chr(352), 268 'scaron;' => chr(353), 269 'Yuml;' => chr(376), 270 'fnof;' => chr(402), 271 'circ;' => chr(710), 272 'tilde;' => chr(732), 273 'Alpha;' => chr(913), 274 'Beta;' => chr(914), 275 'Gamma;' => chr(915), 276 'Delta;' => chr(916), 277 'Epsilon;' => chr(917), 278 'Zeta;' => chr(918), 279 'Eta;' => chr(919), 280 'Theta;' => chr(920), 281 'Iota;' => chr(921), 282 'Kappa;' => chr(922), 283 'Lambda;' => chr(923), 284 'Mu;' => chr(924), 285 'Nu;' => chr(925), 286 'Xi;' => chr(926), 287 'Omicron;' => chr(927), 288 'Pi;' => chr(928), 289 'Rho;' => chr(929), 290 'Sigma;' => chr(931), 291 'Tau;' => chr(932), 292 'Upsilon;' => chr(933), 293 'Phi;' => chr(934), 294 'Chi;' => chr(935), 295 'Psi;' => chr(936), 296 'Omega;' => chr(937), 297 'alpha;' => chr(945), 298 'beta;' => chr(946), 299 'gamma;' => chr(947), 300 'delta;' => chr(948), 301 'epsilon;' => chr(949), 302 'zeta;' => chr(950), 303 'eta;' => chr(951), 304 'theta;' => chr(952), 305 'iota;' => chr(953), 306 'kappa;' => chr(954), 307 'lambda;' => chr(955), 308 'mu;' => chr(956), 309 'nu;' => chr(957), 310 'xi;' => chr(958), 311 'omicron;' => chr(959), 312 'pi;' => chr(960), 313 'rho;' => chr(961), 314 'sigmaf;' => chr(962), 315 'sigma;' => chr(963), 316 'tau;' => chr(964), 317 'upsilon;' => chr(965), 318 'phi;' => chr(966), 319 'chi;' => chr(967), 320 'psi;' => chr(968), 321 'omega;' => chr(969), 322 'thetasym;' => chr(977), 323 'upsih;' => chr(978), 324 'piv;' => chr(982), 325 'ensp;' => chr(8194), 326 'emsp;' => chr(8195), 327 'thinsp;' => chr(8201), 328 'zwnj;' => chr(8204), 329 'zwj;' => chr(8205), 330 'lrm;' => chr(8206), 331 'rlm;' => chr(8207), 332 'ndash;' => chr(8211), 333 'mdash;' => chr(8212), 334 'lsquo;' => chr(8216), 335 'rsquo;' => chr(8217), 336 'sbquo;' => chr(8218), 337 'ldquo;' => chr(8220), 338 'rdquo;' => chr(8221), 339 'bdquo;' => chr(8222), 340 'dagger;' => chr(8224), 341 'Dagger;' => chr(8225), 342 'bull;' => chr(8226), 343 'hellip;' => chr(8230), 344 'permil;' => chr(8240), 345 'prime;' => chr(8242), 346 'Prime;' => chr(8243), 347 'lsaquo;' => chr(8249), 348 'rsaquo;' => chr(8250), 349 'oline;' => chr(8254), 350 'frasl;' => chr(8260), 351 'euro;' => chr(8364), 352 'image;' => chr(8465), 353 'weierp;' => chr(8472), 354 'real;' => chr(8476), 355 'trade;' => chr(8482), 356 'alefsym;' => chr(8501), 357 'larr;' => chr(8592), 358 'uarr;' => chr(8593), 359 'rarr;' => chr(8594), 360 'darr;' => chr(8595), 361 'harr;' => chr(8596), 362 'crarr;' => chr(8629), 363 'lArr;' => chr(8656), 364 'uArr;' => chr(8657), 365 'rArr;' => chr(8658), 366 'dArr;' => chr(8659), 367 'hArr;' => chr(8660), 368 'forall;' => chr(8704), 369 'part;' => chr(8706), 370 'exist;' => chr(8707), 371 'empty;' => chr(8709), 372 'nabla;' => chr(8711), 373 'isin;' => chr(8712), 374 'notin;' => chr(8713), 375 'ni;' => chr(8715), 376 'prod;' => chr(8719), 377 'sum;' => chr(8721), 378 'minus;' => chr(8722), 379 'lowast;' => chr(8727), 380 'radic;' => chr(8730), 381 'prop;' => chr(8733), 382 'infin;' => chr(8734), 383 'ang;' => chr(8736), 384 'and;' => chr(8743), 385 'or;' => chr(8744), 386 'cap;' => chr(8745), 387 'cup;' => chr(8746), 388 'int;' => chr(8747), 389 'there4;' => chr(8756), 390 'sim;' => chr(8764), 391 'cong;' => chr(8773), 392 'asymp;' => chr(8776), 393 'ne;' => chr(8800), 394 'equiv;' => chr(8801), 395 'le;' => chr(8804), 396 'ge;' => chr(8805), 397 'sub;' => chr(8834), 398 'sup;' => chr(8835), 399 'nsub;' => chr(8836), 400 'sube;' => chr(8838), 401 'supe;' => chr(8839), 402 'oplus;' => chr(8853), 403 'otimes;' => chr(8855), 404 'perp;' => chr(8869), 405 'sdot;' => chr(8901), 406 'lceil;' => chr(8968), 407 'rceil;' => chr(8969), 408 'lfloor;' => chr(8970), 409 'rfloor;' => chr(8971), 410 'lang;' => chr(9001), 411 'rang;' => chr(9002), 412 'loz;' => chr(9674), 413 'spades;' => chr(9824), 414 'clubs;' => chr(9827), 415 'hearts;' => chr(9829), 416 'diams;' => chr(9830), 417 ) : ()) 418); 419 420 421# Make the opposite mapping 422while (my($entity, $char) = each(%entity2char)) { 423 $entity =~ s/;\z//; 424 $char2entity{$char} = "&$entity;"; 425} 426delete $char2entity{"'"}; # only one-way decoding 427 428# Fill in missing entities 429for (0 .. 255) { 430 next if exists $char2entity{chr($_)}; 431 $char2entity{chr($_)} = "&#$_;"; 432} 433 434my %subst; # compiled encoding regexps 435 436sub encode_entities 437{ 438 return undef unless defined $_[0]; 439 my $ref; 440 if (defined wantarray) { 441 my $x = $_[0]; 442 $ref = \$x; # copy 443 } else { 444 $ref = \$_[0]; # modify in-place 445 } 446 if (defined $_[1] and length $_[1]) { 447 unless (exists $subst{$_[1]}) { 448 # Because we can't compile regex we fake it with a cached sub 449 my $chars = $_[1]; 450 $chars =~ s,(?<!\\)([]/]),\\$1,g; 451 $chars =~ s,(?<!\\)\\\z,\\\\,; 452 my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }"; 453 $subst{$_[1]} = eval $code; 454 die( $@ . " while trying to turn range: \"$_[1]\"\n " 455 . "into code: $code\n " 456 ) if $@; 457 } 458 &{$subst{$_[1]}}($$ref); 459 } else { 460 # Encode control chars, high bit chars and '<', '&', '>', ''' and '"' 461 $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge; 462 } 463 $$ref; 464} 465 466sub encode_entities_numeric { 467 local %char2entity; 468 return &encode_entities; # a goto &encode_entities wouldn't work 469} 470 471 472sub num_entity { 473 sprintf "&#x%X;", ord($_[0]); 474} 475 476# Set up aliases 477*encode = \&encode_entities; 478*encode_numeric = \&encode_entities_numeric; 479*encode_numerically = \&encode_entities_numeric; 480*decode = \&decode_entities; 481 4821; 483