1package Pod::Escapes; 2use strict; 3use warnings; 4use 5.006; 5 6use vars qw( 7 %Code2USASCII 8 %Name2character 9 %Name2character_number 10 %Latin1Code_to_fallback 11 %Latin1Char_to_fallback 12 $FAR_CHAR 13 $FAR_CHAR_NUMBER 14 $NOT_ASCII 15 @ISA $VERSION @EXPORT_OK %EXPORT_TAGS 16); 17 18require Exporter; 19@ISA = ('Exporter'); 20$VERSION = '1.07'; 21@EXPORT_OK = qw( 22 %Code2USASCII 23 %Name2character 24 %Name2character_number 25 %Latin1Code_to_fallback 26 %Latin1Char_to_fallback 27 e2char 28 e2charnum 29); 30%EXPORT_TAGS = ('ALL' => \@EXPORT_OK); 31 32#========================================================================== 33 34$FAR_CHAR = "?" unless defined $FAR_CHAR; 35$FAR_CHAR_NUMBER = ord($FAR_CHAR) unless defined $FAR_CHAR_NUMBER; 36 37$NOT_ASCII = 'A' ne chr(65) unless defined $NOT_ASCII; 38 39#-------------------------------------------------------------------------- 40sub e2char { 41 my $in = $_[0]; 42 return undef unless defined $in and length $in; 43 44 # Convert to decimal: 45 if($in =~ m/^(0[0-7]*)$/s ) { 46 $in = oct $in; 47 } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) { 48 $in = hex $1; 49 } # else it's decimal, or named 50 51 if($in =~ m/^\d+$/s) { 52 if($] < 5.007 and $in > 255) { # can't be trusted with Unicode 53 return $FAR_CHAR; 54 } elsif ($] >= 5.007003) { 55 return chr(utf8::unicode_to_native($in)); 56 } elsif ($NOT_ASCII) { 57 return $Code2USASCII{$in} # so "65" => "A" everywhere 58 || $Latin1Code_to_fallback{$in} # Fallback. 59 || $FAR_CHAR; # Fall further back 60 } else { 61 return chr($in); 62 } 63 } else { 64 return $Name2character{$in}; # returns undef if unknown 65 } 66} 67 68#-------------------------------------------------------------------------- 69sub e2charnum { 70 my $in = $_[0]; 71 return undef unless defined $in and length $in; 72 73 # Convert to decimal: 74 if($in =~ m/^(0[0-7]*)$/s ) { 75 $in = oct $in; 76 } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) { 77 $in = hex $1; 78 } # else it's decimal, or named 79 80 if($in =~ m/^[0-9]+$/s) { 81 return 0 + $in; 82 } else { 83 return $Name2character_number{$in}; # returns undef if unknown 84 } 85} 86 87#-------------------------------------------------------------------------- 88 89%Code2USASCII = ( 90# mostly generated by 91# perl -e "printf qq{ \x25 3s, '\x25s',\n}, $_, chr($_) foreach (32 .. 126)" 92 32, ' ', 93 33, '!', 94 34, '"', 95 35, '#', 96 36, '$', 97 37, '%', 98 38, '&', 99 39, "'", #! 100 40, '(', 101 41, ')', 102 42, '*', 103 43, '+', 104 44, ',', 105 45, '-', 106 46, '.', 107 47, '/', 108 48, '0', 109 49, '1', 110 50, '2', 111 51, '3', 112 52, '4', 113 53, '5', 114 54, '6', 115 55, '7', 116 56, '8', 117 57, '9', 118 58, ':', 119 59, ';', 120 60, '<', 121 61, '=', 122 62, '>', 123 63, '?', 124 64, '@', 125 65, 'A', 126 66, 'B', 127 67, 'C', 128 68, 'D', 129 69, 'E', 130 70, 'F', 131 71, 'G', 132 72, 'H', 133 73, 'I', 134 74, 'J', 135 75, 'K', 136 76, 'L', 137 77, 'M', 138 78, 'N', 139 79, 'O', 140 80, 'P', 141 81, 'Q', 142 82, 'R', 143 83, 'S', 144 84, 'T', 145 85, 'U', 146 86, 'V', 147 87, 'W', 148 88, 'X', 149 89, 'Y', 150 90, 'Z', 151 91, '[', 152 92, "\\", #! 153 93, ']', 154 94, '^', 155 95, '_', 156 96, '`', 157 97, 'a', 158 98, 'b', 159 99, 'c', 160 100, 'd', 161 101, 'e', 162 102, 'f', 163 103, 'g', 164 104, 'h', 165 105, 'i', 166 106, 'j', 167 107, 'k', 168 108, 'l', 169 109, 'm', 170 110, 'n', 171 111, 'o', 172 112, 'p', 173 113, 'q', 174 114, 'r', 175 115, 's', 176 116, 't', 177 117, 'u', 178 118, 'v', 179 119, 'w', 180 120, 'x', 181 121, 'y', 182 122, 'z', 183 123, '{', 184 124, '|', 185 125, '}', 186 126, '~', 187); 188 189#-------------------------------------------------------------------------- 190 191%Latin1Code_to_fallback = (); 192@Latin1Code_to_fallback{0xA0 .. 0xFF} = ( 193# Copied from Text/Unidecode/x00.pm: 194 195' ', qq{!}, qq{C/}, 'PS', qq{\$?}, qq{Y=}, qq{|}, 'SS', qq{"}, qq{(c)}, 'a', qq{<<}, qq{!}, "", qq{(r)}, qq{-}, 196'deg', qq{+-}, '2', '3', qq{'}, 'u', 'P', qq{*}, qq{,}, '1', 'o', qq{>>}, qq{1/4}, qq{1/2}, qq{3/4}, qq{?}, 197'A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', 198'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'U', 'Th', 'ss', 199'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', 200'd', 'n', 'o', 'o', 'o', 'o', 'o', qq{/}, 'o', 'u', 'u', 'u', 'u', 'y', 'th', 'y', 201 202); 203 204{ 205 # Now stuff %Latin1Char_to_fallback: 206 %Latin1Char_to_fallback = (); 207 my($k,$v); 208 while( ($k,$v) = each %Latin1Code_to_fallback) { 209 $Latin1Char_to_fallback{chr $k} = $v; 210 #print chr($k), ' => ', $v, "\n"; 211 } 212} 213 214#-------------------------------------------------------------------------- 215 216%Name2character_number = ( 217 # General XML/XHTML: 218 'lt' => 60, 219 'gt' => 62, 220 'quot' => 34, 221 'amp' => 38, 222 'apos' => 39, 223 224 # POD-specific: 225 'sol' => 47, 226 'verbar' => 124, 227 228 'lchevron' => 171, # legacy for laquo 229 'rchevron' => 187, # legacy for raquo 230 231 # Remember, grave looks like \ (as in virtu\) 232 # acute looks like / (as in re/sume/) 233 # circumflex looks like ^ (as in papier ma^che/) 234 # umlaut/dieresis looks like " (as in nai"ve, Chloe") 235 236 # From the XHTML 1 .ent files: 237 'nbsp' , 160, 238 'iexcl' , 161, 239 'cent' , 162, 240 'pound' , 163, 241 'curren' , 164, 242 'yen' , 165, 243 'brvbar' , 166, 244 'sect' , 167, 245 'uml' , 168, 246 'copy' , 169, 247 'ordf' , 170, 248 'laquo' , 171, 249 'not' , 172, 250 'shy' , 173, 251 'reg' , 174, 252 'macr' , 175, 253 'deg' , 176, 254 'plusmn' , 177, 255 'sup2' , 178, 256 'sup3' , 179, 257 'acute' , 180, 258 'micro' , 181, 259 'para' , 182, 260 'middot' , 183, 261 'cedil' , 184, 262 'sup1' , 185, 263 'ordm' , 186, 264 'raquo' , 187, 265 'frac14' , 188, 266 'frac12' , 189, 267 'frac34' , 190, 268 'iquest' , 191, 269 'Agrave' , 192, 270 'Aacute' , 193, 271 'Acirc' , 194, 272 'Atilde' , 195, 273 'Auml' , 196, 274 'Aring' , 197, 275 'AElig' , 198, 276 'Ccedil' , 199, 277 'Egrave' , 200, 278 'Eacute' , 201, 279 'Ecirc' , 202, 280 'Euml' , 203, 281 'Igrave' , 204, 282 'Iacute' , 205, 283 'Icirc' , 206, 284 'Iuml' , 207, 285 'ETH' , 208, 286 'Ntilde' , 209, 287 'Ograve' , 210, 288 'Oacute' , 211, 289 'Ocirc' , 212, 290 'Otilde' , 213, 291 'Ouml' , 214, 292 'times' , 215, 293 'Oslash' , 216, 294 'Ugrave' , 217, 295 'Uacute' , 218, 296 'Ucirc' , 219, 297 'Uuml' , 220, 298 'Yacute' , 221, 299 'THORN' , 222, 300 'szlig' , 223, 301 'agrave' , 224, 302 'aacute' , 225, 303 'acirc' , 226, 304 'atilde' , 227, 305 'auml' , 228, 306 'aring' , 229, 307 'aelig' , 230, 308 'ccedil' , 231, 309 'egrave' , 232, 310 'eacute' , 233, 311 'ecirc' , 234, 312 'euml' , 235, 313 'igrave' , 236, 314 'iacute' , 237, 315 'icirc' , 238, 316 'iuml' , 239, 317 'eth' , 240, 318 'ntilde' , 241, 319 'ograve' , 242, 320 'oacute' , 243, 321 'ocirc' , 244, 322 'otilde' , 245, 323 'ouml' , 246, 324 'divide' , 247, 325 'oslash' , 248, 326 'ugrave' , 249, 327 'uacute' , 250, 328 'ucirc' , 251, 329 'uuml' , 252, 330 'yacute' , 253, 331 'thorn' , 254, 332 'yuml' , 255, 333 334 'fnof' , 402, 335 'Alpha' , 913, 336 'Beta' , 914, 337 'Gamma' , 915, 338 'Delta' , 916, 339 'Epsilon' , 917, 340 'Zeta' , 918, 341 'Eta' , 919, 342 'Theta' , 920, 343 'Iota' , 921, 344 'Kappa' , 922, 345 'Lambda' , 923, 346 'Mu' , 924, 347 'Nu' , 925, 348 'Xi' , 926, 349 'Omicron' , 927, 350 'Pi' , 928, 351 'Rho' , 929, 352 'Sigma' , 931, 353 'Tau' , 932, 354 'Upsilon' , 933, 355 'Phi' , 934, 356 'Chi' , 935, 357 'Psi' , 936, 358 'Omega' , 937, 359 'alpha' , 945, 360 'beta' , 946, 361 'gamma' , 947, 362 'delta' , 948, 363 'epsilon' , 949, 364 'zeta' , 950, 365 'eta' , 951, 366 'theta' , 952, 367 'iota' , 953, 368 'kappa' , 954, 369 'lambda' , 955, 370 'mu' , 956, 371 'nu' , 957, 372 'xi' , 958, 373 'omicron' , 959, 374 'pi' , 960, 375 'rho' , 961, 376 'sigmaf' , 962, 377 'sigma' , 963, 378 'tau' , 964, 379 'upsilon' , 965, 380 'phi' , 966, 381 'chi' , 967, 382 'psi' , 968, 383 'omega' , 969, 384 'thetasym' , 977, 385 'upsih' , 978, 386 'piv' , 982, 387 'bull' , 8226, 388 'hellip' , 8230, 389 'prime' , 8242, 390 'Prime' , 8243, 391 'oline' , 8254, 392 'frasl' , 8260, 393 'weierp' , 8472, 394 'image' , 8465, 395 'real' , 8476, 396 'trade' , 8482, 397 'alefsym' , 8501, 398 'larr' , 8592, 399 'uarr' , 8593, 400 'rarr' , 8594, 401 'darr' , 8595, 402 'harr' , 8596, 403 'crarr' , 8629, 404 'lArr' , 8656, 405 'uArr' , 8657, 406 'rArr' , 8658, 407 'dArr' , 8659, 408 'hArr' , 8660, 409 'forall' , 8704, 410 'part' , 8706, 411 'exist' , 8707, 412 'empty' , 8709, 413 'nabla' , 8711, 414 'isin' , 8712, 415 'notin' , 8713, 416 'ni' , 8715, 417 'prod' , 8719, 418 'sum' , 8721, 419 'minus' , 8722, 420 'lowast' , 8727, 421 'radic' , 8730, 422 'prop' , 8733, 423 'infin' , 8734, 424 'ang' , 8736, 425 'and' , 8743, 426 'or' , 8744, 427 'cap' , 8745, 428 'cup' , 8746, 429 'int' , 8747, 430 'there4' , 8756, 431 'sim' , 8764, 432 'cong' , 8773, 433 'asymp' , 8776, 434 'ne' , 8800, 435 'equiv' , 8801, 436 'le' , 8804, 437 'ge' , 8805, 438 'sub' , 8834, 439 'sup' , 8835, 440 'nsub' , 8836, 441 'sube' , 8838, 442 'supe' , 8839, 443 'oplus' , 8853, 444 'otimes' , 8855, 445 'perp' , 8869, 446 'sdot' , 8901, 447 'lceil' , 8968, 448 'rceil' , 8969, 449 'lfloor' , 8970, 450 'rfloor' , 8971, 451 'lang' , 9001, 452 'rang' , 9002, 453 'loz' , 9674, 454 'spades' , 9824, 455 'clubs' , 9827, 456 'hearts' , 9829, 457 'diams' , 9830, 458 'OElig' , 338, 459 'oelig' , 339, 460 'Scaron' , 352, 461 'scaron' , 353, 462 'Yuml' , 376, 463 'circ' , 710, 464 'tilde' , 732, 465 'ensp' , 8194, 466 'emsp' , 8195, 467 'thinsp' , 8201, 468 'zwnj' , 8204, 469 'zwj' , 8205, 470 'lrm' , 8206, 471 'rlm' , 8207, 472 'ndash' , 8211, 473 'mdash' , 8212, 474 'lsquo' , 8216, 475 'rsquo' , 8217, 476 'sbquo' , 8218, 477 'ldquo' , 8220, 478 'rdquo' , 8221, 479 'bdquo' , 8222, 480 'dagger' , 8224, 481 'Dagger' , 8225, 482 'permil' , 8240, 483 'lsaquo' , 8249, 484 'rsaquo' , 8250, 485 'euro' , 8364, 486); 487 488 489# Fill out %Name2character... 490{ 491 %Name2character = (); 492 my($name, $number); 493 while( ($name, $number) = each %Name2character_number) { 494 if($] < 5.007 and $number > 255) { 495 $Name2character{$name} = $FAR_CHAR; 496 # substitute for Unicode characters, for perls 497 # that can't reliably handle them 498 } elsif ($] >= 5.007003) { 499 $Name2character{$name} = chr utf8::unicode_to_native($number); 500 # normal case for more recent Perls where we can translate from Unicode 501 # to the native character set. 502 } 503 elsif (exists $Code2USASCII{$number}) { 504 $Name2character{$name} = $Code2USASCII{$number}; 505 # on older Perls, we can use the translations we have hard-coded in this 506 # file, but these don't include the non-ASCII-range characters 507 } 508 elsif ($NOT_ASCII && $number > 127 && $number < 256) { 509 # this range on old non-ASCII-platform perls is wrong 510 if (exists $Latin1Code_to_fallback{$number}) { 511 $Name2character{$name} = $Latin1Code_to_fallback{$number}; 512 } else { 513 $Name2character{$name} = $FAR_CHAR; 514 } 515 } else { 516 $Name2character{$name} = chr $number; 517 } 518 } 519} 520 521#-------------------------------------------------------------------------- 5221; 523__END__ 524 525=head1 NAME 526 527Pod::Escapes - for resolving Pod EE<lt>...E<gt> sequences 528 529=head1 SYNOPSIS 530 531 use Pod::Escapes qw(e2char); 532 ...la la la, parsing POD, la la la... 533 $text = e2char($e_node->label); 534 unless(defined $text) { 535 print "Unknown E sequence \"", $e_node->label, "\"!"; 536 } 537 ...else print/interpolate $text... 538 539=head1 DESCRIPTION 540 541This module provides things that are useful in decoding 542Pod EE<lt>...E<gt> sequences. Presumably, it should be used 543only by Pod parsers and/or formatters. 544 545By default, Pod::Escapes exports none of its symbols. But 546you can request any of them to be exported. 547Either request them individually, as with 548C<use Pod::Escapes qw(symbolname symbolname2...);>, 549or you can do C<use Pod::Escapes qw(:ALL);> to get all 550exportable symbols. 551 552=head1 GOODIES 553 554=over 555 556=item e2char($e_content) 557 558Given a name or number that could appear in a 559C<EE<lt>name_or_numE<gt>> sequence, this returns the string that 560it stands for. For example, C<e2char('sol')>, C<e2char('47')>, 561C<e2char('0x2F')>, and C<e2char('057')> all return "/", 562because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>, 563and C<EE<lt>057E<gt>>, all mean "/". If 564the name has no known value (as with a name of "qacute") or is 565syntactically invalid (as with a name of "1/4"), this returns undef. 566 567=item e2charnum($e_content) 568 569Given a name or number that could appear in a 570C<EE<lt>name_or_numE<gt>> sequence, this returns the number of 571the Unicode character that this stands for. For example, 572C<e2char('sol')>, C<e2char('47')>, 573C<e2char('0x2F')>, and C<e2char('057')> all return 47, 574because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>, 575and C<EE<lt>057E<gt>>, all mean "/", whose Unicode number is 47. If 576the name has no known value (as with a name of "qacute") or is 577syntactically invalid (as with a name of "1/4"), this returns undef. 578 579=item $Name2character{I<name>} 580 581Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol" 582to the string that each stands for. Note that this does not 583include numerics (like "64" or "x981c"). Under old Perl versions 584(before 5.7) you get a "?" in place of characters whose Unicode 585value is over 255. 586 587=item $Name2character_number{I<name>} 588 589Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol" 590to the Unicode value that each stands for. For example, 591C<$Name2character_number{'eacute'}> is 201, and 592C<$Name2character_number{'eacute'}> is 8364. You get the correct 593Unicode value, regardless of the version of Perl you're using -- 594which differs from C<%Name2character>'s behavior under pre-5.7 Perls. 595 596Note that this hash does not 597include numerics (like "64" or "x981c"). 598 599=item $Latin1Code_to_fallback{I<integer>} 600 601For numbers in the range 160 (0x00A0) to 255 (0x00FF), this maps 602from the character code for a Latin-1 character (like 233 for 603lowercase e-acute) to the US-ASCII character that best aproximates 604it (like "e"). You may find this useful if you are rendering 605POD in a format that you think deals well only with US-ASCII 606characters. 607 608=item $Latin1Char_to_fallback{I<character>} 609 610Just as above, but maps from characters (like "\xE9", 611lowercase e-acute) to characters (like "e"). 612 613=item $Code2USASCII{I<integer>} 614 615This maps from US-ASCII codes (like 32) to the corresponding 616character (like space, for 32). Only characters 32 to 126 are 617defined. This is meant for use by C<e2char($x)> when it senses 618that it's running on a non-ASCII platform (where chr(32) doesn't 619get you a space -- but $Code2USASCII{32} will). It's 620documented here just in case you might find it useful. 621 622=back 623 624=head1 CAVEATS 625 626On Perl versions before 5.7, Unicode characters with a value 627over 255 (like lambda or emdash) can't be conveyed. This 628module does work under such early Perl versions, but in the 629place of each such character, you get a "?". Latin-1 630characters (characters 160-255) are unaffected. 631 632Under EBCDIC platforms, C<e2char($n)> may not always be the 633same as C<chr(e2charnum($n))>, and ditto for 634C<$Name2character{$name}> and 635C<chr($Name2character_number{$name})>, because the strings are returned as 636native, and the numbers are returned as Unicode. 637However, for Perls starting with v5.8, C<e2char($n)> is the same as 638C<chr(utf8::unicode_to_native(e2charnum($n)))>, and ditto for 639C<$Name2character{$name}> and 640C<chr(utf8::unicode_to_native($Name2character_number{$name}))>. 641 642=head1 SEE ALSO 643 644L<Pod::Browser> - a pod web server based on L<Catalyst>. 645 646L<Pod::Checker> - check pod documents for syntax errors. 647 648L<Pod::Coverage> - check if the documentation for a module is comprehensive. 649 650L<perlpod> - description of pod format (for people documenting with pod). 651 652L<perlpodspec> - specification of pod format (for people processing it). 653 654L<Text::Unidecode> - ASCII transliteration of Unicode text. 655 656=head1 REPOSITORY 657 658L<https://github.com/neilbowers/Pod-Escapes> 659 660=head1 COPYRIGHT AND DISCLAIMERS 661 662Copyright (c) 2001-2004 Sean M. Burke. All rights reserved. 663 664This library is free software; you can redistribute it and/or modify 665it under the same terms as Perl itself. 666 667This program is distributed in the hope that it will be useful, but 668without any warranty; without even the implied warranty of 669merchantability or fitness for a particular purpose. 670 671Portions of the data tables in this module are derived from the 672entity declarations in the W3C XHTML specification. 673 674Currently (October 2001), that's these three: 675 676 http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent 677 http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent 678 http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent 679 680=head1 AUTHOR 681 682Sean M. Burke C<sburke@cpan.org> 683 684Now being maintained by Neil Bowers E<lt>neilb@cpan.orgE<gt> 685 686=cut 687 688#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 689# What I used for reading the XHTML .ent files: 690 691my(@norms, @good, @bad); 692my $dir = 'c:/sgml/docbook/'; 693my %escapes; 694foreach my $file (qw( 695 xhtml-symbol.ent 696 xhtml-lat1.ent 697 xhtml-special.ent 698)) { 699 open(IN, "<$dir$file") or die "can't read-open $dir$file: $!"; 700 print "Reading $file...\n"; 701 while(<IN>) { 702 if(m/<!ENTITY\s+(\S+)\s+"&#([^;]+);">/) { 703 my($name, $value) = ($1,$2); 704 next if $name eq 'quot' or $name eq 'apos' or $name eq 'gt'; 705 706 $value = hex $1 if $value =~ m/^x([a-fA-F0-9]+)$/s; 707 print "ILLEGAL VALUE $value" unless $value =~ m/^\d+$/s; 708 if($value > 255) { 709 push @good , sprintf " %-10s , chr(%s),\n", "'$name'", $value; 710 push @bad , sprintf " %-10s , \$bad,\n", "'$name'", $value; 711 } else { 712 push @norms, sprintf " %-10s , chr(%s),\n", "'$name'", $value; 713 } 714 } elsif(m/<!ENT/) { 715 print "# Skipping $_"; 716 } 717 718 } 719 close(IN); 720} 721 722print @norms; 723print "\n ( \$] .= 5.006001 ? (\n"; 724print @good; 725print " ) : (\n"; 726print @bad; 727print " )\n);\n"; 728 729__END__ 730#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 731 732 733