1package Pod::Escapes; 2use strict; 3use warnings; 4use 5.006; 5 6use vars qw( 7 %Code2USASCII 8 %Name2character 9 %Name2character_number 10 %Latin1Code_to_fallback 11 %Latin1Char_to_fallback 12 $FAR_CHAR 13 $FAR_CHAR_NUMBER 14 $NOT_ASCII 15 @ISA $VERSION @EXPORT_OK %EXPORT_TAGS 16); 17 18require Exporter; 19@ISA = ('Exporter'); 20$VERSION = '1.06'; 21@EXPORT_OK = qw( 22 %Code2USASCII 23 %Name2character 24 %Name2character_number 25 %Latin1Code_to_fallback 26 %Latin1Char_to_fallback 27 e2char 28 e2charnum 29); 30%EXPORT_TAGS = ('ALL' => \@EXPORT_OK); 31 32#========================================================================== 33 34$FAR_CHAR = "?" unless defined $FAR_CHAR; 35$FAR_CHAR_NUMBER = ord($FAR_CHAR) unless defined $FAR_CHAR_NUMBER; 36 37$NOT_ASCII = 'A' ne chr(65) unless defined $NOT_ASCII; 38 39#-------------------------------------------------------------------------- 40sub e2char { 41 my $in = $_[0]; 42 return undef unless defined $in and length $in; 43 44 # Convert to decimal: 45 if($in =~ m/^(0[0-7]*)$/s ) { 46 $in = oct $in; 47 } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) { 48 $in = hex $1; 49 } # else it's decimal, or named 50 51 if($NOT_ASCII) { 52 # We're in bizarro world of not-ASCII! 53 # Cope with US-ASCII codes, use fallbacks for Latin-1, or use FAR_CHAR. 54 unless($in =~ m/^\d+$/s) { 55 # It's a named character reference. Get its numeric Unicode value. 56 $in = $Name2character{$in}; 57 return undef unless defined $in; # (if there's no such name) 58 $in = ord $in; # (All ents must be one character long.) 59 # ...So $in holds the char's US-ASCII numeric value, which we'll 60 # now go get the local equivalent for. 61 } 62 63 # It's numeric, whether by origin or by mutation from a known name 64 return $Code2USASCII{$in} # so "65" => "A" everywhere 65 || $Latin1Code_to_fallback{$in} # Fallback. 66 || $FAR_CHAR; # Fall further back 67 } 68 69 # Normal handling: 70 if($in =~ m/^\d+$/s) { 71 if($] < 5.007 and $in > 255) { # can't be trusted with Unicode 72 return $FAR_CHAR; 73 } else { 74 return chr($in); 75 } 76 } else { 77 return $Name2character{$in}; # returns undef if unknown 78 } 79} 80 81#-------------------------------------------------------------------------- 82sub e2charnum { 83 my $in = $_[0]; 84 return undef unless defined $in and length $in; 85 86 # Convert to decimal: 87 if($in =~ m/^(0[0-7]*)$/s ) { 88 $in = oct $in; 89 } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) { 90 $in = hex $1; 91 } # else it's decimal, or named 92 93 if($in =~ m/^[0-9]+$/s) { 94 return 0 + $in; 95 } else { 96 return $Name2character_number{$in}; # returns undef if unknown 97 } 98} 99 100#-------------------------------------------------------------------------- 101 102%Name2character_number = ( 103 # General XML/XHTML: 104 'lt' => 60, 105 'gt' => 62, 106 'quot' => 34, 107 'amp' => 38, 108 'apos' => 39, 109 110 # POD-specific: 111 'sol' => 47, 112 'verbar' => 124, 113 114 'lchevron' => 171, # legacy for laquo 115 'rchevron' => 187, # legacy for raquo 116 117 # Remember, grave looks like \ (as in virtu\) 118 # acute looks like / (as in re/sume/) 119 # circumflex looks like ^ (as in papier ma^che/) 120 # umlaut/dieresis looks like " (as in nai"ve, Chloe") 121 122 # From the XHTML 1 .ent files: 123 'nbsp' , 160, 124 'iexcl' , 161, 125 'cent' , 162, 126 'pound' , 163, 127 'curren' , 164, 128 'yen' , 165, 129 'brvbar' , 166, 130 'sect' , 167, 131 'uml' , 168, 132 'copy' , 169, 133 'ordf' , 170, 134 'laquo' , 171, 135 'not' , 172, 136 'shy' , 173, 137 'reg' , 174, 138 'macr' , 175, 139 'deg' , 176, 140 'plusmn' , 177, 141 'sup2' , 178, 142 'sup3' , 179, 143 'acute' , 180, 144 'micro' , 181, 145 'para' , 182, 146 'middot' , 183, 147 'cedil' , 184, 148 'sup1' , 185, 149 'ordm' , 186, 150 'raquo' , 187, 151 'frac14' , 188, 152 'frac12' , 189, 153 'frac34' , 190, 154 'iquest' , 191, 155 'Agrave' , 192, 156 'Aacute' , 193, 157 'Acirc' , 194, 158 'Atilde' , 195, 159 'Auml' , 196, 160 'Aring' , 197, 161 'AElig' , 198, 162 'Ccedil' , 199, 163 'Egrave' , 200, 164 'Eacute' , 201, 165 'Ecirc' , 202, 166 'Euml' , 203, 167 'Igrave' , 204, 168 'Iacute' , 205, 169 'Icirc' , 206, 170 'Iuml' , 207, 171 'ETH' , 208, 172 'Ntilde' , 209, 173 'Ograve' , 210, 174 'Oacute' , 211, 175 'Ocirc' , 212, 176 'Otilde' , 213, 177 'Ouml' , 214, 178 'times' , 215, 179 'Oslash' , 216, 180 'Ugrave' , 217, 181 'Uacute' , 218, 182 'Ucirc' , 219, 183 'Uuml' , 220, 184 'Yacute' , 221, 185 'THORN' , 222, 186 'szlig' , 223, 187 'agrave' , 224, 188 'aacute' , 225, 189 'acirc' , 226, 190 'atilde' , 227, 191 'auml' , 228, 192 'aring' , 229, 193 'aelig' , 230, 194 'ccedil' , 231, 195 'egrave' , 232, 196 'eacute' , 233, 197 'ecirc' , 234, 198 'euml' , 235, 199 'igrave' , 236, 200 'iacute' , 237, 201 'icirc' , 238, 202 'iuml' , 239, 203 'eth' , 240, 204 'ntilde' , 241, 205 'ograve' , 242, 206 'oacute' , 243, 207 'ocirc' , 244, 208 'otilde' , 245, 209 'ouml' , 246, 210 'divide' , 247, 211 'oslash' , 248, 212 'ugrave' , 249, 213 'uacute' , 250, 214 'ucirc' , 251, 215 'uuml' , 252, 216 'yacute' , 253, 217 'thorn' , 254, 218 'yuml' , 255, 219 220 'fnof' , 402, 221 'Alpha' , 913, 222 'Beta' , 914, 223 'Gamma' , 915, 224 'Delta' , 916, 225 'Epsilon' , 917, 226 'Zeta' , 918, 227 'Eta' , 919, 228 'Theta' , 920, 229 'Iota' , 921, 230 'Kappa' , 922, 231 'Lambda' , 923, 232 'Mu' , 924, 233 'Nu' , 925, 234 'Xi' , 926, 235 'Omicron' , 927, 236 'Pi' , 928, 237 'Rho' , 929, 238 'Sigma' , 931, 239 'Tau' , 932, 240 'Upsilon' , 933, 241 'Phi' , 934, 242 'Chi' , 935, 243 'Psi' , 936, 244 'Omega' , 937, 245 'alpha' , 945, 246 'beta' , 946, 247 'gamma' , 947, 248 'delta' , 948, 249 'epsilon' , 949, 250 'zeta' , 950, 251 'eta' , 951, 252 'theta' , 952, 253 'iota' , 953, 254 'kappa' , 954, 255 'lambda' , 955, 256 'mu' , 956, 257 'nu' , 957, 258 'xi' , 958, 259 'omicron' , 959, 260 'pi' , 960, 261 'rho' , 961, 262 'sigmaf' , 962, 263 'sigma' , 963, 264 'tau' , 964, 265 'upsilon' , 965, 266 'phi' , 966, 267 'chi' , 967, 268 'psi' , 968, 269 'omega' , 969, 270 'thetasym' , 977, 271 'upsih' , 978, 272 'piv' , 982, 273 'bull' , 8226, 274 'hellip' , 8230, 275 'prime' , 8242, 276 'Prime' , 8243, 277 'oline' , 8254, 278 'frasl' , 8260, 279 'weierp' , 8472, 280 'image' , 8465, 281 'real' , 8476, 282 'trade' , 8482, 283 'alefsym' , 8501, 284 'larr' , 8592, 285 'uarr' , 8593, 286 'rarr' , 8594, 287 'darr' , 8595, 288 'harr' , 8596, 289 'crarr' , 8629, 290 'lArr' , 8656, 291 'uArr' , 8657, 292 'rArr' , 8658, 293 'dArr' , 8659, 294 'hArr' , 8660, 295 'forall' , 8704, 296 'part' , 8706, 297 'exist' , 8707, 298 'empty' , 8709, 299 'nabla' , 8711, 300 'isin' , 8712, 301 'notin' , 8713, 302 'ni' , 8715, 303 'prod' , 8719, 304 'sum' , 8721, 305 'minus' , 8722, 306 'lowast' , 8727, 307 'radic' , 8730, 308 'prop' , 8733, 309 'infin' , 8734, 310 'ang' , 8736, 311 'and' , 8743, 312 'or' , 8744, 313 'cap' , 8745, 314 'cup' , 8746, 315 'int' , 8747, 316 'there4' , 8756, 317 'sim' , 8764, 318 'cong' , 8773, 319 'asymp' , 8776, 320 'ne' , 8800, 321 'equiv' , 8801, 322 'le' , 8804, 323 'ge' , 8805, 324 'sub' , 8834, 325 'sup' , 8835, 326 'nsub' , 8836, 327 'sube' , 8838, 328 'supe' , 8839, 329 'oplus' , 8853, 330 'otimes' , 8855, 331 'perp' , 8869, 332 'sdot' , 8901, 333 'lceil' , 8968, 334 'rceil' , 8969, 335 'lfloor' , 8970, 336 'rfloor' , 8971, 337 'lang' , 9001, 338 'rang' , 9002, 339 'loz' , 9674, 340 'spades' , 9824, 341 'clubs' , 9827, 342 'hearts' , 9829, 343 'diams' , 9830, 344 'OElig' , 338, 345 'oelig' , 339, 346 'Scaron' , 352, 347 'scaron' , 353, 348 'Yuml' , 376, 349 'circ' , 710, 350 'tilde' , 732, 351 'ensp' , 8194, 352 'emsp' , 8195, 353 'thinsp' , 8201, 354 'zwnj' , 8204, 355 'zwj' , 8205, 356 'lrm' , 8206, 357 'rlm' , 8207, 358 'ndash' , 8211, 359 'mdash' , 8212, 360 'lsquo' , 8216, 361 'rsquo' , 8217, 362 'sbquo' , 8218, 363 'ldquo' , 8220, 364 'rdquo' , 8221, 365 'bdquo' , 8222, 366 'dagger' , 8224, 367 'Dagger' , 8225, 368 'permil' , 8240, 369 'lsaquo' , 8249, 370 'rsaquo' , 8250, 371 'euro' , 8364, 372); 373 374 375# Fill out %Name2character... 376{ 377 %Name2character = (); 378 my($name, $number); 379 while( ($name, $number) = each %Name2character_number) { 380 if($] < 5.007 and $number > 255) { 381 $Name2character{$name} = $FAR_CHAR; 382 # substitute for Unicode characters, for perls 383 # that can't reliable handle them 384 } else { 385 $Name2character{$name} = chr $number; 386 # normal case 387 } 388 } 389 # So they resolve 'right' even in EBCDIC-land 390 $Name2character{'lt' } = '<'; 391 $Name2character{'gt' } = '>'; 392 $Name2character{'quot'} = '"'; 393 $Name2character{'amp' } = '&'; 394 $Name2character{'apos'} = "'"; 395 $Name2character{'sol' } = '/'; 396 $Name2character{'verbar'} = '|'; 397} 398 399#-------------------------------------------------------------------------- 400 401%Code2USASCII = ( 402# mostly generated by 403# perl -e "printf qq{ \x25 3s, '\x25s',\n}, $_, chr($_) foreach (32 .. 126)" 404 32, ' ', 405 33, '!', 406 34, '"', 407 35, '#', 408 36, '$', 409 37, '%', 410 38, '&', 411 39, "'", #! 412 40, '(', 413 41, ')', 414 42, '*', 415 43, '+', 416 44, ',', 417 45, '-', 418 46, '.', 419 47, '/', 420 48, '0', 421 49, '1', 422 50, '2', 423 51, '3', 424 52, '4', 425 53, '5', 426 54, '6', 427 55, '7', 428 56, '8', 429 57, '9', 430 58, ':', 431 59, ';', 432 60, '<', 433 61, '=', 434 62, '>', 435 63, '?', 436 64, '@', 437 65, 'A', 438 66, 'B', 439 67, 'C', 440 68, 'D', 441 69, 'E', 442 70, 'F', 443 71, 'G', 444 72, 'H', 445 73, 'I', 446 74, 'J', 447 75, 'K', 448 76, 'L', 449 77, 'M', 450 78, 'N', 451 79, 'O', 452 80, 'P', 453 81, 'Q', 454 82, 'R', 455 83, 'S', 456 84, 'T', 457 85, 'U', 458 86, 'V', 459 87, 'W', 460 88, 'X', 461 89, 'Y', 462 90, 'Z', 463 91, '[', 464 92, "\\", #! 465 93, ']', 466 94, '^', 467 95, '_', 468 96, '`', 469 97, 'a', 470 98, 'b', 471 99, 'c', 472 100, 'd', 473 101, 'e', 474 102, 'f', 475 103, 'g', 476 104, 'h', 477 105, 'i', 478 106, 'j', 479 107, 'k', 480 108, 'l', 481 109, 'm', 482 110, 'n', 483 111, 'o', 484 112, 'p', 485 113, 'q', 486 114, 'r', 487 115, 's', 488 116, 't', 489 117, 'u', 490 118, 'v', 491 119, 'w', 492 120, 'x', 493 121, 'y', 494 122, 'z', 495 123, '{', 496 124, '|', 497 125, '}', 498 126, '~', 499); 500 501#-------------------------------------------------------------------------- 502 503%Latin1Code_to_fallback = (); 504@Latin1Code_to_fallback{0xA0 .. 0xFF} = ( 505# Copied from Text/Unidecode/x00.pm: 506 507' ', qq{!}, qq{C/}, 'PS', qq{\$?}, qq{Y=}, qq{|}, 'SS', qq{"}, qq{(c)}, 'a', qq{<<}, qq{!}, "", qq{(r)}, qq{-}, 508'deg', qq{+-}, '2', '3', qq{'}, 'u', 'P', qq{*}, qq{,}, '1', 'o', qq{>>}, qq{1/4}, qq{1/2}, qq{3/4}, qq{?}, 509'A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', 510'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'U', 'Th', 'ss', 511'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', 512'd', 'n', 'o', 'o', 'o', 'o', 'o', qq{/}, 'o', 'u', 'u', 'u', 'u', 'y', 'th', 'y', 513 514); 515 516{ 517 # Now stuff %Latin1Char_to_fallback: 518 %Latin1Char_to_fallback = (); 519 my($k,$v); 520 while( ($k,$v) = each %Latin1Code_to_fallback) { 521 $Latin1Char_to_fallback{chr $k} = $v; 522 #print chr($k), ' => ', $v, "\n"; 523 } 524} 525 526#-------------------------------------------------------------------------- 5271; 528__END__ 529 530=head1 NAME 531 532Pod::Escapes - for resolving Pod EE<lt>...E<gt> sequences 533 534=head1 SYNOPSIS 535 536 use Pod::Escapes qw(e2char); 537 ...la la la, parsing POD, la la la... 538 $text = e2char($e_node->label); 539 unless(defined $text) { 540 print "Unknown E sequence \"", $e_node->label, "\"!"; 541 } 542 ...else print/interpolate $text... 543 544=head1 DESCRIPTION 545 546This module provides things that are useful in decoding 547Pod EE<lt>...E<gt> sequences. Presumably, it should be used 548only by Pod parsers and/or formatters. 549 550By default, Pod::Escapes exports none of its symbols. But 551you can request any of them to be exported. 552Either request them individually, as with 553C<use Pod::Escapes qw(symbolname symbolname2...);>, 554or you can do C<use Pod::Escapes qw(:ALL);> to get all 555exportable symbols. 556 557=head1 GOODIES 558 559=over 560 561=item e2char($e_content) 562 563Given a name or number that could appear in a 564C<EE<lt>name_or_numE<gt>> sequence, this returns the string that 565it stands for. For example, C<e2char('sol')>, C<e2char('47')>, 566C<e2char('0x2F')>, and C<e2char('057')> all return "/", 567because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>, 568and C<EE<lt>057E<gt>>, all mean "/". If 569the name has no known value (as with a name of "qacute") or is 570syntactically invalid (as with a name of "1/4"), this returns undef. 571 572=item e2charnum($e_content) 573 574Given a name or number that could appear in a 575C<EE<lt>name_or_numE<gt>> sequence, this returns the number of 576the Unicode character that this stands for. For example, 577C<e2char('sol')>, C<e2char('47')>, 578C<e2char('0x2F')>, and C<e2char('057')> all return 47, 579because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>, 580and C<EE<lt>057E<gt>>, all mean "/", whose Unicode number is 47. If 581the name has no known value (as with a name of "qacute") or is 582syntactically invalid (as with a name of "1/4"), this returns undef. 583 584=item $Name2character{I<name>} 585 586Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol" 587to the string that each stands for. Note that this does not 588include numerics (like "64" or "x981c"). Under old Perl versions 589(before 5.7) you get a "?" in place of characters whose Unicode 590value is over 255. 591 592=item $Name2character_number{I<name>} 593 594Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol" 595to the Unicode value that each stands for. For example, 596C<$Name2character_number{'eacute'}> is 201, and 597C<$Name2character_number{'eacute'}> is 8364. You get the correct 598Unicode value, regardless of the version of Perl you're using -- 599which differs from C<%Name2character>'s behavior under pre-5.7 Perls. 600 601Note that this hash does not 602include numerics (like "64" or "x981c"). 603 604=item $Latin1Code_to_fallback{I<integer>} 605 606For numbers in the range 160 (0x00A0) to 255 (0x00FF), this maps 607from the character code for a Latin-1 character (like 233 for 608lowercase e-acute) to the US-ASCII character that best aproximates 609it (like "e"). You may find this useful if you are rendering 610POD in a format that you think deals well only with US-ASCII 611characters. 612 613=item $Latin1Char_to_fallback{I<character>} 614 615Just as above, but maps from characters (like "\xE9", 616lowercase e-acute) to characters (like "e"). 617 618=item $Code2USASCII{I<integer>} 619 620This maps from US-ASCII codes (like 32) to the corresponding 621character (like space, for 32). Only characters 32 to 126 are 622defined. This is meant for use by C<e2char($x)> when it senses 623that it's running on a non-ASCII platform (where chr(32) doesn't 624get you a space -- but $Code2USASCII{32} will). It's 625documented here just in case you might find it useful. 626 627=back 628 629=head1 CAVEATS 630 631On Perl versions before 5.7, Unicode characters with a value 632over 255 (like lambda or emdash) can't be conveyed. This 633module does work under such early Perl versions, but in the 634place of each such character, you get a "?". Latin-1 635characters (characters 160-255) are unaffected. 636 637Under EBCDIC platforms, C<e2char($n)> may not always be the 638same as C<chr(e2charnum($n))>, and ditto for 639C<$Name2character{$name}> and 640C<chr($Name2character_number{$name})>. 641 642=head1 SEE ALSO 643 644L<Pod::Browser> - a pod web server based on L<Catalyst>. 645 646L<Pod::Checker> - check pod documents for syntax errors. 647 648L<Pod::Coverage> - check if the documentation for a module is comprehensive. 649 650L<perlpod> - description of pod format (for people documenting with pod). 651 652L<perlpodspec> - specification of pod format (for people processing it). 653 654L<Text::Unidecode> - ASCII transliteration of Unicode text. 655 656=head1 REPOSITORY 657 658L<https://github.com/neilbowers/Pod-Escapes> 659 660=head1 COPYRIGHT AND DISCLAIMERS 661 662Copyright (c) 2001-2004 Sean M. Burke. All rights reserved. 663 664This library is free software; you can redistribute it and/or modify 665it under the same terms as Perl itself. 666 667This program is distributed in the hope that it will be useful, but 668without any warranty; without even the implied warranty of 669merchantability or fitness for a particular purpose. 670 671Portions of the data tables in this module are derived from the 672entity declarations in the W3C XHTML specification. 673 674Currently (October 2001), that's these three: 675 676 http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent 677 http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent 678 http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent 679 680=head1 AUTHOR 681 682Sean M. Burke C<sburke@cpan.org> 683 684Now being maintained by Neil Bowers E<lt>neilb@cpan.orgE<gt> 685 686=cut 687 688#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 689# What I used for reading the XHTML .ent files: 690 691my(@norms, @good, @bad); 692my $dir = 'c:/sgml/docbook/'; 693my %escapes; 694foreach my $file (qw( 695 xhtml-symbol.ent 696 xhtml-lat1.ent 697 xhtml-special.ent 698)) { 699 open(IN, "<$dir$file") or die "can't read-open $dir$file: $!"; 700 print "Reading $file...\n"; 701 while(<IN>) { 702 if(m/<!ENTITY\s+(\S+)\s+"&#([^;]+);">/) { 703 my($name, $value) = ($1,$2); 704 next if $name eq 'quot' or $name eq 'apos' or $name eq 'gt'; 705 706 $value = hex $1 if $value =~ m/^x([a-fA-F0-9]+)$/s; 707 print "ILLEGAL VALUE $value" unless $value =~ m/^\d+$/s; 708 if($value > 255) { 709 push @good , sprintf " %-10s , chr(%s),\n", "'$name'", $value; 710 push @bad , sprintf " %-10s , \$bad,\n", "'$name'", $value; 711 } else { 712 push @norms, sprintf " %-10s , chr(%s),\n", "'$name'", $value; 713 } 714 } elsif(m/<!ENT/) { 715 print "# Skipping $_"; 716 } 717 718 } 719 close(IN); 720} 721 722print @norms; 723print "\n ( \$] .= 5.006001 ? (\n"; 724print @good; 725print " ) : (\n"; 726print @bad; 727print " )\n);\n"; 728 729__END__ 730#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 731 732 733