1package Pod::Escapes;
2use strict;
3use warnings;
4use 5.006;
5
6use vars qw(
7  %Code2USASCII
8  %Name2character
9  %Name2character_number
10  %Latin1Code_to_fallback
11  %Latin1Char_to_fallback
12  $FAR_CHAR
13  $FAR_CHAR_NUMBER
14  $NOT_ASCII
15  @ISA $VERSION @EXPORT_OK %EXPORT_TAGS
16);
17
18require Exporter;
19@ISA = ('Exporter');
20$VERSION = '1.06';
21@EXPORT_OK = qw(
22  %Code2USASCII
23  %Name2character
24  %Name2character_number
25  %Latin1Code_to_fallback
26  %Latin1Char_to_fallback
27  e2char
28  e2charnum
29);
30%EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
31
32#==========================================================================
33
34$FAR_CHAR = "?" unless defined $FAR_CHAR;
35$FAR_CHAR_NUMBER = ord($FAR_CHAR) unless defined $FAR_CHAR_NUMBER;
36
37$NOT_ASCII = 'A' ne chr(65) unless defined $NOT_ASCII;
38
39#--------------------------------------------------------------------------
40sub e2char {
41  my $in = $_[0];
42  return undef unless defined $in and length $in;
43
44  # Convert to decimal:
45  if($in =~ m/^(0[0-7]*)$/s ) {
46    $in = oct $in;
47  } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) {
48    $in = hex $1;
49  } # else it's decimal, or named
50
51  if($NOT_ASCII) {
52    # We're in bizarro world of not-ASCII!
53    # Cope with US-ASCII codes, use fallbacks for Latin-1, or use FAR_CHAR.
54    unless($in =~ m/^\d+$/s) {
55      # It's a named character reference.  Get its numeric Unicode value.
56      $in = $Name2character{$in};
57      return undef unless defined $in;  # (if there's no such name)
58      $in = ord $in; # (All ents must be one character long.)
59        # ...So $in holds the char's US-ASCII numeric value, which we'll
60        #  now go get the local equivalent for.
61    }
62
63    # It's numeric, whether by origin or by mutation from a known name
64    return $Code2USASCII{$in} # so "65" => "A" everywhere
65        || $Latin1Code_to_fallback{$in} # Fallback.
66        || $FAR_CHAR; # Fall further back
67  }
68
69  # Normal handling:
70  if($in =~ m/^\d+$/s) {
71    if($] < 5.007  and  $in > 255) { # can't be trusted with Unicode
72      return $FAR_CHAR;
73    } else {
74      return chr($in);
75    }
76  } else {
77    return $Name2character{$in}; # returns undef if unknown
78  }
79}
80
81#--------------------------------------------------------------------------
82sub e2charnum {
83  my $in = $_[0];
84  return undef unless defined $in and length $in;
85
86  # Convert to decimal:
87  if($in =~ m/^(0[0-7]*)$/s ) {
88    $in = oct $in;
89  } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) {
90    $in = hex $1;
91  } # else it's decimal, or named
92
93  if($in =~ m/^[0-9]+$/s) {
94    return 0 + $in;
95  } else {
96    return $Name2character_number{$in}; # returns undef if unknown
97  }
98}
99
100#--------------------------------------------------------------------------
101
102%Name2character_number = (
103 # General XML/XHTML:
104 'lt'   => 60,
105 'gt'   => 62,
106 'quot' => 34,
107 'amp'  => 38,
108 'apos' => 39,
109
110 # POD-specific:
111 'sol'    => 47,
112 'verbar' => 124,
113
114 'lchevron' => 171, # legacy for laquo
115 'rchevron' => 187, # legacy for raquo
116
117 # Remember, grave looks like \ (as in virtu\)
118 #           acute looks like / (as in re/sume/)
119 #           circumflex looks like ^ (as in papier ma^che/)
120 #           umlaut/dieresis looks like " (as in nai"ve, Chloe")
121
122 # From the XHTML 1 .ent files:
123 'nbsp'     , 160,
124 'iexcl'    , 161,
125 'cent'     , 162,
126 'pound'    , 163,
127 'curren'   , 164,
128 'yen'      , 165,
129 'brvbar'   , 166,
130 'sect'     , 167,
131 'uml'      , 168,
132 'copy'     , 169,
133 'ordf'     , 170,
134 'laquo'    , 171,
135 'not'      , 172,
136 'shy'      , 173,
137 'reg'      , 174,
138 'macr'     , 175,
139 'deg'      , 176,
140 'plusmn'   , 177,
141 'sup2'     , 178,
142 'sup3'     , 179,
143 'acute'    , 180,
144 'micro'    , 181,
145 'para'     , 182,
146 'middot'   , 183,
147 'cedil'    , 184,
148 'sup1'     , 185,
149 'ordm'     , 186,
150 'raquo'    , 187,
151 'frac14'   , 188,
152 'frac12'   , 189,
153 'frac34'   , 190,
154 'iquest'   , 191,
155 'Agrave'   , 192,
156 'Aacute'   , 193,
157 'Acirc'    , 194,
158 'Atilde'   , 195,
159 'Auml'     , 196,
160 'Aring'    , 197,
161 'AElig'    , 198,
162 'Ccedil'   , 199,
163 'Egrave'   , 200,
164 'Eacute'   , 201,
165 'Ecirc'    , 202,
166 'Euml'     , 203,
167 'Igrave'   , 204,
168 'Iacute'   , 205,
169 'Icirc'    , 206,
170 'Iuml'     , 207,
171 'ETH'      , 208,
172 'Ntilde'   , 209,
173 'Ograve'   , 210,
174 'Oacute'   , 211,
175 'Ocirc'    , 212,
176 'Otilde'   , 213,
177 'Ouml'     , 214,
178 'times'    , 215,
179 'Oslash'   , 216,
180 'Ugrave'   , 217,
181 'Uacute'   , 218,
182 'Ucirc'    , 219,
183 'Uuml'     , 220,
184 'Yacute'   , 221,
185 'THORN'    , 222,
186 'szlig'    , 223,
187 'agrave'   , 224,
188 'aacute'   , 225,
189 'acirc'    , 226,
190 'atilde'   , 227,
191 'auml'     , 228,
192 'aring'    , 229,
193 'aelig'    , 230,
194 'ccedil'   , 231,
195 'egrave'   , 232,
196 'eacute'   , 233,
197 'ecirc'    , 234,
198 'euml'     , 235,
199 'igrave'   , 236,
200 'iacute'   , 237,
201 'icirc'    , 238,
202 'iuml'     , 239,
203 'eth'      , 240,
204 'ntilde'   , 241,
205 'ograve'   , 242,
206 'oacute'   , 243,
207 'ocirc'    , 244,
208 'otilde'   , 245,
209 'ouml'     , 246,
210 'divide'   , 247,
211 'oslash'   , 248,
212 'ugrave'   , 249,
213 'uacute'   , 250,
214 'ucirc'    , 251,
215 'uuml'     , 252,
216 'yacute'   , 253,
217 'thorn'    , 254,
218 'yuml'     , 255,
219
220 'fnof'     , 402,
221 'Alpha'    , 913,
222 'Beta'     , 914,
223 'Gamma'    , 915,
224 'Delta'    , 916,
225 'Epsilon'  , 917,
226 'Zeta'     , 918,
227 'Eta'      , 919,
228 'Theta'    , 920,
229 'Iota'     , 921,
230 'Kappa'    , 922,
231 'Lambda'   , 923,
232 'Mu'       , 924,
233 'Nu'       , 925,
234 'Xi'       , 926,
235 'Omicron'  , 927,
236 'Pi'       , 928,
237 'Rho'      , 929,
238 'Sigma'    , 931,
239 'Tau'      , 932,
240 'Upsilon'  , 933,
241 'Phi'      , 934,
242 'Chi'      , 935,
243 'Psi'      , 936,
244 'Omega'    , 937,
245 'alpha'    , 945,
246 'beta'     , 946,
247 'gamma'    , 947,
248 'delta'    , 948,
249 'epsilon'  , 949,
250 'zeta'     , 950,
251 'eta'      , 951,
252 'theta'    , 952,
253 'iota'     , 953,
254 'kappa'    , 954,
255 'lambda'   , 955,
256 'mu'       , 956,
257 'nu'       , 957,
258 'xi'       , 958,
259 'omicron'  , 959,
260 'pi'       , 960,
261 'rho'      , 961,
262 'sigmaf'   , 962,
263 'sigma'    , 963,
264 'tau'      , 964,
265 'upsilon'  , 965,
266 'phi'      , 966,
267 'chi'      , 967,
268 'psi'      , 968,
269 'omega'    , 969,
270 'thetasym' , 977,
271 'upsih'    , 978,
272 'piv'      , 982,
273 'bull'     , 8226,
274 'hellip'   , 8230,
275 'prime'    , 8242,
276 'Prime'    , 8243,
277 'oline'    , 8254,
278 'frasl'    , 8260,
279 'weierp'   , 8472,
280 'image'    , 8465,
281 'real'     , 8476,
282 'trade'    , 8482,
283 'alefsym'  , 8501,
284 'larr'     , 8592,
285 'uarr'     , 8593,
286 'rarr'     , 8594,
287 'darr'     , 8595,
288 'harr'     , 8596,
289 'crarr'    , 8629,
290 'lArr'     , 8656,
291 'uArr'     , 8657,
292 'rArr'     , 8658,
293 'dArr'     , 8659,
294 'hArr'     , 8660,
295 'forall'   , 8704,
296 'part'     , 8706,
297 'exist'    , 8707,
298 'empty'    , 8709,
299 'nabla'    , 8711,
300 'isin'     , 8712,
301 'notin'    , 8713,
302 'ni'       , 8715,
303 'prod'     , 8719,
304 'sum'      , 8721,
305 'minus'    , 8722,
306 'lowast'   , 8727,
307 'radic'    , 8730,
308 'prop'     , 8733,
309 'infin'    , 8734,
310 'ang'      , 8736,
311 'and'      , 8743,
312 'or'       , 8744,
313 'cap'      , 8745,
314 'cup'      , 8746,
315 'int'      , 8747,
316 'there4'   , 8756,
317 'sim'      , 8764,
318 'cong'     , 8773,
319 'asymp'    , 8776,
320 'ne'       , 8800,
321 'equiv'    , 8801,
322 'le'       , 8804,
323 'ge'       , 8805,
324 'sub'      , 8834,
325 'sup'      , 8835,
326 'nsub'     , 8836,
327 'sube'     , 8838,
328 'supe'     , 8839,
329 'oplus'    , 8853,
330 'otimes'   , 8855,
331 'perp'     , 8869,
332 'sdot'     , 8901,
333 'lceil'    , 8968,
334 'rceil'    , 8969,
335 'lfloor'   , 8970,
336 'rfloor'   , 8971,
337 'lang'     , 9001,
338 'rang'     , 9002,
339 'loz'      , 9674,
340 'spades'   , 9824,
341 'clubs'    , 9827,
342 'hearts'   , 9829,
343 'diams'    , 9830,
344 'OElig'    , 338,
345 'oelig'    , 339,
346 'Scaron'   , 352,
347 'scaron'   , 353,
348 'Yuml'     , 376,
349 'circ'     , 710,
350 'tilde'    , 732,
351 'ensp'     , 8194,
352 'emsp'     , 8195,
353 'thinsp'   , 8201,
354 'zwnj'     , 8204,
355 'zwj'      , 8205,
356 'lrm'      , 8206,
357 'rlm'      , 8207,
358 'ndash'    , 8211,
359 'mdash'    , 8212,
360 'lsquo'    , 8216,
361 'rsquo'    , 8217,
362 'sbquo'    , 8218,
363 'ldquo'    , 8220,
364 'rdquo'    , 8221,
365 'bdquo'    , 8222,
366 'dagger'   , 8224,
367 'Dagger'   , 8225,
368 'permil'   , 8240,
369 'lsaquo'   , 8249,
370 'rsaquo'   , 8250,
371 'euro'     , 8364,
372);
373
374
375# Fill out %Name2character...
376{
377  %Name2character = ();
378  my($name, $number);
379  while( ($name, $number) = each %Name2character_number) {
380    if($] < 5.007  and  $number > 255) {
381      $Name2character{$name} = $FAR_CHAR;
382      # substitute for Unicode characters, for perls
383      #  that can't reliable handle them
384    } else {
385      $Name2character{$name} = chr $number;
386      # normal case
387    }
388  }
389  # So they resolve 'right' even in EBCDIC-land
390  $Name2character{'lt'  }   = '<';
391  $Name2character{'gt'  }   = '>';
392  $Name2character{'quot'}   = '"';
393  $Name2character{'amp' }   = '&';
394  $Name2character{'apos'}   = "'";
395  $Name2character{'sol' }   = '/';
396  $Name2character{'verbar'} = '|';
397}
398
399#--------------------------------------------------------------------------
400
401%Code2USASCII = (
402# mostly generated by
403#  perl -e "printf qq{  \x25 3s, '\x25s',\n}, $_, chr($_) foreach (32 .. 126)"
404   32, ' ',
405   33, '!',
406   34, '"',
407   35, '#',
408   36, '$',
409   37, '%',
410   38, '&',
411   39, "'", #!
412   40, '(',
413   41, ')',
414   42, '*',
415   43, '+',
416   44, ',',
417   45, '-',
418   46, '.',
419   47, '/',
420   48, '0',
421   49, '1',
422   50, '2',
423   51, '3',
424   52, '4',
425   53, '5',
426   54, '6',
427   55, '7',
428   56, '8',
429   57, '9',
430   58, ':',
431   59, ';',
432   60, '<',
433   61, '=',
434   62, '>',
435   63, '?',
436   64, '@',
437   65, 'A',
438   66, 'B',
439   67, 'C',
440   68, 'D',
441   69, 'E',
442   70, 'F',
443   71, 'G',
444   72, 'H',
445   73, 'I',
446   74, 'J',
447   75, 'K',
448   76, 'L',
449   77, 'M',
450   78, 'N',
451   79, 'O',
452   80, 'P',
453   81, 'Q',
454   82, 'R',
455   83, 'S',
456   84, 'T',
457   85, 'U',
458   86, 'V',
459   87, 'W',
460   88, 'X',
461   89, 'Y',
462   90, 'Z',
463   91, '[',
464   92, "\\", #!
465   93, ']',
466   94, '^',
467   95, '_',
468   96, '`',
469   97, 'a',
470   98, 'b',
471   99, 'c',
472  100, 'd',
473  101, 'e',
474  102, 'f',
475  103, 'g',
476  104, 'h',
477  105, 'i',
478  106, 'j',
479  107, 'k',
480  108, 'l',
481  109, 'm',
482  110, 'n',
483  111, 'o',
484  112, 'p',
485  113, 'q',
486  114, 'r',
487  115, 's',
488  116, 't',
489  117, 'u',
490  118, 'v',
491  119, 'w',
492  120, 'x',
493  121, 'y',
494  122, 'z',
495  123, '{',
496  124, '|',
497  125, '}',
498  126, '~',
499);
500
501#--------------------------------------------------------------------------
502
503%Latin1Code_to_fallback = ();
504@Latin1Code_to_fallback{0xA0 .. 0xFF} = (
505# Copied from Text/Unidecode/x00.pm:
506
507' ', qq{!}, qq{C/}, 'PS', qq{\$?}, qq{Y=}, qq{|}, 'SS', qq{"}, qq{(c)}, 'a', qq{<<}, qq{!}, "", qq{(r)}, qq{-},
508'deg', qq{+-}, '2', '3', qq{'}, 'u', 'P', qq{*}, qq{,}, '1', 'o', qq{>>}, qq{1/4}, qq{1/2}, qq{3/4}, qq{?},
509'A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
510'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'U', 'Th', 'ss',
511'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
512'd', 'n', 'o', 'o', 'o', 'o', 'o', qq{/}, 'o', 'u', 'u', 'u', 'u', 'y', 'th', 'y',
513
514);
515
516{
517  # Now stuff %Latin1Char_to_fallback:
518  %Latin1Char_to_fallback = ();
519  my($k,$v);
520  while( ($k,$v) = each %Latin1Code_to_fallback) {
521    $Latin1Char_to_fallback{chr $k} = $v;
522    #print chr($k), ' => ', $v, "\n";
523  }
524}
525
526#--------------------------------------------------------------------------
5271;
528__END__
529
530=head1 NAME
531
532Pod::Escapes - for resolving Pod EE<lt>...E<gt> sequences
533
534=head1 SYNOPSIS
535
536  use Pod::Escapes qw(e2char);
537  ...la la la, parsing POD, la la la...
538  $text = e2char($e_node->label);
539  unless(defined $text) {
540    print "Unknown E sequence \"", $e_node->label, "\"!";
541  }
542  ...else print/interpolate $text...
543
544=head1 DESCRIPTION
545
546This module provides things that are useful in decoding
547Pod EE<lt>...E<gt> sequences.  Presumably, it should be used
548only by Pod parsers and/or formatters.
549
550By default, Pod::Escapes exports none of its symbols.  But
551you can request any of them to be exported.
552Either request them individually, as with
553C<use Pod::Escapes qw(symbolname symbolname2...);>,
554or you can do C<use Pod::Escapes qw(:ALL);> to get all
555exportable symbols.
556
557=head1 GOODIES
558
559=over
560
561=item e2char($e_content)
562
563Given a name or number that could appear in a
564C<EE<lt>name_or_numE<gt>> sequence, this returns the string that
565it stands for.  For example, C<e2char('sol')>, C<e2char('47')>,
566C<e2char('0x2F')>, and C<e2char('057')> all return "/",
567because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>,
568and C<EE<lt>057E<gt>>, all mean "/".  If
569the name has no known value (as with a name of "qacute") or is
570syntactically invalid (as with a name of "1/4"), this returns undef.
571
572=item e2charnum($e_content)
573
574Given a name or number that could appear in a
575C<EE<lt>name_or_numE<gt>> sequence, this returns the number of
576the Unicode character that this stands for.  For example,
577C<e2char('sol')>, C<e2char('47')>,
578C<e2char('0x2F')>, and C<e2char('057')> all return 47,
579because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>,
580and C<EE<lt>057E<gt>>, all mean "/", whose Unicode number is 47.  If
581the name has no known value (as with a name of "qacute") or is
582syntactically invalid (as with a name of "1/4"), this returns undef.
583
584=item $Name2character{I<name>}
585
586Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol"
587to the string that each stands for.  Note that this does not
588include numerics (like "64" or "x981c").  Under old Perl versions
589(before 5.7) you get a "?" in place of characters whose Unicode
590value is over 255.
591
592=item $Name2character_number{I<name>}
593
594Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol"
595to the Unicode value that each stands for.  For example,
596C<$Name2character_number{'eacute'}> is 201, and
597C<$Name2character_number{'eacute'}> is 8364.  You get the correct
598Unicode value, regardless of the version of Perl you're using --
599which differs from C<%Name2character>'s behavior under pre-5.7 Perls.
600
601Note that this hash does not
602include numerics (like "64" or "x981c").
603
604=item $Latin1Code_to_fallback{I<integer>}
605
606For numbers in the range 160 (0x00A0) to 255 (0x00FF), this maps
607from the character code for a Latin-1 character (like 233 for
608lowercase e-acute) to the US-ASCII character that best aproximates
609it (like "e").  You may find this useful if you are rendering
610POD in a format that you think deals well only with US-ASCII
611characters.
612
613=item $Latin1Char_to_fallback{I<character>}
614
615Just as above, but maps from characters (like "\xE9",
616lowercase e-acute) to characters (like "e").
617
618=item $Code2USASCII{I<integer>}
619
620This maps from US-ASCII codes (like 32) to the corresponding
621character (like space, for 32).  Only characters 32 to 126 are
622defined.  This is meant for use by C<e2char($x)> when it senses
623that it's running on a non-ASCII platform (where chr(32) doesn't
624get you a space -- but $Code2USASCII{32} will).  It's
625documented here just in case you might find it useful.
626
627=back
628
629=head1 CAVEATS
630
631On Perl versions before 5.7, Unicode characters with a value
632over 255 (like lambda or emdash) can't be conveyed.  This
633module does work under such early Perl versions, but in the
634place of each such character, you get a "?".  Latin-1
635characters (characters 160-255) are unaffected.
636
637Under EBCDIC platforms, C<e2char($n)> may not always be the
638same as C<chr(e2charnum($n))>, and ditto for
639C<$Name2character{$name}> and
640C<chr($Name2character_number{$name})>.
641
642=head1 SEE ALSO
643
644L<Pod::Browser> - a pod web server based on L<Catalyst>.
645
646L<Pod::Checker> - check pod documents for syntax errors.
647
648L<Pod::Coverage> - check if the documentation for a module is comprehensive.
649
650L<perlpod> - description of pod format (for people documenting with pod).
651
652L<perlpodspec> - specification of pod format (for people processing it).
653
654L<Text::Unidecode> - ASCII transliteration of Unicode text.
655
656=head1 REPOSITORY
657
658L<https://github.com/neilbowers/Pod-Escapes>
659
660=head1 COPYRIGHT AND DISCLAIMERS
661
662Copyright (c) 2001-2004 Sean M. Burke.  All rights reserved.
663
664This library is free software; you can redistribute it and/or modify
665it under the same terms as Perl itself.
666
667This program is distributed in the hope that it will be useful, but
668without any warranty; without even the implied warranty of
669merchantability or fitness for a particular purpose.
670
671Portions of the data tables in this module are derived from the
672entity declarations in the W3C XHTML specification.
673
674Currently (October 2001), that's these three:
675
676 http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent
677 http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent
678 http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent
679
680=head1 AUTHOR
681
682Sean M. Burke C<sburke@cpan.org>
683
684Now being maintained by Neil Bowers E<lt>neilb@cpan.orgE<gt>
685
686=cut
687
688#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
689# What I used for reading the XHTML .ent files:
690
691my(@norms, @good, @bad);
692my $dir = 'c:/sgml/docbook/';
693my %escapes;
694foreach my $file (qw(
695  xhtml-symbol.ent
696  xhtml-lat1.ent
697  xhtml-special.ent
698)) {
699  open(IN, "<$dir$file") or die "can't read-open $dir$file: $!";
700  print "Reading $file...\n";
701  while(<IN>) {
702    if(m/<!ENTITY\s+(\S+)\s+"&#([^;]+);">/) {
703      my($name, $value) = ($1,$2);
704      next if $name eq 'quot' or $name eq 'apos' or $name eq 'gt';
705
706      $value = hex $1 if $value =~ m/^x([a-fA-F0-9]+)$/s;
707      print "ILLEGAL VALUE $value" unless $value =~ m/^\d+$/s;
708      if($value > 255) {
709        push @good , sprintf "   %-10s , chr(%s),\n", "'$name'", $value;
710        push @bad  , sprintf "   %-10s , \$bad,\n", "'$name'", $value;
711      } else {
712        push @norms, sprintf " %-10s , chr(%s),\n", "'$name'", $value;
713      }
714    } elsif(m/<!ENT/) {
715      print "# Skipping $_";
716    }
717
718  }
719  close(IN);
720}
721
722print @norms;
723print "\n ( \$] .= 5.006001 ? (\n";
724print @good;
725print " ) : (\n";
726print @bad;
727print " )\n);\n";
728
729__END__
730#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
731
732
733