xref: /openbsd/gnu/usr.bin/perl/cpan/Encode/bin/enc2xs (revision d415bd75)
1#!./perl
2BEGIN {
3    # @INC poking  no longer needed w/ new MakeMaker and Makefile.PL's
4    # with $ENV{PERL_CORE} set
5    # In case we need it in future...
6    require Config; import Config;
7    pop @INC if $INC[-1] eq '.';
8}
9use strict;
10use warnings;
11use Getopt::Std;
12use Config;
13my @orig_ARGV = @ARGV;
14our $VERSION  = do { my @r = (q$Revision: 2.24 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
15
16# These may get re-ordered.
17# RAW is a do_now as inserted by &enter
18# AGG is an aggregated do_now, as built up by &process
19
20use constant {
21  RAW_NEXT => 0,
22  RAW_IN_LEN => 1,
23  RAW_OUT_BYTES => 2,
24  RAW_FALLBACK => 3,
25
26  AGG_MIN_IN => 0,
27  AGG_MAX_IN => 1,
28  AGG_OUT_BYTES => 2,
29  AGG_NEXT => 3,
30  AGG_IN_LEN => 4,
31  AGG_OUT_LEN => 5,
32  AGG_FALLBACK => 6,
33};
34
35# (See the algorithm in encengine.c - we're building structures for it)
36
37# There are two sorts of structures.
38# "do_now" (an array, two variants of what needs storing) is whatever we need
39# to do now we've read an input byte.
40# It's housed in a "do_next" (which is how we got to it), and in turn points
41# to a "do_next" which contains all the "do_now"s for the next input byte.
42
43# There will be a "do_next" which is the start state.
44# For a single byte encoding it's the only "do_next" - each "do_now" points
45# back to it, and each "do_now" will cause bytes. There is no state.
46
47# For a multi-byte encoding where all characters in the input are the same
48# length, then there will be a tree of "do_now"->"do_next"->"do_now"
49# branching out from the start state, one step for each input byte.
50# The leaf "do_now"s will all be at the same distance from the start state,
51# only the leaf "do_now"s cause output bytes, and they in turn point back to
52# the start state.
53
54# For an encoding where there are variable length input byte sequences, you
55# will encounter a leaf "do_now" sooner for the shorter input sequences, but
56# as before the leaves will point back to the start state.
57
58# The system will cope with escape encodings (imagine them as a mostly
59# self-contained tree for each escape state, and cross links between trees
60# at the state-switching characters) but so far no input format defines these.
61
62# The system will also cope with having output "leaves" in the middle of
63# the bifurcating branches, not just at the extremities, but again no
64# input format does this yet.
65
66# There are two variants of the "do_now" structure. The first, smaller variant
67# is generated by &enter as the input file is read. There is one structure
68# for each input byte. Say we are mapping a single byte encoding to a
69# single byte encoding, with  "ABCD" going "abcd". There will be
70# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
71
72# &process then walks the tree, building aggregate "do_now" structures for
73# adjacent bytes where possible. The aggregate is for a contiguous range of
74# bytes which each produce the same length of output, each move to the
75# same next state, and each have the same fallback flag.
76# So our 4 RAW "do_now"s above become replaced by a single structure
77# containing:
78# ["A", "D", "abcd", 1, ...]
79# ie, for an input byte $_ in "A".."D", output 1 byte, found as
80# substr ("abcd", (ord $_ - ord "A") * 1, 1)
81# which maps very nicely into pointer arithmetic in C for encengine.c
82
83sub encode_U
84{
85 # UTF-8 encode long hand - only covers part of perl's range
86 ## my $uv = shift;
87 # chr() works in native space so convert value from table
88 # into that space before using chr().
89 my $ch = chr(utf8::unicode_to_native($_[0]));
90 # Now get core perl to encode that the way it likes.
91 utf8::encode($ch);
92 return $ch;
93}
94
95sub encode_S
96{
97 # encode single byte
98 ## my ($ch,$page) = @_; return chr($ch);
99 return chr $_[0];
100}
101
102sub encode_D
103{
104 # encode double byte MS byte first
105 ## my ($ch,$page) = @_; return chr($page).chr($ch);
106 return chr ($_[1]) . chr $_[0];
107}
108
109sub encode_M
110{
111 # encode Multi-byte - single for 0..255 otherwise double
112 ## my ($ch,$page) = @_;
113 ## return &encode_D if $page;
114 ## return &encode_S;
115 return chr ($_[1]) . chr $_[0] if $_[1];
116 return chr $_[0];
117}
118
119my %encode_types = (U => \&encode_U,
120                    S => \&encode_S,
121                    D => \&encode_D,
122                    M => \&encode_M,
123                   );
124
125# Win32 does not expand globs on command line
126if ($^O eq 'MSWin32' and !$ENV{PERL_CORE}) {
127    eval "\@ARGV = map(glob(\$_),\@ARGV)";
128    @ARGV = @orig_ARGV unless @ARGV;
129}
130
131my %opt;
132# I think these are:
133# -Q to disable the duplicate codepoint test
134# -S make mapping errors fatal
135# -q to remove comments written to output files
136# -O to enable the (brute force) substring optimiser
137# -o <output> to specify the output file name (else it's the first arg)
138# -f <inlist> to give a file with a list of input files (else use the args)
139# -n <name> to name the encoding (else use the basename of the input file.
140#Getopt::Long::Configure("bundling");
141#GetOptions(\%opt, qw(C M=s S Q q O o=s f=s n=s v));
142getopts('CM:SQqOo:f:n:v',\%opt);
143
144$opt{M} and make_makefile_pl($opt{M}, @ARGV);
145$opt{C} and make_configlocal_pm($opt{C}, @ARGV);
146$opt{v} ||= $ENV{ENC2XS_VERBOSE};
147$opt{q} ||= $ENV{ENC2XS_NO_COMMENTS};
148
149sub verbose {
150    print STDERR @_ if $opt{v};
151}
152sub verbosef {
153    printf STDERR @_ if $opt{v};
154}
155
156
157# ($cpp, $static, $sized) = compiler_info($declaration)
158#
159# return some information about the compiler and compile options we're using:
160#
161#   $declaration - true if we're doing a declaration rather than a definition.
162#
163#   $cpp    - we're using C++
164#   $static - ok to declare the arrays as static
165#   $sized  - the array declarations should be sized
166
167sub compiler_info {
168    my ($declaration) = @_;
169
170    my $ccflags = $Config{ccflags};
171    if (defined $Config{ccwarnflags}) {
172        $ccflags .= " " . $Config{ccwarnflags};
173    }
174    my $compat   = $ccflags =~ /\Q-Wc++-compat/;
175    my $pedantic = $ccflags =~ /-pedantic/;
176
177    my $cpp      = ($Config{d_cplusplus} || '') eq 'define';
178
179    # The encpage_t tables contain recursive and mutually recursive
180    # references. To allow them to compile under C++ and some restrictive
181    # cc options, it may be necessary to make the tables non-static/const
182    # (thus moving them from the text to the data segment) and/or not
183    # include the size in the declaration.
184
185    my $static = !(
186                        $cpp
187                     || ($compat && $pedantic)
188                     || ($^O eq 'MacOS' && $declaration)
189                  );
190
191    # -Wc++-compat on its own warns if the array declaration is sized.
192    # The easiest way to avoid this warning is simply not to include
193    # the size in the declaration.
194    # With -pedantic as well, the issue doesn't arise because $static
195    # above becomes false.
196    my $sized  = $declaration && !($compat && !$pedantic);
197
198    return ($cpp, $static, $sized);
199}
200
201
202# This really should go first, else the die here causes empty (non-erroneous)
203# output files to be written.
204my @encfiles;
205if (exists $opt{f}) {
206    # -F is followed by name of file containing list of filenames
207    my $flist = $opt{f};
208    open(FLIST,$flist) || die "Cannot open $flist:$!";
209    chomp(@encfiles = <FLIST>);
210    close(FLIST);
211} else {
212    @encfiles = @ARGV;
213}
214
215my $cname = $opt{o} ? $opt{o} : shift(@ARGV);
216unless ($cname) { #debuging a win32 nmake error-only. works via cmdline
217    print "\nARGV:";
218    print "$_ " for @ARGV;
219    print "\nopt:";
220    print "  $_ => ",defined $opt{$_}?$opt{$_}:"undef","\n" for keys %opt;
221}
222chmod(0666,$cname) if -f $cname && !-w $cname;
223open(C,">", $cname) || die "Cannot open $cname:$!";
224
225my $dname = $cname;
226my $hname = $cname;
227
228my ($doC,$doEnc,$doUcm,$doPet);
229
230if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
231 {
232  $doC = 1;
233  $dname =~ s/(\.[^\.]*)?$/.exh/;
234  chmod(0666,$dname) if -f $cname && !-w $dname;
235  open(D,">", $dname) || die "Cannot open $dname:$!";
236  $hname =~ s/(\.[^\.]*)?$/.h/;
237  chmod(0666,$hname) if -f $cname && !-w $hname;
238  open(H,">", $hname) || die "Cannot open $hname:$!";
239
240  foreach my $fh (\*C,\*D,\*H)
241  {
242   print $fh <<"END" unless $opt{'q'};
243/*
244 !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
245 This file was autogenerated by:
246 $^X $0 @orig_ARGV
247 enc2xs VERSION $VERSION
248*/
249END
250  }
251
252  if ($cname =~ /\.c$/i && $Config{ccname} eq "gcc")
253   {
254    print C qq(#pragma GCC diagnostic ignored "-Wc++-compat"\n);
255   }
256
257  if ($cname =~ /\.xs$/i)
258   {
259    print C "#define PERL_NO_GET_CONTEXT\n";
260    print C "#include <EXTERN.h>\n";
261    print C "#include <perl.h>\n";
262    print C "#include <XSUB.h>\n";
263   }
264  print C "#include \"encode.h\"\n\n";
265
266 }
267elsif ($cname =~ /\.enc$/i)
268 {
269  $doEnc = 1;
270 }
271elsif ($cname =~ /\.ucm$/i)
272 {
273  $doUcm = 1;
274 }
275elsif ($cname =~ /\.pet$/i)
276 {
277  $doPet = 1;
278 }
279
280my %encoding;
281my %strings;
282my $string_acc;
283my %strings_in_acc;
284
285my $saved = 0;
286my $subsave = 0;
287my $strings = 0;
288
289sub cmp_name
290{
291 if ($a =~ /^.*-(\d+)/)
292  {
293   my $an = $1;
294   if ($b =~ /^.*-(\d+)/)
295    {
296     my $r = $an <=> $1;
297     return $r if $r;
298    }
299  }
300 return $a cmp $b;
301}
302
303
304foreach my $enc (sort cmp_name @encfiles)
305 {
306  my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
307  $name = $opt{'n'} if exists $opt{'n'};
308  if (open(E,$enc))
309   {
310    if ($sfx eq 'enc')
311     {
312      compile_enc(\*E,lc($name));
313     }
314    else
315     {
316      compile_ucm(\*E,lc($name));
317     }
318   }
319  else
320   {
321    warn "Cannot open $enc for $name:$!";
322   }
323 }
324
325if ($doC)
326 {
327  verbose "Writing compiled form\n";
328  foreach my $name (sort cmp_name keys %encoding)
329   {
330    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
331    process($name.'_utf8',$e2u);
332    addstrings(\*C,$e2u);
333
334    process('utf8_'.$name,$u2e);
335    addstrings(\*C,$u2e);
336   }
337  outbigstring(\*C,"enctable");
338  foreach my $name (sort cmp_name keys %encoding)
339   {
340    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
341    outtable(\*C,$e2u, "enctable");
342    outtable(\*C,$u2e, "enctable");
343
344    # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
345   }
346  my ($cpp) = compiler_info(0);
347  my $ext  = $cpp ? 'extern "C"' : "extern";
348  my $exta = $cpp ? 'extern "C"' : "static";
349  my $extb = $cpp ? 'extern "C"' : "";
350  foreach my $enc (sort cmp_name keys %encoding)
351   {
352    # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
353    my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
354    #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
355    my $replen = 0;
356    $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
357    my $sym = "${enc}_encoding";
358    $sym =~ s/\W+/_/g;
359    my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen,
360        $min_el,$max_el);
361    print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n";
362    print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n";
363    print C "${extb} const encode_t $sym = \n";
364    # This is to make null encoding work -- dankogai
365    for (my $i = (scalar @info) - 1;  $i >= 0; --$i){
366    $info[$i] ||= 1;
367    }
368    # end of null tweak -- dankogai
369    print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n";
370   }
371
372  foreach my $enc (sort cmp_name keys %encoding)
373   {
374    my $sym = "${enc}_encoding";
375    $sym =~ s/\W+/_/g;
376    print H "${ext} encode_t $sym;\n";
377    print D " Encode_XSEncoding(aTHX_ &$sym);\n";
378   }
379
380  if ($cname =~ /(\w+)\.xs$/)
381   {
382    my $mod = $1;
383    print C <<'END';
384
385static void
386Encode_XSEncoding(pTHX_ encode_t *enc)
387{
388 dSP;
389 HV *stash = gv_stashpv("Encode::XS", TRUE);
390 SV *iv    = newSViv(PTR2IV(enc));
391 SV *sv    = sv_bless(newRV_noinc(iv),stash);
392 int i = 0;
393 /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's
394 constness, in the hope that perl won't mess with it. */
395 assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0);
396 SvFLAGS(iv) |= SVp_POK;
397 SvPVX(iv) = (char*) enc->name[0];
398 PUSHMARK(sp);
399 XPUSHs(sv);
400 while (enc->name[i])
401  {
402   const char *name = enc->name[i++];
403   XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
404  }
405 PUTBACK;
406 call_pv("Encode::define_encoding",G_DISCARD);
407 SvREFCNT_dec(sv);
408}
409
410END
411
412    print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
413    print C "BOOT:\n{\n";
414    print C "#include \"$dname\"\n";
415    print C "}\n";
416   }
417  # Close in void context is bad, m'kay
418  close(D) or warn "Error closing '$dname': $!";
419  close(H) or warn "Error closing '$hname': $!";
420
421  my $perc_saved    = $saved/($strings + $saved) * 100;
422  my $perc_subsaved = $subsave/($strings + $subsave) * 100;
423  verbosef "%d bytes in string tables\n",$strings;
424  verbosef "%d bytes (%.3g%%) saved spotting duplicates\n",
425    $saved, $perc_saved              if $saved;
426  verbosef "%d bytes (%.3g%%) saved using substrings\n",
427    $subsave, $perc_subsaved         if $subsave;
428 }
429elsif ($doEnc)
430 {
431  foreach my $name (sort cmp_name keys %encoding)
432   {
433    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
434    output_enc(\*C,$name,$e2u);
435   }
436 }
437elsif ($doUcm)
438 {
439  foreach my $name (sort cmp_name keys %encoding)
440   {
441    my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
442    output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
443   }
444 }
445
446# writing half meg files and then not checking to see if you just filled the
447# disk is bad, m'kay
448close(C) or die "Error closing '$cname': $!";
449
450# End of the main program.
451
452sub compile_ucm
453{
454 my ($fh,$name) = @_;
455 my $e2u = {};
456 my $u2e = {};
457 my $cs;
458 my %attr;
459 while (<$fh>)
460  {
461   s/#.*$//;
462   last if /^\s*CHARMAP\s*$/i;
463   if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
464    {
465     $attr{$1} = $2;
466    }
467  }
468 if (!defined($cs =  $attr{'code_set_name'}))
469  {
470   warn "No <code_set_name> in $name\n";
471  }
472 else
473  {
474   $name = $cs unless exists $opt{'n'};
475  }
476 my $erep;
477 my $urep;
478 my $max_el;
479 my $min_el;
480 if (exists $attr{'subchar'})
481  {
482   #my @byte;
483   #$attr{'subchar'} =~ /^\s*/cg;
484   #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
485   #$erep = join('',map(chr(hex($_)),@byte));
486   $erep = $attr{'subchar'};
487   $erep =~ s/^\s+//; $erep =~ s/\s+$//;
488  }
489 print "Reading $name ($cs)\n"
490   unless defined $ENV{MAKEFLAGS}
491      and $ENV{MAKEFLAGS} =~ /\b(s|silent|quiet)\b/;
492 my $nfb = 0;
493 my $hfb = 0;
494 while (<$fh>)
495  {
496   s/#.*$//;
497   last if /^\s*END\s+CHARMAP\s*$/i;
498   next if /^\s*$/;
499   my (@uni, @byte) = ();
500   my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
501       or die "Bad line: $_";
502   while ($uni =~  m/\G<([U0-9a-fA-F\+]+)>/g){
503       push @uni, map { substr($_, 1) } split(/\+/, $1);
504   }
505   while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
506       push @byte, $1;
507   }
508   if (@uni)
509    {
510     my $uch =  join('', map { encode_U(hex($_)) } @uni );
511     my $ech = join('',map(chr(hex($_)),@byte));
512     my $el  = length($ech);
513     $max_el = $el if (!defined($max_el) || $el > $max_el);
514     $min_el = $el if (!defined($min_el) || $el < $min_el);
515     if (length($fb))
516      {
517       $fb = substr($fb,1);
518       $hfb++;
519      }
520     else
521      {
522       $nfb++;
523       $fb = '0';
524      }
525     # $fb is fallback flag
526     # 0 - round trip safe
527     # 1 - fallback for unicode -> enc
528     # 2 - skip sub-char mapping
529     # 3 - fallback enc -> unicode
530     enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
531     enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
532    }
533   else
534    {
535     warn $_;
536    }
537  }
538 if ($nfb && $hfb)
539  {
540   die "$nfb entries without fallback, $hfb entries with\n";
541  }
542 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
543}
544
545
546
547sub compile_enc
548{
549 my ($fh,$name) = @_;
550 my $e2u = {};
551 my $u2e = {};
552
553 my $type;
554 while ($type = <$fh>)
555  {
556   last if $type !~ /^\s*#/;
557  }
558 chomp($type);
559 return if $type eq 'E';
560 # Do the hash lookup once, rather than once per function call. 4% speedup.
561 my $type_func = $encode_types{$type};
562 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
563 warn "$type encoded $name\n";
564 my $rep = '';
565 # Save a defined test by setting these to defined values.
566 my $min_el = ~0; # A very big integer
567 my $max_el = 0;  # Anything must be longer than 0
568 {
569  my $v = hex($def);
570  $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
571 }
572 my $errors;
573 my $seen;
574 # use -Q to silence the seen test. Makefile.PL uses this by default.
575 $seen = {} unless $opt{Q};
576 do
577  {
578   my $line = <$fh>;
579   chomp($line);
580   my $page = hex($line);
581   my $ch = 0;
582   my $i = 16;
583   do
584    {
585     # So why is it 1% faster to leave the my here?
586     my $line = <$fh>;
587     $line =~ s/\r\n$/\n/;
588     die "$.:${line}Line should be exactly 65 characters long including
589     newline (".length($line).")" unless length ($line) == 65;
590     # Split line into groups of 4 hex digits, convert groups to ints
591     # This takes 65.35
592     # map {hex $_} $line =~ /(....)/g
593     # This takes 63.75 (2.5% less time)
594     # unpack "n*", pack "H*", $line
595     # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
596     # Doing it as while ($line =~ /(....)/g) took 74.63
597     foreach my $val (unpack "n*", pack "H*", $line)
598      {
599       next if $val == 0xFFFD;
600       my $ech = &$type_func($ch,$page);
601       if ($val || (!$ch && !$page))
602        {
603         my $el  = length($ech);
604         $max_el = $el if $el > $max_el;
605         $min_el = $el if $el < $min_el;
606         my $uch = encode_U($val);
607         if ($seen) {
608           # We're doing the test.
609           # We don't need to read this quickly, so storing it as a scalar,
610           # rather than 3 (anon array, plus the 2 scalars it holds) saves
611           # RAM and may make us faster on low RAM systems. [see __END__]
612           if (exists $seen->{$uch})
613             {
614               warn sprintf("U%04X is %02X%02X and %04X\n",
615                            $val,$page,$ch,$seen->{$uch});
616               $errors++;
617             }
618           else
619             {
620               $seen->{$uch} = $page << 8 | $ch;
621             }
622         }
623         # Passing 2 extra args each time is 3.6% slower!
624         # Even with having to add $fallback ||= 0 later
625         enter_fb0($e2u,$ech,$uch);
626         enter_fb0($u2e,$uch,$ech);
627        }
628       else
629        {
630         # No character at this position
631         # enter($e2u,$ech,undef,$e2u);
632        }
633       $ch++;
634      }
635    } while --$i;
636  } while --$pages;
637 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
638   if $min_el > $max_el;
639 die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
640 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
641}
642
643# my ($a,$s,$d,$t,$fb) = @_;
644sub enter {
645  my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
646  # state we shift to after this (multibyte) input character defaults to same
647  # as current state.
648  $next ||= $current;
649  # Making sure it is defined seems to be faster than {no warnings;} in
650  # &process, or passing it in as 0 explicitly.
651  # XXX $fallback ||= 0;
652
653  # Start at the beginning and work forwards through the string to zero.
654  # effectively we are removing 1 character from the front each time
655  # but we don't actually edit the string. [this alone seems to be 14% speedup]
656  # Hence -$pos is the length of the remaining string.
657  my $pos = -length $inbytes;
658  while (1) {
659    my $byte = substr $inbytes, $pos, 1;
660    #  RAW_NEXT => 0,
661    #  RAW_IN_LEN => 1,
662    #  RAW_OUT_BYTES => 2,
663    #  RAW_FALLBACK => 3,
664    # to unicode an array would seem to be better, because the pages are dense.
665    # from unicode can be very sparse, favouring a hash.
666    # hash using the bytes (all length 1) as keys rather than ord value,
667    # as it's easier to sort these in &process.
668
669    # It's faster to always add $fallback even if it's undef, rather than
670    # choosing between 3 and 4 element array. (hence why we set it defined
671    # above)
672    my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
673    # When $pos was -1 we were at the last input character.
674    unless (++$pos) {
675      $do_now->[RAW_OUT_BYTES] = $outbytes;
676      $do_now->[RAW_NEXT] = $next;
677      return;
678    }
679    # Tail recursion. The intermediate state may not have a name yet.
680    $current = $do_now->[RAW_NEXT];
681  }
682}
683
684# This is purely for optimisation. It's just &enter hard coded for $fallback
685# of 0, using only a 3 entry array ref to save memory for every entry.
686sub enter_fb0 {
687  my ($current,$inbytes,$outbytes,$next) = @_;
688  $next ||= $current;
689
690  my $pos = -length $inbytes;
691  while (1) {
692    my $byte = substr $inbytes, $pos, 1;
693    my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
694    unless (++$pos) {
695      $do_now->[RAW_OUT_BYTES] = $outbytes;
696      $do_now->[RAW_NEXT] = $next;
697      return;
698    }
699    $current = $do_now->[RAW_NEXT];
700  }
701}
702
703sub process
704{
705  my ($name,$a) = @_;
706  $name =~ s/\W+/_/g;
707  $a->{Cname} = $name;
708  my $raw = $a->{Raw};
709  my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
710  my @ent;
711  $agg_max_in = 0;
712  foreach my $key (sort keys %$raw) {
713    #  RAW_NEXT => 0,
714    #  RAW_IN_LEN => 1,
715    #  RAW_OUT_BYTES => 2,
716    #  RAW_FALLBACK => 3,
717    my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
718    # Now we are converting from raw to aggregate, switch from 1 byte strings
719    # to numbers
720    my $b = ord $key;
721    $fallback ||= 0;
722    if ($l &&
723        # If this == fails, we're going to reset $agg_max_in below anyway.
724        $b == ++$agg_max_in &&
725        # References in numeric context give the pointer as an int.
726        $agg_next == $next &&
727        $agg_in_len == $in_len &&
728        $agg_out_len == length $out_bytes &&
729        $agg_fallback == $fallback
730        # && length($l->[AGG_OUT_BYTES]) < 16
731       ) {
732      #     my $i = ord($b)-ord($l->[AGG_MIN_IN]);
733      # we can aggregate this byte onto the end.
734      $l->[AGG_MAX_IN] = $b;
735      $l->[AGG_OUT_BYTES] .= $out_bytes;
736    } else {
737      # AGG_MIN_IN => 0,
738      # AGG_MAX_IN => 1,
739      # AGG_OUT_BYTES => 2,
740      # AGG_NEXT => 3,
741      # AGG_IN_LEN => 4,
742      # AGG_OUT_LEN => 5,
743      # AGG_FALLBACK => 6,
744      # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
745      # (only gains .6% on euc-jp  -- is it worth it?)
746      push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
747                       $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
748                       $agg_fallback = $fallback];
749    }
750    if (exists $next->{Cname}) {
751      $next->{'Forward'} = 1 if $next != $a;
752    } else {
753      process(sprintf("%s_%02x",$name,$b),$next);
754    }
755  }
756  # encengine.c rules say that last entry must be for 255
757  if ($agg_max_in < 255) {
758    push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
759  }
760  $a->{'Entries'} = \@ent;
761}
762
763
764sub addstrings
765{
766 my ($fh,$a) = @_;
767 my $name = $a->{'Cname'};
768 # String tables
769 foreach my $b (@{$a->{'Entries'}})
770  {
771   next unless $b->[AGG_OUT_LEN];
772   $strings{$b->[AGG_OUT_BYTES]} = undef;
773  }
774 if ($a->{'Forward'})
775  {
776   my ($cpp, $static, $sized) = compiler_info(1);
777   my $count = $sized ? scalar(@{$a->{'Entries'}}) : '';
778   if ($static) {
779     # we cannot ask Config for d_plusplus since we can override CC=g++-6 on the cmdline
780     print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6
781     print $fh "extern encpage_t $name\[$count];\n";
782     print $fh "#else\n";
783     print $fh "static const encpage_t $name\[$count];\n";
784     print $fh "#endif\n";
785   } else {
786     print $fh "extern encpage_t $name\[$count];\n";
787   }
788  }
789 $a->{'DoneStrings'} = 1;
790 foreach my $b (@{$a->{'Entries'}})
791  {
792   my ($s,$e,$out,$t,$end,$l) = @$b;
793   addstrings($fh,$t) unless $t->{'DoneStrings'};
794  }
795}
796
797sub outbigstring
798{
799  my ($fh,$name) = @_;
800
801  $string_acc = '';
802
803  # Make the big string in the string accumulator. Longest first, on the hope
804  # that this makes it more likely that we find the short strings later on.
805  # Not sure if it helps sorting strings of the same length lexically.
806  foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
807    my $index = index $string_acc, $s;
808    if ($index >= 0) {
809      $saved += length($s);
810      $strings_in_acc{$s} = $index;
811    } else {
812    OPTIMISER: {
813    if ($opt{'O'}) {
814      my $sublength = length $s;
815      while (--$sublength > 0) {
816        # progressively lop characters off the end, to see if the start of
817        # the new string overlaps the end of the accumulator.
818        if (substr ($string_acc, -$sublength)
819        eq substr ($s, 0, $sublength)) {
820          $subsave += $sublength;
821          $strings_in_acc{$s} = length ($string_acc) - $sublength;
822          # append the last bit on the end.
823          $string_acc .= substr ($s, $sublength);
824          last OPTIMISER;
825        }
826        # or if the end of the new string overlaps the start of the
827        # accumulator
828        next unless substr ($string_acc, 0, $sublength)
829          eq substr ($s, -$sublength);
830        # well, the last $sublength characters of the accumulator match.
831        # so as we're prepending to the accumulator, need to shift all our
832        # existing offsets forwards
833        $_ += $sublength foreach values %strings_in_acc;
834        $subsave += $sublength;
835        $strings_in_acc{$s} = 0;
836        # append the first bit on the start.
837        $string_acc = substr ($s, 0, -$sublength) . $string_acc;
838        last OPTIMISER;
839      }
840    }
841    # Optimiser (if it ran) found nothing, so just going have to tack the
842    # whole thing on the end.
843    $strings_in_acc{$s} = length $string_acc;
844    $string_acc .= $s;
845      };
846    }
847  }
848
849  $strings = length $string_acc;
850  my ($cpp) = compiler_info(0);
851  my $var = $cpp ? '' : 'static';
852  my $definition = "\n$var const U8 $name\[$strings] = { " .
853    join(',',unpack "C*",$string_acc);
854  # We have a single long line. Split it at convenient commas.
855  print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
856  print $fh substr ($definition, pos $definition), " };\n";
857}
858
859sub findstring {
860  my ($name,$s) = @_;
861  my $offset = $strings_in_acc{$s};
862  die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
863    unless defined $offset;
864  "$name + $offset";
865}
866
867sub outtable
868{
869 my ($fh,$a,$bigname) = @_;
870 my $name = $a->{'Cname'};
871 $a->{'Done'} = 1;
872 foreach my $b (@{$a->{'Entries'}})
873  {
874   my ($s,$e,$out,$t,$end,$l) = @$b;
875   outtable($fh,$t,$bigname) unless $t->{'Done'};
876  }
877 my ($cpp, $static) = compiler_info(0);
878 my $count = scalar(@{$a->{'Entries'}});
879 if ($static) {
880     print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6
881     print $fh "encpage_t $name\[$count] = {\n";
882     print $fh "#else\n";
883     print $fh "static const encpage_t $name\[$count] = {\n";
884     print $fh "#endif\n";
885 } else {
886   print $fh "\nencpage_t $name\[$count] = {\n";
887 }
888 foreach my $b (@{$a->{'Entries'}})
889  {
890   my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
891   # $end |= 0x80 if $fb; # what the heck was on your mind, Nick?  -- Dan
892   print  $fh "{";
893   if ($l)
894    {
895     printf $fh findstring($bigname,$out);
896    }
897   else
898    {
899     print  $fh "0";
900    }
901   print  $fh ",",$t->{Cname};
902   printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
903  }
904 print $fh "};\n";
905}
906
907sub output_enc
908{
909 my ($fh,$name,$a) = @_;
910 die "Changed - fix me for new structure";
911 foreach my $b (sort keys %$a)
912  {
913   my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
914  }
915}
916
917sub decode_U
918{
919 my $s = shift;
920}
921
922my @uname;
923sub char_names{} # cf. https://rt.cpan.org/Ticket/Display.html?id=132471
924
925sub output_ucm_page
926{
927  my ($cmap,$a,$t,$pre) = @_;
928  # warn sprintf("Page %x\n",$pre);
929  my $raw = $t->{Raw};
930  foreach my $key (sort keys %$raw) {
931    #  RAW_NEXT => 0,
932    #  RAW_IN_LEN => 1,
933    #  RAW_OUT_BYTES => 2,
934    #  RAW_FALLBACK => 3,
935    my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
936    my $u = ord $key;
937    $fallback ||= 0;
938
939    if ($next != $a && $next != $t) {
940      output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
941    } elsif (length $out_bytes) {
942      if ($pre) {
943        $u = $pre|($u &0x3f);
944      }
945      my $s = sprintf "<U%04X> ",$u;
946      #foreach my $c (split(//,$out_bytes)) {
947      #  $s .= sprintf "\\x%02X",ord($c);
948      #}
949      # 9.5% faster changing that loop to this:
950      $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
951      $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
952      push(@$cmap,$s);
953    } else {
954      warn join(',',$u, @{$raw->{$key}},$a,$t);
955    }
956  }
957}
958
959sub output_ucm
960{
961 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
962 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
963 print $fh "<code_set_name> \"$name\"\n";
964 char_names();
965 if (defined $min_el)
966  {
967   print $fh "<mb_cur_min> $min_el\n";
968  }
969 if (defined $max_el)
970  {
971   print $fh "<mb_cur_max> $max_el\n";
972  }
973 if (defined $rep)
974  {
975   print $fh "<subchar> ";
976   foreach my $c (split(//,$rep))
977    {
978     printf $fh "\\x%02X",ord($c);
979    }
980   print $fh "\n";
981  }
982 my @cmap;
983 output_ucm_page(\@cmap,$h,$h,0);
984 print $fh "#\nCHARMAP\n";
985 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
986  {
987   print $fh $line;
988  }
989 print $fh "END CHARMAP\n";
990}
991
992use vars qw(
993    $_Enc2xs
994    $_Version
995    $_Inc
996    $_E2X
997    $_Name
998    $_TableFiles
999    $_Now
1000);
1001
1002sub find_e2x{
1003    eval { require File::Find; };
1004    my (@inc, %e2x_dir);
1005    for my $inc (@INC){
1006    push @inc, $inc unless $inc eq '.'; #skip current dir
1007    }
1008    File::Find::find(
1009         sub {
1010         my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
1011             $atime,$mtime,$ctime,$blksize,$blocks)
1012             = lstat($_) or return;
1013         -f _ or return;
1014         if (/^.*\.e2x$/o){
1015             no warnings 'once';
1016             $e2x_dir{$File::Find::dir} ||= $mtime;
1017         }
1018         return;
1019         }, @inc);
1020    warn join("\n", keys %e2x_dir), "\n";
1021    for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
1022    $_E2X = $d;
1023    # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
1024    return $_E2X;
1025    }
1026}
1027
1028sub make_makefile_pl
1029{
1030    eval { require Encode } or die "You need to install Encode to use enc2xs -M\nerror: $@\n";
1031    # our used for variable expansion
1032    $_Enc2xs = $0;
1033    $_Version = $VERSION;
1034    $_E2X = find_e2x();
1035    $_Name = shift;
1036    $_TableFiles = join(",", map {qq('$_')} @_);
1037    $_Now = scalar localtime();
1038
1039    eval { require File::Spec; };
1040    _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
1041    _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"),        "$_Name.pm");
1042    _print_expand(File::Spec->catfile($_E2X,"_T.e2x"),         "t/$_Name.t");
1043    _print_expand(File::Spec->catfile($_E2X,"README.e2x"),     "README");
1044    _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"),    "Changes");
1045    exit;
1046}
1047
1048use vars qw(
1049        $_ModLines
1050        $_LocalVer
1051        );
1052
1053sub make_configlocal_pm {
1054    eval { require Encode } or die "Unable to require Encode: $@\n";
1055    eval { require File::Spec; };
1056
1057    # our used for variable expantion
1058    my %in_core = map { $_ => 1 } (
1059        'ascii',      'iso-8859-1', 'utf8',
1060        'ascii-ctrl', 'null',       'utf-8-strict'
1061    );
1062    my %LocalMod = ();
1063    # check @enc;
1064    use File::Find ();
1065    my $wanted = sub{
1066	-f $_ or return;
1067	$File::Find::name =~ /\A\./        and return;
1068	$File::Find::name =~ /\.pm\z/      or  return;
1069	$File::Find::name =~ m/\bEncode\b/ or  return;
1070	my $mod = $File::Find::name;
1071	$mod =~ s/.*\bEncode\b/Encode/o;
1072	$mod =~ s/\.pm\z//o;
1073	$mod =~ s,/,::,og;
1074	eval qq{ require $mod; } or return;
1075        warn qq{ require $mod;\n};
1076	for my $enc ( Encode->encodings() ) {
1077	    no warnings;
1078	    $in_core{$enc}                   and next;
1079	    $Encode::Config::ExtModule{$enc} and next;
1080	    $LocalMod{$enc} ||= $mod;
1081	}
1082    };
1083    File::Find::find({wanted => $wanted}, @INC);
1084    $_ModLines = "";
1085    for my $enc ( sort keys %LocalMod ) {
1086        $_ModLines .=
1087          qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n);
1088    }
1089    warn $_ModLines if $_ModLines;
1090    $_LocalVer = _mkversion();
1091    $_E2X      = find_e2x();
1092    $_Inc      = $INC{"Encode.pm"};
1093    $_Inc =~ s/\.pm$//o;
1094    _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ),
1095        File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 );
1096    exit;
1097}
1098
1099sub _mkversion{
1100    # v-string is now depreciated; use time() instead;
1101    #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
1102    #$yyyy += 1900, $mo +=1;
1103    #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
1104    return time();
1105}
1106
1107sub _print_expand{
1108    eval { require File::Basename } or die "File::Basename needed.  Are you on miniperl?;\nerror: $@\n";
1109    File::Basename->import();
1110    my ($src, $dst, $clobber) = @_;
1111    if (!$clobber and -e $dst){
1112    warn "$dst exists. skipping\n";
1113    return;
1114    }
1115    warn "Generating $dst...\n";
1116    open my $in, $src or die "$src : $!";
1117    if ((my $d = dirname($dst)) ne '.'){
1118    -d $d or mkdir $d, 0755 or die  "mkdir $d : $!";
1119    }
1120    open my $out, ">", $dst or die "$!";
1121    my $asis = 0;
1122    while (<$in>){
1123    if (/^#### END_OF_HEADER/){
1124        $asis = 1; next;
1125    }
1126    s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
1127    print $out $_;
1128    }
1129}
1130__END__
1131
1132=head1 NAME
1133
1134enc2xs -- Perl Encode Module Generator
1135
1136=head1 SYNOPSIS
1137
1138  enc2xs -[options]
1139  enc2xs -M ModName mapfiles...
1140  enc2xs -C
1141
1142=head1 DESCRIPTION
1143
1144F<enc2xs> builds a Perl extension for use by Encode from either
1145Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
1146Besides being used internally during the build process of the Encode
1147module, you can use F<enc2xs> to add your own encoding to perl.
1148No knowledge of XS is necessary.
1149
1150=head1 Quick Guide
1151
1152If you want to know as little about Perl as possible but need to
1153add a new encoding, just read this chapter and forget the rest.
1154
1155=over 4
1156
1157=item 0.Z<>
1158
1159Have a .ucm file ready.  You can get it from somewhere or you can write
1160your own from scratch or you can grab one from the Encode distribution
1161and customize it.  For the UCM format, see the next Chapter.  In the
1162example below, I'll call my theoretical encoding myascii, defined
1163in I<my.ucm>.  C<$> is a shell prompt.
1164
1165  $ ls -F
1166  my.ucm
1167
1168=item 1.Z<>
1169
1170Issue a command as follows;
1171
1172  $ enc2xs -M My my.ucm
1173  generating Makefile.PL
1174  generating My.pm
1175  generating README
1176  generating Changes
1177
1178Now take a look at your current directory.  It should look like this.
1179
1180  $ ls -F
1181  Makefile.PL   My.pm         my.ucm        t/
1182
1183The following files were created.
1184
1185  Makefile.PL - MakeMaker script
1186  My.pm       - Encode submodule
1187  t/My.t      - test file
1188
1189=over 4
1190
1191=item 1.1.Z<>
1192
1193If you want *.ucm installed together with the modules, do as follows;
1194
1195  $ mkdir Encode
1196  $ mv *.ucm Encode
1197  $ enc2xs -M My Encode/*ucm
1198
1199=back
1200
1201=item 2.Z<>
1202
1203Edit the files generated.  You don't have to if you have no time AND no
1204intention to give it to someone else.  But it is a good idea to edit
1205the pod and to add more tests.
1206
1207=item 3.Z<>
1208
1209Now issue a command all Perl Mongers love:
1210
1211  $ perl Makefile.PL
1212  Writing Makefile for Encode::My
1213
1214=item 4.Z<>
1215
1216Now all you have to do is make.
1217
1218  $ make
1219  cp My.pm blib/lib/Encode/My.pm
1220  /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
1221    -o encode_t.c -f encode_t.fnm
1222  Reading myascii (myascii)
1223  Writing compiled form
1224  128 bytes in string tables
1225  384 bytes (75%) saved spotting duplicates
1226  1 bytes (0.775%) saved using substrings
1227  ....
1228  chmod 644 blib/arch/auto/Encode/My/My.bs
1229  $
1230
1231The time it takes varies depending on how fast your machine is and
1232how large your encoding is.  Unless you are working on something big
1233like euc-tw, it won't take too long.
1234
1235=item 5.Z<>
1236
1237You can "make install" already but you should test first.
1238
1239  $ make test
1240  PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
1241    -e 'use Test::Harness  qw(&runtests $verbose); \
1242    $verbose=0; runtests @ARGV;' t/*.t
1243  t/My....ok
1244  All tests successful.
1245  Files=1, Tests=2,  0 wallclock secs
1246   ( 0.09 cusr + 0.01 csys = 0.09 CPU)
1247
1248=item 6.Z<>
1249
1250If you are content with the test result, just "make install"
1251
1252=item 7.Z<>
1253
1254If you want to add your encoding to Encode's demand-loading list
1255(so you don't have to "use Encode::YourEncoding"), run
1256
1257  enc2xs -C
1258
1259to update Encode::ConfigLocal, a module that controls local settings.
1260After that, "use Encode;" is enough to load your encodings on demand.
1261
1262=back
1263
1264=head1 The Unicode Character Map
1265
1266Encode uses the Unicode Character Map (UCM) format for source character
1267mappings.  This format is used by IBM's ICU package and was adopted
1268by Nick Ing-Simmons for use with the Encode module.  Since UCM is
1269more flexible than Tcl's Encoding Map and far more user-friendly,
1270this is the recommended format for Encode now.
1271
1272A UCM file looks like this.
1273
1274  #
1275  # Comments
1276  #
1277  <code_set_name> "US-ascii" # Required
1278  <code_set_alias> "ascii"   # Optional
1279  <mb_cur_min> 1             # Required; usually 1
1280  <mb_cur_max> 1             # Max. # of bytes/char
1281  <subchar> \x3F             # Substitution char
1282  #
1283  CHARMAP
1284  <U0000> \x00 |0 # <control>
1285  <U0001> \x01 |0 # <control>
1286  <U0002> \x02 |0 # <control>
1287  ....
1288  <U007C> \x7C |0 # VERTICAL LINE
1289  <U007D> \x7D |0 # RIGHT CURLY BRACKET
1290  <U007E> \x7E |0 # TILDE
1291  <U007F> \x7F |0 # <control>
1292  END CHARMAP
1293
1294=over 4
1295
1296=item *
1297
1298Anything that follows C<#> is treated as a comment.
1299
1300=item *
1301
1302The header section continues until a line containing the word
1303CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
1304pair per line.  Strings used as values must be quoted. Barewords are
1305treated as numbers.  I<\xXX> represents a byte.
1306
1307Most of the keywords are self-explanatory. I<subchar> means
1308substitution character, not subcharacter.  When you decode a Unicode
1309sequence to this encoding but no matching character is found, the byte
1310sequence defined here will be used.  For most cases, the value here is
1311\x3F; in ASCII, this is a question mark.
1312
1313=item *
1314
1315CHARMAP starts the character map section.  Each line has a form as
1316follows:
1317
1318  <UXXXX> \xXX.. |0 # comment
1319    ^     ^      ^
1320    |     |      +- Fallback flag
1321    |     +-------- Encoded byte sequence
1322    +-------------- Unicode Character ID in hex
1323
1324The format is roughly the same as a header section except for the
1325fallback flag: | followed by 0..3.   The meaning of the possible
1326values is as follows:
1327
1328=over 4
1329
1330=item |0
1331
1332Round trip safe.  A character decoded to Unicode encodes back to the
1333same byte sequence.  Most characters have this flag.
1334
1335=item |1
1336
1337Fallback for unicode -> encoding.  When seen, enc2xs adds this
1338character for the encode map only.
1339
1340=item |2
1341
1342Skip sub-char mapping should there be no code point.
1343
1344=item |3
1345
1346Fallback for encoding -> unicode.  When seen, enc2xs adds this
1347character for the decode map only.
1348
1349=back
1350
1351=item *
1352
1353And finally, END OF CHARMAP ends the section.
1354
1355=back
1356
1357When you are manually creating a UCM file, you should copy ascii.ucm
1358or an existing encoding which is close to yours, rather than write
1359your own from scratch.
1360
1361When you do so, make sure you leave at least B<U0000> to B<U0020> as
1362is, unless your environment is EBCDIC.
1363
1364B<CAVEAT>: not all features in UCM are implemented.  For example,
1365icu:state is not used.  Because of that, you need to write a perl
1366module if you want to support algorithmical encodings, notably
1367the ISO-2022 series.  Such modules include L<Encode::JP::2022_JP>,
1368L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
1369
1370=head2 Coping with duplicate mappings
1371
1372When you create a map, you SHOULD make your mappings round-trip safe.
1373That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
1374$data> stands for all characters that are marked as C<|0>.  Here is
1375how to make sure:
1376
1377=over 4
1378
1379=item *
1380
1381Sort your map in Unicode order.
1382
1383=item *
1384
1385When you have a duplicate entry, mark either one with '|1' or '|3'.
1386
1387=item *
1388
1389And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
1390
1391=back
1392
1393Here is an example from big5-eten.
1394
1395  <U2550> \xF9\xF9 |0
1396  <U2550> \xA2\xA4 |3
1397
1398Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
1399this;
1400
1401  E to U               U to E
1402  --------------------------------------
1403  \xF9\xF9 => U2550    U2550 => \xF9\xF9
1404  \xA2\xA4 => U2550
1405
1406So it is round-trip safe for \xF9\xF9.  But if the line above is upside
1407down, here is what happens.
1408
1409  E to U               U to E
1410  --------------------------------------
1411  \xA2\xA4 => U2550    U2550 => \xF9\xF9
1412  (\xF9\xF9 => U2550 is now overwritten!)
1413
1414The Encode package comes with F<ucmlint>, a crude but sufficient
1415utility to check the integrity of a UCM file.  Check under the
1416Encode/bin directory for this.
1417
1418When in doubt, you can use F<ucmsort>, yet another utility under
1419Encode/bin directory.
1420
1421=head1 Bookmarks
1422
1423=over 4
1424
1425=item *
1426
1427ICU Home Page
1428L<http://www.icu-project.org/>
1429
1430=item *
1431
1432ICU Character Mapping Tables
1433L<http://site.icu-project.org/charts/charset>
1434
1435=item *
1436
1437ICU:Conversion Data
1438L<http://www.icu-project.org/userguide/conversion-data.html>
1439
1440=back
1441
1442=head1 SEE ALSO
1443
1444L<Encode>,
1445L<perlmod>,
1446L<perlpod>
1447
1448=cut
1449
1450# -Q to disable the duplicate codepoint test
1451# -S make mapping errors fatal
1452# -q to remove comments written to output files
1453# -O to enable the (brute force) substring optimiser
1454# -o <output> to specify the output file name (else it's the first arg)
1455# -f <inlist> to give a file with a list of input files (else use the args)
1456# -n <name> to name the encoding (else use the basename of the input file.
1457
1458With %seen holding array refs:
1459
1460      865.66 real        28.80 user         8.79 sys
1461      7904  maximum resident set size
1462      1356  average shared memory size
1463     18566  average unshared data size
1464       229  average unshared stack size
1465     46080  page reclaims
1466     33373  page faults
1467
1468With %seen holding simple scalars:
1469
1470      342.16 real        27.11 user         3.54 sys
1471      8388  maximum resident set size
1472      1394  average shared memory size
1473     14969  average unshared data size
1474       236  average unshared stack size
1475     28159  page reclaims
1476      9839  page faults
1477
1478Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
1479how %seen is storing things its seen. So it is pathalogically bad on a 16M
1480RAM machine, but it's going to help even on modern machines.
1481Swapping is bad, m'kay :-)
1482