1#!./perl 2BEGIN { 3 # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's 4 # with $ENV{PERL_CORE} set 5 # In case we need it in future... 6 require Config; import Config; 7 pop @INC if $INC[-1] eq '.'; 8} 9use strict; 10use warnings; 11use Getopt::Std; 12use Config; 13my @orig_ARGV = @ARGV; 14our $VERSION = do { my @r = (q$Revision: 2.24 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; 15 16# These may get re-ordered. 17# RAW is a do_now as inserted by &enter 18# AGG is an aggregated do_now, as built up by &process 19 20use constant { 21 RAW_NEXT => 0, 22 RAW_IN_LEN => 1, 23 RAW_OUT_BYTES => 2, 24 RAW_FALLBACK => 3, 25 26 AGG_MIN_IN => 0, 27 AGG_MAX_IN => 1, 28 AGG_OUT_BYTES => 2, 29 AGG_NEXT => 3, 30 AGG_IN_LEN => 4, 31 AGG_OUT_LEN => 5, 32 AGG_FALLBACK => 6, 33}; 34 35# (See the algorithm in encengine.c - we're building structures for it) 36 37# There are two sorts of structures. 38# "do_now" (an array, two variants of what needs storing) is whatever we need 39# to do now we've read an input byte. 40# It's housed in a "do_next" (which is how we got to it), and in turn points 41# to a "do_next" which contains all the "do_now"s for the next input byte. 42 43# There will be a "do_next" which is the start state. 44# For a single byte encoding it's the only "do_next" - each "do_now" points 45# back to it, and each "do_now" will cause bytes. There is no state. 46 47# For a multi-byte encoding where all characters in the input are the same 48# length, then there will be a tree of "do_now"->"do_next"->"do_now" 49# branching out from the start state, one step for each input byte. 50# The leaf "do_now"s will all be at the same distance from the start state, 51# only the leaf "do_now"s cause output bytes, and they in turn point back to 52# the start state. 53 54# For an encoding where there are variable length input byte sequences, you 55# will encounter a leaf "do_now" sooner for the shorter input sequences, but 56# as before the leaves will point back to the start state. 57 58# The system will cope with escape encodings (imagine them as a mostly 59# self-contained tree for each escape state, and cross links between trees 60# at the state-switching characters) but so far no input format defines these. 61 62# The system will also cope with having output "leaves" in the middle of 63# the bifurcating branches, not just at the extremities, but again no 64# input format does this yet. 65 66# There are two variants of the "do_now" structure. The first, smaller variant 67# is generated by &enter as the input file is read. There is one structure 68# for each input byte. Say we are mapping a single byte encoding to a 69# single byte encoding, with "ABCD" going "abcd". There will be 70# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...} 71 72# &process then walks the tree, building aggregate "do_now" structures for 73# adjacent bytes where possible. The aggregate is for a contiguous range of 74# bytes which each produce the same length of output, each move to the 75# same next state, and each have the same fallback flag. 76# So our 4 RAW "do_now"s above become replaced by a single structure 77# containing: 78# ["A", "D", "abcd", 1, ...] 79# ie, for an input byte $_ in "A".."D", output 1 byte, found as 80# substr ("abcd", (ord $_ - ord "A") * 1, 1) 81# which maps very nicely into pointer arithmetic in C for encengine.c 82 83sub encode_U 84{ 85 # UTF-8 encode long hand - only covers part of perl's range 86 ## my $uv = shift; 87 # chr() works in native space so convert value from table 88 # into that space before using chr(). 89 my $ch = chr(utf8::unicode_to_native($_[0])); 90 # Now get core perl to encode that the way it likes. 91 utf8::encode($ch); 92 return $ch; 93} 94 95sub encode_S 96{ 97 # encode single byte 98 ## my ($ch,$page) = @_; return chr($ch); 99 return chr $_[0]; 100} 101 102sub encode_D 103{ 104 # encode double byte MS byte first 105 ## my ($ch,$page) = @_; return chr($page).chr($ch); 106 return chr ($_[1]) . chr $_[0]; 107} 108 109sub encode_M 110{ 111 # encode Multi-byte - single for 0..255 otherwise double 112 ## my ($ch,$page) = @_; 113 ## return &encode_D if $page; 114 ## return &encode_S; 115 return chr ($_[1]) . chr $_[0] if $_[1]; 116 return chr $_[0]; 117} 118 119my %encode_types = (U => \&encode_U, 120 S => \&encode_S, 121 D => \&encode_D, 122 M => \&encode_M, 123 ); 124 125# Win32 does not expand globs on command line 126if ($^O eq 'MSWin32' and !$ENV{PERL_CORE}) { 127 eval "\@ARGV = map(glob(\$_),\@ARGV)"; 128 @ARGV = @orig_ARGV unless @ARGV; 129} 130 131my %opt; 132# I think these are: 133# -Q to disable the duplicate codepoint test 134# -S make mapping errors fatal 135# -q to remove comments written to output files 136# -O to enable the (brute force) substring optimiser 137# -o <output> to specify the output file name (else it's the first arg) 138# -f <inlist> to give a file with a list of input files (else use the args) 139# -n <name> to name the encoding (else use the basename of the input file. 140#Getopt::Long::Configure("bundling"); 141#GetOptions(\%opt, qw(C M=s S Q q O o=s f=s n=s v)); 142getopts('CM:SQqOo:f:n:v',\%opt); 143 144$opt{M} and make_makefile_pl($opt{M}, @ARGV); 145$opt{C} and make_configlocal_pm($opt{C}, @ARGV); 146$opt{v} ||= $ENV{ENC2XS_VERBOSE}; 147$opt{q} ||= $ENV{ENC2XS_NO_COMMENTS}; 148 149sub verbose { 150 print STDERR @_ if $opt{v}; 151} 152sub verbosef { 153 printf STDERR @_ if $opt{v}; 154} 155 156 157# ($cpp, $static, $sized) = compiler_info($declaration) 158# 159# return some information about the compiler and compile options we're using: 160# 161# $declaration - true if we're doing a declaration rather than a definition. 162# 163# $cpp - we're using C++ 164# $static - ok to declare the arrays as static 165# $sized - the array declarations should be sized 166 167sub compiler_info { 168 my ($declaration) = @_; 169 170 my $ccflags = $Config{ccflags}; 171 if (defined $Config{ccwarnflags}) { 172 $ccflags .= " " . $Config{ccwarnflags}; 173 } 174 my $compat = $ccflags =~ /\Q-Wc++-compat/; 175 my $pedantic = $ccflags =~ /-pedantic/; 176 177 my $cpp = ($Config{d_cplusplus} || '') eq 'define'; 178 179 # The encpage_t tables contain recursive and mutually recursive 180 # references. To allow them to compile under C++ and some restrictive 181 # cc options, it may be necessary to make the tables non-static/const 182 # (thus moving them from the text to the data segment) and/or not 183 # include the size in the declaration. 184 185 my $static = !( 186 $cpp 187 || ($compat && $pedantic) 188 || ($^O eq 'MacOS' && $declaration) 189 ); 190 191 # -Wc++-compat on its own warns if the array declaration is sized. 192 # The easiest way to avoid this warning is simply not to include 193 # the size in the declaration. 194 # With -pedantic as well, the issue doesn't arise because $static 195 # above becomes false. 196 my $sized = $declaration && !($compat && !$pedantic); 197 198 return ($cpp, $static, $sized); 199} 200 201 202# This really should go first, else the die here causes empty (non-erroneous) 203# output files to be written. 204my @encfiles; 205if (exists $opt{f}) { 206 # -F is followed by name of file containing list of filenames 207 my $flist = $opt{f}; 208 open(FLIST,$flist) || die "Cannot open $flist:$!"; 209 chomp(@encfiles = <FLIST>); 210 close(FLIST); 211} else { 212 @encfiles = @ARGV; 213} 214 215my $cname = $opt{o} ? $opt{o} : shift(@ARGV); 216unless ($cname) { #debuging a win32 nmake error-only. works via cmdline 217 print "\nARGV:"; 218 print "$_ " for @ARGV; 219 print "\nopt:"; 220 print " $_ => ",defined $opt{$_}?$opt{$_}:"undef","\n" for keys %opt; 221} 222chmod(0666,$cname) if -f $cname && !-w $cname; 223open(C,">", $cname) || die "Cannot open $cname:$!"; 224 225my $dname = $cname; 226my $hname = $cname; 227 228my ($doC,$doEnc,$doUcm,$doPet); 229 230if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined 231 { 232 $doC = 1; 233 $dname =~ s/(\.[^\.]*)?$/.exh/; 234 chmod(0666,$dname) if -f $cname && !-w $dname; 235 open(D,">", $dname) || die "Cannot open $dname:$!"; 236 $hname =~ s/(\.[^\.]*)?$/.h/; 237 chmod(0666,$hname) if -f $cname && !-w $hname; 238 open(H,">", $hname) || die "Cannot open $hname:$!"; 239 240 foreach my $fh (\*C,\*D,\*H) 241 { 242 print $fh <<"END" unless $opt{'q'}; 243/* 244 !!!!!!! DO NOT EDIT THIS FILE !!!!!!! 245 This file was autogenerated by: 246 $^X $0 @orig_ARGV 247 enc2xs VERSION $VERSION 248*/ 249END 250 } 251 252 if ($cname =~ /\.c$/i && $Config{ccname} eq "gcc") 253 { 254 print C qq(#pragma GCC diagnostic ignored "-Wc++-compat"\n); 255 } 256 257 if ($cname =~ /\.xs$/i) 258 { 259 print C "#define PERL_NO_GET_CONTEXT\n"; 260 print C "#include <EXTERN.h>\n"; 261 print C "#include <perl.h>\n"; 262 print C "#include <XSUB.h>\n"; 263 } 264 print C "#include \"encode.h\"\n\n"; 265 266 } 267elsif ($cname =~ /\.enc$/i) 268 { 269 $doEnc = 1; 270 } 271elsif ($cname =~ /\.ucm$/i) 272 { 273 $doUcm = 1; 274 } 275elsif ($cname =~ /\.pet$/i) 276 { 277 $doPet = 1; 278 } 279 280my %encoding; 281my %strings; 282my $string_acc; 283my %strings_in_acc; 284 285my $saved = 0; 286my $subsave = 0; 287my $strings = 0; 288 289sub cmp_name 290{ 291 if ($a =~ /^.*-(\d+)/) 292 { 293 my $an = $1; 294 if ($b =~ /^.*-(\d+)/) 295 { 296 my $r = $an <=> $1; 297 return $r if $r; 298 } 299 } 300 return $a cmp $b; 301} 302 303 304foreach my $enc (sort cmp_name @encfiles) 305 { 306 my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/; 307 $name = $opt{'n'} if exists $opt{'n'}; 308 if (open(E,$enc)) 309 { 310 if ($sfx eq 'enc') 311 { 312 compile_enc(\*E,lc($name)); 313 } 314 else 315 { 316 compile_ucm(\*E,lc($name)); 317 } 318 } 319 else 320 { 321 warn "Cannot open $enc for $name:$!"; 322 } 323 } 324 325if ($doC) 326 { 327 verbose "Writing compiled form\n"; 328 foreach my $name (sort cmp_name keys %encoding) 329 { 330 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 331 process($name.'_utf8',$e2u); 332 addstrings(\*C,$e2u); 333 334 process('utf8_'.$name,$u2e); 335 addstrings(\*C,$u2e); 336 } 337 outbigstring(\*C,"enctable"); 338 foreach my $name (sort cmp_name keys %encoding) 339 { 340 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 341 outtable(\*C,$e2u, "enctable"); 342 outtable(\*C,$u2e, "enctable"); 343 344 # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep)); 345 } 346 my ($cpp) = compiler_info(0); 347 my $ext = $cpp ? 'extern "C"' : "extern"; 348 my $exta = $cpp ? 'extern "C"' : "static"; 349 my $extb = $cpp ? 'extern "C"' : ""; 350 foreach my $enc (sort cmp_name keys %encoding) 351 { 352 # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}}; 353 my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}}; 354 #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el); 355 my $replen = 0; 356 $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g); 357 my $sym = "${enc}_encoding"; 358 $sym =~ s/\W+/_/g; 359 my @info = ($e2u->{Cname},$u2e->{Cname},"${sym}_rep_character",$replen, 360 $min_el,$max_el); 361 print C "${exta} const U8 ${sym}_rep_character[] = \"$rep\";\n"; 362 print C "${exta} const char ${sym}_enc_name[] = \"$enc\";\n\n"; 363 print C "${extb} const encode_t $sym = \n"; 364 # This is to make null encoding work -- dankogai 365 for (my $i = (scalar @info) - 1; $i >= 0; --$i){ 366 $info[$i] ||= 1; 367 } 368 # end of null tweak -- dankogai 369 print C " {",join(',',@info,"{${sym}_enc_name,(const char *)0}"),"};\n\n"; 370 } 371 372 foreach my $enc (sort cmp_name keys %encoding) 373 { 374 my $sym = "${enc}_encoding"; 375 $sym =~ s/\W+/_/g; 376 print H "${ext} encode_t $sym;\n"; 377 print D " Encode_XSEncoding(aTHX_ &$sym);\n"; 378 } 379 380 if ($cname =~ /(\w+)\.xs$/) 381 { 382 my $mod = $1; 383 print C <<'END'; 384 385static void 386Encode_XSEncoding(pTHX_ encode_t *enc) 387{ 388 dSP; 389 HV *stash = gv_stashpv("Encode::XS", TRUE); 390 SV *iv = newSViv(PTR2IV(enc)); 391 SV *sv = sv_bless(newRV_noinc(iv),stash); 392 int i = 0; 393 /* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's 394 constness, in the hope that perl won't mess with it. */ 395 assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0); 396 SvFLAGS(iv) |= SVp_POK; 397 SvPVX(iv) = (char*) enc->name[0]; 398 PUSHMARK(sp); 399 XPUSHs(sv); 400 while (enc->name[i]) 401 { 402 const char *name = enc->name[i++]; 403 XPUSHs(sv_2mortal(newSVpvn(name,strlen(name)))); 404 } 405 PUTBACK; 406 call_pv("Encode::define_encoding",G_DISCARD); 407 SvREFCNT_dec(sv); 408} 409 410END 411 412 print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n"; 413 print C "BOOT:\n{\n"; 414 print C "#include \"$dname\"\n"; 415 print C "}\n"; 416 } 417 # Close in void context is bad, m'kay 418 close(D) or warn "Error closing '$dname': $!"; 419 close(H) or warn "Error closing '$hname': $!"; 420 421 my $perc_saved = $saved/($strings + $saved) * 100; 422 my $perc_subsaved = $subsave/($strings + $subsave) * 100; 423 verbosef "%d bytes in string tables\n",$strings; 424 verbosef "%d bytes (%.3g%%) saved spotting duplicates\n", 425 $saved, $perc_saved if $saved; 426 verbosef "%d bytes (%.3g%%) saved using substrings\n", 427 $subsave, $perc_subsaved if $subsave; 428 } 429elsif ($doEnc) 430 { 431 foreach my $name (sort cmp_name keys %encoding) 432 { 433 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 434 output_enc(\*C,$name,$e2u); 435 } 436 } 437elsif ($doUcm) 438 { 439 foreach my $name (sort cmp_name keys %encoding) 440 { 441 my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; 442 output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el); 443 } 444 } 445 446# writing half meg files and then not checking to see if you just filled the 447# disk is bad, m'kay 448close(C) or die "Error closing '$cname': $!"; 449 450# End of the main program. 451 452sub compile_ucm 453{ 454 my ($fh,$name) = @_; 455 my $e2u = {}; 456 my $u2e = {}; 457 my $cs; 458 my %attr; 459 while (<$fh>) 460 { 461 s/#.*$//; 462 last if /^\s*CHARMAP\s*$/i; 463 if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr 464 { 465 $attr{$1} = $2; 466 } 467 } 468 if (!defined($cs = $attr{'code_set_name'})) 469 { 470 warn "No <code_set_name> in $name\n"; 471 } 472 else 473 { 474 $name = $cs unless exists $opt{'n'}; 475 } 476 my $erep; 477 my $urep; 478 my $max_el; 479 my $min_el; 480 if (exists $attr{'subchar'}) 481 { 482 #my @byte; 483 #$attr{'subchar'} =~ /^\s*/cg; 484 #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg; 485 #$erep = join('',map(chr(hex($_)),@byte)); 486 $erep = $attr{'subchar'}; 487 $erep =~ s/^\s+//; $erep =~ s/\s+$//; 488 } 489 print "Reading $name ($cs)\n" 490 unless defined $ENV{MAKEFLAGS} 491 and $ENV{MAKEFLAGS} =~ /\b(s|silent|quiet)\b/; 492 my $nfb = 0; 493 my $hfb = 0; 494 while (<$fh>) 495 { 496 s/#.*$//; 497 last if /^\s*END\s+CHARMAP\s*$/i; 498 next if /^\s*$/; 499 my (@uni, @byte) = (); 500 my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o 501 or die "Bad line: $_"; 502 while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){ 503 push @uni, map { substr($_, 1) } split(/\+/, $1); 504 } 505 while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){ 506 push @byte, $1; 507 } 508 if (@uni) 509 { 510 my $uch = join('', map { encode_U(hex($_)) } @uni ); 511 my $ech = join('',map(chr(hex($_)),@byte)); 512 my $el = length($ech); 513 $max_el = $el if (!defined($max_el) || $el > $max_el); 514 $min_el = $el if (!defined($min_el) || $el < $min_el); 515 if (length($fb)) 516 { 517 $fb = substr($fb,1); 518 $hfb++; 519 } 520 else 521 { 522 $nfb++; 523 $fb = '0'; 524 } 525 # $fb is fallback flag 526 # 0 - round trip safe 527 # 1 - fallback for unicode -> enc 528 # 2 - skip sub-char mapping 529 # 3 - fallback enc -> unicode 530 enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/); 531 enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/); 532 } 533 else 534 { 535 warn $_; 536 } 537 } 538 if ($nfb && $hfb) 539 { 540 die "$nfb entries without fallback, $hfb entries with\n"; 541 } 542 $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el]; 543} 544 545 546 547sub compile_enc 548{ 549 my ($fh,$name) = @_; 550 my $e2u = {}; 551 my $u2e = {}; 552 553 my $type; 554 while ($type = <$fh>) 555 { 556 last if $type !~ /^\s*#/; 557 } 558 chomp($type); 559 return if $type eq 'E'; 560 # Do the hash lookup once, rather than once per function call. 4% speedup. 561 my $type_func = $encode_types{$type}; 562 my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>)); 563 warn "$type encoded $name\n"; 564 my $rep = ''; 565 # Save a defined test by setting these to defined values. 566 my $min_el = ~0; # A very big integer 567 my $max_el = 0; # Anything must be longer than 0 568 { 569 my $v = hex($def); 570 $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe); 571 } 572 my $errors; 573 my $seen; 574 # use -Q to silence the seen test. Makefile.PL uses this by default. 575 $seen = {} unless $opt{Q}; 576 do 577 { 578 my $line = <$fh>; 579 chomp($line); 580 my $page = hex($line); 581 my $ch = 0; 582 my $i = 16; 583 do 584 { 585 # So why is it 1% faster to leave the my here? 586 my $line = <$fh>; 587 $line =~ s/\r\n$/\n/; 588 die "$.:${line}Line should be exactly 65 characters long including 589 newline (".length($line).")" unless length ($line) == 65; 590 # Split line into groups of 4 hex digits, convert groups to ints 591 # This takes 65.35 592 # map {hex $_} $line =~ /(....)/g 593 # This takes 63.75 (2.5% less time) 594 # unpack "n*", pack "H*", $line 595 # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay 596 # Doing it as while ($line =~ /(....)/g) took 74.63 597 foreach my $val (unpack "n*", pack "H*", $line) 598 { 599 next if $val == 0xFFFD; 600 my $ech = &$type_func($ch,$page); 601 if ($val || (!$ch && !$page)) 602 { 603 my $el = length($ech); 604 $max_el = $el if $el > $max_el; 605 $min_el = $el if $el < $min_el; 606 my $uch = encode_U($val); 607 if ($seen) { 608 # We're doing the test. 609 # We don't need to read this quickly, so storing it as a scalar, 610 # rather than 3 (anon array, plus the 2 scalars it holds) saves 611 # RAM and may make us faster on low RAM systems. [see __END__] 612 if (exists $seen->{$uch}) 613 { 614 warn sprintf("U%04X is %02X%02X and %04X\n", 615 $val,$page,$ch,$seen->{$uch}); 616 $errors++; 617 } 618 else 619 { 620 $seen->{$uch} = $page << 8 | $ch; 621 } 622 } 623 # Passing 2 extra args each time is 3.6% slower! 624 # Even with having to add $fallback ||= 0 later 625 enter_fb0($e2u,$ech,$uch); 626 enter_fb0($u2e,$uch,$ech); 627 } 628 else 629 { 630 # No character at this position 631 # enter($e2u,$ech,undef,$e2u); 632 } 633 $ch++; 634 } 635 } while --$i; 636 } while --$pages; 637 die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines" 638 if $min_el > $max_el; 639 die "$errors mapping conflicts\n" if ($errors && $opt{'S'}); 640 $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el]; 641} 642 643# my ($a,$s,$d,$t,$fb) = @_; 644sub enter { 645 my ($current,$inbytes,$outbytes,$next,$fallback) = @_; 646 # state we shift to after this (multibyte) input character defaults to same 647 # as current state. 648 $next ||= $current; 649 # Making sure it is defined seems to be faster than {no warnings;} in 650 # &process, or passing it in as 0 explicitly. 651 # XXX $fallback ||= 0; 652 653 # Start at the beginning and work forwards through the string to zero. 654 # effectively we are removing 1 character from the front each time 655 # but we don't actually edit the string. [this alone seems to be 14% speedup] 656 # Hence -$pos is the length of the remaining string. 657 my $pos = -length $inbytes; 658 while (1) { 659 my $byte = substr $inbytes, $pos, 1; 660 # RAW_NEXT => 0, 661 # RAW_IN_LEN => 1, 662 # RAW_OUT_BYTES => 2, 663 # RAW_FALLBACK => 3, 664 # to unicode an array would seem to be better, because the pages are dense. 665 # from unicode can be very sparse, favouring a hash. 666 # hash using the bytes (all length 1) as keys rather than ord value, 667 # as it's easier to sort these in &process. 668 669 # It's faster to always add $fallback even if it's undef, rather than 670 # choosing between 3 and 4 element array. (hence why we set it defined 671 # above) 672 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback]; 673 # When $pos was -1 we were at the last input character. 674 unless (++$pos) { 675 $do_now->[RAW_OUT_BYTES] = $outbytes; 676 $do_now->[RAW_NEXT] = $next; 677 return; 678 } 679 # Tail recursion. The intermediate state may not have a name yet. 680 $current = $do_now->[RAW_NEXT]; 681 } 682} 683 684# This is purely for optimisation. It's just &enter hard coded for $fallback 685# of 0, using only a 3 entry array ref to save memory for every entry. 686sub enter_fb0 { 687 my ($current,$inbytes,$outbytes,$next) = @_; 688 $next ||= $current; 689 690 my $pos = -length $inbytes; 691 while (1) { 692 my $byte = substr $inbytes, $pos, 1; 693 my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'']; 694 unless (++$pos) { 695 $do_now->[RAW_OUT_BYTES] = $outbytes; 696 $do_now->[RAW_NEXT] = $next; 697 return; 698 } 699 $current = $do_now->[RAW_NEXT]; 700 } 701} 702 703sub process 704{ 705 my ($name,$a) = @_; 706 $name =~ s/\W+/_/g; 707 $a->{Cname} = $name; 708 my $raw = $a->{Raw}; 709 my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback); 710 my @ent; 711 $agg_max_in = 0; 712 foreach my $key (sort keys %$raw) { 713 # RAW_NEXT => 0, 714 # RAW_IN_LEN => 1, 715 # RAW_OUT_BYTES => 2, 716 # RAW_FALLBACK => 3, 717 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}}; 718 # Now we are converting from raw to aggregate, switch from 1 byte strings 719 # to numbers 720 my $b = ord $key; 721 $fallback ||= 0; 722 if ($l && 723 # If this == fails, we're going to reset $agg_max_in below anyway. 724 $b == ++$agg_max_in && 725 # References in numeric context give the pointer as an int. 726 $agg_next == $next && 727 $agg_in_len == $in_len && 728 $agg_out_len == length $out_bytes && 729 $agg_fallback == $fallback 730 # && length($l->[AGG_OUT_BYTES]) < 16 731 ) { 732 # my $i = ord($b)-ord($l->[AGG_MIN_IN]); 733 # we can aggregate this byte onto the end. 734 $l->[AGG_MAX_IN] = $b; 735 $l->[AGG_OUT_BYTES] .= $out_bytes; 736 } else { 737 # AGG_MIN_IN => 0, 738 # AGG_MAX_IN => 1, 739 # AGG_OUT_BYTES => 2, 740 # AGG_NEXT => 3, 741 # AGG_IN_LEN => 4, 742 # AGG_OUT_LEN => 5, 743 # AGG_FALLBACK => 6, 744 # Reset the last thing we saw, plus set 5 lexicals to save some derefs. 745 # (only gains .6% on euc-jp -- is it worth it?) 746 push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next, 747 $agg_in_len = $in_len, $agg_out_len = length $out_bytes, 748 $agg_fallback = $fallback]; 749 } 750 if (exists $next->{Cname}) { 751 $next->{'Forward'} = 1 if $next != $a; 752 } else { 753 process(sprintf("%s_%02x",$name,$b),$next); 754 } 755 } 756 # encengine.c rules say that last entry must be for 255 757 if ($agg_max_in < 255) { 758 push @ent, [1+$agg_max_in, 255,undef,$a,0,0]; 759 } 760 $a->{'Entries'} = \@ent; 761} 762 763 764sub addstrings 765{ 766 my ($fh,$a) = @_; 767 my $name = $a->{'Cname'}; 768 # String tables 769 foreach my $b (@{$a->{'Entries'}}) 770 { 771 next unless $b->[AGG_OUT_LEN]; 772 $strings{$b->[AGG_OUT_BYTES]} = undef; 773 } 774 if ($a->{'Forward'}) 775 { 776 my ($cpp, $static, $sized) = compiler_info(1); 777 my $count = $sized ? scalar(@{$a->{'Entries'}}) : ''; 778 if ($static) { 779 # we cannot ask Config for d_plusplus since we can override CC=g++-6 on the cmdline 780 print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6 781 print $fh "extern encpage_t $name\[$count];\n"; 782 print $fh "#else\n"; 783 print $fh "static const encpage_t $name\[$count];\n"; 784 print $fh "#endif\n"; 785 } else { 786 print $fh "extern encpage_t $name\[$count];\n"; 787 } 788 } 789 $a->{'DoneStrings'} = 1; 790 foreach my $b (@{$a->{'Entries'}}) 791 { 792 my ($s,$e,$out,$t,$end,$l) = @$b; 793 addstrings($fh,$t) unless $t->{'DoneStrings'}; 794 } 795} 796 797sub outbigstring 798{ 799 my ($fh,$name) = @_; 800 801 $string_acc = ''; 802 803 # Make the big string in the string accumulator. Longest first, on the hope 804 # that this makes it more likely that we find the short strings later on. 805 # Not sure if it helps sorting strings of the same length lexically. 806 foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) { 807 my $index = index $string_acc, $s; 808 if ($index >= 0) { 809 $saved += length($s); 810 $strings_in_acc{$s} = $index; 811 } else { 812 OPTIMISER: { 813 if ($opt{'O'}) { 814 my $sublength = length $s; 815 while (--$sublength > 0) { 816 # progressively lop characters off the end, to see if the start of 817 # the new string overlaps the end of the accumulator. 818 if (substr ($string_acc, -$sublength) 819 eq substr ($s, 0, $sublength)) { 820 $subsave += $sublength; 821 $strings_in_acc{$s} = length ($string_acc) - $sublength; 822 # append the last bit on the end. 823 $string_acc .= substr ($s, $sublength); 824 last OPTIMISER; 825 } 826 # or if the end of the new string overlaps the start of the 827 # accumulator 828 next unless substr ($string_acc, 0, $sublength) 829 eq substr ($s, -$sublength); 830 # well, the last $sublength characters of the accumulator match. 831 # so as we're prepending to the accumulator, need to shift all our 832 # existing offsets forwards 833 $_ += $sublength foreach values %strings_in_acc; 834 $subsave += $sublength; 835 $strings_in_acc{$s} = 0; 836 # append the first bit on the start. 837 $string_acc = substr ($s, 0, -$sublength) . $string_acc; 838 last OPTIMISER; 839 } 840 } 841 # Optimiser (if it ran) found nothing, so just going have to tack the 842 # whole thing on the end. 843 $strings_in_acc{$s} = length $string_acc; 844 $string_acc .= $s; 845 }; 846 } 847 } 848 849 $strings = length $string_acc; 850 my ($cpp) = compiler_info(0); 851 my $var = $cpp ? '' : 'static'; 852 my $definition = "\n$var const U8 $name\[$strings] = { " . 853 join(',',unpack "C*",$string_acc); 854 # We have a single long line. Split it at convenient commas. 855 print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs; 856 print $fh substr ($definition, pos $definition), " };\n"; 857} 858 859sub findstring { 860 my ($name,$s) = @_; 861 my $offset = $strings_in_acc{$s}; 862 die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator" 863 unless defined $offset; 864 "$name + $offset"; 865} 866 867sub outtable 868{ 869 my ($fh,$a,$bigname) = @_; 870 my $name = $a->{'Cname'}; 871 $a->{'Done'} = 1; 872 foreach my $b (@{$a->{'Entries'}}) 873 { 874 my ($s,$e,$out,$t,$end,$l) = @$b; 875 outtable($fh,$t,$bigname) unless $t->{'Done'}; 876 } 877 my ($cpp, $static) = compiler_info(0); 878 my $count = scalar(@{$a->{'Entries'}}); 879 if ($static) { 880 print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6 881 print $fh "encpage_t $name\[$count] = {\n"; 882 print $fh "#else\n"; 883 print $fh "static const encpage_t $name\[$count] = {\n"; 884 print $fh "#endif\n"; 885 } else { 886 print $fh "\nencpage_t $name\[$count] = {\n"; 887 } 888 foreach my $b (@{$a->{'Entries'}}) 889 { 890 my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b; 891 # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan 892 print $fh "{"; 893 if ($l) 894 { 895 printf $fh findstring($bigname,$out); 896 } 897 else 898 { 899 print $fh "0"; 900 } 901 print $fh ",",$t->{Cname}; 902 printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec; 903 } 904 print $fh "};\n"; 905} 906 907sub output_enc 908{ 909 my ($fh,$name,$a) = @_; 910 die "Changed - fix me for new structure"; 911 foreach my $b (sort keys %$a) 912 { 913 my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}}; 914 } 915} 916 917sub decode_U 918{ 919 my $s = shift; 920} 921 922my @uname; 923sub char_names{} # cf. https://rt.cpan.org/Ticket/Display.html?id=132471 924 925sub output_ucm_page 926{ 927 my ($cmap,$a,$t,$pre) = @_; 928 # warn sprintf("Page %x\n",$pre); 929 my $raw = $t->{Raw}; 930 foreach my $key (sort keys %$raw) { 931 # RAW_NEXT => 0, 932 # RAW_IN_LEN => 1, 933 # RAW_OUT_BYTES => 2, 934 # RAW_FALLBACK => 3, 935 my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}}; 936 my $u = ord $key; 937 $fallback ||= 0; 938 939 if ($next != $a && $next != $t) { 940 output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF); 941 } elsif (length $out_bytes) { 942 if ($pre) { 943 $u = $pre|($u &0x3f); 944 } 945 my $s = sprintf "<U%04X> ",$u; 946 #foreach my $c (split(//,$out_bytes)) { 947 # $s .= sprintf "\\x%02X",ord($c); 948 #} 949 # 9.5% faster changing that loop to this: 950 $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes; 951 $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u]; 952 push(@$cmap,$s); 953 } else { 954 warn join(',',$u, @{$raw->{$key}},$a,$t); 955 } 956 } 957} 958 959sub output_ucm 960{ 961 my ($fh,$name,$h,$rep,$min_el,$max_el) = @_; 962 print $fh "# $0 @orig_ARGV\n" unless $opt{'q'}; 963 print $fh "<code_set_name> \"$name\"\n"; 964 char_names(); 965 if (defined $min_el) 966 { 967 print $fh "<mb_cur_min> $min_el\n"; 968 } 969 if (defined $max_el) 970 { 971 print $fh "<mb_cur_max> $max_el\n"; 972 } 973 if (defined $rep) 974 { 975 print $fh "<subchar> "; 976 foreach my $c (split(//,$rep)) 977 { 978 printf $fh "\\x%02X",ord($c); 979 } 980 print $fh "\n"; 981 } 982 my @cmap; 983 output_ucm_page(\@cmap,$h,$h,0); 984 print $fh "#\nCHARMAP\n"; 985 foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap) 986 { 987 print $fh $line; 988 } 989 print $fh "END CHARMAP\n"; 990} 991 992use vars qw( 993 $_Enc2xs 994 $_Version 995 $_Inc 996 $_E2X 997 $_Name 998 $_TableFiles 999 $_Now 1000); 1001 1002sub find_e2x{ 1003 eval { require File::Find; }; 1004 my (@inc, %e2x_dir); 1005 for my $inc (@INC){ 1006 push @inc, $inc unless $inc eq '.'; #skip current dir 1007 } 1008 File::Find::find( 1009 sub { 1010 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, 1011 $atime,$mtime,$ctime,$blksize,$blocks) 1012 = lstat($_) or return; 1013 -f _ or return; 1014 if (/^.*\.e2x$/o){ 1015 no warnings 'once'; 1016 $e2x_dir{$File::Find::dir} ||= $mtime; 1017 } 1018 return; 1019 }, @inc); 1020 warn join("\n", keys %e2x_dir), "\n"; 1021 for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){ 1022 $_E2X = $d; 1023 # warn "$_E2X => ", scalar localtime($e2x_dir{$d}); 1024 return $_E2X; 1025 } 1026} 1027 1028sub make_makefile_pl 1029{ 1030 eval { require Encode } or die "You need to install Encode to use enc2xs -M\nerror: $@\n"; 1031 # our used for variable expansion 1032 $_Enc2xs = $0; 1033 $_Version = $VERSION; 1034 $_E2X = find_e2x(); 1035 $_Name = shift; 1036 $_TableFiles = join(",", map {qq('$_')} @_); 1037 $_Now = scalar localtime(); 1038 1039 eval { require File::Spec; }; 1040 _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL"); 1041 _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm"); 1042 _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t"); 1043 _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README"); 1044 _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes"); 1045 exit; 1046} 1047 1048use vars qw( 1049 $_ModLines 1050 $_LocalVer 1051 ); 1052 1053sub make_configlocal_pm { 1054 eval { require Encode } or die "Unable to require Encode: $@\n"; 1055 eval { require File::Spec; }; 1056 1057 # our used for variable expantion 1058 my %in_core = map { $_ => 1 } ( 1059 'ascii', 'iso-8859-1', 'utf8', 1060 'ascii-ctrl', 'null', 'utf-8-strict' 1061 ); 1062 my %LocalMod = (); 1063 # check @enc; 1064 use File::Find (); 1065 my $wanted = sub{ 1066 -f $_ or return; 1067 $File::Find::name =~ /\A\./ and return; 1068 $File::Find::name =~ /\.pm\z/ or return; 1069 $File::Find::name =~ m/\bEncode\b/ or return; 1070 my $mod = $File::Find::name; 1071 $mod =~ s/.*\bEncode\b/Encode/o; 1072 $mod =~ s/\.pm\z//o; 1073 $mod =~ s,/,::,og; 1074 eval qq{ require $mod; } or return; 1075 warn qq{ require $mod;\n}; 1076 for my $enc ( Encode->encodings() ) { 1077 no warnings; 1078 $in_core{$enc} and next; 1079 $Encode::Config::ExtModule{$enc} and next; 1080 $LocalMod{$enc} ||= $mod; 1081 } 1082 }; 1083 File::Find::find({wanted => $wanted}, @INC); 1084 $_ModLines = ""; 1085 for my $enc ( sort keys %LocalMod ) { 1086 $_ModLines .= 1087 qq(\$Encode::ExtModule{'$enc'} = "$LocalMod{$enc}";\n); 1088 } 1089 warn $_ModLines if $_ModLines; 1090 $_LocalVer = _mkversion(); 1091 $_E2X = find_e2x(); 1092 $_Inc = $INC{"Encode.pm"}; 1093 $_Inc =~ s/\.pm$//o; 1094 _print_expand( File::Spec->catfile( $_E2X, "ConfigLocal_PM.e2x" ), 1095 File::Spec->catfile( $_Inc, "ConfigLocal.pm" ), 1 ); 1096 exit; 1097} 1098 1099sub _mkversion{ 1100 # v-string is now depreciated; use time() instead; 1101 #my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime(); 1102 #$yyyy += 1900, $mo +=1; 1103 #return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm); 1104 return time(); 1105} 1106 1107sub _print_expand{ 1108 eval { require File::Basename } or die "File::Basename needed. Are you on miniperl?;\nerror: $@\n"; 1109 File::Basename->import(); 1110 my ($src, $dst, $clobber) = @_; 1111 if (!$clobber and -e $dst){ 1112 warn "$dst exists. skipping\n"; 1113 return; 1114 } 1115 warn "Generating $dst...\n"; 1116 open my $in, $src or die "$src : $!"; 1117 if ((my $d = dirname($dst)) ne '.'){ 1118 -d $d or mkdir $d, 0755 or die "mkdir $d : $!"; 1119 } 1120 open my $out, ">", $dst or die "$!"; 1121 my $asis = 0; 1122 while (<$in>){ 1123 if (/^#### END_OF_HEADER/){ 1124 $asis = 1; next; 1125 } 1126 s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis; 1127 print $out $_; 1128 } 1129} 1130__END__ 1131 1132=head1 NAME 1133 1134enc2xs -- Perl Encode Module Generator 1135 1136=head1 SYNOPSIS 1137 1138 enc2xs -[options] 1139 enc2xs -M ModName mapfiles... 1140 enc2xs -C 1141 1142=head1 DESCRIPTION 1143 1144F<enc2xs> builds a Perl extension for use by Encode from either 1145Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc). 1146Besides being used internally during the build process of the Encode 1147module, you can use F<enc2xs> to add your own encoding to perl. 1148No knowledge of XS is necessary. 1149 1150=head1 Quick Guide 1151 1152If you want to know as little about Perl as possible but need to 1153add a new encoding, just read this chapter and forget the rest. 1154 1155=over 4 1156 1157=item 0.Z<> 1158 1159Have a .ucm file ready. You can get it from somewhere or you can write 1160your own from scratch or you can grab one from the Encode distribution 1161and customize it. For the UCM format, see the next Chapter. In the 1162example below, I'll call my theoretical encoding myascii, defined 1163in I<my.ucm>. C<$> is a shell prompt. 1164 1165 $ ls -F 1166 my.ucm 1167 1168=item 1.Z<> 1169 1170Issue a command as follows; 1171 1172 $ enc2xs -M My my.ucm 1173 generating Makefile.PL 1174 generating My.pm 1175 generating README 1176 generating Changes 1177 1178Now take a look at your current directory. It should look like this. 1179 1180 $ ls -F 1181 Makefile.PL My.pm my.ucm t/ 1182 1183The following files were created. 1184 1185 Makefile.PL - MakeMaker script 1186 My.pm - Encode submodule 1187 t/My.t - test file 1188 1189=over 4 1190 1191=item 1.1.Z<> 1192 1193If you want *.ucm installed together with the modules, do as follows; 1194 1195 $ mkdir Encode 1196 $ mv *.ucm Encode 1197 $ enc2xs -M My Encode/*ucm 1198 1199=back 1200 1201=item 2.Z<> 1202 1203Edit the files generated. You don't have to if you have no time AND no 1204intention to give it to someone else. But it is a good idea to edit 1205the pod and to add more tests. 1206 1207=item 3.Z<> 1208 1209Now issue a command all Perl Mongers love: 1210 1211 $ perl Makefile.PL 1212 Writing Makefile for Encode::My 1213 1214=item 4.Z<> 1215 1216Now all you have to do is make. 1217 1218 $ make 1219 cp My.pm blib/lib/Encode/My.pm 1220 /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \ 1221 -o encode_t.c -f encode_t.fnm 1222 Reading myascii (myascii) 1223 Writing compiled form 1224 128 bytes in string tables 1225 384 bytes (75%) saved spotting duplicates 1226 1 bytes (0.775%) saved using substrings 1227 .... 1228 chmod 644 blib/arch/auto/Encode/My/My.bs 1229 $ 1230 1231The time it takes varies depending on how fast your machine is and 1232how large your encoding is. Unless you are working on something big 1233like euc-tw, it won't take too long. 1234 1235=item 5.Z<> 1236 1237You can "make install" already but you should test first. 1238 1239 $ make test 1240 PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \ 1241 -e 'use Test::Harness qw(&runtests $verbose); \ 1242 $verbose=0; runtests @ARGV;' t/*.t 1243 t/My....ok 1244 All tests successful. 1245 Files=1, Tests=2, 0 wallclock secs 1246 ( 0.09 cusr + 0.01 csys = 0.09 CPU) 1247 1248=item 6.Z<> 1249 1250If you are content with the test result, just "make install" 1251 1252=item 7.Z<> 1253 1254If you want to add your encoding to Encode's demand-loading list 1255(so you don't have to "use Encode::YourEncoding"), run 1256 1257 enc2xs -C 1258 1259to update Encode::ConfigLocal, a module that controls local settings. 1260After that, "use Encode;" is enough to load your encodings on demand. 1261 1262=back 1263 1264=head1 The Unicode Character Map 1265 1266Encode uses the Unicode Character Map (UCM) format for source character 1267mappings. This format is used by IBM's ICU package and was adopted 1268by Nick Ing-Simmons for use with the Encode module. Since UCM is 1269more flexible than Tcl's Encoding Map and far more user-friendly, 1270this is the recommended format for Encode now. 1271 1272A UCM file looks like this. 1273 1274 # 1275 # Comments 1276 # 1277 <code_set_name> "US-ascii" # Required 1278 <code_set_alias> "ascii" # Optional 1279 <mb_cur_min> 1 # Required; usually 1 1280 <mb_cur_max> 1 # Max. # of bytes/char 1281 <subchar> \x3F # Substitution char 1282 # 1283 CHARMAP 1284 <U0000> \x00 |0 # <control> 1285 <U0001> \x01 |0 # <control> 1286 <U0002> \x02 |0 # <control> 1287 .... 1288 <U007C> \x7C |0 # VERTICAL LINE 1289 <U007D> \x7D |0 # RIGHT CURLY BRACKET 1290 <U007E> \x7E |0 # TILDE 1291 <U007F> \x7F |0 # <control> 1292 END CHARMAP 1293 1294=over 4 1295 1296=item * 1297 1298Anything that follows C<#> is treated as a comment. 1299 1300=item * 1301 1302The header section continues until a line containing the word 1303CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one 1304pair per line. Strings used as values must be quoted. Barewords are 1305treated as numbers. I<\xXX> represents a byte. 1306 1307Most of the keywords are self-explanatory. I<subchar> means 1308substitution character, not subcharacter. When you decode a Unicode 1309sequence to this encoding but no matching character is found, the byte 1310sequence defined here will be used. For most cases, the value here is 1311\x3F; in ASCII, this is a question mark. 1312 1313=item * 1314 1315CHARMAP starts the character map section. Each line has a form as 1316follows: 1317 1318 <UXXXX> \xXX.. |0 # comment 1319 ^ ^ ^ 1320 | | +- Fallback flag 1321 | +-------- Encoded byte sequence 1322 +-------------- Unicode Character ID in hex 1323 1324The format is roughly the same as a header section except for the 1325fallback flag: | followed by 0..3. The meaning of the possible 1326values is as follows: 1327 1328=over 4 1329 1330=item |0 1331 1332Round trip safe. A character decoded to Unicode encodes back to the 1333same byte sequence. Most characters have this flag. 1334 1335=item |1 1336 1337Fallback for unicode -> encoding. When seen, enc2xs adds this 1338character for the encode map only. 1339 1340=item |2 1341 1342Skip sub-char mapping should there be no code point. 1343 1344=item |3 1345 1346Fallback for encoding -> unicode. When seen, enc2xs adds this 1347character for the decode map only. 1348 1349=back 1350 1351=item * 1352 1353And finally, END OF CHARMAP ends the section. 1354 1355=back 1356 1357When you are manually creating a UCM file, you should copy ascii.ucm 1358or an existing encoding which is close to yours, rather than write 1359your own from scratch. 1360 1361When you do so, make sure you leave at least B<U0000> to B<U0020> as 1362is, unless your environment is EBCDIC. 1363 1364B<CAVEAT>: not all features in UCM are implemented. For example, 1365icu:state is not used. Because of that, you need to write a perl 1366module if you want to support algorithmical encodings, notably 1367the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>, 1368L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>. 1369 1370=head2 Coping with duplicate mappings 1371 1372When you create a map, you SHOULD make your mappings round-trip safe. 1373That is, C<encode('your-encoding', decode('your-encoding', $data)) eq 1374$data> stands for all characters that are marked as C<|0>. Here is 1375how to make sure: 1376 1377=over 4 1378 1379=item * 1380 1381Sort your map in Unicode order. 1382 1383=item * 1384 1385When you have a duplicate entry, mark either one with '|1' or '|3'. 1386 1387=item * 1388 1389And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry. 1390 1391=back 1392 1393Here is an example from big5-eten. 1394 1395 <U2550> \xF9\xF9 |0 1396 <U2550> \xA2\xA4 |3 1397 1398Internally Encoding -> Unicode and Unicode -> Encoding Map looks like 1399this; 1400 1401 E to U U to E 1402 -------------------------------------- 1403 \xF9\xF9 => U2550 U2550 => \xF9\xF9 1404 \xA2\xA4 => U2550 1405 1406So it is round-trip safe for \xF9\xF9. But if the line above is upside 1407down, here is what happens. 1408 1409 E to U U to E 1410 -------------------------------------- 1411 \xA2\xA4 => U2550 U2550 => \xF9\xF9 1412 (\xF9\xF9 => U2550 is now overwritten!) 1413 1414The Encode package comes with F<ucmlint>, a crude but sufficient 1415utility to check the integrity of a UCM file. Check under the 1416Encode/bin directory for this. 1417 1418When in doubt, you can use F<ucmsort>, yet another utility under 1419Encode/bin directory. 1420 1421=head1 Bookmarks 1422 1423=over 4 1424 1425=item * 1426 1427ICU Home Page 1428L<http://www.icu-project.org/> 1429 1430=item * 1431 1432ICU Character Mapping Tables 1433L<http://site.icu-project.org/charts/charset> 1434 1435=item * 1436 1437ICU:Conversion Data 1438L<http://www.icu-project.org/userguide/conversion-data.html> 1439 1440=back 1441 1442=head1 SEE ALSO 1443 1444L<Encode>, 1445L<perlmod>, 1446L<perlpod> 1447 1448=cut 1449 1450# -Q to disable the duplicate codepoint test 1451# -S make mapping errors fatal 1452# -q to remove comments written to output files 1453# -O to enable the (brute force) substring optimiser 1454# -o <output> to specify the output file name (else it's the first arg) 1455# -f <inlist> to give a file with a list of input files (else use the args) 1456# -n <name> to name the encoding (else use the basename of the input file. 1457 1458With %seen holding array refs: 1459 1460 865.66 real 28.80 user 8.79 sys 1461 7904 maximum resident set size 1462 1356 average shared memory size 1463 18566 average unshared data size 1464 229 average unshared stack size 1465 46080 page reclaims 1466 33373 page faults 1467 1468With %seen holding simple scalars: 1469 1470 342.16 real 27.11 user 3.54 sys 1471 8388 maximum resident set size 1472 1394 average shared memory size 1473 14969 average unshared data size 1474 236 average unshared stack size 1475 28159 page reclaims 1476 9839 page faults 1477 1478Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is 1479how %seen is storing things its seen. So it is pathalogically bad on a 16M 1480RAM machine, but it's going to help even on modern machines. 1481Swapping is bad, m'kay :-) 1482