1# 2# $Id: _Classic.pm,v 2.0 2005/05/16 19:08:04 dankogai Exp $ 3# 4 5package Jcode::_Classic; 6use 5.004; 7use Carp; 8use strict; 9use vars qw($RCSID $VERSION $DEBUG); 10 11$RCSID = q$Id: _Classic.pm,v 2.0 2005/05/16 19:08:04 dankogai Exp $; 12$VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; 13 14$DEBUG = $Jcode::DEBUG; 15use vars qw($USE_CACHE $NOXS); 16 17$USE_CACHE = 1; 18$NOXS = 0; 19 20print $RCSID, "\n" if $DEBUG; 21 22use Jcode::Constants qw(:all); 23 24sub new { 25 my $class = shift; 26 my ($thingy, $icode) = @_; 27 my $r_str = ref $thingy ? $thingy : \$thingy; 28 my $nmatch; 29 ($icode, $nmatch) = getcode($r_str) unless $icode; 30 convert($r_str, 'euc', $icode); 31 my $self = [ 32 $r_str, 33 $icode, 34 $nmatch, 35 ]; 36 carp "Object of class $class created" if $DEBUG >= 2; 37 bless $self, $class; 38} 39 40sub r_str { $_[0]->[0] } 41sub icode { $_[0]->[1] } 42sub nmatch { $_[0]->[2] } 43 44sub set { 45 my $self = shift; 46 my ($thingy, $icode) = @_; 47 my $r_str = ref $thingy ? $thingy : \$thingy; 48 my $nmatch; 49 ($icode, $nmatch) = getcode($r_str) unless $icode; 50 convert($r_str, 'euc', $icode); 51 $self->[0] = $r_str; 52 $self->[1] = $icode; 53 $self->[2] = $nmatch; 54 $self->[3] = "Classic"; 55 return $self; 56} 57 58sub append { 59 my $self = shift; 60 my ($thingy, $icode) = @_; 61 my $r_str = ref $thingy ? $thingy : \$thingy; 62 my $nmatch; 63 ($icode, $nmatch) = getcode($r_str) unless $icode; 64 convert($r_str, 'euc', $icode); 65 ${$self->[0]} .= $$r_str; 66 $self->[1] = $icode; 67 $self->[2] = $nmatch; 68 return $self; 69} 70 71sub jcode { return Jcode->new(@_) } 72sub euc { return ${$_[0]->[0]} } 73sub jis { return &euc_jis(${$_[0]->[0]})} 74sub sjis { return &euc_sjis(${$_[0]->[0]})} 75sub iso_2022_jp{return $_[0]->h2z->jis} 76 77sub jfold{ 78 my $self = shift; 79 my ($bpl, $nl) = @_; 80 $bpl ||= 72; 81 $nl ||= "\n"; 82 my $r_str = $self->[0]; 83 my @lines = (); my $len = 0; my $i = 0; 84 while ($$r_str =~ 85 m/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|[\x00-\xff])/sgo) 86 { 87 if ($len + length($1) > $bpl){ # fold! 88 $i++; 89 $len = 0; 90 } 91 $lines[$i] .= $1; 92 $len += length($1); 93 } 94 defined($lines[$i]) or pop @lines; 95 $$r_str = join($nl, @lines); 96 return wantarray ? @lines : $self; 97} 98 99sub jlength { 100 my $self = shift; 101 my $r_str = $self->[0]; 102 return scalar (my @char = $$r_str =~ m/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|[\x00-\xff])/sgo); 103} 104 105sub mime_encode{ 106 my $self = shift; 107 my $r_str = $self->[0]; 108 my $lf = shift || "\n"; 109 my $bpl = shift || 76; 110 111 my ($trailing_crlf) = ($$r_str =~ /(\n|\r|\x0d\x0a)$/o); 112 my $str = _mime_unstructured_header($$r_str, $lf, $bpl); 113 not $trailing_crlf and $str =~ s/(\n|\r|\x0d\x0a)$//o; 114 $str; 115} 116 117# 118# shamelessly stolen from 119# http://www.din.or.jp/~ohzaki/perl.htm#JP_Base64 120# 121 122sub _add_encoded_word { 123 require MIME::Base64; 124 my($str, $line, $bpl) = @_; 125 my $result = ''; 126 while (length($str)) { 127 my $target = $str; 128 $str = ''; 129 if (length($line) + 22 + 130 ($target =~ /^(?:$RE{EUC_0212}|$RE{EUC_C})/o) * 8 > $bpl) { 131 $line =~ s/[ \t\n\r]*$/\n/; 132 $result .= $line; 133 $line = ' '; 134 } 135 while (1) { 136 my $iso_2022_jp = jcode($target, 'euc')->iso_2022_jp; 137 if (my $count = ($iso_2022_jp =~ tr/\x80-\xff//d)){ 138 $DEBUG and warn $count; 139 $target = jcode($iso_2022_jp, 'iso_2022_jp')->euc; 140 } 141 my $encoded = '=?ISO-2022-JP?B?' . 142 MIME::Base64::encode_base64($iso_2022_jp, '') 143 . '?='; 144 if (length($encoded) + length($line) > $bpl) { 145 $target =~ 146 s/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|$RE{ASCII})$//o; 147 $str = $1 . $str; 148 } else { 149 $line .= $encoded; 150 last; 151 } 152 } 153 } 154 return $result . $line; 155} 156 157sub _mime_unstructured_header { 158 my ($oldheader, $lf, $bpl) = @_; 159 my(@words, @wordstmp, $i); 160 my $header = ''; 161 $oldheader =~ s/\s+$//; 162 @wordstmp = split /\s+/, $oldheader; 163 for ($i = 0; $i < $#wordstmp; $i++) { 164 if ($wordstmp[$i] !~ /^[\x21-\x7E]+$/ and 165 $wordstmp[$i + 1] !~ /^[\x21-\x7E]+$/) { 166 $wordstmp[$i + 1] = "$wordstmp[$i] $wordstmp[$i + 1]"; 167 } else { 168 push(@words, $wordstmp[$i]); 169 } 170 } 171 push(@words, $wordstmp[-1]); 172 for my $word (@words) { 173 if ($word =~ /^[\x21-\x7E]+$/) { 174 $header =~ /(?:.*\n)*(.*)/; 175 if (length($1) + length($word) > $bpl) { 176 $header .= "$lf $word"; 177 } else { 178 $header .= $word; 179 } 180 } else { 181 $header = _add_encoded_word($word, $header, $bpl); 182 } 183 $header =~ /(?:.*\n)*(.*)/; 184 if (length($1) == $bpl) { 185 $header .= "$lf "; 186 } else { 187 $header .= ' '; 188 } 189 } 190 $header =~ s/\n? $/\n/; 191 $header; 192} 193 194# see http://www.din.or.jp/~ohzaki/perl.htm#JP_Base64 195#$lws = '(?:(?:\x0d\x0a)?[ \t])+'; 196#$ew_regex = '=\?ISO-2022-JP\?B\?([A-Za-z0-9+/]+=*)\?='; 197#$str =~ s/($ew_regex)$lws(?=$ew_regex)/$1/gio; 198#$str =~ s/$lws/ /go; $str =~ s/$ew_regex/decode_base64($1)/egio; 199 200sub mime_decode{ 201 require MIME::Base64; # not use 202 my $self = shift; 203 my $r_str = $self->[0]; 204 my $re_lws = '(?:(?:\r|\n|\x0d\x0a)?[ \t])+'; 205 my $re_ew = '=\?[Ii][Ss][Oo]-2022-[Jj][Pp]\?[Bb]\?([A-Za-z0-9+/]+=*)\?='; 206 $$r_str =~ s/($re_ew)$re_lws(?=$re_ew)/$1/sgo; 207 $$r_str =~ s/$re_lws/ /go; 208 $self->[2] = 209 ($$r_str =~ 210 s/$re_ew/jis_euc(MIME::Base64::decode_base64($1))/ego 211 ); 212 $self; 213} 214 215sub tr{ 216 require Jcode::Tr; # not use 217 my $self = shift; 218 $self->[2] = Jcode::Tr::tr($self->[0], @_); 219 return $self; 220} 221 222# 223# load needed module depending on the configuration just once! 224# 225 226use vars qw(%PKG_LOADED); 227 228sub load_module{ 229 my $pkg = shift; 230 return $pkg if $PKG_LOADED{$pkg}++; 231 unless ($NOXS){ 232 eval qq( require $pkg; ); 233 unless ($@){ 234 carp "$pkg loaded." if $DEBUG; 235 return $pkg; 236 } 237 } 238 $pkg .= "::NoXS"; 239 eval qq( require $pkg; ); 240 unless ($@){ 241 carp "$pkg loaded" if $DEBUG; 242 }else{ 243 croak "Loading $pkg failed!"; 244 } 245 $pkg; 246} 247 248sub ucs2{ 249 load_module("Jcode::Unicode"); 250 euc_ucs2(${$_[0]->[0]}); 251} 252 253sub utf8{ 254 load_module("Jcode::Unicode"); 255 euc_utf8(${$_[0]->[0]}); 256} 257 258sub getcode { 259 my $thingy = shift; 260 my $r_str = ref $thingy ? $thingy : \$thingy; 261 262 my ($code, $nmatch, $sjis, $euc, $utf8) = ("", 0, 0, 0, 0); 263 if ($$r_str =~ /$RE{BIN}/o) { # 'binary' 264 my $ucs2; 265 $ucs2 += length($1) 266 while $$r_str =~ /(\x00$RE{ASCII})+/go; 267 if ($ucs2){ # smells like raw unicode 268 ($code, $nmatch) = ('ucs2', $ucs2); 269 }else{ 270 ($code, $nmatch) = ('binary', 0); 271 } 272 } 273 elsif ($$r_str !~ /[\e\x80-\xff]/o) { # not Japanese 274 ($code, $nmatch) = ('ascii', 1); 275 } # 'jis' 276 elsif ($$r_str =~ 277 m[ 278 $RE{JIS_0208}|$RE{JIS_0212}|$RE{JIS_ASC}|$RE{JIS_KANA} 279 ]ox) 280 { 281 ($code, $nmatch) = ('jis', 1); 282 } 283 else { # should be euc|sjis|utf8 284 # use of (?:) by Hiroki Ohzaki <ohzaki@iod.ricoh.co.jp> 285 $sjis += length($1) 286 while $$r_str =~ /((?:$RE{SJIS_C})+)/go; 287 $euc += length($1) 288 while $$r_str =~ /((?:$RE{EUC_C}|$RE{EUC_KANA}|$RE{EUC_0212})+)/go; 289 $utf8 += length($1) 290 while $$r_str =~ /((?:$RE{UTF8})+)/go; 291 # $utf8 *= 1.5; # M. Takahashi's suggestion 292 $nmatch = _max($utf8, $sjis, $euc); 293 carp ">DEBUG:sjis = $sjis, euc = $euc, utf8 = $utf8" if $DEBUG >= 3; 294 $code = 295 ($euc > $sjis and $euc > $utf8) ? 'euc' : 296 ($sjis > $euc and $sjis > $utf8) ? 'sjis' : 297 ($utf8 > $euc and $utf8 > $sjis) ? 'utf8' : undef; 298 } 299 return wantarray ? ($code, $nmatch) : $code; 300} 301 302sub convert{ 303 my $thingy = shift; 304 my $r_str = ref $thingy ? $thingy : \$thingy; 305 my ($ocode, $icode, $opt) = @_; 306 307 my $nmatch; 308 ($icode, $nmatch) = getcode($r_str) unless $icode; 309 310 return $$r_str if $icode eq $ocode and !defined $opt; # do nothin' 311 312 no strict qw(refs); 313 my $method; 314 315 # convert to EUC 316 317 load_module("Jcode::Unicode") if $icode =~ /ucs2|utf8/o; 318 if ($icode and defined &{$method = "$icode" . "_euc"}){ 319 carp "Dispatching \&$method" if $DEBUG >= 2; 320 &{$method}($r_str) ; 321 } 322 323 # h2z or z2h 324 325 if ($opt){ 326 my $cmd = ($opt =~ /^z/o) ? "h2z" : ($opt =~ /^h/o) ? "z2h" : undef; 327 if ($cmd){ 328 require Jcode::H2Z; 329 &{'Jcode::H2Z::' . $cmd}($r_str); 330 } 331 } 332 333 # convert to $ocode 334 335 load_module("Jcode::Unicode") if $ocode =~ /ucs2|utf8/o; 336 if ($ocode and defined &{$method = "euc_" . $ocode}){ 337 carp "Dispatching \&$method" if $DEBUG >= 2; 338 &{$method}($r_str) ; 339 } 340 $$r_str; 341} 342 343sub h2z { 344 require Jcode::H2Z; # not use 345 my $self = shift; 346 $self->[2] = Jcode::H2Z::h2z($self->[0], @_); 347 return $self; 348} 349 350 351sub z2h { 352 require Jcode::H2Z; # not use 353 my $self = shift; 354 $self->[2] = &Jcode::H2Z::z2h($self->[0], @_); 355 return $self; 356} 357 358# JIS<->EUC 359 360sub jis_euc { 361 my $thingy = shift; 362 my $r_str = ref $thingy ? $thingy : \$thingy; 363 $$r_str =~ s( 364 ($RE{JIS_0212}|$RE{JIS_0208}|$RE{JIS_ASC}|$RE{JIS_KANA}) 365 ([^\e]*) 366 ) 367 { 368 my ($esc, $str) = ($1, $2); 369 if ($esc !~ /$RE{JIS_ASC}/o) { 370 $str =~ tr/\x21-\x7e/\xa1-\xfe/; 371 if ($esc =~ /$RE{JIS_KANA}/o) { 372 $str =~ s/([\xa1-\xdf])/\x8e$1/og; 373 } 374 elsif ($esc =~ /$RE{JIS_0212}/o) { 375 $str =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og; 376 } 377 } 378 $str; 379 }geox; 380 $$r_str; 381} 382 383# 384# euc_jis 385# 386# Based upon the contribution of 387# Kazuto Ichimura <ichimura@shimada.nuee.nagoya-u.ac.jp> 388# optimized by <ohzaki@iod.ricoh.co.jp> 389 390sub euc_jis{ 391 my $thingy = shift; 392 my $r_str = ref $thingy ? $thingy : \$thingy; 393 $$r_str =~ s{ 394 ((?:$RE{EUC_C})+|(?:$RE{EUC_KANA})+|(?:$RE{EUC_0212})+) 395 }{ 396 my $str = $1; 397 my $esc = 398 ( $str =~ tr/\x8E//d ) ? $ESC{KANA} : 399 ( $str =~ tr/\x8F//d ) ? $ESC{JIS_0212} : 400 $ESC{JIS_0208}; 401 $str =~ tr/\xA1-\xFE/\x21-\x7E/; 402 $esc . $str . $ESC{ASC}; 403 }geox; 404 $$r_str =~ 405 s/\Q$ESC{ASC}\E 406 (\Q$ESC{KANA}\E|\Q$ESC{JIS_0212}\E|\Q$ESC{JIS_0208}\E)/$1/gox; 407 $$r_str; 408} 409 410# EUC<->SJIS 411 412my %_S2E = (); 413my %_E2S = (); 414 415sub sjis_euc { 416 my $thingy = shift; 417 my $r_str = ref $thingy ? $thingy : \$thingy; 418 $$r_str =~ s( 419 ($RE{SJIS_C}|$RE{SJIS_KANA}) 420 ) 421 { 422 my $str = $1; 423 unless ($_S2E{$1}){ 424 my ($c1, $c2) = unpack('CC', $str); 425 if (0xa1 <= $c1 && $c1 <= 0xdf) { 426 $c2 = $c1; 427 $c1 = 0x8e; 428 } elsif (0x9f <= $c2) { 429 $c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe0 : 0x60); 430 $c2 += 2; 431 } else { 432 $c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe1 : 0x61); 433 $c2 += 0x60 + ($c2 < 0x7f); 434 } 435 $_S2E{$str} = pack('CC', $c1, $c2); 436 } 437 $_S2E{$str}; 438 }geox; 439 $$r_str; 440} 441 442# 443 444sub euc_sjis { 445 my $thingy = shift; 446 my $r_str = ref $thingy ? $thingy : \$thingy; 447 $$r_str =~ s( 448 ($RE{EUC_C}|$RE{EUC_KANA}|$RE{EUC_0212}) 449 ) 450 { 451 my $str = $1; 452 unless ($_E2S{$str}){ 453 my ($c1, $c2) = unpack('CC', $str); 454 if ($c1 == 0x8e) { # SS2 455 $_E2S{$str} = chr($c2); 456 } elsif ($c1 == 0x8f) { # SS3 457 $_E2S{$str} = $CHARCODE{UNDEF_SJIS}; 458 }else { #SS1 or X0208 459 if ($c1 % 2) { 460 $c1 = ($c1>>1) + ($c1 < 0xdf ? 0x31 : 0x71); 461 $c2 -= 0x60 + ($c2 < 0xe0); 462 } else { 463 $c1 = ($c1>>1) + ($c1 < 0xdf ? 0x30 : 0x70); 464 $c2 -= 2; 465 } 466 $_E2S{$str} = pack('CC', $c1, $c2); 467 } 468 } 469 $_E2S{$str}; 470 }geox; 471 $$r_str; 472} 473 474# 475# Util. Functions 476# 477 478sub _max { 479 my $result = shift; 480 for my $n (@_){ 481 $result = $n if $n > $result; 482 } 483 return $result; 484} 4851; 486