1#
2# $Id: _Classic.pm,v 2.0 2005/05/16 19:08:04 dankogai Exp $
3#
4
5package Jcode::_Classic;
6use 5.004;
7use Carp;
8use strict;
9use vars qw($RCSID $VERSION $DEBUG);
10
11$RCSID = q$Id: _Classic.pm,v 2.0 2005/05/16 19:08:04 dankogai Exp $;
12$VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
13
14$DEBUG = $Jcode::DEBUG;
15use vars qw($USE_CACHE $NOXS);
16
17$USE_CACHE = 1;
18$NOXS = 0;
19
20print $RCSID, "\n" if $DEBUG;
21
22use Jcode::Constants qw(:all);
23
24sub new {
25    my $class = shift;
26    my ($thingy, $icode) = @_;
27    my $r_str = ref $thingy ? $thingy : \$thingy;
28    my $nmatch;
29    ($icode, $nmatch) = getcode($r_str) unless $icode;
30    convert($r_str, 'euc', $icode);
31    my $self = [
32	$r_str,
33	$icode,
34	$nmatch,
35    ];
36    carp "Object of class $class created" if $DEBUG >= 2;
37    bless $self, $class;
38}
39
40sub r_str  { $_[0]->[0] }
41sub icode  { $_[0]->[1] }
42sub nmatch { $_[0]->[2] }
43
44sub set {
45    my $self = shift;
46    my ($thingy, $icode) = @_;
47    my $r_str = ref $thingy ? $thingy : \$thingy;
48    my $nmatch;
49    ($icode, $nmatch) = getcode($r_str) unless $icode;
50    convert($r_str, 'euc', $icode);
51    $self->[0] = $r_str;
52    $self->[1] = $icode;
53    $self->[2] = $nmatch;
54    $self->[3] = "Classic";
55    return $self;
56}
57
58sub append {
59    my $self = shift;
60    my ($thingy, $icode) = @_;
61    my $r_str = ref $thingy ? $thingy : \$thingy;
62    my $nmatch;
63    ($icode, $nmatch) = getcode($r_str) unless $icode;
64    convert($r_str, 'euc', $icode);
65    ${$self->[0]} .= $$r_str;
66    $self->[1] = $icode;
67    $self->[2] = $nmatch;
68    return $self;
69}
70
71sub jcode { return Jcode->new(@_) }
72sub euc   { return ${$_[0]->[0]} }
73sub jis   { return  &euc_jis(${$_[0]->[0]})}
74sub sjis  { return &euc_sjis(${$_[0]->[0]})}
75sub iso_2022_jp{return $_[0]->h2z->jis}
76
77sub jfold{
78    my $self = shift;
79    my ($bpl, $nl) = @_;
80    $bpl ||= 72;
81    $nl  ||= "\n";
82    my $r_str = $self->[0];
83    my @lines = (); my $len = 0; my $i = 0;
84    while ($$r_str =~
85	   m/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|[\x00-\xff])/sgo)
86    {
87	if ($len + length($1) > $bpl){ # fold!
88	    $i++;
89	    $len = 0;
90	}
91	$lines[$i] .= $1;
92	$len += length($1);
93    }
94    defined($lines[$i]) or pop @lines;
95    $$r_str = join($nl, @lines);
96    return wantarray ? @lines : $self;
97}
98
99sub jlength {
100    my $self = shift;
101    my $r_str = $self->[0];
102    return scalar (my @char = $$r_str =~ m/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|[\x00-\xff])/sgo);
103}
104
105sub mime_encode{
106    my $self = shift;
107    my $r_str = $self->[0];
108    my $lf  = shift || "\n";
109    my $bpl = shift || 76;
110
111    my ($trailing_crlf) = ($$r_str =~ /(\n|\r|\x0d\x0a)$/o);
112    my $str  = _mime_unstructured_header($$r_str, $lf, $bpl);
113    not $trailing_crlf and $str =~ s/(\n|\r|\x0d\x0a)$//o;
114    $str;
115}
116
117#
118# shamelessly stolen from
119# http://www.din.or.jp/~ohzaki/perl.htm#JP_Base64
120#
121
122sub _add_encoded_word {
123    require MIME::Base64;
124    my($str, $line, $bpl) = @_;
125    my $result = '';
126    while (length($str)) {
127	my $target = $str;
128	$str = '';
129	if (length($line) + 22 +
130	    ($target =~ /^(?:$RE{EUC_0212}|$RE{EUC_C})/o) * 8 > $bpl) {
131	    $line =~ s/[ \t\n\r]*$/\n/;
132	    $result .= $line;
133	    $line = ' ';
134	}
135	while (1) {
136	    my $iso_2022_jp = jcode($target, 'euc')->iso_2022_jp;
137	    if (my $count = ($iso_2022_jp =~ tr/\x80-\xff//d)){
138		$DEBUG and warn $count;
139		$target = jcode($iso_2022_jp, 'iso_2022_jp')->euc;
140	    }
141	    my $encoded = '=?ISO-2022-JP?B?' .
142	      MIME::Base64::encode_base64($iso_2022_jp, '')
143		  . '?=';
144	    if (length($encoded) + length($line) > $bpl) {
145		$target =~
146		    s/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|$RE{ASCII})$//o;
147		$str = $1 . $str;
148	    } else {
149		$line .= $encoded;
150		last;
151	    }
152	}
153    }
154    return $result . $line;
155}
156
157sub _mime_unstructured_header {
158    my ($oldheader, $lf, $bpl) = @_;
159    my(@words, @wordstmp, $i);
160    my $header = '';
161    $oldheader =~ s/\s+$//;
162    @wordstmp = split /\s+/, $oldheader;
163    for ($i = 0; $i < $#wordstmp; $i++) {
164	if ($wordstmp[$i] !~ /^[\x21-\x7E]+$/ and
165	    $wordstmp[$i + 1] !~ /^[\x21-\x7E]+$/) {
166	    $wordstmp[$i + 1] = "$wordstmp[$i] $wordstmp[$i + 1]";
167	} else {
168	    push(@words, $wordstmp[$i]);
169	}
170    }
171    push(@words, $wordstmp[-1]);
172    for my $word (@words) {
173	if ($word =~ /^[\x21-\x7E]+$/) {
174	    $header =~ /(?:.*\n)*(.*)/;
175	    if (length($1) + length($word) > $bpl) {
176		$header .= "$lf $word";
177	    } else {
178		$header .= $word;
179	    }
180	} else {
181	    $header = _add_encoded_word($word, $header, $bpl);
182	}
183	$header =~ /(?:.*\n)*(.*)/;
184	if (length($1) == $bpl) {
185	    $header .= "$lf ";
186	} else {
187	    $header .= ' ';
188	}
189    }
190    $header =~ s/\n? $/\n/;
191    $header;
192}
193
194# see http://www.din.or.jp/~ohzaki/perl.htm#JP_Base64
195#$lws = '(?:(?:\x0d\x0a)?[ \t])+';
196#$ew_regex = '=\?ISO-2022-JP\?B\?([A-Za-z0-9+/]+=*)\?=';
197#$str =~ s/($ew_regex)$lws(?=$ew_regex)/$1/gio;
198#$str =~ s/$lws/ /go; $str =~ s/$ew_regex/decode_base64($1)/egio;
199
200sub mime_decode{
201    require MIME::Base64; # not use
202    my $self = shift;
203    my $r_str = $self->[0];
204    my $re_lws = '(?:(?:\r|\n|\x0d\x0a)?[ \t])+';
205    my $re_ew = '=\?[Ii][Ss][Oo]-2022-[Jj][Pp]\?[Bb]\?([A-Za-z0-9+/]+=*)\?=';
206    $$r_str =~ s/($re_ew)$re_lws(?=$re_ew)/$1/sgo;
207    $$r_str =~ s/$re_lws/ /go;
208    $self->[2] =
209	($$r_str =~
210	 s/$re_ew/jis_euc(MIME::Base64::decode_base64($1))/ego
211	 );
212    $self;
213}
214
215sub tr{
216    require Jcode::Tr; # not use
217    my $self = shift;
218    $self->[2] = Jcode::Tr::tr($self->[0], @_);
219    return $self;
220}
221
222#
223# load needed module depending on the configuration just once!
224#
225
226use vars qw(%PKG_LOADED);
227
228sub load_module{
229    my $pkg = shift;
230    return $pkg if $PKG_LOADED{$pkg}++;
231    unless ($NOXS){
232	eval qq( require $pkg; );
233	unless ($@){
234	    carp "$pkg loaded." if $DEBUG;
235	    return $pkg;
236	}
237    }
238    $pkg .= "::NoXS";
239    eval qq( require $pkg; );
240    unless ($@){
241	carp "$pkg loaded" if $DEBUG;
242    }else{
243	croak "Loading $pkg failed!";
244    }
245    $pkg;
246}
247
248sub ucs2{
249    load_module("Jcode::Unicode");
250    euc_ucs2(${$_[0]->[0]});
251}
252
253sub utf8{
254    load_module("Jcode::Unicode");
255    euc_utf8(${$_[0]->[0]});
256}
257
258sub getcode {
259    my $thingy = shift;
260    my $r_str = ref $thingy ? $thingy : \$thingy;
261
262    my ($code, $nmatch, $sjis, $euc, $utf8) = ("", 0, 0, 0, 0);
263    if ($$r_str =~ /$RE{BIN}/o) {	# 'binary'
264	my $ucs2;
265	$ucs2 += length($1)
266	    while $$r_str =~ /(\x00$RE{ASCII})+/go;
267	if ($ucs2){      # smells like raw unicode
268	    ($code, $nmatch) = ('ucs2', $ucs2);
269	}else{
270	    ($code, $nmatch) = ('binary', 0);
271	 }
272    }
273    elsif ($$r_str !~ /[\e\x80-\xff]/o) {	# not Japanese
274	($code, $nmatch) = ('ascii', 1);
275    }				# 'jis'
276    elsif ($$r_str =~
277	   m[
278	     $RE{JIS_0208}|$RE{JIS_0212}|$RE{JIS_ASC}|$RE{JIS_KANA}
279	   ]ox)
280    {
281	($code, $nmatch) = ('jis', 1);
282    }
283    else { # should be euc|sjis|utf8
284	# use of (?:) by Hiroki Ohzaki <ohzaki@iod.ricoh.co.jp>
285	$sjis += length($1)
286	    while $$r_str =~ /((?:$RE{SJIS_C})+)/go;
287	$euc  += length($1)
288	    while $$r_str =~ /((?:$RE{EUC_C}|$RE{EUC_KANA}|$RE{EUC_0212})+)/go;
289	$utf8 += length($1)
290	    while $$r_str =~ /((?:$RE{UTF8})+)/go;
291	# $utf8 *= 1.5; # M. Takahashi's suggestion
292	$nmatch = _max($utf8, $sjis, $euc);
293	carp ">DEBUG:sjis = $sjis, euc = $euc, utf8 = $utf8" if $DEBUG >= 3;
294	$code =
295	    ($euc > $sjis and $euc > $utf8) ? 'euc' :
296		($sjis > $euc and $sjis > $utf8) ? 'sjis' :
297		    ($utf8 > $euc and $utf8 > $sjis) ? 'utf8' : undef;
298    }
299    return wantarray ? ($code, $nmatch) : $code;
300}
301
302sub convert{
303    my $thingy = shift;
304    my $r_str = ref $thingy ? $thingy : \$thingy;
305    my ($ocode, $icode, $opt) = @_;
306
307    my $nmatch;
308    ($icode, $nmatch) = getcode($r_str) unless $icode;
309
310    return $$r_str if $icode eq $ocode and !defined $opt; # do nothin'
311
312    no strict qw(refs);
313    my $method;
314
315    # convert to EUC
316
317    load_module("Jcode::Unicode") if $icode =~ /ucs2|utf8/o;
318    if ($icode and defined &{$method = "$icode" . "_euc"}){
319	carp "Dispatching \&$method" if $DEBUG >= 2;
320	&{$method}($r_str) ;
321    }
322
323    # h2z or z2h
324
325    if ($opt){
326	my $cmd = ($opt =~ /^z/o) ? "h2z" : ($opt =~ /^h/o) ? "z2h" : undef;
327	if ($cmd){
328	    require Jcode::H2Z;
329	    &{'Jcode::H2Z::' . $cmd}($r_str);
330	}
331    }
332
333    # convert to $ocode
334
335    load_module("Jcode::Unicode") if $ocode =~ /ucs2|utf8/o;
336    if ($ocode and defined &{$method =  "euc_" . $ocode}){
337	carp "Dispatching \&$method" if $DEBUG >= 2;
338	&{$method}($r_str) ;
339    }
340    $$r_str;
341}
342
343sub h2z {
344    require Jcode::H2Z; # not use
345    my $self = shift;
346    $self->[2] = Jcode::H2Z::h2z($self->[0], @_);
347    return $self;
348}
349
350
351sub z2h {
352    require Jcode::H2Z; # not use
353    my $self = shift;
354    $self->[2] =  &Jcode::H2Z::z2h($self->[0], @_);
355    return $self;
356}
357
358# JIS<->EUC
359
360sub jis_euc {
361    my $thingy = shift;
362    my $r_str = ref $thingy ? $thingy : \$thingy;
363    $$r_str =~ s(
364		 ($RE{JIS_0212}|$RE{JIS_0208}|$RE{JIS_ASC}|$RE{JIS_KANA})
365		 ([^\e]*)
366		 )
367    {
368	my ($esc, $str) = ($1, $2);
369	if ($esc !~ /$RE{JIS_ASC}/o) {
370	    $str =~ tr/\x21-\x7e/\xa1-\xfe/;
371	    if ($esc =~ /$RE{JIS_KANA}/o) {
372		$str =~ s/([\xa1-\xdf])/\x8e$1/og;
373	    }
374	    elsif ($esc =~ /$RE{JIS_0212}/o) {
375		$str =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
376	    }
377	}
378	$str;
379    }geox;
380    $$r_str;
381}
382
383#
384# euc_jis
385#
386# Based upon the contribution of
387# Kazuto Ichimura <ichimura@shimada.nuee.nagoya-u.ac.jp>
388# optimized by <ohzaki@iod.ricoh.co.jp>
389
390sub euc_jis{
391    my $thingy = shift;
392    my $r_str = ref $thingy ? $thingy : \$thingy;
393    $$r_str =~ s{
394	((?:$RE{EUC_C})+|(?:$RE{EUC_KANA})+|(?:$RE{EUC_0212})+)
395	}{
396	    my $str = $1;
397	    my $esc =
398		( $str =~ tr/\x8E//d ) ? $ESC{KANA} :
399		    ( $str =~ tr/\x8F//d ) ? $ESC{JIS_0212} :
400			$ESC{JIS_0208};
401	    $str =~ tr/\xA1-\xFE/\x21-\x7E/;
402	    $esc . $str . $ESC{ASC};
403	}geox;
404    $$r_str =~
405	s/\Q$ESC{ASC}\E
406	    (\Q$ESC{KANA}\E|\Q$ESC{JIS_0212}\E|\Q$ESC{JIS_0208}\E)/$1/gox;
407    $$r_str;
408}
409
410# EUC<->SJIS
411
412my %_S2E = ();
413my %_E2S = ();
414
415sub sjis_euc {
416    my $thingy = shift;
417    my $r_str = ref $thingy ? $thingy : \$thingy;
418    $$r_str =~ s(
419		 ($RE{SJIS_C}|$RE{SJIS_KANA})
420	     )
421    {
422	my $str = $1;
423	unless ($_S2E{$1}){
424	    my ($c1, $c2) = unpack('CC', $str);
425	    if (0xa1 <= $c1 && $c1 <= 0xdf) {
426		$c2 = $c1;
427		$c1 = 0x8e;
428	    } elsif (0x9f <= $c2) {
429		$c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe0 : 0x60);
430		$c2 += 2;
431	    } else {
432		$c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe1 : 0x61);
433		$c2 += 0x60 + ($c2 < 0x7f);
434	    }
435	    $_S2E{$str} = pack('CC', $c1, $c2);
436	}
437	$_S2E{$str};
438    }geox;
439    $$r_str;
440}
441
442#
443
444sub euc_sjis {
445    my $thingy = shift;
446    my $r_str = ref $thingy ? $thingy : \$thingy;
447    $$r_str =~ s(
448		 ($RE{EUC_C}|$RE{EUC_KANA}|$RE{EUC_0212})
449		 )
450    {
451	my $str = $1;
452	unless ($_E2S{$str}){
453	    my ($c1, $c2) = unpack('CC', $str);
454	    if ($c1 == 0x8e) {          # SS2
455		$_E2S{$str} = chr($c2);
456	    } elsif ($c1 == 0x8f) {     # SS3
457		$_E2S{$str} = $CHARCODE{UNDEF_SJIS};
458	    }else { #SS1 or X0208
459		if ($c1 % 2) {
460		    $c1 = ($c1>>1) + ($c1 < 0xdf ? 0x31 : 0x71);
461		    $c2 -= 0x60 + ($c2 < 0xe0);
462		} else {
463		    $c1 = ($c1>>1) + ($c1 < 0xdf ? 0x30 : 0x70);
464		    $c2 -= 2;
465		}
466		$_E2S{$str} = pack('CC', $c1, $c2);
467	    }
468	}
469	$_E2S{$str};
470    }geox;
471    $$r_str;
472}
473
474#
475# Util. Functions
476#
477
478sub _max {
479    my $result = shift;
480    for my $n (@_){
481	$result = $n if $n > $result;
482    }
483    return $result;
484}
4851;
486